Diffstat (limited to 'net/ipv4')
44 files changed, 913 insertions, 429 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f8c49ce5b283..f032688d20d3 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
-		      xfrm4_output.o
+		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 19ab78aca547..8c54870db792 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1505,9 +1505,9 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
 		bhptr = per_cpu_ptr(mib[0], cpu);
 		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
 		do {
-			start = u64_stats_fetch_begin_bh(syncp);
+			start = u64_stats_fetch_begin_irq(syncp);
 			v = *(((u64 *) bhptr) + offt);
-		} while (u64_stats_fetch_retry_bh(syncp, start));
+		} while (u64_stats_fetch_retry_irq(syncp, start));
 
 		res += v;
 	}
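The af_inet.c hunk switches snmp_fold_field64() from the _bh to the _irq u64_stats accessors; the retry-loop shape is untouched. The loop is the standard seqcount read side: sample the sequence counter, read the 64-bit values, retry if a writer bumped the counter in between. A minimal user-space sketch of that pattern, with invented names (not kernel code):

	#include <stdatomic.h>
	#include <stdint.h>

	struct stats {
		atomic_uint seq;	/* even: stable, odd: writer in progress */
		uint64_t packets;
		uint64_t bytes;
	};

	static uint64_t read_packets(struct stats *s)
	{
		unsigned int start;
		uint64_t v;

		do {
			while ((start = atomic_load(&s->seq)) & 1)
				;			/* writer active, spin */
			v = s->packets;			/* speculative read */
		} while (atomic_load(&s->seq) != start); /* retry if seq moved */

		return v;
	}

On 64-bit kernels the sync object compiles away entirely; the loop only costs anything where 64-bit loads are not atomic.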
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 717902669d2f..a2afa89513a0 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct iphdr *iph, *top_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	ahp = x->data;
 	ahash = ahp->ahash;
@@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah = ip_auth_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
 	err = -ENOMEM;
-	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
 	if (!iph)
 		goto out;
-
-	icv = ah_tmp_icv(ahash, iph, ihl);
+	seqhi = (__be32 *)((char *)iph + ihl);
+	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memset(ah->auth_data, 0, ahp->icv_trunc_len);
 
@@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah->spi = x->id.spi;
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	sg_init_table(sg, nfrags + sglists);
+	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_output_done, skb);
 
 	AH_SKB_CB(skb)->tmp = iph;
@@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
 	int err = -ENOMEM;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	if (!pskb_may_pull(skb, sizeof(*ah)))
 		goto out;
@@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	iph = ip_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
-	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
+
+	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+				ahp->icv_trunc_len + seqhi_len);
 	if (!work_iph)
 		goto out;
 
-	auth_data = ah_tmp_auth(work_iph, ihl);
+	seqhi = (__be32 *)((char *)work_iph + ihl);
+	auth_data = ah_tmp_auth(seqhi, seqhi_len);
 	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memcpy(work_iph, iph, ihl);
 	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
@@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_push(skb, ihl);
 
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	sg_init_table(sg, nfrags + sglists);
+	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_input_done, skb);
 
 	AH_SKB_CB(skb)->tmp = work_iph;
@@ -397,7 +428,7 @@ out:
 	return err;
 }
 
-static void ah4_err(struct sk_buff *skb, u32 info)
+static int ah4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 static int ah_init_state(struct xfrm_state *x)
@@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x)
 	kfree(ahp);
 }
 
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
 
 static const struct xfrm_type ah_type =
 {
@@ -518,11 +555,12 @@ static const struct xfrm_type ah_type =
 	.output		= ah_output
 };
 
-static const struct net_protocol ah4_protocol = {
+static struct xfrm4_protocol ah4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	ah4_rcv_cb,
 	.err_handler	=	ah4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init ah4_init(void)
@@ -531,7 +569,7 @@ static int __init ah4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
+	if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&ah_type, AF_INET);
 		return -EAGAIN;
@@ -541,7 +579,7 @@ static int __init ah4_init(void)
 
 static void __exit ah4_fini(void)
 {
-	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+	if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
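The ah4.c hunks add RFC 4302 extended sequence number (ESN) support: only the low 32 bits of the 64-bit sequence number travel in the AH header, so when XFRM_STATE_ESN is set the high 32 bits (seqhi) are placed right after the scratch IP header and hooked into the hash via one extra scatterlist entry, authenticated but never transmitted. skb_to_sgvec_nomark() is used so that extra entry can be appended behind the packet without an end-of-table mark in the way. Conceptually the ICV covers packet-bytes || seqhi; a plain-C sketch of just that idea, where mac() is a hypothetical stand-in for the negotiated keyed hash (not a kernel API):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <string.h>

	extern void mac(const uint8_t *data, size_t len, uint8_t *icv); /* assumed */

	void esn_icv(const uint8_t *pkt, size_t pkt_len, uint64_t seq, uint8_t *icv)
	{
		uint8_t buf[2048];
		uint32_t seqhi = htonl((uint32_t)(seq >> 32)); /* high bits, never on the wire */

		if (pkt_len + sizeof(seqhi) > sizeof(buf))
			return;
		/* ICV covers packet || seqhi -- the kernel achieves this with
		 * an extra scatterlist entry instead of copying.
		 */
		memcpy(buf, pkt, pkt_len);
		memcpy(buf + pkt_len, &seqhi, sizeof(seqhi));
		mac(buf, pkt_len + sizeof(seqhi), icv);
	}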
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7785b28061ac..360b565918c4 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -473,7 +473,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
-static void esp4_err(struct sk_buff *skb, u32 info)
+static int esp4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -483,23 +483,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 static void esp_destroy(struct xfrm_state *x)
@@ -672,6 +674,11 @@ error:
 	return err;
 }
 
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
+
 static const struct xfrm_type esp_type =
 {
 	.description	= "ESP4",
@@ -685,11 +692,12 @@ static const struct xfrm_type esp_type =
 	.output		= esp_output
 };
 
-static const struct net_protocol esp4_protocol = {
+static struct xfrm4_protocol esp4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	esp4_rcv_cb,
 	.err_handler	=	esp4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init esp4_init(void)
@@ -698,7 +706,7 @@ static int __init esp4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
+	if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&esp_type, AF_INET);
 		return -EAGAIN;
@@ -708,7 +716,7 @@ static int __init esp4_init(void)
 
 static void __exit esp4_fini(void)
 {
-	if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+	if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
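ah4.c, esp4.c and ipcomp.c all migrate from inet_add_protocol(), which allows exactly one handler per IP protocol number, to the new xfrm4_protocol layer, which keeps a priority-ordered list of handlers per protocol with separate input/error/callback hooks. That is what lets ip_vti.c (below) register for ESP/AH/IPCOMP at priority 100 without evicting these default handlers at priority 0. A toy user-space model of such a dispatch chain, all names invented for the sketch:

	#include <stddef.h>

	struct handler {
		int (*rcv)(void *pkt);
		int priority;
		struct handler *next;
	};

	static struct handler *chain;

	void register_handler(struct handler *h)
	{
		struct handler **p = &chain;

		while (*p && (*p)->priority > h->priority)
			p = &(*p)->next;
		h->next = *p;
		*p = h;		/* keep list sorted, highest priority first */
	}

	int deliver(void *pkt)
	{
		struct handler *h;
		int ret = -1;

		for (h = chain; h; h = h->next) {
			ret = h->rcv(pkt);
			if (ret >= 0)	/* handler consumed the packet */
				return ret;
		}
		return ret;	/* nobody claimed it */
	}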
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c7539e22868b..1a629f870274 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
 	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
-		return ip_rt_dump(skb, cb);
+		return skb->len;
 
 	s_h = cb->args[0];
 	s_e = cb->args[1];
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index f3869c186d97..be8abe73bb9f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -127,6 +127,10 @@ int ip_forward(struct sk_buff *skb)
 	struct rtable *rt;	/* Route we use */
 	struct ip_options *opt	= &(IPCB(skb)->opt);
 
+	/* that should never happen */
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
 	if (skb_warn_if_lro(skb))
 		goto drop;
 
@@ -136,9 +140,6 @@ int ip_forward(struct sk_buff *skb)
 	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
 		return NET_RX_SUCCESS;
 
-	if (skb->pkt_type != PACKET_HOST)
-		goto drop;
-
 	skb_forward_csum(skb);
 
 	/*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 73c6b63bba74..1a0755fea491 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -446,7 +446,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
 	int err = 0;
-	bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
 
 	dev = rt->dst.dev;
 
@@ -456,7 +455,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 	iph = ip_hdr(skb);
 
-	mtu = ip_dst_mtu_maybe_forward(&rt->dst, forwarding);
+	mtu = ip_skb_dst_mtu(skb);
 	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
 		     (IPCB(skb)->frag_max_size &&
 		      IPCB(skb)->frag_max_size > mtu))) {
@@ -822,8 +821,7 @@ static int __ip_append_data(struct sock *sk,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
-			 mtu : 0xFFFF;
+	maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
 
 	if (cork->length + length > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1146,8 +1144,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
-			 mtu : 0xFFFF;
+	maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
 
 	if (cork->length + size > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1308,8 +1305,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc < IP_PMTUDISC_DO)
-		skb->local_df = 1;
+	skb->local_df = ip_sk_local_df(sk);
 
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
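These three ip_output.c hunks replace open-coded inet->pmtudisc comparisons with an ip_sk_local_df() helper. The helper lives in include/net/ip.h, outside this net/ipv4 diffstat; judging from the call sites, it must report that local fragmentation is allowed both for the classic below-IP_PMTUDISC_DO modes and for the new IP_PMTUDISC_OMIT mode, which is numerically above IP_PMTUDISC_DO and so would be misclassified by the old "pmtudisc >= IP_PMTUDISC_DO" tests. A sketch of the presumed definition:

	/* Presumed shape of the helper (defined outside this diffstat):
	 * true means the stack may fragment locally-generated packets.
	 */
	static inline bool ip_sk_local_df(const struct sock *sk)
	{
		return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
		       inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
	}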
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 580dd96666e0..64741b938632 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -186,7 +186,8 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ip_cmsg_recv);
 
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+		 bool allow_ipv6)
 {
 	int err, val;
 	struct cmsghdr *cmsg;
@@ -194,6 +195,22 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
+#if defined(CONFIG_IPV6)
+		if (allow_ipv6 &&
+		    cmsg->cmsg_level == SOL_IPV6 &&
+		    cmsg->cmsg_type == IPV6_PKTINFO) {
+			struct in6_pktinfo *src_info;
+
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+				return -EINVAL;
+			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+				return -EINVAL;
+			ipc->oif = src_info->ipi6_ifindex;
+			ipc->addr = src_info->ipi6_addr.s6_addr32[3];
+			continue;
+		}
+#endif
 		if (cmsg->cmsg_level != SOL_IP)
 			continue;
 		switch (cmsg->cmsg_type) {
@@ -626,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->nodefrag = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE)
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
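The ip_cmsg_send() change teaches the IPv4 ancillary-data parser to accept an IPV6_PKTINFO cmsg whose address is v4-mapped, for callers that pass allow_ipv6 as true (ping and raw below pass false; the opt-in caller, presumably udp.c, is not shown in this excerpt). This lets a dual-stack application drive both families through one IPV6_PKTINFO code path. A minimal user-space sender exercising it; the ifindex and the 192.0.2.x address are placeholders:

	#define _GNU_SOURCE		/* for struct in6_pktinfo */
	#include <netinet/in.h>
	#include <string.h>
	#include <sys/socket.h>

	/* Sends one UDP datagram with an IPV6_PKTINFO cmsg whose address is
	 * v4-mapped -- the case the new ip_cmsg_send() branch accepts.
	 */
	int send_with_v4mapped_pktinfo(int fd, const struct sockaddr_in *dst)
	{
		char payload[] = "hello";
		char cbuf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
		struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
		struct msghdr msg = {
			.msg_name = (void *)dst,
			.msg_namelen = sizeof(*dst),
			.msg_iov = &iov,
			.msg_iovlen = 1,
			.msg_control = cbuf,
			.msg_controllen = sizeof(cbuf),
		};
		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
		struct in6_pktinfo info;

		memset(&info, 0, sizeof(info));
		info.ipi6_ifindex = 2;			/* placeholder ifindex */
		/* ::ffff:192.0.2.99 -- a v4-mapped source address */
		info.ipi6_addr.s6_addr[10] = 0xff;
		info.ipi6_addr.s6_addr[11] = 0xff;
		info.ipi6_addr.s6_addr[12] = 192;
		info.ipi6_addr.s6_addr[13] = 0;
		info.ipi6_addr.s6_addr[14] = 2;
		info.ipi6_addr.s6_addr[15] = 99;

		cmsg->cmsg_level = SOL_IPV6;
		cmsg->cmsg_type = IPV6_PKTINFO;
		cmsg->cmsg_len = CMSG_LEN(sizeof(info));
		memcpy(CMSG_DATA(cmsg), &info, sizeof(info));

		return (int)sendmsg(fd, &msg, 0);
	}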
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a82a22d8f77f..e77381d1df9a 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -235,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 {
 	unsigned int h;
 	__be32 remote;
+	__be32 i_key = parms->i_key;
 
 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 		remote = parms->iph.daddr;
 	else
 		remote = 0;
 
-	h = ip_tunnel_hash(parms->i_key, remote);
+	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+		i_key = 0;
+
+	h = ip_tunnel_hash(i_key, remote);
 	return &itn->tunnels[h];
 }
 
@@ -398,7 +402,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	fbt = netdev_priv(itn->fb_tunnel_dev);
 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 	if (IS_ERR(dev))
-		return NULL;
+		return ERR_CAST(dev);
 
 	dev->mtu = ip_tunnel_bind_dev(dev);
 
@@ -748,9 +752,13 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 
 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 
-		if (!t && (cmd == SIOCADDTUNNEL))
+		if (!t && (cmd == SIOCADDTUNNEL)) {
 			t = ip_tunnel_create(net, itn, p);
-
+			if (IS_ERR(t)) {
+				err = PTR_ERR(t);
+				break;
+			}
+		}
 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 			if (t != NULL) {
 				if (t->dev != dev) {
@@ -777,8 +785,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 		if (t) {
 			err = 0;
 			ip_tunnel_update(itn, t, dev, p, true);
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		} else {
+			err = -ENOENT;
+		}
 		break;
 
 	case SIOCDELTUNNEL:
@@ -993,19 +1002,13 @@ int ip_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	int i, err;
+	int err;
 
 	dev->destructor	= ip_tunnel_dev_free;
-	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	for_each_possible_cpu(i) {
-		struct pcpu_sw_netstats *ipt_stats;
-		ipt_stats = per_cpu_ptr(dev->tstats, i);
-		u64_stats_init(&ipt_stats->syncp);
-	}
-
 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
 	if (!tunnel->dst_cache) {
 		free_percpu(dev->tstats);
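ip_tunnel_create() now reports failures with ERR_CAST()/PTR_ERR() instead of flattening them to NULL, so the SIOCADDTUNNEL path can surface the real errno rather than guessing -ENOBUFS. For readers outside the kernel, a self-contained refresher on the ERR_PTR idiom, with local stand-ins for the kernel macros:

	#include <errno.h>
	#include <stdio.h>

	/* Local stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR():
	 * an error code rides inside a pointer, cast into the top
	 * (never-mappable) page of the address space.
	 */
	#define MAX_ERRNO	4095
	#define ERR_PTR(err)	((void *)(long)(err))
	#define PTR_ERR(ptr)	((long)(ptr))
	#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

	static void *create_thing(int fail)
	{
		static int thing;	/* stand-in for a real object */

		if (fail)
			return ERR_PTR(-ENOMEM);	/* caller learns *why* */
		return &thing;
	}

	int main(void)
	{
		void *t = create_thing(1);

		if (IS_ERR(t))
			printf("create failed: %ld\n", PTR_ERR(t));	/* -12 */
		return 0;
	}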
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 8d69626f2206..e0c2b1d2ea4e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -162,12 +162,12 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 		unsigned int start;
 
 		do {
-			start = u64_stats_fetch_begin_bh(&tstats->syncp);
+			start = u64_stats_fetch_begin_irq(&tstats->syncp);
 			rx_packets = tstats->rx_packets;
 			tx_packets = tstats->tx_packets;
 			rx_bytes = tstats->rx_bytes;
 			tx_bytes = tstats->tx_bytes;
-		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
 
 		tot->rx_packets += rx_packets;
 		tot->tx_packets += tx_packets;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 48eafae51769..687ddef4e574 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <linux/icmpv6.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -49,8 +50,8 @@ static struct rtnl_link_ops vti_link_ops __read_mostly;
 static int vti_net_id __read_mostly;
 static int vti_tunnel_init(struct net_device *dev);
 
-/* We dont digest the packet therefore let the packet pass */
-static int vti_rcv(struct sk_buff *skb)
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+		     int encap_type)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
@@ -60,79 +61,120 @@ static int vti_rcv(struct sk_buff *skb)
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
-		struct pcpu_sw_netstats *tstats;
-		u32 oldmark = skb->mark;
-		int ret;
-
-
-		/* temporarily mark the skb with the tunnel o_key, to
-		 * only match policies with this mark.
-		 */
-		skb->mark = be32_to_cpu(tunnel->parms.o_key);
-		ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb);
-		skb->mark = oldmark;
-		if (!ret)
-			return -1;
-
-		tstats = this_cpu_ptr(tunnel->dev->tstats);
-		u64_stats_update_begin(&tstats->syncp);
-		tstats->rx_packets++;
-		tstats->rx_bytes += skb->len;
-		u64_stats_update_end(&tstats->syncp);
-
-		secpath_reset(skb);
-		skb->dev = tunnel->dev;
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+
+		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+		skb->mark = be32_to_cpu(tunnel->parms.i_key);
+
+		return xfrm_input(skb, nexthdr, spi, encap_type);
+	}
+
+	return -EINVAL;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+static int vti_rcv(struct sk_buff *skb)
+{
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+}
+
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+	unsigned short family;
+	struct net_device *dev;
+	struct pcpu_sw_netstats *tstats;
+	struct xfrm_state *x;
+	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+
+	if (!tunnel)
 		return 1;
+
+	dev = tunnel->dev;
+
+	if (err) {
+		dev->stats.rx_errors++;
+		dev->stats.rx_dropped++;
+
+		return 0;
 	}
 
-	return -1;
+	x = xfrm_input_state(skb);
+	family = x->inner_mode->afinfo->family;
+
+	if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+		return -EPERM;
+
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
+	skb->dev = dev;
+
+	tstats = this_cpu_ptr(dev->tstats);
+
+	u64_stats_update_begin(&tstats->syncp);
+	tstats->rx_packets++;
+	tstats->rx_bytes += skb->len;
+	u64_stats_update_end(&tstats->syncp);
+
+	return 0;
 }
 
-/* This function assumes it is being called from dev_queue_xmit()
- * and that skb is filled properly by that function.
- */
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+	xfrm_address_t *saddr = (xfrm_address_t *)&src;
 
-static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+	/* if there is no transform then this tunnel is not functional.
+	 * Or if the xfrm is not mode tunnel.
+	 */
+	if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+	    x->props.family != AF_INET)
+		return false;
+
+	if (!dst)
+		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
+		return false;
+
+	return true;
+}
+
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+			    struct flowi *fl)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct iphdr  *tiph = &tunnel->parms.iph;
-	u8     tos;
-	struct rtable *rt;		/* Route to the other host */
+	struct ip_tunnel_parm *parms = &tunnel->parms;
+	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *tdev;	/* Device to other host */
-	struct iphdr  *old_iph = ip_hdr(skb);
-	__be32 dst = tiph->daddr;
-	struct flowi4 fl4;
 	int err;
 
-	if (skb->protocol != htons(ETH_P_IP))
-		goto tx_error;
-
-	tos = old_iph->tos;
+	if (!dst) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
 
-	memset(&fl4, 0, sizeof(fl4));
-	flowi4_init_output(&fl4, tunnel->parms.link,
-			   be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos),
-			   RT_SCOPE_UNIVERSE,
-			   IPPROTO_IPIP, 0,
-			   dst, tiph->saddr, 0, 0);
-	rt = ip_route_output_key(dev_net(dev), &fl4);
-	if (IS_ERR(rt)) {
+	dst_hold(dst);
+	dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+	if (IS_ERR(dst)) {
 		dev->stats.tx_carrier_errors++;
 		goto tx_error_icmp;
 	}
-	/* if there is no transform then this tunnel is not functional.
-	 * Or if the xfrm is not mode tunnel.
-	 */
-	if (!rt->dst.xfrm ||
-	    rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
+
+	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
 		dev->stats.tx_carrier_errors++;
-		ip_rt_put(rt);
+		dst_release(dst);
 		goto tx_error_icmp;
 	}
-	tdev = rt->dst.dev;
+
+	tdev = dst->dev;
 
 	if (tdev == dev) {
-		ip_rt_put(rt);
+		dst_release(dst);
 		dev->stats.collisions++;
 		goto tx_error;
 	}
@@ -146,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		tunnel->err_count = 0;
 	}
 
-	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-	nf_reset(skb);
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+	skb_dst_set(skb, dst);
 	skb->dev = skb_dst(skb)->dev;
 
 	err = dst_output(skb);
@@ -166,6 +206,95 @@ tx_error:
 	return NETDEV_TX_OK;
 }
 
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+
+	skb->mark = be32_to_cpu(tunnel->parms.o_key);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		xfrm_decode_session(skb, &fl, AF_INET);
+		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+		break;
+	case htons(ETH_P_IPV6):
+		xfrm_decode_session(skb, &fl, AF_INET6);
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+		break;
+	default:
+		dev->stats.tx_errors++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return vti_xmit(skb, dev, &fl);
+}
+
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+	__be32 spi;
+	struct xfrm_state *x;
+	struct ip_tunnel *tunnel;
+	struct ip_esp_hdr *esph;
+	struct ip_auth_hdr *ah;
+	struct ip_comp_hdr *ipch;
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	int protocol = iph->protocol;
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->daddr, iph->saddr, 0);
+	if (!tunnel)
+		return -1;
+
+	switch (protocol) {
+	case IPPROTO_ESP:
+		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = esph->spi;
+		break;
+	case IPPROTO_AH:
+		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+		spi = ah->spi;
+		break;
+	case IPPROTO_COMP:
+		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = htonl(ntohs(ipch->cpi));
+		break;
+	default:
+		return 0;
+	}
+
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+	case ICMP_REDIRECT:
+		break;
+	default:
+		return 0;
+	}
+
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, protocol, AF_INET);
+	if (!x)
+		return 0;
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, protocol, 0);
+	xfrm_state_put(x);
+
+	return 0;
+}
+
 static int
 vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
@@ -181,12 +310,13 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			return -EINVAL;
 	}
 
+	p.i_flags |= VTI_ISVTI;
 	err = ip_tunnel_ioctl(dev, &p, cmd);
 	if (err)
 		return err;
 
 	if (cmd != SIOCDELTUNNEL) {
-		p.i_flags |= GRE_KEY | VTI_ISVTI;
+		p.i_flags |= GRE_KEY;
 		p.o_flags |= GRE_KEY;
 	}
 
@@ -224,7 +354,6 @@ static int vti_tunnel_init(struct net_device *dev)
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
-	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 
@@ -241,9 +370,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 	iph->ihl		= 5;
 }
 
-static struct xfrm_tunnel_notifier vti_handler __read_mostly = {
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
 	.handler	=	vti_rcv,
-	.priority	=	1,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+	.handler	=	vti_rcv,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+	.handler	=	vti_rcv,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
 };
 
 static int __net_init vti_init_net(struct net *net)
@@ -287,6 +435,8 @@ static void vti_netlink_parms(struct nlattr *data[],
 	if (!data)
 		return;
 
+	parms->i_flags = VTI_ISVTI;
+
 	if (data[IFLA_VTI_LINK])
 		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
 
@@ -382,10 +532,31 @@ static int __init vti_init(void)
 	err = register_pernet_device(&vti_net_ops);
 	if (err < 0)
 		return err;
-	err = xfrm4_mode_tunnel_input_register(&vti_handler);
+	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+	if (err < 0) {
+		unregister_pernet_device(&vti_net_ops);
+		pr_info("vti init: can't register tunnel\n");
+
+		return err;
+	}
+
+	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+	if (err < 0) {
+		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+		unregister_pernet_device(&vti_net_ops);
+		pr_info("vti init: can't register tunnel\n");
+
+		return err;
+	}
+
+	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
 	if (err < 0) {
+		xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
 		unregister_pernet_device(&vti_net_ops);
 		pr_info("vti init: can't register tunnel\n");
+
+		return err;
 	}
 
 	err = rtnl_link_register(&vti_link_ops);
@@ -395,7 +566,9 @@ static int __init vti_init(void)
 	return err;
 
 rtnl_link_failed:
-	xfrm4_mode_tunnel_input_deregister(&vti_handler);
+	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
 	unregister_pernet_device(&vti_net_ops);
 	return err;
 }
@@ -403,8 +576,13 @@ rtnl_link_failed:
 static void __exit vti_fini(void)
 {
 	rtnl_link_unregister(&vti_link_ops);
-	if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
+	if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP))
 		pr_info("vti close: can't deregister tunnel\n");
+	if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))
+		pr_info("vti close: can't deregister tunnel\n");
+	if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
+		pr_info("vti close: can't deregister tunnel\n");
+
 
 	unregister_pernet_device(&vti_net_ops);
 }
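The rewritten vti driver keys everything off skb->mark: vti_input() stamps inbound packets with the tunnel's i_key before the xfrm policy check, and vti_tunnel_xmit() stamps outbound packets with o_key before xfrm_lookup(), so one device can bind a pair of mark-matched IPsec policies (previously only o_key was used, temporarily, on input). A tiny model of that key selection; field names mirror the kernel's, everything else is invented for the sketch:

	#include <stdint.h>

	struct vti_parms {
		uint32_t i_key;		/* mark stamped on received packets */
		uint32_t o_key;		/* mark stamped on transmitted packets */
	};

	/* Returns the mark an skb should carry so the xfrm policy/state
	 * lookup only matches entries configured with that same mark.
	 */
	uint32_t vti_mark(const struct vti_parms *p, int outbound)
	{
		return outbound ? p->o_key : p->i_key;
	}

Note also the fall-through contract: when no vti device matches, vti_input() returns -EINVAL so the priority-0 native ESP/AH/IPCOMP handlers still see the packet.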
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 826be4cb482a..c0855d50a3fa 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -23,7 +23,7 @@
 #include <net/protocol.h>
 #include <net/sock.h>
 
-static void ipcomp4_err(struct sk_buff *skb, u32 info)
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	__be32 spi;
@@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	spi = htonl(ntohs(ipch->cpi));
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      spi, IPPROTO_COMP, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 /* We always hold one tunnel user reference to indicate a tunnel */
@@ -147,6 +149,11 @@ out:
 	return err;
 }
 
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
+
 static const struct xfrm_type ipcomp_type = {
 	.description	= "IPCOMP4",
 	.owner		= THIS_MODULE,
@@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = {
 	.output		= ipcomp_output
 };
 
-static const struct net_protocol ipcomp4_protocol = {
+static struct xfrm4_protocol ipcomp4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	ipcomp4_rcv_cb,
 	.err_handler	=	ipcomp4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init ipcomp4_init(void)
@@ -170,7 +178,7 @@ static int __init ipcomp4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+	if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&ipcomp_type, AF_INET);
 		return -EAGAIN;
@@ -180,7 +188,7 @@ static int __init ipcomp4_init(void)
 
 static void __exit ipcomp4_fini(void)
 {
-	if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3e0adea9c27..7ebd6e37875c 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
 		skb_dst_set(skb, NULL);
 		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
 		if (IS_ERR(dst))
-			return PTR_ERR(dst);;
+			return PTR_ERR(dst);
 		skb_dst_set(skb, dst);
 	}
 #endif
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 59da7cde0724..f95b6f93814b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
 
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 718dfbd30cbe..99e810f84671 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2d11c094296e..f4b19e5dde54 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -727,7 +727,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 	sock_tx_timestamp(sk, &ipc.tx_flags);
 
 	if (msg->msg_controllen) {
-		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
 		if (err)
 			return err;
 		if (ipc.opt)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a6c8a80ec9d6..ad737fad6d8b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
 	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
 	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
 	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
@@ -280,6 +281,11 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
 	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
 	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
 	SNMP_MIB_SENTINEL
 };
 
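proc.c only exports the new LINUX_MIB_* counters (defined elsewhere in the patch set) as TcpExt entries in /proc/net/netstat. A small self-contained reader that prints two of them, relying only on the file's alternating header-line/value-line format:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char names[4096], vals[4096];
		char *np, *vp, *n, *v;
		FILE *f = fopen("/proc/net/netstat", "r");

		if (!f)
			return 1;
		/* the file alternates a header line and a value line per group */
		while (fgets(names, sizeof(names), f) &&
		       fgets(vals, sizeof(vals), f)) {
			if (strncmp(names, "TcpExt:", 7))
				continue;
			n = strtok_r(names, " \n", &np);	/* skip "TcpExt:" */
			v = strtok_r(vals, " \n", &vp);
			while ((n = strtok_r(NULL, " \n", &np)) &&
			       (v = strtok_r(NULL, " \n", &vp)))
				if (!strcmp(n, "TCPSynRetrans") ||
				    !strcmp(n, "TCPOrigDataSent"))
					printf("%s = %s\n", n, v);
			break;
		}
		fclose(f);
		return 0;
	}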
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index c04518f4850a..a9dbe58bdfe7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -524,7 +524,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	ipc.oif = sk->sk_bound_dev_if;
 
 	if (msg->msg_controllen) {
-		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
 		if (err)
 			goto out;
 		if (ipc.opt)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4c011ec69ed4..34d094cadb11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -139,11 +139,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, | |||
139 | struct sk_buff *skb); | 139 | struct sk_buff *skb); |
140 | static void ipv4_dst_destroy(struct dst_entry *dst); | 140 | static void ipv4_dst_destroy(struct dst_entry *dst); |
141 | 141 | ||
142 | static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | ||
143 | int how) | ||
144 | { | ||
145 | } | ||
146 | |||
147 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) | 142 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) |
148 | { | 143 | { |
149 | WARN_ON(1); | 144 | WARN_ON(1); |
@@ -162,7 +157,6 @@ static struct dst_ops ipv4_dst_ops = { | |||
162 | .mtu = ipv4_mtu, | 157 | .mtu = ipv4_mtu, |
163 | .cow_metrics = ipv4_cow_metrics, | 158 | .cow_metrics = ipv4_cow_metrics, |
164 | .destroy = ipv4_dst_destroy, | 159 | .destroy = ipv4_dst_destroy, |
165 | .ifdown = ipv4_dst_ifdown, | ||
166 | .negative_advice = ipv4_negative_advice, | 160 | .negative_advice = ipv4_negative_advice, |
167 | .link_failure = ipv4_link_failure, | 161 | .link_failure = ipv4_link_failure, |
168 | .update_pmtu = ip_rt_update_pmtu, | 162 | .update_pmtu = ip_rt_update_pmtu, |
@@ -194,7 +188,7 @@ const __u8 ip_tos2prio[16] = { | |||
194 | EXPORT_SYMBOL(ip_tos2prio); | 188 | EXPORT_SYMBOL(ip_tos2prio); |
195 | 189 | ||
196 | static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); | 190 | static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); |
197 | #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) | 191 | #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) |
198 | 192 | ||
199 | #ifdef CONFIG_PROC_FS | 193 | #ifdef CONFIG_PROC_FS |
200 | static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) | 194 | static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) |
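raw_cpu_inc() is the unchecked flavour of __this_cpu_inc(): it drops the preemption-safety assertion, not the increment, so the macro stops triggering CONFIG_DEBUG_PREEMPT warnings in paths where a rare lost update to a per-cpu statistics counter is acceptable. A sketch of the trade-off (field name assumed):

    /* Without a preemption guard the read-modify-write may be split
     * across a CPU migration; for best-effort stats that is tolerable. */
    RT_CACHE_STAT_INC(in_slow_tot);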
@@ -697,7 +691,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, | |||
697 | 691 | ||
698 | out_unlock: | 692 | out_unlock: |
699 | spin_unlock_bh(&fnhe_lock); | 693 | spin_unlock_bh(&fnhe_lock); |
700 | return; | ||
701 | } | 694 | } |
702 | 695 | ||
703 | static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, | 696 | static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, |
@@ -2475,11 +2468,6 @@ errout_free: | |||
2475 | goto errout; | 2468 | goto errout; |
2476 | } | 2469 | } |
2477 | 2470 | ||
2478 | int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) | ||
2479 | { | ||
2480 | return skb->len; | ||
2481 | } | ||
2482 | |||
2483 | void ip_rt_multicast_event(struct in_device *in_dev) | 2471 | void ip_rt_multicast_event(struct in_device *in_dev) |
2484 | { | 2472 | { |
2485 | rt_cache_flush(dev_net(in_dev->dev)); | 2473 | rt_cache_flush(dev_net(in_dev->dev)); |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 97c8f5620c43..4bd6d52eeffb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk) | |||
387 | INIT_LIST_HEAD(&tp->tsq_node); | 387 | INIT_LIST_HEAD(&tp->tsq_node); |
388 | 388 | ||
389 | icsk->icsk_rto = TCP_TIMEOUT_INIT; | 389 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
390 | tp->mdev = TCP_TIMEOUT_INIT; | 390 | tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); |
391 | 391 | ||
392 | /* So many TCP implementations out there (incorrectly) count the | 392 | /* So many TCP implementations out there (incorrectly) count the |
393 | * initial SYN frame in their delayed-ACK and congestion control | 393 | * initial SYN frame in their delayed-ACK and congestion control |
@@ -2341,7 +2341,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
2341 | 2341 | ||
2342 | sk->sk_shutdown = 0; | 2342 | sk->sk_shutdown = 0; |
2343 | sock_reset_flag(sk, SOCK_DONE); | 2343 | sock_reset_flag(sk, SOCK_DONE); |
2344 | tp->srtt = 0; | 2344 | tp->srtt_us = 0; |
2345 | if ((tp->write_seq += tp->max_window + 2) == 0) | 2345 | if ((tp->write_seq += tp->max_window + 2) == 0) |
2346 | tp->write_seq = 1; | 2346 | tp->write_seq = 1; |
2347 | icsk->icsk_backoff = 0; | 2347 | icsk->icsk_backoff = 0; |
@@ -2785,8 +2785,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) | |||
2785 | 2785 | ||
2786 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; | 2786 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; |
2787 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; | 2787 | info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; |
2788 | info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; | 2788 | info->tcpi_rtt = tp->srtt_us >> 3; |
2789 | info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; | 2789 | info->tcpi_rttvar = tp->mdev_us >> 2; |
2790 | info->tcpi_snd_ssthresh = tp->snd_ssthresh; | 2790 | info->tcpi_snd_ssthresh = tp->snd_ssthresh; |
2791 | info->tcpi_snd_cwnd = tp->snd_cwnd; | 2791 | info->tcpi_snd_cwnd = tp->snd_cwnd; |
2792 | info->tcpi_advmss = tp->advmss; | 2792 | info->tcpi_advmss = tp->advmss; |
@@ -2796,6 +2796,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) | |||
2796 | info->tcpi_rcv_space = tp->rcvq_space.space; | 2796 | info->tcpi_rcv_space = tp->rcvq_space.space; |
2797 | 2797 | ||
2798 | info->tcpi_total_retrans = tp->total_retrans; | 2798 | info->tcpi_total_retrans = tp->total_retrans; |
2799 | |||
2800 | info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? | ||
2801 | sk->sk_pacing_rate : ~0ULL; | ||
2802 | info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? | ||
2803 | sk->sk_max_pacing_rate : ~0ULL; | ||
2799 | } | 2804 | } |
2800 | EXPORT_SYMBOL_GPL(tcp_get_info); | 2805 | EXPORT_SYMBOL_GPL(tcp_get_info); |
2801 | 2806 | ||
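tcpi_rtt and tcpi_rttvar were already reported in usec; they are now read straight from the usec state instead of being converted from jiffies, and two pacing-rate fields are added, saturating to ~0ULL while unset. A user-space sketch for reading them back, assuming a 3.15-era struct tcp_info in the installed headers:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Print the smoothed RTT and its variance for a connected socket. */
    static void dump_rtt(int fd)
    {
            struct tcp_info ti;
            socklen_t len = sizeof(ti);

            memset(&ti, 0, sizeof(ti));
            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
                    printf("rtt=%u us rttvar=%u us\n",
                           ti.tcpi_rtt, ti.tcpi_rttvar);
    }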
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2388275adb9b..2b9464c93b88 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -361,21 +361,12 @@ u32 tcp_reno_ssthresh(struct sock *sk) | |||
361 | } | 361 | } |
362 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | 362 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); |
363 | 363 | ||
364 | /* Lower bound on congestion window with halving. */ | ||
365 | u32 tcp_reno_min_cwnd(const struct sock *sk) | ||
366 | { | ||
367 | const struct tcp_sock *tp = tcp_sk(sk); | ||
368 | return tp->snd_ssthresh/2; | ||
369 | } | ||
370 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); | ||
371 | |||
372 | struct tcp_congestion_ops tcp_reno = { | 364 | struct tcp_congestion_ops tcp_reno = { |
373 | .flags = TCP_CONG_NON_RESTRICTED, | 365 | .flags = TCP_CONG_NON_RESTRICTED, |
374 | .name = "reno", | 366 | .name = "reno", |
375 | .owner = THIS_MODULE, | 367 | .owner = THIS_MODULE, |
376 | .ssthresh = tcp_reno_ssthresh, | 368 | .ssthresh = tcp_reno_ssthresh, |
377 | .cong_avoid = tcp_reno_cong_avoid, | 369 | .cong_avoid = tcp_reno_cong_avoid, |
378 | .min_cwnd = tcp_reno_min_cwnd, | ||
379 | }; | 370 | }; |
380 | 371 | ||
381 | /* Initial congestion control used (until SYN) | 372 | /* Initial congestion control used (until SYN) |
@@ -387,6 +378,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = { | |||
387 | .owner = THIS_MODULE, | 378 | .owner = THIS_MODULE, |
388 | .ssthresh = tcp_reno_ssthresh, | 379 | .ssthresh = tcp_reno_ssthresh, |
389 | .cong_avoid = tcp_reno_cong_avoid, | 380 | .cong_avoid = tcp_reno_cong_avoid, |
390 | .min_cwnd = tcp_reno_min_cwnd, | ||
391 | }; | 381 | }; |
392 | EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); | 382 | EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); |
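With .min_cwnd gone from tcp_congestion_ops, nothing in the stack asks a congestion module for a cwnd floor any more; the deleted reno helper simply returned snd_ssthresh/2. A minimal module now registers only ssthresh and cong_avoid, along the lines of this sketch (hypothetical module name):

    static struct tcp_congestion_ops tcp_example __read_mostly = {
            .flags          = TCP_CONG_NON_RESTRICTED,
            .name           = "example",
            .owner          = THIS_MODULE,
            .ssthresh       = tcp_reno_ssthresh,
            .cong_avoid     = tcp_reno_cong_avoid,
    };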
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 828e4c3ffbaf..8bf224516ba2 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
@@ -476,10 +476,6 @@ static int __init cubictcp_register(void) | |||
476 | /* divide by bic_scale and by constant Srtt (100ms) */ | 476 | /* divide by bic_scale and by constant Srtt (100ms) */ |
477 | do_div(cube_factor, bic_scale * 10); | 477 | do_div(cube_factor, bic_scale * 10); |
478 | 478 | ||
479 | /* hystart needs ms clock resolution */ | ||
480 | if (hystart && HZ < 1000) | ||
481 | cubictcp.flags |= TCP_CONG_RTT_STAMP; | ||
482 | |||
483 | return tcp_register_congestion_control(&cubictcp); | 479 | return tcp_register_congestion_control(&cubictcp); |
484 | } | 480 | } |
485 | 481 | ||
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8ed9305dfdf4..8b9e7bad77c0 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = { | |||
162 | .init = hstcp_init, | 162 | .init = hstcp_init, |
163 | .ssthresh = hstcp_ssthresh, | 163 | .ssthresh = hstcp_ssthresh, |
164 | .cong_avoid = hstcp_cong_avoid, | 164 | .cong_avoid = hstcp_cong_avoid, |
165 | .min_cwnd = tcp_reno_min_cwnd, | ||
166 | 165 | ||
167 | .owner = THIS_MODULE, | 166 | .owner = THIS_MODULE, |
168 | .name = "highspeed" | 167 | .name = "highspeed" |
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 478fe82611bf..a15a799bf768 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c | |||
@@ -21,7 +21,7 @@ struct hybla { | |||
21 | u32 rho2; /* Rho * Rho, integer part */ | 21 | u32 rho2; /* Rho * Rho, integer part */ |
22 | u32 rho_3ls; /* Rho parameter, <<3 */ | 22 | u32 rho_3ls; /* Rho parameter, <<3 */ |
23 | u32 rho2_7ls; /* Rho^2, <<7 */ | 23 | u32 rho2_7ls; /* Rho^2, <<7 */ |
24 | u32 minrtt; /* Minimum smoothed round trip time value seen */ | 24 | u32 minrtt_us; /* Minimum smoothed round trip time value seen */ |
25 | }; | 25 | }; |
26 | 26 | ||
27 | /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ | 27 | /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ |
@@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk) | |||
35 | { | 35 | { |
36 | struct hybla *ca = inet_csk_ca(sk); | 36 | struct hybla *ca = inet_csk_ca(sk); |
37 | 37 | ||
38 | ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); | 38 | ca->rho_3ls = max_t(u32, |
39 | tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), | ||
40 | 8U); | ||
39 | ca->rho = ca->rho_3ls >> 3; | 41 | ca->rho = ca->rho_3ls >> 3; |
40 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; | 42 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; |
41 | ca->rho2 = ca->rho2_7ls >> 7; | 43 | ca->rho2 = ca->rho2_7ls >> 7; |
@@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk) | |||
59 | hybla_recalc_param(sk); | 61 | hybla_recalc_param(sk); |
60 | 62 | ||
61 | /* set minimum rtt as this is the 1st ever seen */ | 63 | /* set minimum rtt as this is the 1st ever seen */ |
62 | ca->minrtt = tp->srtt; | 64 | ca->minrtt_us = tp->srtt_us; |
63 | tp->snd_cwnd = ca->rho; | 65 | tp->snd_cwnd = ca->rho; |
64 | } | 66 | } |
65 | 67 | ||
@@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, | |||
94 | int is_slowstart = 0; | 96 | int is_slowstart = 0; |
95 | 97 | ||
96 | /* Recalculate rho only if this srtt is the lowest */ | 98 | /* Recalculate rho only if this srtt is the lowest */ |
97 | if (tp->srtt < ca->minrtt){ | 99 | if (tp->srtt_us < ca->minrtt_us) { |
98 | hybla_recalc_param(sk); | 100 | hybla_recalc_param(sk); |
99 | ca->minrtt = tp->srtt; | 101 | ca->minrtt_us = tp->srtt_us; |
100 | } | 102 | } |
101 | 103 | ||
102 | if (!tcp_is_cwnd_limited(sk, in_flight)) | 104 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
@@ -166,7 +168,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, | |||
166 | static struct tcp_congestion_ops tcp_hybla __read_mostly = { | 168 | static struct tcp_congestion_ops tcp_hybla __read_mostly = { |
167 | .init = hybla_init, | 169 | .init = hybla_init, |
168 | .ssthresh = tcp_reno_ssthresh, | 170 | .ssthresh = tcp_reno_ssthresh, |
169 | .min_cwnd = tcp_reno_min_cwnd, | ||
170 | .cong_avoid = hybla_cong_avoid, | 171 | .cong_avoid = hybla_cong_avoid, |
171 | .set_state = hybla_state, | 172 | .set_state = hybla_state, |
172 | 173 | ||
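The conversion preserves Hybla's fixed point: tp->srtt_us keeps the smoothed RTT left-shifted by 3, and rho_3ls is likewise rho << 3, so dividing by rtt0 expressed in usec leaves the ratio intact. Worked numbers for an assumed 200 ms path against the 25 ms reference RTT:

    /* srtt_us = 200000 << 3 = 1600000
     * rho_3ls = max(1600000 / (25 * USEC_PER_MSEC), 8) = 64
     * rho     = 64 >> 3 = 8, i.e. RTT/RTT0 = 200/25 as Hybla defines it
     */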
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index e498a62b8f97..863d105e3015 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c | |||
@@ -325,10 +325,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, | |||
325 | } | 325 | } |
326 | 326 | ||
327 | static struct tcp_congestion_ops tcp_illinois __read_mostly = { | 327 | static struct tcp_congestion_ops tcp_illinois __read_mostly = { |
328 | .flags = TCP_CONG_RTT_STAMP, | ||
329 | .init = tcp_illinois_init, | 328 | .init = tcp_illinois_init, |
330 | .ssthresh = tcp_illinois_ssthresh, | 329 | .ssthresh = tcp_illinois_ssthresh, |
331 | .min_cwnd = tcp_reno_min_cwnd, | ||
332 | .cong_avoid = tcp_illinois_cong_avoid, | 330 | .cong_avoid = tcp_illinois_cong_avoid, |
333 | .set_state = tcp_illinois_state, | 331 | .set_state = tcp_illinois_state, |
334 | .get_info = tcp_illinois_info, | 332 | .get_info = tcp_illinois_info, |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeaac399420d..e1661f46fd19 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -667,11 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) | |||
667 | * To save cycles in the RFC 1323 implementation it was better to break | 667 | * To save cycles in the RFC 1323 implementation it was better to break |
668 | * it up into three procedures. -- erics | 668 | * it up into three procedures. -- erics |
669 | */ | 669 | */ |
670 | static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | 670 | static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) |
671 | { | 671 | { |
672 | struct tcp_sock *tp = tcp_sk(sk); | 672 | struct tcp_sock *tp = tcp_sk(sk); |
673 | long m = mrtt; /* RTT */ | 673 | long m = mrtt_us; /* RTT */ |
674 | u32 srtt = tp->srtt; | 674 | u32 srtt = tp->srtt_us; |
675 | 675 | ||
676 | /* The following amusing code comes from Jacobson's | 676 | /* The following amusing code comes from Jacobson's |
677 | * article in SIGCOMM '88. Note that rtt and mdev | 677 | * article in SIGCOMM '88. Note that rtt and mdev |
@@ -694,7 +694,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
694 | srtt += m; /* rtt = 7/8 rtt + 1/8 new */ | 694 | srtt += m; /* rtt = 7/8 rtt + 1/8 new */ |
695 | if (m < 0) { | 695 | if (m < 0) { |
696 | m = -m; /* m is now abs(error) */ | 696 | m = -m; /* m is now abs(error) */ |
697 | m -= (tp->mdev >> 2); /* similar update on mdev */ | 697 | m -= (tp->mdev_us >> 2); /* similar update on mdev */ |
698 | /* This is similar to one of Eifel findings. | 698 | /* This is similar to one of Eifel findings. |
699 | * Eifel blocks mdev updates when rtt decreases. | 699 | * Eifel blocks mdev updates when rtt decreases. |
700 | * This solution is a bit different: we use finer gain | 700 | * This solution is a bit different: we use finer gain |
@@ -706,28 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
706 | if (m > 0) | 706 | if (m > 0) |
707 | m >>= 3; | 707 | m >>= 3; |
708 | } else { | 708 | } else { |
709 | m -= (tp->mdev >> 2); /* similar update on mdev */ | 709 | m -= (tp->mdev_us >> 2); /* similar update on mdev */ |
710 | } | 710 | } |
711 | tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ | 711 | tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ |
712 | if (tp->mdev > tp->mdev_max) { | 712 | if (tp->mdev_us > tp->mdev_max_us) { |
713 | tp->mdev_max = tp->mdev; | 713 | tp->mdev_max_us = tp->mdev_us; |
714 | if (tp->mdev_max > tp->rttvar) | 714 | if (tp->mdev_max_us > tp->rttvar_us) |
715 | tp->rttvar = tp->mdev_max; | 715 | tp->rttvar_us = tp->mdev_max_us; |
716 | } | 716 | } |
717 | if (after(tp->snd_una, tp->rtt_seq)) { | 717 | if (after(tp->snd_una, tp->rtt_seq)) { |
718 | if (tp->mdev_max < tp->rttvar) | 718 | if (tp->mdev_max_us < tp->rttvar_us) |
719 | tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; | 719 | tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; |
720 | tp->rtt_seq = tp->snd_nxt; | 720 | tp->rtt_seq = tp->snd_nxt; |
721 | tp->mdev_max = tcp_rto_min(sk); | 721 | tp->mdev_max_us = tcp_rto_min_us(sk); |
722 | } | 722 | } |
723 | } else { | 723 | } else { |
724 | /* no previous measure. */ | 724 | /* no previous measure. */ |
725 | srtt = m << 3; /* take the measured time to be rtt */ | 725 | srtt = m << 3; /* take the measured time to be rtt */ |
726 | tp->mdev = m << 1; /* make sure rto = 3*rtt */ | 726 | tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ |
727 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | 727 | tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); |
728 | tp->mdev_max_us = tp->rttvar_us; | ||
728 | tp->rtt_seq = tp->snd_nxt; | 729 | tp->rtt_seq = tp->snd_nxt; |
729 | } | 730 | } |
730 | tp->srtt = max(1U, srtt); | 731 | tp->srtt_us = max(1U, srtt); |
731 | } | 732 | } |
732 | 733 | ||
733 | /* Set the sk_pacing_rate to allow proper sizing of TSO packets. | 734 | /* Set the sk_pacing_rate to allow proper sizing of TSO packets. |
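Algorithmically the estimator is untouched (Jacobson/Karels, srtt kept << 3, mdev feeding rttvar); only the unit changes from jiffies to usec, so HZ=100 kernels stop quantizing RTT samples to 10 ms. A simplified user-space transcription under the same fixed point, omitting the finer-gain handling the kernel applies when the RTT is decreasing:

    static long srtt8;      /* smoothed RTT << 3, in usec */
    static long mdev_us;    /* mean deviation, kernel scaling */

    static void rtt_sample(long mrtt_us)
    {
            long m = mrtt_us;

            if (srtt8) {
                    m -= srtt8 >> 3;        /* error vs current estimate */
                    srtt8 += m;             /* srtt = 7/8 srtt + 1/8 new */
                    if (m < 0)
                            m = -m;
                    m -= mdev_us >> 2;
                    mdev_us += m;           /* mdev = 3/4 mdev + 1/4 new */
            } else {
                    srtt8 = m << 3;         /* first sample becomes srtt */
                    mdev_us = m << 1;       /* gives an initial rto ~3*rtt */
            }
            if (srtt8 < 1)
                    srtt8 = 1;              /* srtt of 0 means "unset" */
    }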
@@ -742,20 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk) | |||
742 | u64 rate; | 743 | u64 rate; |
743 | 744 | ||
744 | /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ | 745 | /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ |
745 | rate = (u64)tp->mss_cache * 2 * (HZ << 3); | 746 | rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); |
746 | 747 | ||
747 | rate *= max(tp->snd_cwnd, tp->packets_out); | 748 | rate *= max(tp->snd_cwnd, tp->packets_out); |
748 | 749 | ||
749 | /* Correction for small srtt and scheduling constraints. | 750 | if (likely(tp->srtt_us)) |
750 | * For small rtt, consider noise is too high, and use | 751 | do_div(rate, tp->srtt_us); |
751 | * the minimal value (srtt = 1 -> 125 us for HZ=1000) | ||
752 | * | ||
753 | * We probably need usec resolution in the future. | ||
754 | * Note: This also takes care of possible srtt=0 case, | ||
755 | * when tcp_rtt_estimator() was not yet called. | ||
756 | */ | ||
757 | if (tp->srtt > 8 + 2) | ||
758 | do_div(rate, tp->srtt); | ||
759 | 752 | ||
760 | /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate | 753 | /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate |
761 | * without any lock. We want to make sure compiler wont store | 754 | * without any lock. We want to make sure compiler wont store |
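With usec resolution the old small-srtt fudge (srtt > 8 + 2) is unnecessary; only srtt_us == 0, meaning no sample yet, is guarded. Worked numbers assuming mss_cache = 1448, snd_cwnd = 10 and a 100 ms smoothed RTT (stored as srtt_us = 800000 because of the << 3):

    /* rate  = 1448 * 2 * (USEC_PER_SEC << 3)  = 23,168,000,000
     * rate *= max(snd_cwnd, packets_out) = 10 -> 231,680,000,000
     * do_div(rate, 800000)                    -> 289,600 bytes/sec,
     * i.e. exactly 200% of cwnd * mss per 100 ms round trip; the << 3
     * in the numerator cancels the << 3 stored in srtt_us.
     */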
@@ -1122,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, | |||
1122 | } | 1115 | } |
1123 | 1116 | ||
1124 | struct tcp_sacktag_state { | 1117 | struct tcp_sacktag_state { |
1125 | int reord; | 1118 | int reord; |
1126 | int fack_count; | 1119 | int fack_count; |
1127 | int flag; | 1120 | long rtt_us; /* RTT measured by SACKing never-retransmitted data */ |
1128 | s32 rtt; /* RTT measured by SACKing never-retransmitted data */ | 1121 | int flag; |
1129 | }; | 1122 | }; |
1130 | 1123 | ||
1131 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, | 1124 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
@@ -1186,7 +1179,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | |||
1186 | static u8 tcp_sacktag_one(struct sock *sk, | 1179 | static u8 tcp_sacktag_one(struct sock *sk, |
1187 | struct tcp_sacktag_state *state, u8 sacked, | 1180 | struct tcp_sacktag_state *state, u8 sacked, |
1188 | u32 start_seq, u32 end_seq, | 1181 | u32 start_seq, u32 end_seq, |
1189 | int dup_sack, int pcount, u32 xmit_time) | 1182 | int dup_sack, int pcount, |
1183 | const struct skb_mstamp *xmit_time) | ||
1190 | { | 1184 | { |
1191 | struct tcp_sock *tp = tcp_sk(sk); | 1185 | struct tcp_sock *tp = tcp_sk(sk); |
1192 | int fack_count = state->fack_count; | 1186 | int fack_count = state->fack_count; |
@@ -1227,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1227 | if (!after(end_seq, tp->high_seq)) | 1221 | if (!after(end_seq, tp->high_seq)) |
1228 | state->flag |= FLAG_ORIG_SACK_ACKED; | 1222 | state->flag |= FLAG_ORIG_SACK_ACKED; |
1229 | /* Pick the earliest sequence sacked for RTT */ | 1223 | /* Pick the earliest sequence sacked for RTT */ |
1230 | if (state->rtt < 0) | 1224 | if (state->rtt_us < 0) { |
1231 | state->rtt = tcp_time_stamp - xmit_time; | 1225 | struct skb_mstamp now; |
1226 | |||
1227 | skb_mstamp_get(&now); | ||
1228 | state->rtt_us = skb_mstamp_us_delta(&now, | ||
1229 | xmit_time); | ||
1230 | } | ||
1232 | } | 1231 | } |
1233 | 1232 | ||
1234 | if (sacked & TCPCB_LOST) { | 1233 | if (sacked & TCPCB_LOST) { |
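The SACK-based RTT sample now comes from the usec stamp taken at transmit time rather than the jiffies-based TCP_SKB_CB(skb)->when, and is still taken at most once per ACK, from the earliest newly SACKed, never-retransmitted segment. The helper pair, as this hunk uses it:

    struct skb_mstamp now;

    skb_mstamp_get(&now);                         /* usec clock snapshot */
    state->rtt_us = skb_mstamp_us_delta(&now,     /* (now - xmit), usec  */
                                        xmit_time);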
@@ -1287,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1287 | */ | 1286 | */ |
1288 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, | 1287 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
1289 | start_seq, end_seq, dup_sack, pcount, | 1288 | start_seq, end_seq, dup_sack, pcount, |
1290 | TCP_SKB_CB(skb)->when); | 1289 | &skb->skb_mstamp); |
1291 | 1290 | ||
1292 | if (skb == tp->lost_skb_hint) | 1291 | if (skb == tp->lost_skb_hint) |
1293 | tp->lost_cnt_hint += pcount; | 1292 | tp->lost_cnt_hint += pcount; |
@@ -1565,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1565 | TCP_SKB_CB(skb)->end_seq, | 1564 | TCP_SKB_CB(skb)->end_seq, |
1566 | dup_sack, | 1565 | dup_sack, |
1567 | tcp_skb_pcount(skb), | 1566 | tcp_skb_pcount(skb), |
1568 | TCP_SKB_CB(skb)->when); | 1567 | &skb->skb_mstamp); |
1569 | 1568 | ||
1570 | if (!before(TCP_SKB_CB(skb)->seq, | 1569 | if (!before(TCP_SKB_CB(skb)->seq, |
1571 | tcp_highest_sack_seq(tp))) | 1570 | tcp_highest_sack_seq(tp))) |
@@ -1622,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl | |||
1622 | 1621 | ||
1623 | static int | 1622 | static int |
1624 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | 1623 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, |
1625 | u32 prior_snd_una, s32 *sack_rtt) | 1624 | u32 prior_snd_una, long *sack_rtt_us) |
1626 | { | 1625 | { |
1627 | struct tcp_sock *tp = tcp_sk(sk); | 1626 | struct tcp_sock *tp = tcp_sk(sk); |
1628 | const unsigned char *ptr = (skb_transport_header(ack_skb) + | 1627 | const unsigned char *ptr = (skb_transport_header(ack_skb) + |
@@ -1640,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1640 | 1639 | ||
1641 | state.flag = 0; | 1640 | state.flag = 0; |
1642 | state.reord = tp->packets_out; | 1641 | state.reord = tp->packets_out; |
1643 | state.rtt = -1; | 1642 | state.rtt_us = -1L; |
1644 | 1643 | ||
1645 | if (!tp->sacked_out) { | 1644 | if (!tp->sacked_out) { |
1646 | if (WARN_ON(tp->fackets_out)) | 1645 | if (WARN_ON(tp->fackets_out)) |
@@ -1824,7 +1823,7 @@ out: | |||
1824 | WARN_ON((int)tp->retrans_out < 0); | 1823 | WARN_ON((int)tp->retrans_out < 0); |
1825 | WARN_ON((int)tcp_packets_in_flight(tp) < 0); | 1824 | WARN_ON((int)tcp_packets_in_flight(tp) < 0); |
1826 | #endif | 1825 | #endif |
1827 | *sack_rtt = state.rtt; | 1826 | *sack_rtt_us = state.rtt_us; |
1828 | return state.flag; | 1827 | return state.flag; |
1829 | } | 1828 | } |
1830 | 1829 | ||
@@ -2035,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) | |||
2035 | * available, or RTO is scheduled to fire first. | 2034 | * available, or RTO is scheduled to fire first. |
2036 | */ | 2035 | */ |
2037 | if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || | 2036 | if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || |
2038 | (flag & FLAG_ECE) || !tp->srtt) | 2037 | (flag & FLAG_ECE) || !tp->srtt_us) |
2039 | return false; | 2038 | return false; |
2040 | 2039 | ||
2041 | delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); | 2040 | delay = max(usecs_to_jiffies(tp->srtt_us >> 5), |
2041 | msecs_to_jiffies(2)); | ||
2042 | |||
2042 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) | 2043 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) |
2043 | return false; | 2044 | return false; |
2044 | 2045 | ||
@@ -2885,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, | |||
2885 | } | 2886 | } |
2886 | 2887 | ||
2887 | static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, | 2888 | static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, |
2888 | s32 seq_rtt, s32 sack_rtt) | 2889 | long seq_rtt_us, long sack_rtt_us) |
2889 | { | 2890 | { |
2890 | const struct tcp_sock *tp = tcp_sk(sk); | 2891 | const struct tcp_sock *tp = tcp_sk(sk); |
2891 | 2892 | ||
@@ -2895,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, | |||
2895 | * is acked (RFC6298). | 2896 | * is acked (RFC6298). |
2896 | */ | 2897 | */ |
2897 | if (flag & FLAG_RETRANS_DATA_ACKED) | 2898 | if (flag & FLAG_RETRANS_DATA_ACKED) |
2898 | seq_rtt = -1; | 2899 | seq_rtt_us = -1L; |
2899 | 2900 | ||
2900 | if (seq_rtt < 0) | 2901 | if (seq_rtt_us < 0) |
2901 | seq_rtt = sack_rtt; | 2902 | seq_rtt_us = sack_rtt_us; |
2902 | 2903 | ||
2903 | /* RTTM Rule: A TSecr value received in a segment is used to | 2904 | /* RTTM Rule: A TSecr value received in a segment is used to |
2904 | * update the averaged RTT measurement only if the segment | 2905 | * update the averaged RTT measurement only if the segment |
@@ -2906,14 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, | |||
2906 | * left edge of the send window. | 2907 | * left edge of the send window. |
2907 | * See draft-ietf-tcplw-high-performance-00, section 3.3. | 2908 | * See draft-ietf-tcplw-high-performance-00, section 3.3. |
2908 | */ | 2909 | */ |
2909 | if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 2910 | if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
2910 | flag & FLAG_ACKED) | 2911 | flag & FLAG_ACKED) |
2911 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 2912 | seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); |
2912 | 2913 | ||
2913 | if (seq_rtt < 0) | 2914 | if (seq_rtt_us < 0) |
2914 | return false; | 2915 | return false; |
2915 | 2916 | ||
2916 | tcp_rtt_estimator(sk, seq_rtt); | 2917 | tcp_rtt_estimator(sk, seq_rtt_us); |
2917 | tcp_set_rto(sk); | 2918 | tcp_set_rto(sk); |
2918 | 2919 | ||
2919 | /* RFC6298: only reset backoff on valid RTT measurement. */ | 2920 | /* RFC6298: only reset backoff on valid RTT measurement. */ |
@@ -2925,16 +2926,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, | |||
2925 | static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) | 2926 | static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) |
2926 | { | 2927 | { |
2927 | struct tcp_sock *tp = tcp_sk(sk); | 2928 | struct tcp_sock *tp = tcp_sk(sk); |
2928 | s32 seq_rtt = -1; | 2929 | long seq_rtt_us = -1L; |
2929 | 2930 | ||
2930 | if (synack_stamp && !tp->total_retrans) | 2931 | if (synack_stamp && !tp->total_retrans) |
2931 | seq_rtt = tcp_time_stamp - synack_stamp; | 2932 | seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); |
2932 | 2933 | ||
2933 | /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets | 2934 | /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets |
2934 | * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() | 2935 | * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() |
2935 | */ | 2936 | */ |
2936 | if (!tp->srtt) | 2937 | if (!tp->srtt_us) |
2937 | tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); | 2938 | tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); |
2938 | } | 2939 | } |
2939 | 2940 | ||
2940 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) | 2941 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) |
@@ -3023,26 +3024,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) | |||
3023 | * arrived at the other end. | 3024 | * arrived at the other end. |
3024 | */ | 3025 | */ |
3025 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | 3026 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, |
3026 | u32 prior_snd_una, s32 sack_rtt) | 3027 | u32 prior_snd_una, long sack_rtt_us) |
3027 | { | 3028 | { |
3028 | struct tcp_sock *tp = tcp_sk(sk); | ||
3029 | const struct inet_connection_sock *icsk = inet_csk(sk); | 3029 | const struct inet_connection_sock *icsk = inet_csk(sk); |
3030 | struct sk_buff *skb; | 3030 | struct skb_mstamp first_ackt, last_ackt, now; |
3031 | u32 now = tcp_time_stamp; | 3031 | struct tcp_sock *tp = tcp_sk(sk); |
3032 | u32 prior_sacked = tp->sacked_out; | ||
3033 | u32 reord = tp->packets_out; | ||
3032 | bool fully_acked = true; | 3034 | bool fully_acked = true; |
3033 | int flag = 0; | 3035 | long ca_seq_rtt_us = -1L; |
3036 | long seq_rtt_us = -1L; | ||
3037 | struct sk_buff *skb; | ||
3034 | u32 pkts_acked = 0; | 3038 | u32 pkts_acked = 0; |
3035 | u32 reord = tp->packets_out; | ||
3036 | u32 prior_sacked = tp->sacked_out; | ||
3037 | s32 seq_rtt = -1; | ||
3038 | s32 ca_seq_rtt = -1; | ||
3039 | ktime_t last_ackt = net_invalid_timestamp(); | ||
3040 | bool rtt_update; | 3039 | bool rtt_update; |
3040 | int flag = 0; | ||
3041 | |||
3042 | first_ackt.v64 = 0; | ||
3041 | 3043 | ||
3042 | while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { | 3044 | while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { |
3043 | struct tcp_skb_cb *scb = TCP_SKB_CB(skb); | 3045 | struct tcp_skb_cb *scb = TCP_SKB_CB(skb); |
3044 | u32 acked_pcount; | ||
3045 | u8 sacked = scb->sacked; | 3046 | u8 sacked = scb->sacked; |
3047 | u32 acked_pcount; | ||
3046 | 3048 | ||
3047 | /* Determine how many packets and what bytes were acked, tso and else */ | 3049 | /* Determine how many packets and what bytes were acked, tso and else */ |
3048 | if (after(scb->end_seq, tp->snd_una)) { | 3050 | if (after(scb->end_seq, tp->snd_una)) { |
@@ -3064,11 +3066,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3064 | tp->retrans_out -= acked_pcount; | 3066 | tp->retrans_out -= acked_pcount; |
3065 | flag |= FLAG_RETRANS_DATA_ACKED; | 3067 | flag |= FLAG_RETRANS_DATA_ACKED; |
3066 | } else { | 3068 | } else { |
3067 | ca_seq_rtt = now - scb->when; | 3069 | last_ackt = skb->skb_mstamp; |
3068 | last_ackt = skb->tstamp; | 3070 | WARN_ON_ONCE(last_ackt.v64 == 0); |
3069 | if (seq_rtt < 0) { | 3071 | if (!first_ackt.v64) |
3070 | seq_rtt = ca_seq_rtt; | 3072 | first_ackt = last_ackt; |
3071 | } | 3073 | |
3072 | if (!(sacked & TCPCB_SACKED_ACKED)) | 3074 | if (!(sacked & TCPCB_SACKED_ACKED)) |
3073 | reord = min(pkts_acked, reord); | 3075 | reord = min(pkts_acked, reord); |
3074 | if (!after(scb->end_seq, tp->high_seq)) | 3076 | if (!after(scb->end_seq, tp->high_seq)) |
@@ -3114,7 +3116,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3114 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | 3116 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
3115 | flag |= FLAG_SACK_RENEGING; | 3117 | flag |= FLAG_SACK_RENEGING; |
3116 | 3118 | ||
3117 | rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); | 3119 | skb_mstamp_get(&now); |
3120 | if (first_ackt.v64) { | ||
3121 | seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); | ||
3122 | ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); | ||
3123 | } | ||
3124 | |||
3125 | rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); | ||
3118 | 3126 | ||
3119 | if (flag & FLAG_ACKED) { | 3127 | if (flag & FLAG_ACKED) { |
3120 | const struct tcp_congestion_ops *ca_ops | 3128 | const struct tcp_congestion_ops *ca_ops |
@@ -3142,25 +3150,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3142 | 3150 | ||
3143 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); | 3151 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); |
3144 | 3152 | ||
3145 | if (ca_ops->pkts_acked) { | 3153 | if (ca_ops->pkts_acked) |
3146 | s32 rtt_us = -1; | 3154 | ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); |
3147 | |||
3148 | /* Is the ACK triggering packet unambiguous? */ | ||
3149 | if (!(flag & FLAG_RETRANS_DATA_ACKED)) { | ||
3150 | /* High resolution needed and available? */ | ||
3151 | if (ca_ops->flags & TCP_CONG_RTT_STAMP && | ||
3152 | !ktime_equal(last_ackt, | ||
3153 | net_invalid_timestamp())) | ||
3154 | rtt_us = ktime_us_delta(ktime_get_real(), | ||
3155 | last_ackt); | ||
3156 | else if (ca_seq_rtt >= 0) | ||
3157 | rtt_us = jiffies_to_usecs(ca_seq_rtt); | ||
3158 | } | ||
3159 | 3155 | ||
3160 | ca_ops->pkts_acked(sk, pkts_acked, rtt_us); | 3156 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
3161 | } | 3157 | sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { |
3162 | } else if (skb && rtt_update && sack_rtt >= 0 && | ||
3163 | sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { | ||
3164 | /* Do not re-arm RTO if the sack RTT is measured from data sent | 3158 | /* Do not re-arm RTO if the sack RTT is measured from data sent |
3165 | * after when the head was last (re)transmitted. Otherwise the | 3159 | * after when the head was last (re)transmitted. Otherwise the |
3166 | * timeout may continue to extend in loss recovery. | 3160 | * timeout may continue to extend in loss recovery. |
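first_ackt/last_ackt replace the old seq_rtt/ca_seq_rtt/ktime trio. A summary of the new bookkeeping in tcp_clean_rtx_queue():

    /* first_ackt: transmit stamp of the oldest newly acked, never-
     *             retransmitted skb -> seq_rtt_us -> RTO estimator
     * last_ackt:  stamp of the newest such skb -> ca_seq_rtt_us ->
     *             ca_ops->pkts_acked(), now always in usec, which is
     *             why TCP_CONG_RTT_STAMP and the ktime fallback go away
     * Retransmitted skbs still contribute no sample (Karn's rule).
     */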
@@ -3370,12 +3364,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3370 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3364 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
3371 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3365 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
3372 | bool is_dupack = false; | 3366 | bool is_dupack = false; |
3373 | u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; | 3367 | u32 prior_in_flight; |
3374 | u32 prior_fackets; | 3368 | u32 prior_fackets; |
3375 | int prior_packets = tp->packets_out; | 3369 | int prior_packets = tp->packets_out; |
3376 | const int prior_unsacked = tp->packets_out - tp->sacked_out; | 3370 | const int prior_unsacked = tp->packets_out - tp->sacked_out; |
3377 | int acked = 0; /* Number of packets newly acked */ | 3371 | int acked = 0; /* Number of packets newly acked */ |
3378 | s32 sack_rtt = -1; | 3372 | long sack_rtt_us = -1L; |
3379 | 3373 | ||
3380 | /* If the ack is older than previous acks | 3374 | /* If the ack is older than previous acks |
3381 | * then we can probably ignore it. | 3375 | * then we can probably ignore it. |
@@ -3433,7 +3427,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3433 | 3427 | ||
3434 | if (TCP_SKB_CB(skb)->sacked) | 3428 | if (TCP_SKB_CB(skb)->sacked) |
3435 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3429 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3436 | &sack_rtt); | 3430 | &sack_rtt_us); |
3437 | 3431 | ||
3438 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) | 3432 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) |
3439 | flag |= FLAG_ECE; | 3433 | flag |= FLAG_ECE; |
@@ -3452,7 +3446,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3452 | 3446 | ||
3453 | /* See if we can take anything off of the retransmit queue. */ | 3447 | /* See if we can take anything off of the retransmit queue. */ |
3454 | acked = tp->packets_out; | 3448 | acked = tp->packets_out; |
3455 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); | 3449 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, |
3450 | sack_rtt_us); | ||
3456 | acked -= tp->packets_out; | 3451 | acked -= tp->packets_out; |
3457 | 3452 | ||
3458 | /* Advance cwnd if state allows */ | 3453 | /* Advance cwnd if state allows */ |
@@ -3475,8 +3470,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3475 | 3470 | ||
3476 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) | 3471 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) |
3477 | tcp_schedule_loss_probe(sk); | 3472 | tcp_schedule_loss_probe(sk); |
3478 | if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) | 3473 | tcp_update_pacing_rate(sk); |
3479 | tcp_update_pacing_rate(sk); | ||
3480 | return 1; | 3474 | return 1; |
3481 | 3475 | ||
3482 | no_queue: | 3476 | no_queue: |
@@ -3505,7 +3499,7 @@ old_ack: | |||
3505 | */ | 3499 | */ |
3506 | if (TCP_SKB_CB(skb)->sacked) { | 3500 | if (TCP_SKB_CB(skb)->sacked) { |
3507 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3501 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3508 | &sack_rtt); | 3502 | &sack_rtt_us); |
3509 | tcp_fastretrans_alert(sk, acked, prior_unsacked, | 3503 | tcp_fastretrans_alert(sk, acked, prior_unsacked, |
3510 | is_dupack, flag); | 3504 | is_dupack, flag); |
3511 | } | 3505 | } |
@@ -5401,9 +5395,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | |||
5401 | break; | 5395 | break; |
5402 | } | 5396 | } |
5403 | tcp_rearm_rto(sk); | 5397 | tcp_rearm_rto(sk); |
5398 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); | ||
5404 | return true; | 5399 | return true; |
5405 | } | 5400 | } |
5406 | tp->syn_data_acked = tp->syn_data; | 5401 | tp->syn_data_acked = tp->syn_data; |
5402 | if (tp->syn_data_acked) | ||
5403 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); | ||
5407 | return false; | 5404 | return false; |
5408 | } | 5405 | } |
5409 | 5406 | ||
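On the client side the Fast Open accounting becomes symmetric: TCPFastOpenActiveFail is bumped when the SYN carried data but the SYNACK failed to acknowledge it (forcing that data to be retransmitted), while TCPFastOpenActive is now counted only when syn_data_acked is set, i.e. for handshakes whose SYN data was actually accepted rather than merely attempted.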
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e4eac779f51..6379894ec210 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -435,7 +435,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
435 | break; | 435 | break; |
436 | 436 | ||
437 | icsk->icsk_backoff--; | 437 | icsk->icsk_backoff--; |
438 | inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : | 438 | inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : |
439 | TCP_TIMEOUT_INIT) << icsk->icsk_backoff; | 439 | TCP_TIMEOUT_INIT) << icsk->icsk_backoff; |
440 | tcp_bound_rto(sk); | 440 | tcp_bound_rto(sk); |
441 | 441 | ||
@@ -854,8 +854,10 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) | |||
854 | { | 854 | { |
855 | int res = tcp_v4_send_synack(sk, NULL, req, 0); | 855 | int res = tcp_v4_send_synack(sk, NULL, req, 0); |
856 | 856 | ||
857 | if (!res) | 857 | if (!res) { |
858 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); | 858 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
859 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); | ||
860 | } | ||
859 | return res; | 861 | return res; |
860 | } | 862 | } |
861 | 863 | ||
@@ -878,8 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk, | |||
878 | bool want_cookie = false; | 880 | bool want_cookie = false; |
879 | struct listen_sock *lopt; | 881 | struct listen_sock *lopt; |
880 | 882 | ||
881 | |||
882 | |||
883 | #ifdef CONFIG_SYN_COOKIES | 883 | #ifdef CONFIG_SYN_COOKIES |
884 | if (sysctl_tcp_syncookies) { | 884 | if (sysctl_tcp_syncookies) { |
885 | msg = "Sending cookies"; | 885 | msg = "Sending cookies"; |
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 991d62a2f9bb..c9aecae31327 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c | |||
@@ -315,11 +315,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) | |||
315 | } | 315 | } |
316 | 316 | ||
317 | static struct tcp_congestion_ops tcp_lp __read_mostly = { | 317 | static struct tcp_congestion_ops tcp_lp __read_mostly = { |
318 | .flags = TCP_CONG_RTT_STAMP, | ||
319 | .init = tcp_lp_init, | 318 | .init = tcp_lp_init, |
320 | .ssthresh = tcp_reno_ssthresh, | 319 | .ssthresh = tcp_reno_ssthresh, |
321 | .cong_avoid = tcp_lp_cong_avoid, | 320 | .cong_avoid = tcp_lp_cong_avoid, |
322 | .min_cwnd = tcp_reno_min_cwnd, | ||
323 | .pkts_acked = tcp_lp_pkts_acked, | 321 | .pkts_acked = tcp_lp_pkts_acked, |
324 | 322 | ||
325 | .owner = THIS_MODULE, | 323 | .owner = THIS_MODULE, |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index f7e522c558ba..d4f015ad6c84 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
@@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
103 | } | 103 | } |
104 | 104 | ||
105 | static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, | 105 | static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, |
106 | const char *buffer) | 106 | char *buffer) |
107 | { | 107 | { |
108 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 108 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
109 | unsigned long long val; | 109 | unsigned long long val; |
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = { | |||
219 | 219 | ||
220 | static int __init tcp_memcontrol_init(void) | 220 | static int __init tcp_memcontrol_init(void) |
221 | { | 221 | { |
222 | WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); | 222 | WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); |
223 | return 0; | 223 | return 0; |
224 | } | 224 | } |
225 | __initcall(tcp_memcontrol_init); | 225 | __initcall(tcp_memcontrol_init); |
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d547075d8300..dcaf72f10216 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c | |||
@@ -33,6 +33,11 @@ struct tcp_fastopen_metrics { | |||
33 | struct tcp_fastopen_cookie cookie; | 33 | struct tcp_fastopen_cookie cookie; |
34 | }; | 34 | }; |
35 | 35 | ||
36 | /* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility | ||
37 | * Kernel only stores RTT and RTTVAR in usec resolution | ||
38 | */ | ||
39 | #define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) | ||
40 | |||
36 | struct tcp_metrics_block { | 41 | struct tcp_metrics_block { |
37 | struct tcp_metrics_block __rcu *tcpm_next; | 42 | struct tcp_metrics_block __rcu *tcpm_next; |
38 | struct inetpeer_addr tcpm_saddr; | 43 | struct inetpeer_addr tcpm_saddr; |
@@ -41,7 +46,7 @@ struct tcp_metrics_block { | |||
41 | u32 tcpm_ts; | 46 | u32 tcpm_ts; |
42 | u32 tcpm_ts_stamp; | 47 | u32 tcpm_ts_stamp; |
43 | u32 tcpm_lock; | 48 | u32 tcpm_lock; |
44 | u32 tcpm_vals[TCP_METRIC_MAX + 1]; | 49 | u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; |
45 | struct tcp_fastopen_metrics tcpm_fastopen; | 50 | struct tcp_fastopen_metrics tcpm_fastopen; |
46 | 51 | ||
47 | struct rcu_head rcu_head; | 52 | struct rcu_head rcu_head; |
@@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm, | |||
59 | return tm->tcpm_vals[idx]; | 64 | return tm->tcpm_vals[idx]; |
60 | } | 65 | } |
61 | 66 | ||
62 | static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, | ||
63 | enum tcp_metric_index idx) | ||
64 | { | ||
65 | return msecs_to_jiffies(tm->tcpm_vals[idx]); | ||
66 | } | ||
67 | |||
68 | static void tcp_metric_set(struct tcp_metrics_block *tm, | 67 | static void tcp_metric_set(struct tcp_metrics_block *tm, |
69 | enum tcp_metric_index idx, | 68 | enum tcp_metric_index idx, |
70 | u32 val) | 69 | u32 val) |
@@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, | |||
72 | tm->tcpm_vals[idx] = val; | 71 | tm->tcpm_vals[idx] = val; |
73 | } | 72 | } |
74 | 73 | ||
75 | static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, | ||
76 | enum tcp_metric_index idx, | ||
77 | u32 val) | ||
78 | { | ||
79 | tm->tcpm_vals[idx] = jiffies_to_msecs(val); | ||
80 | } | ||
81 | |||
82 | static bool addr_same(const struct inetpeer_addr *a, | 74 | static bool addr_same(const struct inetpeer_addr *a, |
83 | const struct inetpeer_addr *b) | 75 | const struct inetpeer_addr *b) |
84 | { | 76 | { |
@@ -101,9 +93,11 @@ struct tcpm_hash_bucket { | |||
101 | 93 | ||
102 | static DEFINE_SPINLOCK(tcp_metrics_lock); | 94 | static DEFINE_SPINLOCK(tcp_metrics_lock); |
103 | 95 | ||
104 | static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, | 96 | static void tcpm_suck_dst(struct tcp_metrics_block *tm, |
97 | const struct dst_entry *dst, | ||
105 | bool fastopen_clear) | 98 | bool fastopen_clear) |
106 | { | 99 | { |
100 | u32 msval; | ||
107 | u32 val; | 101 | u32 val; |
108 | 102 | ||
109 | tm->tcpm_stamp = jiffies; | 103 | tm->tcpm_stamp = jiffies; |
@@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, | |||
121 | val |= 1 << TCP_METRIC_REORDERING; | 115 | val |= 1 << TCP_METRIC_REORDERING; |
122 | tm->tcpm_lock = val; | 116 | tm->tcpm_lock = val; |
123 | 117 | ||
124 | tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); | 118 | msval = dst_metric_raw(dst, RTAX_RTT); |
125 | tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); | 119 | tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; |
120 | |||
121 | msval = dst_metric_raw(dst, RTAX_RTTVAR); | ||
122 | tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; | ||
126 | tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); | 123 | tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); |
127 | tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); | 124 | tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); |
128 | tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); | 125 | tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); |
@@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk) | |||
384 | dst_confirm(dst); | 381 | dst_confirm(dst); |
385 | 382 | ||
386 | rcu_read_lock(); | 383 | rcu_read_lock(); |
387 | if (icsk->icsk_backoff || !tp->srtt) { | 384 | if (icsk->icsk_backoff || !tp->srtt_us) { |
388 | /* This session failed to estimate rtt. Why? | 385 | /* This session failed to estimate rtt. Why? |
389 | * Probably, no packets returned in time. Reset our | 386 | * Probably, no packets returned in time. Reset our |
390 | * results. | 387 | * results. |
@@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk) | |||
399 | if (!tm) | 396 | if (!tm) |
400 | goto out_unlock; | 397 | goto out_unlock; |
401 | 398 | ||
402 | rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); | 399 | rtt = tcp_metric_get(tm, TCP_METRIC_RTT); |
403 | m = rtt - tp->srtt; | 400 | m = rtt - tp->srtt_us; |
404 | 401 | ||
405 | /* If newly calculated rtt larger than stored one, store new | 402 | /* If newly calculated rtt larger than stored one, store new |
406 | * one. Otherwise, use EWMA. Remember, rtt overestimation is | 403 | * one. Otherwise, use EWMA. Remember, rtt overestimation is |
@@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk) | |||
408 | */ | 405 | */ |
409 | if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { | 406 | if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { |
410 | if (m <= 0) | 407 | if (m <= 0) |
411 | rtt = tp->srtt; | 408 | rtt = tp->srtt_us; |
412 | else | 409 | else |
413 | rtt -= (m >> 3); | 410 | rtt -= (m >> 3); |
414 | tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); | 411 | tcp_metric_set(tm, TCP_METRIC_RTT, rtt); |
415 | } | 412 | } |
416 | 413 | ||
417 | if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { | 414 | if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { |
@@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk) | |||
422 | 419 | ||
423 | /* Scale deviation to rttvar fixed point */ | 420 | /* Scale deviation to rttvar fixed point */ |
424 | m >>= 1; | 421 | m >>= 1; |
425 | if (m < tp->mdev) | 422 | if (m < tp->mdev_us) |
426 | m = tp->mdev; | 423 | m = tp->mdev_us; |
427 | 424 | ||
428 | var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | 425 | var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); |
429 | if (m >= var) | 426 | if (m >= var) |
430 | var = m; | 427 | var = m; |
431 | else | 428 | else |
432 | var -= (var - m) >> 2; | 429 | var -= (var - m) >> 2; |
433 | 430 | ||
434 | tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); | 431 | tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); |
435 | } | 432 | } |
436 | 433 | ||
437 | if (tcp_in_initial_slowstart(tp)) { | 434 | if (tcp_in_initial_slowstart(tp)) { |
@@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk) | |||
528 | tp->reordering = val; | 525 | tp->reordering = val; |
529 | } | 526 | } |
530 | 527 | ||
531 | crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); | 528 | crtt = tcp_metric_get(tm, TCP_METRIC_RTT); |
532 | rcu_read_unlock(); | 529 | rcu_read_unlock(); |
533 | reset: | 530 | reset: |
534 | /* The initial RTT measurement from the SYN/SYN-ACK is not ideal | 531 | /* The initial RTT measurement from the SYN/SYN-ACK is not ideal |
@@ -551,18 +548,20 @@ reset: | |||
551 | * to low value, and then abruptly stops to do it and starts to delay | 548 | * to low value, and then abruptly stops to do it and starts to delay |
552 | * ACKs, wait for troubles. | 549 | * ACKs, wait for troubles. |
553 | */ | 550 | */ |
554 | if (crtt > tp->srtt) { | 551 | if (crtt > tp->srtt_us) { |
555 | /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ | 552 | /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ |
556 | crtt >>= 3; | 553 | crtt /= 8 * USEC_PER_MSEC; |
557 | inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); | 554 | inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); |
558 | } else if (tp->srtt == 0) { | 555 | } else if (tp->srtt_us == 0) { |
559 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | 556 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from |
560 | * 3WHS. This is most likely due to retransmission, | 557 | * 3WHS. This is most likely due to retransmission, |
561 | * including spurious one. Reset the RTO back to 3secs | 558 | * including spurious one. Reset the RTO back to 3secs |
562 | * from the more aggressive 1sec to avoid more spurious | 559 | * from the more aggressive 1sec to avoid more spurious |
563 | * retransmission. | 560 | * retransmission. |
564 | */ | 561 | */ |
565 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | 562 | tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); |
563 | tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; | ||
564 | |||
566 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | 565 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; |
567 | } | 566 | } |
568 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | 567 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
@@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, | |||
809 | nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); | 808 | nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); |
810 | if (!nest) | 809 | if (!nest) |
811 | goto nla_put_failure; | 810 | goto nla_put_failure; |
812 | for (i = 0; i < TCP_METRIC_MAX + 1; i++) { | 811 | for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { |
813 | if (!tm->tcpm_vals[i]) | 812 | u32 val = tm->tcpm_vals[i]; |
813 | |||
814 | if (!val) | ||
814 | continue; | 815 | continue; |
815 | if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) | 816 | if (i == TCP_METRIC_RTT) { |
817 | if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, | ||
818 | val) < 0) | ||
819 | goto nla_put_failure; | ||
820 | n++; | ||
821 | val = max(val / 1000, 1U); | ||
822 | } | ||
823 | if (i == TCP_METRIC_RTTVAR) { | ||
824 | if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, | ||
825 | val) < 0) | ||
826 | goto nla_put_failure; | ||
827 | n++; | ||
828 | val = max(val / 1000, 1U); | ||
829 | } | ||
830 | if (nla_put_u32(msg, i + 1, val) < 0) | ||
816 | goto nla_put_failure; | 831 | goto nla_put_failure; |
817 | n++; | 832 | n++; |
818 | } | 833 | } |
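The metrics cache thus stores RTT and RTTVAR in usec internally while the netlink dump stays backward compatible: each of the two is exported twice, once through the new *_US attribute and once through the legacy msec slot. Worked numbers for an assumed cached RTT of 5200 usec:

    /* nla_put_u32(msg, TCP_METRIC_RTT_US + 1, 5200);   new usec attr
     * val = max(5200 / 1000, 1U) = 5;
     * nla_put_u32(msg, TCP_METRIC_RTT + 1, 5);         legacy msec attr
     * Sub-millisecond values clamp to 1 rather than 0, since zero
     * entries are skipped by the loop and would read as "no data".
     */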
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7a436c517e44..ca788ada5bd3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -398,8 +398,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
398 | 398 | ||
399 | tcp_init_wl(newtp, treq->rcv_isn); | 399 | tcp_init_wl(newtp, treq->rcv_isn); |
400 | 400 | ||
401 | newtp->srtt = 0; | 401 | newtp->srtt_us = 0; |
402 | newtp->mdev = TCP_TIMEOUT_INIT; | 402 | newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); |
403 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; | 403 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; |
404 | 404 | ||
405 | newtp->packets_out = 0; | 405 | newtp->packets_out = 0; |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 17a11e65e57f..699fb102e971 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | |||
86 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | 86 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { |
87 | tcp_rearm_rto(sk); | 87 | tcp_rearm_rto(sk); |
88 | } | 88 | } |
89 | |||
90 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, | ||
91 | tcp_skb_pcount(skb)); | ||
89 | } | 92 | } |
90 | 93 | ||
91 | /* SND.NXT, if window was not shrunk. | 94 | /* SND.NXT, if window was not shrunk. |
@@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window); | |||
269 | static u16 tcp_select_window(struct sock *sk) | 272 | static u16 tcp_select_window(struct sock *sk) |
270 | { | 273 | { |
271 | struct tcp_sock *tp = tcp_sk(sk); | 274 | struct tcp_sock *tp = tcp_sk(sk); |
275 | u32 old_win = tp->rcv_wnd; | ||
272 | u32 cur_win = tcp_receive_window(tp); | 276 | u32 cur_win = tcp_receive_window(tp); |
273 | u32 new_win = __tcp_select_window(sk); | 277 | u32 new_win = __tcp_select_window(sk); |
274 | 278 | ||
@@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk) | |||
281 | * | 285 | * |
282 | * Relax Will Robinson. | 286 | * Relax Will Robinson. |
283 | */ | 287 | */ |
288 | if (new_win == 0) | ||
289 | NET_INC_STATS(sock_net(sk), | ||
290 | LINUX_MIB_TCPWANTZEROWINDOWADV); | ||
284 | new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); | 291 | new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); |
285 | } | 292 | } |
286 | tp->rcv_wnd = new_win; | 293 | tp->rcv_wnd = new_win; |
@@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk) | |||
298 | new_win >>= tp->rx_opt.rcv_wscale; | 305 | new_win >>= tp->rx_opt.rcv_wscale; |
299 | 306 | ||
300 | /* If we advertise zero window, disable fast path. */ | 307 | /* If we advertise zero window, disable fast path. */ |
301 | if (new_win == 0) | 308 | if (new_win == 0) { |
302 | tp->pred_flags = 0; | 309 | tp->pred_flags = 0; |
310 | if (old_win) | ||
311 | NET_INC_STATS(sock_net(sk), | ||
312 | LINUX_MIB_TCPTOZEROWINDOWADV); | ||
313 | } else if (old_win == 0) { | ||
314 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); | ||
315 | } | ||
303 | 316 | ||
304 | return new_win; | 317 | return new_win; |
305 | } | 318 | } |
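The three counters capture the interesting transitions of the advertised window. Only the want-case can fire repeatedly without a state change, because the window is never allowed to shrink and the old advertisement keeps being re-sent. Transition map per this hunk:

    /* computed window == 0 but cur_win kept (no shrink allowed):
     *                                      TCPWantZeroWindowAdv
     * advertisement drops   >0 -> 0:       TCPToZeroWindowAdv
     * advertisement reopens  0 -> >0:      TCPFromZeroWindowAdv
     */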
@@ -867,11 +880,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
867 | if (clone_it) { | 880 | if (clone_it) { |
868 | const struct sk_buff *fclone = skb + 1; | 881 | const struct sk_buff *fclone = skb + 1; |
869 | 882 | ||
870 | /* If congestion control is doing timestamping, we must | 883 | skb_mstamp_get(&skb->skb_mstamp); |
871 | * take such a timestamp before we potentially clone/copy. | ||
872 | */ | ||
873 | if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) | ||
874 | __net_timestamp(skb); | ||
875 | 884 | ||
876 | if (unlikely(skb->fclone == SKB_FCLONE_ORIG && | 885 | if (unlikely(skb->fclone == SKB_FCLONE_ORIG && |
877 | fclone->fclone == SKB_FCLONE_CLONE)) | 886 | fclone->fclone == SKB_FCLONE_CLONE)) |
@@ -884,6 +893,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
884 | skb = skb_clone(skb, gfp_mask); | 893 | skb = skb_clone(skb, gfp_mask); |
885 | if (unlikely(!skb)) | 894 | if (unlikely(!skb)) |
886 | return -ENOBUFS; | 895 | return -ENOBUFS; |
896 | /* Our usage of tstamp should remain private */ | ||
897 | skb->tstamp.tv64 = 0; | ||
887 | } | 898 | } |
888 | 899 | ||
889 | inet = inet_sk(sk); | 900 | inet = inet_sk(sk); |
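Every segment is now stamped unconditionally via skb_mstamp_get() before any clone or copy, which is what the RTT plumbing in tcp_input.c reads back through skb->skb_mstamp; the conditional __net_timestamp() for TCP_CONG_RTT_STAMP modules (and the flag itself, dropped from cubic, illinois and lp above) becomes redundant. Clearing tstamp.tv64 on the clone suggests the usec stamp shares storage with skb->tstamp, so the transmitted clone must not leak it as a real timestamp:

    skb_mstamp_get(&skb->skb_mstamp);   /* on the queued original  */
    ...
    skb = skb_clone(skb, gfp_mask);
    skb->tstamp.tv64 = 0;               /* keep our usage private  */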
@@ -1426,7 +1437,7 @@ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, | |||
1426 | * With Minshall's modification: all sent small packets are ACKed. | 1437 | * With Minshall's modification: all sent small packets are ACKed. |
1427 | */ | 1438 | */ |
1428 | static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, | 1439 | static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, |
1429 | unsigned int mss_now, int nonagle) | 1440 | int nonagle) |
1430 | { | 1441 | { |
1431 | return partial && | 1442 | return partial && |
1432 | ((nonagle & TCP_NAGLE_CORK) || | 1443 | ((nonagle & TCP_NAGLE_CORK) || |
@@ -1458,7 +1469,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, | |||
1458 | * to include this last segment in this skb. | 1469 | * to include this last segment in this skb. |
1459 | * Otherwise, we'll split the skb at last MSS boundary | 1470 | * Otherwise, we'll split the skb at last MSS boundary |
1460 | */ | 1471 | */ |
1461 | if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle)) | 1472 | if (tcp_nagle_check(partial != 0, tp, nonagle)) |
1462 | return needed - partial; | 1473 | return needed - partial; |
1463 | 1474 | ||
1464 | return needed; | 1475 | return needed; |
@@ -1521,7 +1532,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf | |||
1521 | if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) | 1532 | if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) |
1522 | return true; | 1533 | return true; |
1523 | 1534 | ||
1524 | if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle)) | 1535 | if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) |
1525 | return true; | 1536 | return true; |
1526 | 1537 | ||
1527 | return false; | 1538 | return false; |
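
The mss_now argument dropped from tcp_nagle_check() in these hunks was simply unused: Minshall's modification only compares sequence numbers. For reference, a sketch of the helper it relies on, as it reads in this era of the tree:

	static bool tcp_minshall_check(const struct tcp_sock *tp)
	{
		/* A small (sub-MSS) packet is still in flight if snd_sml
		 * lies between snd_una and snd_nxt.
		 */
		return after(tp->snd_sml, tp->snd_una) &&
		       !after(tp->snd_sml, tp->snd_nxt);
	}
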
@@ -1975,7 +1986,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) | |||
1975 | struct inet_connection_sock *icsk = inet_csk(sk); | 1986 | struct inet_connection_sock *icsk = inet_csk(sk); |
1976 | struct tcp_sock *tp = tcp_sk(sk); | 1987 | struct tcp_sock *tp = tcp_sk(sk); |
1977 | u32 timeout, tlp_time_stamp, rto_time_stamp; | 1988 | u32 timeout, tlp_time_stamp, rto_time_stamp; |
1978 | u32 rtt = tp->srtt >> 3; | 1989 | u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); |
1979 | 1990 | ||
1980 | if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) | 1991 | if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) |
1981 | return false; | 1992 | return false; |
@@ -1997,7 +2008,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) | |||
1997 | /* Schedule a loss probe in 2*RTT for SACK capable connections | 2008 | /* Schedule a loss probe in 2*RTT for SACK capable connections |
1998 | * in Open state, that are either limited by cwnd or application. | 2009 | * in Open state, that are either limited by cwnd or application. |
1999 | */ | 2010 | */ |
2000 | if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || | 2011 | if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || |
2001 | !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) | 2012 | !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) |
2002 | return false; | 2013 | return false; |
2003 | 2014 | ||
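
tp->srtt (jiffies, stored left-shifted by 3) becomes tp->srtt_us (microseconds, same scaling) throughout this series, so consumers that still think in jiffies now convert explicitly. A worked example of the conversion used above:

	/* Illustrative arithmetic: srtt_us holds 8 * smoothed RTT in usec.
	 * For a 50 ms smoothed RTT: srtt_us == 400000, and with HZ=1000
	 * usecs_to_jiffies(400000 >> 3) == usecs_to_jiffies(50000) == 50.
	 */
	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
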
@@ -2082,7 +2093,6 @@ rearm_timer: | |||
2082 | if (likely(!err)) | 2093 | if (likely(!err)) |
2083 | NET_INC_STATS_BH(sock_net(sk), | 2094 | NET_INC_STATS_BH(sock_net(sk), |
2084 | LINUX_MIB_TCPLOSSPROBES); | 2095 | LINUX_MIB_TCPLOSSPROBES); |
2085 | return; | ||
2086 | } | 2096 | } |
2087 | 2097 | ||
2088 | /* Push out any pending frames which were held back due to | 2098 | /* Push out any pending frames which were held back due to |
@@ -2180,7 +2190,8 @@ u32 __tcp_select_window(struct sock *sk) | |||
2180 | */ | 2190 | */ |
2181 | int mss = icsk->icsk_ack.rcv_mss; | 2191 | int mss = icsk->icsk_ack.rcv_mss; |
2182 | int free_space = tcp_space(sk); | 2192 | int free_space = tcp_space(sk); |
2183 | int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); | 2193 | int allowed_space = tcp_full_space(sk); |
2194 | int full_space = min_t(int, tp->window_clamp, allowed_space); | ||
2184 | int window; | 2195 | int window; |
2185 | 2196 | ||
2186 | if (mss > full_space) | 2197 | if (mss > full_space) |
@@ -2193,7 +2204,19 @@ u32 __tcp_select_window(struct sock *sk) | |||
2193 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, | 2204 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, |
2194 | 4U * tp->advmss); | 2205 | 4U * tp->advmss); |
2195 | 2206 | ||
2196 | if (free_space < mss) | 2207 | /* free_space might become our new window, make sure we don't |
2208 | * increase it due to wscale. | ||
2209 | */ | ||
2210 | free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); | ||
2211 | |||
2212 | /* if free space is less than mss estimate, or is below 1/16th | ||
2213 | * of the maximum allowed, try to move to zero-window, else | ||
2214 | * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and | ||
2215 | * new incoming data is dropped due to memory limits. | ||
2216 | * With large window, mss test triggers way too late in order | ||
2217 | * to announce zero window in time before rmem limit kicks in. | ||
2218 | */ | ||
2219 | if (free_space < (allowed_space >> 4) || free_space < mss) | ||
2197 | return 0; | 2220 | return 0; |
2198 | } | 2221 | } |
2199 | 2222 | ||
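
Two separate fixes land in __tcp_select_window() here: rounding free_space down to the window-scale granularity prevents the advertised window from being rounded up past what is actually free, and the allowed_space >> 4 test announces a zero window once free memory drops below 1/16 of the receive budget instead of waiting for the last MSS. With illustrative numbers (assuming a ~3 MB tcp_full_space() and rcv_wscale = 7):

	free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
							/* 128-byte grain */
	if (free_space < (allowed_space >> 4) ||	/* below ~192 KB */
	    free_space < mss)				/* old test: ~1.5 KB */
		return 0;				/* announce zero window */
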
@@ -2431,7 +2454,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
2431 | if (err == 0) { | 2454 | if (err == 0) { |
2432 | /* Update global TCP statistics. */ | 2455 | /* Update global TCP statistics. */ |
2433 | TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); | 2456 | TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); |
2434 | 2457 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) | |
2458 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); | ||
2435 | tp->total_retrans++; | 2459 | tp->total_retrans++; |
2436 | 2460 | ||
2437 | #if FASTRETRANS_DEBUG > 0 | 2461 | #if FASTRETRANS_DEBUG > 0 |
@@ -2717,7 +2741,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2717 | int tcp_header_size; | 2741 | int tcp_header_size; |
2718 | int mss; | 2742 | int mss; |
2719 | 2743 | ||
2720 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); | 2744 | skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); |
2721 | if (unlikely(!skb)) { | 2745 | if (unlikely(!skb)) { |
2722 | dst_release(dst); | 2746 | dst_release(dst); |
2723 | return NULL; | 2747 | return NULL; |
@@ -2787,7 +2811,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2787 | th->window = htons(min(req->rcv_wnd, 65535U)); | 2811 | th->window = htons(min(req->rcv_wnd, 65535U)); |
2788 | tcp_options_write((__be32 *)(th + 1), tp, &opts); | 2812 | tcp_options_write((__be32 *)(th + 1), tp, &opts); |
2789 | th->doff = (tcp_header_size >> 2); | 2813 | th->doff = (tcp_header_size >> 2); |
2790 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); | 2814 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); |
2791 | 2815 | ||
2792 | #ifdef CONFIG_TCP_MD5SIG | 2816 | #ifdef CONFIG_TCP_MD5SIG |
2793 | /* Okay, we have all we need - do the md5 hash if needed */ | 2817 | /* Okay, we have all we need - do the md5 hash if needed */ |
@@ -2959,9 +2983,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | |||
2959 | tcp_connect_queue_skb(sk, data); | 2983 | tcp_connect_queue_skb(sk, data); |
2960 | fo->copied = data->len; | 2984 | fo->copied = data->len; |
2961 | 2985 | ||
2986 | /* syn_data is about to be sent, we need to take current time stamps | ||
2987 | * for the packets that are in write queue : SYN packet and DATA | ||
2988 | */ | ||
2989 | skb_mstamp_get(&syn->skb_mstamp); | ||
2990 | data->skb_mstamp = syn->skb_mstamp; | ||
2991 | |||
2962 | if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { | 2992 | if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { |
2963 | tp->syn_data = (fo->copied > 0); | 2993 | tp->syn_data = (fo->copied > 0); |
2964 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); | 2994 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); |
2965 | goto done; | 2995 | goto done; |
2966 | } | 2996 | } |
2967 | syn_data = NULL; | 2997 | syn_data = NULL; |
@@ -3049,8 +3079,9 @@ void tcp_send_delayed_ack(struct sock *sk) | |||
3049 | * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements | 3079 | * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements |
3050 | * directly. | 3080 | * directly. |
3051 | */ | 3081 | */ |
3052 | if (tp->srtt) { | 3082 | if (tp->srtt_us) { |
3053 | int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); | 3083 | int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), |
3084 | TCP_DELACK_MIN); | ||
3054 | 3085 | ||
3055 | if (rtt < max_ato) | 3086 | if (rtt < max_ato) |
3056 | max_ato = rtt; | 3087 | max_ato = rtt; |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 1f2d37613c9e..3b66610d4156 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c | |||
@@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
154 | p->snd_wnd = tp->snd_wnd; | 154 | p->snd_wnd = tp->snd_wnd; |
155 | p->rcv_wnd = tp->rcv_wnd; | 155 | p->rcv_wnd = tp->rcv_wnd; |
156 | p->ssthresh = tcp_current_ssthresh(sk); | 156 | p->ssthresh = tcp_current_ssthresh(sk); |
157 | p->srtt = tp->srtt >> 3; | 157 | p->srtt = tp->srtt_us >> 3; |
158 | 158 | ||
159 | tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); | 159 | tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); |
160 | } | 160 | } |
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 19ea6c2951f3..0ac50836da4d 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
@@ -39,7 +39,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) | |||
39 | static struct tcp_congestion_ops tcp_scalable __read_mostly = { | 39 | static struct tcp_congestion_ops tcp_scalable __read_mostly = { |
40 | .ssthresh = tcp_scalable_ssthresh, | 40 | .ssthresh = tcp_scalable_ssthresh, |
41 | .cong_avoid = tcp_scalable_cong_avoid, | 41 | .cong_avoid = tcp_scalable_cong_avoid, |
42 | .min_cwnd = tcp_reno_min_cwnd, | ||
43 | 42 | ||
44 | .owner = THIS_MODULE, | 43 | .owner = THIS_MODULE, |
45 | .name = "scalable", | 44 | .name = "scalable", |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 64f0354c84c7..286227abed10 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -165,6 +165,9 @@ static int tcp_write_timeout(struct sock *sk) | |||
165 | dst_negative_advice(sk); | 165 | dst_negative_advice(sk); |
166 | if (tp->syn_fastopen || tp->syn_data) | 166 | if (tp->syn_fastopen || tp->syn_data) |
167 | tcp_fastopen_cache_set(sk, 0, NULL, true); | 167 | tcp_fastopen_cache_set(sk, 0, NULL, true); |
168 | if (tp->syn_data) | ||
169 | NET_INC_STATS_BH(sock_net(sk), | ||
170 | LINUX_MIB_TCPFASTOPENACTIVEFAIL); | ||
168 | } | 171 | } |
169 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 172 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
170 | syn_set = true; | 173 | syn_set = true; |
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 06cae62bf208..48539fff6357 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
@@ -306,11 +306,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) | |||
306 | EXPORT_SYMBOL_GPL(tcp_vegas_get_info); | 306 | EXPORT_SYMBOL_GPL(tcp_vegas_get_info); |
307 | 307 | ||
308 | static struct tcp_congestion_ops tcp_vegas __read_mostly = { | 308 | static struct tcp_congestion_ops tcp_vegas __read_mostly = { |
309 | .flags = TCP_CONG_RTT_STAMP, | ||
310 | .init = tcp_vegas_init, | 309 | .init = tcp_vegas_init, |
311 | .ssthresh = tcp_reno_ssthresh, | 310 | .ssthresh = tcp_reno_ssthresh, |
312 | .cong_avoid = tcp_vegas_cong_avoid, | 311 | .cong_avoid = tcp_vegas_cong_avoid, |
313 | .min_cwnd = tcp_reno_min_cwnd, | ||
314 | .pkts_acked = tcp_vegas_pkts_acked, | 312 | .pkts_acked = tcp_vegas_pkts_acked, |
315 | .set_state = tcp_vegas_state, | 313 | .set_state = tcp_vegas_state, |
316 | .cwnd_event = tcp_vegas_cwnd_event, | 314 | .cwnd_event = tcp_vegas_cwnd_event, |
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 326475a94865..1b8e28fcd7e1 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c | |||
@@ -203,7 +203,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) | |||
203 | } | 203 | } |
204 | 204 | ||
205 | static struct tcp_congestion_ops tcp_veno __read_mostly = { | 205 | static struct tcp_congestion_ops tcp_veno __read_mostly = { |
206 | .flags = TCP_CONG_RTT_STAMP, | ||
207 | .init = tcp_veno_init, | 206 | .init = tcp_veno_init, |
208 | .ssthresh = tcp_veno_ssthresh, | 207 | .ssthresh = tcp_veno_ssthresh, |
209 | .cong_avoid = tcp_veno_cong_avoid, | 208 | .cong_avoid = tcp_veno_cong_avoid, |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 76a1e23259e1..b94a04ae2ed5 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
@@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { | |||
276 | .init = tcp_westwood_init, | 276 | .init = tcp_westwood_init, |
277 | .ssthresh = tcp_reno_ssthresh, | 277 | .ssthresh = tcp_reno_ssthresh, |
278 | .cong_avoid = tcp_reno_cong_avoid, | 278 | .cong_avoid = tcp_reno_cong_avoid, |
279 | .min_cwnd = tcp_westwood_bw_rttmin, | ||
280 | .cwnd_event = tcp_westwood_event, | 279 | .cwnd_event = tcp_westwood_event, |
281 | .get_info = tcp_westwood_info, | 280 | .get_info = tcp_westwood_info, |
282 | .pkts_acked = tcp_westwood_pkts_acked, | 281 | .pkts_acked = tcp_westwood_pkts_acked, |
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 1a8d271f994d..5ede0e727945 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c | |||
@@ -227,11 +227,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { | |||
227 | } | 227 | } |
228 | 228 | ||
229 | static struct tcp_congestion_ops tcp_yeah __read_mostly = { | 229 | static struct tcp_congestion_ops tcp_yeah __read_mostly = { |
230 | .flags = TCP_CONG_RTT_STAMP, | ||
231 | .init = tcp_yeah_init, | 230 | .init = tcp_yeah_init, |
232 | .ssthresh = tcp_yeah_ssthresh, | 231 | .ssthresh = tcp_yeah_ssthresh, |
233 | .cong_avoid = tcp_yeah_cong_avoid, | 232 | .cong_avoid = tcp_yeah_cong_avoid, |
234 | .min_cwnd = tcp_reno_min_cwnd, | ||
235 | .set_state = tcp_vegas_state, | 233 | .set_state = tcp_vegas_state, |
236 | .cwnd_event = tcp_vegas_cwnd_event, | 234 | .cwnd_event = tcp_vegas_cwnd_event, |
237 | .get_info = tcp_vegas_get_info, | 235 | .get_info = tcp_vegas_get_info, |
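
Across tcp_scalable, tcp_vegas, tcp_veno, tcp_westwood and tcp_yeah the pattern is the same: the TCP_CONG_RTT_STAMP flag is dead now that timestamping is unconditional, and the now-unused .min_cwnd hook is dropped from tcp_congestion_ops. After the cleanup a minimal module needs only the two mandatory callbacks; a sketch (the "example" name is a placeholder, not a module in this series):

	static struct tcp_congestion_ops tcp_example __read_mostly = {
		.ssthresh	= tcp_reno_ssthresh,	/* mandatory */
		.cong_avoid	= tcp_reno_cong_avoid,	/* mandatory */
		.owner		= THIS_MODULE,
		.name		= "example",
	};
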
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 77bd16fa9f34..4468e1adc094 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -931,7 +931,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
931 | sock_tx_timestamp(sk, &ipc.tx_flags); | 931 | sock_tx_timestamp(sk, &ipc.tx_flags); |
932 | 932 | ||
933 | if (msg->msg_controllen) { | 933 | if (msg->msg_controllen) { |
934 | err = ip_cmsg_send(sock_net(sk), msg, &ipc); | 934 | err = ip_cmsg_send(sock_net(sk), msg, &ipc, |
935 | sk->sk_family == AF_INET6); | ||
935 | if (err) | 936 | if (err) |
936 | return err; | 937 | return err; |
937 | if (ipc.opt) | 938 | if (ipc.opt) |
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 1f12c8b45864..aac6197b7a71 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c | |||
@@ -37,15 +37,6 @@ drop: | |||
37 | return NET_RX_DROP; | 37 | return NET_RX_DROP; |
38 | } | 38 | } |
39 | 39 | ||
40 | int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, | ||
41 | int encap_type) | ||
42 | { | ||
43 | XFRM_SPI_SKB_CB(skb)->family = AF_INET; | ||
44 | XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); | ||
45 | return xfrm_input(skb, nexthdr, spi, encap_type); | ||
46 | } | ||
47 | EXPORT_SYMBOL(xfrm4_rcv_encap); | ||
48 | |||
49 | int xfrm4_transport_finish(struct sk_buff *skb, int async) | 40 | int xfrm4_transport_finish(struct sk_buff *skb, int async) |
50 | { | 41 | { |
51 | struct iphdr *iph = ip_hdr(skb); | 42 | struct iphdr *iph = ip_hdr(skb); |
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 31b18152528f..05f2b484954f 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c | |||
@@ -15,65 +15,6 @@ | |||
15 | #include <net/ip.h> | 15 | #include <net/ip.h> |
16 | #include <net/xfrm.h> | 16 | #include <net/xfrm.h> |
17 | 17 | ||
18 | /* Informational hook. The decap is still done here. */ | ||
19 | static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly; | ||
20 | static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); | ||
21 | |||
22 | int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler) | ||
23 | { | ||
24 | struct xfrm_tunnel_notifier __rcu **pprev; | ||
25 | struct xfrm_tunnel_notifier *t; | ||
26 | int ret = -EEXIST; | ||
27 | int priority = handler->priority; | ||
28 | |||
29 | mutex_lock(&xfrm4_mode_tunnel_input_mutex); | ||
30 | |||
31 | for (pprev = &rcv_notify_handlers; | ||
32 | (t = rcu_dereference_protected(*pprev, | ||
33 | lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; | ||
34 | pprev = &t->next) { | ||
35 | if (t->priority > priority) | ||
36 | break; | ||
37 | if (t->priority == priority) | ||
38 | goto err; | ||
39 | |||
40 | } | ||
41 | |||
42 | handler->next = *pprev; | ||
43 | rcu_assign_pointer(*pprev, handler); | ||
44 | |||
45 | ret = 0; | ||
46 | |||
47 | err: | ||
48 | mutex_unlock(&xfrm4_mode_tunnel_input_mutex); | ||
49 | return ret; | ||
50 | } | ||
51 | EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); | ||
52 | |||
53 | int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler) | ||
54 | { | ||
55 | struct xfrm_tunnel_notifier __rcu **pprev; | ||
56 | struct xfrm_tunnel_notifier *t; | ||
57 | int ret = -ENOENT; | ||
58 | |||
59 | mutex_lock(&xfrm4_mode_tunnel_input_mutex); | ||
60 | for (pprev = &rcv_notify_handlers; | ||
61 | (t = rcu_dereference_protected(*pprev, | ||
62 | lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; | ||
63 | pprev = &t->next) { | ||
64 | if (t == handler) { | ||
65 | *pprev = handler->next; | ||
66 | ret = 0; | ||
67 | break; | ||
68 | } | ||
69 | } | ||
70 | mutex_unlock(&xfrm4_mode_tunnel_input_mutex); | ||
71 | synchronize_net(); | ||
72 | |||
73 | return ret; | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); | ||
76 | |||
77 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) | 18 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) |
78 | { | 19 | { |
79 | struct iphdr *inner_iph = ipip_hdr(skb); | 20 | struct iphdr *inner_iph = ipip_hdr(skb); |
@@ -127,14 +68,8 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) | |||
127 | return 0; | 68 | return 0; |
128 | } | 69 | } |
129 | 70 | ||
130 | #define for_each_input_rcu(head, handler) \ | ||
131 | for (handler = rcu_dereference(head); \ | ||
132 | handler != NULL; \ | ||
133 | handler = rcu_dereference(handler->next)) | ||
134 | |||
135 | static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | 71 | static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) |
136 | { | 72 | { |
137 | struct xfrm_tunnel_notifier *handler; | ||
138 | int err = -EINVAL; | 73 | int err = -EINVAL; |
139 | 74 | ||
140 | if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) | 75 | if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) |
@@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | |||
143 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | 78 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) |
144 | goto out; | 79 | goto out; |
145 | 80 | ||
146 | for_each_input_rcu(rcv_notify_handlers, handler) | ||
147 | handler->handler(skb); | ||
148 | |||
149 | err = skb_unclone(skb, GFP_ATOMIC); | 81 | err = skb_unclone(skb, GFP_ATOMIC); |
150 | if (err) | 82 | if (err) |
151 | goto out; | 83 | goto out; |
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e1a63930a967..6156f68a1e90 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
@@ -325,6 +325,7 @@ void __init xfrm4_init(void) | |||
325 | 325 | ||
326 | xfrm4_state_init(); | 326 | xfrm4_state_init(); |
327 | xfrm4_policy_init(); | 327 | xfrm4_policy_init(); |
328 | xfrm4_protocol_init(); | ||
328 | #ifdef CONFIG_SYSCTL | 329 | #ifdef CONFIG_SYSCTL |
329 | register_pernet_subsys(&xfrm4_net_ops); | 330 | register_pernet_subsys(&xfrm4_net_ops); |
330 | #endif | 331 | #endif |
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c new file mode 100644 index 000000000000..7f7b243e8139 --- /dev/null +++ b/net/ipv4/xfrm4_protocol.c | |||
@@ -0,0 +1,286 @@ | |||
1 | /* xfrm4_protocol.c - Generic xfrm protocol multiplexer. | ||
2 | * | ||
3 | * Copyright (C) 2013 secunet Security Networks AG | ||
4 | * | ||
5 | * Author: | ||
6 | * Steffen Klassert <steffen.klassert@secunet.com> | ||
7 | * | ||
8 | * Based on: | ||
9 | * net/ipv4/tunnel4.c | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | */ | ||
16 | |||
17 | #include <linux/init.h> | ||
18 | #include <linux/mutex.h> | ||
19 | #include <linux/skbuff.h> | ||
20 | #include <net/icmp.h> | ||
21 | #include <net/ip.h> | ||
22 | #include <net/protocol.h> | ||
23 | #include <net/xfrm.h> | ||
24 | |||
25 | static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; | ||
26 | static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; | ||
27 | static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; | ||
28 | static DEFINE_MUTEX(xfrm4_protocol_mutex); | ||
29 | |||
30 | static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) | ||
31 | { | ||
32 | switch (protocol) { | ||
33 | case IPPROTO_ESP: | ||
34 | return &esp4_handlers; | ||
35 | case IPPROTO_AH: | ||
36 | return &ah4_handlers; | ||
37 | case IPPROTO_COMP: | ||
38 | return &ipcomp4_handlers; | ||
39 | } | ||
40 | |||
41 | return NULL; | ||
42 | } | ||
43 | |||
44 | #define for_each_protocol_rcu(head, handler) \ | ||
45 | for (handler = rcu_dereference(head); \ | ||
46 | handler != NULL; \ | ||
47 | handler = rcu_dereference(handler->next)) | ||
48 | |||
49 | int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) | ||
50 | { | ||
51 | int ret; | ||
52 | struct xfrm4_protocol *handler; | ||
53 | |||
54 | for_each_protocol_rcu(*proto_handlers(protocol), handler) | ||
55 | if ((ret = handler->cb_handler(skb, err)) <= 0) | ||
56 | return ret; | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | EXPORT_SYMBOL(xfrm4_rcv_cb); | ||
61 | |||
62 | int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, | ||
63 | int encap_type) | ||
64 | { | ||
65 | int ret; | ||
66 | struct xfrm4_protocol *handler; | ||
67 | |||
68 | XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; | ||
69 | XFRM_SPI_SKB_CB(skb)->family = AF_INET; | ||
70 | XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); | ||
71 | |||
72 | for_each_protocol_rcu(*proto_handlers(nexthdr), handler) | ||
73 | if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) | ||
74 | return ret; | ||
75 | |||
76 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | ||
77 | |||
78 | kfree_skb(skb); | ||
79 | return 0; | ||
80 | } | ||
81 | EXPORT_SYMBOL(xfrm4_rcv_encap); | ||
82 | |||
83 | static int xfrm4_esp_rcv(struct sk_buff *skb) | ||
84 | { | ||
85 | int ret; | ||
86 | struct xfrm4_protocol *handler; | ||
87 | |||
88 | XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; | ||
89 | |||
90 | for_each_protocol_rcu(esp4_handlers, handler) | ||
91 | if ((ret = handler->handler(skb)) != -EINVAL) | ||
92 | return ret; | ||
93 | |||
94 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | ||
95 | |||
96 | kfree_skb(skb); | ||
97 | return 0; | ||
98 | } | ||
99 | |||
100 | static void xfrm4_esp_err(struct sk_buff *skb, u32 info) | ||
101 | { | ||
102 | struct xfrm4_protocol *handler; | ||
103 | |||
104 | for_each_protocol_rcu(esp4_handlers, handler) | ||
105 | if (!handler->err_handler(skb, info)) | ||
106 | break; | ||
107 | } | ||
108 | |||
109 | static int xfrm4_ah_rcv(struct sk_buff *skb) | ||
110 | { | ||
111 | int ret; | ||
112 | struct xfrm4_protocol *handler; | ||
113 | |||
114 | XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; | ||
115 | |||
116 | for_each_protocol_rcu(ah4_handlers, handler) | ||
117 | if ((ret = handler->handler(skb)) != -EINVAL) | ||
118 | return ret; | ||
119 | |||
120 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | ||
121 | |||
122 | kfree_skb(skb); | ||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | static void xfrm4_ah_err(struct sk_buff *skb, u32 info) | ||
127 | { | ||
128 | struct xfrm4_protocol *handler; | ||
129 | |||
130 | for_each_protocol_rcu(ah4_handlers, handler) | ||
131 | if (!handler->err_handler(skb, info)) | ||
132 | break; | ||
133 | } | ||
134 | |||
135 | static int xfrm4_ipcomp_rcv(struct sk_buff *skb) | ||
136 | { | ||
137 | int ret; | ||
138 | struct xfrm4_protocol *handler; | ||
139 | |||
140 | XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; | ||
141 | |||
142 | for_each_protocol_rcu(ipcomp4_handlers, handler) | ||
143 | if ((ret = handler->handler(skb)) != -EINVAL) | ||
144 | return ret; | ||
145 | |||
146 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | ||
147 | |||
148 | kfree_skb(skb); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) | ||
153 | { | ||
154 | struct xfrm4_protocol *handler; | ||
155 | |||
156 | for_each_protocol_rcu(ipcomp4_handlers, handler) | ||
157 | if (!handler->err_handler(skb, info)) | ||
158 | break; | ||
159 | } | ||
160 | |||
161 | static const struct net_protocol esp4_protocol = { | ||
162 | .handler = xfrm4_esp_rcv, | ||
163 | .err_handler = xfrm4_esp_err, | ||
164 | .no_policy = 1, | ||
165 | .netns_ok = 1, | ||
166 | }; | ||
167 | |||
168 | static const struct net_protocol ah4_protocol = { | ||
169 | .handler = xfrm4_ah_rcv, | ||
170 | .err_handler = xfrm4_ah_err, | ||
171 | .no_policy = 1, | ||
172 | .netns_ok = 1, | ||
173 | }; | ||
174 | |||
175 | static const struct net_protocol ipcomp4_protocol = { | ||
176 | .handler = xfrm4_ipcomp_rcv, | ||
177 | .err_handler = xfrm4_ipcomp_err, | ||
178 | .no_policy = 1, | ||
179 | .netns_ok = 1, | ||
180 | }; | ||
181 | |||
182 | static struct xfrm_input_afinfo xfrm4_input_afinfo = { | ||
183 | .family = AF_INET, | ||
184 | .owner = THIS_MODULE, | ||
185 | .callback = xfrm4_rcv_cb, | ||
186 | }; | ||
187 | |||
188 | static inline const struct net_protocol *netproto(unsigned char protocol) | ||
189 | { | ||
190 | switch (protocol) { | ||
191 | case IPPROTO_ESP: | ||
192 | return &esp4_protocol; | ||
193 | case IPPROTO_AH: | ||
194 | return &ah4_protocol; | ||
195 | case IPPROTO_COMP: | ||
196 | return &ipcomp4_protocol; | ||
197 | } | ||
198 | |||
199 | return NULL; | ||
200 | } | ||
201 | |||
202 | int xfrm4_protocol_register(struct xfrm4_protocol *handler, | ||
203 | unsigned char protocol) | ||
204 | { | ||
205 | struct xfrm4_protocol __rcu **pprev; | ||
206 | struct xfrm4_protocol *t; | ||
207 | bool add_netproto = false; | ||
208 | int ret = -EEXIST; | ||
209 | int priority = handler->priority; | ||
210 | |||
211 | mutex_lock(&xfrm4_protocol_mutex); | ||
212 | |||
213 | if (!rcu_dereference_protected(*proto_handlers(protocol), | ||
214 | lockdep_is_held(&xfrm4_protocol_mutex))) | ||
215 | add_netproto = true; | ||
216 | |||
217 | for (pprev = proto_handlers(protocol); | ||
218 | (t = rcu_dereference_protected(*pprev, | ||
219 | lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; | ||
220 | pprev = &t->next) { | ||
221 | if (t->priority < priority) | ||
222 | break; | ||
223 | if (t->priority == priority) | ||
224 | goto err; | ||
225 | } | ||
226 | |||
227 | handler->next = *pprev; | ||
228 | rcu_assign_pointer(*pprev, handler); | ||
229 | |||
230 | ret = 0; | ||
231 | |||
232 | err: | ||
233 | mutex_unlock(&xfrm4_protocol_mutex); | ||
234 | |||
235 | if (add_netproto) { | ||
236 | if (inet_add_protocol(netproto(protocol), protocol)) { | ||
237 | pr_err("%s: can't add protocol\n", __func__); | ||
238 | ret = -EAGAIN; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | return ret; | ||
243 | } | ||
244 | EXPORT_SYMBOL(xfrm4_protocol_register); | ||
245 | |||
246 | int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, | ||
247 | unsigned char protocol) | ||
248 | { | ||
249 | struct xfrm4_protocol __rcu **pprev; | ||
250 | struct xfrm4_protocol *t; | ||
251 | int ret = -ENOENT; | ||
252 | |||
253 | mutex_lock(&xfrm4_protocol_mutex); | ||
254 | |||
255 | for (pprev = proto_handlers(protocol); | ||
256 | (t = rcu_dereference_protected(*pprev, | ||
257 | lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; | ||
258 | pprev = &t->next) { | ||
259 | if (t == handler) { | ||
260 | *pprev = handler->next; | ||
261 | ret = 0; | ||
262 | break; | ||
263 | } | ||
264 | } | ||
265 | |||
266 | if (!rcu_dereference_protected(*proto_handlers(protocol), | ||
267 | lockdep_is_held(&xfrm4_protocol_mutex))) { | ||
268 | if (inet_del_protocol(netproto(protocol), protocol) < 0) { | ||
269 | pr_err("%s: can't remove protocol\n", __func__); | ||
270 | ret = -EAGAIN; | ||
271 | } | ||
272 | } | ||
273 | |||
274 | mutex_unlock(&xfrm4_protocol_mutex); | ||
275 | |||
276 | synchronize_net(); | ||
277 | |||
278 | return ret; | ||
279 | } | ||
280 | EXPORT_SYMBOL(xfrm4_protocol_deregister); | ||
281 | |||
282 | void __init xfrm4_protocol_init(void) | ||
283 | { | ||
284 | xfrm_input_register_afinfo(&xfrm4_input_afinfo); | ||
285 | } | ||
286 | EXPORT_SYMBOL(xfrm4_protocol_init); | ||
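
The new multiplexer keeps one priority-ordered, RCU-protected handler chain per IPsec protocol (ESP, AH and IPCOMP only; proto_handlers() returns NULL for anything else), registers the shared net_protocol entry via inet_add_protocol() when the first handler for a protocol appears, and removes it when the last one leaves. Handlers are walked in descending priority until one returns something other than -EINVAL; a duplicate priority is rejected with -EEXIST. A sketch of a registration site, with the struct fields mirrored from the accessors used above and the my_esp_* names purely hypothetical:

	static struct xfrm4_protocol my_esp4_protocol = {
		.handler	= my_esp_rcv,	/* normal receive path */
		.input_handler	= my_esp_input,	/* via xfrm4_rcv_encap() */
		.cb_handler	= my_esp_rcv_cb,/* post-input callback */
		.err_handler	= my_esp_err,	/* ICMP error propagation */
		.priority	= 0,
	};

	static int __init my_esp4_init(void)
	{
		/* The first registrant for IPPROTO_ESP also installs the
		 * shared esp4_protocol entry with inet_add_protocol().
		 */
		return xfrm4_protocol_register(&my_esp4_protocol, IPPROTO_ESP);
	}
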