Diffstat (limited to 'net/ipv4')
 net/ipv4/Makefile                |   2
 net/ipv4/af_inet.c               |   4
 net/ipv4/ah4.c                   |  78
 net/ipv4/esp4.c                  |  26
 net/ipv4/fib_frontend.c          |   2
 net/ipv4/ip_forward.c            |   7
 net/ipv4/ip_output.c             |  12
 net/ipv4/ip_sockglue.c           |  21
 net/ipv4/ip_tunnel.c             |  31
 net/ipv4/ip_tunnel_core.c        |   4
 net/ipv4/ip_vti.c                | 310
 net/ipv4/ipcomp.c                |  26
 net/ipv4/netfilter.c             |   2
 net/ipv4/netfilter/arp_tables.c  |   6
 net/ipv4/netfilter/ip_tables.c   |   6
 net/ipv4/ping.c                  |   2
 net/ipv4/proc.c                  |   6
 net/ipv4/raw.c                   |   2
 net/ipv4/route.c                 |  14
 net/ipv4/tcp.c                   |  13
 net/ipv4/tcp_cong.c              |  10
 net/ipv4/tcp_cubic.c             |   4
 net/ipv4/tcp_highspeed.c         |   1
 net/ipv4/tcp_hybla.c             |  13
 net/ipv4/tcp_illinois.c          |   2
 net/ipv4/tcp_input.c             | 187
 net/ipv4/tcp_ipv4.c              |   8
 net/ipv4/tcp_lp.c                |   2
 net/ipv4/tcp_memcontrol.c        |   4
 net/ipv4/tcp_metrics.c           |  83
 net/ipv4/tcp_minisocks.c         |   4
 net/ipv4/tcp_output.c            |  71
 net/ipv4/tcp_probe.c             |   2
 net/ipv4/tcp_scalable.c          |   1
 net/ipv4/tcp_timer.c             |   3
 net/ipv4/tcp_vegas.c             |   2
 net/ipv4/tcp_veno.c              |   1
 net/ipv4/tcp_westwood.c          |   1
 net/ipv4/tcp_yeah.c              |   2
 net/ipv4/udp.c                   |   3
 net/ipv4/xfrm4_input.c           |   9
 net/ipv4/xfrm4_mode_tunnel.c     |  68
 net/ipv4/xfrm4_policy.c          |   1
 net/ipv4/xfrm4_protocol.c        | 286
 44 files changed, 913 insertions(+), 429 deletions(-)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f8c49ce5b283..f032688d20d3 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
-		      xfrm4_output.o
+		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 19ab78aca547..8c54870db792 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1505,9 +1505,9 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
 		bhptr = per_cpu_ptr(mib[0], cpu);
 		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
 		do {
-			start = u64_stats_fetch_begin_bh(syncp);
+			start = u64_stats_fetch_begin_irq(syncp);
 			v = *(((u64 *) bhptr) + offt);
-		} while (u64_stats_fetch_retry_bh(syncp, start));
+		} while (u64_stats_fetch_retry_irq(syncp, start));
 
 		res += v;
 	}
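The af_inet.c hunk above (and the matching one in ip_tunnel_core.c further down) switches the statistics fold from the BH-flavoured u64_stats helpers to the IRQ-flavoured ones, so 64-bit counters stay coherent even when writers can run in hard-IRQ context on 32-bit machines. The reader side is a seqcount-style retry loop; a minimal sketch of that pattern follows, with my_stats and my_read_packets as illustrative names rather than kernel API:

#include <linux/u64_stats_sync.h>

struct my_stats {
	u64 packets;
	struct u64_stats_sync syncp;	/* real seqcount on 32-bit, no-op on 64-bit */
};

static u64 my_read_packets(struct my_stats *s)
{
	unsigned int start;
	u64 v;

	do {
		start = u64_stats_fetch_begin_irq(&s->syncp);
		v = s->packets;		/* could tear on 32-bit without the retry loop */
	} while (u64_stats_fetch_retry_irq(&s->syncp, start));

	return v;
}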
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 717902669d2f..a2afa89513a0 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct iphdr *iph, *top_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	ahp = x->data;
 	ahash = ahp->ahash;
@@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah = ip_auth_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
 	err = -ENOMEM;
-	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
 	if (!iph)
 		goto out;
-
-	icv = ah_tmp_icv(ahash, iph, ihl);
+	seqhi = (__be32 *)((char *)iph + ihl);
+	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memset(ah->auth_data, 0, ahp->icv_trunc_len);
 
@@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah->spi = x->id.spi;
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	sg_init_table(sg, nfrags + sglists);
+	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_output_done, skb);
 
 	AH_SKB_CB(skb)->tmp = iph;
@@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
 	int err = -ENOMEM;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	if (!pskb_may_pull(skb, sizeof(*ah)))
 		goto out;
@@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	iph = ip_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
-	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
+
+	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+				ahp->icv_trunc_len + seqhi_len);
 	if (!work_iph)
 		goto out;
 
-	auth_data = ah_tmp_auth(work_iph, ihl);
+	seqhi = (__be32 *)((char *)work_iph + ihl);
+	auth_data = ah_tmp_auth(seqhi, seqhi_len);
 	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memcpy(work_iph, iph, ihl);
 	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
@@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_push(skb, ihl);
 
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	sg_init_table(sg, nfrags + sglists);
+	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_input_done, skb);
 
 	AH_SKB_CB(skb)->tmp = work_iph;
@@ -397,7 +428,7 @@ out:
 	return err;
 }
 
-static void ah4_err(struct sk_buff *skb, u32 info)
+static int ah4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 static int ah_init_state(struct xfrm_state *x)
@@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x)
 	kfree(ahp);
 }
 
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
 
 static const struct xfrm_type ah_type =
 {
@@ -518,11 +555,12 @@ static const struct xfrm_type ah_type =
 	.output		= ah_output
 };
 
-static const struct net_protocol ah4_protocol = {
+static struct xfrm4_protocol ah4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	ah4_rcv_cb,
 	.err_handler	=	ah4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init ah4_init(void)
@@ -531,7 +569,7 @@ static int __init ah4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
+	if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&ah_type, AF_INET);
 		return -EAGAIN;
@@ -541,7 +579,7 @@ static int __init ah4_init(void)
 
 static void __exit ah4_fini(void)
 {
-	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+	if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
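The ah4.c hunks wire up extended sequence numbers (ESN): the upper 32 bits of the sequence counter never appear on the wire, yet RFC 4302 requires them to be covered by the ICV, so the patch grows the per-packet scratch buffer by seqhi_len and appends one extra scatterlist entry after the payload. A rough map of the layout, inferred from the pointer arithmetic above rather than from any documented ABI:

/* Sketch of the scratch buffer ah_alloc_tmp() hands back after this change:
 *
 *   +------------------+ <- iph / work_iph
 *   | IP header copy   |   ihl bytes
 *   +------------------+ <- seqhi = (__be32 *)((char *)iph + ihl)
 *   | seq.hi (ESN)     |   seqhi_len bytes (0 when ESN is off)
 *   +------------------+ <- icv / auth_data
 *   | truncated ICV    |
 *   +------------------+ <- req, then sg[nfrags + sglists]
 *
 * With ESN enabled the trailing scatterlist entry (seqhisg = sg + nfrags)
 * appends the high-order sequence bits after the skb payload, and the
 * digest length passed to ahash_request_set_crypt() grows by seqhi_len.
 * skb_to_sgvec_nomark() is used so the extra entry can be chained on
 * without the end-of-table mark getting in the way.
 */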
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7785b28061ac..360b565918c4 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -473,7 +473,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
-static void esp4_err(struct sk_buff *skb, u32 info)
+static int esp4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -483,23 +483,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 static void esp_destroy(struct xfrm_state *x)
@@ -672,6 +674,11 @@ error:
 	return err;
 }
 
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
+
 static const struct xfrm_type esp_type =
 {
 	.description	= "ESP4",
@@ -685,11 +692,12 @@ static const struct xfrm_type esp_type =
 	.output		= esp_output
 };
 
-static const struct net_protocol esp4_protocol = {
+static struct xfrm4_protocol esp4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	esp4_rcv_cb,
 	.err_handler	=	esp4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init esp4_init(void)
@@ -698,7 +706,7 @@ static int __init esp4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
+	if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&esp_type, AF_INET);
 		return -EAGAIN;
@@ -708,7 +716,7 @@ static int __init esp4_init(void)
 
 static void __exit esp4_fini(void)
 {
-	if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+	if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
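ah4, esp4 and ipcomp4 all convert from struct net_protocol (registered via inet_add_protocol(), one owner per IP protocol number) to the new struct xfrm4_protocol, whose dispatcher lives in the xfrm4_protocol.c file this series adds. That indirection lets several users share IPPROTO_ESP/AH/COMP — the base transforms at priority 0 here, ip_vti.c's handlers at priority 100 below — which is also why err_handler now returns int: judging by the converted handlers, a handler returns 0 once it has consumed the event. A minimal registration sketch following the pattern above (field semantics inferred from this patch, not from documentation):

static int my_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;			/* nothing extra to do after xfrm_input() */
}

static int my_err(struct sk_buff *skb, u32 info)
{
	return 0;			/* 0: consumed; other users need not run */
}

static struct xfrm4_protocol my_esp_proto = {
	.handler	=	xfrm4_rcv,	/* full receive path */
	.input_handler	=	xfrm_input,	/* entry with parsed spi/seq */
	.cb_handler	=	my_rcv_cb,
	.err_handler	=	my_err,
	.priority	=	0,		/* base transform; vti registers at 100 */
};

/* registered with: xfrm4_protocol_register(&my_esp_proto, IPPROTO_ESP); */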
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c7539e22868b..1a629f870274 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
 	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
-		return ip_rt_dump(skb, cb);
+		return skb->len;
 
 	s_h = cb->args[0];
 	s_e = cb->args[1];
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index f3869c186d97..be8abe73bb9f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -127,6 +127,10 @@ int ip_forward(struct sk_buff *skb)
 	struct rtable *rt;	/* Route we use */
 	struct ip_options *opt	= &(IPCB(skb)->opt);
 
+	/* that should never happen */
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
 	if (skb_warn_if_lro(skb))
 		goto drop;
 
@@ -136,9 +140,6 @@ int ip_forward(struct sk_buff *skb)
 	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
 		return NET_RX_SUCCESS;
 
-	if (skb->pkt_type != PACKET_HOST)
-		goto drop;
-
 	skb_forward_csum(skb);
 
 	/*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 73c6b63bba74..1a0755fea491 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -446,7 +446,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
 	int err = 0;
-	bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
 
 	dev = rt->dst.dev;
 
@@ -456,7 +455,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 	iph = ip_hdr(skb);
 
-	mtu = ip_dst_mtu_maybe_forward(&rt->dst, forwarding);
+	mtu = ip_skb_dst_mtu(skb);
 	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
 		     (IPCB(skb)->frag_max_size &&
 		      IPCB(skb)->frag_max_size > mtu))) {
@@ -822,8 +821,7 @@ static int __ip_append_data(struct sock *sk,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
-			 mtu : 0xFFFF;
+	maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
 
 	if (cork->length + length > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1146,8 +1144,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
-			 mtu : 0xFFFF;
+	maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
 
 	if (cork->length + size > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1308,8 +1305,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc < IP_PMTUDISC_DO)
-		skb->local_df = 1;
+	skb->local_df = ip_sk_local_df(sk);
 
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
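The three open-coded pmtudisc comparisons in ip_output.c collapse into a single ip_sk_local_df() predicate, which also has to admit the new IP_PMTUDISC_OMIT mode accepted in ip_sockglue.c below. A plausible reconstruction of the helper, pieced together from the code it replaces (the real definition lives in include/net/ip.h; treat this as a sketch):

static inline bool ip_sk_local_df(const struct sock *sk)
{
	/* fragmentation locally allowed: PMTU discovery off or OMIT'd */
	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
	       inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
}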
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 580dd96666e0..64741b938632 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -186,7 +186,8 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ip_cmsg_recv);
 
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+		 bool allow_ipv6)
 {
 	int err, val;
 	struct cmsghdr *cmsg;
@@ -194,6 +195,22 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
+#if defined(CONFIG_IPV6)
+		if (allow_ipv6 &&
+		    cmsg->cmsg_level == SOL_IPV6 &&
+		    cmsg->cmsg_type == IPV6_PKTINFO) {
+			struct in6_pktinfo *src_info;
+
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+				return -EINVAL;
+			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+				return -EINVAL;
+			ipc->oif = src_info->ipi6_ifindex;
+			ipc->addr = src_info->ipi6_addr.s6_addr32[3];
+			continue;
+		}
+#endif
 		if (cmsg->cmsg_level != SOL_IP)
 			continue;
 		switch (cmsg->cmsg_type) {
@@ -626,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->nodefrag = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE)
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
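The new ip_cmsg_send() branch lets callers that opt in with allow_ipv6 (ping and raw pass false above; per the diffstat udp.c is also touched by this series) accept IPV6_PKTINFO ancillary data on a dual-stack socket, provided the address is IPv4-mapped: the ifindex and the low 32 bits of the mapped address are folded into the IPv4 ipc cookie. A hedged userspace sketch of the ancillary data such a sendmsg() would carry (address and ifindex are made up for illustration):

#define _GNU_SOURCE		/* struct in6_pktinfo on glibc */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static void fill_v4mapped_pktinfo(struct msghdr *msg)
{
	static union {		/* static so it outlives the call; sized/aligned for one cmsg */
		char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
		struct cmsghdr align;
	} u;
	struct in6_pktinfo info;
	struct cmsghdr *cm;

	memset(&info, 0, sizeof(info));
	info.ipi6_ifindex = 2;				/* hypothetical ifindex */
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &info.ipi6_addr);

	msg->msg_control = u.buf;
	msg->msg_controllen = sizeof(u.buf);
	cm = CMSG_FIRSTHDR(msg);
	cm->cmsg_level = SOL_IPV6;
	cm->cmsg_type = IPV6_PKTINFO;
	cm->cmsg_len = CMSG_LEN(sizeof(info));
	memcpy(CMSG_DATA(cm), &info, sizeof(info));
}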
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a82a22d8f77f..e77381d1df9a 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -235,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 {
 	unsigned int h;
 	__be32 remote;
+	__be32 i_key = parms->i_key;
 
 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 		remote = parms->iph.daddr;
 	else
 		remote = 0;
 
-	h = ip_tunnel_hash(parms->i_key, remote);
+	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+		i_key = 0;
+
+	h = ip_tunnel_hash(i_key, remote);
 	return &itn->tunnels[h];
 }
 
@@ -398,7 +402,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	fbt = netdev_priv(itn->fb_tunnel_dev);
 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 	if (IS_ERR(dev))
-		return NULL;
+		return ERR_CAST(dev);
 
 	dev->mtu = ip_tunnel_bind_dev(dev);
 
@@ -748,9 +752,13 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 
 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 
-		if (!t && (cmd == SIOCADDTUNNEL))
+		if (!t && (cmd == SIOCADDTUNNEL)) {
 			t = ip_tunnel_create(net, itn, p);
-
+			if (IS_ERR(t)) {
+				err = PTR_ERR(t);
+				break;
+			}
+		}
 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 			if (t != NULL) {
 				if (t->dev != dev) {
@@ -777,8 +785,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 		if (t) {
 			err = 0;
 			ip_tunnel_update(itn, t, dev, p, true);
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		} else {
+			err = -ENOENT;
+		}
 		break;
 
 	case SIOCDELTUNNEL:
@@ -993,19 +1002,13 @@ int ip_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	int i, err;
+	int err;
 
 	dev->destructor	= ip_tunnel_dev_free;
-	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	for_each_possible_cpu(i) {
-		struct pcpu_sw_netstats *ipt_stats;
-		ipt_stats = per_cpu_ptr(dev->tstats, i);
-		u64_stats_init(&ipt_stats->syncp);
-	}
-
 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
 	if (!tunnel->dst_cache) {
 		free_percpu(dev->tstats);
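netdev_alloc_pcpu_stats() folds the allocation plus the per-CPU u64_stats_init() loop that the hunk above deletes into one helper. Roughly equivalent open-coded form, reconstructed from the removed loop rather than copied from the header (so a sketch, not the macro's exact expansion):

static struct pcpu_sw_netstats __percpu *my_alloc_tstats(void)
{
	struct pcpu_sw_netstats __percpu *stats;
	int cpu;

	stats = alloc_percpu(struct pcpu_sw_netstats);
	if (!stats)
		return NULL;

	for_each_possible_cpu(cpu) {
		struct pcpu_sw_netstats *s = per_cpu_ptr(stats, cpu);

		u64_stats_init(&s->syncp);	/* give each syncp its lockdep key */
	}

	return stats;
}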
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 8d69626f2206..e0c2b1d2ea4e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -162,12 +162,12 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 		unsigned int start;
 
 		do {
-			start = u64_stats_fetch_begin_bh(&tstats->syncp);
+			start = u64_stats_fetch_begin_irq(&tstats->syncp);
 			rx_packets = tstats->rx_packets;
 			tx_packets = tstats->tx_packets;
 			rx_bytes = tstats->rx_bytes;
 			tx_bytes = tstats->tx_bytes;
-		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
 
 		tot->rx_packets += rx_packets;
 		tot->tx_packets += tx_packets;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 48eafae51769..687ddef4e574 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <linux/icmpv6.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -49,8 +50,8 @@ static struct rtnl_link_ops vti_link_ops __read_mostly;
 static int vti_net_id __read_mostly;
 static int vti_tunnel_init(struct net_device *dev);
 
-/* We dont digest the packet therefore let the packet pass */
-static int vti_rcv(struct sk_buff *skb)
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+		     int encap_type)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
@@ -60,79 +61,120 @@ static int vti_rcv(struct sk_buff *skb)
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
-		struct pcpu_sw_netstats *tstats;
-		u32 oldmark = skb->mark;
-		int ret;
-
-
-		/* temporarily mark the skb with the tunnel o_key, to
-		 * only match policies with this mark.
-		 */
-		skb->mark = be32_to_cpu(tunnel->parms.o_key);
-		ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb);
-		skb->mark = oldmark;
-		if (!ret)
-			return -1;
-
-		tstats = this_cpu_ptr(tunnel->dev->tstats);
-		u64_stats_update_begin(&tstats->syncp);
-		tstats->rx_packets++;
-		tstats->rx_bytes += skb->len;
-		u64_stats_update_end(&tstats->syncp);
-
-		secpath_reset(skb);
-		skb->dev = tunnel->dev;
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+
+		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+		skb->mark = be32_to_cpu(tunnel->parms.i_key);
+
+		return xfrm_input(skb, nexthdr, spi, encap_type);
+	}
+
+	return -EINVAL;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+static int vti_rcv(struct sk_buff *skb)
+{
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+}
+
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+	unsigned short family;
+	struct net_device *dev;
+	struct pcpu_sw_netstats *tstats;
+	struct xfrm_state *x;
+	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+
+	if (!tunnel)
 		return 1;
+
+	dev = tunnel->dev;
+
+	if (err) {
+		dev->stats.rx_errors++;
+		dev->stats.rx_dropped++;
+
+		return 0;
 	}
 
-	return -1;
+	x = xfrm_input_state(skb);
+	family = x->inner_mode->afinfo->family;
+
+	if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+		return -EPERM;
+
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
+	skb->dev = dev;
+
+	tstats = this_cpu_ptr(dev->tstats);
+
+	u64_stats_update_begin(&tstats->syncp);
+	tstats->rx_packets++;
+	tstats->rx_bytes += skb->len;
+	u64_stats_update_end(&tstats->syncp);
+
+	return 0;
 }
 
-/* This function assumes it is being called from dev_queue_xmit()
- * and that skb is filled properly by that function.
- */
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+	xfrm_address_t *saddr = (xfrm_address_t *)&src;
 
-static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+	/* if there is no transform then this tunnel is not functional.
+	 * Or if the xfrm is not mode tunnel.
+	 */
+	if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+	    x->props.family != AF_INET)
+		return false;
+
+	if (!dst)
+		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
+		return false;
+
+	return true;
+}
+
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+			    struct flowi *fl)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct iphdr  *tiph = &tunnel->parms.iph;
-	u8     tos;
-	struct rtable *rt;		/* Route to the other host */
+	struct ip_tunnel_parm *parms = &tunnel->parms;
+	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *tdev;	/* Device to other host */
-	struct iphdr  *old_iph = ip_hdr(skb);
-	__be32 dst = tiph->daddr;
-	struct flowi4 fl4;
 	int err;
 
-	if (skb->protocol != htons(ETH_P_IP))
-		goto tx_error;
-
-	tos = old_iph->tos;
-
-	memset(&fl4, 0, sizeof(fl4));
-	flowi4_init_output(&fl4, tunnel->parms.link,
-			   be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos),
-			   RT_SCOPE_UNIVERSE,
-			   IPPROTO_IPIP, 0,
-			   dst, tiph->saddr, 0, 0);
-	rt = ip_route_output_key(dev_net(dev), &fl4);
-	if (IS_ERR(rt)) {
+	if (!dst) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+
+	dst_hold(dst);
+	dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+	if (IS_ERR(dst)) {
 		dev->stats.tx_carrier_errors++;
 		goto tx_error_icmp;
 	}
-	/* if there is no transform then this tunnel is not functional.
-	 * Or if the xfrm is not mode tunnel.
-	 */
-	if (!rt->dst.xfrm ||
-	    rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
+
+	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
 		dev->stats.tx_carrier_errors++;
-		ip_rt_put(rt);
+		dst_release(dst);
 		goto tx_error_icmp;
 	}
-	tdev = rt->dst.dev;
+
+	tdev = dst->dev;
 
 	if (tdev == dev) {
-		ip_rt_put(rt);
+		dst_release(dst);
 		dev->stats.collisions++;
 		goto tx_error;
 	}
@@ -146,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		tunnel->err_count = 0;
 	}
 
-	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-	nf_reset(skb);
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+	skb_dst_set(skb, dst);
 	skb->dev = skb_dst(skb)->dev;
 
 	err = dst_output(skb);
@@ -166,6 +206,95 @@ tx_error:
 	return NETDEV_TX_OK;
 }
 
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+
+	skb->mark = be32_to_cpu(tunnel->parms.o_key);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		xfrm_decode_session(skb, &fl, AF_INET);
+		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+		break;
+	case htons(ETH_P_IPV6):
+		xfrm_decode_session(skb, &fl, AF_INET6);
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+		break;
+	default:
+		dev->stats.tx_errors++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return vti_xmit(skb, dev, &fl);
+}
+
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+	__be32 spi;
+	struct xfrm_state *x;
+	struct ip_tunnel *tunnel;
+	struct ip_esp_hdr *esph;
+	struct ip_auth_hdr *ah ;
+	struct ip_comp_hdr *ipch;
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	int protocol = iph->protocol;
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->daddr, iph->saddr, 0);
+	if (!tunnel)
+		return -1;
+
+	switch (protocol) {
+	case IPPROTO_ESP:
+		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = esph->spi;
+		break;
+	case IPPROTO_AH:
+		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+		spi = ah->spi;
+		break;
+	case IPPROTO_COMP:
+		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = htonl(ntohs(ipch->cpi));
+		break;
+	default:
+		return 0;
+	}
+
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+	case ICMP_REDIRECT:
+		break;
+	default:
+		return 0;
+	}
+
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, protocol, AF_INET);
+	if (!x)
+		return 0;
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, protocol, 0);
+	xfrm_state_put(x);
+
+	return 0;
+}
+
 static int
 vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
@@ -181,12 +310,13 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return -EINVAL;
 	}
 
+	p.i_flags |= VTI_ISVTI;
 	err = ip_tunnel_ioctl(dev, &p, cmd);
 	if (err)
 		return err;
 
 	if (cmd != SIOCDELTUNNEL) {
-		p.i_flags |= GRE_KEY | VTI_ISVTI;
+		p.i_flags |= GRE_KEY;
 		p.o_flags |= GRE_KEY;
 	}
 
@@ -224,7 +354,6 @@ static int vti_tunnel_init(struct net_device *dev)
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
-	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 
@@ -241,9 +370,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 	iph->ihl		= 5;
 }
 
-static struct xfrm_tunnel_notifier vti_handler __read_mostly = {
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
 	.handler	=	vti_rcv,
-	.priority	=	1,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+	.handler	=	vti_rcv,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+	.handler	=	vti_rcv,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
 };
 
 static int __net_init vti_init_net(struct net *net)
@@ -287,6 +435,8 @@ static void vti_netlink_parms(struct nlattr *data[],
 	if (!data)
 		return;
 
+	parms->i_flags = VTI_ISVTI;
+
 	if (data[IFLA_VTI_LINK])
 		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
 
@@ -382,10 +532,31 @@ static int __init vti_init(void)
 	err = register_pernet_device(&vti_net_ops);
 	if (err < 0)
 		return err;
-	err = xfrm4_mode_tunnel_input_register(&vti_handler);
+	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+	if (err < 0) {
+		unregister_pernet_device(&vti_net_ops);
+		pr_info("vti init: can't register tunnel\n");
+
+		return err;
+	}
+
+	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+	if (err < 0) {
+		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+		unregister_pernet_device(&vti_net_ops);
+		pr_info("vti init: can't register tunnel\n");
+
+		return err;
+	}
+
+	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
 	if (err < 0) {
+		xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
 		unregister_pernet_device(&vti_net_ops);
 		pr_info("vti init: can't register tunnel\n");
+
+		return err;
 	}
 
 	err = rtnl_link_register(&vti_link_ops);
@@ -395,7 +566,9 @@ static int __init vti_init(void)
 	return err;
 
 rtnl_link_failed:
-	xfrm4_mode_tunnel_input_deregister(&vti_handler);
+	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
 	unregister_pernet_device(&vti_net_ops);
 	return err;
 }
@@ -403,8 +576,13 @@ rtnl_link_failed:
 static void __exit vti_fini(void)
 {
 	rtnl_link_unregister(&vti_link_ops);
-	if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
+	if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP))
 		pr_info("vti close: can't deregister tunnel\n");
+	if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))
+		pr_info("vti close: can't deregister tunnel\n");
+	if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
+		pr_info("vti close: can't deregister tunnel\n");
+
 
 	unregister_pernet_device(&vti_net_ops);
 }
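The rewritten vti receive path splits responsibilities: vti_rcv()/vti_input() claim the packet for a matching tunnel, stash the tunnel in XFRM_TUNNEL_SKB_CB, set skb->mark from i_key (the old code marked inbound packets with o_key) and hand off to xfrm_input(); vti_rcv_cb() then runs after the transform to do the policy check, netns scrubbing and statistics. Registering at priority 100 puts the vti handlers ahead of the plain ESP/AH/IPComp ones at priority 0, and vti_input() returning -EINVAL when no tunnel matches is what lets the dispatcher fall through. In pseudocode (the real loop lives in the new xfrm4_protocol.c, which this page does not show; for_each_handler is an invented name):

/* pseudocode sketch, not the actual xfrm4_protocol.c implementation */
static int xfrm4_esp_rcv_sketch(struct sk_buff *skb)
{
	struct xfrm4_protocol *handler;
	int ret;

	for_each_handler(esp4_handlers, handler) {	/* sorted: vti (100) before esp4 (0) */
		ret = handler->handler(skb);		/* vti_rcv() or xfrm4_rcv() */
		if (ret != -EINVAL)
			return ret;			/* this handler claimed the packet */
	}

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
	kfree_skb(skb);
	return 0;
}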
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 826be4cb482a..c0855d50a3fa 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -23,7 +23,7 @@
 #include <net/protocol.h>
 #include <net/sock.h>
 
-static void ipcomp4_err(struct sk_buff *skb, u32 info)
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	__be32 spi;
@@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	spi = htonl(ntohs(ipch->cpi));
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      spi, IPPROTO_COMP, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 /* We always hold one tunnel user reference to indicate a tunnel */
@@ -147,6 +149,11 @@ out:
 	return err;
 }
 
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
+
 static const struct xfrm_type ipcomp_type = {
 	.description	= "IPCOMP4",
 	.owner		= THIS_MODULE,
@@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = {
 	.output		= ipcomp_output
 };
 
-static const struct net_protocol ipcomp4_protocol = {
+static struct xfrm4_protocol ipcomp4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	ipcomp4_rcv_cb,
 	.err_handler	=	ipcomp4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init ipcomp4_init(void)
@@ -170,7 +178,7 @@ static int __init ipcomp4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+	if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&ipcomp_type, AF_INET);
 		return -EAGAIN;
@@ -180,7 +188,7 @@ static int __init ipcomp4_init(void)
 
 static void __exit ipcomp4_fini(void)
 {
-	if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3e0adea9c27..7ebd6e37875c 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
 		skb_dst_set(skb, NULL);
 		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
 		if (IS_ERR(dst))
-			return PTR_ERR(dst);;
+			return PTR_ERR(dst);
 		skb_dst_set(skb, dst);
 	}
 #endif
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 59da7cde0724..f95b6f93814b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
 
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 718dfbd30cbe..99e810f84671 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2d11c094296e..f4b19e5dde54 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -727,7 +727,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 	sock_tx_timestamp(sk, &ipc.tx_flags);
 
 	if (msg->msg_controllen) {
-		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
 		if (err)
 			return err;
 		if (ipc.opt)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a6c8a80ec9d6..ad737fad6d8b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
 	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
 	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
 	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
@@ -280,6 +281,11 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
 	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
 	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index c04518f4850a..a9dbe58bdfe7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -524,7 +524,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		ipc.oif = sk->sk_bound_dev_if;
 
 	if (msg->msg_controllen) {
-		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
 		if (err)
 			goto out;
 		if (ipc.opt)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4c011ec69ed4..34d094cadb11 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -139,11 +139,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb);
 static void		ipv4_dst_destroy(struct dst_entry *dst);
 
-static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
-			    int how)
-{
-}
-
 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
 	WARN_ON(1);
@@ -162,7 +157,6 @@ static struct dst_ops ipv4_dst_ops = {
 	.mtu =			ipv4_mtu,
 	.cow_metrics =		ipv4_cow_metrics,
 	.destroy =		ipv4_dst_destroy,
-	.ifdown =		ipv4_dst_ifdown,
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
@@ -194,7 +188,7 @@ const __u8 ip_tos2prio[16] = {
 EXPORT_SYMBOL(ip_tos2prio);
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 
 #ifdef CONFIG_PROC_FS
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
@@ -697,7 +691,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 
 out_unlock:
 	spin_unlock_bh(&fnhe_lock);
-	return;
 }
 
 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
@@ -2475,11 +2468,6 @@ errout_free:
 	goto errout;
 }
 
-int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	return skb->len;
-}
-
 void ip_rt_multicast_event(struct in_device *in_dev)
 {
 	rt_cache_flush(dev_net(in_dev->dev));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 97c8f5620c43..4bd6d52eeffb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
-	tp->mdev = TCP_TIMEOUT_INIT;
+	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 
 	/* So many TCP implementations out there (incorrectly) count the
 	 * initial SYN frame in their delayed-ACK and congestion control
@@ -2341,7 +2341,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	sk->sk_shutdown = 0;
 	sock_reset_flag(sk, SOCK_DONE);
-	tp->srtt = 0;
+	tp->srtt_us = 0;
 	if ((tp->write_seq += tp->max_window + 2) == 0)
 		tp->write_seq = 1;
 	icsk->icsk_backoff = 0;
@@ -2785,8 +2785,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
-	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
-	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+	info->tcpi_rtt = tp->srtt_us >> 3;
+	info->tcpi_rttvar = tp->mdev_us >> 2;
 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
 	info->tcpi_snd_cwnd = tp->snd_cwnd;
 	info->tcpi_advmss = tp->advmss;
@@ -2796,6 +2796,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 	info->tcpi_rcv_space = tp->rcvq_space.space;
 
 	info->tcpi_total_retrans = tp->total_retrans;
+
+	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
+					sk->sk_pacing_rate : ~0ULL;
+	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
+					sk->sk_max_pacing_rate : ~0ULL;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
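The tcp.c hunks belong to the series-wide move from jiffies to microseconds for RTT state: tp->srtt becomes tp->srtt_us and tp->mdev becomes tp->mdev_us, while keeping the long-standing Van Jacobson fixed-point scaling — the smoothed RTT is stored left-shifted by 3 (8x) and the mean deviation by 2 (4x), which is why tcp_get_info() can now report microseconds with a bare shift instead of jiffies_to_usecs(). A sketch of the arithmetic:

/* tp->srtt_us holds 8 * srtt in usec; tp->mdev_us holds 4 * mdev in usec. */
static inline u32 tcp_info_rtt_us(u32 srtt_us)
{
	return srtt_us >> 3;		/* matches info->tcpi_rtt above */
}

static inline u32 tcp_info_rttvar_us(u32 mdev_us)
{
	return mdev_us >> 2;		/* matches info->tcpi_rttvar above */
}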
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2388275adb9b..2b9464c93b88 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -361,21 +361,12 @@ u32 tcp_reno_ssthresh(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
-/* Lower bound on congestion window with halving. */
-u32 tcp_reno_min_cwnd(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh/2;
-}
-EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
-
 struct tcp_congestion_ops tcp_reno = {
 	.flags		= TCP_CONG_NON_RESTRICTED,
 	.name		= "reno",
 	.owner		= THIS_MODULE,
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
-	.min_cwnd	= tcp_reno_min_cwnd,
 };
 
 /* Initial congestion control used (until SYN)
@@ -387,6 +378,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = {
387 .owner = THIS_MODULE, 378 .owner = THIS_MODULE,
388 .ssthresh = tcp_reno_ssthresh, 379 .ssthresh = tcp_reno_ssthresh,
389 .cong_avoid = tcp_reno_cong_avoid, 380 .cong_avoid = tcp_reno_cong_avoid,
390 .min_cwnd = tcp_reno_min_cwnd,
391}; 381};
392EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); 382EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
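With PRR handling cwnd reduction in the core, the per-algorithm min_cwnd hook is no longer consulted, so tcp_reno_min_cwnd (ssthresh/2) is deleted and every ops table in the rest of this series loses its .min_cwnd line. A rough userspace mirror of the slimmed-down ops shape, with stand-in types and simplified hooks (not the kernel's tcp_congestion_ops):

    #include <stdio.h>

    /* After this series the core requires only ssthresh() and
     * cong_avoid(); there is no min_cwnd slot to fill. */
    struct cong_ops {
        unsigned int (*ssthresh)(unsigned int cwnd);
        unsigned int (*cong_avoid)(unsigned int cwnd, unsigned int acked);
        const char *name;
    };

    static unsigned int reno_ssthresh(unsigned int cwnd)
    {
        return cwnd / 2 > 2 ? cwnd / 2 : 2;   /* max(cwnd >> 1, 2) */
    }

    static unsigned int reno_cong_avoid(unsigned int cwnd, unsigned int acked)
    {
        return cwnd + acked;                  /* slow-start step, simplified */
    }

    static const struct cong_ops reno = {
        .ssthresh   = reno_ssthresh,
        .cong_avoid = reno_cong_avoid,
        .name       = "reno",
    };

    int main(void)
    {
        printf("%s: ssthresh(10)=%u\n", reno.name, reno.ssthresh(10));
        return 0;
    }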
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 828e4c3ffbaf..8bf224516ba2 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -476,10 +476,6 @@ static int __init cubictcp_register(void)
476 /* divide by bic_scale and by constant Srtt (100ms) */ 476 /* divide by bic_scale and by constant Srtt (100ms) */
477 do_div(cube_factor, bic_scale * 10); 477 do_div(cube_factor, bic_scale * 10);
478 478
479 /* hystart needs ms clock resolution */
480 if (hystart && HZ < 1000)
481 cubictcp.flags |= TCP_CONG_RTT_STAMP;
482
483 return tcp_register_congestion_control(&cubictcp); 479 return tcp_register_congestion_control(&cubictcp);
484} 480}
485 481
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8ed9305dfdf4..8b9e7bad77c0 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
162 .init = hstcp_init, 162 .init = hstcp_init,
163 .ssthresh = hstcp_ssthresh, 163 .ssthresh = hstcp_ssthresh,
164 .cong_avoid = hstcp_cong_avoid, 164 .cong_avoid = hstcp_cong_avoid,
165 .min_cwnd = tcp_reno_min_cwnd,
166 165
167 .owner = THIS_MODULE, 166 .owner = THIS_MODULE,
168 .name = "highspeed" 167 .name = "highspeed"
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 478fe82611bf..a15a799bf768 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -21,7 +21,7 @@ struct hybla {
21 u32 rho2; /* Rho * Rho, integer part */ 21 u32 rho2; /* Rho * Rho, integer part */
22 u32 rho_3ls; /* Rho parameter, <<3 */ 22 u32 rho_3ls; /* Rho parameter, <<3 */
23 u32 rho2_7ls; /* Rho^2, <<7 */ 23 u32 rho2_7ls; /* Rho^2, <<7 */
24 u32 minrtt; /* Minimum smoothed round trip time value seen */ 24 u32 minrtt_us; /* Minimum smoothed round trip time value seen */
25}; 25};
26 26
27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ 27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
@@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk)
35{ 35{
36 struct hybla *ca = inet_csk_ca(sk); 36 struct hybla *ca = inet_csk_ca(sk);
37 37
38 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); 38 ca->rho_3ls = max_t(u32,
39 tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
40 8U);
39 ca->rho = ca->rho_3ls >> 3; 41 ca->rho = ca->rho_3ls >> 3;
40 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; 42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
41 ca->rho2 = ca->rho2_7ls >> 7; 43 ca->rho2 = ca->rho2_7ls >> 7;
@@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk)
59 hybla_recalc_param(sk); 61 hybla_recalc_param(sk);
60 62
61 /* set minimum rtt as this is the 1st ever seen */ 63 /* set minimum rtt as this is the 1st ever seen */
62 ca->minrtt = tp->srtt; 64 ca->minrtt_us = tp->srtt_us;
63 tp->snd_cwnd = ca->rho; 65 tp->snd_cwnd = ca->rho;
64} 66}
65 67
@@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
94 int is_slowstart = 0; 96 int is_slowstart = 0;
95 97
96 /* Recalculate rho only if this srtt is the lowest */ 98 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){ 99 if (tp->srtt_us < ca->minrtt_us) {
98 hybla_recalc_param(sk); 100 hybla_recalc_param(sk);
99 ca->minrtt = tp->srtt; 101 ca->minrtt_us = tp->srtt_us;
100 } 102 }
101 103
102 if (!tcp_is_cwnd_limited(sk, in_flight)) 104 if (!tcp_is_cwnd_limited(sk, in_flight))
@@ -166,7 +168,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
166static struct tcp_congestion_ops tcp_hybla __read_mostly = { 168static struct tcp_congestion_ops tcp_hybla __read_mostly = {
167 .init = hybla_init, 169 .init = hybla_init,
168 .ssthresh = tcp_reno_ssthresh, 170 .ssthresh = tcp_reno_ssthresh,
169 .min_cwnd = tcp_reno_min_cwnd,
170 .cong_avoid = hybla_cong_avoid, 171 .cong_avoid = hybla_cong_avoid,
171 .set_state = hybla_state, 172 .set_state = hybla_state,
172 173
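The Hybla conversion above keeps the <<3 fixed point: srtt_us carries the smoothed RTT <<3 in microseconds, and rtt0 is in milliseconds, so the divisor becomes rtt0 * USEC_PER_MSEC; the floor of 8 keeps rho >= 1. A small self-contained check of that arithmetic (userspace stand-ins, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define USEC_PER_MSEC 1000U
    static const unsigned int rtt0 = 25;  /* reference RTT in ms (module default) */

    /* rho in <<3 fixed point: srtt_us is already <<3, so dividing by
     * rtt0 in usec preserves the scaling; floor 8 means rho >= 1. */
    static uint32_t hybla_rho_3ls(uint32_t srtt_us)
    {
        uint32_t r = srtt_us / (rtt0 * USEC_PER_MSEC);
        return r > 8 ? r : 8;
    }

    int main(void)
    {
        /* srtt = 100 ms -> srtt_us field = 100000 << 3 -> rho = 100/25 = 4 */
        uint32_t r3 = hybla_rho_3ls(100000 << 3);
        printf("rho_3ls=%u rho=%u\n", r3, r3 >> 3);
        return 0;
    }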
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index e498a62b8f97..863d105e3015 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -325,10 +325,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
325} 325}
326 326
327static struct tcp_congestion_ops tcp_illinois __read_mostly = { 327static struct tcp_congestion_ops tcp_illinois __read_mostly = {
328 .flags = TCP_CONG_RTT_STAMP,
329 .init = tcp_illinois_init, 328 .init = tcp_illinois_init,
330 .ssthresh = tcp_illinois_ssthresh, 329 .ssthresh = tcp_illinois_ssthresh,
331 .min_cwnd = tcp_reno_min_cwnd,
332 .cong_avoid = tcp_illinois_cong_avoid, 330 .cong_avoid = tcp_illinois_cong_avoid,
333 .set_state = tcp_illinois_state, 331 .set_state = tcp_illinois_state,
334 .get_info = tcp_illinois_info, 332 .get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eeaac399420d..e1661f46fd19 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -667,11 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
667 * To save cycles in the RFC 1323 implementation it was better to break 667 * To save cycles in the RFC 1323 implementation it was better to break
668 * it up into three procedures. -- erics 668 * it up into three procedures. -- erics
669 */ 669 */
670static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) 670static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
671{ 671{
672 struct tcp_sock *tp = tcp_sk(sk); 672 struct tcp_sock *tp = tcp_sk(sk);
673 long m = mrtt; /* RTT */ 673 long m = mrtt_us; /* RTT */
674 u32 srtt = tp->srtt; 674 u32 srtt = tp->srtt_us;
675 675
676 /* The following amusing code comes from Jacobson's 676 /* The following amusing code comes from Jacobson's
677 * article in SIGCOMM '88. Note that rtt and mdev 677 * article in SIGCOMM '88. Note that rtt and mdev
@@ -694,7 +694,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
694 srtt += m; /* rtt = 7/8 rtt + 1/8 new */ 694 srtt += m; /* rtt = 7/8 rtt + 1/8 new */
695 if (m < 0) { 695 if (m < 0) {
696 m = -m; /* m is now abs(error) */ 696 m = -m; /* m is now abs(error) */
697 m -= (tp->mdev >> 2); /* similar update on mdev */ 697 m -= (tp->mdev_us >> 2); /* similar update on mdev */
698 /* This is similar to one of Eifel findings. 698 /* This is similar to one of Eifel findings.
699 * Eifel blocks mdev updates when rtt decreases. 699 * Eifel blocks mdev updates when rtt decreases.
700 * This solution is a bit different: we use finer gain 700 * This solution is a bit different: we use finer gain
@@ -706,28 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
706 if (m > 0) 706 if (m > 0)
707 m >>= 3; 707 m >>= 3;
708 } else { 708 } else {
709 m -= (tp->mdev >> 2); /* similar update on mdev */ 709 m -= (tp->mdev_us >> 2); /* similar update on mdev */
710 } 710 }
711 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 711 tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
712 if (tp->mdev > tp->mdev_max) { 712 if (tp->mdev_us > tp->mdev_max_us) {
713 tp->mdev_max = tp->mdev; 713 tp->mdev_max_us = tp->mdev_us;
714 if (tp->mdev_max > tp->rttvar) 714 if (tp->mdev_max_us > tp->rttvar_us)
715 tp->rttvar = tp->mdev_max; 715 tp->rttvar_us = tp->mdev_max_us;
716 } 716 }
717 if (after(tp->snd_una, tp->rtt_seq)) { 717 if (after(tp->snd_una, tp->rtt_seq)) {
718 if (tp->mdev_max < tp->rttvar) 718 if (tp->mdev_max_us < tp->rttvar_us)
719 tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; 719 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
720 tp->rtt_seq = tp->snd_nxt; 720 tp->rtt_seq = tp->snd_nxt;
721 tp->mdev_max = tcp_rto_min(sk); 721 tp->mdev_max_us = tcp_rto_min_us(sk);
722 } 722 }
723 } else { 723 } else {
724 /* no previous measure. */ 724 /* no previous measure. */
725 srtt = m << 3; /* take the measured time to be rtt */ 725 srtt = m << 3; /* take the measured time to be rtt */
726 tp->mdev = m << 1; /* make sure rto = 3*rtt */ 726 tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
727 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 727 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
728 tp->mdev_max_us = tp->rttvar_us;
728 tp->rtt_seq = tp->snd_nxt; 729 tp->rtt_seq = tp->snd_nxt;
729 } 730 }
730 tp->srtt = max(1U, srtt); 731 tp->srtt_us = max(1U, srtt);
731} 732}
732 733
733/* Set the sk_pacing_rate to allow proper sizing of TSO packets. 734/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@@ -742,20 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
742 u64 rate; 743 u64 rate;
743 744
744 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ 745 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
745 rate = (u64)tp->mss_cache * 2 * (HZ << 3); 746 rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
746 747
747 rate *= max(tp->snd_cwnd, tp->packets_out); 748 rate *= max(tp->snd_cwnd, tp->packets_out);
748 749
749 /* Correction for small srtt and scheduling constraints. 750 if (likely(tp->srtt_us))
750 * For small rtt, consider noise is too high, and use 751 do_div(rate, tp->srtt_us);
751 * the minimal value (srtt = 1 -> 125 us for HZ=1000)
752 *
753 * We probably need usec resolution in the future.
754 * Note: This also takes care of possible srtt=0 case,
755 * when tcp_rtt_estimator() was not yet called.
756 */
757 if (tp->srtt > 8 + 2)
758 do_div(rate, tp->srtt);
759 752
760 /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate 753 /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
761 * without any lock. We want to make sure compiler wont store 754 * without any lock. We want to make sure compiler wont store
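With srtt in usec the pacing-rate correction for coarse jiffies resolution can go away: the rate is simply 200% of mss * cwnd / srtt, and since srtt_us carries the value <<3, the numerator uses USEC_PER_SEC << 3 to cancel the shift. An illustrative userspace version of the computation:

    #include <stdint.h>
    #include <stdio.h>

    #define USEC_PER_SEC 1000000ULL

    /* 200% of the current rate; srtt_us is the smoothed RTT << 3 in
     * usec, hence the << 3 in the numerator.  Sketch only. */
    static uint64_t pacing_rate(uint32_t mss, uint32_t cwnd,
                                uint32_t packets_out,
                                uint32_t srtt_us /* << 3 */)
    {
        uint64_t rate = (uint64_t)mss * 2 * (USEC_PER_SEC << 3);

        rate *= cwnd > packets_out ? cwnd : packets_out;
        if (srtt_us)        /* no sample yet: leave the rate unscaled */
            rate /= srtt_us;
        return rate;        /* bytes per second */
    }

    int main(void)
    {
        /* mss 1448, cwnd 10, srtt 100 ms: 2 * 10 * 1448 / 0.1 s = 289600 B/s */
        printf("%llu\n",
               (unsigned long long)pacing_rate(1448, 10, 5, 100000 << 3));
        return 0;
    }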
@@ -1122,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1122} 1115}
1123 1116
1124struct tcp_sacktag_state { 1117struct tcp_sacktag_state {
1125 int reord; 1118 int reord;
1126 int fack_count; 1119 int fack_count;
1127 int flag; 1120 long rtt_us; /* RTT measured by SACKing never-retransmitted data */
1128 s32 rtt; /* RTT measured by SACKing never-retransmitted data */ 1121 int flag;
1129}; 1122};
1130 1123
1131/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1124/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1186,7 +1179,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1186static u8 tcp_sacktag_one(struct sock *sk, 1179static u8 tcp_sacktag_one(struct sock *sk,
1187 struct tcp_sacktag_state *state, u8 sacked, 1180 struct tcp_sacktag_state *state, u8 sacked,
1188 u32 start_seq, u32 end_seq, 1181 u32 start_seq, u32 end_seq,
1189 int dup_sack, int pcount, u32 xmit_time) 1182 int dup_sack, int pcount,
1183 const struct skb_mstamp *xmit_time)
1190{ 1184{
1191 struct tcp_sock *tp = tcp_sk(sk); 1185 struct tcp_sock *tp = tcp_sk(sk);
1192 int fack_count = state->fack_count; 1186 int fack_count = state->fack_count;
@@ -1227,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
1227 if (!after(end_seq, tp->high_seq)) 1221 if (!after(end_seq, tp->high_seq))
1228 state->flag |= FLAG_ORIG_SACK_ACKED; 1222 state->flag |= FLAG_ORIG_SACK_ACKED;
1229 /* Pick the earliest sequence sacked for RTT */ 1223 /* Pick the earliest sequence sacked for RTT */
1230 if (state->rtt < 0) 1224 if (state->rtt_us < 0) {
1231 state->rtt = tcp_time_stamp - xmit_time; 1225 struct skb_mstamp now;
1226
1227 skb_mstamp_get(&now);
1228 state->rtt_us = skb_mstamp_us_delta(&now,
1229 xmit_time);
1230 }
1232 } 1231 }
1233 1232
1234 if (sacked & TCPCB_LOST) { 1233 if (sacked & TCPCB_LOST) {
@@ -1287,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1287 */ 1286 */
1288 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1287 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1289 start_seq, end_seq, dup_sack, pcount, 1288 start_seq, end_seq, dup_sack, pcount,
1290 TCP_SKB_CB(skb)->when); 1289 &skb->skb_mstamp);
1291 1290
1292 if (skb == tp->lost_skb_hint) 1291 if (skb == tp->lost_skb_hint)
1293 tp->lost_cnt_hint += pcount; 1292 tp->lost_cnt_hint += pcount;
@@ -1565,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1565 TCP_SKB_CB(skb)->end_seq, 1564 TCP_SKB_CB(skb)->end_seq,
1566 dup_sack, 1565 dup_sack,
1567 tcp_skb_pcount(skb), 1566 tcp_skb_pcount(skb),
1568 TCP_SKB_CB(skb)->when); 1567 &skb->skb_mstamp);
1569 1568
1570 if (!before(TCP_SKB_CB(skb)->seq, 1569 if (!before(TCP_SKB_CB(skb)->seq,
1571 tcp_highest_sack_seq(tp))) 1570 tcp_highest_sack_seq(tp)))
@@ -1622,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
1622 1621
1623static int 1622static int
1624tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1623tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1625 u32 prior_snd_una, s32 *sack_rtt) 1624 u32 prior_snd_una, long *sack_rtt_us)
1626{ 1625{
1627 struct tcp_sock *tp = tcp_sk(sk); 1626 struct tcp_sock *tp = tcp_sk(sk);
1628 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1627 const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1640,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1640 1639
1641 state.flag = 0; 1640 state.flag = 0;
1642 state.reord = tp->packets_out; 1641 state.reord = tp->packets_out;
1643 state.rtt = -1; 1642 state.rtt_us = -1L;
1644 1643
1645 if (!tp->sacked_out) { 1644 if (!tp->sacked_out) {
1646 if (WARN_ON(tp->fackets_out)) 1645 if (WARN_ON(tp->fackets_out))
@@ -1824,7 +1823,7 @@ out:
1824 WARN_ON((int)tp->retrans_out < 0); 1823 WARN_ON((int)tp->retrans_out < 0);
1825 WARN_ON((int)tcp_packets_in_flight(tp) < 0); 1824 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1826#endif 1825#endif
1827 *sack_rtt = state.rtt; 1826 *sack_rtt_us = state.rtt_us;
1828 return state.flag; 1827 return state.flag;
1829} 1828}
1830 1829
@@ -2035,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2035 * available, or RTO is scheduled to fire first. 2034 * available, or RTO is scheduled to fire first.
2036 */ 2035 */
2037 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || 2036 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2038 (flag & FLAG_ECE) || !tp->srtt) 2037 (flag & FLAG_ECE) || !tp->srtt_us)
2039 return false; 2038 return false;
2040 2039
2041 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); 2040 delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
2041 msecs_to_jiffies(2));
2042
2042 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) 2043 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2043 return false; 2044 return false;
2044 2045
@@ -2885,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2885} 2886}
2886 2887
2887static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2888static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2888 s32 seq_rtt, s32 sack_rtt) 2889 long seq_rtt_us, long sack_rtt_us)
2889{ 2890{
2890 const struct tcp_sock *tp = tcp_sk(sk); 2891 const struct tcp_sock *tp = tcp_sk(sk);
2891 2892
@@ -2895,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2895 * is acked (RFC6298). 2896 * is acked (RFC6298).
2896 */ 2897 */
2897 if (flag & FLAG_RETRANS_DATA_ACKED) 2898 if (flag & FLAG_RETRANS_DATA_ACKED)
2898 seq_rtt = -1; 2899 seq_rtt_us = -1L;
2899 2900
2900 if (seq_rtt < 0) 2901 if (seq_rtt_us < 0)
2901 seq_rtt = sack_rtt; 2902 seq_rtt_us = sack_rtt_us;
2902 2903
2903 /* RTTM Rule: A TSecr value received in a segment is used to 2904 /* RTTM Rule: A TSecr value received in a segment is used to
2904 * update the averaged RTT measurement only if the segment 2905 * update the averaged RTT measurement only if the segment
@@ -2906,14 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2906 * left edge of the send window. 2907 * left edge of the send window.
2907 * See draft-ietf-tcplw-high-performance-00, section 3.3. 2908 * See draft-ietf-tcplw-high-performance-00, section 3.3.
2908 */ 2909 */
2909 if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2910 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2910 flag & FLAG_ACKED) 2911 flag & FLAG_ACKED)
2911 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 2912 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2912 2913
2913 if (seq_rtt < 0) 2914 if (seq_rtt_us < 0)
2914 return false; 2915 return false;
2915 2916
2916 tcp_rtt_estimator(sk, seq_rtt); 2917 tcp_rtt_estimator(sk, seq_rtt_us);
2917 tcp_set_rto(sk); 2918 tcp_set_rto(sk);
2918 2919
2919 /* RFC6298: only reset backoff on valid RTT measurement. */ 2920 /* RFC6298: only reset backoff on valid RTT measurement. */
@@ -2925,16 +2926,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2925static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) 2926static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
2926{ 2927{
2927 struct tcp_sock *tp = tcp_sk(sk); 2928 struct tcp_sock *tp = tcp_sk(sk);
2928 s32 seq_rtt = -1; 2929 long seq_rtt_us = -1L;
2929 2930
2930 if (synack_stamp && !tp->total_retrans) 2931 if (synack_stamp && !tp->total_retrans)
2931 seq_rtt = tcp_time_stamp - synack_stamp; 2932 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
2932 2933
2933 /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets 2934 /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
2934 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() 2935 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
2935 */ 2936 */
2936 if (!tp->srtt) 2937 if (!tp->srtt_us)
2937 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); 2938 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
2938} 2939}
2939 2940
2940static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 2941static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
@@ -3023,26 +3024,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3023 * arrived at the other end. 3024 * arrived at the other end.
3024 */ 3025 */
3025static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 3026static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3026 u32 prior_snd_una, s32 sack_rtt) 3027 u32 prior_snd_una, long sack_rtt_us)
3027{ 3028{
3028 struct tcp_sock *tp = tcp_sk(sk);
3029 const struct inet_connection_sock *icsk = inet_csk(sk); 3029 const struct inet_connection_sock *icsk = inet_csk(sk);
3030 struct sk_buff *skb; 3030 struct skb_mstamp first_ackt, last_ackt, now;
3031 u32 now = tcp_time_stamp; 3031 struct tcp_sock *tp = tcp_sk(sk);
3032 u32 prior_sacked = tp->sacked_out;
3033 u32 reord = tp->packets_out;
3032 bool fully_acked = true; 3034 bool fully_acked = true;
3033 int flag = 0; 3035 long ca_seq_rtt_us = -1L;
3036 long seq_rtt_us = -1L;
3037 struct sk_buff *skb;
3034 u32 pkts_acked = 0; 3038 u32 pkts_acked = 0;
3035 u32 reord = tp->packets_out;
3036 u32 prior_sacked = tp->sacked_out;
3037 s32 seq_rtt = -1;
3038 s32 ca_seq_rtt = -1;
3039 ktime_t last_ackt = net_invalid_timestamp();
3040 bool rtt_update; 3039 bool rtt_update;
3040 int flag = 0;
3041
3042 first_ackt.v64 = 0;
3041 3043
3042 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { 3044 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3043 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 3045 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3044 u32 acked_pcount;
3045 u8 sacked = scb->sacked; 3046 u8 sacked = scb->sacked;
3047 u32 acked_pcount;
3046 3048
3047 /* Determine how many packets and what bytes were acked, tso and else */ 3049 /* Determine how many packets and what bytes were acked, tso and else */
3048 if (after(scb->end_seq, tp->snd_una)) { 3050 if (after(scb->end_seq, tp->snd_una)) {
@@ -3064,11 +3066,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3064 tp->retrans_out -= acked_pcount; 3066 tp->retrans_out -= acked_pcount;
3065 flag |= FLAG_RETRANS_DATA_ACKED; 3067 flag |= FLAG_RETRANS_DATA_ACKED;
3066 } else { 3068 } else {
3067 ca_seq_rtt = now - scb->when; 3069 last_ackt = skb->skb_mstamp;
3068 last_ackt = skb->tstamp; 3070 WARN_ON_ONCE(last_ackt.v64 == 0);
3069 if (seq_rtt < 0) { 3071 if (!first_ackt.v64)
3070 seq_rtt = ca_seq_rtt; 3072 first_ackt = last_ackt;
3071 } 3073
3072 if (!(sacked & TCPCB_SACKED_ACKED)) 3074 if (!(sacked & TCPCB_SACKED_ACKED))
3073 reord = min(pkts_acked, reord); 3075 reord = min(pkts_acked, reord);
3074 if (!after(scb->end_seq, tp->high_seq)) 3076 if (!after(scb->end_seq, tp->high_seq))
@@ -3114,7 +3116,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3114 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3116 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3115 flag |= FLAG_SACK_RENEGING; 3117 flag |= FLAG_SACK_RENEGING;
3116 3118
3117 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); 3119 skb_mstamp_get(&now);
3120 if (first_ackt.v64) {
3121 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
3122 ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
3123 }
3124
3125 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
3118 3126
3119 if (flag & FLAG_ACKED) { 3127 if (flag & FLAG_ACKED) {
3120 const struct tcp_congestion_ops *ca_ops 3128 const struct tcp_congestion_ops *ca_ops
@@ -3142,25 +3150,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3142 3150
3143 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3151 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3144 3152
3145 if (ca_ops->pkts_acked) { 3153 if (ca_ops->pkts_acked)
3146 s32 rtt_us = -1; 3154 ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
3147
3148 /* Is the ACK triggering packet unambiguous? */
3149 if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
3150 /* High resolution needed and available? */
3151 if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
3152 !ktime_equal(last_ackt,
3153 net_invalid_timestamp()))
3154 rtt_us = ktime_us_delta(ktime_get_real(),
3155 last_ackt);
3156 else if (ca_seq_rtt >= 0)
3157 rtt_us = jiffies_to_usecs(ca_seq_rtt);
3158 }
3159 3155
3160 ca_ops->pkts_acked(sk, pkts_acked, rtt_us); 3156 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3161 } 3157 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
3162 } else if (skb && rtt_update && sack_rtt >= 0 &&
3163 sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
3164 /* Do not re-arm RTO if the sack RTT is measured from data sent 3158 /* Do not re-arm RTO if the sack RTT is measured from data sent
3165 * after when the head was last (re)transmitted. Otherwise the 3159 * after when the head was last (re)transmitted. Otherwise the
3166 * timeout may continue to extend in loss recovery. 3160 * timeout may continue to extend in loss recovery.
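In the rewritten tcp_clean_rtx_queue above, the loop records the transmit stamp of the first (oldest) and last (newest) newly-acked, never-retransmitted skb; after the loop one skb_mstamp_get supplies "now", from which seq_rtt_us (the conservative sample fed to the RTO estimator) and ca_seq_rtt_us (the freshest sample, handed to the congestion module's pkts_acked) are both derived. A compact sketch of that selection, with plain usec counters as stand-ins for skb_mstamp:

    #include <stdint.h>
    #include <stdio.h>

    struct acked_rtts {
        long seq_rtt_us;     /* now - first_ackt (oldest newly-acked skb) */
        long ca_seq_rtt_us;  /* now - last_ackt (newest newly-acked skb) */
    };

    static struct acked_rtts rtts_from_acks(const uint64_t *xmit_us, int n,
                                            uint64_t now_us)
    {
        struct acked_rtts r = { -1L, -1L };
        uint64_t first = 0, last = 0;   /* 0 plays first_ackt.v64's sentinel role */

        for (int i = 0; i < n; i++) {   /* walk newly acked skbs in order */
            last = xmit_us[i];
            if (!first)
                first = last;
        }
        if (first) {
            r.seq_rtt_us = (long)(now_us - first);
            r.ca_seq_rtt_us = (long)(now_us - last);
        }
        return r;
    }

    int main(void)
    {
        uint64_t sent[] = { 1000, 2000, 3000 };   /* usec send stamps */
        struct acked_rtts r = rtts_from_acks(sent, 3, 10000);

        printf("seq_rtt=%ld ca_seq_rtt=%ld\n", r.seq_rtt_us, r.ca_seq_rtt_us);
        return 0;
    }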
@@ -3370,12 +3364,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3370 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3364 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3371 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3365 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3372 bool is_dupack = false; 3366 bool is_dupack = false;
3373 u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; 3367 u32 prior_in_flight;
3374 u32 prior_fackets; 3368 u32 prior_fackets;
3375 int prior_packets = tp->packets_out; 3369 int prior_packets = tp->packets_out;
3376 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3370 const int prior_unsacked = tp->packets_out - tp->sacked_out;
3377 int acked = 0; /* Number of packets newly acked */ 3371 int acked = 0; /* Number of packets newly acked */
3378 s32 sack_rtt = -1; 3372 long sack_rtt_us = -1L;
3379 3373
3380 /* If the ack is older than previous acks 3374 /* If the ack is older than previous acks
3381 * then we can probably ignore it. 3375 * then we can probably ignore it.
@@ -3433,7 +3427,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3433 3427
3434 if (TCP_SKB_CB(skb)->sacked) 3428 if (TCP_SKB_CB(skb)->sacked)
3435 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3429 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3436 &sack_rtt); 3430 &sack_rtt_us);
3437 3431
3438 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3432 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
3439 flag |= FLAG_ECE; 3433 flag |= FLAG_ECE;
@@ -3452,7 +3446,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3452 3446
3453 /* See if we can take anything off of the retransmit queue. */ 3447 /* See if we can take anything off of the retransmit queue. */
3454 acked = tp->packets_out; 3448 acked = tp->packets_out;
3455 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); 3449 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
3450 sack_rtt_us);
3456 acked -= tp->packets_out; 3451 acked -= tp->packets_out;
3457 3452
3458 /* Advance cwnd if state allows */ 3453 /* Advance cwnd if state allows */
@@ -3475,8 +3470,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3475 3470
3476 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3471 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3477 tcp_schedule_loss_probe(sk); 3472 tcp_schedule_loss_probe(sk);
3478 if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) 3473 tcp_update_pacing_rate(sk);
3479 tcp_update_pacing_rate(sk);
3480 return 1; 3474 return 1;
3481 3475
3482no_queue: 3476no_queue:
@@ -3505,7 +3499,7 @@ old_ack:
3505 */ 3499 */
3506 if (TCP_SKB_CB(skb)->sacked) { 3500 if (TCP_SKB_CB(skb)->sacked) {
3507 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3501 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3508 &sack_rtt); 3502 &sack_rtt_us);
3509 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3503 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3510 is_dupack, flag); 3504 is_dupack, flag);
3511 } 3505 }
@@ -5401,9 +5395,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5401 break; 5395 break;
5402 } 5396 }
5403 tcp_rearm_rto(sk); 5397 tcp_rearm_rto(sk);
5398 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
5404 return true; 5399 return true;
5405 } 5400 }
5406 tp->syn_data_acked = tp->syn_data; 5401 tp->syn_data_acked = tp->syn_data;
5402 if (tp->syn_data_acked)
5403 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
5407 return false; 5404 return false;
5408} 5405}
5409 5406
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e4eac779f51..6379894ec210 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -435,7 +435,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
435 break; 435 break;
436 436
437 icsk->icsk_backoff--; 437 icsk->icsk_backoff--;
438 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : 438 inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
439 TCP_TIMEOUT_INIT) << icsk->icsk_backoff; 439 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
440 tcp_bound_rto(sk); 440 tcp_bound_rto(sk);
441 441
@@ -854,8 +854,10 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
854{ 854{
855 int res = tcp_v4_send_synack(sk, NULL, req, 0); 855 int res = tcp_v4_send_synack(sk, NULL, req, 0);
856 856
857 if (!res) 857 if (!res) {
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
859 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
860 }
859 return res; 861 return res;
860} 862}
861 863
@@ -878,8 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk,
878 bool want_cookie = false; 880 bool want_cookie = false;
879 struct listen_sock *lopt; 881 struct listen_sock *lopt;
880 882
881
882
883#ifdef CONFIG_SYN_COOKIES 883#ifdef CONFIG_SYN_COOKIES
884 if (sysctl_tcp_syncookies) { 884 if (sysctl_tcp_syncookies) {
885 msg = "Sending cookies"; 885 msg = "Sending cookies";
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 991d62a2f9bb..c9aecae31327 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -315,11 +315,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
315} 315}
316 316
317static struct tcp_congestion_ops tcp_lp __read_mostly = { 317static struct tcp_congestion_ops tcp_lp __read_mostly = {
318 .flags = TCP_CONG_RTT_STAMP,
319 .init = tcp_lp_init, 318 .init = tcp_lp_init,
320 .ssthresh = tcp_reno_ssthresh, 319 .ssthresh = tcp_reno_ssthresh,
321 .cong_avoid = tcp_lp_cong_avoid, 320 .cong_avoid = tcp_lp_cong_avoid,
322 .min_cwnd = tcp_reno_min_cwnd,
323 .pkts_acked = tcp_lp_pkts_acked, 321 .pkts_acked = tcp_lp_pkts_acked,
324 322
325 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7e522c558ba..d4f015ad6c84 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
103} 103}
104 104
105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
106 const char *buffer) 106 char *buffer)
107{ 107{
108 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 108 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
109 unsigned long long val; 109 unsigned long long val;
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = {
219 219
220static int __init tcp_memcontrol_init(void) 220static int __init tcp_memcontrol_init(void)
221{ 221{
222 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); 222 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
223 return 0; 223 return 0;
224} 224}
225__initcall(tcp_memcontrol_init); 225__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d547075d8300..dcaf72f10216 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -33,6 +33,11 @@ struct tcp_fastopen_metrics {
33 struct tcp_fastopen_cookie cookie; 33 struct tcp_fastopen_cookie cookie;
34}; 34};
35 35
36/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
37 * Kernel only stores RTT and RTTVAR in usec resolution
38 */
39#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
40
36struct tcp_metrics_block { 41struct tcp_metrics_block {
37 struct tcp_metrics_block __rcu *tcpm_next; 42 struct tcp_metrics_block __rcu *tcpm_next;
38 struct inetpeer_addr tcpm_saddr; 43 struct inetpeer_addr tcpm_saddr;
@@ -41,7 +46,7 @@ struct tcp_metrics_block {
41 u32 tcpm_ts; 46 u32 tcpm_ts;
42 u32 tcpm_ts_stamp; 47 u32 tcpm_ts_stamp;
43 u32 tcpm_lock; 48 u32 tcpm_lock;
44 u32 tcpm_vals[TCP_METRIC_MAX + 1]; 49 u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
45 struct tcp_fastopen_metrics tcpm_fastopen; 50 struct tcp_fastopen_metrics tcpm_fastopen;
46 51
47 struct rcu_head rcu_head; 52 struct rcu_head rcu_head;
@@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm,
59 return tm->tcpm_vals[idx]; 64 return tm->tcpm_vals[idx];
60} 65}
61 66
62static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
63 enum tcp_metric_index idx)
64{
65 return msecs_to_jiffies(tm->tcpm_vals[idx]);
66}
67
68static void tcp_metric_set(struct tcp_metrics_block *tm, 67static void tcp_metric_set(struct tcp_metrics_block *tm,
69 enum tcp_metric_index idx, 68 enum tcp_metric_index idx,
70 u32 val) 69 u32 val)
@@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
72 tm->tcpm_vals[idx] = val; 71 tm->tcpm_vals[idx] = val;
73} 72}
74 73
75static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
76 enum tcp_metric_index idx,
77 u32 val)
78{
79 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
80}
81
82static bool addr_same(const struct inetpeer_addr *a, 74static bool addr_same(const struct inetpeer_addr *a,
83 const struct inetpeer_addr *b) 75 const struct inetpeer_addr *b)
84{ 76{
@@ -101,9 +93,11 @@ struct tcpm_hash_bucket {
101 93
102static DEFINE_SPINLOCK(tcp_metrics_lock); 94static DEFINE_SPINLOCK(tcp_metrics_lock);
103 95
104static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, 96static void tcpm_suck_dst(struct tcp_metrics_block *tm,
97 const struct dst_entry *dst,
105 bool fastopen_clear) 98 bool fastopen_clear)
106{ 99{
100 u32 msval;
107 u32 val; 101 u32 val;
108 102
109 tm->tcpm_stamp = jiffies; 103 tm->tcpm_stamp = jiffies;
@@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
121 val |= 1 << TCP_METRIC_REORDERING; 115 val |= 1 << TCP_METRIC_REORDERING;
122 tm->tcpm_lock = val; 116 tm->tcpm_lock = val;
123 117
124 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); 118 msval = dst_metric_raw(dst, RTAX_RTT);
125 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); 119 tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
120
121 msval = dst_metric_raw(dst, RTAX_RTTVAR);
122 tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
126 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); 123 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
127 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); 124 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
128 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); 125 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
@@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk)
384 dst_confirm(dst); 381 dst_confirm(dst);
385 382
386 rcu_read_lock(); 383 rcu_read_lock();
387 if (icsk->icsk_backoff || !tp->srtt) { 384 if (icsk->icsk_backoff || !tp->srtt_us) {
388 /* This session failed to estimate rtt. Why? 385 /* This session failed to estimate rtt. Why?
389 * Probably, no packets returned in time. Reset our 386 * Probably, no packets returned in time. Reset our
390 * results. 387 * results.
@@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk)
399 if (!tm) 396 if (!tm)
400 goto out_unlock; 397 goto out_unlock;
401 398
402 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); 399 rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
403 m = rtt - tp->srtt; 400 m = rtt - tp->srtt_us;
404 401
405 /* If newly calculated rtt larger than stored one, store new 402 /* If newly calculated rtt larger than stored one, store new
406 * one. Otherwise, use EWMA. Remember, rtt overestimation is 403 * one. Otherwise, use EWMA. Remember, rtt overestimation is
@@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk)
408 */ 405 */
409 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { 406 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
410 if (m <= 0) 407 if (m <= 0)
411 rtt = tp->srtt; 408 rtt = tp->srtt_us;
412 else 409 else
413 rtt -= (m >> 3); 410 rtt -= (m >> 3);
414 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); 411 tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
415 } 412 }
416 413
417 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { 414 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
@@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk)
422 419
423 /* Scale deviation to rttvar fixed point */ 420 /* Scale deviation to rttvar fixed point */
424 m >>= 1; 421 m >>= 1;
425 if (m < tp->mdev) 422 if (m < tp->mdev_us)
426 m = tp->mdev; 423 m = tp->mdev_us;
427 424
428 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); 425 var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
429 if (m >= var) 426 if (m >= var)
430 var = m; 427 var = m;
431 else 428 else
432 var -= (var - m) >> 2; 429 var -= (var - m) >> 2;
433 430
434 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); 431 tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
435 } 432 }
436 433
437 if (tcp_in_initial_slowstart(tp)) { 434 if (tcp_in_initial_slowstart(tp)) {
@@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk)
528 tp->reordering = val; 525 tp->reordering = val;
529 } 526 }
530 527
531 crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); 528 crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
532 rcu_read_unlock(); 529 rcu_read_unlock();
533reset: 530reset:
534 /* The initial RTT measurement from the SYN/SYN-ACK is not ideal 531 /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
@@ -551,18 +548,20 @@ reset:
551 * to low value, and then abruptly stops to do it and starts to delay 548 * to low value, and then abruptly stops to do it and starts to delay
552 * ACKs, wait for troubles. 549 * ACKs, wait for troubles.
553 */ 550 */
554 if (crtt > tp->srtt) { 551 if (crtt > tp->srtt_us) {
555 /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ 552 /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
556 crtt >>= 3; 553 crtt /= 8 * USEC_PER_MSEC;
557 inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); 554 inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
558 } else if (tp->srtt == 0) { 555 } else if (tp->srtt_us == 0) {
559 /* RFC6298: 5.7 We've failed to get a valid RTT sample from 556 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
560 * 3WHS. This is most likely due to retransmission, 557 * 3WHS. This is most likely due to retransmission,
561 * including spurious one. Reset the RTO back to 3secs 558 * including spurious one. Reset the RTO back to 3secs
562 * from the more aggressive 1sec to avoid more spurious 559 * from the more aggressive 1sec to avoid more spurious
563 * retransmission. 560 * retransmission.
564 */ 561 */
565 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; 562 tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
563 tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
564
566 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; 565 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
567 } 566 }
568 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 567 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
@@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
809 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); 808 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
810 if (!nest) 809 if (!nest)
811 goto nla_put_failure; 810 goto nla_put_failure;
812 for (i = 0; i < TCP_METRIC_MAX + 1; i++) { 811 for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
813 if (!tm->tcpm_vals[i]) 812 u32 val = tm->tcpm_vals[i];
813
814 if (!val)
814 continue; 815 continue;
815 if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) 816 if (i == TCP_METRIC_RTT) {
817 if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
818 val) < 0)
819 goto nla_put_failure;
820 n++;
821 val = max(val / 1000, 1U);
822 }
823 if (i == TCP_METRIC_RTTVAR) {
824 if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
825 val) < 0)
826 goto nla_put_failure;
827 n++;
828 val = max(val / 1000, 1U);
829 }
830 if (nla_put_u32(msg, i + 1, val) < 0)
816 goto nla_put_failure; 831 goto nla_put_failure;
817 n++; 832 n++;
818 } 833 }
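The metrics cache now stores RTT and RTTVAR in usec; the netlink dump emits the new *_US attributes verbatim and keeps the legacy millisecond attributes alive by dividing by 1000, clamped to at least 1 so a live sub-millisecond RTT is not reported as 0 (which old userspace reads as "unset"). The conversion in isolation:

    #include <stdio.h>
    #include <stdint.h>

    /* Legacy ms attribute derived from the stored usec value:
     * divide by 1000, clamp to >= 1. */
    static uint32_t legacy_ms(uint32_t usec_val)
    {
        uint32_t ms = usec_val / 1000;
        return ms ? ms : 1;
    }

    int main(void)
    {
        printf("%u\n", legacy_ms(250000));  /* 250 ms */
        printf("%u\n", legacy_ms(400));     /* clamped to 1 ms */
        return 0;
    }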
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7a436c517e44..ca788ada5bd3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -398,8 +398,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
398 398
399 tcp_init_wl(newtp, treq->rcv_isn); 399 tcp_init_wl(newtp, treq->rcv_isn);
400 400
401 newtp->srtt = 0; 401 newtp->srtt_us = 0;
402 newtp->mdev = TCP_TIMEOUT_INIT; 402 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
403 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 403 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
404 404
405 newtp->packets_out = 0; 405 newtp->packets_out = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 17a11e65e57f..699fb102e971 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
86 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 86 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
87 tcp_rearm_rto(sk); 87 tcp_rearm_rto(sk);
88 } 88 }
89
90 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
91 tcp_skb_pcount(skb));
89} 92}
90 93
91/* SND.NXT, if window was not shrunk. 94/* SND.NXT, if window was not shrunk.
@@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);
269static u16 tcp_select_window(struct sock *sk) 272static u16 tcp_select_window(struct sock *sk)
270{ 273{
271 struct tcp_sock *tp = tcp_sk(sk); 274 struct tcp_sock *tp = tcp_sk(sk);
275 u32 old_win = tp->rcv_wnd;
272 u32 cur_win = tcp_receive_window(tp); 276 u32 cur_win = tcp_receive_window(tp);
273 u32 new_win = __tcp_select_window(sk); 277 u32 new_win = __tcp_select_window(sk);
274 278
@@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)
281 * 285 *
282 * Relax Will Robinson. 286 * Relax Will Robinson.
283 */ 287 */
288 if (new_win == 0)
289 NET_INC_STATS(sock_net(sk),
290 LINUX_MIB_TCPWANTZEROWINDOWADV);
284 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); 291 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
285 } 292 }
286 tp->rcv_wnd = new_win; 293 tp->rcv_wnd = new_win;
@@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk)
298 new_win >>= tp->rx_opt.rcv_wscale; 305 new_win >>= tp->rx_opt.rcv_wscale;
299 306
300 /* If we advertise zero window, disable fast path. */ 307 /* If we advertise zero window, disable fast path. */
301 if (new_win == 0) 308 if (new_win == 0) {
302 tp->pred_flags = 0; 309 tp->pred_flags = 0;
310 if (old_win)
311 NET_INC_STATS(sock_net(sk),
312 LINUX_MIB_TCPTOZEROWINDOWADV);
313 } else if (old_win == 0) {
314 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
315 }
303 316
304 return new_win; 317 return new_win;
305} 318}
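The three counters added to tcp_select_window distinguish wanting to announce a zero window but being held back by SWS avoidance (WANT), actually shrinking an open window to zero (TO), and reopening from zero (FROM). A rough stand-alone model of that accounting, with simplified windows (this is my reading of the hunk, not kernel code):

    #include <stdio.h>

    enum { WANT_ZERO, TO_ZERO, FROM_ZERO, NSTATS };

    /* old_win: previously advertised window; wanted: what the window
     * computation asked for; sent: what actually goes on the wire. */
    static void count_zwin(unsigned old_win, unsigned wanted, unsigned sent,
                           unsigned long stats[NSTATS])
    {
        if (wanted == 0 && sent != 0)
            stats[WANT_ZERO]++;   /* blocked by SWS avoidance */
        if (sent == 0 && old_win != 0)
            stats[TO_ZERO]++;     /* shrank an open window to zero */
        else if (sent != 0 && old_win == 0)
            stats[FROM_ZERO]++;   /* reopened from zero */
    }

    int main(void)
    {
        unsigned long stats[NSTATS] = { 0 };

        count_zwin(4096, 0, 4096, stats);  /* wanted zero, kept 4096 */
        count_zwin(4096, 0, 0, stats);     /* really went to zero */
        count_zwin(0, 8192, 8192, stats);  /* reopened */
        printf("want=%lu to=%lu from=%lu\n",
               stats[WANT_ZERO], stats[TO_ZERO], stats[FROM_ZERO]);
        return 0;
    }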
@@ -867,11 +880,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
867 if (clone_it) { 880 if (clone_it) {
868 const struct sk_buff *fclone = skb + 1; 881 const struct sk_buff *fclone = skb + 1;
869 882
870 /* If congestion control is doing timestamping, we must 883 skb_mstamp_get(&skb->skb_mstamp);
871 * take such a timestamp before we potentially clone/copy.
872 */
873 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
874 __net_timestamp(skb);
875 884
876 if (unlikely(skb->fclone == SKB_FCLONE_ORIG && 885 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
877 fclone->fclone == SKB_FCLONE_CLONE)) 886 fclone->fclone == SKB_FCLONE_CLONE))
@@ -884,6 +893,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
884 skb = skb_clone(skb, gfp_mask); 893 skb = skb_clone(skb, gfp_mask);
885 if (unlikely(!skb)) 894 if (unlikely(!skb))
886 return -ENOBUFS; 895 return -ENOBUFS;
896 /* Our usage of tstamp should remain private */
897 skb->tstamp.tv64 = 0;
887 } 898 }
888 899
889 inet = inet_sk(sk); 900 inet = inet_sk(sk);
@@ -1426,7 +1437,7 @@ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1426 * With Minshall's modification: all sent small packets are ACKed. 1437 * With Minshall's modification: all sent small packets are ACKed.
1427 */ 1438 */
1428static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, 1439static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1429 unsigned int mss_now, int nonagle) 1440 int nonagle)
1430{ 1441{
1431 return partial && 1442 return partial &&
1432 ((nonagle & TCP_NAGLE_CORK) || 1443 ((nonagle & TCP_NAGLE_CORK) ||
@@ -1458,7 +1469,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
1458 * to include this last segment in this skb. 1469 * to include this last segment in this skb.
1459 * Otherwise, we'll split the skb at last MSS boundary 1470 * Otherwise, we'll split the skb at last MSS boundary
1460 */ 1471 */
1461 if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle)) 1472 if (tcp_nagle_check(partial != 0, tp, nonagle))
1462 return needed - partial; 1473 return needed - partial;
1463 1474
1464 return needed; 1475 return needed;
@@ -1521,7 +1532,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
1521 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) 1532 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1522 return true; 1533 return true;
1523 1534
1524 if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle)) 1535 if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
1525 return true; 1536 return true;
1526 1537
1527 return false; 1538 return false;
@@ -1975,7 +1986,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
1975 struct inet_connection_sock *icsk = inet_csk(sk); 1986 struct inet_connection_sock *icsk = inet_csk(sk);
1976 struct tcp_sock *tp = tcp_sk(sk); 1987 struct tcp_sock *tp = tcp_sk(sk);
1977 u32 timeout, tlp_time_stamp, rto_time_stamp; 1988 u32 timeout, tlp_time_stamp, rto_time_stamp;
1978 u32 rtt = tp->srtt >> 3; 1989 u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
1979 1990
1980 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) 1991 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
1981 return false; 1992 return false;
@@ -1997,7 +2008,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
1997 /* Schedule a loss probe in 2*RTT for SACK capable connections 2008 /* Schedule a loss probe in 2*RTT for SACK capable connections
1998 * in Open state, that are either limited by cwnd or application. 2009 * in Open state, that are either limited by cwnd or application.
1999 */ 2010 */
2000 if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || 2011 if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
2001 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2012 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2002 return false; 2013 return false;
2003 2014
@@ -2082,7 +2093,6 @@ rearm_timer:
2082 if (likely(!err)) 2093 if (likely(!err))
2083 NET_INC_STATS_BH(sock_net(sk), 2094 NET_INC_STATS_BH(sock_net(sk),
2084 LINUX_MIB_TCPLOSSPROBES); 2095 LINUX_MIB_TCPLOSSPROBES);
2085 return;
2086} 2096}
2087 2097
2088/* Push out any pending frames which were held back due to 2098/* Push out any pending frames which were held back due to
@@ -2180,7 +2190,8 @@ u32 __tcp_select_window(struct sock *sk)
2180 */ 2190 */
2181 int mss = icsk->icsk_ack.rcv_mss; 2191 int mss = icsk->icsk_ack.rcv_mss;
2182 int free_space = tcp_space(sk); 2192 int free_space = tcp_space(sk);
2183 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); 2193 int allowed_space = tcp_full_space(sk);
2194 int full_space = min_t(int, tp->window_clamp, allowed_space);
2184 int window; 2195 int window;
2185 2196
2186 if (mss > full_space) 2197 if (mss > full_space)
@@ -2193,7 +2204,19 @@ u32 __tcp_select_window(struct sock *sk)
2193 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 2204 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2194 4U * tp->advmss); 2205 4U * tp->advmss);
2195 2206
2196 if (free_space < mss) 2207 /* free_space might become our new window, make sure we don't
2208 * increase it due to wscale.
2209 */
2210 free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2211
2212 /* if free space is less than mss estimate, or is below 1/16th
2213 * of the maximum allowed, try to move to zero-window, else
2214 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
2215 * new incoming data is dropped due to memory limits.
2216 * With large window, mss test triggers way too late in order
2217 * to announce zero window in time before rmem limit kicks in.
2218 */
2219 if (free_space < (allowed_space >> 4) || free_space < mss)
2197 return 0; 2220 return 0;
2198 } 2221 }
2199 2222
@@ -2431,7 +2454,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2431 if (err == 0) { 2454 if (err == 0) {
2432 /* Update global TCP statistics. */ 2455 /* Update global TCP statistics. */
2433 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); 2456 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
2434 2457 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2458 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2435 tp->total_retrans++; 2459 tp->total_retrans++;
2436 2460
2437#if FASTRETRANS_DEBUG > 0 2461#if FASTRETRANS_DEBUG > 0
@@ -2717,7 +2741,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2717 int tcp_header_size; 2741 int tcp_header_size;
2718 int mss; 2742 int mss;
2719 2743
2720 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2744 skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
2721 if (unlikely(!skb)) { 2745 if (unlikely(!skb)) {
2722 dst_release(dst); 2746 dst_release(dst);
2723 return NULL; 2747 return NULL;
@@ -2787,7 +2811,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2787 th->window = htons(min(req->rcv_wnd, 65535U)); 2811 th->window = htons(min(req->rcv_wnd, 65535U));
2788 tcp_options_write((__be32 *)(th + 1), tp, &opts); 2812 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2789 th->doff = (tcp_header_size >> 2); 2813 th->doff = (tcp_header_size >> 2);
2790 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); 2814 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
2791 2815
2792#ifdef CONFIG_TCP_MD5SIG 2816#ifdef CONFIG_TCP_MD5SIG
2793 /* Okay, we have all we need - do the md5 hash if needed */ 2817 /* Okay, we have all we need - do the md5 hash if needed */
@@ -2959,9 +2983,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
2959 tcp_connect_queue_skb(sk, data); 2983 tcp_connect_queue_skb(sk, data);
2960 fo->copied = data->len; 2984 fo->copied = data->len;
2961 2985
2986 /* syn_data is about to be sent, we need to take current time stamps
2987 * for the packets that are in write queue : SYN packet and DATA
2988 */
2989 skb_mstamp_get(&syn->skb_mstamp);
2990 data->skb_mstamp = syn->skb_mstamp;
2991
2962 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { 2992 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
2963 tp->syn_data = (fo->copied > 0); 2993 tp->syn_data = (fo->copied > 0);
2964 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); 2994 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
2965 goto done; 2995 goto done;
2966 } 2996 }
2967 syn_data = NULL; 2997 syn_data = NULL;
@@ -3049,8 +3079,9 @@ void tcp_send_delayed_ack(struct sock *sk)
3049 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements 3079 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
3050 * directly. 3080 * directly.
3051 */ 3081 */
3052 if (tp->srtt) { 3082 if (tp->srtt_us) {
3053 int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); 3083 int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3084 TCP_DELACK_MIN);
3054 3085
3055 if (rtt < max_ato) 3086 if (rtt < max_ato)
3056 max_ato = rtt; 3087 max_ato = rtt;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 1f2d37613c9e..3b66610d4156 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
154 p->snd_wnd = tp->snd_wnd; 154 p->snd_wnd = tp->snd_wnd;
155 p->rcv_wnd = tp->rcv_wnd; 155 p->rcv_wnd = tp->rcv_wnd;
156 p->ssthresh = tcp_current_ssthresh(sk); 156 p->ssthresh = tcp_current_ssthresh(sk);
157 p->srtt = tp->srtt >> 3; 157 p->srtt = tp->srtt_us >> 3;
158 158
159 tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); 159 tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
160 } 160 }
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 19ea6c2951f3..0ac50836da4d 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -39,7 +39,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
39static struct tcp_congestion_ops tcp_scalable __read_mostly = { 39static struct tcp_congestion_ops tcp_scalable __read_mostly = {
40 .ssthresh = tcp_scalable_ssthresh, 40 .ssthresh = tcp_scalable_ssthresh,
41 .cong_avoid = tcp_scalable_cong_avoid, 41 .cong_avoid = tcp_scalable_cong_avoid,
42 .min_cwnd = tcp_reno_min_cwnd,
43 42
44 .owner = THIS_MODULE, 43 .owner = THIS_MODULE,
45 .name = "scalable", 44 .name = "scalable",
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 64f0354c84c7..286227abed10 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -165,6 +165,9 @@ static int tcp_write_timeout(struct sock *sk)
165 dst_negative_advice(sk); 165 dst_negative_advice(sk);
166 if (tp->syn_fastopen || tp->syn_data) 166 if (tp->syn_fastopen || tp->syn_data)
167 tcp_fastopen_cache_set(sk, 0, NULL, true); 167 tcp_fastopen_cache_set(sk, 0, NULL, true);
168 if (tp->syn_data)
169 NET_INC_STATS_BH(sock_net(sk),
170 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
168 } 171 }
169 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 172 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
170 syn_set = true; 173 syn_set = true;
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 06cae62bf208..48539fff6357 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -306,11 +306,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
306EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 306EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
307 307
308static struct tcp_congestion_ops tcp_vegas __read_mostly = { 308static struct tcp_congestion_ops tcp_vegas __read_mostly = {
309 .flags = TCP_CONG_RTT_STAMP,
310 .init = tcp_vegas_init, 309 .init = tcp_vegas_init,
311 .ssthresh = tcp_reno_ssthresh, 310 .ssthresh = tcp_reno_ssthresh,
312 .cong_avoid = tcp_vegas_cong_avoid, 311 .cong_avoid = tcp_vegas_cong_avoid,
313 .min_cwnd = tcp_reno_min_cwnd,
314 .pkts_acked = tcp_vegas_pkts_acked, 312 .pkts_acked = tcp_vegas_pkts_acked,
315 .set_state = tcp_vegas_state, 313 .set_state = tcp_vegas_state,
316 .cwnd_event = tcp_vegas_cwnd_event, 314 .cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 326475a94865..1b8e28fcd7e1 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -203,7 +203,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
203} 203}
204 204
205static struct tcp_congestion_ops tcp_veno __read_mostly = { 205static struct tcp_congestion_ops tcp_veno __read_mostly = {
206 .flags = TCP_CONG_RTT_STAMP,
207 .init = tcp_veno_init, 206 .init = tcp_veno_init,
208 .ssthresh = tcp_veno_ssthresh, 207 .ssthresh = tcp_veno_ssthresh,
209 .cong_avoid = tcp_veno_cong_avoid, 208 .cong_avoid = tcp_veno_cong_avoid,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 76a1e23259e1..b94a04ae2ed5 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
276 .init = tcp_westwood_init, 276 .init = tcp_westwood_init,
277 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
278 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
279 .min_cwnd = tcp_westwood_bw_rttmin,
280 .cwnd_event = tcp_westwood_event, 279 .cwnd_event = tcp_westwood_event,
281 .get_info = tcp_westwood_info, 280 .get_info = tcp_westwood_info,
282 .pkts_acked = tcp_westwood_pkts_acked, 281 .pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 1a8d271f994d..5ede0e727945 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -227,11 +227,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
227} 227}
228 228
229static struct tcp_congestion_ops tcp_yeah __read_mostly = { 229static struct tcp_congestion_ops tcp_yeah __read_mostly = {
230 .flags = TCP_CONG_RTT_STAMP,
231 .init = tcp_yeah_init, 230 .init = tcp_yeah_init,
232 .ssthresh = tcp_yeah_ssthresh, 231 .ssthresh = tcp_yeah_ssthresh,
233 .cong_avoid = tcp_yeah_cong_avoid, 232 .cong_avoid = tcp_yeah_cong_avoid,
234 .min_cwnd = tcp_reno_min_cwnd,
235 .set_state = tcp_vegas_state, 233 .set_state = tcp_vegas_state,
236 .cwnd_event = tcp_vegas_cwnd_event, 234 .cwnd_event = tcp_vegas_cwnd_event,
237 .get_info = tcp_vegas_get_info, 235 .get_info = tcp_vegas_get_info,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 77bd16fa9f34..4468e1adc094 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -931,7 +931,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
931 sock_tx_timestamp(sk, &ipc.tx_flags); 931 sock_tx_timestamp(sk, &ipc.tx_flags);
932 932
933 if (msg->msg_controllen) { 933 if (msg->msg_controllen) {
934 err = ip_cmsg_send(sock_net(sk), msg, &ipc); 934 err = ip_cmsg_send(sock_net(sk), msg, &ipc,
935 sk->sk_family == AF_INET6);
935 if (err) 936 if (err)
936 return err; 937 return err;
937 if (ipc.opt) 938 if (ipc.opt)
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 1f12c8b45864..aac6197b7a71 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -37,15 +37,6 @@ drop:
 	return NET_RX_DROP;
 }
 
-int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
-		    int encap_type)
-{
-	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
-	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
-	return xfrm_input(skb, nexthdr, spi, encap_type);
-}
-EXPORT_SYMBOL(xfrm4_rcv_encap);
-
 int xfrm4_transport_finish(struct sk_buff *skb, int async)
 {
 	struct iphdr *iph = ip_hdr(skb);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 31b18152528f..05f2b484954f 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,65 +15,6 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-/* Informational hook. The decap is still done here. */
-static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly;
-static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
-
-int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler)
-{
-	struct xfrm_tunnel_notifier __rcu **pprev;
-	struct xfrm_tunnel_notifier *t;
-	int ret = -EEXIST;
-	int priority = handler->priority;
-
-	mutex_lock(&xfrm4_mode_tunnel_input_mutex);
-
-	for (pprev = &rcv_notify_handlers;
-	     (t = rcu_dereference_protected(*pprev,
-	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
-	     pprev = &t->next) {
-		if (t->priority > priority)
-			break;
-		if (t->priority == priority)
-			goto err;
-
-	}
-
-	handler->next = *pprev;
-	rcu_assign_pointer(*pprev, handler);
-
-	ret = 0;
-
-err:
-	mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
-
-int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler)
-{
-	struct xfrm_tunnel_notifier __rcu **pprev;
-	struct xfrm_tunnel_notifier *t;
-	int ret = -ENOENT;
-
-	mutex_lock(&xfrm4_mode_tunnel_input_mutex);
-	for (pprev = &rcv_notify_handlers;
-	     (t = rcu_dereference_protected(*pprev,
-	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
-	     pprev = &t->next) {
-		if (t == handler) {
-			*pprev = handler->next;
-			ret = 0;
-			break;
-		}
-	}
-	mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
-	synchronize_net();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
-
 static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
 {
 	struct iphdr *inner_iph = ipip_hdr(skb);
@@ -127,14 +68,8 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 }
 
-#define for_each_input_rcu(head, handler)	\
-	for (handler = rcu_dereference(head);	\
-	     handler != NULL;			\
-	     handler = rcu_dereference(handler->next))
-
 static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct xfrm_tunnel_notifier *handler;
 	int err = -EINVAL;
 
 	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto out;
 
-	for_each_input_rcu(rcv_notify_handlers, handler)
-		handler->handler(skb);
-
 	err = skb_unclone(skb, GFP_ATOMIC);
 	if (err)
 		goto out;
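
The notifier deleted here let a consumer peek at tunnel-mode packets from inside xfrm4_mode_tunnel_input(). Its replacement is the per-protocol callback chain in the new net/ipv4/xfrm4_protocol.c below: the peek becomes a cb_handler. A migration sketch under assumed names (the generic xfrm4_rcv/xfrm_input fallbacks mirror what the reworked ip_vti.c does elsewhere in this diff):

#include <linux/in.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <net/xfrm.h>

static int my_rcv_cb(struct sk_buff *skb, int err)
{
	/* Body of the old xfrm_tunnel_notifier ->handler(skb) goes here.
	 * Returning > 0 lets lower-priority cb_handlers see the skb too;
	 * <= 0 ends the walk in xfrm4_rcv_cb() (see the new file below).
	 */
	return 1;
}

static int my_err_cb(struct sk_buff *skb, u32 info)
{
	return 0;
}

static struct xfrm4_protocol my_esp4_proto __read_mostly = {
	.handler	= xfrm4_rcv,	/* generic IPsec receive */
	.input_handler	= xfrm_input,	/* generic decap entry */
	.cb_handler	= my_rcv_cb,	/* the former informational hook */
	.err_handler	= my_err_cb,
	.priority	= 100,		/* above the base IPsec handlers */
};

/* registration replaces xfrm4_mode_tunnel_input_register(): */
static int __init my_init(void)
{
	return xfrm4_protocol_register(&my_esp4_proto, IPPROTO_ESP);
}
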
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e1a63930a967..6156f68a1e90 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -325,6 +325,7 @@ void __init xfrm4_init(void)
 
 	xfrm4_state_init();
 	xfrm4_policy_init();
+	xfrm4_protocol_init();
 #ifdef CONFIG_SYSCTL
 	register_pernet_subsys(&xfrm4_net_ops);
 #endif
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 000000000000..7f7b243e8139
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,286 @@
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);
+
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+	switch (protocol) {
+	case IPPROTO_ESP:
+		return &esp4_handlers;
+	case IPPROTO_AH:
+		return &ah4_handlers;
+	case IPPROTO_COMP:
+		return &ipcomp4_handlers;
+	}
+
+	return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))
+
+int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(*proto_handlers(protocol), handler)
+		if ((ret = handler->cb_handler(skb, err)) <= 0)
+			return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_cb);
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+		    int encap_type)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	for_each_protocol_rcu(*proto_handlers(nexthdr), handler)
+		if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(esp4_handlers, handler)
+		if ((ret = handler->handler(skb)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(esp4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			break;
+}
+
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(ah4_handlers, handler)
+		if ((ret = handler->handler(skb)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(ah4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			break;
+}
+
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(ipcomp4_handlers, handler)
+		if ((ret = handler->handler(skb)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(ipcomp4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			break;
+}
+
+static const struct net_protocol esp4_protocol = {
+	.handler	=	xfrm4_esp_rcv,
+	.err_handler	=	xfrm4_esp_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+static const struct net_protocol ah4_protocol = {
+	.handler	=	xfrm4_ah_rcv,
+	.err_handler	=	xfrm4_ah_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+static const struct net_protocol ipcomp4_protocol = {
+	.handler	=	xfrm4_ipcomp_rcv,
+	.err_handler	=	xfrm4_ipcomp_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+	.family		=	AF_INET,
+	.owner		=	THIS_MODULE,
+	.callback	=	xfrm4_rcv_cb,
+};
+
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+	switch (protocol) {
+	case IPPROTO_ESP:
+		return &esp4_protocol;
+	case IPPROTO_AH:
+		return &ah4_protocol;
+	case IPPROTO_COMP:
+		return &ipcomp4_protocol;
+	}
+
+	return NULL;
+}
+
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+			    unsigned char protocol)
+{
+	struct xfrm4_protocol __rcu **pprev;
+	struct xfrm4_protocol *t;
+	bool add_netproto = false;
+	int ret = -EEXIST;
+	int priority = handler->priority;
+
+	mutex_lock(&xfrm4_protocol_mutex);
+
+	if (!rcu_dereference_protected(*proto_handlers(protocol),
+				       lockdep_is_held(&xfrm4_protocol_mutex)))
+		add_netproto = true;
+
+	for (pprev = proto_handlers(protocol);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t->priority < priority)
+			break;
+		if (t->priority == priority)
+			goto err;
+	}
+
+	handler->next = *pprev;
+	rcu_assign_pointer(*pprev, handler);
+
+	ret = 0;
+
+err:
+	mutex_unlock(&xfrm4_protocol_mutex);
+
+	if (add_netproto) {
+		if (inet_add_protocol(netproto(protocol), protocol)) {
+			pr_err("%s: can't add protocol\n", __func__);
+			ret = -EAGAIN;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+			      unsigned char protocol)
+{
+	struct xfrm4_protocol __rcu **pprev;
+	struct xfrm4_protocol *t;
+	int ret = -ENOENT;
+
+	mutex_lock(&xfrm4_protocol_mutex);
+
+	for (pprev = proto_handlers(protocol);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t == handler) {
+			*pprev = handler->next;
+			ret = 0;
+			break;
+		}
+	}
+
+	if (!rcu_dereference_protected(*proto_handlers(protocol),
+				       lockdep_is_held(&xfrm4_protocol_mutex))) {
+		if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+			pr_err("%s: can't remove protocol\n", __func__);
+			ret = -EAGAIN;
+		}
+	}
+
+	mutex_unlock(&xfrm4_protocol_mutex);
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+	xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
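
Putting the new multiplexer to use: a sketch assuming the shape of the esp4.c change counted in the diffstat (the callback bodies are stubs, not the real esp4 code). The first registrant for a protocol is the one that triggers inet_add_protocol(); later registrants just chain in, ordered by descending .priority.

#include <linux/in.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/xfrm.h>

/* Stub callbacks; the real esp4 versions do the actual work. */
static int esp4_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;
}

static int esp4_err_cb(struct sk_buff *skb, u32 info)
{
	return 0;
}

static struct xfrm4_protocol esp4_proto = {
	.handler	= xfrm4_rcv,
	.input_handler	= xfrm_input,
	.cb_handler	= esp4_rcv_cb,
	.err_handler	= esp4_err_cb,
	.priority	= 0,		/* base handler, consulted last */
};

static int __init esp4_example_init(void)
{
	if (xfrm4_protocol_register(&esp4_proto, IPPROTO_ESP) < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		return -EAGAIN;
	}
	return 0;
}

A handler opts out of a packet by returning -EINVAL, which passes it to the next handler in the chain; if every handler declines, the multiplexer answers with an ICMP port-unreachable and frees the skb. That priority chaining is what lets the reworked ip_vti.c claim IPsec packets for vti devices ahead of the base handlers, replacing the tunnel-notifier hook removed from xfrm4_mode_tunnel.c above.
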