author	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-08 21:40:54 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-08 21:40:54 -0400
commit	35a9ad8af0bb0fa3525e6d0d20e32551d226f38e (patch)
tree	15b4b33206818886d9cff371fd2163e073b70568 /net/ipv4
parent	d5935b07da53f74726e2a65dd4281d0f2c70e5d4 (diff)
parent	64b1f00a0830e1c53874067273a096b228d83d36 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Most notable changes in here:

  1) By far the biggest accomplishment, thanks to a large range of
     contributors, is the addition of multi-send for transmit.  This is
     the result of discussions back in Chicago, and the hard work of
     several individuals.

     Now, when the ->ndo_start_xmit() method of a driver sees
     skb->xmit_more as true, it can choose to defer the doorbell that
     tells the device to start processing the new TX queue entries.
     skb->xmit_more means that the generic networking layer is
     guaranteed to call the driver immediately with another SKB to send.

     There is logic added to the qdisc layer to dequeue multiple packets
     at a time, and the handling of mis-predicted offloads in software
     is now done with no locks held.  Finally, pktgen is extended to
     have a "burst" parameter that can be used to test a multi-send
     implementation.

     Several drivers have xmit_more support: i40e, igb, ixgbe, mlx4,
     virtio_net.  Adding support is almost trivial, so expect more
     drivers to support this optimization soon.

     I want to thank, in no particular or implied order, Jesper Dangaard
     Brouer, Eric Dumazet, Alexander Duyck, Tom Herbert, Jamal Hadi
     Salim, John Fastabend, Florian Westphal, Daniel Borkmann, David
     Tat, Hannes Frederic Sowa, and Rusty Russell.

  2) PTP and timestamping support in bnx2x, from Michal Kalderon.

  3) Allow adjusting the rx_copybreak threshold for a driver via
     ethtool, and add rx_copybreak support to the enic driver.  From
     Govindarajulu Varadarajan.

  4) Significant enhancements to the generic PHY layer and the bcm7xxx
     driver in particular (EEE support, auto power down, etc.) from
     Florian Fainelli.

  5) Allow raw buffers to be used for flow dissection, allowing drivers
     to determine the optimal "linear pull" size for devices that DMA
     into pools of pages.  The objective is to get exactly the necessary
     amount of headers into the linear SKB area pre-pulled, but no more.
     The new interface drivers use is eth_get_headlen().  From WANG
     Cong, with driver conversions (several had their own by-hand
     duplicated implementations) by Alexander Duyck and Eric Dumazet.

  6) Support checksumming more smoothly and efficiently for
     encapsulations, and add "foo over UDP" facility.  From Tom Herbert.

  7) Add Broadcom SF2 switch driver to DSA layer, from Florian Fainelli.

  8) eBPF now can load programs via a system call and has an extensive
     testsuite.  From Alexei Starovoitov and Daniel Borkmann.

  9) Major overhaul of the packet scheduler to use RCU in several major
     areas such as the classifiers and rate estimators.  From John
     Fastabend.

 10) Add driver for Intel FM10000 Ethernet Switch, from Alexander Duyck.

 11) Rearrange TCP_SKB_CB() to reduce cache line misses, from Eric
     Dumazet.

 12) Add Datacenter TCP (DCTCP) congestion control algorithm support,
     from Florian Westphal.

 13) Reorganize sk_buff so that __copy_skb_header() is significantly
     faster.  From Eric Dumazet"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1558 commits)
  netlabel: directly return netlbl_unlabel_genl_init()
  net: add netdev_txq_bql_{enqueue, complete}_prefetchw() helpers
  net: description of dma_cookie cause make xmldocs warning
  cxgb4: clean up a type issue
  cxgb4: potential shift wrapping bug
  i40e: skb->xmit_more support
  net: fs_enet: Add NAPI TX
  net: fs_enet: Remove non NAPI RX
  r8169:add support for RTL8168EP
  net_sched: copy exts->type in tcf_exts_change()
  wimax: convert printk to pr_foo()
  af_unix: remove 0 assignment on static
  ipv6: Do not warn for informational ICMP messages, regardless of type.
  Update Intel Ethernet Driver maintainers list
  bridge: Save frag_max_size between PRE_ROUTING and POST_ROUTING
  tipc: fix bug in multicast congestion handling
  net: better IFF_XMIT_DST_RELEASE support
  net/mlx4_en: remove NETDEV_TX_BUSY
  3c59x: fix bad split of cpu_to_le32(pci_map_single())
  net: bcmgenet: fix Tx ring priority programming
  ...
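
As a rough illustration of the xmit_more contract described in item 1 above, a driver's ->ndo_start_xmit() can post descriptors for each skb but only ring the hardware doorbell once the stack has nothing further queued. The sketch below is illustrative only and not taken from any driver in this merge: my_fill_tx_desc(), my_ring_doorbell() and struct my_priv are hypothetical placeholders; skb->xmit_more, netdev_get_tx_queue() and netif_xmit_stopped() are the real kernel symbols involved. The converted drivers listed above follow a pattern along these lines.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical driver hooks -- placeholders, not part of this merge. */
struct my_priv;
static void my_fill_tx_desc(struct my_priv *priv, struct sk_buff *skb);
static void my_ring_doorbell(struct my_priv *priv);

static netdev_tx_t my_ndo_start_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);
	struct netdev_queue *txq =
		netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	/* Post the descriptor without touching MMIO yet. */
	my_fill_tx_desc(priv, skb);

	/* Ring the doorbell only when the stack has no further skb
	 * queued for us (xmit_more is false), or the queue is being
	 * stopped anyway.
	 */
	if (!skb->xmit_more || netif_xmit_stopped(txq))
		my_ring_doorbell(priv);

	return NETDEV_TX_OK;
}
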
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/Kconfig	51
-rw-r--r--	net/ipv4/Makefile	3
-rw-r--r--	net/ipv4/af_inet.c	47
-rw-r--r--	net/ipv4/ah4.c	2
-rw-r--r--	net/ipv4/arp.c	6
-rw-r--r--	net/ipv4/cipso_ipv4.c	2
-rw-r--r--	net/ipv4/fib_frontend.c	14
-rw-r--r--	net/ipv4/fib_semantics.c	8
-rw-r--r--	net/ipv4/fou.c	514
-rw-r--r--	net/ipv4/geneve.c	373
-rw-r--r--	net/ipv4/gre_demux.c	9
-rw-r--r--	net/ipv4/gre_offload.c	55
-rw-r--r--	net/ipv4/icmp.c	64
-rw-r--r--	net/ipv4/igmp.c	35
-rw-r--r--	net/ipv4/inet_hashtables.c	2
-rw-r--r--	net/ipv4/inetpeer.c	21
-rw-r--r--	net/ipv4/ip_fragment.c	4
-rw-r--r--	net/ipv4/ip_gre.c	94
-rw-r--r--	net/ipv4/ip_options.c	6
-rw-r--r--	net/ipv4/ip_output.c	10
-rw-r--r--	net/ipv4/ip_sockglue.c	19
-rw-r--r--	net/ipv4/ip_tunnel.c	106
-rw-r--r--	net/ipv4/ip_vti.c	2
-rw-r--r--	net/ipv4/ipconfig.c	3
-rw-r--r--	net/ipv4/ipip.c	82
-rw-r--r--	net/ipv4/netfilter/Kconfig	39
-rw-r--r--	net/ipv4/netfilter/Makefile	5
-rw-r--r--	net/ipv4/netfilter/ipt_CLUSTERIP.c	2
-rw-r--r--	net/ipv4/netfilter/ipt_MASQUERADE.c	108
-rw-r--r--	net/ipv4/netfilter/ipt_REJECT.c	2
-rw-r--r--	net/ipv4/netfilter/iptable_nat.c	233
-rw-r--r--	net/ipv4/netfilter/nf_defrag_ipv4.c	2
-rw-r--r--	net/ipv4/netfilter/nf_nat_l3proto_ipv4.c	199
-rw-r--r--	net/ipv4/netfilter/nf_nat_masquerade_ipv4.c	153
-rw-r--r--	net/ipv4/netfilter/nf_reject_ipv4.c	127
-rw-r--r--	net/ipv4/netfilter/nft_chain_nat_ipv4.c	157
-rw-r--r--	net/ipv4/netfilter/nft_masq_ipv4.c	77
-rw-r--r--	net/ipv4/netfilter/nft_reject_ipv4.c	1
-rw-r--r--	net/ipv4/ping.c	2
-rw-r--r--	net/ipv4/protocol.c	1
-rw-r--r--	net/ipv4/route.c	14
-rw-r--r--	net/ipv4/syncookies.c	2
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c	40
-rw-r--r--	net/ipv4/tcp.c	36
-rw-r--r--	net/ipv4/tcp_bic.c	11
-rw-r--r--	net/ipv4/tcp_cong.c	55
-rw-r--r--	net/ipv4/tcp_cubic.c	18
-rw-r--r--	net/ipv4/tcp_dctcp.c	344
-rw-r--r--	net/ipv4/tcp_diag.c	5
-rw-r--r--	net/ipv4/tcp_fastopen.c	2
-rw-r--r--	net/ipv4/tcp_highspeed.c	145
-rw-r--r--	net/ipv4/tcp_htcp.c	6
-rw-r--r--	net/ipv4/tcp_hybla.c	1
-rw-r--r--	net/ipv4/tcp_illinois.c	3
-rw-r--r--	net/ipv4/tcp_input.c	285
-rw-r--r--	net/ipv4/tcp_ipv4.c	68
-rw-r--r--	net/ipv4/tcp_minisocks.c	13
-rw-r--r--	net/ipv4/tcp_offload.c	72
-rw-r--r--	net/ipv4/tcp_output.c	124
-rw-r--r--	net/ipv4/tcp_probe.c	6
-rw-r--r--	net/ipv4/tcp_scalable.c	2
-rw-r--r--	net/ipv4/tcp_timer.c	52
-rw-r--r--	net/ipv4/tcp_vegas.c	3
-rw-r--r--	net/ipv4/tcp_veno.c	1
-rw-r--r--	net/ipv4/tcp_westwood.c	35
-rw-r--r--	net/ipv4/tcp_yeah.c	9
-rw-r--r--	net/ipv4/udp.c	13
-rw-r--r--	net/ipv4/udp_offload.c	171
-rw-r--r--	net/ipv4/udp_tunnel.c	138
69 files changed, 3104 insertions, 1210 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index dbc10d84161f..e682b48e0709 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -309,8 +309,33 @@ config NET_IPVTI
309 309
310config NET_UDP_TUNNEL 310config NET_UDP_TUNNEL
311 tristate 311 tristate
312 select NET_IP_TUNNEL
312 default n 313 default n
313 314
315config NET_FOU
316 tristate "IP: Foo (IP protocols) over UDP"
317 select XFRM
318 select NET_UDP_TUNNEL
319 ---help---
320 Foo over UDP allows any IP protocol to be directly encapsulated
321 over UDP, including tunnels (IPIP, GRE, SIT). By encapsulating in UDP,
322 network mechanisms and optimizations for UDP (such as ECMP
323 and RSS) can be leveraged to provide better service.
324
325config GENEVE
326 tristate "Generic Network Virtualization Encapsulation (Geneve)"
327 depends on INET
328 select NET_UDP_TUNNEL
329 ---help---
330 This allows one to create Geneve virtual interfaces that provide
331 Layer 2 Networks over Layer 3 Networks. Geneve is often used
332 to tunnel virtual network infrastructure in virtualized environments.
333 For more information see:
334 http://tools.ietf.org/html/draft-gross-geneve-01
335
336 To compile this driver as a module, choose M here: the module
337 will be called geneve.
338
314config INET_AH 339config INET_AH
315 tristate "IP: AH transformation" 340 tristate "IP: AH transformation"
316 select XFRM_ALGO 341 select XFRM_ALGO
@@ -560,6 +585,27 @@ config TCP_CONG_ILLINOIS
560 For further details see: 585 For further details see:
561 http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html 586 http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
562 587
588config TCP_CONG_DCTCP
589 tristate "DataCenter TCP (DCTCP)"
590 default n
591 ---help---
592 DCTCP leverages Explicit Congestion Notification (ECN) in the network to
593 provide multi-bit feedback to the end hosts. It is designed to provide:
594
595 - High burst tolerance (incast due to partition/aggregate),
596 - Low latency (short flows, queries),
597 - High throughput (continuous data updates, large file transfers) with
598 commodity, shallow-buffered switches.
599
600 All switches in the data center network running DCTCP must support
601 ECN marking and be configured for marking when reaching defined switch
602 buffer thresholds. The default ECN marking threshold heuristic for
603 DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
604 (~100KB) at 10Gbps, but might need further careful tweaking.
605
606 For further details see:
607 http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
608
563choice 609choice
564 prompt "Default TCP congestion control" 610 prompt "Default TCP congestion control"
565 default DEFAULT_CUBIC 611 default DEFAULT_CUBIC
@@ -588,9 +634,11 @@ choice
588 config DEFAULT_WESTWOOD 634 config DEFAULT_WESTWOOD
589 bool "Westwood" if TCP_CONG_WESTWOOD=y 635 bool "Westwood" if TCP_CONG_WESTWOOD=y
590 636
637 config DEFAULT_DCTCP
638 bool "DCTCP" if TCP_CONG_DCTCP=y
639
591 config DEFAULT_RENO 640 config DEFAULT_RENO
592 bool "Reno" 641 bool "Reno"
593
594endchoice 642endchoice
595 643
596endif 644endif
@@ -610,6 +658,7 @@ config DEFAULT_TCP_CONG
610 default "westwood" if DEFAULT_WESTWOOD 658 default "westwood" if DEFAULT_WESTWOOD
611 default "veno" if DEFAULT_VENO 659 default "veno" if DEFAULT_VENO
612 default "reno" if DEFAULT_RENO 660 default "reno" if DEFAULT_RENO
661 default "dctcp" if DEFAULT_DCTCP
613 default "cubic" 662 default "cubic"
614 663
615config TCP_MD5SIG 664config TCP_MD5SIG
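
The new TCP_CONG_DCTCP and DEFAULT_DCTCP options above only control which congestion module is built and which one is the system-wide default; as with any other congestion control module, an application can also opt in per socket. A minimal userspace sketch, assuming the tcp_dctcp module added in this merge is available; the TCP_CONGESTION socket option itself is long-standing and not part of this change:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Create a TCP socket that uses DCTCP regardless of the DEFAULT_TCP_CONG
 * choice above.  Returns the fd, or -1 on failure.
 */
int make_dctcp_socket(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
		       "dctcp", strlen("dctcp")) < 0) {
		close(fd);
		return -1;
	}

	return fd;
}
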
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8ee1cd4053ee..518c04ed666e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
20obj-$(CONFIG_IP_MROUTE) += ipmr.o 20obj-$(CONFIG_IP_MROUTE) += ipmr.o
21obj-$(CONFIG_NET_IPIP) += ipip.o 21obj-$(CONFIG_NET_IPIP) += ipip.o
22gre-y := gre_demux.o 22gre-y := gre_demux.o
23obj-$(CONFIG_NET_FOU) += fou.o
23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o 24obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
24obj-$(CONFIG_NET_IPGRE) += ip_gre.o 25obj-$(CONFIG_NET_IPGRE) += ip_gre.o
25obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o 26obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
@@ -42,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
42obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o 43obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
43obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 44obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
44obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o 45obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
46obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
45obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 47obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
46obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o 48obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
47obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o 49obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
@@ -54,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
54obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o 56obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
55obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o 57obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
56obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 58obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
59obj-$(CONFIG_GENEVE) += geneve.o
57 60
58obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 61obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
59 xfrm4_output.o xfrm4_protocol.o 62 xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d156b3c5f363..92db7a69f2b9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -418,10 +418,6 @@ int inet_release(struct socket *sock)
418} 418}
419EXPORT_SYMBOL(inet_release); 419EXPORT_SYMBOL(inet_release);
420 420
421/* It is off by default, see below. */
422int sysctl_ip_nonlocal_bind __read_mostly;
423EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
424
425int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 421int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
426{ 422{
427 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; 423 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
@@ -461,7 +457,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
461 * is temporarily down) 457 * is temporarily down)
462 */ 458 */
463 err = -EADDRNOTAVAIL; 459 err = -EADDRNOTAVAIL;
464 if (!sysctl_ip_nonlocal_bind && 460 if (!net->ipv4.sysctl_ip_nonlocal_bind &&
465 !(inet->freebind || inet->transparent) && 461 !(inet->freebind || inet->transparent) &&
466 addr->sin_addr.s_addr != htonl(INADDR_ANY) && 462 addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
467 chk_addr_ret != RTN_LOCAL && 463 chk_addr_ret != RTN_LOCAL &&
@@ -1201,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk)
1201} 1197}
1202EXPORT_SYMBOL(inet_sk_rebuild_header); 1198EXPORT_SYMBOL(inet_sk_rebuild_header);
1203 1199
1204static int inet_gso_send_check(struct sk_buff *skb)
1205{
1206 const struct net_offload *ops;
1207 const struct iphdr *iph;
1208 int proto;
1209 int ihl;
1210 int err = -EINVAL;
1211
1212 if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
1213 goto out;
1214
1215 iph = ip_hdr(skb);
1216 ihl = iph->ihl * 4;
1217 if (ihl < sizeof(*iph))
1218 goto out;
1219
1220 proto = iph->protocol;
1221
1222 /* Warning: after this point, iph might be no longer valid */
1223 if (unlikely(!pskb_may_pull(skb, ihl)))
1224 goto out;
1225 __skb_pull(skb, ihl);
1226
1227 skb_reset_transport_header(skb);
1228 err = -EPROTONOSUPPORT;
1229
1230 ops = rcu_dereference(inet_offloads[proto]);
1231 if (likely(ops && ops->callbacks.gso_send_check))
1232 err = ops->callbacks.gso_send_check(skb);
1233
1234out:
1235 return err;
1236}
1237
1238static struct sk_buff *inet_gso_segment(struct sk_buff *skb, 1200static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1239 netdev_features_t features) 1201 netdev_features_t features)
1240{ 1202{
@@ -1407,6 +1369,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1407 * immediately following this IP hdr. 1369 * immediately following this IP hdr.
1408 */ 1370 */
1409 1371
1372 /* Note : No need to call skb_gro_postpull_rcsum() here,
1373 * as we already checked checksum over ipv4 header was 0
1374 */
1410 skb_gro_pull(skb, sizeof(*iph)); 1375 skb_gro_pull(skb, sizeof(*iph));
1411 skb_set_transport_header(skb, skb_gro_offset(skb)); 1376 skb_set_transport_header(skb, skb_gro_offset(skb));
1412 1377
@@ -1659,7 +1624,6 @@ static int ipv4_proc_init(void);
1659static struct packet_offload ip_packet_offload __read_mostly = { 1624static struct packet_offload ip_packet_offload __read_mostly = {
1660 .type = cpu_to_be16(ETH_P_IP), 1625 .type = cpu_to_be16(ETH_P_IP),
1661 .callbacks = { 1626 .callbacks = {
1662 .gso_send_check = inet_gso_send_check,
1663 .gso_segment = inet_gso_segment, 1627 .gso_segment = inet_gso_segment,
1664 .gro_receive = inet_gro_receive, 1628 .gro_receive = inet_gro_receive,
1665 .gro_complete = inet_gro_complete, 1629 .gro_complete = inet_gro_complete,
@@ -1668,8 +1632,9 @@ static struct packet_offload ip_packet_offload __read_mostly = {
1668 1632
1669static const struct net_offload ipip_offload = { 1633static const struct net_offload ipip_offload = {
1670 .callbacks = { 1634 .callbacks = {
1671 .gso_send_check = inet_gso_send_check,
1672 .gso_segment = inet_gso_segment, 1635 .gso_segment = inet_gso_segment,
1636 .gro_receive = inet_gro_receive,
1637 .gro_complete = inet_gro_complete,
1673 }, 1638 },
1674}; 1639};
1675 1640
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a2afa89513a0..ac9a32ec3ee4 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -505,8 +505,6 @@ static int ah_init_state(struct xfrm_state *x)
505 ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; 505 ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
506 ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; 506 ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
507 507
508 BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
509
510 if (x->props.flags & XFRM_STATE_ALIGN4) 508 if (x->props.flags & XFRM_STATE_ALIGN4)
511 x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + 509 x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
512 ahp->icv_trunc_len); 510 ahp->icv_trunc_len);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 1a9b99e04465..16acb59d665e 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -953,10 +953,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
953{ 953{
954 const struct arphdr *arp; 954 const struct arphdr *arp;
955 955
956 /* do not tweak dropwatch on an ARP we will ignore */
956 if (dev->flags & IFF_NOARP || 957 if (dev->flags & IFF_NOARP ||
957 skb->pkt_type == PACKET_OTHERHOST || 958 skb->pkt_type == PACKET_OTHERHOST ||
958 skb->pkt_type == PACKET_LOOPBACK) 959 skb->pkt_type == PACKET_LOOPBACK)
959 goto freeskb; 960 goto consumeskb;
960 961
961 skb = skb_share_check(skb, GFP_ATOMIC); 962 skb = skb_share_check(skb, GFP_ATOMIC);
962 if (!skb) 963 if (!skb)
@@ -974,6 +975,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
974 975
975 return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); 976 return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
976 977
978consumeskb:
979 consume_skb(skb);
980 return 0;
977freeskb: 981freeskb:
978 kfree_skb(skb); 982 kfree_skb(skb);
979out_of_mem: 983out_of_mem:
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 05b708bbdb0d..4715f25dfe03 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -246,7 +246,7 @@ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
246 * success, negative values on error. 246 * success, negative values on error.
247 * 247 *
248 */ 248 */
249static int cipso_v4_cache_init(void) 249static int __init cipso_v4_cache_init(void)
250{ 250{
251 u32 iter; 251 u32 iter;
252 252
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 255aa9946fe7..23104a3f2924 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -243,7 +243,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
243 u8 tos, int oif, struct net_device *dev, 243 u8 tos, int oif, struct net_device *dev,
244 int rpf, struct in_device *idev, u32 *itag) 244 int rpf, struct in_device *idev, u32 *itag)
245{ 245{
246 int ret, no_addr, accept_local; 246 int ret, no_addr;
247 struct fib_result res; 247 struct fib_result res;
248 struct flowi4 fl4; 248 struct flowi4 fl4;
249 struct net *net; 249 struct net *net;
@@ -258,16 +258,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
258 258
259 no_addr = idev->ifa_list == NULL; 259 no_addr = idev->ifa_list == NULL;
260 260
261 accept_local = IN_DEV_ACCEPT_LOCAL(idev);
262 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; 261 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
263 262
264 net = dev_net(dev); 263 net = dev_net(dev);
265 if (fib_lookup(net, &fl4, &res)) 264 if (fib_lookup(net, &fl4, &res))
266 goto last_resort; 265 goto last_resort;
267 if (res.type != RTN_UNICAST) { 266 if (res.type != RTN_UNICAST &&
268 if (res.type != RTN_LOCAL || !accept_local) 267 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
269 goto e_inval; 268 goto e_inval;
270 } 269 if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
270 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
271 goto last_resort;
271 fib_combine_itag(itag, &res); 272 fib_combine_itag(itag, &res);
272 dev_match = false; 273 dev_match = false;
273 274
@@ -321,6 +322,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
321 int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); 322 int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
322 323
323 if (!r && !fib_num_tclassid_users(dev_net(dev)) && 324 if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
325 IN_DEV_ACCEPT_LOCAL(idev) &&
324 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { 326 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
325 *itag = 0; 327 *itag = 0;
326 return 0; 328 return 0;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b10cd43a4722..5b6efb3d2308 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -157,9 +157,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
157 157
158static void free_nh_exceptions(struct fib_nh *nh) 158static void free_nh_exceptions(struct fib_nh *nh)
159{ 159{
160 struct fnhe_hash_bucket *hash = nh->nh_exceptions; 160 struct fnhe_hash_bucket *hash;
161 int i; 161 int i;
162 162
163 hash = rcu_dereference_protected(nh->nh_exceptions, 1);
164 if (!hash)
165 return;
163 for (i = 0; i < FNHE_HASH_SIZE; i++) { 166 for (i = 0; i < FNHE_HASH_SIZE; i++) {
164 struct fib_nh_exception *fnhe; 167 struct fib_nh_exception *fnhe;
165 168
@@ -205,8 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
205 change_nexthops(fi) { 208 change_nexthops(fi) {
206 if (nexthop_nh->nh_dev) 209 if (nexthop_nh->nh_dev)
207 dev_put(nexthop_nh->nh_dev); 210 dev_put(nexthop_nh->nh_dev);
208 if (nexthop_nh->nh_exceptions) 211 free_nh_exceptions(nexthop_nh);
209 free_nh_exceptions(nexthop_nh);
210 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); 212 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
211 rt_fibinfo_free(&nexthop_nh->nh_rth_input); 213 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
212 } endfor_nexthops(fi); 214 } endfor_nexthops(fi);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
new file mode 100644
index 000000000000..efa70ad44906
--- /dev/null
+++ b/net/ipv4/fou.c
@@ -0,0 +1,514 @@
1#include <linux/module.h>
2#include <linux/errno.h>
3#include <linux/socket.h>
4#include <linux/skbuff.h>
5#include <linux/ip.h>
6#include <linux/udp.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <net/genetlink.h>
10#include <net/gue.h>
11#include <net/ip.h>
12#include <net/protocol.h>
13#include <net/udp.h>
14#include <net/udp_tunnel.h>
15#include <net/xfrm.h>
16#include <uapi/linux/fou.h>
17#include <uapi/linux/genetlink.h>
18
19static DEFINE_SPINLOCK(fou_lock);
20static LIST_HEAD(fou_list);
21
22struct fou {
23 struct socket *sock;
24 u8 protocol;
25 u16 port;
26 struct udp_offload udp_offloads;
27 struct list_head list;
28};
29
30struct fou_cfg {
31 u16 type;
32 u8 protocol;
33 struct udp_port_cfg udp_config;
34};
35
36static inline struct fou *fou_from_sock(struct sock *sk)
37{
38 return sk->sk_user_data;
39}
40
41static int fou_udp_encap_recv_deliver(struct sk_buff *skb,
42 u8 protocol, size_t len)
43{
44 struct iphdr *iph = ip_hdr(skb);
45
46 /* Remove 'len' bytes from the packet (UDP header and
47 * FOU header if present), modify the protocol to the one
48 * we found, and then call rcv_encap.
49 */
50 iph->tot_len = htons(ntohs(iph->tot_len) - len);
51 __skb_pull(skb, len);
52 skb_postpull_rcsum(skb, udp_hdr(skb), len);
53 skb_reset_transport_header(skb);
54
55 return -protocol;
56}
57
58static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
59{
60 struct fou *fou = fou_from_sock(sk);
61
62 if (!fou)
63 return 1;
64
65 return fou_udp_encap_recv_deliver(skb, fou->protocol,
66 sizeof(struct udphdr));
67}
68
69static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
70{
71 struct fou *fou = fou_from_sock(sk);
72 size_t len;
73 struct guehdr *guehdr;
74 struct udphdr *uh;
75
76 if (!fou)
77 return 1;
78
79 len = sizeof(struct udphdr) + sizeof(struct guehdr);
80 if (!pskb_may_pull(skb, len))
81 goto drop;
82
83 uh = udp_hdr(skb);
84 guehdr = (struct guehdr *)&uh[1];
85
86 len += guehdr->hlen << 2;
87 if (!pskb_may_pull(skb, len))
88 goto drop;
89
90 if (guehdr->version != 0)
91 goto drop;
92
93 if (guehdr->flags) {
94 /* No support yet */
95 goto drop;
96 }
97
98 return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len);
99drop:
100 kfree_skb(skb);
101 return 0;
102}
103
104static struct sk_buff **fou_gro_receive(struct sk_buff **head,
105 struct sk_buff *skb)
106{
107 const struct net_offload *ops;
108 struct sk_buff **pp = NULL;
109 u8 proto = NAPI_GRO_CB(skb)->proto;
110 const struct net_offload **offloads;
111
112 rcu_read_lock();
113 offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
114 ops = rcu_dereference(offloads[proto]);
115 if (!ops || !ops->callbacks.gro_receive)
116 goto out_unlock;
117
118 pp = ops->callbacks.gro_receive(head, skb);
119
120out_unlock:
121 rcu_read_unlock();
122
123 return pp;
124}
125
126static int fou_gro_complete(struct sk_buff *skb, int nhoff)
127{
128 const struct net_offload *ops;
129 u8 proto = NAPI_GRO_CB(skb)->proto;
130 int err = -ENOSYS;
131 const struct net_offload **offloads;
132
133 rcu_read_lock();
134 offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
135 ops = rcu_dereference(offloads[proto]);
136 if (WARN_ON(!ops || !ops->callbacks.gro_complete))
137 goto out_unlock;
138
139 err = ops->callbacks.gro_complete(skb, nhoff);
140
141out_unlock:
142 rcu_read_unlock();
143
144 return err;
145}
146
147static struct sk_buff **gue_gro_receive(struct sk_buff **head,
148 struct sk_buff *skb)
149{
150 const struct net_offload **offloads;
151 const struct net_offload *ops;
152 struct sk_buff **pp = NULL;
153 struct sk_buff *p;
154 u8 proto;
155 struct guehdr *guehdr;
156 unsigned int hlen, guehlen;
157 unsigned int off;
158 int flush = 1;
159
160 off = skb_gro_offset(skb);
161 hlen = off + sizeof(*guehdr);
162 guehdr = skb_gro_header_fast(skb, off);
163 if (skb_gro_header_hard(skb, hlen)) {
164 guehdr = skb_gro_header_slow(skb, hlen, off);
165 if (unlikely(!guehdr))
166 goto out;
167 }
168
169 proto = guehdr->next_hdr;
170
171 rcu_read_lock();
172 offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
173 ops = rcu_dereference(offloads[proto]);
174 if (WARN_ON(!ops || !ops->callbacks.gro_receive))
175 goto out_unlock;
176
177 guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
178
179 hlen = off + guehlen;
180 if (skb_gro_header_hard(skb, hlen)) {
181 guehdr = skb_gro_header_slow(skb, hlen, off);
182 if (unlikely(!guehdr))
183 goto out_unlock;
184 }
185
186 flush = 0;
187
188 for (p = *head; p; p = p->next) {
189 const struct guehdr *guehdr2;
190
191 if (!NAPI_GRO_CB(p)->same_flow)
192 continue;
193
194 guehdr2 = (struct guehdr *)(p->data + off);
195
196 /* Compare base GUE header to be equal (covers
197 * hlen, version, next_hdr, and flags.
198 */
199 if (guehdr->word != guehdr2->word) {
200 NAPI_GRO_CB(p)->same_flow = 0;
201 continue;
202 }
203
204 /* Compare optional fields are the same. */
205 if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
206 guehdr->hlen << 2)) {
207 NAPI_GRO_CB(p)->same_flow = 0;
208 continue;
209 }
210 }
211
212 skb_gro_pull(skb, guehlen);
213
214 /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
215 skb_gro_postpull_rcsum(skb, guehdr, guehlen);
216
217 pp = ops->callbacks.gro_receive(head, skb);
218
219out_unlock:
220 rcu_read_unlock();
221out:
222 NAPI_GRO_CB(skb)->flush |= flush;
223
224 return pp;
225}
226
227static int gue_gro_complete(struct sk_buff *skb, int nhoff)
228{
229 const struct net_offload **offloads;
230 struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
231 const struct net_offload *ops;
232 unsigned int guehlen;
233 u8 proto;
234 int err = -ENOENT;
235
236 proto = guehdr->next_hdr;
237
238 guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
239
240 rcu_read_lock();
241 offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
242 ops = rcu_dereference(offloads[proto]);
243 if (WARN_ON(!ops || !ops->callbacks.gro_complete))
244 goto out_unlock;
245
246 err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
247
248out_unlock:
249 rcu_read_unlock();
250 return err;
251}
252
253static int fou_add_to_port_list(struct fou *fou)
254{
255 struct fou *fout;
256
257 spin_lock(&fou_lock);
258 list_for_each_entry(fout, &fou_list, list) {
259 if (fou->port == fout->port) {
260 spin_unlock(&fou_lock);
261 return -EALREADY;
262 }
263 }
264
265 list_add(&fou->list, &fou_list);
266 spin_unlock(&fou_lock);
267
268 return 0;
269}
270
271static void fou_release(struct fou *fou)
272{
273 struct socket *sock = fou->sock;
274 struct sock *sk = sock->sk;
275
276 udp_del_offload(&fou->udp_offloads);
277
278 list_del(&fou->list);
279
280 /* Remove hooks into tunnel socket */
281 sk->sk_user_data = NULL;
282
283 sock_release(sock);
284
285 kfree(fou);
286}
287
288static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
289{
290 udp_sk(sk)->encap_rcv = fou_udp_recv;
291 fou->protocol = cfg->protocol;
292 fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
293 fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
294 fou->udp_offloads.port = cfg->udp_config.local_udp_port;
295 fou->udp_offloads.ipproto = cfg->protocol;
296
297 return 0;
298}
299
300static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
301{
302 udp_sk(sk)->encap_rcv = gue_udp_recv;
303 fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
304 fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
305 fou->udp_offloads.port = cfg->udp_config.local_udp_port;
306
307 return 0;
308}
309
310static int fou_create(struct net *net, struct fou_cfg *cfg,
311 struct socket **sockp)
312{
313 struct fou *fou = NULL;
314 int err;
315 struct socket *sock = NULL;
316 struct sock *sk;
317
318 /* Open UDP socket */
319 err = udp_sock_create(net, &cfg->udp_config, &sock);
320 if (err < 0)
321 goto error;
322
323 /* Allocate FOU port structure */
324 fou = kzalloc(sizeof(*fou), GFP_KERNEL);
325 if (!fou) {
326 err = -ENOMEM;
327 goto error;
328 }
329
330 sk = sock->sk;
331
332 fou->port = cfg->udp_config.local_udp_port;
333
334 /* Initialize for fou type */
335 switch (cfg->type) {
336 case FOU_ENCAP_DIRECT:
337 err = fou_encap_init(sk, fou, cfg);
338 if (err)
339 goto error;
340 break;
341 case FOU_ENCAP_GUE:
342 err = gue_encap_init(sk, fou, cfg);
343 if (err)
344 goto error;
345 break;
346 default:
347 err = -EINVAL;
348 goto error;
349 }
350
351 udp_sk(sk)->encap_type = 1;
352 udp_encap_enable();
353
354 sk->sk_user_data = fou;
355 fou->sock = sock;
356
357 udp_set_convert_csum(sk, true);
358
359 sk->sk_allocation = GFP_ATOMIC;
360
361 if (cfg->udp_config.family == AF_INET) {
362 err = udp_add_offload(&fou->udp_offloads);
363 if (err)
364 goto error;
365 }
366
367 err = fou_add_to_port_list(fou);
368 if (err)
369 goto error;
370
371 if (sockp)
372 *sockp = sock;
373
374 return 0;
375
376error:
377 kfree(fou);
378 if (sock)
379 sock_release(sock);
380
381 return err;
382}
383
384static int fou_destroy(struct net *net, struct fou_cfg *cfg)
385{
386 struct fou *fou;
387 u16 port = cfg->udp_config.local_udp_port;
388 int err = -EINVAL;
389
390 spin_lock(&fou_lock);
391 list_for_each_entry(fou, &fou_list, list) {
392 if (fou->port == port) {
393 udp_del_offload(&fou->udp_offloads);
394 fou_release(fou);
395 err = 0;
396 break;
397 }
398 }
399 spin_unlock(&fou_lock);
400
401 return err;
402}
403
404static struct genl_family fou_nl_family = {
405 .id = GENL_ID_GENERATE,
406 .hdrsize = 0,
407 .name = FOU_GENL_NAME,
408 .version = FOU_GENL_VERSION,
409 .maxattr = FOU_ATTR_MAX,
410 .netnsok = true,
411};
412
413static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
414 [FOU_ATTR_PORT] = { .type = NLA_U16, },
415 [FOU_ATTR_AF] = { .type = NLA_U8, },
416 [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
417 [FOU_ATTR_TYPE] = { .type = NLA_U8, },
418};
419
420static int parse_nl_config(struct genl_info *info,
421 struct fou_cfg *cfg)
422{
423 memset(cfg, 0, sizeof(*cfg));
424
425 cfg->udp_config.family = AF_INET;
426
427 if (info->attrs[FOU_ATTR_AF]) {
428 u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
429
430 if (family != AF_INET && family != AF_INET6)
431 return -EINVAL;
432
433 cfg->udp_config.family = family;
434 }
435
436 if (info->attrs[FOU_ATTR_PORT]) {
437 u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]);
438
439 cfg->udp_config.local_udp_port = port;
440 }
441
442 if (info->attrs[FOU_ATTR_IPPROTO])
443 cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
444
445 if (info->attrs[FOU_ATTR_TYPE])
446 cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
447
448 return 0;
449}
450
451static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
452{
453 struct fou_cfg cfg;
454 int err;
455
456 err = parse_nl_config(info, &cfg);
457 if (err)
458 return err;
459
460 return fou_create(&init_net, &cfg, NULL);
461}
462
463static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
464{
465 struct fou_cfg cfg;
466
467 parse_nl_config(info, &cfg);
468
469 return fou_destroy(&init_net, &cfg);
470}
471
472static const struct genl_ops fou_nl_ops[] = {
473 {
474 .cmd = FOU_CMD_ADD,
475 .doit = fou_nl_cmd_add_port,
476 .policy = fou_nl_policy,
477 .flags = GENL_ADMIN_PERM,
478 },
479 {
480 .cmd = FOU_CMD_DEL,
481 .doit = fou_nl_cmd_rm_port,
482 .policy = fou_nl_policy,
483 .flags = GENL_ADMIN_PERM,
484 },
485};
486
487static int __init fou_init(void)
488{
489 int ret;
490
491 ret = genl_register_family_with_ops(&fou_nl_family,
492 fou_nl_ops);
493
494 return ret;
495}
496
497static void __exit fou_fini(void)
498{
499 struct fou *fou, *next;
500
501 genl_unregister_family(&fou_nl_family);
502
503 /* Close all the FOU sockets */
504
505 spin_lock(&fou_lock);
506 list_for_each_entry_safe(fou, next, &fou_list, list)
507 fou_release(fou);
508 spin_unlock(&fou_lock);
509}
510
511module_init(fou_init);
512module_exit(fou_fini);
513MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
514MODULE_LICENSE("GPL");
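
FOU ports are configured from userspace over the generic netlink family registered above (FOU_GENL_NAME, FOU_CMD_ADD/FOU_CMD_DEL and the FOU_ATTR_* attributes all come from the uapi header included by fou.c). A hedged libnl-3 sketch of adding a direct-encapsulation GRE-over-UDP port follows; the port number 5555 is made up for illustration and error handling is omitted for brevity:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/fou.h>

/* Ask the kernel to listen for FOU on UDP port 5555, decapsulating GRE. */
int fou_add_gre_port(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family, err;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, FOU_GENL_NAME);

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    FOU_CMD_ADD, FOU_GENL_VERSION);

	/* The kernel copies this straight into a __be16 local_udp_port,
	 * so pass it in network byte order.
	 */
	nla_put_u16(msg, FOU_ATTR_PORT, htons(5555));
	nla_put_u8(msg, FOU_ATTR_IPPROTO, IPPROTO_GRE);
	nla_put_u8(msg, FOU_ATTR_TYPE, FOU_ENCAP_DIRECT);

	err = nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_socket_free(sk);
	return err < 0 ? err : 0;
}
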
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
new file mode 100644
index 000000000000..065cd94c640c
--- /dev/null
+++ b/net/ipv4/geneve.c
@@ -0,0 +1,373 @@
1/*
2 * Geneve: Generic Network Virtualization Encapsulation
3 *
4 * Copyright (c) 2014 Nicira, Inc.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14#include <linux/kernel.h>
15#include <linux/types.h>
16#include <linux/module.h>
17#include <linux/errno.h>
18#include <linux/slab.h>
19#include <linux/skbuff.h>
20#include <linux/rculist.h>
21#include <linux/netdevice.h>
22#include <linux/in.h>
23#include <linux/ip.h>
24#include <linux/udp.h>
25#include <linux/igmp.h>
26#include <linux/etherdevice.h>
27#include <linux/if_ether.h>
28#include <linux/if_vlan.h>
29#include <linux/hash.h>
30#include <linux/ethtool.h>
31#include <net/arp.h>
32#include <net/ndisc.h>
33#include <net/ip.h>
34#include <net/ip_tunnels.h>
35#include <net/icmp.h>
36#include <net/udp.h>
37#include <net/rtnetlink.h>
38#include <net/route.h>
39#include <net/dsfield.h>
40#include <net/inet_ecn.h>
41#include <net/net_namespace.h>
42#include <net/netns/generic.h>
43#include <net/geneve.h>
44#include <net/protocol.h>
45#include <net/udp_tunnel.h>
46#if IS_ENABLED(CONFIG_IPV6)
47#include <net/ipv6.h>
48#include <net/addrconf.h>
49#include <net/ip6_tunnel.h>
50#include <net/ip6_checksum.h>
51#endif
52
53#define PORT_HASH_BITS 8
54#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
55
56/* per-network namespace private data for this module */
57struct geneve_net {
58 struct hlist_head sock_list[PORT_HASH_SIZE];
59 spinlock_t sock_lock; /* Protects sock_list */
60};
61
62static int geneve_net_id;
63
64static struct workqueue_struct *geneve_wq;
65
66static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
67{
68 return (struct genevehdr *)(udp_hdr(skb) + 1);
69}
70
71static struct hlist_head *gs_head(struct net *net, __be16 port)
72{
73 struct geneve_net *gn = net_generic(net, geneve_net_id);
74
75 return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
76}
77
78/* Find geneve socket based on network namespace and UDP port */
79static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
80{
81 struct geneve_sock *gs;
82
83 hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) {
84 if (inet_sk(gs->sock->sk)->inet_sport == port)
85 return gs;
86 }
87
88 return NULL;
89}
90
91static void geneve_build_header(struct genevehdr *geneveh,
92 __be16 tun_flags, u8 vni[3],
93 u8 options_len, u8 *options)
94{
95 geneveh->ver = GENEVE_VER;
96 geneveh->opt_len = options_len / 4;
97 geneveh->oam = !!(tun_flags & TUNNEL_OAM);
98 geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
99 geneveh->rsvd1 = 0;
100 memcpy(geneveh->vni, vni, 3);
101 geneveh->proto_type = htons(ETH_P_TEB);
102 geneveh->rsvd2 = 0;
103
104 memcpy(geneveh->options, options, options_len);
105}
106
107/* Transmit a fully formatted Geneve frame.
108 *
109 * When calling this function, skb->data should point
110 * to the geneve header which is fully formed.
111 *
112 * This function will add other UDP tunnel headers.
113 */
114int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
115 struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
116 __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
117 __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
118 bool xnet)
119{
120 struct genevehdr *gnvh;
121 int min_headroom;
122 int err;
123
124 skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx);
125
126 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
127 + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
128 + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
129
130 err = skb_cow_head(skb, min_headroom);
131 if (unlikely(err))
132 return err;
133
134 if (vlan_tx_tag_present(skb)) {
135 if (unlikely(!__vlan_put_tag(skb,
136 skb->vlan_proto,
137 vlan_tx_tag_get(skb)))) {
138 err = -ENOMEM;
139 return err;
140 }
141 skb->vlan_tci = 0;
142 }
143
144 gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
145 geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
146
147 return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
148 tos, ttl, df, src_port, dst_port, xnet);
149}
150EXPORT_SYMBOL_GPL(geneve_xmit_skb);
151
152static void geneve_notify_add_rx_port(struct geneve_sock *gs)
153{
154 struct sock *sk = gs->sock->sk;
155 sa_family_t sa_family = sk->sk_family;
156 int err;
157
158 if (sa_family == AF_INET) {
159 err = udp_add_offload(&gs->udp_offloads);
160 if (err)
161 pr_warn("geneve: udp_add_offload failed with status %d\n",
162 err);
163 }
164}
165
166/* Callback from net/ipv4/udp.c to receive packets */
167static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
168{
169 struct genevehdr *geneveh;
170 struct geneve_sock *gs;
171 int opts_len;
172
173 /* Need Geneve and inner Ethernet header to be present */
174 if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
175 goto error;
176
177 /* Return packets with reserved bits set */
178 geneveh = geneve_hdr(skb);
179
180 if (unlikely(geneveh->ver != GENEVE_VER))
181 goto error;
182
183 if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
184 goto error;
185
186 opts_len = geneveh->opt_len * 4;
187 if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
188 htons(ETH_P_TEB)))
189 goto drop;
190
191 gs = rcu_dereference_sk_user_data(sk);
192 if (!gs)
193 goto drop;
194
195 gs->rcv(gs, skb);
196 return 0;
197
198drop:
199 /* Consume bad packet */
200 kfree_skb(skb);
201 return 0;
202
203error:
204 /* Let the UDP layer deal with the skb */
205 return 1;
206}
207
208static void geneve_del_work(struct work_struct *work)
209{
210 struct geneve_sock *gs = container_of(work, struct geneve_sock,
211 del_work);
212
213 udp_tunnel_sock_release(gs->sock);
214 kfree_rcu(gs, rcu);
215}
216
217static struct socket *geneve_create_sock(struct net *net, bool ipv6,
218 __be16 port)
219{
220 struct socket *sock;
221 struct udp_port_cfg udp_conf;
222 int err;
223
224 memset(&udp_conf, 0, sizeof(udp_conf));
225
226 if (ipv6) {
227 udp_conf.family = AF_INET6;
228 } else {
229 udp_conf.family = AF_INET;
230 udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
231 }
232
233 udp_conf.local_udp_port = port;
234
235 /* Open UDP socket */
236 err = udp_sock_create(net, &udp_conf, &sock);
237 if (err < 0)
238 return ERR_PTR(err);
239
240 return sock;
241}
242
243/* Create new listen socket if needed */
244static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
245 geneve_rcv_t *rcv, void *data,
246 bool ipv6)
247{
248 struct geneve_net *gn = net_generic(net, geneve_net_id);
249 struct geneve_sock *gs;
250 struct socket *sock;
251 struct udp_tunnel_sock_cfg tunnel_cfg;
252
253 gs = kzalloc(sizeof(*gs), GFP_KERNEL);
254 if (!gs)
255 return ERR_PTR(-ENOMEM);
256
257 INIT_WORK(&gs->del_work, geneve_del_work);
258
259 sock = geneve_create_sock(net, ipv6, port);
260 if (IS_ERR(sock)) {
261 kfree(gs);
262 return ERR_CAST(sock);
263 }
264
265 gs->sock = sock;
266 atomic_set(&gs->refcnt, 1);
267 gs->rcv = rcv;
268 gs->rcv_data = data;
269
270 /* Initialize the geneve udp offloads structure */
271 gs->udp_offloads.port = port;
272 gs->udp_offloads.callbacks.gro_receive = NULL;
273 gs->udp_offloads.callbacks.gro_complete = NULL;
274
275 spin_lock(&gn->sock_lock);
276 hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
277 geneve_notify_add_rx_port(gs);
278 spin_unlock(&gn->sock_lock);
279
280 /* Mark socket as an encapsulation socket */
281 tunnel_cfg.sk_user_data = gs;
282 tunnel_cfg.encap_type = 1;
283 tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
284 tunnel_cfg.encap_destroy = NULL;
285 setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
286
287 return gs;
288}
289
290struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
291 geneve_rcv_t *rcv, void *data,
292 bool no_share, bool ipv6)
293{
294 struct geneve_sock *gs;
295
296 gs = geneve_socket_create(net, port, rcv, data, ipv6);
297 if (!IS_ERR(gs))
298 return gs;
299
300 if (no_share) /* Return error if sharing is not allowed. */
301 return ERR_PTR(-EINVAL);
302
303 gs = geneve_find_sock(net, port);
304 if (gs) {
305 if (gs->rcv == rcv)
306 atomic_inc(&gs->refcnt);
307 else
308 gs = ERR_PTR(-EBUSY);
309 } else {
310 gs = ERR_PTR(-EINVAL);
311 }
312
313 return gs;
314}
315EXPORT_SYMBOL_GPL(geneve_sock_add);
316
317void geneve_sock_release(struct geneve_sock *gs)
318{
319 if (!atomic_dec_and_test(&gs->refcnt))
320 return;
321
322 queue_work(geneve_wq, &gs->del_work);
323}
324EXPORT_SYMBOL_GPL(geneve_sock_release);
325
326static __net_init int geneve_init_net(struct net *net)
327{
328 struct geneve_net *gn = net_generic(net, geneve_net_id);
329 unsigned int h;
330
331 spin_lock_init(&gn->sock_lock);
332
333 for (h = 0; h < PORT_HASH_SIZE; ++h)
334 INIT_HLIST_HEAD(&gn->sock_list[h]);
335
336 return 0;
337}
338
339static struct pernet_operations geneve_net_ops = {
340 .init = geneve_init_net,
341 .exit = NULL,
342 .id = &geneve_net_id,
343 .size = sizeof(struct geneve_net),
344};
345
346static int __init geneve_init_module(void)
347{
348 int rc;
349
350 geneve_wq = alloc_workqueue("geneve", 0, 0);
351 if (!geneve_wq)
352 return -ENOMEM;
353
354 rc = register_pernet_subsys(&geneve_net_ops);
355 if (rc)
356 return rc;
357
358 pr_info("Geneve driver\n");
359
360 return 0;
361}
362late_initcall(geneve_init_module);
363
364static void __exit geneve_cleanup_module(void)
365{
366 destroy_workqueue(geneve_wq);
367}
368module_exit(geneve_cleanup_module);
369
370MODULE_LICENSE("GPL");
371MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
372MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
373MODULE_ALIAS_RTNL_LINK("geneve");
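
geneve_sock_add() and geneve_sock_release(), exported above, are intended for in-kernel consumers such as a tunnel netdevice or a vswitch vport. A minimal, hypothetical consumer sketch (not from this merge): my_geneve_rcv() simply drops the decapsulated frame where a real consumer would feed it to its own device, and 6081 is the IANA-assigned Geneve port.

#include <linux/err.h>
#include <linux/skbuff.h>
#include <net/geneve.h>

/* Hypothetical receive hook: a real consumer would hand the inner
 * Ethernet frame to its own net_device instead of freeing it.
 */
static void my_geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
{
	kfree_skb(skb);
}

/* Open (or share) a Geneve listening socket in the given namespace. */
static int my_open_geneve(struct net *net)
{
	struct geneve_sock *gs;

	gs = geneve_sock_add(net, htons(6081), my_geneve_rcv, NULL,
			     false /* sharing allowed */, false /* IPv4 */);
	return PTR_ERR_OR_ZERO(gs);
}
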
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 0485bf7f8f03..4a7b5b2a1ce3 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -98,7 +98,6 @@ EXPORT_SYMBOL_GPL(gre_build_header);
98static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, 98static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
99 bool *csum_err) 99 bool *csum_err)
100{ 100{
101 unsigned int ip_hlen = ip_hdrlen(skb);
102 const struct gre_base_hdr *greh; 101 const struct gre_base_hdr *greh;
103 __be32 *options; 102 __be32 *options;
104 int hdr_len; 103 int hdr_len;
@@ -106,7 +105,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
106 if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) 105 if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
107 return -EINVAL; 106 return -EINVAL;
108 107
109 greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); 108 greh = (struct gre_base_hdr *)skb_transport_header(skb);
110 if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) 109 if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
111 return -EINVAL; 110 return -EINVAL;
112 111
@@ -116,7 +115,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
116 if (!pskb_may_pull(skb, hdr_len)) 115 if (!pskb_may_pull(skb, hdr_len))
117 return -EINVAL; 116 return -EINVAL;
118 117
119 greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); 118 greh = (struct gre_base_hdr *)skb_transport_header(skb);
120 tpi->proto = greh->protocol; 119 tpi->proto = greh->protocol;
121 120
122 options = (__be32 *)(greh + 1); 121 options = (__be32 *)(greh + 1);
@@ -125,6 +124,10 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
125 *csum_err = true; 124 *csum_err = true;
126 return -EINVAL; 125 return -EINVAL;
127 } 126 }
127
128 skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
129 null_compute_pseudo);
130
128 options++; 131 options++;
129 } 132 }
130 133
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6556263c8fa5..a77729503071 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,13 +15,6 @@
15#include <net/protocol.h> 15#include <net/protocol.h>
16#include <net/gre.h> 16#include <net/gre.h>
17 17
18static int gre_gso_send_check(struct sk_buff *skb)
19{
20 if (!skb->encapsulation)
21 return -EINVAL;
22 return 0;
23}
24
25static struct sk_buff *gre_gso_segment(struct sk_buff *skb, 18static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
26 netdev_features_t features) 19 netdev_features_t features)
27{ 20{
@@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
46 SKB_GSO_IPIP))) 39 SKB_GSO_IPIP)))
47 goto out; 40 goto out;
48 41
42 if (!skb->encapsulation)
43 goto out;
44
49 if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) 45 if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
50 goto out; 46 goto out;
51 47
@@ -119,28 +115,6 @@ out:
119 return segs; 115 return segs;
120} 116}
121 117
122/* Compute the whole skb csum in s/w and store it, then verify GRO csum
123 * starting from gro_offset.
124 */
125static __sum16 gro_skb_checksum(struct sk_buff *skb)
126{
127 __sum16 sum;
128
129 skb->csum = skb_checksum(skb, 0, skb->len, 0);
130 NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
131 csum_partial(skb->data, skb_gro_offset(skb), 0));
132 sum = csum_fold(NAPI_GRO_CB(skb)->csum);
133 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
134 if (unlikely(!sum) && !skb->csum_complete_sw)
135 netdev_rx_csum_fault(skb->dev);
136 } else {
137 skb->ip_summed = CHECKSUM_COMPLETE;
138 skb->csum_complete_sw = 1;
139 }
140
141 return sum;
142}
143
144static struct sk_buff **gre_gro_receive(struct sk_buff **head, 118static struct sk_buff **gre_gro_receive(struct sk_buff **head,
145 struct sk_buff *skb) 119 struct sk_buff *skb)
146{ 120{
@@ -192,22 +166,16 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
192 if (unlikely(!greh)) 166 if (unlikely(!greh))
193 goto out_unlock; 167 goto out_unlock;
194 } 168 }
195 if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ 169
196 __sum16 csum = 0; 170 /* Don't bother verifying checksum if we're going to flush anyway. */
197 171 if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
198 if (skb->ip_summed == CHECKSUM_COMPLETE) 172 if (skb_gro_checksum_simple_validate(skb))
199 csum = csum_fold(NAPI_GRO_CB(skb)->csum);
200 /* Don't trust csum error calculated/reported by h/w */
201 if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
202 csum = gro_skb_checksum(skb);
203
204 /* GRE CSUM is the 1's complement of the 1's complement sum
205 * of the GRE hdr plus payload so it should add up to 0xffff
206 * (and 0 after csum_fold()) just like the IPv4 hdr csum.
207 */
208 if (csum)
209 goto out_unlock; 173 goto out_unlock;
174
175 skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
176 null_compute_pseudo);
210 } 177 }
178
211 flush = 0; 179 flush = 0;
212 180
213 for (p = *head; p; p = p->next) { 181 for (p = *head; p; p = p->next) {
@@ -284,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
284 252
285static const struct net_offload gre_offload = { 253static const struct net_offload gre_offload = {
286 .callbacks = { 254 .callbacks = {
287 .gso_send_check = gre_gso_send_check,
288 .gso_segment = gre_gso_segment, 255 .gso_segment = gre_gso_segment,
289 .gro_receive = gre_gro_receive, 256 .gro_receive = gre_gro_receive,
290 .gro_complete = gre_gro_complete, 257 .gro_complete = gre_gro_complete,
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index ea7d4afe8205..5882f584910e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk)
231 spin_unlock_bh(&sk->sk_lock.slock); 231 spin_unlock_bh(&sk->sk_lock.slock);
232} 232}
233 233
234int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
235int sysctl_icmp_msgs_burst __read_mostly = 50;
236
237static struct {
238 spinlock_t lock;
239 u32 credit;
240 u32 stamp;
241} icmp_global = {
242 .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
243};
244
245/**
246 * icmp_global_allow - Are we allowed to send one more ICMP message ?
247 *
248 * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
249 * Returns false if we reached the limit and can not send another packet.
250 * Note: called with BH disabled
251 */
252bool icmp_global_allow(void)
253{
254 u32 credit, delta, incr = 0, now = (u32)jiffies;
255 bool rc = false;
256
257 /* Check if token bucket is empty and cannot be refilled
258 * without taking the spinlock.
259 */
260 if (!icmp_global.credit) {
261 delta = min_t(u32, now - icmp_global.stamp, HZ);
262 if (delta < HZ / 50)
263 return false;
264 }
265
266 spin_lock(&icmp_global.lock);
267 delta = min_t(u32, now - icmp_global.stamp, HZ);
268 if (delta >= HZ / 50) {
269 incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
270 if (incr)
271 icmp_global.stamp = now;
272 }
273 credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
274 if (credit) {
275 credit--;
276 rc = true;
277 }
278 icmp_global.credit = credit;
279 spin_unlock(&icmp_global.lock);
280 return rc;
281}
282EXPORT_SYMBOL(icmp_global_allow);
283
234/* 284/*
235 * Send an ICMP frame. 285 * Send an ICMP frame.
236 */ 286 */
237 287
238static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, 288static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
239 struct flowi4 *fl4, int type, int code) 289 struct flowi4 *fl4, int type, int code)
240{ 290{
241 struct dst_entry *dst = &rt->dst; 291 struct dst_entry *dst = &rt->dst;
242 bool rc = true; 292 bool rc = true;
@@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
253 goto out; 303 goto out;
254 304
255 /* Limit if icmp type is enabled in ratemask. */ 305 /* Limit if icmp type is enabled in ratemask. */
256 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { 306 if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
257 struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); 307 goto out;
308
309 rc = false;
310 if (icmp_global_allow()) {
311 struct inet_peer *peer;
312
313 peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
258 rc = inet_peer_xrlim_allow(peer, 314 rc = inet_peer_xrlim_allow(peer,
259 net->ipv4.sysctl_icmp_ratelimit); 315 net->ipv4.sysctl_icmp_ratelimit);
260 if (peer) 316 if (peer)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index f10eab462282..fb70e3ecc3e4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -117,7 +117,7 @@
117#define IGMP_V2_Unsolicited_Report_Interval (10*HZ) 117#define IGMP_V2_Unsolicited_Report_Interval (10*HZ)
118#define IGMP_V3_Unsolicited_Report_Interval (1*HZ) 118#define IGMP_V3_Unsolicited_Report_Interval (1*HZ)
119#define IGMP_Query_Response_Interval (10*HZ) 119#define IGMP_Query_Response_Interval (10*HZ)
120#define IGMP_Unsolicited_Report_Count 2 120#define IGMP_Query_Robustness_Variable 2
121 121
122 122
123#define IGMP_Initial_Report_Delay (1) 123#define IGMP_Initial_Report_Delay (1)
@@ -756,8 +756,7 @@ static void igmp_ifc_event(struct in_device *in_dev)
756{ 756{
757 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) 757 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
758 return; 758 return;
759 in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : 759 in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
760 IGMP_Unsolicited_Report_Count;
761 igmp_ifc_start_timer(in_dev, 1); 760 igmp_ifc_start_timer(in_dev, 1);
762} 761}
763 762
@@ -932,7 +931,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
932 in_dev->mr_qrv = ih3->qrv; 931 in_dev->mr_qrv = ih3->qrv;
933 if (!group) { /* general query */ 932 if (!group) { /* general query */
934 if (ih3->nsrcs) 933 if (ih3->nsrcs)
935 return false; /* no sources allowed */ 934 return true; /* no sources allowed */
936 igmp_gq_start_timer(in_dev); 935 igmp_gq_start_timer(in_dev);
937 return false; 936 return false;
938 } 937 }
@@ -1086,8 +1085,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
1086 pmc->interface = im->interface; 1085 pmc->interface = im->interface;
1087 in_dev_hold(in_dev); 1086 in_dev_hold(in_dev);
1088 pmc->multiaddr = im->multiaddr; 1087 pmc->multiaddr = im->multiaddr;
1089 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 1088 pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
1090 IGMP_Unsolicited_Report_Count;
1091 pmc->sfmode = im->sfmode; 1089 pmc->sfmode = im->sfmode;
1092 if (pmc->sfmode == MCAST_INCLUDE) { 1090 if (pmc->sfmode == MCAST_INCLUDE) {
1093 struct ip_sf_list *psf; 1091 struct ip_sf_list *psf;
@@ -1226,8 +1224,7 @@ static void igmp_group_added(struct ip_mc_list *im)
1226 } 1224 }
1227 /* else, v3 */ 1225 /* else, v3 */
1228 1226
1229 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 1227 im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
1230 IGMP_Unsolicited_Report_Count;
1231 igmp_ifc_event(in_dev); 1228 igmp_ifc_event(in_dev);
1232#endif 1229#endif
1233} 1230}
@@ -1322,7 +1319,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1322 spin_lock_init(&im->lock); 1319 spin_lock_init(&im->lock);
1323#ifdef CONFIG_IP_MULTICAST 1320#ifdef CONFIG_IP_MULTICAST
1324 setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); 1321 setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
1325 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1322 im->unsolicit_count = sysctl_igmp_qrv;
1326#endif 1323#endif
1327 1324
1328 im->next_rcu = in_dev->mc_list; 1325 im->next_rcu = in_dev->mc_list;
@@ -1460,7 +1457,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
1460 (unsigned long)in_dev); 1457 (unsigned long)in_dev);
1461 setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 1458 setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
1462 (unsigned long)in_dev); 1459 (unsigned long)in_dev);
1463 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; 1460 in_dev->mr_qrv = sysctl_igmp_qrv;
1464#endif 1461#endif
1465 1462
1466 spin_lock_init(&in_dev->mc_tomb_lock); 1463 spin_lock_init(&in_dev->mc_tomb_lock);
@@ -1474,6 +1471,9 @@ void ip_mc_up(struct in_device *in_dev)
1474 1471
1475 ASSERT_RTNL(); 1472 ASSERT_RTNL();
1476 1473
1474#ifdef CONFIG_IP_MULTICAST
1475 in_dev->mr_qrv = sysctl_igmp_qrv;
1476#endif
1477 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); 1477 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
1478 1478
1479 for_each_pmc_rtnl(in_dev, pmc) 1479 for_each_pmc_rtnl(in_dev, pmc)
@@ -1540,7 +1540,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1540 */ 1540 */
1541int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; 1541int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
1542int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; 1542int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
1543 1543#ifdef CONFIG_IP_MULTICAST
1544int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable;
1545#endif
1544 1546
1545static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, 1547static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
1546 __be32 *psfsrc) 1548 __be32 *psfsrc)
@@ -1575,8 +1577,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
1575#ifdef CONFIG_IP_MULTICAST 1577#ifdef CONFIG_IP_MULTICAST
1576 if (psf->sf_oldin && 1578 if (psf->sf_oldin &&
1577 !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { 1579 !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
1578 psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 1580 psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
1579 IGMP_Unsolicited_Report_Count;
1580 psf->sf_next = pmc->tomb; 1581 psf->sf_next = pmc->tomb;
1581 pmc->tomb = psf; 1582 pmc->tomb = psf;
1582 rv = 1; 1583 rv = 1;
@@ -1639,8 +1640,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1639 /* filter mode change */ 1640 /* filter mode change */
1640 pmc->sfmode = MCAST_INCLUDE; 1641 pmc->sfmode = MCAST_INCLUDE;
1641#ifdef CONFIG_IP_MULTICAST 1642#ifdef CONFIG_IP_MULTICAST
1642 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 1643 pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
1643 IGMP_Unsolicited_Report_Count;
1644 in_dev->mr_ifc_count = pmc->crcount; 1644 in_dev->mr_ifc_count = pmc->crcount;
1645 for (psf = pmc->sources; psf; psf = psf->sf_next) 1645 for (psf = pmc->sources; psf; psf = psf->sf_next)
1646 psf->sf_crcount = 0; 1646 psf->sf_crcount = 0;
@@ -1818,8 +1818,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1818#ifdef CONFIG_IP_MULTICAST 1818#ifdef CONFIG_IP_MULTICAST
1819 /* else no filters; keep old mode for reports */ 1819 /* else no filters; keep old mode for reports */
1820 1820
1821 pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 1821 pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
1822 IGMP_Unsolicited_Report_Count;
1823 in_dev->mr_ifc_count = pmc->crcount; 1822 in_dev->mr_ifc_count = pmc->crcount;
1824 for (psf = pmc->sources; psf; psf = psf->sf_next) 1823 for (psf = pmc->sources; psf; psf = psf->sf_next)
1825 psf->sf_crcount = 0; 1824 psf->sf_crcount = 0;
@@ -2539,7 +2538,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2539 querier = "NONE"; 2538 querier = "NONE";
2540#endif 2539#endif
2541 2540
2542 if (rcu_dereference(state->in_dev->mc_list) == im) { 2541 if (rcu_access_pointer(state->in_dev->mc_list) == im) {
2543 seq_printf(seq, "%d\t%-10s: %5d %7s\n", 2542 seq_printf(seq, "%d\t%-10s: %5d %7s\n",
2544 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); 2543 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
2545 } 2544 }
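
The igmp.c hunks above repeatedly collapse "x ? x : y" into GCC's "?:" shorthand while switching the fallback from the IGMP_Unsolicited_Report_Count constant to the new sysctl_igmp_qrv sysctl. A minimal user-space sketch of that shorthand, with illustrative names standing in for mr_qrv and the sysctl:

/* Minimal sketch of the GNU C "?:" shorthand used in the igmp.c hunks above.
 * The names qrv_from_query and qrv_sysctl_default are illustrative only. */
#include <stdio.h>

static int pick_qrv(int qrv_from_query, int qrv_sysctl_default)
{
	/* "a ?: b" evaluates a once and yields it when non-zero,
	 * otherwise yields b - equivalent to "a ? a : b" without
	 * evaluating a twice. */
	return qrv_from_query ?: qrv_sysctl_default;
}

int main(void)
{
	printf("%d\n", pick_qrv(0, 2));	/* falls back to the default: 2 */
	printf("%d\n", pick_qrv(7, 2));	/* keeps the queried value:   7 */
	return 0;
}

"a ?: b" is a GNU C extension rather than ISO C, but it is part of the dialect the kernel is built with, which is why the hunks can use it freely.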
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 43116e8c8e13..9111a4e22155 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -229,7 +229,7 @@ begin:
229 } 229 }
230 } else if (score == hiscore && reuseport) { 230 } else if (score == hiscore && reuseport) {
231 matches++; 231 matches++;
232 if (((u64)phash * matches) >> 32 == 0) 232 if (reciprocal_scale(phash, matches) == 0)
233 result = sk; 233 result = sk;
234 phash = next_pseudo_random32(phash); 234 phash = next_pseudo_random32(phash);
235 } 235 }
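
This hunk (and the ipt_CLUSTERIP.c one further down) replaces the open-coded "((u64)hash * n) >> 32" with reciprocal_scale(), which maps a 32-bit hash onto [0, n) with a multiply and shift instead of a modulo. A stand-alone sketch of the same arithmetic; the helper below mirrors what the replaced expression computes and is not the kernel header itself:

#include <stdint.h>
#include <stdio.h>

/* Map a uniformly distributed 32-bit value onto [0, ep_ro) without a
 * division: (val * ep_ro) / 2^32. Mirrors the open-coded expression the
 * hunks above replace with reciprocal_scale(). */
static uint32_t scale32(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	printf("%u\n", scale32(0xffffffffu, 8));	/* 7 */
	printf("%u\n", scale32(0x80000000u, 8));	/* 4 */
	printf("%u\n", scale32(0, 8));			/* 0 */
	return 0;
}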
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index bd5f5928167d..241afd743d2c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -72,29 +72,10 @@ void inet_peer_base_init(struct inet_peer_base *bp)
72{ 72{
73 bp->root = peer_avl_empty_rcu; 73 bp->root = peer_avl_empty_rcu;
74 seqlock_init(&bp->lock); 74 seqlock_init(&bp->lock);
75 bp->flush_seq = ~0U;
76 bp->total = 0; 75 bp->total = 0;
77} 76}
78EXPORT_SYMBOL_GPL(inet_peer_base_init); 77EXPORT_SYMBOL_GPL(inet_peer_base_init);
79 78
80static atomic_t v4_seq = ATOMIC_INIT(0);
81static atomic_t v6_seq = ATOMIC_INIT(0);
82
83static atomic_t *inetpeer_seq_ptr(int family)
84{
85 return (family == AF_INET ? &v4_seq : &v6_seq);
86}
87
88static inline void flush_check(struct inet_peer_base *base, int family)
89{
90 atomic_t *fp = inetpeer_seq_ptr(family);
91
92 if (unlikely(base->flush_seq != atomic_read(fp))) {
93 inetpeer_invalidate_tree(base);
94 base->flush_seq = atomic_read(fp);
95 }
96}
97
98#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 79#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
99 80
100/* Exported for sysctl_net_ipv4. */ 81/* Exported for sysctl_net_ipv4. */
@@ -444,8 +425,6 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
444 unsigned int sequence; 425 unsigned int sequence;
445 int invalidated, gccnt = 0; 426 int invalidated, gccnt = 0;
446 427
447 flush_check(base, daddr->family);
448
449 /* Attempt a lockless lookup first. 428 /* Attempt a lockless lookup first.
450 * Because of a concurrent writer, we might not find an existing entry. 429 * Because of a concurrent writer, we might not find an existing entry.
451 */ 430 */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 15f0e2bad7ad..2811cc18701a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -790,7 +790,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
790 kfree(table); 790 kfree(table);
791} 791}
792 792
793static void ip4_frags_ctl_register(void) 793static void __init ip4_frags_ctl_register(void)
794{ 794{
795 register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); 795 register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
796} 796}
@@ -804,7 +804,7 @@ static inline void ip4_frags_ns_ctl_unregister(struct net *net)
804{ 804{
805} 805}
806 806
807static inline void ip4_frags_ctl_register(void) 807static inline void __init ip4_frags_ctl_register(void)
808{ 808{
809} 809}
810#endif 810#endif
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9b842544aea3..12055fdbe716 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -239,7 +239,9 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
239 tpi.seq = htonl(tunnel->o_seqno); 239 tpi.seq = htonl(tunnel->o_seqno);
240 240
241 /* Push GRE header. */ 241 /* Push GRE header. */
242 gre_build_header(skb, &tpi, tunnel->hlen); 242 gre_build_header(skb, &tpi, tunnel->tun_hlen);
243
244 skb_set_inner_protocol(skb, tpi.proto);
243 245
244 ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); 246 ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
245} 247}
@@ -310,7 +312,7 @@ out:
310static int ipgre_tunnel_ioctl(struct net_device *dev, 312static int ipgre_tunnel_ioctl(struct net_device *dev,
311 struct ifreq *ifr, int cmd) 313 struct ifreq *ifr, int cmd)
312{ 314{
313 int err = 0; 315 int err;
314 struct ip_tunnel_parm p; 316 struct ip_tunnel_parm p;
315 317
316 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) 318 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
@@ -470,13 +472,18 @@ static void ipgre_tunnel_setup(struct net_device *dev)
470static void __gre_tunnel_init(struct net_device *dev) 472static void __gre_tunnel_init(struct net_device *dev)
471{ 473{
472 struct ip_tunnel *tunnel; 474 struct ip_tunnel *tunnel;
475 int t_hlen;
473 476
474 tunnel = netdev_priv(dev); 477 tunnel = netdev_priv(dev);
475 tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); 478 tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
476 tunnel->parms.iph.protocol = IPPROTO_GRE; 479 tunnel->parms.iph.protocol = IPPROTO_GRE;
477 480
478 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 481 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
479 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; 482
483 t_hlen = tunnel->hlen + sizeof(struct iphdr);
484
485 dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
486 dev->mtu = ETH_DATA_LEN - t_hlen - 4;
480 487
481 dev->features |= GRE_FEATURES; 488 dev->features |= GRE_FEATURES;
482 dev->hw_features |= GRE_FEATURES; 489 dev->hw_features |= GRE_FEATURES;
@@ -503,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
503 memcpy(dev->broadcast, &iph->daddr, 4); 510 memcpy(dev->broadcast, &iph->daddr, 4);
504 511
505 dev->flags = IFF_NOARP; 512 dev->flags = IFF_NOARP;
506 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 513 netif_keep_dst(dev);
507 dev->addr_len = 4; 514 dev->addr_len = 4;
508 515
509 if (iph->daddr) { 516 if (iph->daddr) {
@@ -628,6 +635,40 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
628 parms->iph.frag_off = htons(IP_DF); 635 parms->iph.frag_off = htons(IP_DF);
629} 636}
630 637
638/* This function returns true when ENCAP attributes are present in the nl msg */
639static bool ipgre_netlink_encap_parms(struct nlattr *data[],
640 struct ip_tunnel_encap *ipencap)
641{
642 bool ret = false;
643
644 memset(ipencap, 0, sizeof(*ipencap));
645
646 if (!data)
647 return ret;
648
649 if (data[IFLA_GRE_ENCAP_TYPE]) {
650 ret = true;
651 ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
652 }
653
654 if (data[IFLA_GRE_ENCAP_FLAGS]) {
655 ret = true;
656 ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
657 }
658
659 if (data[IFLA_GRE_ENCAP_SPORT]) {
660 ret = true;
661 ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]);
662 }
663
664 if (data[IFLA_GRE_ENCAP_DPORT]) {
665 ret = true;
666 ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]);
667 }
668
669 return ret;
670}
671
631static int gre_tap_init(struct net_device *dev) 672static int gre_tap_init(struct net_device *dev)
632{ 673{
633 __gre_tunnel_init(dev); 674 __gre_tunnel_init(dev);
@@ -657,6 +698,15 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
657 struct nlattr *tb[], struct nlattr *data[]) 698 struct nlattr *tb[], struct nlattr *data[])
658{ 699{
659 struct ip_tunnel_parm p; 700 struct ip_tunnel_parm p;
701 struct ip_tunnel_encap ipencap;
702
703 if (ipgre_netlink_encap_parms(data, &ipencap)) {
704 struct ip_tunnel *t = netdev_priv(dev);
705 int err = ip_tunnel_encap_setup(t, &ipencap);
706
707 if (err < 0)
708 return err;
709 }
660 710
661 ipgre_netlink_parms(data, tb, &p); 711 ipgre_netlink_parms(data, tb, &p);
662 return ip_tunnel_newlink(dev, tb, &p); 712 return ip_tunnel_newlink(dev, tb, &p);
@@ -666,6 +716,15 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
666 struct nlattr *data[]) 716 struct nlattr *data[])
667{ 717{
668 struct ip_tunnel_parm p; 718 struct ip_tunnel_parm p;
719 struct ip_tunnel_encap ipencap;
720
721 if (ipgre_netlink_encap_parms(data, &ipencap)) {
722 struct ip_tunnel *t = netdev_priv(dev);
723 int err = ip_tunnel_encap_setup(t, &ipencap);
724
725 if (err < 0)
726 return err;
727 }
669 728
670 ipgre_netlink_parms(data, tb, &p); 729 ipgre_netlink_parms(data, tb, &p);
671 return ip_tunnel_changelink(dev, tb, &p); 730 return ip_tunnel_changelink(dev, tb, &p);
@@ -694,6 +753,14 @@ static size_t ipgre_get_size(const struct net_device *dev)
694 nla_total_size(1) + 753 nla_total_size(1) +
695 /* IFLA_GRE_PMTUDISC */ 754 /* IFLA_GRE_PMTUDISC */
696 nla_total_size(1) + 755 nla_total_size(1) +
756 /* IFLA_GRE_ENCAP_TYPE */
757 nla_total_size(2) +
758 /* IFLA_GRE_ENCAP_FLAGS */
759 nla_total_size(2) +
760 /* IFLA_GRE_ENCAP_SPORT */
761 nla_total_size(2) +
762 /* IFLA_GRE_ENCAP_DPORT */
763 nla_total_size(2) +
697 0; 764 0;
698} 765}
699 766
@@ -714,6 +781,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
714 nla_put_u8(skb, IFLA_GRE_PMTUDISC, 781 nla_put_u8(skb, IFLA_GRE_PMTUDISC,
715 !!(p->iph.frag_off & htons(IP_DF)))) 782 !!(p->iph.frag_off & htons(IP_DF))))
716 goto nla_put_failure; 783 goto nla_put_failure;
784
785 if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
786 t->encap.type) ||
787 nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT,
788 t->encap.sport) ||
789 nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT,
790 t->encap.dport) ||
791 nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
792 t->encap.dport))
793 goto nla_put_failure;
794
717 return 0; 795 return 0;
718 796
719nla_put_failure: 797nla_put_failure:
@@ -731,6 +809,10 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
731 [IFLA_GRE_TTL] = { .type = NLA_U8 }, 809 [IFLA_GRE_TTL] = { .type = NLA_U8 },
732 [IFLA_GRE_TOS] = { .type = NLA_U8 }, 810 [IFLA_GRE_TOS] = { .type = NLA_U8 },
733 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, 811 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
812 [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
813 [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
814 [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
815 [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
734}; 816};
735 817
736static struct rtnl_link_ops ipgre_link_ops __read_mostly = { 818static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
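
__gre_tunnel_init() above now derives needed_headroom and the device MTU from tunnel->tun_hlen plus tunnel->encap_hlen instead of hardcoding sizeof(struct iphdr) + 4, so a FOU/GUE encapsulation automatically widens both. A small arithmetic sketch of that sizing; the concrete constants below (link-layer reserve, Ethernet payload, header lengths) are illustrative assumptions, not values taken from the patch:

#include <stdio.h>

/* Illustrative sizing only; the constants stand in for LL_MAX_HEADER,
 * ETH_DATA_LEN, the GRE header and a FOU UDP header. */
#define LL_MAX_HEADER_GUESS	128	/* config-dependent in the kernel */
#define ETH_DATA_LEN_VAL	1500
#define IPV4_HDR_LEN		20	/* IPv4 header without options */

int main(void)
{
	int tun_hlen = 8;	/* e.g. 4-byte GRE header plus a 4-byte key */
	int encap_hlen = 8;	/* e.g. FOU adds one UDP header */
	int hlen = tun_hlen + encap_hlen;
	int t_hlen = hlen + IPV4_HDR_LEN;

	/* Mirrors the arithmetic in __gre_tunnel_init() above. */
	printf("needed_headroom = %d\n", LL_MAX_HEADER_GUESS + t_hlen + 4);
	printf("mtu             = %d\n", ETH_DATA_LEN_VAL - t_hlen - 4);
	return 0;
}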
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ad382499bace..5b3d91be2db0 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
87 * NOTE: dopt cannot point to skb. 87 * NOTE: dopt cannot point to skb.
88 */ 88 */
89 89
90int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) 90int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
91 const struct ip_options *sopt)
91{ 92{
92 const struct ip_options *sopt;
93 unsigned char *sptr, *dptr; 93 unsigned char *sptr, *dptr;
94 int soffset, doffset; 94 int soffset, doffset;
95 int optlen; 95 int optlen;
96 96
97 memset(dopt, 0, sizeof(struct ip_options)); 97 memset(dopt, 0, sizeof(struct ip_options));
98 98
99 sopt = &(IPCB(skb)->opt);
100
101 if (sopt->optlen == 0) 99 if (sopt->optlen == 0)
102 return 0; 100 return 0;
103 101
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 215af2b155cb..e35b71289156 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -516,7 +516,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
516 516
517 hlen = iph->ihl * 4; 517 hlen = iph->ihl * 4;
518 mtu = mtu - hlen; /* Size of data space */ 518 mtu = mtu - hlen; /* Size of data space */
519#ifdef CONFIG_BRIDGE_NETFILTER 519#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
520 if (skb->nf_bridge) 520 if (skb->nf_bridge)
521 mtu -= nf_bridge_mtu_reduction(skb); 521 mtu -= nf_bridge_mtu_reduction(skb);
522#endif 522#endif
@@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
1522 .uc_ttl = -1, 1522 .uc_ttl = -1,
1523}; 1523};
1524 1524
1525void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, 1525void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
1526 __be32 saddr, const struct ip_reply_arg *arg, 1526 const struct ip_options *sopt,
1527 __be32 daddr, __be32 saddr,
1528 const struct ip_reply_arg *arg,
1527 unsigned int len) 1529 unsigned int len)
1528{ 1530{
1529 struct ip_options_data replyopts; 1531 struct ip_options_data replyopts;
@@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1534 struct sock *sk; 1536 struct sock *sk;
1535 struct inet_sock *inet; 1537 struct inet_sock *inet;
1536 1538
1537 if (ip_options_echo(&replyopts.opt.opt, skb)) 1539 if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
1538 return; 1540 return;
1539 1541
1540 ipc.addr = daddr; 1542 ipc.addr = daddr;
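
The first ip_output.c hunk, like the later ipt_REJECT.c and nf_defrag_ipv4.c ones, swaps "#ifdef CONFIG_BRIDGE_NETFILTER" for IS_ENABLED(CONFIG_BRIDGE_NETFILTER), which is also true when the option is built as a module (CONFIG_..._MODULE). A simplified stand-alone reconstruction of how that macro trick works; this is a sketch of the idea, not the kernel's kconfig.h verbatim:

#include <stdio.h>

/* Simplified reconstruction: a Kconfig symbol is either undefined, or
 * defined to 1 as CONFIG_FOO (=y), or defined to 1 as CONFIG_FOO_MODULE
 * (=m). Not the kernel header verbatim. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_BUILTIN(option) __is_defined(option)
#define IS_MODULE(option) __is_defined(option##_MODULE)
#define IS_ENABLED(option) (IS_BUILTIN(option) || IS_MODULE(option))

#define CONFIG_DEMO_MODULE 1	/* pretend the option is built as a module */

int main(void)
{
	/* "#ifdef CONFIG_DEMO" would be false here; IS_ENABLED() still is 1. */
	printf("IS_ENABLED(CONFIG_DEMO) = %d\n", IS_ENABLED(CONFIG_DEMO));
	return 0;
}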
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5cb830c78990..c373a9ad4555 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -303,7 +303,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
303 } 303 }
304 /* dont let ip_call_ra_chain() use sk again */ 304 /* dont let ip_call_ra_chain() use sk again */
305 ra->sk = NULL; 305 ra->sk = NULL;
306 rcu_assign_pointer(*rap, ra->next); 306 RCU_INIT_POINTER(*rap, ra->next);
307 spin_unlock_bh(&ip_ra_lock); 307 spin_unlock_bh(&ip_ra_lock);
308 308
309 if (ra->destructor) 309 if (ra->destructor)
@@ -325,7 +325,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
325 new_ra->sk = sk; 325 new_ra->sk = sk;
326 new_ra->destructor = destructor; 326 new_ra->destructor = destructor;
327 327
328 new_ra->next = ra; 328 RCU_INIT_POINTER(new_ra->next, ra);
329 rcu_assign_pointer(*rap, new_ra); 329 rcu_assign_pointer(*rap, new_ra);
330 sock_hold(sk); 330 sock_hold(sk);
331 spin_unlock_bh(&ip_ra_lock); 331 spin_unlock_bh(&ip_ra_lock);
@@ -405,7 +405,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
405int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) 405int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
406{ 406{
407 struct sock_exterr_skb *serr; 407 struct sock_exterr_skb *serr;
408 struct sk_buff *skb, *skb2; 408 struct sk_buff *skb;
409 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); 409 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
410 struct { 410 struct {
411 struct sock_extended_err ee; 411 struct sock_extended_err ee;
@@ -415,7 +415,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
415 int copied; 415 int copied;
416 416
417 err = -EAGAIN; 417 err = -EAGAIN;
418 skb = skb_dequeue(&sk->sk_error_queue); 418 skb = sock_dequeue_err_skb(sk);
419 if (skb == NULL) 419 if (skb == NULL)
420 goto out; 420 goto out;
421 421
@@ -462,17 +462,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
462 msg->msg_flags |= MSG_ERRQUEUE; 462 msg->msg_flags |= MSG_ERRQUEUE;
463 err = copied; 463 err = copied;
464 464
465 /* Reset and regenerate socket error */
466 spin_lock_bh(&sk->sk_error_queue.lock);
467 sk->sk_err = 0;
468 skb2 = skb_peek(&sk->sk_error_queue);
469 if (skb2 != NULL) {
470 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
471 spin_unlock_bh(&sk->sk_error_queue.lock);
472 sk->sk_error_report(sk);
473 } else
474 spin_unlock_bh(&sk->sk_error_queue.lock);
475
476out_free_skb: 465out_free_skb:
477 kfree_skb(skb); 466 kfree_skb(skb);
478out: 467out:
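
The ip_recv_error() hunks above drop the hand-rolled "reset and regenerate socket error" block and dequeue via the new sock_dequeue_err_skb() helper instead; the user-space contract is unchanged. For reference, a minimal sketch of that user-space side, enabling IP_RECVERR and draining one entry from the error queue (error handling trimmed, socket usage illustrative):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/errqueue.h>

/* Drain one message from a socket's error queue, if any. Returns the
 * ee_errno reported by the kernel, or 0 when the queue is empty. */
static int drain_one_error(int fd)
{
	char cbuf[512];
	char data[256];
	struct msghdr msg;
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct cmsghdr *cm;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return 0;	/* nothing queued */

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		/* IPPROTO_IP and SOL_IP are the same value for this check. */
		if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *ee =
				(struct sock_extended_err *)CMSG_DATA(cm);
			return ee->ee_errno;
		}
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int on = 1;

	/* Ask the kernel to queue extended errors (ICMP errors etc.). */
	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
	printf("queued errno: %d\n", drain_one_error(fd));
	return 0;
}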
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index bda4bb8ae260..0bb8e141eacc 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,8 @@
55#include <net/net_namespace.h> 55#include <net/net_namespace.h>
56#include <net/netns/generic.h> 56#include <net/netns/generic.h>
57#include <net/rtnetlink.h> 57#include <net/rtnetlink.h>
58#include <net/udp.h>
59#include <net/gue.h>
58 60
59#if IS_ENABLED(CONFIG_IPV6) 61#if IS_ENABLED(CONFIG_IPV6)
60#include <net/ipv6.h> 62#include <net/ipv6.h>
@@ -487,6 +489,103 @@ drop:
487} 489}
488EXPORT_SYMBOL_GPL(ip_tunnel_rcv); 490EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
489 491
492static int ip_encap_hlen(struct ip_tunnel_encap *e)
493{
494 switch (e->type) {
495 case TUNNEL_ENCAP_NONE:
496 return 0;
497 case TUNNEL_ENCAP_FOU:
498 return sizeof(struct udphdr);
499 case TUNNEL_ENCAP_GUE:
500 return sizeof(struct udphdr) + sizeof(struct guehdr);
501 default:
502 return -EINVAL;
503 }
504}
505
506int ip_tunnel_encap_setup(struct ip_tunnel *t,
507 struct ip_tunnel_encap *ipencap)
508{
509 int hlen;
510
511 memset(&t->encap, 0, sizeof(t->encap));
512
513 hlen = ip_encap_hlen(ipencap);
514 if (hlen < 0)
515 return hlen;
516
517 t->encap.type = ipencap->type;
518 t->encap.sport = ipencap->sport;
519 t->encap.dport = ipencap->dport;
520 t->encap.flags = ipencap->flags;
521
522 t->encap_hlen = hlen;
523 t->hlen = t->encap_hlen + t->tun_hlen;
524
525 return 0;
526}
527EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
528
529static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
530 size_t hdr_len, u8 *protocol, struct flowi4 *fl4)
531{
532 struct udphdr *uh;
533 __be16 sport;
534 bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
535 int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
536
537 skb = iptunnel_handle_offloads(skb, csum, type);
538
539 if (IS_ERR(skb))
540 return PTR_ERR(skb);
541
542 /* Get length and hash before making space in skb */
543
544 sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
545 skb, 0, 0, false);
546
547 skb_push(skb, hdr_len);
548
549 skb_reset_transport_header(skb);
550 uh = udp_hdr(skb);
551
552 if (e->type == TUNNEL_ENCAP_GUE) {
553 struct guehdr *guehdr = (struct guehdr *)&uh[1];
554
555 guehdr->version = 0;
556 guehdr->hlen = 0;
557 guehdr->flags = 0;
558 guehdr->next_hdr = *protocol;
559 }
560
561 uh->dest = e->dport;
562 uh->source = sport;
563 uh->len = htons(skb->len);
564 uh->check = 0;
565 udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
566 fl4->saddr, fl4->daddr, skb->len);
567
568 *protocol = IPPROTO_UDP;
569
570 return 0;
571}
572
573int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
574 u8 *protocol, struct flowi4 *fl4)
575{
576 switch (t->encap.type) {
577 case TUNNEL_ENCAP_NONE:
578 return 0;
579 case TUNNEL_ENCAP_FOU:
580 case TUNNEL_ENCAP_GUE:
581 return fou_build_header(skb, &t->encap, t->encap_hlen,
582 protocol, fl4);
583 default:
584 return -EINVAL;
585 }
586}
587EXPORT_SYMBOL(ip_tunnel_encap);
588
490static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, 589static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
491 struct rtable *rt, __be16 df) 590 struct rtable *rt, __be16 df)
492{ 591{
@@ -536,7 +635,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
536} 635}
537 636
538void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 637void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
539 const struct iphdr *tnl_params, const u8 protocol) 638 const struct iphdr *tnl_params, u8 protocol)
540{ 639{
541 struct ip_tunnel *tunnel = netdev_priv(dev); 640 struct ip_tunnel *tunnel = netdev_priv(dev);
542 const struct iphdr *inner_iph; 641 const struct iphdr *inner_iph;
@@ -617,6 +716,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
617 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, 716 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
618 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); 717 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
619 718
719 if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
720 goto tx_error;
721
620 rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; 722 rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
621 723
622 if (!rt) { 724 if (!rt) {
@@ -670,7 +772,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
670 df |= (inner_iph->frag_off&htons(IP_DF)); 772 df |= (inner_iph->frag_off&htons(IP_DF));
671 773
672 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) 774 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
673 + rt->dst.header_len; 775 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
674 if (max_headroom > dev->needed_headroom) 776 if (max_headroom > dev->needed_headroom)
675 dev->needed_headroom = max_headroom; 777 dev->needed_headroom = max_headroom;
676 778
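
fou_build_header() above picks the outer UDP source port with udp_flow_src_port() whenever the tunnel does not pin one, so distinct inner flows spread across ports for ECMP/RSS. A stand-alone sketch of that idea, hashing a flow tuple into a port range; the mixing function and the 32768-60999 range are illustrative assumptions, not the kernel's implementation:

#include <stdint.h>
#include <stdio.h>

/* Toy 32-bit mix of a flow tuple; the kernel uses its flow dissector and
 * jhash instead. Purely illustrative. */
static uint32_t mix(uint32_t a, uint32_t b, uint32_t c)
{
	uint32_t h = a * 2654435761u;

	h ^= b + 0x9e3779b9u + (h << 6) + (h >> 2);
	h ^= c + 0x9e3779b9u + (h << 6) + (h >> 2);
	return h;
}

/* Map a flow hash onto [low, high], mirroring the role udp_flow_src_port()
 * plays in fou_build_header() when e->sport is 0. */
static uint16_t flow_src_port(uint32_t hash, uint16_t low, uint16_t high)
{
	return (uint16_t)(low + (hash % (uint32_t)(high - low + 1)));
}

int main(void)
{
	uint32_t h = mix(0x0a000001, 0x0a000002, (4789u << 16) | 4789u);

	printf("source port: %u\n", flow_src_port(h, 32768, 60999));
	return 0;
}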
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index e453cb724a95..3e861011e4a3 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev)
364 dev->iflink = 0; 364 dev->iflink = 0;
365 dev->addr_len = 4; 365 dev->addr_len = 4;
366 dev->features |= NETIF_F_LLTX; 366 dev->features |= NETIF_F_LLTX;
367 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 367 netif_keep_dst(dev);
368 368
369 return ip_tunnel_init(dev); 369 return ip_tunnel_init(dev);
370} 370}
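
Here, as in the ip_gre.c and ipip.c hunks, the open-coded "dev->priv_flags &= ~IFF_XMIT_DST_RELEASE" becomes a call to the netif_keep_dst() helper, keeping the intent in one named place. A trivial stand-alone sketch of that clear-a-flag-behind-a-helper pattern; the structure and flag below are illustrative, not the netdevice API:

#include <stdio.h>

#define FLAG_XMIT_DST_RELEASE	(1u << 0)	/* illustrative bit */

struct fake_dev {
	unsigned int priv_flags;
};

/* Helper that documents intent instead of open-coding the bit clear,
 * mirroring the open-coded clear that netif_keep_dst() replaces above. */
static void keep_dst(struct fake_dev *dev)
{
	dev->priv_flags &= ~FLAG_XMIT_DST_RELEASE;
}

int main(void)
{
	struct fake_dev dev = { .priv_flags = FLAG_XMIT_DST_RELEASE };

	keep_dst(&dev);
	printf("priv_flags = %#x\n", dev.priv_flags);	/* 0 */
	return 0;
}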
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 5bbef4fdcb43..648fa1490ea7 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -262,7 +262,8 @@ static int __init ic_open_devs(void)
262 /* wait for a carrier on at least one device */ 262 /* wait for a carrier on at least one device */
263 start = jiffies; 263 start = jiffies;
264 next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); 264 next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
265 while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { 265 while (time_before(jiffies, start +
266 msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) {
266 int wait, elapsed; 267 int wait, elapsed;
267 268
268 for_each_netdev(&init_net, dev) 269 for_each_netdev(&init_net, dev)
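
The ipconfig.c hunk replaces a raw "jiffies - start < timeout" test with time_before(), which stays correct across jiffies wrap-around by comparing the signed difference. A stand-alone sketch of that trick on a 32-bit counter:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a is before b" on a free-running 32-bit counter: compare the
 * signed difference, the same trick the kernel's time_before() relies on. */
static int before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	uint32_t start = 0xfffffff0u;		/* just before wrap-around */
	uint32_t deadline = start + 0x40;	/* wraps to 0x30 */

	printf("%d\n", before(0xfffffff8u, deadline));	/* 1: still waiting */
	printf("%d\n", before(0x00000031u, deadline));	/* 0: deadline passed */
	return 0;
}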
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 62eaa005e146..37096d64730e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -224,6 +224,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
224 if (IS_ERR(skb)) 224 if (IS_ERR(skb))
225 goto out; 225 goto out;
226 226
227 skb_set_inner_ipproto(skb, IPPROTO_IPIP);
228
227 ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); 229 ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
228 return NETDEV_TX_OK; 230 return NETDEV_TX_OK;
229 231
@@ -287,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
287 dev->iflink = 0; 289 dev->iflink = 0;
288 dev->addr_len = 4; 290 dev->addr_len = 4;
289 dev->features |= NETIF_F_LLTX; 291 dev->features |= NETIF_F_LLTX;
290 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 292 netif_keep_dst(dev);
291 293
292 dev->features |= IPIP_FEATURES; 294 dev->features |= IPIP_FEATURES;
293 dev->hw_features |= IPIP_FEATURES; 295 dev->hw_features |= IPIP_FEATURES;
@@ -301,7 +303,8 @@ static int ipip_tunnel_init(struct net_device *dev)
301 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); 303 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
302 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 304 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
303 305
304 tunnel->hlen = 0; 306 tunnel->tun_hlen = 0;
307 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
305 tunnel->parms.iph.protocol = IPPROTO_IPIP; 308 tunnel->parms.iph.protocol = IPPROTO_IPIP;
306 return ip_tunnel_init(dev); 309 return ip_tunnel_init(dev);
307} 310}
@@ -340,10 +343,53 @@ static void ipip_netlink_parms(struct nlattr *data[],
340 parms->iph.frag_off = htons(IP_DF); 343 parms->iph.frag_off = htons(IP_DF);
341} 344}
342 345
346/* This function returns true when ENCAP attributes are present in the nl msg */
347static bool ipip_netlink_encap_parms(struct nlattr *data[],
348 struct ip_tunnel_encap *ipencap)
349{
350 bool ret = false;
351
352 memset(ipencap, 0, sizeof(*ipencap));
353
354 if (!data)
355 return ret;
356
357 if (data[IFLA_IPTUN_ENCAP_TYPE]) {
358 ret = true;
359 ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
360 }
361
362 if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
363 ret = true;
364 ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
365 }
366
367 if (data[IFLA_IPTUN_ENCAP_SPORT]) {
368 ret = true;
369 ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]);
370 }
371
372 if (data[IFLA_IPTUN_ENCAP_DPORT]) {
373 ret = true;
374 ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]);
375 }
376
377 return ret;
378}
379
343static int ipip_newlink(struct net *src_net, struct net_device *dev, 380static int ipip_newlink(struct net *src_net, struct net_device *dev,
344 struct nlattr *tb[], struct nlattr *data[]) 381 struct nlattr *tb[], struct nlattr *data[])
345{ 382{
346 struct ip_tunnel_parm p; 383 struct ip_tunnel_parm p;
384 struct ip_tunnel_encap ipencap;
385
386 if (ipip_netlink_encap_parms(data, &ipencap)) {
387 struct ip_tunnel *t = netdev_priv(dev);
388 int err = ip_tunnel_encap_setup(t, &ipencap);
389
390 if (err < 0)
391 return err;
392 }
347 393
348 ipip_netlink_parms(data, &p); 394 ipip_netlink_parms(data, &p);
349 return ip_tunnel_newlink(dev, tb, &p); 395 return ip_tunnel_newlink(dev, tb, &p);
@@ -353,6 +399,15 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
353 struct nlattr *data[]) 399 struct nlattr *data[])
354{ 400{
355 struct ip_tunnel_parm p; 401 struct ip_tunnel_parm p;
402 struct ip_tunnel_encap ipencap;
403
404 if (ipip_netlink_encap_parms(data, &ipencap)) {
405 struct ip_tunnel *t = netdev_priv(dev);
406 int err = ip_tunnel_encap_setup(t, &ipencap);
407
408 if (err < 0)
409 return err;
410 }
356 411
357 ipip_netlink_parms(data, &p); 412 ipip_netlink_parms(data, &p);
358 413
@@ -378,6 +433,14 @@ static size_t ipip_get_size(const struct net_device *dev)
378 nla_total_size(1) + 433 nla_total_size(1) +
379 /* IFLA_IPTUN_PMTUDISC */ 434 /* IFLA_IPTUN_PMTUDISC */
380 nla_total_size(1) + 435 nla_total_size(1) +
436 /* IFLA_IPTUN_ENCAP_TYPE */
437 nla_total_size(2) +
438 /* IFLA_IPTUN_ENCAP_FLAGS */
439 nla_total_size(2) +
440 /* IFLA_IPTUN_ENCAP_SPORT */
441 nla_total_size(2) +
442 /* IFLA_IPTUN_ENCAP_DPORT */
443 nla_total_size(2) +
381 0; 444 0;
382} 445}
383 446
@@ -394,6 +457,17 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
394 nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, 457 nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
395 !!(parm->iph.frag_off & htons(IP_DF)))) 458 !!(parm->iph.frag_off & htons(IP_DF))))
396 goto nla_put_failure; 459 goto nla_put_failure;
460
461 if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
462 tunnel->encap.type) ||
463 nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT,
464 tunnel->encap.sport) ||
465 nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT,
466 tunnel->encap.dport) ||
467 nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
468 tunnel->encap.dport))
469 goto nla_put_failure;
470
397 return 0; 471 return 0;
398 472
399nla_put_failure: 473nla_put_failure:
@@ -407,6 +481,10 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
407 [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, 481 [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
408 [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, 482 [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
409 [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, 483 [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
484 [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
485 [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
486 [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
487 [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
410}; 488};
411 489
412static struct rtnl_link_ops ipip_link_ops __read_mostly = { 490static struct rtnl_link_ops ipip_link_ops __read_mostly = {
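
ipip_get_size() above budgets four additional u16 attributes with nla_total_size(2) each, mirroring the ip_gre.c change. Assuming the usual netlink layout of a 4-byte attribute header plus the payload padded to 4-byte alignment (an assumption, not stated in the patch), each of those attributes costs 8 bytes:

#include <stdio.h>

/* Assumed netlink attribute layout: 4-byte header (nla_len, nla_type)
 * followed by the payload, padded to 4-byte alignment. */
#define NLA_HDRLEN	4
#define NLA_ALIGN(len)	(((len) + 3) & ~3)

static int nla_total_size(int payload)
{
	return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
	/* Four u16 ENCAP attributes, as budgeted in ipip_get_size() above. */
	printf("one u16 attribute  : %d bytes\n", nla_total_size(2));
	printf("four u16 attributes: %d bytes\n", 4 * nla_total_size(2));
	return 0;
}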
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 7cbcaf4f0194..4c019d5c3f57 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -61,18 +61,13 @@ config NFT_CHAIN_ROUTE_IPV4
61 fields such as the source, destination, type of service and 61 fields such as the source, destination, type of service and
62 the packet mark. 62 the packet mark.
63 63
64config NFT_CHAIN_NAT_IPV4 64config NF_REJECT_IPV4
65 depends on NF_TABLES_IPV4 65 tristate "IPv4 packet rejection"
66 depends on NF_NAT_IPV4 && NFT_NAT 66 default m if NETFILTER_ADVANCED=n
67 tristate "IPv4 nf_tables nat chain support"
68 help
69 This option enables the "nat" chain for IPv4 in nf_tables. This
70 chain type is used to perform Network Address Translation (NAT)
71 packet transformations such as the source, destination address and
72 source and destination ports.
73 67
74config NFT_REJECT_IPV4 68config NFT_REJECT_IPV4
75 depends on NF_TABLES_IPV4 69 depends on NF_TABLES_IPV4
70 select NF_REJECT_IPV4
76 default NFT_REJECT 71 default NFT_REJECT
77 tristate 72 tristate
78 73
@@ -94,6 +89,30 @@ config NF_NAT_IPV4
94 89
95if NF_NAT_IPV4 90if NF_NAT_IPV4
96 91
92config NFT_CHAIN_NAT_IPV4
93 depends on NF_TABLES_IPV4
94 tristate "IPv4 nf_tables nat chain support"
95 help
96 This option enables the "nat" chain for IPv4 in nf_tables. This
97 chain type is used to perform Network Address Translation (NAT)
98 packet transformations such as the source, destination address and
99 source and destination ports.
100
101config NF_NAT_MASQUERADE_IPV4
102 tristate "IPv4 masquerade support"
103 help
104 This is the kernel functionality to provide NAT in the masquerade
105 flavour (automatic source address selection).
106
107config NFT_MASQ_IPV4
108 tristate "IPv4 masquerading support for nf_tables"
109 depends on NF_TABLES_IPV4
110 depends on NFT_MASQ
111 select NF_NAT_MASQUERADE_IPV4
112 help
113 This is the expression that provides IPv4 masquerading support for
114 nf_tables.
115
97config NF_NAT_SNMP_BASIC 116config NF_NAT_SNMP_BASIC
98 tristate "Basic SNMP-ALG support" 117 tristate "Basic SNMP-ALG support"
99 depends on NF_CONNTRACK_SNMP 118 depends on NF_CONNTRACK_SNMP
@@ -194,6 +213,7 @@ config IP_NF_FILTER
194config IP_NF_TARGET_REJECT 213config IP_NF_TARGET_REJECT
195 tristate "REJECT target support" 214 tristate "REJECT target support"
196 depends on IP_NF_FILTER 215 depends on IP_NF_FILTER
216 select NF_REJECT_IPV4
197 default m if NETFILTER_ADVANCED=n 217 default m if NETFILTER_ADVANCED=n
198 help 218 help
199 The REJECT target allows a filtering rule to specify that an ICMP 219 The REJECT target allows a filtering rule to specify that an ICMP
@@ -234,6 +254,7 @@ if IP_NF_NAT
234 254
235config IP_NF_TARGET_MASQUERADE 255config IP_NF_TARGET_MASQUERADE
236 tristate "MASQUERADE target support" 256 tristate "MASQUERADE target support"
257 select NF_NAT_MASQUERADE_IPV4
237 default m if NETFILTER_ADVANCED=n 258 default m if NETFILTER_ADVANCED=n
238 help 259 help
239 Masquerading is a special case of NAT: all outgoing connections are 260 Masquerading is a special case of NAT: all outgoing connections are
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index edf4af32e9f2..f4cef5af0969 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -23,10 +23,14 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
23obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o 23obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
24obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o 24obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
25 25
26# reject
27obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
28
26# NAT helpers (nf_conntrack) 29# NAT helpers (nf_conntrack)
27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 30obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
28obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 31obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
29obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o 32obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
33obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
30 34
31# NAT protocols (nf_nat) 35# NAT protocols (nf_nat)
32obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o 36obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
@@ -35,6 +39,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
35obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o 39obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
36obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o 40obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
37obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o 41obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
42obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
38obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o 43obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
39 44
40# generic IP tables 45# generic IP tables
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2510c02c2d21..e90f83a3415b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -285,7 +285,7 @@ clusterip_hashfn(const struct sk_buff *skb,
285 } 285 }
286 286
287 /* node numbers are 1..n, not 0..n */ 287 /* node numbers are 1..n, not 0..n */
288 return (((u64)hashval * config->num_total_nodes) >> 32) + 1; 288 return reciprocal_scale(hashval, config->num_total_nodes) + 1;
289} 289}
290 290
291static inline int 291static inline int
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 00352ce0f0de..da7f02a0b868 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -22,6 +22,7 @@
22#include <linux/netfilter_ipv4.h> 22#include <linux/netfilter_ipv4.h>
23#include <linux/netfilter/x_tables.h> 23#include <linux/netfilter/x_tables.h>
24#include <net/netfilter/nf_nat.h> 24#include <net/netfilter/nf_nat.h>
25#include <net/netfilter/ipv4/nf_nat_masquerade.h>
25 26
26MODULE_LICENSE("GPL"); 27MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 28MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -46,103 +47,17 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
46static unsigned int 47static unsigned int
47masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) 48masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
48{ 49{
49 struct nf_conn *ct; 50 struct nf_nat_range range;
50 struct nf_conn_nat *nat;
51 enum ip_conntrack_info ctinfo;
52 struct nf_nat_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 51 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt;
55 __be32 newsrc, nh;
56
57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
58
59 ct = nf_ct_get(skb, &ctinfo);
60 nat = nfct_nat(ct);
61
62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
63 ctinfo == IP_CT_RELATED_REPLY));
64
65 /* Source address is 0.0.0.0 - locally generated packet that is
66 * probably not supposed to be masqueraded.
67 */
68 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
69 return NF_ACCEPT;
70 52
71 mr = par->targinfo; 53 mr = par->targinfo;
72 rt = skb_rtable(skb); 54 range.flags = mr->range[0].flags;
73 nh = rt_nexthop(rt, ip_hdr(skb)->daddr); 55 range.min_proto = mr->range[0].min;
74 newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); 56 range.max_proto = mr->range[0].max;
75 if (!newsrc) {
76 pr_info("%s ate my IP address\n", par->out->name);
77 return NF_DROP;
78 }
79
80 nat->masq_index = par->out->ifindex;
81
82 /* Transfer from original range. */
83 memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
84 memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
85 newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
86 newrange.min_addr.ip = newsrc;
87 newrange.max_addr.ip = newsrc;
88 newrange.min_proto = mr->range[0].min;
89 newrange.max_proto = mr->range[0].max;
90 57
91 /* Hand modified range to generic setup. */ 58 return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
92 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
93} 59}
94 60
95static int
96device_cmp(struct nf_conn *i, void *ifindex)
97{
98 const struct nf_conn_nat *nat = nfct_nat(i);
99
100 if (!nat)
101 return 0;
102 if (nf_ct_l3num(i) != NFPROTO_IPV4)
103 return 0;
104 return nat->masq_index == (int)(long)ifindex;
105}
106
107static int masq_device_event(struct notifier_block *this,
108 unsigned long event,
109 void *ptr)
110{
111 const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
112 struct net *net = dev_net(dev);
113
114 if (event == NETDEV_DOWN) {
115 /* Device was downed. Search entire table for
116 conntracks which were associated with that device,
117 and forget them. */
118 NF_CT_ASSERT(dev->ifindex != 0);
119
120 nf_ct_iterate_cleanup(net, device_cmp,
121 (void *)(long)dev->ifindex, 0, 0);
122 }
123
124 return NOTIFY_DONE;
125}
126
127static int masq_inet_event(struct notifier_block *this,
128 unsigned long event,
129 void *ptr)
130{
131 struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
132 struct netdev_notifier_info info;
133
134 netdev_notifier_info_init(&info, dev);
135 return masq_device_event(this, event, &info);
136}
137
138static struct notifier_block masq_dev_notifier = {
139 .notifier_call = masq_device_event,
140};
141
142static struct notifier_block masq_inet_notifier = {
143 .notifier_call = masq_inet_event,
144};
145
146static struct xt_target masquerade_tg_reg __read_mostly = { 61static struct xt_target masquerade_tg_reg __read_mostly = {
147 .name = "MASQUERADE", 62 .name = "MASQUERADE",
148 .family = NFPROTO_IPV4, 63 .family = NFPROTO_IPV4,
@@ -160,12 +75,8 @@ static int __init masquerade_tg_init(void)
160 75
161 ret = xt_register_target(&masquerade_tg_reg); 76 ret = xt_register_target(&masquerade_tg_reg);
162 77
163 if (ret == 0) { 78 if (ret == 0)
164 /* Register for device down reports */ 79 nf_nat_masquerade_ipv4_register_notifier();
165 register_netdevice_notifier(&masq_dev_notifier);
166 /* Register IP address change reports */
167 register_inetaddr_notifier(&masq_inet_notifier);
168 }
169 80
170 return ret; 81 return ret;
171} 82}
@@ -173,8 +84,7 @@ static int __init masquerade_tg_init(void)
173static void __exit masquerade_tg_exit(void) 84static void __exit masquerade_tg_exit(void)
174{ 85{
175 xt_unregister_target(&masquerade_tg_reg); 86 xt_unregister_target(&masquerade_tg_reg);
176 unregister_netdevice_notifier(&masq_dev_notifier); 87 nf_nat_masquerade_ipv4_unregister_notifier();
177 unregister_inetaddr_notifier(&masq_inet_notifier);
178} 88}
179 89
180module_init(masquerade_tg_init); 90module_init(masquerade_tg_init);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 5b6e0df4ccff..8f48f5517e33 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -20,7 +20,7 @@
20#include <linux/netfilter/x_tables.h> 20#include <linux/netfilter/x_tables.h>
21#include <linux/netfilter_ipv4/ip_tables.h> 21#include <linux/netfilter_ipv4/ip_tables.h>
22#include <linux/netfilter_ipv4/ipt_REJECT.h> 22#include <linux/netfilter_ipv4/ipt_REJECT.h>
23#ifdef CONFIG_BRIDGE_NETFILTER 23#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
24#include <linux/netfilter_bridge.h> 24#include <linux/netfilter_bridge.h>
25#endif 25#endif
26 26
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index f1787c04a4dd..6b67d7e9a75d 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -28,222 +28,57 @@ static const struct xt_table nf_nat_ipv4_table = {
28 .af = NFPROTO_IPV4, 28 .af = NFPROTO_IPV4,
29}; 29};
30 30
31static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 31static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
32{ 32 struct sk_buff *skb,
33 /* Force range to this IP; let proto decide mapping for 33 const struct net_device *in,
34 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 34 const struct net_device *out,
35 */ 35 struct nf_conn *ct)
36 struct nf_nat_range range;
37
38 range.flags = 0;
39 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
40 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
41 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
42 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
43
44 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
45}
46
47static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
48 const struct net_device *in,
49 const struct net_device *out,
50 struct nf_conn *ct)
51{ 36{
52 struct net *net = nf_ct_net(ct); 37 struct net *net = nf_ct_net(ct);
53 unsigned int ret;
54 38
55 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); 39 return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table);
56 if (ret == NF_ACCEPT) {
57 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
58 ret = alloc_null_binding(ct, hooknum);
59 }
60 return ret;
61} 40}
62 41
63static unsigned int 42static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
64nf_nat_ipv4_fn(const struct nf_hook_ops *ops, 43 struct sk_buff *skb,
65 struct sk_buff *skb, 44 const struct net_device *in,
66 const struct net_device *in, 45 const struct net_device *out,
67 const struct net_device *out, 46 int (*okfn)(struct sk_buff *))
68 int (*okfn)(struct sk_buff *))
69{ 47{
70 struct nf_conn *ct; 48 return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain);
71 enum ip_conntrack_info ctinfo;
72 struct nf_conn_nat *nat;
73 /* maniptype == SRC for postrouting. */
74 enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
75
76 /* We never see fragments: conntrack defrags on pre-routing
77 * and local-out, and nf_nat_out protects post-routing.
78 */
79 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
80
81 ct = nf_ct_get(skb, &ctinfo);
82 /* Can't track? It's not due to stress, or conntrack would
83 * have dropped it. Hence it's the user's responsibilty to
84 * packet filter it out, or implement conntrack/NAT for that
85 * protocol. 8) --RR
86 */
87 if (!ct)
88 return NF_ACCEPT;
89
90 /* Don't try to NAT if this packet is not conntracked */
91 if (nf_ct_is_untracked(ct))
92 return NF_ACCEPT;
93
94 nat = nf_ct_nat_ext_add(ct);
95 if (nat == NULL)
96 return NF_ACCEPT;
97
98 switch (ctinfo) {
99 case IP_CT_RELATED:
100 case IP_CT_RELATED_REPLY:
101 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
102 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
103 ops->hooknum))
104 return NF_DROP;
105 else
106 return NF_ACCEPT;
107 }
108 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
109 case IP_CT_NEW:
110 /* Seen it before? This can happen for loopback, retrans,
111 * or local packets.
112 */
113 if (!nf_nat_initialized(ct, maniptype)) {
114 unsigned int ret;
115
116 ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
117 if (ret != NF_ACCEPT)
118 return ret;
119 } else {
120 pr_debug("Already setup manip %s for ct %p\n",
121 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
122 ct);
123 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
124 goto oif_changed;
125 }
126 break;
127
128 default:
129 /* ESTABLISHED */
130 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
131 ctinfo == IP_CT_ESTABLISHED_REPLY);
132 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
133 goto oif_changed;
134 }
135
136 return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
137
138oif_changed:
139 nf_ct_kill_acct(ct, ctinfo, skb);
140 return NF_DROP;
141} 49}
142 50
143static unsigned int 51static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
144nf_nat_ipv4_in(const struct nf_hook_ops *ops, 52 struct sk_buff *skb,
145 struct sk_buff *skb, 53 const struct net_device *in,
146 const struct net_device *in, 54 const struct net_device *out,
147 const struct net_device *out, 55 int (*okfn)(struct sk_buff *))
148 int (*okfn)(struct sk_buff *))
149{ 56{
150 unsigned int ret; 57 return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain);
151 __be32 daddr = ip_hdr(skb)->daddr;
152
153 ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
154 if (ret != NF_DROP && ret != NF_STOLEN &&
155 daddr != ip_hdr(skb)->daddr)
156 skb_dst_drop(skb);
157
158 return ret;
159} 58}
160 59
161static unsigned int 60static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
162nf_nat_ipv4_out(const struct nf_hook_ops *ops, 61 struct sk_buff *skb,
163 struct sk_buff *skb, 62 const struct net_device *in,
164 const struct net_device *in, 63 const struct net_device *out,
165 const struct net_device *out, 64 int (*okfn)(struct sk_buff *))
166 int (*okfn)(struct sk_buff *))
167{ 65{
168#ifdef CONFIG_XFRM 66 return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain);
169 const struct nf_conn *ct;
170 enum ip_conntrack_info ctinfo;
171 int err;
172#endif
173 unsigned int ret;
174
175 /* root is playing with raw sockets. */
176 if (skb->len < sizeof(struct iphdr) ||
177 ip_hdrlen(skb) < sizeof(struct iphdr))
178 return NF_ACCEPT;
179
180 ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
181#ifdef CONFIG_XFRM
182 if (ret != NF_DROP && ret != NF_STOLEN &&
183 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
184 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
185 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
186
187 if ((ct->tuplehash[dir].tuple.src.u3.ip !=
188 ct->tuplehash[!dir].tuple.dst.u3.ip) ||
189 (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
190 ct->tuplehash[dir].tuple.src.u.all !=
191 ct->tuplehash[!dir].tuple.dst.u.all)) {
192 err = nf_xfrm_me_harder(skb, AF_INET);
193 if (err < 0)
194 ret = NF_DROP_ERR(err);
195 }
196 }
197#endif
198 return ret;
199} 67}
200 68
201static unsigned int 69static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
202nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, 70 struct sk_buff *skb,
203 struct sk_buff *skb, 71 const struct net_device *in,
204 const struct net_device *in, 72 const struct net_device *out,
205 const struct net_device *out, 73 int (*okfn)(struct sk_buff *))
206 int (*okfn)(struct sk_buff *))
207{ 74{
208 const struct nf_conn *ct; 75 return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain);
209 enum ip_conntrack_info ctinfo;
210 unsigned int ret;
211 int err;
212
213 /* root is playing with raw sockets. */
214 if (skb->len < sizeof(struct iphdr) ||
215 ip_hdrlen(skb) < sizeof(struct iphdr))
216 return NF_ACCEPT;
217
218 ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
219 if (ret != NF_DROP && ret != NF_STOLEN &&
220 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
221 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
222
223 if (ct->tuplehash[dir].tuple.dst.u3.ip !=
224 ct->tuplehash[!dir].tuple.src.u3.ip) {
225 err = ip_route_me_harder(skb, RTN_UNSPEC);
226 if (err < 0)
227 ret = NF_DROP_ERR(err);
228 }
229#ifdef CONFIG_XFRM
230 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
231 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
232 ct->tuplehash[dir].tuple.dst.u.all !=
233 ct->tuplehash[!dir].tuple.src.u.all) {
234 err = nf_xfrm_me_harder(skb, AF_INET);
235 if (err < 0)
236 ret = NF_DROP_ERR(err);
237 }
238#endif
239 }
240 return ret;
241} 76}
242 77
243static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { 78static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
244 /* Before packet filtering, change destination */ 79 /* Before packet filtering, change destination */
245 { 80 {
246 .hook = nf_nat_ipv4_in, 81 .hook = iptable_nat_ipv4_in,
247 .owner = THIS_MODULE, 82 .owner = THIS_MODULE,
248 .pf = NFPROTO_IPV4, 83 .pf = NFPROTO_IPV4,
249 .hooknum = NF_INET_PRE_ROUTING, 84 .hooknum = NF_INET_PRE_ROUTING,
@@ -251,7 +86,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
251 }, 86 },
252 /* After packet filtering, change source */ 87 /* After packet filtering, change source */
253 { 88 {
254 .hook = nf_nat_ipv4_out, 89 .hook = iptable_nat_ipv4_out,
255 .owner = THIS_MODULE, 90 .owner = THIS_MODULE,
256 .pf = NFPROTO_IPV4, 91 .pf = NFPROTO_IPV4,
257 .hooknum = NF_INET_POST_ROUTING, 92 .hooknum = NF_INET_POST_ROUTING,
@@ -259,7 +94,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
259 }, 94 },
260 /* Before packet filtering, change destination */ 95 /* Before packet filtering, change destination */
261 { 96 {
262 .hook = nf_nat_ipv4_local_fn, 97 .hook = iptable_nat_ipv4_local_fn,
263 .owner = THIS_MODULE, 98 .owner = THIS_MODULE,
264 .pf = NFPROTO_IPV4, 99 .pf = NFPROTO_IPV4,
265 .hooknum = NF_INET_LOCAL_OUT, 100 .hooknum = NF_INET_LOCAL_OUT,
@@ -267,7 +102,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
267 }, 102 },
268 /* After packet filtering, change source */ 103 /* After packet filtering, change source */
269 { 104 {
270 .hook = nf_nat_ipv4_fn, 105 .hook = iptable_nat_ipv4_fn,
271 .owner = THIS_MODULE, 106 .owner = THIS_MODULE,
272 .pf = NFPROTO_IPV4, 107 .pf = NFPROTO_IPV4,
273 .hooknum = NF_INET_LOCAL_IN, 108 .hooknum = NF_INET_LOCAL_IN,
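
The iptable_nat.c rewrite above, together with the nf_nat_l3proto_ipv4.c additions below, moves the hook bodies into generic helpers (nf_nat_ipv4_fn and friends) that take a do_chain callback, so the iptables and nf_tables NAT front ends can share one core. A stripped-down stand-alone sketch of that "generic core plus per-backend callback" shape; all names and verdict values are illustrative:

#include <stdio.h>

/* Illustrative verdicts standing in for NF_ACCEPT / NF_DROP. */
enum verdict { ACCEPT = 0, DROP = 1 };

struct pkt { int marked; };

/* Generic core: shared bookkeeping around a backend-specific chain walk,
 * mirroring the shape of nf_nat_ipv4_fn(..., do_chain). */
static enum verdict core_fn(struct pkt *p,
			    enum verdict (*do_chain)(struct pkt *p))
{
	enum verdict v = do_chain(p);	/* backend decides */

	if (v != ACCEPT)
		return v;
	p->marked = 1;			/* shared post-processing */
	return ACCEPT;
}

/* One backend ("iptables-style"); another front end could be plugged in
 * through the same callback. */
static enum verdict iptables_chain(struct pkt *p)
{
	(void)p;
	return ACCEPT;
}

int main(void)
{
	struct pkt p = { 0 };

	printf("verdict %d, marked %d\n", core_fn(&p, iptables_chain), p.marked);
	return 0;
}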
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 76bd1aef257f..7e5ca6f2d0cd 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -50,7 +50,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
50 zone = nf_ct_zone((struct nf_conn *)skb->nfct); 50 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
51#endif 51#endif
52 52
53#ifdef CONFIG_BRIDGE_NETFILTER 53#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
54 if (skb->nf_bridge && 54 if (skb->nf_bridge &&
55 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) 55 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
56 return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; 56 return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 14f5ccd06337..fc37711e11f3 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -254,6 +254,205 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
254} 254}
255EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); 255EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
256 256
257unsigned int
258nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
259 const struct net_device *in, const struct net_device *out,
260 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
261 struct sk_buff *skb,
262 const struct net_device *in,
263 const struct net_device *out,
264 struct nf_conn *ct))
265{
266 struct nf_conn *ct;
267 enum ip_conntrack_info ctinfo;
268 struct nf_conn_nat *nat;
269 /* maniptype == SRC for postrouting. */
270 enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
271
272 /* We never see fragments: conntrack defrags on pre-routing
273 * and local-out, and nf_nat_out protects post-routing.
274 */
275 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
276
277 ct = nf_ct_get(skb, &ctinfo);
278 /* Can't track? It's not due to stress, or conntrack would
279 * have dropped it. Hence it's the user's responsibilty to
280 * packet filter it out, or implement conntrack/NAT for that
281 * protocol. 8) --RR
282 */
283 if (!ct)
284 return NF_ACCEPT;
285
286 /* Don't try to NAT if this packet is not conntracked */
287 if (nf_ct_is_untracked(ct))
288 return NF_ACCEPT;
289
290 nat = nf_ct_nat_ext_add(ct);
291 if (nat == NULL)
292 return NF_ACCEPT;
293
294 switch (ctinfo) {
295 case IP_CT_RELATED:
296 case IP_CT_RELATED_REPLY:
297 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
298 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
299 ops->hooknum))
300 return NF_DROP;
301 else
302 return NF_ACCEPT;
303 }
304 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
305 case IP_CT_NEW:
306 /* Seen it before? This can happen for loopback, retrans,
307 * or local packets.
308 */
309 if (!nf_nat_initialized(ct, maniptype)) {
310 unsigned int ret;
311
312 ret = do_chain(ops, skb, in, out, ct);
313 if (ret != NF_ACCEPT)
314 return ret;
315
316 if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
317 break;
318
319 ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
320 if (ret != NF_ACCEPT)
321 return ret;
322 } else {
323 pr_debug("Already setup manip %s for ct %p\n",
324 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
325 ct);
326 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
327 goto oif_changed;
328 }
329 break;
330
331 default:
332 /* ESTABLISHED */
333 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
334 ctinfo == IP_CT_ESTABLISHED_REPLY);
335 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
336 goto oif_changed;
337 }
338
339 return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
340
341oif_changed:
342 nf_ct_kill_acct(ct, ctinfo, skb);
343 return NF_DROP;
344}
345EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
346
347unsigned int
348nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
349 const struct net_device *in, const struct net_device *out,
350 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
351 struct sk_buff *skb,
352 const struct net_device *in,
353 const struct net_device *out,
354 struct nf_conn *ct))
355{
356 unsigned int ret;
357 __be32 daddr = ip_hdr(skb)->daddr;
358
359 ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
360 if (ret != NF_DROP && ret != NF_STOLEN &&
361 daddr != ip_hdr(skb)->daddr)
362 skb_dst_drop(skb);
363
364 return ret;
365}
366EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
367
368unsigned int
369nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
370 const struct net_device *in, const struct net_device *out,
371 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
372 struct sk_buff *skb,
373 const struct net_device *in,
374 const struct net_device *out,
375 struct nf_conn *ct))
376{
377#ifdef CONFIG_XFRM
378 const struct nf_conn *ct;
379 enum ip_conntrack_info ctinfo;
380 int err;
381#endif
382 unsigned int ret;
383
384 /* root is playing with raw sockets. */
385 if (skb->len < sizeof(struct iphdr) ||
386 ip_hdrlen(skb) < sizeof(struct iphdr))
387 return NF_ACCEPT;
388
389 ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
390#ifdef CONFIG_XFRM
391 if (ret != NF_DROP && ret != NF_STOLEN &&
392 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
393 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
394 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
395
396 if ((ct->tuplehash[dir].tuple.src.u3.ip !=
397 ct->tuplehash[!dir].tuple.dst.u3.ip) ||
398 (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
399 ct->tuplehash[dir].tuple.src.u.all !=
400 ct->tuplehash[!dir].tuple.dst.u.all)) {
401 err = nf_xfrm_me_harder(skb, AF_INET);
402 if (err < 0)
403 ret = NF_DROP_ERR(err);
404 }
405 }
406#endif
407 return ret;
408}
409EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
410
411unsigned int
412nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
413 const struct net_device *in, const struct net_device *out,
414 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
415 struct sk_buff *skb,
416 const struct net_device *in,
417 const struct net_device *out,
418 struct nf_conn *ct))
419{
420 const struct nf_conn *ct;
421 enum ip_conntrack_info ctinfo;
422 unsigned int ret;
423 int err;
424
425 /* root is playing with raw sockets. */
426 if (skb->len < sizeof(struct iphdr) ||
427 ip_hdrlen(skb) < sizeof(struct iphdr))
428 return NF_ACCEPT;
429
430 ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
431 if (ret != NF_DROP && ret != NF_STOLEN &&
432 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
433 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
434
435 if (ct->tuplehash[dir].tuple.dst.u3.ip !=
436 ct->tuplehash[!dir].tuple.src.u3.ip) {
437 err = ip_route_me_harder(skb, RTN_UNSPEC);
438 if (err < 0)
439 ret = NF_DROP_ERR(err);
440 }
441#ifdef CONFIG_XFRM
442 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
443 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
444 ct->tuplehash[dir].tuple.dst.u.all !=
445 ct->tuplehash[!dir].tuple.src.u.all) {
446 err = nf_xfrm_me_harder(skb, AF_INET);
447 if (err < 0)
448 ret = NF_DROP_ERR(err);
449 }
450#endif
451 }
452 return ret;
453}
454EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
455
257static int __init nf_nat_l3proto_ipv4_init(void) 456static int __init nf_nat_l3proto_ipv4_init(void)
258{ 457{
259 int err; 458 int err;
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
new file mode 100644
index 000000000000..c6eb42100e9a
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -0,0 +1,153 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/module.h>
11#include <linux/atomic.h>
12#include <linux/inetdevice.h>
13#include <linux/ip.h>
14#include <linux/timer.h>
15#include <linux/netfilter.h>
16#include <net/protocol.h>
17#include <net/ip.h>
18#include <net/checksum.h>
19#include <net/route.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter/x_tables.h>
22#include <net/netfilter/nf_nat.h>
23#include <net/netfilter/ipv4/nf_nat_masquerade.h>
24
25unsigned int
26nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
27 const struct nf_nat_range *range,
28 const struct net_device *out)
29{
30 struct nf_conn *ct;
31 struct nf_conn_nat *nat;
32 enum ip_conntrack_info ctinfo;
33 struct nf_nat_range newrange;
34 const struct rtable *rt;
35 __be32 newsrc, nh;
36
37 NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
38
39 ct = nf_ct_get(skb, &ctinfo);
40 nat = nfct_nat(ct);
41
42 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
43 ctinfo == IP_CT_RELATED_REPLY));
44
45 /* Source address is 0.0.0.0 - locally generated packet that is
46 * probably not supposed to be masqueraded.
47 */
48 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
49 return NF_ACCEPT;
50
51 rt = skb_rtable(skb);
52 nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
53 newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
54 if (!newsrc) {
55 pr_info("%s ate my IP address\n", out->name);
56 return NF_DROP;
57 }
58
59 nat->masq_index = out->ifindex;
60
61 /* Transfer from original range. */
62 memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
63 memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
64 newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
65 newrange.min_addr.ip = newsrc;
66 newrange.max_addr.ip = newsrc;
67 newrange.min_proto = range->min_proto;
68 newrange.max_proto = range->max_proto;
69
70 /* Hand modified range to generic setup. */
71 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
72}
73EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
74
75static int device_cmp(struct nf_conn *i, void *ifindex)
76{
77 const struct nf_conn_nat *nat = nfct_nat(i);
78
79 if (!nat)
80 return 0;
81 if (nf_ct_l3num(i) != NFPROTO_IPV4)
82 return 0;
83 return nat->masq_index == (int)(long)ifindex;
84}
85
86static int masq_device_event(struct notifier_block *this,
87 unsigned long event,
88 void *ptr)
89{
90 const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
91 struct net *net = dev_net(dev);
92
93 if (event == NETDEV_DOWN) {
94 /* Device was downed. Search entire table for
95 * conntracks which were associated with that device,
96 * and forget them.
97 */
98 NF_CT_ASSERT(dev->ifindex != 0);
99
100 nf_ct_iterate_cleanup(net, device_cmp,
101 (void *)(long)dev->ifindex, 0, 0);
102 }
103
104 return NOTIFY_DONE;
105}
106
107static int masq_inet_event(struct notifier_block *this,
108 unsigned long event,
109 void *ptr)
110{
111 struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
112 struct netdev_notifier_info info;
113
114 netdev_notifier_info_init(&info, dev);
115 return masq_device_event(this, event, &info);
116}
117
118static struct notifier_block masq_dev_notifier = {
119 .notifier_call = masq_device_event,
120};
121
122static struct notifier_block masq_inet_notifier = {
123 .notifier_call = masq_inet_event,
124};
125
126static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
127
128void nf_nat_masquerade_ipv4_register_notifier(void)
129{
130 /* check if the notifier was already set */
131 if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
132 return;
133
134 /* Register for device down reports */
135 register_netdevice_notifier(&masq_dev_notifier);
136 /* Register IP address change reports */
137 register_inetaddr_notifier(&masq_inet_notifier);
138}
139EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
140
141void nf_nat_masquerade_ipv4_unregister_notifier(void)
142{
143 /* check if the notifier still has clients */
144 if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
145 return;
146
147 unregister_netdevice_notifier(&masq_dev_notifier);
148 unregister_inetaddr_notifier(&masq_inet_notifier);
149}
150EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
151
152MODULE_LICENSE("GPL");
153MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
new file mode 100644
index 000000000000..b023b4eb1a96
--- /dev/null
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -0,0 +1,127 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <net/ip.h>
10#include <net/tcp.h>
11#include <net/route.h>
12#include <net/dst.h>
13#include <linux/netfilter_ipv4.h>
14
15/* Send RST reply */
16void nf_send_reset(struct sk_buff *oldskb, int hook)
17{
18 struct sk_buff *nskb;
19 const struct iphdr *oiph;
20 struct iphdr *niph;
21 const struct tcphdr *oth;
22 struct tcphdr _otcph, *tcph;
23
24 /* IP header checks: fragment. */
25 if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
26 return;
27
28 oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
29 sizeof(_otcph), &_otcph);
30 if (oth == NULL)
31 return;
32
33 /* No RST for RST. */
34 if (oth->rst)
35 return;
36
37 if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
38 return;
39
40 /* Check checksum */
41 if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
42 return;
43 oiph = ip_hdr(oldskb);
44
45 nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
46 LL_MAX_HEADER, GFP_ATOMIC);
47 if (!nskb)
48 return;
49
50 skb_reserve(nskb, LL_MAX_HEADER);
51
52 skb_reset_network_header(nskb);
53 niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
54 niph->version = 4;
55 niph->ihl = sizeof(struct iphdr) / 4;
56 niph->tos = 0;
57 niph->id = 0;
58 niph->frag_off = htons(IP_DF);
59 niph->protocol = IPPROTO_TCP;
60 niph->check = 0;
61 niph->saddr = oiph->daddr;
62 niph->daddr = oiph->saddr;
63
64 skb_reset_transport_header(nskb);
65 tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
66 memset(tcph, 0, sizeof(*tcph));
67 tcph->source = oth->dest;
68 tcph->dest = oth->source;
69 tcph->doff = sizeof(struct tcphdr) / 4;
70
71 if (oth->ack)
72 tcph->seq = oth->ack_seq;
73 else {
74 tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
75 oldskb->len - ip_hdrlen(oldskb) -
76 (oth->doff << 2));
77 tcph->ack = 1;
78 }
79
80 tcph->rst = 1;
81 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
82 niph->daddr, 0);
83 nskb->ip_summed = CHECKSUM_PARTIAL;
84 nskb->csum_start = (unsigned char *)tcph - nskb->head;
85 nskb->csum_offset = offsetof(struct tcphdr, check);
86
87 /* ip_route_me_harder expects skb->dst to be set */
88 skb_dst_set_noref(nskb, skb_dst(oldskb));
89
90 nskb->protocol = htons(ETH_P_IP);
91 if (ip_route_me_harder(nskb, RTN_UNSPEC))
92 goto free_nskb;
93
94 niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
95
96 /* "Never happens" */
97 if (nskb->len > dst_mtu(skb_dst(nskb)))
98 goto free_nskb;
99
100 nf_ct_attach(nskb, oldskb);
101
102#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
103 /* If we use ip_local_out for bridged traffic, the MAC source on
104 * the RST will be ours, instead of the destination's. This confuses
105 * some routers/firewalls, and they drop the packet. So we need to
106 * build the eth header using the original destination's MAC as the
107 * source, and send the RST packet directly.
108 */
109 if (oldskb->nf_bridge) {
110 struct ethhdr *oeth = eth_hdr(oldskb);
111 nskb->dev = oldskb->nf_bridge->physindev;
112 niph->tot_len = htons(nskb->len);
113 ip_send_check(niph);
114 if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
115 oeth->h_source, oeth->h_dest, nskb->len) < 0)
116 goto free_nskb;
117 dev_queue_xmit(nskb);
118 } else
119#endif
120 ip_local_out(nskb);
121
122 return;
123
124 free_nskb:
125 kfree_skb(nskb);
126}
127EXPORT_SYMBOL_GPL(nf_send_reset);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index 3964157d826c..df547bf50078 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -26,136 +26,53 @@
26#include <net/netfilter/nf_nat_l3proto.h> 26#include <net/netfilter/nf_nat_l3proto.h>
27#include <net/ip.h> 27#include <net/ip.h>
28 28
29/* 29static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
30 * NAT chains 30 struct sk_buff *skb,
31 */ 31 const struct net_device *in,
32 32 const struct net_device *out,
33static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, 33 struct nf_conn *ct)
34 struct sk_buff *skb,
35 const struct net_device *in,
36 const struct net_device *out,
37 int (*okfn)(struct sk_buff *))
38{ 34{
39 enum ip_conntrack_info ctinfo;
40 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
41 struct nf_conn_nat *nat;
42 enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
43 struct nft_pktinfo pkt; 35 struct nft_pktinfo pkt;
44 unsigned int ret;
45
46 if (ct == NULL || nf_ct_is_untracked(ct))
47 return NF_ACCEPT;
48
49 NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
50
51 nat = nf_ct_nat_ext_add(ct);
52 if (nat == NULL)
53 return NF_ACCEPT;
54
55 switch (ctinfo) {
56 case IP_CT_RELATED:
57 case IP_CT_RELATED + IP_CT_IS_REPLY:
58 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
59 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
60 ops->hooknum))
61 return NF_DROP;
62 else
63 return NF_ACCEPT;
64 }
65 /* Fall through */
66 case IP_CT_NEW:
67 if (nf_nat_initialized(ct, maniptype))
68 break;
69 36
70 nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); 37 nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
71 38
72 ret = nft_do_chain(&pkt, ops); 39 return nft_do_chain(&pkt, ops);
73 if (ret != NF_ACCEPT)
74 return ret;
75 if (!nf_nat_initialized(ct, maniptype)) {
76 ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
77 if (ret != NF_ACCEPT)
78 return ret;
79 }
80 default:
81 break;
82 }
83
84 return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
85} 40}
86 41
87static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, 42static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
88 struct sk_buff *skb, 43 struct sk_buff *skb,
89 const struct net_device *in, 44 const struct net_device *in,
90 const struct net_device *out, 45 const struct net_device *out,
91 int (*okfn)(struct sk_buff *)) 46 int (*okfn)(struct sk_buff *))
92{ 47{
93 __be32 daddr = ip_hdr(skb)->daddr; 48 return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain);
94 unsigned int ret;
95
96 ret = nf_nat_fn(ops, skb, in, out, okfn);
97 if (ret != NF_DROP && ret != NF_STOLEN &&
98 ip_hdr(skb)->daddr != daddr) {
99 skb_dst_drop(skb);
100 }
101 return ret;
102} 49}
103 50
104static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, 51static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
105 struct sk_buff *skb, 52 struct sk_buff *skb,
106 const struct net_device *in, 53 const struct net_device *in,
107 const struct net_device *out, 54 const struct net_device *out,
108 int (*okfn)(struct sk_buff *)) 55 int (*okfn)(struct sk_buff *))
109{ 56{
110 enum ip_conntrack_info ctinfo __maybe_unused; 57 return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain);
111 const struct nf_conn *ct __maybe_unused;
112 unsigned int ret;
113
114 ret = nf_nat_fn(ops, skb, in, out, okfn);
115#ifdef CONFIG_XFRM
116 if (ret != NF_DROP && ret != NF_STOLEN &&
117 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
118 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
119
120 if (ct->tuplehash[dir].tuple.src.u3.ip !=
121 ct->tuplehash[!dir].tuple.dst.u3.ip ||
122 ct->tuplehash[dir].tuple.src.u.all !=
123 ct->tuplehash[!dir].tuple.dst.u.all)
124 return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
125 ret : NF_DROP;
126 }
127#endif
128 return ret;
129} 58}
130 59
131static unsigned int nf_nat_output(const struct nf_hook_ops *ops, 60static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
132 struct sk_buff *skb, 61 struct sk_buff *skb,
133 const struct net_device *in, 62 const struct net_device *in,
134 const struct net_device *out, 63 const struct net_device *out,
135 int (*okfn)(struct sk_buff *)) 64 int (*okfn)(struct sk_buff *))
136{ 65{
137 enum ip_conntrack_info ctinfo; 66 return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain);
138 const struct nf_conn *ct; 67}
139 unsigned int ret;
140
141 ret = nf_nat_fn(ops, skb, in, out, okfn);
142 if (ret != NF_DROP && ret != NF_STOLEN &&
143 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
144 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
145 68
146 if (ct->tuplehash[dir].tuple.dst.u3.ip != 69static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
147 ct->tuplehash[!dir].tuple.src.u3.ip) { 70 struct sk_buff *skb,
148 if (ip_route_me_harder(skb, RTN_UNSPEC)) 71 const struct net_device *in,
149 ret = NF_DROP; 72 const struct net_device *out,
150 } 73 int (*okfn)(struct sk_buff *))
151#ifdef CONFIG_XFRM 74{
152 else if (ct->tuplehash[dir].tuple.dst.u.all != 75 return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain);
153 ct->tuplehash[!dir].tuple.src.u.all)
154 if (nf_xfrm_me_harder(skb, AF_INET))
155 ret = NF_DROP;
156#endif
157 }
158 return ret;
159} 76}
160 77
161static const struct nf_chain_type nft_chain_nat_ipv4 = { 78static const struct nf_chain_type nft_chain_nat_ipv4 = {
@@ -168,10 +85,10 @@ static const struct nf_chain_type nft_chain_nat_ipv4 = {
168 (1 << NF_INET_LOCAL_OUT) | 85 (1 << NF_INET_LOCAL_OUT) |
169 (1 << NF_INET_LOCAL_IN), 86 (1 << NF_INET_LOCAL_IN),
170 .hooks = { 87 .hooks = {
171 [NF_INET_PRE_ROUTING] = nf_nat_prerouting, 88 [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in,
172 [NF_INET_POST_ROUTING] = nf_nat_postrouting, 89 [NF_INET_POST_ROUTING] = nft_nat_ipv4_out,
173 [NF_INET_LOCAL_OUT] = nf_nat_output, 90 [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn,
174 [NF_INET_LOCAL_IN] = nf_nat_fn, 91 [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn,
175 }, 92 },
176}; 93};
177 94
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
new file mode 100644
index 000000000000..1c636d6b5b50
--- /dev/null
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -0,0 +1,77 @@
1/*
2 * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/netlink.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter/nf_tables.h>
15#include <net/netfilter/nf_tables.h>
16#include <net/netfilter/nft_masq.h>
17#include <net/netfilter/ipv4/nf_nat_masquerade.h>
18
19static void nft_masq_ipv4_eval(const struct nft_expr *expr,
20 struct nft_data data[NFT_REG_MAX + 1],
21 const struct nft_pktinfo *pkt)
22{
23 struct nft_masq *priv = nft_expr_priv(expr);
24 struct nf_nat_range range;
25 unsigned int verdict;
26
27 range.flags = priv->flags;
28
29 verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
30 &range, pkt->out);
31
32 data[NFT_REG_VERDICT].verdict = verdict;
33}
34
35static struct nft_expr_type nft_masq_ipv4_type;
36static const struct nft_expr_ops nft_masq_ipv4_ops = {
37 .type = &nft_masq_ipv4_type,
38 .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
39 .eval = nft_masq_ipv4_eval,
40 .init = nft_masq_init,
41 .dump = nft_masq_dump,
42};
43
44static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
45 .family = NFPROTO_IPV4,
46 .name = "masq",
47 .ops = &nft_masq_ipv4_ops,
48 .policy = nft_masq_policy,
49 .maxattr = NFTA_MASQ_MAX,
50 .owner = THIS_MODULE,
51};
52
53static int __init nft_masq_ipv4_module_init(void)
54{
55 int ret;
56
57 ret = nft_register_expr(&nft_masq_ipv4_type);
58 if (ret < 0)
59 return ret;
60
61 nf_nat_masquerade_ipv4_register_notifier();
62
63 return ret;
64}
65
66static void __exit nft_masq_ipv4_module_exit(void)
67{
68 nft_unregister_expr(&nft_masq_ipv4_type);
69 nf_nat_masquerade_ipv4_unregister_notifier();
70}
71
72module_init(nft_masq_ipv4_module_init);
73module_exit(nft_masq_ipv4_module_exit);
74
75MODULE_LICENSE("GPL");
76MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
77MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index e79718a382f2..ed33299c56d1 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -16,7 +16,6 @@
16#include <linux/netfilter.h> 16#include <linux/netfilter.h>
17#include <linux/netfilter/nf_tables.h> 17#include <linux/netfilter/nf_tables.h>
18#include <net/netfilter/nf_tables.h> 18#include <net/netfilter/nf_tables.h>
19#include <net/icmp.h>
20#include <net/netfilter/ipv4/nf_reject.h> 19#include <net/netfilter/ipv4/nf_reject.h>
21#include <net/netfilter/nft_reject.h> 20#include <net/netfilter/nft_reject.h>
22 21
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index a3c59a077a5f..57f7c9804139 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -311,7 +311,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
311 if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) 311 if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
312 chk_addr_ret = RTN_LOCAL; 312 chk_addr_ret = RTN_LOCAL;
313 313
314 if ((sysctl_ip_nonlocal_bind == 0 && 314 if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
315 isk->freebind == 0 && isk->transparent == 0 && 315 isk->freebind == 0 && isk->transparent == 0 &&
316 chk_addr_ret != RTN_LOCAL) || 316 chk_addr_ret != RTN_LOCAL) ||
317 chk_addr_ret == RTN_MULTICAST || 317 chk_addr_ret == RTN_MULTICAST ||
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 46d6a1c923a8..4b7c0ec65251 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -30,6 +30,7 @@
30 30
31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; 31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
32const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; 32const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
33EXPORT_SYMBOL(inet_offloads);
33 34
34int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 35int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
35{ 36{
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cbadb942c332..793c0bb8c4fd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -596,12 +596,12 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
596 596
597static inline u32 fnhe_hashfun(__be32 daddr) 597static inline u32 fnhe_hashfun(__be32 daddr)
598{ 598{
599 static u32 fnhe_hashrnd __read_mostly;
599 u32 hval; 600 u32 hval;
600 601
601 hval = (__force u32) daddr; 602 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
602 hval ^= (hval >> 11) ^ (hval >> 22); 603 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
603 604 return hash_32(hval, FNHE_HASH_SHIFT);
604 return hval & (FNHE_HASH_SIZE - 1);
605} 605}
606 606
607static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) 607static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
@@ -628,12 +628,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
628 628
629 spin_lock_bh(&fnhe_lock); 629 spin_lock_bh(&fnhe_lock);
630 630
631 hash = nh->nh_exceptions; 631 hash = rcu_dereference(nh->nh_exceptions);
632 if (!hash) { 632 if (!hash) {
633 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); 633 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
634 if (!hash) 634 if (!hash)
635 goto out_unlock; 635 goto out_unlock;
636 nh->nh_exceptions = hash; 636 rcu_assign_pointer(nh->nh_exceptions, hash);
637 } 637 }
638 638
639 hash += hval; 639 hash += hval;
@@ -1242,7 +1242,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1242 1242
1243static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) 1243static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1244{ 1244{
1245 struct fnhe_hash_bucket *hash = nh->nh_exceptions; 1245 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1246 struct fib_nh_exception *fnhe; 1246 struct fib_nh_exception *fnhe;
1247 u32 hval; 1247 u32 hval;
1248 1248
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c0c75688896e..0431a8f3c8f4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -25,7 +25,7 @@
25 25
26extern int sysctl_tcp_syncookies; 26extern int sysctl_tcp_syncookies;
27 27
28static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; 28static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
29 29
30#define COOKIEBITS 24 /* Upper bits store count */ 30#define COOKIEBITS 24 /* Upper bits store count */
31#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 31#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a9fde0eef77c..b3c53c8b331e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = {
286 .extra2 = &ip_ttl_max, 286 .extra2 = &ip_ttl_max,
287 }, 287 },
288 { 288 {
289 .procname = "ip_nonlocal_bind",
290 .data = &sysctl_ip_nonlocal_bind,
291 .maxlen = sizeof(int),
292 .mode = 0644,
293 .proc_handler = proc_dointvec
294 },
295 {
296 .procname = "tcp_syn_retries", 289 .procname = "tcp_syn_retries",
297 .data = &sysctl_tcp_syn_retries, 290 .data = &sysctl_tcp_syn_retries,
298 .maxlen = sizeof(int), 291 .maxlen = sizeof(int),
@@ -450,6 +443,16 @@ static struct ctl_table ipv4_table[] = {
450 .mode = 0644, 443 .mode = 0644,
451 .proc_handler = proc_dointvec 444 .proc_handler = proc_dointvec
452 }, 445 },
446#ifdef CONFIG_IP_MULTICAST
447 {
448 .procname = "igmp_qrv",
449 .data = &sysctl_igmp_qrv,
450 .maxlen = sizeof(int),
451 .mode = 0644,
452 .proc_handler = proc_dointvec_minmax,
453 .extra1 = &one
454 },
455#endif
453 { 456 {
454 .procname = "inet_peer_threshold", 457 .procname = "inet_peer_threshold",
455 .data = &inet_peer_threshold, 458 .data = &inet_peer_threshold,
@@ -719,6 +722,22 @@ static struct ctl_table ipv4_table[] = {
719 .extra2 = &one, 722 .extra2 = &one,
720 }, 723 },
721 { 724 {
725 .procname = "icmp_msgs_per_sec",
726 .data = &sysctl_icmp_msgs_per_sec,
727 .maxlen = sizeof(int),
728 .mode = 0644,
729 .proc_handler = proc_dointvec_minmax,
730 .extra1 = &zero,
731 },
732 {
733 .procname = "icmp_msgs_burst",
734 .data = &sysctl_icmp_msgs_burst,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = proc_dointvec_minmax,
738 .extra1 = &zero,
739 },
740 {
722 .procname = "udp_mem", 741 .procname = "udp_mem",
723 .data = &sysctl_udp_mem, 742 .data = &sysctl_udp_mem,
724 .maxlen = sizeof(sysctl_udp_mem), 743 .maxlen = sizeof(sysctl_udp_mem),
@@ -830,6 +849,13 @@ static struct ctl_table ipv4_net_table[] = {
830 .proc_handler = proc_dointvec, 849 .proc_handler = proc_dointvec,
831 }, 850 },
832 { 851 {
852 .procname = "ip_nonlocal_bind",
853 .data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
854 .maxlen = sizeof(int),
855 .mode = 0644,
856 .proc_handler = proc_dointvec
857 },
858 {
833 .procname = "fwmark_reflect", 859 .procname = "fwmark_reflect",
834 .data = &init_net.ipv4.sysctl_fwmark_reflect, 860 .data = &init_net.ipv4.sysctl_fwmark_reflect,
835 .maxlen = sizeof(int), 861 .maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8ee43ae90396..461003d258ba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -404,7 +404,7 @@ void tcp_init_sock(struct sock *sk)
404 404
405 tp->reordering = sysctl_tcp_reordering; 405 tp->reordering = sysctl_tcp_reordering;
406 tcp_enable_early_retrans(tp); 406 tcp_enable_early_retrans(tp);
407 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 407 tcp_assign_congestion_control(sk);
408 408
409 tp->tsoffset = 0; 409 tp->tsoffset = 0;
410 410
@@ -608,7 +608,7 @@ static inline bool forced_push(const struct tcp_sock *tp)
608 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 608 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
609} 609}
610 610
611static inline void skb_entail(struct sock *sk, struct sk_buff *skb) 611static void skb_entail(struct sock *sk, struct sk_buff *skb)
612{ 612{
613 struct tcp_sock *tp = tcp_sk(sk); 613 struct tcp_sock *tp = tcp_sk(sk);
614 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 614 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -617,7 +617,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
617 tcb->seq = tcb->end_seq = tp->write_seq; 617 tcb->seq = tcb->end_seq = tp->write_seq;
618 tcb->tcp_flags = TCPHDR_ACK; 618 tcb->tcp_flags = TCPHDR_ACK;
619 tcb->sacked = 0; 619 tcb->sacked = 0;
620 skb_header_release(skb); 620 __skb_header_release(skb);
621 tcp_add_write_queue_tail(sk, skb); 621 tcp_add_write_queue_tail(sk, skb);
622 sk->sk_wmem_queued += skb->truesize; 622 sk->sk_wmem_queued += skb->truesize;
623 sk_mem_charge(sk, skb->truesize); 623 sk_mem_charge(sk, skb->truesize);
@@ -962,7 +962,7 @@ new_segment:
962 skb->ip_summed = CHECKSUM_PARTIAL; 962 skb->ip_summed = CHECKSUM_PARTIAL;
963 tp->write_seq += copy; 963 tp->write_seq += copy;
964 TCP_SKB_CB(skb)->end_seq += copy; 964 TCP_SKB_CB(skb)->end_seq += copy;
965 skb_shinfo(skb)->gso_segs = 0; 965 tcp_skb_pcount_set(skb, 0);
966 966
967 if (!copied) 967 if (!copied)
968 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 968 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
@@ -1260,7 +1260,7 @@ new_segment:
1260 1260
1261 tp->write_seq += copy; 1261 tp->write_seq += copy;
1262 TCP_SKB_CB(skb)->end_seq += copy; 1262 TCP_SKB_CB(skb)->end_seq += copy;
1263 skb_shinfo(skb)->gso_segs = 0; 1263 tcp_skb_pcount_set(skb, 0);
1264 1264
1265 from += copy; 1265 from += copy;
1266 copied += copy; 1266 copied += copy;
@@ -1476,9 +1476,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1476 1476
1477 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 1477 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1478 offset = seq - TCP_SKB_CB(skb)->seq; 1478 offset = seq - TCP_SKB_CB(skb)->seq;
1479 if (tcp_hdr(skb)->syn) 1479 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
1480 offset--; 1480 offset--;
1481 if (offset < skb->len || tcp_hdr(skb)->fin) { 1481 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1482 *off = offset; 1482 *off = offset;
1483 return skb; 1483 return skb;
1484 } 1484 }
@@ -1551,7 +1551,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1551 if (offset + 1 != skb->len) 1551 if (offset + 1 != skb->len)
1552 continue; 1552 continue;
1553 } 1553 }
1554 if (tcp_hdr(skb)->fin) { 1554 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1555 sk_eat_skb(sk, skb); 1555 sk_eat_skb(sk, skb);
1556 ++seq; 1556 ++seq;
1557 break; 1557 break;
@@ -1665,11 +1665,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1665 break; 1665 break;
1666 1666
1667 offset = *seq - TCP_SKB_CB(skb)->seq; 1667 offset = *seq - TCP_SKB_CB(skb)->seq;
1668 if (tcp_hdr(skb)->syn) 1668 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
1669 offset--; 1669 offset--;
1670 if (offset < skb->len) 1670 if (offset < skb->len)
1671 goto found_ok_skb; 1671 goto found_ok_skb;
1672 if (tcp_hdr(skb)->fin) 1672 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1673 goto found_fin_ok; 1673 goto found_fin_ok;
1674 WARN(!(flags & MSG_PEEK), 1674 WARN(!(flags & MSG_PEEK),
1675 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", 1675 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
@@ -1857,7 +1857,7 @@ skip_copy:
1857 if (used + offset < skb->len) 1857 if (used + offset < skb->len)
1858 continue; 1858 continue;
1859 1859
1860 if (tcp_hdr(skb)->fin) 1860 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1861 goto found_fin_ok; 1861 goto found_fin_ok;
1862 if (!(flags & MSG_PEEK)) 1862 if (!(flags & MSG_PEEK))
1863 sk_eat_skb(sk, skb); 1863 sk_eat_skb(sk, skb);
@@ -2044,8 +2044,10 @@ void tcp_close(struct sock *sk, long timeout)
2044 * reader process may not have drained the data yet! 2044 * reader process may not have drained the data yet!
2045 */ 2045 */
2046 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { 2046 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2047 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - 2047 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2048 tcp_hdr(skb)->fin; 2048
2049 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2050 len--;
2049 data_was_unread += len; 2051 data_was_unread += len;
2050 __kfree_skb(skb); 2052 __kfree_skb(skb);
2051 } 2053 }
@@ -2572,7 +2574,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2572 break; 2574 break;
2573#endif 2575#endif
2574 case TCP_USER_TIMEOUT: 2576 case TCP_USER_TIMEOUT:
2575 /* Cap the max timeout in ms TCP will retry/retrans 2577 /* Cap the max time in ms TCP will retry or probe the window
2576 * before giving up and aborting (ETIMEDOUT) a connection. 2578 * before giving up and aborting (ETIMEDOUT) a connection.
2577 */ 2579 */
2578 if (val < 0) 2580 if (val < 0)
@@ -3051,7 +3053,7 @@ static int __init set_thash_entries(char *str)
3051} 3053}
3052__setup("thash_entries=", set_thash_entries); 3054__setup("thash_entries=", set_thash_entries);
3053 3055
3054static void tcp_init_mem(void) 3056static void __init tcp_init_mem(void)
3055{ 3057{
3056 unsigned long limit = nr_free_buffer_pages() / 8; 3058 unsigned long limit = nr_free_buffer_pages() / 8;
3057 limit = max(limit, 128UL); 3059 limit = max(limit, 128UL);
@@ -3137,8 +3139,6 @@ void __init tcp_init(void)
3137 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3139 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3138 3140
3139 tcp_metrics_init(); 3141 tcp_metrics_init();
3140 3142 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3141 tcp_register_congestion_control(&tcp_reno);
3142
3143 tcp_tasklet_init(); 3143 tcp_tasklet_init();
3144} 3144}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index d5de69bc04f5..bb395d46a389 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <net/tcp.h> 18#include <net/tcp.h>
19 19
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation 20#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta 21 * max_cwnd = snd_cwnd * beta
23 */ 22 */
@@ -46,11 +45,10 @@ MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
46module_param(smooth_part, int, 0644); 45module_param(smooth_part, int, 0644);
47MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); 46MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
48 47
49
50/* BIC TCP Parameters */ 48/* BIC TCP Parameters */
51struct bictcp { 49struct bictcp {
52 u32 cnt; /* increase cwnd by 1 after ACKs */ 50 u32 cnt; /* increase cwnd by 1 after ACKs */
53 u32 last_max_cwnd; /* last maximum snd_cwnd */ 51 u32 last_max_cwnd; /* last maximum snd_cwnd */
54 u32 loss_cwnd; /* congestion window at last loss */ 52 u32 loss_cwnd; /* congestion window at last loss */
55 u32 last_cwnd; /* the last snd_cwnd */ 53 u32 last_cwnd; /* the last snd_cwnd */
56 u32 last_time; /* time when updated last_cwnd */ 54 u32 last_time; /* time when updated last_cwnd */
@@ -103,7 +101,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
103 101
104 /* binary increase */ 102 /* binary increase */
105 if (cwnd < ca->last_max_cwnd) { 103 if (cwnd < ca->last_max_cwnd) {
106 __u32 dist = (ca->last_max_cwnd - cwnd) 104 __u32 dist = (ca->last_max_cwnd - cwnd)
107 / BICTCP_B; 105 / BICTCP_B;
108 106
109 if (dist > max_increment) 107 if (dist > max_increment)
@@ -154,7 +152,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
154 bictcp_update(ca, tp->snd_cwnd); 152 bictcp_update(ca, tp->snd_cwnd);
155 tcp_cong_avoid_ai(tp, ca->cnt); 153 tcp_cong_avoid_ai(tp, ca->cnt);
156 } 154 }
157
158} 155}
159 156
160/* 157/*
@@ -177,7 +174,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
177 174
178 ca->loss_cwnd = tp->snd_cwnd; 175 ca->loss_cwnd = tp->snd_cwnd;
179 176
180
181 if (tp->snd_cwnd <= low_window) 177 if (tp->snd_cwnd <= low_window)
182 return max(tp->snd_cwnd >> 1U, 2U); 178 return max(tp->snd_cwnd >> 1U, 2U);
183 else 179 else
@@ -188,6 +184,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
188{ 184{
189 const struct tcp_sock *tp = tcp_sk(sk); 185 const struct tcp_sock *tp = tcp_sk(sk);
190 const struct bictcp *ca = inet_csk_ca(sk); 186 const struct bictcp *ca = inet_csk_ca(sk);
187
191 return max(tp->snd_cwnd, ca->loss_cwnd); 188 return max(tp->snd_cwnd, ca->loss_cwnd);
192} 189}
193 190
@@ -206,12 +203,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
206 203
207 if (icsk->icsk_ca_state == TCP_CA_Open) { 204 if (icsk->icsk_ca_state == TCP_CA_Open) {
208 struct bictcp *ca = inet_csk_ca(sk); 205 struct bictcp *ca = inet_csk_ca(sk);
206
209 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 207 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
210 ca->delayed_ack += cnt; 208 ca->delayed_ack += cnt;
211 } 209 }
212} 210}
213 211
214
215static struct tcp_congestion_ops bictcp __read_mostly = { 212static struct tcp_congestion_ops bictcp __read_mostly = {
216 .init = bictcp_init, 213 .init = bictcp_init,
217 .ssthresh = bictcp_recalc_ssthresh, 214 .ssthresh = bictcp_recalc_ssthresh,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 7b09d8b49fa5..b1c5970d47a1 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
74EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 74EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
75 75
76/* Assign choice of congestion control. */ 76/* Assign choice of congestion control. */
77void tcp_init_congestion_control(struct sock *sk) 77void tcp_assign_congestion_control(struct sock *sk)
78{ 78{
79 struct inet_connection_sock *icsk = inet_csk(sk); 79 struct inet_connection_sock *icsk = inet_csk(sk);
80 struct tcp_congestion_ops *ca; 80 struct tcp_congestion_ops *ca;
81 81
82 /* if no choice made yet assign the current value set as default */ 82 rcu_read_lock();
83 if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { 83 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
84 rcu_read_lock(); 84 if (likely(try_module_get(ca->owner))) {
85 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 85 icsk->icsk_ca_ops = ca;
86 if (try_module_get(ca->owner)) { 86 goto out;
87 icsk->icsk_ca_ops = ca;
88 break;
89 }
90
91 /* fallback to next available */
92 } 87 }
93 rcu_read_unlock(); 88 /* Fallback to next available. The last really
89 * guaranteed fallback is Reno from this list.
90 */
94 } 91 }
92out:
93 rcu_read_unlock();
94
95 /* Clear out private data before diag gets it; the
96 * ca has not been initialized yet.
97 */
98 if (ca->get_info)
99 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
100}
101
102void tcp_init_congestion_control(struct sock *sk)
103{
104 const struct inet_connection_sock *icsk = inet_csk(sk);
95 105
96 if (icsk->icsk_ca_ops->init) 106 if (icsk->icsk_ca_ops->init)
97 icsk->icsk_ca_ops->init(sk); 107 icsk->icsk_ca_ops->init(sk);
@@ -142,7 +152,6 @@ static int __init tcp_congestion_default(void)
142} 152}
143late_initcall(tcp_congestion_default); 153late_initcall(tcp_congestion_default);
144 154
145
146/* Build string with list of available congestion control values */ 155/* Build string with list of available congestion control values */
147void tcp_get_available_congestion_control(char *buf, size_t maxlen) 156void tcp_get_available_congestion_control(char *buf, size_t maxlen)
148{ 157{
@@ -154,7 +163,6 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
154 offs += snprintf(buf + offs, maxlen - offs, 163 offs += snprintf(buf + offs, maxlen - offs,
155 "%s%s", 164 "%s%s",
156 offs == 0 ? "" : " ", ca->name); 165 offs == 0 ? "" : " ", ca->name);
157
158 } 166 }
159 rcu_read_unlock(); 167 rcu_read_unlock();
160} 168}
@@ -186,7 +194,6 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
186 offs += snprintf(buf + offs, maxlen - offs, 194 offs += snprintf(buf + offs, maxlen - offs,
187 "%s%s", 195 "%s%s",
188 offs == 0 ? "" : " ", ca->name); 196 offs == 0 ? "" : " ", ca->name);
189
190 } 197 }
191 rcu_read_unlock(); 198 rcu_read_unlock();
192} 199}
@@ -230,7 +237,6 @@ out:
230 return ret; 237 return ret;
231} 238}
232 239
233
234/* Change congestion control for socket */ 240/* Change congestion control for socket */
235int tcp_set_congestion_control(struct sock *sk, const char *name) 241int tcp_set_congestion_control(struct sock *sk, const char *name)
236{ 242{
@@ -285,15 +291,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
285 * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and 291 * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
286 * returns the leftover acks to adjust cwnd in congestion avoidance mode. 292 * returns the leftover acks to adjust cwnd in congestion avoidance mode.
287 */ 293 */
288int tcp_slow_start(struct tcp_sock *tp, u32 acked) 294void tcp_slow_start(struct tcp_sock *tp, u32 acked)
289{ 295{
290 u32 cwnd = tp->snd_cwnd + acked; 296 u32 cwnd = tp->snd_cwnd + acked;
291 297
292 if (cwnd > tp->snd_ssthresh) 298 if (cwnd > tp->snd_ssthresh)
293 cwnd = tp->snd_ssthresh + 1; 299 cwnd = tp->snd_ssthresh + 1;
294 acked -= cwnd - tp->snd_cwnd;
295 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); 300 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
296 return acked;
297} 301}
298EXPORT_SYMBOL_GPL(tcp_slow_start); 302EXPORT_SYMBOL_GPL(tcp_slow_start);
299 303
@@ -337,6 +341,7 @@ EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
337u32 tcp_reno_ssthresh(struct sock *sk) 341u32 tcp_reno_ssthresh(struct sock *sk)
338{ 342{
339 const struct tcp_sock *tp = tcp_sk(sk); 343 const struct tcp_sock *tp = tcp_sk(sk);
344
340 return max(tp->snd_cwnd >> 1U, 2U); 345 return max(tp->snd_cwnd >> 1U, 2U);
341} 346}
342EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 347EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
@@ -348,15 +353,3 @@ struct tcp_congestion_ops tcp_reno = {
348 .ssthresh = tcp_reno_ssthresh, 353 .ssthresh = tcp_reno_ssthresh,
349 .cong_avoid = tcp_reno_cong_avoid, 354 .cong_avoid = tcp_reno_cong_avoid,
350}; 355};
351
352/* Initial congestion control used (until SYN)
353 * really reno under another name so we can tell difference
354 * during tcp_set_default_congestion_control
355 */
356struct tcp_congestion_ops tcp_init_congestion_ops = {
357 .name = "",
358 .owner = THIS_MODULE,
359 .ssthresh = tcp_reno_ssthresh,
360 .cong_avoid = tcp_reno_cong_avoid,
361};
362EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index a9bd8a4828a9..20de0118c98e 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -82,12 +82,13 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse
82/* BIC TCP Parameters */ 82/* BIC TCP Parameters */
83struct bictcp { 83struct bictcp {
84 u32 cnt; /* increase cwnd by 1 after ACKs */ 84 u32 cnt; /* increase cwnd by 1 after ACKs */
85 u32 last_max_cwnd; /* last maximum snd_cwnd */ 85 u32 last_max_cwnd; /* last maximum snd_cwnd */
86 u32 loss_cwnd; /* congestion window at last loss */ 86 u32 loss_cwnd; /* congestion window at last loss */
87 u32 last_cwnd; /* the last snd_cwnd */ 87 u32 last_cwnd; /* the last snd_cwnd */
88 u32 last_time; /* time when updated last_cwnd */ 88 u32 last_time; /* time when updated last_cwnd */
89 u32 bic_origin_point;/* origin point of bic function */ 89 u32 bic_origin_point;/* origin point of bic function */
90 u32 bic_K; /* time to origin point from the beginning of the current epoch */ 90 u32 bic_K; /* time to origin point
91 from the beginning of the current epoch */
91 u32 delay_min; /* min delay (msec << 3) */ 92 u32 delay_min; /* min delay (msec << 3) */
92 u32 epoch_start; /* beginning of an epoch */ 93 u32 epoch_start; /* beginning of an epoch */
93 u32 ack_cnt; /* number of acks */ 94 u32 ack_cnt; /* number of acks */
@@ -219,7 +220,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
219 ca->last_time = tcp_time_stamp; 220 ca->last_time = tcp_time_stamp;
220 221
221 if (ca->epoch_start == 0) { 222 if (ca->epoch_start == 0) {
222 ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ 223 ca->epoch_start = tcp_time_stamp; /* record beginning */
223 ca->ack_cnt = 1; /* start counting */ 224 ca->ack_cnt = 1; /* start counting */
224 ca->tcp_cwnd = cwnd; /* syn with cubic */ 225 ca->tcp_cwnd = cwnd; /* syn with cubic */
225 226
@@ -263,9 +264,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
263 264
264 /* c/rtt * (t-K)^3 */ 265 /* c/rtt * (t-K)^3 */
265 delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); 266 delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
266 if (t < ca->bic_K) /* below origin*/ 267 if (t < ca->bic_K) /* below origin*/
267 bic_target = ca->bic_origin_point - delta; 268 bic_target = ca->bic_origin_point - delta;
268 else /* above origin*/ 269 else /* above origin*/
269 bic_target = ca->bic_origin_point + delta; 270 bic_target = ca->bic_origin_point + delta;
270 271
271 /* cubic function - calc bictcp_cnt*/ 272 /* cubic function - calc bictcp_cnt*/
@@ -285,13 +286,14 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
285 /* TCP Friendly */ 286 /* TCP Friendly */
286 if (tcp_friendliness) { 287 if (tcp_friendliness) {
287 u32 scale = beta_scale; 288 u32 scale = beta_scale;
289
288 delta = (cwnd * scale) >> 3; 290 delta = (cwnd * scale) >> 3;
289 while (ca->ack_cnt > delta) { /* update tcp cwnd */ 291 while (ca->ack_cnt > delta) { /* update tcp cwnd */
290 ca->ack_cnt -= delta; 292 ca->ack_cnt -= delta;
291 ca->tcp_cwnd++; 293 ca->tcp_cwnd++;
292 } 294 }
293 295
294 if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ 296 if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
295 delta = ca->tcp_cwnd - cwnd; 297 delta = ca->tcp_cwnd - cwnd;
296 max_cnt = cwnd / delta; 298 max_cnt = cwnd / delta;
297 if (ca->cnt > max_cnt) 299 if (ca->cnt > max_cnt)
@@ -320,7 +322,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
320 bictcp_update(ca, tp->snd_cwnd); 322 bictcp_update(ca, tp->snd_cwnd);
321 tcp_cong_avoid_ai(tp, ca->cnt); 323 tcp_cong_avoid_ai(tp, ca->cnt);
322 } 324 }
323
324} 325}
325 326
326static u32 bictcp_recalc_ssthresh(struct sock *sk) 327static u32 bictcp_recalc_ssthresh(struct sock *sk)
@@ -452,7 +453,8 @@ static int __init cubictcp_register(void)
452 * based on SRTT of 100ms 453 * based on SRTT of 100ms
453 */ 454 */
454 455
455 beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); 456 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
457 / (BICTCP_BETA_SCALE - beta);
456 458
457 cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ 459 cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
458 460
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..b504371af742
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,344 @@
1/* DataCenter TCP (DCTCP) congestion control.
2 *
3 * http://simula.stanford.edu/~alizade/Site/DCTCP.html
4 *
5 * This is an implementation of DCTCP over Reno, an enhancement to the
6 * TCP congestion control algorithm designed for data centers. DCTCP
7 * leverages Explicit Congestion Notification (ECN) in the network to
8 * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
9 * the following three data center transport requirements:
10 *
11 * - High burst tolerance (incast due to partition/aggregate)
12 * - Low latency (short flows, queries)
13 * - High throughput (continuous data updates, large file transfers)
14 * with commodity shallow buffered switches
15 *
16 * The algorithm is described in detail in the following two papers:
17 *
18 * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
19 * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
20 * "Data Center TCP (DCTCP)", Data Center Networks session
21 * Proc. ACM SIGCOMM, New Delhi, 2010.
22 * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
23 *
24 * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
25 * "Analysis of DCTCP: Stability, Convergence, and Fairness"
26 * Proc. ACM SIGMETRICS, San Jose, 2011.
27 * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
28 *
29 * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
30 *
31 * Authors:
32 *
33 * Daniel Borkmann <dborkman@redhat.com>
34 * Florian Westphal <fw@strlen.de>
35 * Glenn Judd <glenn.judd@morganstanley.com>
36 *
37 * This program is free software; you can redistribute it and/or modify
38 * it under the terms of the GNU General Public License as published by
39 * the Free Software Foundation; either version 2 of the License, or (at
40 * your option) any later version.
41 */
42
43#include <linux/module.h>
44#include <linux/mm.h>
45#include <net/tcp.h>
46#include <linux/inet_diag.h>
47
48#define DCTCP_MAX_ALPHA 1024U
49
50struct dctcp {
51 u32 acked_bytes_ecn;
52 u32 acked_bytes_total;
53 u32 prior_snd_una;
54 u32 prior_rcv_nxt;
55 u32 dctcp_alpha;
56 u32 next_seq;
57 u32 ce_state;
58 u32 delayed_ack_reserved;
59};
60
61static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
62module_param(dctcp_shift_g, uint, 0644);
63MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
64
65static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
66module_param(dctcp_alpha_on_init, uint, 0644);
67MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
68
69static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
70module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
71MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
72 "parameter for clamping alpha on loss");
73
74static struct tcp_congestion_ops dctcp_reno;
75
76static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
77{
78 ca->next_seq = tp->snd_nxt;
79
80 ca->acked_bytes_ecn = 0;
81 ca->acked_bytes_total = 0;
82}
83
84static void dctcp_init(struct sock *sk)
85{
86 const struct tcp_sock *tp = tcp_sk(sk);
87
88 if ((tp->ecn_flags & TCP_ECN_OK) ||
89 (sk->sk_state == TCP_LISTEN ||
90 sk->sk_state == TCP_CLOSE)) {
91 struct dctcp *ca = inet_csk_ca(sk);
92
93 ca->prior_snd_una = tp->snd_una;
94 ca->prior_rcv_nxt = tp->rcv_nxt;
95
96 ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
97
98 ca->delayed_ack_reserved = 0;
99 ca->ce_state = 0;
100
101 dctcp_reset(tp, ca);
102 return;
103 }
104
105 /* No ECN support? Fall back to Reno. Also need to clear
106 * ECT from sk since it is set during 3WHS for DCTCP.
107 */
108 inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
109 INET_ECN_dontxmit(sk);
110}
111
112static u32 dctcp_ssthresh(struct sock *sk)
113{
114 const struct dctcp *ca = inet_csk_ca(sk);
115 struct tcp_sock *tp = tcp_sk(sk);
116
117 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
118}
119
120/* Minimal DCTCP CE state machine:
121 *
122 * S: 0 <- last pkt was non-CE
123 * 1 <- last pkt was CE
124 */
125
126static void dctcp_ce_state_0_to_1(struct sock *sk)
127{
128 struct dctcp *ca = inet_csk_ca(sk);
129 struct tcp_sock *tp = tcp_sk(sk);
130
131 /* State has changed from CE=0 to CE=1 and delayed
132 * ACK has not been sent yet.
133 */
134 if (!ca->ce_state && ca->delayed_ack_reserved) {
135 u32 tmp_rcv_nxt;
136
137 /* Save current rcv_nxt. */
138 tmp_rcv_nxt = tp->rcv_nxt;
139
140 /* Generate previous ack with CE=0. */
141 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
142 tp->rcv_nxt = ca->prior_rcv_nxt;
143
144 tcp_send_ack(sk);
145
146 /* Recover current rcv_nxt. */
147 tp->rcv_nxt = tmp_rcv_nxt;
148 }
149
150 ca->prior_rcv_nxt = tp->rcv_nxt;
151 ca->ce_state = 1;
152
153 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
154}
155
156static void dctcp_ce_state_1_to_0(struct sock *sk)
157{
158 struct dctcp *ca = inet_csk_ca(sk);
159 struct tcp_sock *tp = tcp_sk(sk);
160
161 /* State has changed from CE=1 to CE=0 and delayed
162 * ACK has not been sent yet.
163 */
164 if (ca->ce_state && ca->delayed_ack_reserved) {
165 u32 tmp_rcv_nxt;
166
167 /* Save current rcv_nxt. */
168 tmp_rcv_nxt = tp->rcv_nxt;
169
170 /* Generate previous ack with CE=1. */
171 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
172 tp->rcv_nxt = ca->prior_rcv_nxt;
173
174 tcp_send_ack(sk);
175
176 /* Recover current rcv_nxt. */
177 tp->rcv_nxt = tmp_rcv_nxt;
178 }
179
180 ca->prior_rcv_nxt = tp->rcv_nxt;
181 ca->ce_state = 0;
182
183 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
184}
185
186static void dctcp_update_alpha(struct sock *sk, u32 flags)
187{
188 const struct tcp_sock *tp = tcp_sk(sk);
189 struct dctcp *ca = inet_csk_ca(sk);
190 u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
191
192 /* If ack did not advance snd_una, count dupack as MSS size.
193 * If ack did update window, do not count it at all.
194 */
195 if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
196 acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
197 if (acked_bytes) {
198 ca->acked_bytes_total += acked_bytes;
199 ca->prior_snd_una = tp->snd_una;
200
201 if (flags & CA_ACK_ECE)
202 ca->acked_bytes_ecn += acked_bytes;
203 }
204
205 /* Expired RTT */
206 if (!before(tp->snd_una, ca->next_seq)) {
207 /* Avoid a zero denominator. */
208 if (ca->acked_bytes_total == 0)
209 ca->acked_bytes_total = 1;
210
211 /* alpha = (1 - g) * alpha + g * F */
212 ca->dctcp_alpha = ca->dctcp_alpha -
213 (ca->dctcp_alpha >> dctcp_shift_g) +
214 (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
215 ca->acked_bytes_total;
216
217 if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
218 /* Clamp dctcp_alpha to max. */
219 ca->dctcp_alpha = DCTCP_MAX_ALPHA;
220
221 dctcp_reset(tp, ca);
222 }
223}
224
225static void dctcp_state(struct sock *sk, u8 new_state)
226{
227 if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
228 struct dctcp *ca = inet_csk_ca(sk);
229
230 /* If this extension is enabled, we clamp dctcp_alpha to
231 * max on packet loss; the motivation is that dctcp_alpha
232 * is an indicator to the extend of congestion and packet
233 * loss is an indicator of extreme congestion; setting
234 * this in practice turned out to be beneficial, and
235 * effectively assumes total congestion which reduces the
236 * window by half.
237 */
238 ca->dctcp_alpha = DCTCP_MAX_ALPHA;
239 }
240}
241
242static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
243{
244 struct dctcp *ca = inet_csk_ca(sk);
245
246 switch (ev) {
247 case CA_EVENT_DELAYED_ACK:
248 if (!ca->delayed_ack_reserved)
249 ca->delayed_ack_reserved = 1;
250 break;
251 case CA_EVENT_NON_DELAYED_ACK:
252 if (ca->delayed_ack_reserved)
253 ca->delayed_ack_reserved = 0;
254 break;
255 default:
256 /* Don't care for the rest. */
257 break;
258 }
259}
260
261static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
262{
263 switch (ev) {
264 case CA_EVENT_ECN_IS_CE:
265 dctcp_ce_state_0_to_1(sk);
266 break;
267 case CA_EVENT_ECN_NO_CE:
268 dctcp_ce_state_1_to_0(sk);
269 break;
270 case CA_EVENT_DELAYED_ACK:
271 case CA_EVENT_NON_DELAYED_ACK:
272 dctcp_update_ack_reserved(sk, ev);
273 break;
274 default:
275 /* Don't care for the rest. */
276 break;
277 }
278}
279
280static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
281{
282 const struct dctcp *ca = inet_csk_ca(sk);
283
284 /* Fill it also in case of VEGASINFO due to req struct limits.
285 * We can still correctly retrieve it later.
286 */
287 if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
288 ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
289 struct tcp_dctcp_info info;
290
291 memset(&info, 0, sizeof(info));
292 if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
293 info.dctcp_enabled = 1;
294 info.dctcp_ce_state = (u16) ca->ce_state;
295 info.dctcp_alpha = ca->dctcp_alpha;
296 info.dctcp_ab_ecn = ca->acked_bytes_ecn;
297 info.dctcp_ab_tot = ca->acked_bytes_total;
298 }
299
300 nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
301 }
302}
303
304static struct tcp_congestion_ops dctcp __read_mostly = {
305 .init = dctcp_init,
306 .in_ack_event = dctcp_update_alpha,
307 .cwnd_event = dctcp_cwnd_event,
308 .ssthresh = dctcp_ssthresh,
309 .cong_avoid = tcp_reno_cong_avoid,
310 .set_state = dctcp_state,
311 .get_info = dctcp_get_info,
312 .flags = TCP_CONG_NEEDS_ECN,
313 .owner = THIS_MODULE,
314 .name = "dctcp",
315};
316
317static struct tcp_congestion_ops dctcp_reno __read_mostly = {
318 .ssthresh = tcp_reno_ssthresh,
319 .cong_avoid = tcp_reno_cong_avoid,
320 .get_info = dctcp_get_info,
321 .owner = THIS_MODULE,
322 .name = "dctcp-reno",
323};
324
325static int __init dctcp_register(void)
326{
327 BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
328 return tcp_register_congestion_control(&dctcp);
329}
330
331static void __exit dctcp_unregister(void)
332{
333 tcp_unregister_congestion_control(&dctcp);
334}
335
336module_init(dctcp_register);
337module_exit(dctcp_unregister);
338
339MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
340MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
341MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
342
343MODULE_LICENSE("GPL v2");
344MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
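
The per-RTT alpha update that the new tcp_dctcp.c performs above, alpha = (1 - g) * alpha + g * F with F the fraction of acked bytes that carried ECN marks, kept in 10-bit fixed point and g = 1/2^dctcp_shift_g, can be exercised in isolation with a short user-space sketch. The function name and the sample byte counts below are illustrative only and are not part of the patch:

#include <stdio.h>

#define DCTCP_MAX_ALPHA	1024U	/* 1.0 in 10-bit fixed point */

/* One per-RTT step, mirroring the kernel arithmetic:
 * alpha -= alpha >> g; alpha += (ecn_bytes << (10 - g)) / total_bytes.
 */
static unsigned int dctcp_alpha_update(unsigned int alpha, unsigned int shift_g,
				       unsigned int ecn_bytes,
				       unsigned int total_bytes)
{
	if (total_bytes == 0)
		total_bytes = 1;	/* avoid a zero denominator */

	alpha = alpha - (alpha >> shift_g) +
		(ecn_bytes << (10U - shift_g)) / total_bytes;

	return alpha > DCTCP_MAX_ALPHA ? DCTCP_MAX_ALPHA : alpha;
}

int main(void)
{
	unsigned int alpha = DCTCP_MAX_ALPHA;	/* dctcp_alpha_on_init default */
	int i;

	/* Three RTTs in which 25% of the acked bytes were ECN-marked. */
	for (i = 0; i < 3; i++) {
		alpha = dctcp_alpha_update(alpha, 4 /* g = 1/16 */, 2500, 10000);
		printf("alpha after RTT %d: %u/1024\n", i + 1, alpha);
	}
	return 0;
}
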
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index ed3f2ad42e0f..0d73f9ddb55b 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/inet_diag.h> 13#include <linux/inet_diag.h>
15 14
@@ -35,13 +34,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
35} 34}
36 35
37static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, 36static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
38 struct inet_diag_req_v2 *r, struct nlattr *bc) 37 struct inet_diag_req_v2 *r, struct nlattr *bc)
39{ 38{
40 inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); 39 inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
41} 40}
42 41
43static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, 42static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
44 struct inet_diag_req_v2 *req) 43 struct inet_diag_req_v2 *req)
45{ 44{
46 return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); 45 return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
47} 46}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 9771563ab564..815c85e3b1e0 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -115,7 +115,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
115 115
116 if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { 116 if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
117 struct in6_addr *buf = (struct in6_addr *) tmp.val; 117 struct in6_addr *buf = (struct in6_addr *) tmp.val;
118 int i = 4; 118 int i;
119 119
120 for (i = 0; i < 4; i++) 120 for (i = 0; i < 4; i++)
121 buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; 121 buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 1c4908280d92..882c08aae2f5 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -9,7 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <net/tcp.h> 10#include <net/tcp.h>
11 11
12
13/* From AIMD tables from RFC 3649 appendix B, 12/* From AIMD tables from RFC 3649 appendix B,
14 * with fixed-point MD scaled <<8. 13 * with fixed-point MD scaled <<8.
15 */ 14 */
@@ -17,78 +16,78 @@ static const struct hstcp_aimd_val {
17 unsigned int cwnd; 16 unsigned int cwnd;
18 unsigned int md; 17 unsigned int md;
19} hstcp_aimd_vals[] = { 18} hstcp_aimd_vals[] = {
20 { 38, 128, /* 0.50 */ }, 19 { 38, 128, /* 0.50 */ },
21 { 118, 112, /* 0.44 */ }, 20 { 118, 112, /* 0.44 */ },
22 { 221, 104, /* 0.41 */ }, 21 { 221, 104, /* 0.41 */ },
23 { 347, 98, /* 0.38 */ }, 22 { 347, 98, /* 0.38 */ },
24 { 495, 93, /* 0.37 */ }, 23 { 495, 93, /* 0.37 */ },
25 { 663, 89, /* 0.35 */ }, 24 { 663, 89, /* 0.35 */ },
26 { 851, 86, /* 0.34 */ }, 25 { 851, 86, /* 0.34 */ },
27 { 1058, 83, /* 0.33 */ }, 26 { 1058, 83, /* 0.33 */ },
28 { 1284, 81, /* 0.32 */ }, 27 { 1284, 81, /* 0.32 */ },
29 { 1529, 78, /* 0.31 */ }, 28 { 1529, 78, /* 0.31 */ },
30 { 1793, 76, /* 0.30 */ }, 29 { 1793, 76, /* 0.30 */ },
31 { 2076, 74, /* 0.29 */ }, 30 { 2076, 74, /* 0.29 */ },
32 { 2378, 72, /* 0.28 */ }, 31 { 2378, 72, /* 0.28 */ },
33 { 2699, 71, /* 0.28 */ }, 32 { 2699, 71, /* 0.28 */ },
34 { 3039, 69, /* 0.27 */ }, 33 { 3039, 69, /* 0.27 */ },
35 { 3399, 68, /* 0.27 */ }, 34 { 3399, 68, /* 0.27 */ },
36 { 3778, 66, /* 0.26 */ }, 35 { 3778, 66, /* 0.26 */ },
37 { 4177, 65, /* 0.26 */ }, 36 { 4177, 65, /* 0.26 */ },
38 { 4596, 64, /* 0.25 */ }, 37 { 4596, 64, /* 0.25 */ },
39 { 5036, 62, /* 0.25 */ }, 38 { 5036, 62, /* 0.25 */ },
40 { 5497, 61, /* 0.24 */ }, 39 { 5497, 61, /* 0.24 */ },
41 { 5979, 60, /* 0.24 */ }, 40 { 5979, 60, /* 0.24 */ },
42 { 6483, 59, /* 0.23 */ }, 41 { 6483, 59, /* 0.23 */ },
43 { 7009, 58, /* 0.23 */ }, 42 { 7009, 58, /* 0.23 */ },
44 { 7558, 57, /* 0.22 */ }, 43 { 7558, 57, /* 0.22 */ },
45 { 8130, 56, /* 0.22 */ }, 44 { 8130, 56, /* 0.22 */ },
46 { 8726, 55, /* 0.22 */ }, 45 { 8726, 55, /* 0.22 */ },
47 { 9346, 54, /* 0.21 */ }, 46 { 9346, 54, /* 0.21 */ },
48 { 9991, 53, /* 0.21 */ }, 47 { 9991, 53, /* 0.21 */ },
49 { 10661, 52, /* 0.21 */ }, 48 { 10661, 52, /* 0.21 */ },
50 { 11358, 52, /* 0.20 */ }, 49 { 11358, 52, /* 0.20 */ },
51 { 12082, 51, /* 0.20 */ }, 50 { 12082, 51, /* 0.20 */ },
52 { 12834, 50, /* 0.20 */ }, 51 { 12834, 50, /* 0.20 */ },
53 { 13614, 49, /* 0.19 */ }, 52 { 13614, 49, /* 0.19 */ },
54 { 14424, 48, /* 0.19 */ }, 53 { 14424, 48, /* 0.19 */ },
55 { 15265, 48, /* 0.19 */ }, 54 { 15265, 48, /* 0.19 */ },
56 { 16137, 47, /* 0.19 */ }, 55 { 16137, 47, /* 0.19 */ },
57 { 17042, 46, /* 0.18 */ }, 56 { 17042, 46, /* 0.18 */ },
58 { 17981, 45, /* 0.18 */ }, 57 { 17981, 45, /* 0.18 */ },
59 { 18955, 45, /* 0.18 */ }, 58 { 18955, 45, /* 0.18 */ },
60 { 19965, 44, /* 0.17 */ }, 59 { 19965, 44, /* 0.17 */ },
61 { 21013, 43, /* 0.17 */ }, 60 { 21013, 43, /* 0.17 */ },
62 { 22101, 43, /* 0.17 */ }, 61 { 22101, 43, /* 0.17 */ },
63 { 23230, 42, /* 0.17 */ }, 62 { 23230, 42, /* 0.17 */ },
64 { 24402, 41, /* 0.16 */ }, 63 { 24402, 41, /* 0.16 */ },
65 { 25618, 41, /* 0.16 */ }, 64 { 25618, 41, /* 0.16 */ },
66 { 26881, 40, /* 0.16 */ }, 65 { 26881, 40, /* 0.16 */ },
67 { 28193, 39, /* 0.16 */ }, 66 { 28193, 39, /* 0.16 */ },
68 { 29557, 39, /* 0.15 */ }, 67 { 29557, 39, /* 0.15 */ },
69 { 30975, 38, /* 0.15 */ }, 68 { 30975, 38, /* 0.15 */ },
70 { 32450, 38, /* 0.15 */ }, 69 { 32450, 38, /* 0.15 */ },
71 { 33986, 37, /* 0.15 */ }, 70 { 33986, 37, /* 0.15 */ },
72 { 35586, 36, /* 0.14 */ }, 71 { 35586, 36, /* 0.14 */ },
73 { 37253, 36, /* 0.14 */ }, 72 { 37253, 36, /* 0.14 */ },
74 { 38992, 35, /* 0.14 */ }, 73 { 38992, 35, /* 0.14 */ },
75 { 40808, 35, /* 0.14 */ }, 74 { 40808, 35, /* 0.14 */ },
76 { 42707, 34, /* 0.13 */ }, 75 { 42707, 34, /* 0.13 */ },
77 { 44694, 33, /* 0.13 */ }, 76 { 44694, 33, /* 0.13 */ },
78 { 46776, 33, /* 0.13 */ }, 77 { 46776, 33, /* 0.13 */ },
79 { 48961, 32, /* 0.13 */ }, 78 { 48961, 32, /* 0.13 */ },
80 { 51258, 32, /* 0.13 */ }, 79 { 51258, 32, /* 0.13 */ },
81 { 53677, 31, /* 0.12 */ }, 80 { 53677, 31, /* 0.12 */ },
82 { 56230, 30, /* 0.12 */ }, 81 { 56230, 30, /* 0.12 */ },
83 { 58932, 30, /* 0.12 */ }, 82 { 58932, 30, /* 0.12 */ },
84 { 61799, 29, /* 0.12 */ }, 83 { 61799, 29, /* 0.12 */ },
85 { 64851, 28, /* 0.11 */ }, 84 { 64851, 28, /* 0.11 */ },
86 { 68113, 28, /* 0.11 */ }, 85 { 68113, 28, /* 0.11 */ },
87 { 71617, 27, /* 0.11 */ }, 86 { 71617, 27, /* 0.11 */ },
88 { 75401, 26, /* 0.10 */ }, 87 { 75401, 26, /* 0.10 */ },
89 { 79517, 26, /* 0.10 */ }, 88 { 79517, 26, /* 0.10 */ },
90 { 84035, 25, /* 0.10 */ }, 89 { 84035, 25, /* 0.10 */ },
91 { 89053, 24, /* 0.10 */ }, 90 { 89053, 24, /* 0.10 */ },
92}; 91};
93 92
94#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) 93#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
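
The table above maps the current congestion window (in packets) to a row whose index gives the per-RTT additive-increase step and whose md value, scaled by 256, is the fraction cut from cwnd on loss, matching the inline comments (128/256 = 0.50, i.e. standard halving for small windows). A standalone sketch of the lookup with a truncated copy of the table is below; the helper names are illustrative, and the row-selection rule follows how the in-kernel code walks the table (the real implementation caches the index in per-socket state instead of rescanning).

/* Standalone sketch: pick the first row whose cwnd bound covers the
 * current window, then report additive increase (row index + 1) and
 * the multiplicative-decrease fraction md/256.  Truncated table. */
#include <stdio.h>

struct hstcp_aimd_val { unsigned int cwnd; unsigned int md; };

static const struct hstcp_aimd_val vals[] = {
	{  38, 128 }, { 118, 112 }, { 221, 104 }, {  347, 98 },
	{ 495,  93 }, { 663,  89 }, { 851,  86 }, { 1058, 83 },
};
#define NVALS (sizeof(vals) / sizeof(vals[0]))

static unsigned int aimd_index(unsigned int cwnd)
{
	unsigned int i = 0;

	while (i < NVALS - 1 && cwnd > vals[i].cwnd)
		i++;
	return i;
}

int main(void)
{
	unsigned int cwnd = 500;
	unsigned int i = aimd_index(cwnd);

	printf("cwnd=%u -> row %u: ai=%u, md=%u/256 (~%.2f)\n",
	       cwnd, i, i + 1, vals[i].md, vals[i].md / 256.0);
	return 0;
}
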
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 031361311a8b..58469fff6c18 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -98,7 +98,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
98 } 98 }
99} 99}
100 100
101static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt) 101static void measure_achieved_throughput(struct sock *sk,
102 u32 pkts_acked, s32 rtt)
102{ 103{
103 const struct inet_connection_sock *icsk = inet_csk(sk); 104 const struct inet_connection_sock *icsk = inet_csk(sk);
104 const struct tcp_sock *tp = tcp_sk(sk); 105 const struct tcp_sock *tp = tcp_sk(sk);
@@ -148,8 +149,8 @@ static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
148 if (use_bandwidth_switch) { 149 if (use_bandwidth_switch) {
149 u32 maxB = ca->maxB; 150 u32 maxB = ca->maxB;
150 u32 old_maxB = ca->old_maxB; 151 u32 old_maxB = ca->old_maxB;
151 ca->old_maxB = ca->maxB;
152 152
153 ca->old_maxB = ca->maxB;
153 if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { 154 if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
154 ca->beta = BETA_MIN; 155 ca->beta = BETA_MIN;
155 ca->modeswitch = 0; 156 ca->modeswitch = 0;
@@ -270,6 +271,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
270 case TCP_CA_Open: 271 case TCP_CA_Open:
271 { 272 {
272 struct htcp *ca = inet_csk_ca(sk); 273 struct htcp *ca = inet_csk_ca(sk);
274
273 if (ca->undo_last_cong) { 275 if (ca->undo_last_cong) {
274 ca->last_cong = jiffies; 276 ca->last_cong = jiffies;
275 ca->undo_last_cong = 0; 277 ca->undo_last_cong = 0;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index d8f8f05a4951..f963b274f2b0 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -29,7 +29,6 @@ static int rtt0 = 25;
29module_param(rtt0, int, 0644); 29module_param(rtt0, int, 0644);
30MODULE_PARM_DESC(rtt0, "reference round trip time (ms)"); 30MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
31 31
32
33/* This is called to refresh values for hybla parameters */ 32/* This is called to refresh values for hybla parameters */
34static inline void hybla_recalc_param (struct sock *sk) 33static inline void hybla_recalc_param (struct sock *sk)
35{ 34{
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 5999b3972e64..1d5a30a90adf 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -284,7 +284,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
284 delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; 284 delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
285 if (delta >= tp->snd_cwnd) { 285 if (delta >= tp->snd_cwnd) {
286 tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, 286 tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
287 (u32) tp->snd_cwnd_clamp); 287 (u32)tp->snd_cwnd_clamp);
288 tp->snd_cwnd_cnt = 0; 288 tp->snd_cwnd_cnt = 0;
289 } 289 }
290 } 290 }
@@ -299,7 +299,6 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
299 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); 299 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
300} 300}
301 301
302
303/* Extract info for Tcp socket info provided via netlink. */ 302/* Extract info for Tcp socket info provided via netlink. */
304static void tcp_illinois_info(struct sock *sk, u32 ext, 303static void tcp_illinois_info(struct sock *sk, u32 ext,
305 struct sk_buff *skb) 304 struct sk_buff *skb)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0185eea59342..00a41499d52c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -200,28 +200,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk)
200 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; 200 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
201} 201}
202 202
203static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) 203static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
204{ 204{
205 if (tp->ecn_flags & TCP_ECN_OK) 205 if (tp->ecn_flags & TCP_ECN_OK)
206 tp->ecn_flags |= TCP_ECN_QUEUE_CWR; 206 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
207} 207}
208 208
209static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) 209static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
210{ 210{
211 if (tcp_hdr(skb)->cwr) 211 if (tcp_hdr(skb)->cwr)
212 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 212 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
213} 213}
214 214
215static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) 215static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
216{ 216{
217 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 217 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
218} 218}
219 219
220static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) 220static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
221{ 221{
222 if (!(tp->ecn_flags & TCP_ECN_OK))
223 return;
224
225 switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { 222 switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
226 case INET_ECN_NOT_ECT: 223 case INET_ECN_NOT_ECT:
227 /* Funny extension: if ECT is not set on a segment, 224 /* Funny extension: if ECT is not set on a segment,
@@ -232,30 +229,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
232 tcp_enter_quickack_mode((struct sock *)tp); 229 tcp_enter_quickack_mode((struct sock *)tp);
233 break; 230 break;
234 case INET_ECN_CE: 231 case INET_ECN_CE:
232 if (tcp_ca_needs_ecn((struct sock *)tp))
233 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
234
235 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { 235 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
236 /* Better not delay acks, sender can have a very low cwnd */ 236 /* Better not delay acks, sender can have a very low cwnd */
237 tcp_enter_quickack_mode((struct sock *)tp); 237 tcp_enter_quickack_mode((struct sock *)tp);
238 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 238 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
239 } 239 }
240 /* fallinto */ 240 tp->ecn_flags |= TCP_ECN_SEEN;
241 break;
241 default: 242 default:
243 if (tcp_ca_needs_ecn((struct sock *)tp))
244 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
242 tp->ecn_flags |= TCP_ECN_SEEN; 245 tp->ecn_flags |= TCP_ECN_SEEN;
246 break;
243 } 247 }
244} 248}
245 249
246static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) 250static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
251{
252 if (tp->ecn_flags & TCP_ECN_OK)
253 __tcp_ecn_check_ce(tp, skb);
254}
255
256static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
247{ 257{
248 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) 258 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
249 tp->ecn_flags &= ~TCP_ECN_OK; 259 tp->ecn_flags &= ~TCP_ECN_OK;
250} 260}
251 261
252static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) 262static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
253{ 263{
254 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) 264 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
255 tp->ecn_flags &= ~TCP_ECN_OK; 265 tp->ecn_flags &= ~TCP_ECN_OK;
256} 266}
257 267
258static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) 268static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
259{ 269{
260 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) 270 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
261 return true; 271 return true;
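
The CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE notifications added here let a congestion-control module observe CE transitions on received segments; DCTCP's cwnd_event handler, shown earlier in this section via its dctcp_ce_state_0_to_1()/dctcp_ce_state_1_to_0() calls, uses them to keep per-connection CE state. A simplified, self-contained model of that bookkeeping follows; the types and names are stand-ins, since the real hook is the cwnd_event() member of struct tcp_congestion_ops operating on struct sock.

/* Simplified model of CE-state tracking driven by the new CA events.
 * The real DCTCP code additionally flushes a pending delayed ACK on a
 * transition so the peer still sees the old CE state reported. */
#include <stdio.h>

enum ca_event { ECN_IS_CE, ECN_NO_CE, OTHER };

struct ce_tracker {
	unsigned int ce_state;		/* 1 while CE-marked data is arriving */
	unsigned int transitions;	/* number of 0<->1 flips observed */
};

static void ce_event(struct ce_tracker *t, enum ca_event ev)
{
	unsigned int want;

	if (ev == OTHER)
		return;
	want = (ev == ECN_IS_CE);
	if (t->ce_state != want) {
		t->transitions++;
		t->ce_state = want;
	}
}

int main(void)
{
	enum ca_event trace[] = { ECN_NO_CE, ECN_IS_CE, ECN_IS_CE, ECN_NO_CE };
	struct ce_tracker t = { 0, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(trace) / sizeof(trace[0]); i++)
		ce_event(&t, trace[i]);
	printf("ce_state=%u transitions=%u\n", t.ce_state, t.transitions);
	return 0;
}
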
@@ -652,7 +662,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
652 } 662 }
653 icsk->icsk_ack.lrcvtime = now; 663 icsk->icsk_ack.lrcvtime = now;
654 664
655 TCP_ECN_check_ce(tp, skb); 665 tcp_ecn_check_ce(tp, skb);
656 666
657 if (skb->len >= 128) 667 if (skb->len >= 128)
658 tcp_grow_window(sk, skb); 668 tcp_grow_window(sk, skb);
@@ -1294,9 +1304,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1294 TCP_SKB_CB(prev)->end_seq += shifted; 1304 TCP_SKB_CB(prev)->end_seq += shifted;
1295 TCP_SKB_CB(skb)->seq += shifted; 1305 TCP_SKB_CB(skb)->seq += shifted;
1296 1306
1297 skb_shinfo(prev)->gso_segs += pcount; 1307 tcp_skb_pcount_add(prev, pcount);
1298 BUG_ON(skb_shinfo(skb)->gso_segs < pcount); 1308 BUG_ON(tcp_skb_pcount(skb) < pcount);
1299 skb_shinfo(skb)->gso_segs -= pcount; 1309 tcp_skb_pcount_add(skb, -pcount);
1300 1310
1301 /* When we're adding to gso_segs == 1, gso_size will be zero, 1311 /* When we're adding to gso_segs == 1, gso_size will be zero,
1302 * in theory this shouldn't be necessary but as long as DSACK 1312 * in theory this shouldn't be necessary but as long as DSACK
@@ -1309,7 +1319,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1309 } 1319 }
1310 1320
1311 /* CHECKME: To clear or not to clear? Mimics normal skb currently */ 1321 /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1312 if (skb_shinfo(skb)->gso_segs <= 1) { 1322 if (tcp_skb_pcount(skb) <= 1) {
1313 skb_shinfo(skb)->gso_size = 0; 1323 skb_shinfo(skb)->gso_size = 0;
1314 skb_shinfo(skb)->gso_type = 0; 1324 skb_shinfo(skb)->gso_type = 0;
1315 } 1325 }
@@ -1887,21 +1897,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1887 tp->sacked_out = 0; 1897 tp->sacked_out = 0;
1888} 1898}
1889 1899
1890static void tcp_clear_retrans_partial(struct tcp_sock *tp) 1900void tcp_clear_retrans(struct tcp_sock *tp)
1891{ 1901{
1892 tp->retrans_out = 0; 1902 tp->retrans_out = 0;
1893 tp->lost_out = 0; 1903 tp->lost_out = 0;
1894
1895 tp->undo_marker = 0; 1904 tp->undo_marker = 0;
1896 tp->undo_retrans = -1; 1905 tp->undo_retrans = -1;
1906 tp->fackets_out = 0;
1907 tp->sacked_out = 0;
1897} 1908}
1898 1909
1899void tcp_clear_retrans(struct tcp_sock *tp) 1910static inline void tcp_init_undo(struct tcp_sock *tp)
1900{ 1911{
1901 tcp_clear_retrans_partial(tp); 1912 tp->undo_marker = tp->snd_una;
1902 1913 /* Retransmission still in flight may cause DSACKs later. */
1903 tp->fackets_out = 0; 1914 tp->undo_retrans = tp->retrans_out ? : -1;
1904 tp->sacked_out = 0;
1905} 1915}
1906 1916
1907/* Enter Loss state. If we detect SACK reneging, forget all SACK information 1917/* Enter Loss state. If we detect SACK reneging, forget all SACK information
@@ -1924,18 +1934,18 @@ void tcp_enter_loss(struct sock *sk)
1924 tp->prior_ssthresh = tcp_current_ssthresh(sk); 1934 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1925 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 1935 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1926 tcp_ca_event(sk, CA_EVENT_LOSS); 1936 tcp_ca_event(sk, CA_EVENT_LOSS);
1937 tcp_init_undo(tp);
1927 } 1938 }
1928 tp->snd_cwnd = 1; 1939 tp->snd_cwnd = 1;
1929 tp->snd_cwnd_cnt = 0; 1940 tp->snd_cwnd_cnt = 0;
1930 tp->snd_cwnd_stamp = tcp_time_stamp; 1941 tp->snd_cwnd_stamp = tcp_time_stamp;
1931 1942
1932 tcp_clear_retrans_partial(tp); 1943 tp->retrans_out = 0;
1944 tp->lost_out = 0;
1933 1945
1934 if (tcp_is_reno(tp)) 1946 if (tcp_is_reno(tp))
1935 tcp_reset_reno_sack(tp); 1947 tcp_reset_reno_sack(tp);
1936 1948
1937 tp->undo_marker = tp->snd_una;
1938
1939 skb = tcp_write_queue_head(sk); 1949 skb = tcp_write_queue_head(sk);
1940 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); 1950 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1941 if (is_reneg) { 1951 if (is_reneg) {
@@ -1949,9 +1959,6 @@ void tcp_enter_loss(struct sock *sk)
1949 if (skb == tcp_send_head(sk)) 1959 if (skb == tcp_send_head(sk))
1950 break; 1960 break;
1951 1961
1952 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1953 tp->undo_marker = 0;
1954
1955 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; 1962 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1956 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { 1963 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
1957 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1964 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
@@ -1971,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk)
1971 sysctl_tcp_reordering); 1978 sysctl_tcp_reordering);
1972 tcp_set_ca_state(sk, TCP_CA_Loss); 1979 tcp_set_ca_state(sk, TCP_CA_Loss);
1973 tp->high_seq = tp->snd_nxt; 1980 tp->high_seq = tp->snd_nxt;
1974 TCP_ECN_queue_cwr(tp); 1981 tcp_ecn_queue_cwr(tp);
1975 1982
1976 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous 1983 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
1977 * loss recovery is underway except recurring timeout(s) on 1984 * loss recovery is underway except recurring timeout(s) on
@@ -2363,7 +2370,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2363 2370
2364 if (tp->prior_ssthresh > tp->snd_ssthresh) { 2371 if (tp->prior_ssthresh > tp->snd_ssthresh) {
2365 tp->snd_ssthresh = tp->prior_ssthresh; 2372 tp->snd_ssthresh = tp->prior_ssthresh;
2366 TCP_ECN_withdraw_cwr(tp); 2373 tcp_ecn_withdraw_cwr(tp);
2367 } 2374 }
2368 } else { 2375 } else {
2369 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); 2376 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2493,7 +2500,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
2493 tp->prr_delivered = 0; 2500 tp->prr_delivered = 0;
2494 tp->prr_out = 0; 2501 tp->prr_out = 0;
2495 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); 2502 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2496 TCP_ECN_queue_cwr(tp); 2503 tcp_ecn_queue_cwr(tp);
2497} 2504}
2498 2505
2499static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, 2506static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
@@ -2670,8 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2670 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2677 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2671 2678
2672 tp->prior_ssthresh = 0; 2679 tp->prior_ssthresh = 0;
2673 tp->undo_marker = tp->snd_una; 2680 tcp_init_undo(tp);
2674 tp->undo_retrans = tp->retrans_out ? : -1;
2675 2681
2676 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2682 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2677 if (!ece_ack) 2683 if (!ece_ack)
@@ -2970,7 +2976,8 @@ void tcp_rearm_rto(struct sock *sk)
2970 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2976 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2971 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2977 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2972 struct sk_buff *skb = tcp_write_queue_head(sk); 2978 struct sk_buff *skb = tcp_write_queue_head(sk);
2973 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; 2979 const u32 rto_time_stamp =
2980 tcp_skb_timestamp(skb) + rto;
2974 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); 2981 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
2975 /* delta may not be positive if the socket is locked 2982 /* delta may not be positive if the socket is locked
2976 * when the retrans timer fires and is rescheduled. 2983 * when the retrans timer fires and is rescheduled.
@@ -3210,9 +3217,10 @@ static void tcp_ack_probe(struct sock *sk)
3210 * This function is not for random using! 3217 * This function is not for random using!
3211 */ 3218 */
3212 } else { 3219 } else {
3220 unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
3221
3213 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 3222 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3214 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), 3223 when, TCP_RTO_MAX);
3215 TCP_RTO_MAX);
3216 } 3224 }
3217} 3225}
3218 3226
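
The zero-window probe timer now goes through inet_csk_rto_backoff(), which computes the same clamped value the removed expression did: the RTO shifted left by the backoff count, capped at the supplied maximum (TCP_RTO_MAX, 120 seconds). A toy calculation with illustrative values in milliseconds:

/* Toy model of the clamp performed by inet_csk_rto_backoff(). */
#include <stdio.h>

static unsigned long rto_backoff(unsigned long rto, unsigned int backoff,
				 unsigned long max_when)
{
	unsigned long long when = (unsigned long long)rto << backoff;

	return when > max_when ? max_when : (unsigned long)when;
}

int main(void)
{
	printf("rto=200ms, backoff=3  -> %lu ms\n", rto_backoff(200, 3, 120000));
	printf("rto=200ms, backoff=12 -> %lu ms (capped)\n",
	       rto_backoff(200, 12, 120000));
	return 0;
}
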
@@ -3363,6 +3371,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3363 } 3371 }
3364} 3372}
3365 3373
3374static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3375{
3376 const struct inet_connection_sock *icsk = inet_csk(sk);
3377
3378 if (icsk->icsk_ca_ops->in_ack_event)
3379 icsk->icsk_ca_ops->in_ack_event(sk, flags);
3380}
3381
3366/* This routine deals with incoming acks, but not outgoing ones. */ 3382/* This routine deals with incoming acks, but not outgoing ones. */
3367static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3383static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3368{ 3384{
@@ -3422,10 +3438,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3422 tp->snd_una = ack; 3438 tp->snd_una = ack;
3423 flag |= FLAG_WIN_UPDATE; 3439 flag |= FLAG_WIN_UPDATE;
3424 3440
3425 tcp_ca_event(sk, CA_EVENT_FAST_ACK); 3441 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3426 3442
3427 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); 3443 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
3428 } else { 3444 } else {
3445 u32 ack_ev_flags = CA_ACK_SLOWPATH;
3446
3429 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 3447 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3430 flag |= FLAG_DATA; 3448 flag |= FLAG_DATA;
3431 else 3449 else
@@ -3437,10 +3455,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3437 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3455 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3438 &sack_rtt_us); 3456 &sack_rtt_us);
3439 3457
3440 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3458 if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3441 flag |= FLAG_ECE; 3459 flag |= FLAG_ECE;
3460 ack_ev_flags |= CA_ACK_ECE;
3461 }
3462
3463 if (flag & FLAG_WIN_UPDATE)
3464 ack_ev_flags |= CA_ACK_WIN_UPDATE;
3442 3465
3443 tcp_ca_event(sk, CA_EVENT_SLOW_ACK); 3466 tcp_in_ack_event(sk, ack_ev_flags);
3444 } 3467 }
3445 3468
3446 /* We passed data and got it acked, remove any soft error 3469 /* We passed data and got it acked, remove any soft error
@@ -4062,6 +4085,44 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4062 tp->rx_opt.num_sacks = num_sacks; 4085 tp->rx_opt.num_sacks = num_sacks;
4063} 4086}
4064 4087
4088/**
4089 * tcp_try_coalesce - try to merge skb to prior one
4090 * @sk: socket
4091 * @to: prior buffer
4092 * @from: buffer to add in queue
4093 * @fragstolen: pointer to boolean
4094 *
4095 * Before queueing skb @from after @to, try to merge them
4096 * to reduce overall memory use and queue lengths, if cost is small.
4097 * Packets in ofo or receive queues can stay a long time.
4098 * Better try to coalesce them right now to avoid future collapses.
4099 * Returns true if caller should free @from instead of queueing it
4100 */
4101static bool tcp_try_coalesce(struct sock *sk,
4102 struct sk_buff *to,
4103 struct sk_buff *from,
4104 bool *fragstolen)
4105{
4106 int delta;
4107
4108 *fragstolen = false;
4109
4110 /* Its possible this segment overlaps with prior segment in queue */
4111 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4112 return false;
4113
4114 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4115 return false;
4116
4117 atomic_add(delta, &sk->sk_rmem_alloc);
4118 sk_mem_charge(sk, delta);
4119 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4120 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4121 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4122 TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
4123 return true;
4124}
4125
4065/* This one checks to see if we can put data from the 4126/* This one checks to see if we can put data from the
4066 * out_of_order queue into the receive_queue. 4127 * out_of_order queue into the receive_queue.
4067 */ 4128 */
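
tcp_try_coalesce() moves up so that tcp_ofo_queue() can reuse it while draining the out-of-order queue, and it drops its old tcp_hdr(from)->fin bail-out now that the FIN travels in TCP_SKB_CB(skb)->tcp_flags and is OR-ed into the merged skb. The precondition it checks before merging is plain sequence-number adjacency; a toy model follows (names illustrative, and the real code then defers to skb_try_coalesce() for the memory-accounting decision).

/* Toy model: @from may only be glued onto @to when it starts exactly
 * where @to ends; anything else (gap or overlap) is left alone. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq; uint32_t end_seq; };

static bool try_coalesce(struct seg *to, const struct seg *from)
{
	if (from->seq != to->end_seq)
		return false;
	to->end_seq = from->end_seq;	/* merged: extend the tail segment */
	return true;
}

int main(void)
{
	struct seg tail = { 1000, 2000 };
	struct seg next = { 2000, 2500 };
	struct seg gap  = { 2600, 2700 };
	bool ok;

	ok = try_coalesce(&tail, &next);
	printf("adjacent: %d, end_seq now %u\n", ok, tail.end_seq);
	ok = try_coalesce(&tail, &gap);
	printf("gap     : %d\n", ok);
	return 0;
}
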
@@ -4069,7 +4130,8 @@ static void tcp_ofo_queue(struct sock *sk)
4069{ 4130{
4070 struct tcp_sock *tp = tcp_sk(sk); 4131 struct tcp_sock *tp = tcp_sk(sk);
4071 __u32 dsack_high = tp->rcv_nxt; 4132 __u32 dsack_high = tp->rcv_nxt;
4072 struct sk_buff *skb; 4133 struct sk_buff *skb, *tail;
4134 bool fragstolen, eaten;
4073 4135
4074 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { 4136 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
4075 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 4137 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
@@ -4082,9 +4144,9 @@ static void tcp_ofo_queue(struct sock *sk)
4082 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); 4144 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4083 } 4145 }
4084 4146
4147 __skb_unlink(skb, &tp->out_of_order_queue);
4085 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4148 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4086 SOCK_DEBUG(sk, "ofo packet was already received\n"); 4149 SOCK_DEBUG(sk, "ofo packet was already received\n");
4087 __skb_unlink(skb, &tp->out_of_order_queue);
4088 __kfree_skb(skb); 4150 __kfree_skb(skb);
4089 continue; 4151 continue;
4090 } 4152 }
@@ -4092,11 +4154,15 @@ static void tcp_ofo_queue(struct sock *sk)
4092 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, 4154 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4093 TCP_SKB_CB(skb)->end_seq); 4155 TCP_SKB_CB(skb)->end_seq);
4094 4156
4095 __skb_unlink(skb, &tp->out_of_order_queue); 4157 tail = skb_peek_tail(&sk->sk_receive_queue);
4096 __skb_queue_tail(&sk->sk_receive_queue, skb); 4158 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4097 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4159 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4098 if (tcp_hdr(skb)->fin) 4160 if (!eaten)
4161 __skb_queue_tail(&sk->sk_receive_queue, skb);
4162 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4099 tcp_fin(sk); 4163 tcp_fin(sk);
4164 if (eaten)
4165 kfree_skb_partial(skb, fragstolen);
4100 } 4166 }
4101} 4167}
4102 4168
@@ -4123,53 +4189,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4123 return 0; 4189 return 0;
4124} 4190}
4125 4191
4126/**
4127 * tcp_try_coalesce - try to merge skb to prior one
4128 * @sk: socket
4129 * @to: prior buffer
4130 * @from: buffer to add in queue
4131 * @fragstolen: pointer to boolean
4132 *
4133 * Before queueing skb @from after @to, try to merge them
4134 * to reduce overall memory use and queue lengths, if cost is small.
4135 * Packets in ofo or receive queues can stay a long time.
4136 * Better try to coalesce them right now to avoid future collapses.
4137 * Returns true if caller should free @from instead of queueing it
4138 */
4139static bool tcp_try_coalesce(struct sock *sk,
4140 struct sk_buff *to,
4141 struct sk_buff *from,
4142 bool *fragstolen)
4143{
4144 int delta;
4145
4146 *fragstolen = false;
4147
4148 if (tcp_hdr(from)->fin)
4149 return false;
4150
4151 /* Its possible this segment overlaps with prior segment in queue */
4152 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4153 return false;
4154
4155 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4156 return false;
4157
4158 atomic_add(delta, &sk->sk_rmem_alloc);
4159 sk_mem_charge(sk, delta);
4160 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4161 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4162 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4163 return true;
4164}
4165
4166static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4192static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4167{ 4193{
4168 struct tcp_sock *tp = tcp_sk(sk); 4194 struct tcp_sock *tp = tcp_sk(sk);
4169 struct sk_buff *skb1; 4195 struct sk_buff *skb1;
4170 u32 seq, end_seq; 4196 u32 seq, end_seq;
4171 4197
4172 TCP_ECN_check_ce(tp, skb); 4198 tcp_ecn_check_ce(tp, skb);
4173 4199
4174 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { 4200 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4175 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); 4201 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4308,24 +4334,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
4308 4334
4309int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) 4335int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4310{ 4336{
4311 struct sk_buff *skb = NULL; 4337 struct sk_buff *skb;
4312 struct tcphdr *th;
4313 bool fragstolen; 4338 bool fragstolen;
4314 4339
4315 if (size == 0) 4340 if (size == 0)
4316 return 0; 4341 return 0;
4317 4342
4318 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); 4343 skb = alloc_skb(size, sk->sk_allocation);
4319 if (!skb) 4344 if (!skb)
4320 goto err; 4345 goto err;
4321 4346
4322 if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) 4347 if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4323 goto err_free; 4348 goto err_free;
4324 4349
4325 th = (struct tcphdr *)skb_put(skb, sizeof(*th));
4326 skb_reset_transport_header(skb);
4327 memset(th, 0, sizeof(*th));
4328
4329 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) 4350 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
4330 goto err_free; 4351 goto err_free;
4331 4352
@@ -4333,7 +4354,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4333 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; 4354 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4334 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; 4355 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4335 4356
4336 if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { 4357 if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4337 WARN_ON_ONCE(fragstolen); /* should not happen */ 4358 WARN_ON_ONCE(fragstolen); /* should not happen */
4338 __kfree_skb(skb); 4359 __kfree_skb(skb);
4339 } 4360 }
@@ -4347,7 +4368,6 @@ err:
4347 4368
4348static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 4369static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4349{ 4370{
4350 const struct tcphdr *th = tcp_hdr(skb);
4351 struct tcp_sock *tp = tcp_sk(sk); 4371 struct tcp_sock *tp = tcp_sk(sk);
4352 int eaten = -1; 4372 int eaten = -1;
4353 bool fragstolen = false; 4373 bool fragstolen = false;
@@ -4356,9 +4376,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4356 goto drop; 4376 goto drop;
4357 4377
4358 skb_dst_drop(skb); 4378 skb_dst_drop(skb);
4359 __skb_pull(skb, th->doff * 4); 4379 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
4360 4380
4361 TCP_ECN_accept_cwr(tp, skb); 4381 tcp_ecn_accept_cwr(tp, skb);
4362 4382
4363 tp->rx_opt.dsack = 0; 4383 tp->rx_opt.dsack = 0;
4364 4384
@@ -4400,7 +4420,7 @@ queue_and_out:
4400 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4420 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4401 if (skb->len) 4421 if (skb->len)
4402 tcp_event_data_recv(sk, skb); 4422 tcp_event_data_recv(sk, skb);
4403 if (th->fin) 4423 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4404 tcp_fin(sk); 4424 tcp_fin(sk);
4405 4425
4406 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4426 if (!skb_queue_empty(&tp->out_of_order_queue)) {
@@ -4515,7 +4535,7 @@ restart:
4515 * - bloated or contains data before "start" or 4535 * - bloated or contains data before "start" or
4516 * overlaps to the next one. 4536 * overlaps to the next one.
4517 */ 4537 */
4518 if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && 4538 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4519 (tcp_win_from_space(skb->truesize) > skb->len || 4539 (tcp_win_from_space(skb->truesize) > skb->len ||
4520 before(TCP_SKB_CB(skb)->seq, start))) { 4540 before(TCP_SKB_CB(skb)->seq, start))) {
4521 end_of_skbs = false; 4541 end_of_skbs = false;
@@ -4534,30 +4554,18 @@ restart:
4534 /* Decided to skip this, advance start seq. */ 4554 /* Decided to skip this, advance start seq. */
4535 start = TCP_SKB_CB(skb)->end_seq; 4555 start = TCP_SKB_CB(skb)->end_seq;
4536 } 4556 }
4537 if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) 4557 if (end_of_skbs ||
4558 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4538 return; 4559 return;
4539 4560
4540 while (before(start, end)) { 4561 while (before(start, end)) {
4562 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
4541 struct sk_buff *nskb; 4563 struct sk_buff *nskb;
4542 unsigned int header = skb_headroom(skb);
4543 int copy = SKB_MAX_ORDER(header, 0);
4544 4564
4545 /* Too big header? This can happen with IPv6. */ 4565 nskb = alloc_skb(copy, GFP_ATOMIC);
4546 if (copy < 0)
4547 return;
4548 if (end - start < copy)
4549 copy = end - start;
4550 nskb = alloc_skb(copy + header, GFP_ATOMIC);
4551 if (!nskb) 4566 if (!nskb)
4552 return; 4567 return;
4553 4568
4554 skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
4555 skb_set_network_header(nskb, (skb_network_header(skb) -
4556 skb->head));
4557 skb_set_transport_header(nskb, (skb_transport_header(skb) -
4558 skb->head));
4559 skb_reserve(nskb, header);
4560 memcpy(nskb->head, skb->head, header);
4561 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 4569 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4562 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 4570 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4563 __skb_queue_before(list, skb, nskb); 4571 __skb_queue_before(list, skb, nskb);
@@ -4581,8 +4589,7 @@ restart:
4581 skb = tcp_collapse_one(sk, skb, list); 4589 skb = tcp_collapse_one(sk, skb, list);
4582 if (!skb || 4590 if (!skb ||
4583 skb == tail || 4591 skb == tail ||
4584 tcp_hdr(skb)->syn || 4592 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4585 tcp_hdr(skb)->fin)
4586 return; 4593 return;
4587 } 4594 }
4588 } 4595 }
@@ -5386,7 +5393,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5386 * state to ESTABLISHED..." 5393 * state to ESTABLISHED..."
5387 */ 5394 */
5388 5395
5389 TCP_ECN_rcv_synack(tp, th); 5396 tcp_ecn_rcv_synack(tp, th);
5390 5397
5391 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5398 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5392 tcp_ack(sk, skb, FLAG_SLOWPATH); 5399 tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -5505,7 +5512,7 @@ discard:
5505 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5512 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
5506 tp->max_window = tp->snd_wnd; 5513 tp->max_window = tp->snd_wnd;
5507 5514
5508 TCP_ECN_rcv_syn(tp, th); 5515 tcp_ecn_rcv_syn(tp, th);
5509 5516
5510 tcp_mtup_init(sk); 5517 tcp_mtup_init(sk);
5511 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5518 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -5835,6 +5842,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5835#endif 5842#endif
5836} 5843}
5837 5844
5845/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
5846 *
5847 * If we receive a SYN packet with these bits set, it means a
5848 * network is playing bad games with TOS bits. In order to
5849 * avoid possible false congestion notifications, we disable
5850 * TCP ECN negotiation.
5851 *
5852 * Exception: tcp_ca wants ECN. This is required for DCTCP
5853 * congestion control; it requires setting ECT on all packets,
5854 * including SYN. We inverse the test in this case: If our
5855 * local socket wants ECN, but peer only set ece/cwr (but not
5856 * ECT in IP header) its probably a non-DCTCP aware sender.
5857 */
5858static void tcp_ecn_create_request(struct request_sock *req,
5859 const struct sk_buff *skb,
5860 const struct sock *listen_sk)
5861{
5862 const struct tcphdr *th = tcp_hdr(skb);
5863 const struct net *net = sock_net(listen_sk);
5864 bool th_ecn = th->ece && th->cwr;
5865 bool ect, need_ecn;
5866
5867 if (!th_ecn)
5868 return;
5869
5870 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
5871 need_ecn = tcp_ca_needs_ecn(listen_sk);
5872
5873 if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
5874 inet_rsk(req)->ecn_ok = 1;
5875 else if (ect && need_ecn)
5876 inet_rsk(req)->ecn_ok = 1;
5877}
5878
5838int tcp_conn_request(struct request_sock_ops *rsk_ops, 5879int tcp_conn_request(struct request_sock_ops *rsk_ops,
5839 const struct tcp_request_sock_ops *af_ops, 5880 const struct tcp_request_sock_ops *af_ops,
5840 struct sock *sk, struct sk_buff *skb) 5881 struct sock *sk, struct sk_buff *skb)
@@ -5843,7 +5884,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
5843 struct request_sock *req; 5884 struct request_sock *req;
5844 struct tcp_sock *tp = tcp_sk(sk); 5885 struct tcp_sock *tp = tcp_sk(sk);
5845 struct dst_entry *dst = NULL; 5886 struct dst_entry *dst = NULL;
5846 __u32 isn = TCP_SKB_CB(skb)->when; 5887 __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
5847 bool want_cookie = false, fastopen; 5888 bool want_cookie = false, fastopen;
5848 struct flowi fl; 5889 struct flowi fl;
5849 struct tcp_fastopen_cookie foc = { .len = -1 }; 5890 struct tcp_fastopen_cookie foc = { .len = -1 };
@@ -5895,7 +5936,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
5895 goto drop_and_free; 5936 goto drop_and_free;
5896 5937
5897 if (!want_cookie || tmp_opt.tstamp_ok) 5938 if (!want_cookie || tmp_opt.tstamp_ok)
5898 TCP_ECN_create_request(req, skb, sock_net(sk)); 5939 tcp_ecn_create_request(req, skb, sk);
5899 5940
5900 if (want_cookie) { 5941 if (want_cookie) {
5901 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); 5942 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
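
tcp_ecn_create_request() now takes the listening socket so it can ask tcp_ca_needs_ecn(): classic RFC 3168 negotiation is still only accepted when the SYN did not carry ECT, while an ECT-marked SYN is accepted when the local congestion control (DCTCP) insists on ECN. The decision reduces to a small predicate over three observations plus the sysctl; a self-contained restatement is below (function and parameter names are illustrative).

/* Restatement of the ecn_ok decision from tcp_ecn_create_request(). */
#include <stdbool.h>
#include <stdio.h>

static bool ecn_ok(bool th_ecn, bool ect, bool need_ecn, bool sysctl_ecn)
{
	if (!th_ecn)			/* peer did not request ECN at all */
		return false;
	if (!ect && !need_ecn && sysctl_ecn)
		return true;		/* classic RFC 3168 negotiation */
	if (ect && need_ecn)
		return true;		/* ECT on SYN + local CA wants ECN */
	return false;
}

int main(void)
{
	printf("classic peer, sysctl on   : %d\n", ecn_ok(true, false, false, true));
	printf("ECT SYN, DCTCP listener   : %d\n", ecn_ok(true, true, true, false));
	printf("ECT SYN, classic listener : %d\n", ecn_ok(true, true, false, true));
	return 0;
}
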
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fbea536cf5c0..552e87e3c269 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -89,7 +89,6 @@ int sysctl_tcp_tw_reuse __read_mostly;
89int sysctl_tcp_low_latency __read_mostly; 89int sysctl_tcp_low_latency __read_mostly;
90EXPORT_SYMBOL(sysctl_tcp_low_latency); 90EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 91
92
93#ifdef CONFIG_TCP_MD5SIG 92#ifdef CONFIG_TCP_MD5SIG
94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 93static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th); 94 __be32 daddr, __be32 saddr, const struct tcphdr *th);
@@ -430,15 +429,16 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
430 break; 429 break;
431 430
432 icsk->icsk_backoff--; 431 icsk->icsk_backoff--;
433 inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : 432 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
434 TCP_TIMEOUT_INIT) << icsk->icsk_backoff; 433 TCP_TIMEOUT_INIT;
435 tcp_bound_rto(sk); 434 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
436 435
437 skb = tcp_write_queue_head(sk); 436 skb = tcp_write_queue_head(sk);
438 BUG_ON(!skb); 437 BUG_ON(!skb);
439 438
440 remaining = icsk->icsk_rto - min(icsk->icsk_rto, 439 remaining = icsk->icsk_rto -
441 tcp_time_stamp - TCP_SKB_CB(skb)->when); 440 min(icsk->icsk_rto,
441 tcp_time_stamp - tcp_skb_timestamp(skb));
442 442
443 if (remaining) { 443 if (remaining) {
444 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 444 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -680,8 +680,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
680 680
681 net = dev_net(skb_dst(skb)->dev); 681 net = dev_net(skb_dst(skb)->dev);
682 arg.tos = ip_hdr(skb)->tos; 682 arg.tos = ip_hdr(skb)->tos;
683 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, 683 ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
684 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); 684 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
685 &arg, arg.iov[0].iov_len);
685 686
686 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 687 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
687 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); 688 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -763,8 +764,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
763 if (oif) 764 if (oif)
764 arg.bound_dev_if = oif; 765 arg.bound_dev_if = oif;
765 arg.tos = tos; 766 arg.tos = tos;
766 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, 767 ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
767 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); 768 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
769 &arg, arg.iov[0].iov_len);
768 770
769 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 771 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
770} 772}
@@ -883,18 +885,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
883 */ 885 */
884static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) 886static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
885{ 887{
886 const struct ip_options *opt = &(IPCB(skb)->opt); 888 const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
887 struct ip_options_rcu *dopt = NULL; 889 struct ip_options_rcu *dopt = NULL;
888 890
889 if (opt && opt->optlen) { 891 if (opt && opt->optlen) {
890 int opt_size = sizeof(*dopt) + opt->optlen; 892 int opt_size = sizeof(*dopt) + opt->optlen;
891 893
892 dopt = kmalloc(opt_size, GFP_ATOMIC); 894 dopt = kmalloc(opt_size, GFP_ATOMIC);
893 if (dopt) { 895 if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
894 if (ip_options_echo(&dopt->opt, skb)) { 896 kfree(dopt);
895 kfree(dopt); 897 dopt = NULL;
896 dopt = NULL;
897 }
898 } 898 }
899 } 899 }
900 return dopt; 900 return dopt;
@@ -1268,7 +1268,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1268 .send_ack = tcp_v4_reqsk_send_ack, 1268 .send_ack = tcp_v4_reqsk_send_ack,
1269 .destructor = tcp_v4_reqsk_destructor, 1269 .destructor = tcp_v4_reqsk_destructor,
1270 .send_reset = tcp_v4_send_reset, 1270 .send_reset = tcp_v4_send_reset,
1271 .syn_ack_timeout = tcp_syn_ack_timeout, 1271 .syn_ack_timeout = tcp_syn_ack_timeout,
1272}; 1272};
1273 1273
1274static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1274static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
@@ -1428,7 +1428,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1428 1428
1429#ifdef CONFIG_SYN_COOKIES 1429#ifdef CONFIG_SYN_COOKIES
1430 if (!th->syn) 1430 if (!th->syn)
1431 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); 1431 sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt);
1432#endif 1432#endif
1433 return sk; 1433 return sk;
1434} 1434}
@@ -1558,7 +1558,17 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1558 skb_queue_len(&tp->ucopy.prequeue) == 0) 1558 skb_queue_len(&tp->ucopy.prequeue) == 0)
1559 return false; 1559 return false;
1560 1560
1561 skb_dst_force(skb); 1561 /* Before escaping RCU protected region, we need to take care of skb
1562 * dst. Prequeue is only enabled for established sockets.
1563 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1564 * Instead of doing full sk_rx_dst validity here, let's perform
1565 * an optimistic check.
1566 */
1567 if (likely(sk->sk_rx_dst))
1568 skb_dst_drop(skb);
1569 else
1570 skb_dst_force(skb);
1571
1562 __skb_queue_tail(&tp->ucopy.prequeue, skb); 1572 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1563 tp->ucopy.memory += skb->truesize; 1573 tp->ucopy.memory += skb->truesize;
1564 if (tp->ucopy.memory > sk->sk_rcvbuf) { 1574 if (tp->ucopy.memory > sk->sk_rcvbuf) {
@@ -1623,11 +1633,19 @@ int tcp_v4_rcv(struct sk_buff *skb)
1623 1633
1624 th = tcp_hdr(skb); 1634 th = tcp_hdr(skb);
1625 iph = ip_hdr(skb); 1635 iph = ip_hdr(skb);
1636 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1637 * barrier() makes sure compiler wont play fool^Waliasing games.
1638 */
1639 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1640 sizeof(struct inet_skb_parm));
1641 barrier();
1642
1626 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1643 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1627 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1644 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1628 skb->len - th->doff * 4); 1645 skb->len - th->doff * 4);
1629 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1646 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1630 TCP_SKB_CB(skb)->when = 0; 1647 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1648 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1631 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1649 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1632 TCP_SKB_CB(skb)->sacked = 0; 1650 TCP_SKB_CB(skb)->sacked = 0;
1633 1651
@@ -1754,9 +1772,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1754{ 1772{
1755 struct dst_entry *dst = skb_dst(skb); 1773 struct dst_entry *dst = skb_dst(skb);
1756 1774
1757 dst_hold(dst); 1775 if (dst) {
1758 sk->sk_rx_dst = dst; 1776 dst_hold(dst);
1759 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 1777 sk->sk_rx_dst = dst;
1778 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1779 }
1760} 1780}
1761EXPORT_SYMBOL(inet_sk_rx_dst_set); 1781EXPORT_SYMBOL(inet_sk_rx_dst_set);
1762 1782
@@ -2167,7 +2187,7 @@ int tcp_seq_open(struct inode *inode, struct file *file)
2167 2187
2168 s = ((struct seq_file *)file->private_data)->private; 2188 s = ((struct seq_file *)file->private_data)->private;
2169 s->family = afinfo->family; 2189 s->family = afinfo->family;
2170 s->last_pos = 0; 2190 s->last_pos = 0;
2171 return 0; 2191 return 0;
2172} 2192}
2173EXPORT_SYMBOL(tcp_seq_open); 2193EXPORT_SYMBOL(tcp_seq_open);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1649988bd1b6..63d2680b65db 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -232,7 +232,7 @@ kill:
232 u32 isn = tcptw->tw_snd_nxt + 65535 + 2; 232 u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
233 if (isn == 0) 233 if (isn == 0)
234 isn++; 234 isn++;
235 TCP_SKB_CB(skb)->when = isn; 235 TCP_SKB_CB(skb)->tcp_tw_isn = isn;
236 return TCP_TW_SYN; 236 return TCP_TW_SYN;
237 } 237 }
238 238
@@ -393,8 +393,8 @@ void tcp_openreq_init_rwin(struct request_sock *req,
393} 393}
394EXPORT_SYMBOL(tcp_openreq_init_rwin); 394EXPORT_SYMBOL(tcp_openreq_init_rwin);
395 395
396static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, 396static void tcp_ecn_openreq_child(struct tcp_sock *tp,
397 struct request_sock *req) 397 const struct request_sock *req)
398{ 398{
399 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; 399 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
400} 400}
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
451 newtp->snd_cwnd = TCP_INIT_CWND; 451 newtp->snd_cwnd = TCP_INIT_CWND;
452 newtp->snd_cwnd_cnt = 0; 452 newtp->snd_cwnd_cnt = 0;
453 453
454 if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && 454 if (!try_module_get(newicsk->icsk_ca_ops->owner))
455 !try_module_get(newicsk->icsk_ca_ops->owner)) 455 tcp_assign_congestion_control(newsk);
456 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
457 456
458 tcp_set_ca_state(newsk, TCP_CA_Open); 457 tcp_set_ca_state(newsk, TCP_CA_Open);
459 tcp_init_xmit_timers(newsk); 458 tcp_init_xmit_timers(newsk);
@@ -508,7 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
508 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) 507 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
509 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 508 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
510 newtp->rx_opt.mss_clamp = req->mss; 509 newtp->rx_opt.mss_clamp = req->mss;
511 TCP_ECN_openreq_child(newtp, req); 510 tcp_ecn_openreq_child(newtp, req);
512 newtp->fastopen_rsk = NULL; 511 newtp->fastopen_rsk = NULL;
513 newtp->syn_data_acked = 0; 512 newtp->syn_data_acked = 0;
514 513
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index bc1b83cb8309..5b90f2f447a5 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -29,6 +29,28 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
29 } 29 }
30} 30}
31 31
32struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
33 netdev_features_t features)
34{
35 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
36 return ERR_PTR(-EINVAL);
37
38 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
39 const struct iphdr *iph = ip_hdr(skb);
40 struct tcphdr *th = tcp_hdr(skb);
41
42 /* Set up checksum pseudo header, usually expect stack to
43 * have done this already.
44 */
45
46 th->check = 0;
47 skb->ip_summed = CHECKSUM_PARTIAL;
48 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
49 }
50
51 return tcp_gso_segment(skb, features);
52}
53
32struct sk_buff *tcp_gso_segment(struct sk_buff *skb, 54struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
33 netdev_features_t features) 55 netdev_features_t features)
34{ 56{
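
tcp4_gso_segment() absorbs the work of the removed tcp_v4_gso_send_check(): if a segment reaches GSO without CHECKSUM_PARTIAL, the pseudo-header seed is rebuilt in place before handing off to the generic tcp_gso_segment(). For orientation, the quantity __tcp_v4_send_check() leaves in th->check is the folded ones'-complement sum of the IPv4 pseudo header; a rough standalone sketch is below (byte order and the kernel's exact fold/invert helper conventions are glossed over).

/* Rough sketch of the IPv4/TCP pseudo-header sum used to seed a
 * partial checksum: saddr + daddr + protocol + TCP length, folded to
 * 16 bits and left uninverted so the payload sum can be added later. */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

static uint16_t fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

static uint16_t pseudo_hdr_seed(uint32_t saddr, uint32_t daddr, uint16_t tcp_len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += IPPROTO_TCP;
	sum += tcp_len;
	return fold16(sum);
}

int main(void)
{
	uint32_t s = ntohl(inet_addr("192.0.2.1"));
	uint32_t d = ntohl(inet_addr("192.0.2.2"));

	printf("pseudo-header seed: 0x%04x\n", pseudo_hdr_seed(s, d, 20 + 1460));
	return 0;
}
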
@@ -44,9 +66,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
44 __sum16 newcheck; 66 __sum16 newcheck;
45 bool ooo_okay, copy_destructor; 67 bool ooo_okay, copy_destructor;
46 68
47 if (!pskb_may_pull(skb, sizeof(*th)))
48 goto out;
49
50 th = tcp_hdr(skb); 69 th = tcp_hdr(skb);
51 thlen = th->doff * 4; 70 thlen = th->doff * 4;
52 if (thlen < sizeof(*th)) 71 if (thlen < sizeof(*th))
@@ -269,54 +288,16 @@ int tcp_gro_complete(struct sk_buff *skb)
269} 288}
270EXPORT_SYMBOL(tcp_gro_complete); 289EXPORT_SYMBOL(tcp_gro_complete);
271 290
272static int tcp_v4_gso_send_check(struct sk_buff *skb)
273{
274 const struct iphdr *iph;
275 struct tcphdr *th;
276
277 if (!pskb_may_pull(skb, sizeof(*th)))
278 return -EINVAL;
279
280 iph = ip_hdr(skb);
281 th = tcp_hdr(skb);
282
283 th->check = 0;
284 skb->ip_summed = CHECKSUM_PARTIAL;
285 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
286 return 0;
287}
288
289static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 291static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
290{ 292{
291 /* Use the IP hdr immediately proceeding for this transport */
292 const struct iphdr *iph = skb_gro_network_header(skb);
293 __wsum wsum;
294
295 /* Don't bother verifying checksum if we're going to flush anyway. */ 293 /* Don't bother verifying checksum if we're going to flush anyway. */
296 if (NAPI_GRO_CB(skb)->flush) 294 if (!NAPI_GRO_CB(skb)->flush &&
297 goto skip_csum; 295 skb_gro_checksum_validate(skb, IPPROTO_TCP,
298 296 inet_gro_compute_pseudo)) {
299 wsum = NAPI_GRO_CB(skb)->csum;
300
301 switch (skb->ip_summed) {
302 case CHECKSUM_NONE:
303 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
304 0);
305
306 /* fall through */
307
308 case CHECKSUM_COMPLETE:
309 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
310 wsum)) {
311 skb->ip_summed = CHECKSUM_UNNECESSARY;
312 break;
313 }
314
315 NAPI_GRO_CB(skb)->flush = 1; 297 NAPI_GRO_CB(skb)->flush = 1;
316 return NULL; 298 return NULL;
317 } 299 }
318 300
319skip_csum:
320 return tcp_gro_receive(head, skb); 301 return tcp_gro_receive(head, skb);
321} 302}
322 303
@@ -334,8 +315,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
334 315
335static const struct net_offload tcpv4_offload = { 316static const struct net_offload tcpv4_offload = {
336 .callbacks = { 317 .callbacks = {
337 .gso_send_check = tcp_v4_gso_send_check, 318 .gso_segment = tcp4_gso_segment,
338 .gso_segment = tcp_gso_segment,
339 .gro_receive = tcp4_gro_receive, 319 .gro_receive = tcp4_gro_receive,
340 .gro_complete = tcp4_gro_complete, 320 .gro_complete = tcp4_gro_complete,
341 }, 321 },
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5a7c41fbc6d3..8d4eac793700 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,36 +318,47 @@ static u16 tcp_select_window(struct sock *sk)
318} 318}
319 319
320/* Packet ECN state for a SYN-ACK */ 320/* Packet ECN state for a SYN-ACK */
321static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) 321static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
322{ 322{
323 const struct tcp_sock *tp = tcp_sk(sk);
324
323 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
324 if (!(tp->ecn_flags & TCP_ECN_OK)) 326 if (!(tp->ecn_flags & TCP_ECN_OK))
325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 327 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
328 else if (tcp_ca_needs_ecn(sk))
329 INET_ECN_xmit(sk);
326} 330}
327 331
328/* Packet ECN state for a SYN. */ 332/* Packet ECN state for a SYN. */
329static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) 333static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
330{ 334{
331 struct tcp_sock *tp = tcp_sk(sk); 335 struct tcp_sock *tp = tcp_sk(sk);
332 336
333 tp->ecn_flags = 0; 337 tp->ecn_flags = 0;
334 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { 338 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
339 tcp_ca_needs_ecn(sk)) {
335 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 340 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
336 tp->ecn_flags = TCP_ECN_OK; 341 tp->ecn_flags = TCP_ECN_OK;
342 if (tcp_ca_needs_ecn(sk))
343 INET_ECN_xmit(sk);
337 } 344 }
338} 345}
339 346
340static __inline__ void 347static void
341TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) 348tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
349 struct sock *sk)
342{ 350{
343 if (inet_rsk(req)->ecn_ok) 351 if (inet_rsk(req)->ecn_ok) {
344 th->ece = 1; 352 th->ece = 1;
353 if (tcp_ca_needs_ecn(sk))
354 INET_ECN_xmit(sk);
355 }
345} 356}
346 357
347/* Set up ECN state for a packet on a ESTABLISHED socket that is about to 358/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
348 * be sent. 359 * be sent.
349 */ 360 */
350static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, 361static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
351 int tcp_header_len) 362 int tcp_header_len)
352{ 363{
353 struct tcp_sock *tp = tcp_sk(sk); 364 struct tcp_sock *tp = tcp_sk(sk);
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
362 tcp_hdr(skb)->cwr = 1; 373 tcp_hdr(skb)->cwr = 1;
363 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; 374 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
364 } 375 }
365 } else { 376 } else if (!tcp_ca_needs_ecn(sk)) {
366 /* ACK or retransmitted segment: clear ECT|CE */ 377 /* ACK or retransmitted segment: clear ECT|CE */
367 INET_ECN_dontxmit(sk); 378 INET_ECN_dontxmit(sk);
368 } 379 }
@@ -384,7 +395,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
384 TCP_SKB_CB(skb)->tcp_flags = flags; 395 TCP_SKB_CB(skb)->tcp_flags = flags;
385 TCP_SKB_CB(skb)->sacked = 0; 396 TCP_SKB_CB(skb)->sacked = 0;
386 397
387 shinfo->gso_segs = 1; 398 tcp_skb_pcount_set(skb, 1);
388 shinfo->gso_size = 0; 399 shinfo->gso_size = 0;
389 shinfo->gso_type = 0; 400 shinfo->gso_type = 0;
390 401
@@ -550,7 +561,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
550 561
551 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { 562 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
552 opts->options |= OPTION_TS; 563 opts->options |= OPTION_TS;
553 opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; 564 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
554 opts->tsecr = tp->rx_opt.ts_recent; 565 opts->tsecr = tp->rx_opt.ts_recent;
555 remaining -= TCPOLEN_TSTAMP_ALIGNED; 566 remaining -= TCPOLEN_TSTAMP_ALIGNED;
556 } 567 }
@@ -618,7 +629,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
618 } 629 }
619 if (likely(ireq->tstamp_ok)) { 630 if (likely(ireq->tstamp_ok)) {
620 opts->options |= OPTION_TS; 631 opts->options |= OPTION_TS;
621 opts->tsval = TCP_SKB_CB(skb)->when; 632 opts->tsval = tcp_skb_timestamp(skb);
622 opts->tsecr = req->ts_recent; 633 opts->tsecr = req->ts_recent;
623 remaining -= TCPOLEN_TSTAMP_ALIGNED; 634 remaining -= TCPOLEN_TSTAMP_ALIGNED;
624 } 635 }
@@ -647,7 +658,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
647 struct tcp_out_options *opts, 658 struct tcp_out_options *opts,
648 struct tcp_md5sig_key **md5) 659 struct tcp_md5sig_key **md5)
649{ 660{
650 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
651 struct tcp_sock *tp = tcp_sk(sk); 661 struct tcp_sock *tp = tcp_sk(sk);
652 unsigned int size = 0; 662 unsigned int size = 0;
653 unsigned int eff_sacks; 663 unsigned int eff_sacks;
@@ -666,7 +676,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
666 676
667 if (likely(tp->rx_opt.tstamp_ok)) { 677 if (likely(tp->rx_opt.tstamp_ok)) {
668 opts->options |= OPTION_TS; 678 opts->options |= OPTION_TS;
669 opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; 679 opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
670 opts->tsecr = tp->rx_opt.ts_recent; 680 opts->tsecr = tp->rx_opt.ts_recent;
671 size += TCPOLEN_TSTAMP_ALIGNED; 681 size += TCPOLEN_TSTAMP_ALIGNED;
672 } 682 }
@@ -886,8 +896,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
886 skb = skb_clone(skb, gfp_mask); 896 skb = skb_clone(skb, gfp_mask);
887 if (unlikely(!skb)) 897 if (unlikely(!skb))
888 return -ENOBUFS; 898 return -ENOBUFS;
889 /* Our usage of tstamp should remain private */
890 skb->tstamp.tv64 = 0;
891 } 899 }
892 900
893 inet = inet_sk(sk); 901 inet = inet_sk(sk);
@@ -952,7 +960,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
952 960
953 tcp_options_write((__be32 *)(th + 1), tp, &opts); 961 tcp_options_write((__be32 *)(th + 1), tp, &opts);
954 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) 962 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
955 TCP_ECN_send(sk, skb, tcp_header_size); 963 tcp_ecn_send(sk, skb, tcp_header_size);
956 964
957#ifdef CONFIG_TCP_MD5SIG 965#ifdef CONFIG_TCP_MD5SIG
958 /* Calculate the MD5 hash, as we have all we need now */ 966 /* Calculate the MD5 hash, as we have all we need now */
@@ -975,7 +983,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
975 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, 983 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
976 tcp_skb_pcount(skb)); 984 tcp_skb_pcount(skb));
977 985
986 /* OK, its time to fill skb_shinfo(skb)->gso_segs */
987 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
988
989 /* Our usage of tstamp should remain private */
990 skb->tstamp.tv64 = 0;
991
992 /* Cleanup our debris for IP stacks */
993 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
994 sizeof(struct inet6_skb_parm)));
995
978 err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); 996 err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
997
979 if (likely(err <= 0)) 998 if (likely(err <= 0))
980 return err; 999 return err;
981 1000
@@ -995,7 +1014,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
995 1014
996 /* Advance write_seq and place onto the write_queue. */ 1015 /* Advance write_seq and place onto the write_queue. */
997 tp->write_seq = TCP_SKB_CB(skb)->end_seq; 1016 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
998 skb_header_release(skb); 1017 __skb_header_release(skb);
999 tcp_add_write_queue_tail(sk, skb); 1018 tcp_add_write_queue_tail(sk, skb);
1000 sk->sk_wmem_queued += skb->truesize; 1019 sk->sk_wmem_queued += skb->truesize;
1001 sk_mem_charge(sk, skb->truesize); 1020 sk_mem_charge(sk, skb->truesize);
@@ -1014,11 +1033,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
1014 /* Avoid the costly divide in the normal 1033 /* Avoid the costly divide in the normal
1015 * non-TSO case. 1034 * non-TSO case.
1016 */ 1035 */
1017 shinfo->gso_segs = 1; 1036 tcp_skb_pcount_set(skb, 1);
1018 shinfo->gso_size = 0; 1037 shinfo->gso_size = 0;
1019 shinfo->gso_type = 0; 1038 shinfo->gso_type = 0;
1020 } else { 1039 } else {
1021 shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); 1040 tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1022 shinfo->gso_size = mss_now; 1041 shinfo->gso_size = mss_now;
1023 shinfo->gso_type = sk->sk_gso_type; 1042 shinfo->gso_type = sk->sk_gso_type;
1024 } 1043 }
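tcp_skb_pcount_set() now hides where the segment count is stored, but the count is still the ceiling of length over MSS. DIV_ROUND_UP is the usual kernel round-up division, so for example a 4000 byte skb with a 1460 byte MSS counts as 3 segments:

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
	/* DIV_ROUND_UP(4000, 1460) == 3 */
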
@@ -1146,10 +1165,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1146 1165
1147 buff->ip_summed = skb->ip_summed; 1166 buff->ip_summed = skb->ip_summed;
1148 1167
1149 /* Looks stupid, but our code really uses when of
1150 * skbs, which it never sent before. --ANK
1151 */
1152 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
1153 buff->tstamp = skb->tstamp; 1168 buff->tstamp = skb->tstamp;
1154 tcp_fragment_tstamp(skb, buff); 1169 tcp_fragment_tstamp(skb, buff);
1155 1170
@@ -1171,7 +1186,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1171 } 1186 }
1172 1187
1173 /* Link BUFF into the send queue. */ 1188 /* Link BUFF into the send queue. */
1174 skb_header_release(buff); 1189 __skb_header_release(buff);
1175 tcp_insert_write_queue_after(skb, buff, sk); 1190 tcp_insert_write_queue_after(skb, buff, sk);
1176 1191
1177 return 0; 1192 return 0;
@@ -1675,7 +1690,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1675 tcp_set_skb_tso_segs(sk, buff, mss_now); 1690 tcp_set_skb_tso_segs(sk, buff, mss_now);
1676 1691
1677 /* Link BUFF into the send queue. */ 1692 /* Link BUFF into the send queue. */
1678 skb_header_release(buff); 1693 __skb_header_release(buff);
1679 tcp_insert_write_queue_after(skb, buff, sk); 1694 tcp_insert_write_queue_after(skb, buff, sk);
1680 1695
1681 return 0; 1696 return 0;
@@ -1874,8 +1889,8 @@ static int tcp_mtu_probe(struct sock *sk)
1874 tcp_init_tso_segs(sk, nskb, nskb->len); 1889 tcp_init_tso_segs(sk, nskb, nskb->len);
1875 1890
1876 /* We're ready to send. If this fails, the probe will 1891 /* We're ready to send. If this fails, the probe will
1877 * be resegmented into mss-sized pieces by tcp_write_xmit(). */ 1892 * be resegmented into mss-sized pieces by tcp_write_xmit().
1878 TCP_SKB_CB(nskb)->when = tcp_time_stamp; 1893 */
1879 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { 1894 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1880 /* Decrement cwnd here because we are sending 1895 /* Decrement cwnd here because we are sending
1881 * effectively two packets. */ 1896 * effectively two packets. */
@@ -1935,8 +1950,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1935 BUG_ON(!tso_segs); 1950 BUG_ON(!tso_segs);
1936 1951
1937 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { 1952 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
1938 /* "when" is used as a start point for the retransmit timer */ 1953 /* "skb_mstamp" is used as a start point for the retransmit timer */
1939 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1954 skb_mstamp_get(&skb->skb_mstamp);
1940 goto repair; /* Skip network transmission */ 1955 goto repair; /* Skip network transmission */
1941 } 1956 }
1942 1957
@@ -2000,8 +2015,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2000 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 2015 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2001 break; 2016 break;
2002 2017
2003 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2004
2005 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) 2018 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2006 break; 2019 break;
2007 2020
@@ -2097,10 +2110,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2097static bool skb_still_in_host_queue(const struct sock *sk, 2110static bool skb_still_in_host_queue(const struct sock *sk,
2098 const struct sk_buff *skb) 2111 const struct sk_buff *skb)
2099{ 2112{
2100 const struct sk_buff *fclone = skb + 1; 2113 if (unlikely(skb_fclone_busy(skb))) {
2101
2102 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
2103 fclone->fclone == SKB_FCLONE_CLONE)) {
2104 NET_INC_STATS_BH(sock_net(sk), 2114 NET_INC_STATS_BH(sock_net(sk),
2105 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); 2115 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2106 return true; 2116 return true;
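The open-coded test on the fast-clone companion becomes skb_fclone_busy(). Going by the removed lines, the helper is assumed to repackage the same condition: an SKB_FCLONE_ORIG skb whose companion clone is still checked out.

	/* Assumed equivalent of the removed open-coded check. */
	static inline bool skb_fclone_busy(const struct sk_buff *skb)
	{
		const struct sk_buff *fclone = skb + 1;

		return skb->fclone == SKB_FCLONE_ORIG &&
		       fclone->fclone == SKB_FCLONE_CLONE;
	}
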
@@ -2499,7 +2509,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2499 /* Make a copy, if the first transmission SKB clone we made 2509 /* Make a copy, if the first transmission SKB clone we made
2500 * is still in somebody's hands, else make a clone. 2510 * is still in somebody's hands, else make a clone.
2501 */ 2511 */
2502 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2503 2512
2504 /* make sure skb->data is aligned on arches that require it 2513 /* make sure skb->data is aligned on arches that require it
2505 * and check if ack-trimming & collapsing extended the headroom 2514 * and check if ack-trimming & collapsing extended the headroom
@@ -2544,7 +2553,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2544 2553
2545 /* Save stamp of the first retransmit. */ 2554 /* Save stamp of the first retransmit. */
2546 if (!tp->retrans_stamp) 2555 if (!tp->retrans_stamp)
2547 tp->retrans_stamp = TCP_SKB_CB(skb)->when; 2556 tp->retrans_stamp = tcp_skb_timestamp(skb);
2548 2557
2549 /* snd_nxt is stored to detect loss of retransmitted segment, 2558 /* snd_nxt is stored to detect loss of retransmitted segment,
2550 * see tcp_input.c tcp_sacktag_write_queue(). 2559 * see tcp_input.c tcp_sacktag_write_queue().
@@ -2752,7 +2761,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2752 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), 2761 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2753 TCPHDR_ACK | TCPHDR_RST); 2762 TCPHDR_ACK | TCPHDR_RST);
2754 /* Send it off. */ 2763 /* Send it off. */
2755 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2756 if (tcp_transmit_skb(sk, skb, 0, priority)) 2764 if (tcp_transmit_skb(sk, skb, 0, priority))
2757 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); 2765 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2758 2766
@@ -2780,7 +2788,7 @@ int tcp_send_synack(struct sock *sk)
2780 if (nskb == NULL) 2788 if (nskb == NULL)
2781 return -ENOMEM; 2789 return -ENOMEM;
2782 tcp_unlink_write_queue(skb, sk); 2790 tcp_unlink_write_queue(skb, sk);
2783 skb_header_release(nskb); 2791 __skb_header_release(nskb);
2784 __tcp_add_write_queue_head(sk, nskb); 2792 __tcp_add_write_queue_head(sk, nskb);
2785 sk_wmem_free_skb(sk, skb); 2793 sk_wmem_free_skb(sk, skb);
2786 sk->sk_wmem_queued += nskb->truesize; 2794 sk->sk_wmem_queued += nskb->truesize;
@@ -2789,9 +2797,8 @@ int tcp_send_synack(struct sock *sk)
2789 } 2797 }
2790 2798
2791 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2799 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2792 TCP_ECN_send_synack(tcp_sk(sk), skb); 2800 tcp_ecn_send_synack(sk, skb);
2793 } 2801 }
2794 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2795 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2802 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2796} 2803}
2797 2804
@@ -2835,10 +2842,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2835 memset(&opts, 0, sizeof(opts)); 2842 memset(&opts, 0, sizeof(opts));
2836#ifdef CONFIG_SYN_COOKIES 2843#ifdef CONFIG_SYN_COOKIES
2837 if (unlikely(req->cookie_ts)) 2844 if (unlikely(req->cookie_ts))
2838 TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); 2845 skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
2839 else 2846 else
2840#endif 2847#endif
2841 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2848 skb_mstamp_get(&skb->skb_mstamp);
2842 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, 2849 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
2843 foc) + sizeof(*th); 2850 foc) + sizeof(*th);
2844 2851
@@ -2849,7 +2856,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2849 memset(th, 0, sizeof(struct tcphdr)); 2856 memset(th, 0, sizeof(struct tcphdr));
2850 th->syn = 1; 2857 th->syn = 1;
2851 th->ack = 1; 2858 th->ack = 1;
2852 TCP_ECN_make_synack(req, th); 2859 tcp_ecn_make_synack(req, th, sk);
2853 th->source = htons(ireq->ir_num); 2860 th->source = htons(ireq->ir_num);
2854 th->dest = ireq->ir_rmt_port; 2861 th->dest = ireq->ir_rmt_port;
2855 /* Setting of flags are superfluous here for callers (and ECE is 2862 /* Setting of flags are superfluous here for callers (and ECE is
@@ -2956,7 +2963,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
2956 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 2963 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2957 2964
2958 tcb->end_seq += skb->len; 2965 tcb->end_seq += skb->len;
2959 skb_header_release(skb); 2966 __skb_header_release(skb);
2960 __tcp_add_write_queue_tail(sk, skb); 2967 __tcp_add_write_queue_tail(sk, skb);
2961 sk->sk_wmem_queued += skb->truesize; 2968 sk->sk_wmem_queued += skb->truesize;
2962 sk_mem_charge(sk, skb->truesize); 2969 sk_mem_charge(sk, skb->truesize);
@@ -3086,9 +3093,9 @@ int tcp_connect(struct sock *sk)
3086 skb_reserve(buff, MAX_TCP_HEADER); 3093 skb_reserve(buff, MAX_TCP_HEADER);
3087 3094
3088 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 3095 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3089 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; 3096 tp->retrans_stamp = tcp_time_stamp;
3090 tcp_connect_queue_skb(sk, buff); 3097 tcp_connect_queue_skb(sk, buff);
3091 TCP_ECN_send_syn(sk, buff); 3098 tcp_ecn_send_syn(sk, buff);
3092 3099
3093 /* Send off SYN; include data in Fast Open. */ 3100 /* Send off SYN; include data in Fast Open. */
3094 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : 3101 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3120,6 +3127,8 @@ void tcp_send_delayed_ack(struct sock *sk)
3120 int ato = icsk->icsk_ack.ato; 3127 int ato = icsk->icsk_ack.ato;
3121 unsigned long timeout; 3128 unsigned long timeout;
3122 3129
3130 tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
3131
3123 if (ato > TCP_DELACK_MIN) { 3132 if (ato > TCP_DELACK_MIN) {
3124 const struct tcp_sock *tp = tcp_sk(sk); 3133 const struct tcp_sock *tp = tcp_sk(sk);
3125 int max_ato = HZ / 2; 3134 int max_ato = HZ / 2;
@@ -3176,6 +3185,8 @@ void tcp_send_ack(struct sock *sk)
3176 if (sk->sk_state == TCP_CLOSE) 3185 if (sk->sk_state == TCP_CLOSE)
3177 return; 3186 return;
3178 3187
3188 tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
3189
3179 /* We are not putting this on the write queue, so 3190 /* We are not putting this on the write queue, so
3180 * tcp_transmit_skb() will set the ownership to this 3191 * tcp_transmit_skb() will set the ownership to this
3181 * sock. 3192 * sock.
@@ -3194,9 +3205,10 @@ void tcp_send_ack(struct sock *sk)
3194 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); 3205 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3195 3206
3196 /* Send it off, this clears delayed acks for us. */ 3207 /* Send it off, this clears delayed acks for us. */
3197 TCP_SKB_CB(buff)->when = tcp_time_stamp; 3208 skb_mstamp_get(&buff->skb_mstamp);
3198 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); 3209 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
3199} 3210}
3211EXPORT_SYMBOL_GPL(tcp_send_ack);
3200 3212
3201/* This routine sends a packet with an out of date sequence 3213/* This routine sends a packet with an out of date sequence
3202 * number. It assumes the other end will try to ack it. 3214 * number. It assumes the other end will try to ack it.
@@ -3226,7 +3238,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3226 * send it. 3238 * send it.
3227 */ 3239 */
3228 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); 3240 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3229 TCP_SKB_CB(skb)->when = tcp_time_stamp; 3241 skb_mstamp_get(&skb->skb_mstamp);
3230 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 3242 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
3231} 3243}
3232 3244
@@ -3270,7 +3282,6 @@ int tcp_write_wakeup(struct sock *sk)
3270 tcp_set_skb_tso_segs(sk, skb, mss); 3282 tcp_set_skb_tso_segs(sk, skb, mss);
3271 3283
3272 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 3284 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3273 TCP_SKB_CB(skb)->when = tcp_time_stamp;
3274 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 3285 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3275 if (!err) 3286 if (!err)
3276 tcp_event_new_data_sent(sk, skb); 3287 tcp_event_new_data_sent(sk, skb);
@@ -3289,6 +3300,7 @@ void tcp_send_probe0(struct sock *sk)
3289{ 3300{
3290 struct inet_connection_sock *icsk = inet_csk(sk); 3301 struct inet_connection_sock *icsk = inet_csk(sk);
3291 struct tcp_sock *tp = tcp_sk(sk); 3302 struct tcp_sock *tp = tcp_sk(sk);
3303 unsigned long probe_max;
3292 int err; 3304 int err;
3293 3305
3294 err = tcp_write_wakeup(sk); 3306 err = tcp_write_wakeup(sk);
@@ -3304,9 +3316,7 @@ void tcp_send_probe0(struct sock *sk)
3304 if (icsk->icsk_backoff < sysctl_tcp_retries2) 3316 if (icsk->icsk_backoff < sysctl_tcp_retries2)
3305 icsk->icsk_backoff++; 3317 icsk->icsk_backoff++;
3306 icsk->icsk_probes_out++; 3318 icsk->icsk_probes_out++;
3307 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 3319 probe_max = TCP_RTO_MAX;
3308 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
3309 TCP_RTO_MAX);
3310 } else { 3320 } else {
3311 /* If packet was not sent due to local congestion, 3321 /* If packet was not sent due to local congestion,
3312 * do not backoff and do not remember icsk_probes_out. 3322 * do not backoff and do not remember icsk_probes_out.
@@ -3316,11 +3326,11 @@ void tcp_send_probe0(struct sock *sk)
3316 */ 3326 */
3317 if (!icsk->icsk_probes_out) 3327 if (!icsk->icsk_probes_out)
3318 icsk->icsk_probes_out = 1; 3328 icsk->icsk_probes_out = 1;
3319 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 3329 probe_max = TCP_RESOURCE_PROBE_INTERVAL;
3320 min(icsk->icsk_rto << icsk->icsk_backoff,
3321 TCP_RESOURCE_PROBE_INTERVAL),
3322 TCP_RTO_MAX);
3323 } 3330 }
3331 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3332 inet_csk_rto_backoff(icsk, probe_max),
3333 TCP_RTO_MAX);
3324} 3334}
3325 3335
3326int tcp_rtx_synack(struct sock *sk, struct request_sock *req) 3336int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
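The two inet_csk_reset_xmit_timer() calls collapse into one, with the exponential backoff moved into inet_csk_rto_backoff(). Judging from the removed min() expressions, the helper is assumed to be the familiar capped left shift:

	/* Assumed behaviour, mirroring the removed expressions. */
	static inline unsigned long
	inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
			     unsigned long max_when)
	{
		u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff;

		return (unsigned long)min_t(u64, when, max_when);
	}
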
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 3b66610d4156..ebf5ff57526e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -83,7 +83,6 @@ static struct {
83 struct tcp_log *log; 83 struct tcp_log *log;
84} tcp_probe; 84} tcp_probe;
85 85
86
87static inline int tcp_probe_used(void) 86static inline int tcp_probe_used(void)
88{ 87{
89 return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); 88 return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
@@ -101,7 +100,6 @@ static inline int tcp_probe_avail(void)
101 si4.sin_addr.s_addr = inet->inet_##mem##addr; \ 100 si4.sin_addr.s_addr = inet->inet_##mem##addr; \
102 } while (0) \ 101 } while (0) \
103 102
104
105/* 103/*
106 * Hook inserted to be called before each receive packet. 104 * Hook inserted to be called before each receive packet.
107 * Note: arguments must match tcp_rcv_established()! 105 * Note: arguments must match tcp_rcv_established()!
@@ -194,8 +192,8 @@ static int tcpprobe_sprint(char *tbuf, int n)
194 192
195 return scnprintf(tbuf, n, 193 return scnprintf(tbuf, n,
196 "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", 194 "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
197 (unsigned long) tv.tv_sec, 195 (unsigned long)tv.tv_sec,
198 (unsigned long) tv.tv_nsec, 196 (unsigned long)tv.tv_nsec,
199 &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, 197 &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
200 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); 198 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
201} 199}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 8250949b8853..6824afb65d93 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -31,10 +31,10 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
31static u32 tcp_scalable_ssthresh(struct sock *sk) 31static u32 tcp_scalable_ssthresh(struct sock *sk)
32{ 32{
33 const struct tcp_sock *tp = tcp_sk(sk); 33 const struct tcp_sock *tp = tcp_sk(sk);
34
34 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); 35 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
35} 36}
36 37
37
38static struct tcp_congestion_ops tcp_scalable __read_mostly = { 38static struct tcp_congestion_ops tcp_scalable __read_mostly = {
39 .ssthresh = tcp_scalable_ssthresh, 39 .ssthresh = tcp_scalable_ssthresh,
40 .cong_avoid = tcp_scalable_cong_avoid, 40 .cong_avoid = tcp_scalable_cong_avoid,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index df90cd1ce37f..9b21ae8b2e31 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk)
52 * limit. 52 * limit.
53 * 2. If we have strong memory pressure. 53 * 2. If we have strong memory pressure.
54 */ 54 */
55static int tcp_out_of_resources(struct sock *sk, int do_reset) 55static int tcp_out_of_resources(struct sock *sk, bool do_reset)
56{ 56{
57 struct tcp_sock *tp = tcp_sk(sk); 57 struct tcp_sock *tp = tcp_sk(sk);
58 int shift = 0; 58 int shift = 0;
@@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
72 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || 72 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
73 /* 2. Window is closed. */ 73 /* 2. Window is closed. */
74 (!tp->snd_wnd && !tp->packets_out)) 74 (!tp->snd_wnd && !tp->packets_out))
75 do_reset = 1; 75 do_reset = true;
76 if (do_reset) 76 if (do_reset)
77 tcp_send_active_reset(sk, GFP_ATOMIC); 77 tcp_send_active_reset(sk, GFP_ATOMIC);
78 tcp_done(sk); 78 tcp_done(sk);
@@ -135,10 +135,9 @@ static bool retransmits_timed_out(struct sock *sk,
135 if (!inet_csk(sk)->icsk_retransmits) 135 if (!inet_csk(sk)->icsk_retransmits)
136 return false; 136 return false;
137 137
138 if (unlikely(!tcp_sk(sk)->retrans_stamp)) 138 start_ts = tcp_sk(sk)->retrans_stamp;
139 start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; 139 if (unlikely(!start_ts))
140 else 140 start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
141 start_ts = tcp_sk(sk)->retrans_stamp;
142 141
143 if (likely(timeout == 0)) { 142 if (likely(timeout == 0)) {
144 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 143 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -181,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk)
181 180
182 retry_until = sysctl_tcp_retries2; 181 retry_until = sysctl_tcp_retries2;
183 if (sock_flag(sk, SOCK_DEAD)) { 182 if (sock_flag(sk, SOCK_DEAD)) {
184 const int alive = (icsk->icsk_rto < TCP_RTO_MAX); 183 const int alive = icsk->icsk_rto < TCP_RTO_MAX;
185 184
186 retry_until = tcp_orphan_retries(sk, alive); 185 retry_until = tcp_orphan_retries(sk, alive);
187 do_reset = alive || 186 do_reset = alive ||
@@ -271,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk)
271 struct inet_connection_sock *icsk = inet_csk(sk); 270 struct inet_connection_sock *icsk = inet_csk(sk);
272 struct tcp_sock *tp = tcp_sk(sk); 271 struct tcp_sock *tp = tcp_sk(sk);
273 int max_probes; 272 int max_probes;
273 u32 start_ts;
274 274
275 if (tp->packets_out || !tcp_send_head(sk)) { 275 if (tp->packets_out || !tcp_send_head(sk)) {
276 icsk->icsk_probes_out = 0; 276 icsk->icsk_probes_out = 0;
277 return; 277 return;
278 } 278 }
279 279
280 /* *WARNING* RFC 1122 forbids this 280 /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
281 * 281 * long as the receiver continues to respond to probes. We support this by
282 * It doesn't AFAIK, because we kill the retransmit timer -AK 282 * default and reset icsk_probes_out with incoming ACKs. But if the
283 * 283 * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
284 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing 284 * kill the socket when the retry count and the time exceed the
285 * this behaviour in Solaris down as a bug fix. [AC] 285 * corresponding system limit. We also implement a similar policy when
286 * 286 * we use RTO to probe window in tcp_retransmit_timer().
287 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
288 * even if they advertise zero window. Hence, connection is killed only
289 * if we received no ACKs for normal connection timeout. It is not killed
290 * only because window stays zero for some time, window may be zero
291 * until armageddon and even later. We are in full accordance
292 * with RFCs, only probe timer combines both retransmission timeout
293 * and probe timeout in one bottle. --ANK
294 */ 287 */
295 max_probes = sysctl_tcp_retries2; 288 start_ts = tcp_skb_timestamp(tcp_send_head(sk));
289 if (!start_ts)
290 skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
291 else if (icsk->icsk_user_timeout &&
292 (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
293 goto abort;
296 294
295 max_probes = sysctl_tcp_retries2;
297 if (sock_flag(sk, SOCK_DEAD)) { 296 if (sock_flag(sk, SOCK_DEAD)) {
298 const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); 297 const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
299 298
300 max_probes = tcp_orphan_retries(sk, alive); 299 max_probes = tcp_orphan_retries(sk, alive);
301 300 if (!alive && icsk->icsk_backoff >= max_probes)
302 if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) 301 goto abort;
302 if (tcp_out_of_resources(sk, true))
303 return; 303 return;
304 } 304 }
305 305
306 if (icsk->icsk_probes_out > max_probes) { 306 if (icsk->icsk_probes_out > max_probes) {
307 tcp_write_err(sk); 307abort: tcp_write_err(sk);
308 } else { 308 } else {
309 /* Only send another probe if we didn't close things up. */ 309 /* Only send another probe if we didn't close things up. */
310 tcp_send_probe0(sk); 310 tcp_send_probe0(sk);
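With this rewrite, TCP_USER_TIMEOUT also bounds zero-window probing, so a peer that keeps acknowledging probes without ever opening its window can no longer hold a connection open forever. From user space the knob is the usual socket option, in milliseconds; a hypothetical setter for illustration (the helper name is made up):

	#include <sys/socket.h>
	#include <netinet/tcp.h>

	/* Hypothetical helper: 0 restores the kernel default. */
	static int set_user_timeout(int fd, unsigned int timeout_ms)
	{
		return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
				  &timeout_ms, sizeof(timeout_ms));
	}
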
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index b40ad897f945..a6afde666ab1 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -51,7 +51,6 @@ MODULE_PARM_DESC(beta, "upper bound of packets in network");
51module_param(gamma, int, 0644); 51module_param(gamma, int, 0644);
52MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); 52MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
53 53
54
55/* There are several situations when we must "re-start" Vegas: 54/* There are several situations when we must "re-start" Vegas:
56 * 55 *
57 * o when a connection is established 56 * o when a connection is established
@@ -133,7 +132,6 @@ EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
133 132
134void tcp_vegas_state(struct sock *sk, u8 ca_state) 133void tcp_vegas_state(struct sock *sk, u8 ca_state)
135{ 134{
136
137 if (ca_state == TCP_CA_Open) 135 if (ca_state == TCP_CA_Open)
138 vegas_enable(sk); 136 vegas_enable(sk);
139 else 137 else
@@ -285,7 +283,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
285 /* Use normal slow start */ 283 /* Use normal slow start */
286 else if (tp->snd_cwnd <= tp->snd_ssthresh) 284 else if (tp->snd_cwnd <= tp->snd_ssthresh)
287 tcp_slow_start(tp, acked); 285 tcp_slow_start(tp, acked);
288
289} 286}
290 287
291/* Extract info for Tcp socket info provided via netlink. */ 288/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 8276977d2c85..a4d2d2d88dca 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -175,7 +175,6 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
175 } else 175 } else
176 tp->snd_cwnd_cnt++; 176 tp->snd_cwnd_cnt++;
177 } 177 }
178
179 } 178 }
180 if (tp->snd_cwnd < 2) 179 if (tp->snd_cwnd < 2)
181 tp->snd_cwnd = 2; 180 tp->snd_cwnd = 2;
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index b94a04ae2ed5..bb63fba47d47 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -42,7 +42,6 @@ struct westwood {
42 u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ 42 u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/
43}; 43};
44 44
45
46/* TCP Westwood functions and constants */ 45/* TCP Westwood functions and constants */
47#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ 46#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
48#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ 47#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
@@ -153,7 +152,6 @@ static inline void update_rtt_min(struct westwood *w)
153 w->rtt_min = min(w->rtt, w->rtt_min); 152 w->rtt_min = min(w->rtt, w->rtt_min);
154} 153}
155 154
156
157/* 155/*
158 * @westwood_fast_bw 156 * @westwood_fast_bw
159 * It is called when we are in fast path. In particular it is called when 157 * It is called when we are in fast path. In particular it is called when
@@ -208,7 +206,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
208 return w->cumul_ack; 206 return w->cumul_ack;
209} 207}
210 208
211
212/* 209/*
213 * TCP Westwood 210 * TCP Westwood
214 * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it 211 * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
@@ -219,47 +216,51 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
219{ 216{
220 const struct tcp_sock *tp = tcp_sk(sk); 217 const struct tcp_sock *tp = tcp_sk(sk);
221 const struct westwood *w = inet_csk_ca(sk); 218 const struct westwood *w = inet_csk_ca(sk);
219
222 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); 220 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
223} 221}
224 222
223static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
224{
225 if (ack_flags & CA_ACK_SLOWPATH) {
226 struct westwood *w = inet_csk_ca(sk);
227
228 westwood_update_window(sk);
229 w->bk += westwood_acked_count(sk);
230
231 update_rtt_min(w);
232 return;
233 }
234
235 westwood_fast_bw(sk);
236}
237
225static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) 238static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
226{ 239{
227 struct tcp_sock *tp = tcp_sk(sk); 240 struct tcp_sock *tp = tcp_sk(sk);
228 struct westwood *w = inet_csk_ca(sk); 241 struct westwood *w = inet_csk_ca(sk);
229 242
230 switch (event) { 243 switch (event) {
231 case CA_EVENT_FAST_ACK:
232 westwood_fast_bw(sk);
233 break;
234
235 case CA_EVENT_COMPLETE_CWR: 244 case CA_EVENT_COMPLETE_CWR:
236 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); 245 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
237 break; 246 break;
238
239 case CA_EVENT_LOSS: 247 case CA_EVENT_LOSS:
240 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); 248 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
241 /* Update RTT_min when next ack arrives */ 249 /* Update RTT_min when next ack arrives */
242 w->reset_rtt_min = 1; 250 w->reset_rtt_min = 1;
243 break; 251 break;
244
245 case CA_EVENT_SLOW_ACK:
246 westwood_update_window(sk);
247 w->bk += westwood_acked_count(sk);
248 update_rtt_min(w);
249 break;
250
251 default: 252 default:
252 /* don't care */ 253 /* don't care */
253 break; 254 break;
254 } 255 }
255} 256}
256 257
257
258/* Extract info for Tcp socket info provided via netlink. */ 258/* Extract info for Tcp socket info provided via netlink. */
259static void tcp_westwood_info(struct sock *sk, u32 ext, 259static void tcp_westwood_info(struct sock *sk, u32 ext,
260 struct sk_buff *skb) 260 struct sk_buff *skb)
261{ 261{
262 const struct westwood *ca = inet_csk_ca(sk); 262 const struct westwood *ca = inet_csk_ca(sk);
263
263 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 264 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
264 struct tcpvegas_info info = { 265 struct tcpvegas_info info = {
265 .tcpv_enabled = 1, 266 .tcpv_enabled = 1,
@@ -271,12 +272,12 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
271 } 272 }
272} 273}
273 274
274
275static struct tcp_congestion_ops tcp_westwood __read_mostly = { 275static struct tcp_congestion_ops tcp_westwood __read_mostly = {
276 .init = tcp_westwood_init, 276 .init = tcp_westwood_init,
277 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
278 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
279 .cwnd_event = tcp_westwood_event, 279 .cwnd_event = tcp_westwood_event,
280 .in_ack_event = tcp_westwood_ack,
280 .get_info = tcp_westwood_info, 281 .get_info = tcp_westwood_info,
281 .pkts_acked = tcp_westwood_pkts_acked, 282 .pkts_acked = tcp_westwood_pkts_acked,
282 283
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 599b79b8eac0..cd7273218598 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -54,10 +54,8 @@ static void tcp_yeah_init(struct sock *sk)
54 /* Ensure the MD arithmetic works. This is somewhat pedantic, 54 /* Ensure the MD arithmetic works. This is somewhat pedantic,
55 * since I don't think we will see a cwnd this large. :) */ 55 * since I don't think we will see a cwnd this large. :) */
56 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); 56 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
57
58} 57}
59 58
60
61static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) 59static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
62{ 60{
63 const struct inet_connection_sock *icsk = inet_csk(sk); 61 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -84,7 +82,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
84 /* Scalable */ 82 /* Scalable */
85 83
86 tp->snd_cwnd_cnt += yeah->pkts_acked; 84 tp->snd_cwnd_cnt += yeah->pkts_acked;
87 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ 85 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
88 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 86 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
89 tp->snd_cwnd++; 87 tp->snd_cwnd++;
90 tp->snd_cwnd_cnt = 0; 88 tp->snd_cwnd_cnt = 0;
@@ -120,7 +118,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
120 */ 118 */
121 119
122 if (after(ack, yeah->vegas.beg_snd_nxt)) { 120 if (after(ack, yeah->vegas.beg_snd_nxt)) {
123
124 /* We do the Vegas calculations only if we got enough RTT 121 /* We do the Vegas calculations only if we got enough RTT
125 * samples that we can be reasonably sure that we got 122 * samples that we can be reasonably sure that we got
126 * at least one RTT sample that wasn't from a delayed ACK. 123 * at least one RTT sample that wasn't from a delayed ACK.
@@ -189,7 +186,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
189 } 186 }
190 187
191 yeah->lastQ = queue; 188 yeah->lastQ = queue;
192
193 } 189 }
194 190
195 /* Save the extent of the current window so we can use this 191 /* Save the extent of the current window so we can use this
@@ -205,7 +201,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
205 } 201 }
206} 202}
207 203
208static u32 tcp_yeah_ssthresh(struct sock *sk) { 204static u32 tcp_yeah_ssthresh(struct sock *sk)
205{
209 const struct tcp_sock *tp = tcp_sk(sk); 206 const struct tcp_sock *tp = tcp_sk(sk);
210 struct yeah *yeah = inet_csk_ca(sk); 207 struct yeah *yeah = inet_csk_ca(sk);
211 u32 reduction; 208 u32 reduction;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f57c0e4c2326..cd0db5471bb5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -99,6 +99,7 @@
99#include <linux/slab.h> 99#include <linux/slab.h>
100#include <net/tcp_states.h> 100#include <net/tcp_states.h>
101#include <linux/skbuff.h> 101#include <linux/skbuff.h>
102#include <linux/netdevice.h>
102#include <linux/proc_fs.h> 103#include <linux/proc_fs.h>
103#include <linux/seq_file.h> 104#include <linux/seq_file.h>
104#include <net/net_namespace.h> 105#include <net/net_namespace.h>
@@ -224,7 +225,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
224 remaining = (high - low) + 1; 225 remaining = (high - low) + 1;
225 226
226 rand = prandom_u32(); 227 rand = prandom_u32();
227 first = (((u64)rand * remaining) >> 32) + low; 228 first = reciprocal_scale(rand, remaining) + low;
228 /* 229 /*
229 * force rand to be an odd multiple of UDP_HTABLE_SIZE 230 * force rand to be an odd multiple of UDP_HTABLE_SIZE
230 */ 231 */
@@ -448,7 +449,7 @@ begin:
448 } 449 }
449 } else if (score == badness && reuseport) { 450 } else if (score == badness && reuseport) {
450 matches++; 451 matches++;
451 if (((u64)hash * matches) >> 32 == 0) 452 if (reciprocal_scale(hash, matches) == 0)
452 result = sk; 453 result = sk;
453 hash = next_pseudo_random32(hash); 454 hash = next_pseudo_random32(hash);
454 } 455 }
@@ -529,7 +530,7 @@ begin:
529 } 530 }
530 } else if (score == badness && reuseport) { 531 } else if (score == badness && reuseport) {
531 matches++; 532 matches++;
532 if (((u64)hash * matches) >> 32 == 0) 533 if (reciprocal_scale(hash, matches) == 0)
533 result = sk; 534 result = sk;
534 hash = next_pseudo_random32(hash); 535 hash = next_pseudo_random32(hash);
535 } 536 }
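Both open-coded ((u64)x * range) >> 32 reductions become reciprocal_scale(). Assuming the helper is the same multiply-and-shift the removed lines used, it maps a uniform 32-bit value into [0, ep_ro) without a division:

	/* Assumed definition, matching the removed expressions. */
	static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
	{
		return (u32)(((u64)val * ep_ro) >> 32);
	}
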
@@ -1787,6 +1788,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1787 if (sk != NULL) { 1788 if (sk != NULL) {
1788 int ret; 1789 int ret;
1789 1790
1791 if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk))
1792 skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
1793 inet_compute_pseudo);
1794
1790 ret = udp_queue_rcv_skb(sk, skb); 1795 ret = udp_queue_rcv_skb(sk, skb);
1791 sock_put(sk); 1796 sock_put(sk);
1792 1797
@@ -1967,7 +1972,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
1967 return; 1972 return;
1968 1973
1969 skb->sk = sk; 1974 skb->sk = sk;
1970 skb->destructor = sock_edemux; 1975 skb->destructor = sock_efree;
1971 dst = sk->sk_rx_dst; 1976 dst = sk->sk_rx_dst;
1972 1977
1973 if (dst) 1978 if (dst)
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 59035bc3008d..507310ef4b56 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -25,30 +25,11 @@ struct udp_offload_priv {
25 struct udp_offload_priv __rcu *next; 25 struct udp_offload_priv __rcu *next;
26}; 26};
27 27
28static int udp4_ufo_send_check(struct sk_buff *skb) 28static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
29{ 29 netdev_features_t features,
30 if (!pskb_may_pull(skb, sizeof(struct udphdr))) 30 struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
31 return -EINVAL; 31 netdev_features_t features),
32 32 __be16 new_protocol)
33 if (likely(!skb->encapsulation)) {
34 const struct iphdr *iph;
35 struct udphdr *uh;
36
37 iph = ip_hdr(skb);
38 uh = udp_hdr(skb);
39
40 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
41 IPPROTO_UDP, 0);
42 skb->csum_start = skb_transport_header(skb) - skb->head;
43 skb->csum_offset = offsetof(struct udphdr, check);
44 skb->ip_summed = CHECKSUM_PARTIAL;
45 }
46
47 return 0;
48}
49
50struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
51 netdev_features_t features)
52{ 33{
53 struct sk_buff *segs = ERR_PTR(-EINVAL); 34 struct sk_buff *segs = ERR_PTR(-EINVAL);
54 u16 mac_offset = skb->mac_header; 35 u16 mac_offset = skb->mac_header;
@@ -70,7 +51,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
70 skb_reset_mac_header(skb); 51 skb_reset_mac_header(skb);
71 skb_set_network_header(skb, skb_inner_network_offset(skb)); 52 skb_set_network_header(skb, skb_inner_network_offset(skb));
72 skb->mac_len = skb_inner_network_offset(skb); 53 skb->mac_len = skb_inner_network_offset(skb);
73 skb->protocol = htons(ETH_P_TEB); 54 skb->protocol = new_protocol;
74 55
75 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); 56 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
76 if (need_csum) 57 if (need_csum)
@@ -78,7 +59,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
78 59
79 /* segment inner packet. */ 60 /* segment inner packet. */
80 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); 61 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
81 segs = skb_mac_gso_segment(skb, enc_features); 62 segs = gso_inner_segment(skb, enc_features);
82 if (IS_ERR_OR_NULL(segs)) { 63 if (IS_ERR_OR_NULL(segs)) {
83 skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, 64 skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
84 mac_len); 65 mac_len);
@@ -123,21 +104,63 @@ out:
123 return segs; 104 return segs;
124} 105}
125 106
107struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
108 netdev_features_t features,
109 bool is_ipv6)
110{
111 __be16 protocol = skb->protocol;
112 const struct net_offload **offloads;
113 const struct net_offload *ops;
114 struct sk_buff *segs = ERR_PTR(-EINVAL);
115 struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
116 netdev_features_t features);
117
118 rcu_read_lock();
119
120 switch (skb->inner_protocol_type) {
121 case ENCAP_TYPE_ETHER:
122 protocol = skb->inner_protocol;
123 gso_inner_segment = skb_mac_gso_segment;
124 break;
125 case ENCAP_TYPE_IPPROTO:
126 offloads = is_ipv6 ? inet6_offloads : inet_offloads;
127 ops = rcu_dereference(offloads[skb->inner_ipproto]);
128 if (!ops || !ops->callbacks.gso_segment)
129 goto out_unlock;
130 gso_inner_segment = ops->callbacks.gso_segment;
131 break;
132 default:
133 goto out_unlock;
134 }
135
136 segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
137 protocol);
138
139out_unlock:
140 rcu_read_unlock();
141
142 return segs;
143}
144
126static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 145static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
127 netdev_features_t features) 146 netdev_features_t features)
128{ 147{
129 struct sk_buff *segs = ERR_PTR(-EINVAL); 148 struct sk_buff *segs = ERR_PTR(-EINVAL);
130 unsigned int mss; 149 unsigned int mss;
131 int offset;
132 __wsum csum; 150 __wsum csum;
151 struct udphdr *uh;
152 struct iphdr *iph;
133 153
134 if (skb->encapsulation && 154 if (skb->encapsulation &&
135 (skb_shinfo(skb)->gso_type & 155 (skb_shinfo(skb)->gso_type &
136 (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { 156 (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
137 segs = skb_udp_tunnel_segment(skb, features); 157 segs = skb_udp_tunnel_segment(skb, features, false);
138 goto out; 158 goto out;
139 } 159 }
140 160
161 if (!pskb_may_pull(skb, sizeof(struct udphdr)))
162 goto out;
163
141 mss = skb_shinfo(skb)->gso_size; 164 mss = skb_shinfo(skb)->gso_size;
142 if (unlikely(skb->len <= mss)) 165 if (unlikely(skb->len <= mss))
143 goto out; 166 goto out;
@@ -165,10 +188,16 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
165 * HW cannot do checksum of UDP packets sent as multiple 188 * HW cannot do checksum of UDP packets sent as multiple
166 * IP fragments. 189 * IP fragments.
167 */ 190 */
168 offset = skb_checksum_start_offset(skb); 191
169 csum = skb_checksum(skb, offset, skb->len - offset, 0); 192 uh = udp_hdr(skb);
170 offset += skb->csum_offset; 193 iph = ip_hdr(skb);
171 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 194
195 uh->check = 0;
196 csum = skb_checksum(skb, 0, skb->len, 0);
197 uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
198 if (uh->check == 0)
199 uh->check = CSUM_MANGLED_0;
200
172 skb->ip_summed = CHECKSUM_NONE; 201 skb->ip_summed = CHECKSUM_NONE;
173 202
174 /* Fragment the skb. IP headers of the fragments are updated in 203 /* Fragment the skb. IP headers of the fragments are updated in
@@ -228,30 +257,24 @@ unlock:
228} 257}
229EXPORT_SYMBOL(udp_del_offload); 258EXPORT_SYMBOL(udp_del_offload);
230 259
231static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) 260struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
261 struct udphdr *uh)
232{ 262{
233 struct udp_offload_priv *uo_priv; 263 struct udp_offload_priv *uo_priv;
234 struct sk_buff *p, **pp = NULL; 264 struct sk_buff *p, **pp = NULL;
235 struct udphdr *uh, *uh2; 265 struct udphdr *uh2;
236 unsigned int hlen, off; 266 unsigned int off = skb_gro_offset(skb);
237 int flush = 1; 267 int flush = 1;
238 268
239 if (NAPI_GRO_CB(skb)->udp_mark || 269 if (NAPI_GRO_CB(skb)->udp_mark ||
240 (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) 270 (skb->ip_summed != CHECKSUM_PARTIAL &&
271 NAPI_GRO_CB(skb)->csum_cnt == 0 &&
272 !NAPI_GRO_CB(skb)->csum_valid))
241 goto out; 273 goto out;
242 274
243 /* mark that this skb passed once through the udp gro layer */ 275 /* mark that this skb passed once through the udp gro layer */
244 NAPI_GRO_CB(skb)->udp_mark = 1; 276 NAPI_GRO_CB(skb)->udp_mark = 1;
245 277
246 off = skb_gro_offset(skb);
247 hlen = off + sizeof(*uh);
248 uh = skb_gro_header_fast(skb, off);
249 if (skb_gro_header_hard(skb, hlen)) {
250 uh = skb_gro_header_slow(skb, hlen, off);
251 if (unlikely(!uh))
252 goto out;
253 }
254
255 rcu_read_lock(); 278 rcu_read_lock();
256 uo_priv = rcu_dereference(udp_offload_base); 279 uo_priv = rcu_dereference(udp_offload_base);
257 for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { 280 for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
@@ -269,7 +292,12 @@ unflush:
269 continue; 292 continue;
270 293
271 uh2 = (struct udphdr *)(p->data + off); 294 uh2 = (struct udphdr *)(p->data + off);
272 if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { 295
 296 /* Match ports, and require that the checksums are either
 297 * both zero or both nonzero.
298 */
299 if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
300 (!uh->check ^ !uh2->check)) {
273 NAPI_GRO_CB(p)->same_flow = 0; 301 NAPI_GRO_CB(p)->same_flow = 0;
274 continue; 302 continue;
275 } 303 }
@@ -277,6 +305,7 @@ unflush:
277 305
278 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ 306 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
279 skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); 307 skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
308 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
280 pp = uo_priv->offload->callbacks.gro_receive(head, skb); 309 pp = uo_priv->offload->callbacks.gro_receive(head, skb);
281 310
282out_unlock: 311out_unlock:
@@ -286,7 +315,34 @@ out:
286 return pp; 315 return pp;
287} 316}
288 317
289static int udp_gro_complete(struct sk_buff *skb, int nhoff) 318static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
319 struct sk_buff *skb)
320{
321 struct udphdr *uh = udp_gro_udphdr(skb);
322
323 if (unlikely(!uh))
324 goto flush;
325
326 /* Don't bother verifying checksum if we're going to flush anyway. */
327 if (NAPI_GRO_CB(skb)->flush)
328 goto skip;
329
330 if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
331 inet_gro_compute_pseudo))
332 goto flush;
333 else if (uh->check)
334 skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
335 inet_gro_compute_pseudo);
336skip:
337 NAPI_GRO_CB(skb)->is_ipv6 = 0;
338 return udp_gro_receive(head, skb, uh);
339
340flush:
341 NAPI_GRO_CB(skb)->flush = 1;
342 return NULL;
343}
344
345int udp_gro_complete(struct sk_buff *skb, int nhoff)
290{ 346{
291 struct udp_offload_priv *uo_priv; 347 struct udp_offload_priv *uo_priv;
292 __be16 newlen = htons(skb->len - nhoff); 348 __be16 newlen = htons(skb->len - nhoff);
@@ -304,19 +360,32 @@ static int udp_gro_complete(struct sk_buff *skb, int nhoff)
304 break; 360 break;
305 } 361 }
306 362
307 if (uo_priv != NULL) 363 if (uo_priv != NULL) {
364 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
308 err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); 365 err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr));
366 }
309 367
310 rcu_read_unlock(); 368 rcu_read_unlock();
311 return err; 369 return err;
312} 370}
313 371
372static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
373{
374 const struct iphdr *iph = ip_hdr(skb);
375 struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
376
377 if (uh->check)
378 uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
379 iph->daddr, 0);
380
381 return udp_gro_complete(skb, nhoff);
382}
383
314static const struct net_offload udpv4_offload = { 384static const struct net_offload udpv4_offload = {
315 .callbacks = { 385 .callbacks = {
316 .gso_send_check = udp4_ufo_send_check,
317 .gso_segment = udp4_ufo_fragment, 386 .gso_segment = udp4_ufo_fragment,
318 .gro_receive = udp_gro_receive, 387 .gro_receive = udp4_gro_receive,
319 .gro_complete = udp_gro_complete, 388 .gro_complete = udp4_gro_complete,
320 }, 389 },
321}; 390};
322 391
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 61ec1a65207e..1671263e5fa0 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -8,83 +8,40 @@
8#include <net/udp_tunnel.h> 8#include <net/udp_tunnel.h>
9#include <net/net_namespace.h> 9#include <net/net_namespace.h>
10 10
11int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, 11int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
12 struct socket **sockp) 12 struct socket **sockp)
13{ 13{
14 int err = -EINVAL; 14 int err;
15 struct socket *sock = NULL; 15 struct socket *sock = NULL;
16 struct sockaddr_in udp_addr;
16 17
17#if IS_ENABLED(CONFIG_IPV6) 18 err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
18 if (cfg->family == AF_INET6) { 19 if (err < 0)
19 struct sockaddr_in6 udp6_addr; 20 goto error;
20 21
21 err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); 22 sk_change_net(sock->sk, net);
22 if (err < 0)
23 goto error;
24
25 sk_change_net(sock->sk, net);
26
27 udp6_addr.sin6_family = AF_INET6;
28 memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
29 sizeof(udp6_addr.sin6_addr));
30 udp6_addr.sin6_port = cfg->local_udp_port;
31 err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
32 sizeof(udp6_addr));
33 if (err < 0)
34 goto error;
35
36 if (cfg->peer_udp_port) {
37 udp6_addr.sin6_family = AF_INET6;
38 memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
39 sizeof(udp6_addr.sin6_addr));
40 udp6_addr.sin6_port = cfg->peer_udp_port;
41 err = kernel_connect(sock,
42 (struct sockaddr *)&udp6_addr,
43 sizeof(udp6_addr), 0);
44 }
45 if (err < 0)
46 goto error;
47 23
48 udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); 24 udp_addr.sin_family = AF_INET;
49 udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); 25 udp_addr.sin_addr = cfg->local_ip;
50 } else 26 udp_addr.sin_port = cfg->local_udp_port;
51#endif 27 err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
52 if (cfg->family == AF_INET) { 28 sizeof(udp_addr));
53 struct sockaddr_in udp_addr; 29 if (err < 0)
54 30 goto error;
55 err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
56 if (err < 0)
57 goto error;
58
59 sk_change_net(sock->sk, net);
60 31
32 if (cfg->peer_udp_port) {
61 udp_addr.sin_family = AF_INET; 33 udp_addr.sin_family = AF_INET;
62 udp_addr.sin_addr = cfg->local_ip; 34 udp_addr.sin_addr = cfg->peer_ip;
63 udp_addr.sin_port = cfg->local_udp_port; 35 udp_addr.sin_port = cfg->peer_udp_port;
64 err = kernel_bind(sock, (struct sockaddr *)&udp_addr, 36 err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
65 sizeof(udp_addr)); 37 sizeof(udp_addr), 0);
66 if (err < 0) 38 if (err < 0)
67 goto error; 39 goto error;
68
69 if (cfg->peer_udp_port) {
70 udp_addr.sin_family = AF_INET;
71 udp_addr.sin_addr = cfg->peer_ip;
72 udp_addr.sin_port = cfg->peer_udp_port;
73 err = kernel_connect(sock,
74 (struct sockaddr *)&udp_addr,
75 sizeof(udp_addr), 0);
76 if (err < 0)
77 goto error;
78 }
79
80 sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
81 } else {
82 return -EPFNOSUPPORT;
83 } 40 }
84 41
42 sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
85 43
86 *sockp = sock; 44 *sockp = sock;
87
88 return 0; 45 return 0;
89 46
90error: 47error:
@@ -95,6 +52,57 @@ error:
95 *sockp = NULL; 52 *sockp = NULL;
96 return err; 53 return err;
97} 54}
98EXPORT_SYMBOL(udp_sock_create); 55EXPORT_SYMBOL(udp_sock_create4);
56
57void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
58 struct udp_tunnel_sock_cfg *cfg)
59{
60 struct sock *sk = sock->sk;
61
62 /* Disable multicast loopback */
63 inet_sk(sk)->mc_loop = 0;
64
65 /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
66 udp_set_convert_csum(sk, true);
67
68 rcu_assign_sk_user_data(sk, cfg->sk_user_data);
69
70 udp_sk(sk)->encap_type = cfg->encap_type;
71 udp_sk(sk)->encap_rcv = cfg->encap_rcv;
72 udp_sk(sk)->encap_destroy = cfg->encap_destroy;
73
74 udp_tunnel_encap_enable(sock);
75}
76EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
77
78int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
79 struct sk_buff *skb, __be32 src, __be32 dst,
80 __u8 tos, __u8 ttl, __be16 df, __be16 src_port,
81 __be16 dst_port, bool xnet)
82{
83 struct udphdr *uh;
84
85 __skb_push(skb, sizeof(*uh));
86 skb_reset_transport_header(skb);
87 uh = udp_hdr(skb);
88
89 uh->dest = dst_port;
90 uh->source = src_port;
91 uh->len = htons(skb->len);
92
93 udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len);
94
95 return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP,
96 tos, ttl, df, xnet);
97}
98EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
99
100void udp_tunnel_sock_release(struct socket *sock)
101{
102 rcu_assign_sk_user_data(sock->sk, NULL);
103 kernel_sock_shutdown(sock, SHUT_RDWR);
104 sk_release_kernel(sock->sk);
105}
106EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
99 107
100MODULE_LICENSE("GPL"); 108MODULE_LICENSE("GPL");
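Taken together, udp_sock_create4(), setup_udp_tunnel_sock() and udp_tunnel_xmit_skb() give encapsulation modules one place to open, configure and transmit on a kernel UDP socket. A rough usage sketch for a hypothetical tunnel module, using only the fields visible in this diff; the port number and the my_encap_rcv() callback are made up for illustration:

	#include <net/udp_tunnel.h>

	#define MY_TUNNEL_PORT	5555	/* made-up port */

	/* Return 0 to tell UDP the skb was consumed here (assumed contract). */
	static int my_encap_rcv(struct sock *sk, struct sk_buff *skb)
	{
		/* hand the inner packet to the tunnel driver here */
		consume_skb(skb);
		return 0;
	}

	static int my_tunnel_open(struct net *net, struct socket **sockp)
	{
		struct udp_port_cfg port_cfg = {
			.local_udp_port		= htons(MY_TUNNEL_PORT),
			.use_udp_checksums	= true,
		};
		struct udp_tunnel_sock_cfg tnl_cfg = {
			.encap_type	= 1,	/* nonzero routes packets to encap_rcv */
			.encap_rcv	= my_encap_rcv,
		};
		int err;

		err = udp_sock_create4(net, &port_cfg, sockp);
		if (err < 0)
			return err;

		setup_udp_tunnel_sock(net, *sockp, &tnl_cfg);
		return 0;
	}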