Diffstat (limited to 'net/ipv4')
69 files changed, 3104 insertions, 1210 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index dbc10d84161f..e682b48e0709 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -309,8 +309,33 @@ config NET_IPVTI | |||
309 | 309 | ||
310 | config NET_UDP_TUNNEL | 310 | config NET_UDP_TUNNEL |
311 | tristate | 311 | tristate |
312 | select NET_IP_TUNNEL | ||
312 | default n | 313 | default n |
313 | 314 | ||
315 | config NET_FOU | ||
316 | tristate "IP: Foo (IP protocols) over UDP" | ||
317 | select XFRM | ||
318 | select NET_UDP_TUNNEL | ||
319 | ---help--- | ||
320 | Foo over UDP allows any IP protocol to be directly encapsulated | ||
321 | over UDP, including tunnels (IPIP, GRE, SIT). By encapsulating in UDP, | ||
322 | network mechanisms and optimizations for UDP (such as ECMP | ||
323 | and RSS) can be leveraged to provide better service. | ||
324 | |||
325 | config GENEVE | ||
326 | tristate "Generic Network Virtualization Encapsulation (Geneve)" | ||
327 | depends on INET | ||
328 | select NET_UDP_TUNNEL | ||
329 | ---help--- | ||
330 | This allows one to create Geneve virtual interfaces that provide | ||
331 | Layer 2 Networks over Layer 3 Networks. Geneve is often used | ||
332 | to tunnel virtual network infrastructure in virtualized environments. | ||
333 | For more information see: | ||
334 | http://tools.ietf.org/html/draft-gross-geneve-01 | ||
335 | |||
336 | To compile this driver as a module, choose M here: the module | ||
337 | will be called geneve. | ||
338 | |||
314 | config INET_AH | 339 | config INET_AH |
315 | tristate "IP: AH transformation" | 340 | tristate "IP: AH transformation" |
316 | select XFRM_ALGO | 341 | select XFRM_ALGO |
@@ -560,6 +585,27 @@ config TCP_CONG_ILLINOIS | |||
560 | For further details see: | 585 | For further details see: |
561 | http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html | 586 | http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html |
562 | 587 | ||
588 | config TCP_CONG_DCTCP | ||
589 | tristate "DataCenter TCP (DCTCP)" | ||
590 | default n | ||
591 | ---help--- | ||
592 | DCTCP leverages Explicit Congestion Notification (ECN) in the network to | ||
593 | provide multi-bit feedback to the end hosts. It is designed to provide: | ||
594 | |||
595 | - High burst tolerance (incast due to partition/aggregate), | ||
596 | - Low latency (short flows, queries), | ||
597 | - High throughput (continuous data updates, large file transfers) with | ||
598 | commodity, shallow-buffered switches. | ||
599 | |||
600 | All switches in the data center network running DCTCP must support | ||
601 | ECN marking and be configured for marking when reaching defined switch | ||
602 | buffer thresholds. The default ECN marking threshold heuristic for | ||
603 | DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets | ||
604 | (~100KB) at 10Gbps, but might need further careful tweaking. | ||
605 | |||
606 | For further details see: | ||
607 | http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
608 | |||
563 | choice | 609 | choice |
564 | prompt "Default TCP congestion control" | 610 | prompt "Default TCP congestion control" |
565 | default DEFAULT_CUBIC | 611 | default DEFAULT_CUBIC |
@@ -588,9 +634,11 @@ choice | |||
588 | config DEFAULT_WESTWOOD | 634 | config DEFAULT_WESTWOOD |
589 | bool "Westwood" if TCP_CONG_WESTWOOD=y | 635 | bool "Westwood" if TCP_CONG_WESTWOOD=y |
590 | 636 | ||
637 | config DEFAULT_DCTCP | ||
638 | bool "DCTCP" if TCP_CONG_DCTCP=y | ||
639 | |||
591 | config DEFAULT_RENO | 640 | config DEFAULT_RENO |
592 | bool "Reno" | 641 | bool "Reno" |
593 | |||
594 | endchoice | 642 | endchoice |
595 | 643 | ||
596 | endif | 644 | endif |
@@ -610,6 +658,7 @@ config DEFAULT_TCP_CONG | |||
610 | default "westwood" if DEFAULT_WESTWOOD | 658 | default "westwood" if DEFAULT_WESTWOOD |
611 | default "veno" if DEFAULT_VENO | 659 | default "veno" if DEFAULT_VENO |
612 | default "reno" if DEFAULT_RENO | 660 | default "reno" if DEFAULT_RENO |
661 | default "dctcp" if DEFAULT_DCTCP | ||
613 | default "cubic" | 662 | default "cubic" |
614 | 663 | ||
615 | config TCP_MD5SIG | 664 | config TCP_MD5SIG |
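The DCTCP Kconfig entries above only make the algorithm selectable; which control is actually used is decided by DEFAULT_TCP_CONG or per socket by the application. As a rough illustration (not part of this patch), a userspace program can request DCTCP on a single socket through the standard TCP_CONGESTION socket option; the call fails (e.g. ENOENT) if the kernel lacks CONFIG_TCP_CONG_DCTCP and the module cannot be loaded.

/* Minimal sketch: opt one TCP socket into DCTCP from userspace. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Selects the "dctcp" algorithm registered by tcp_dctcp.o below. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "dctcp",
		       strlen("dctcp")) < 0)
		perror("setsockopt(TCP_CONGESTION)");
	close(fd);
	return 0;
}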
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 8ee1cd4053ee..518c04ed666e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o | |||
20 | obj-$(CONFIG_IP_MROUTE) += ipmr.o | 20 | obj-$(CONFIG_IP_MROUTE) += ipmr.o |
21 | obj-$(CONFIG_NET_IPIP) += ipip.o | 21 | obj-$(CONFIG_NET_IPIP) += ipip.o |
22 | gre-y := gre_demux.o | 22 | gre-y := gre_demux.o |
23 | obj-$(CONFIG_NET_FOU) += fou.o | ||
23 | obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o | 24 | obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o |
24 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o | 25 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o |
25 | obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o | 26 | obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o |
@@ -42,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o | |||
42 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o | 43 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o |
43 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 44 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
44 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o | 45 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o |
46 | obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o | ||
45 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | 47 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o |
46 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | 48 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o |
47 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | 49 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o |
@@ -54,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o | |||
54 | obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o | 56 | obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o |
55 | obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o | 57 | obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o |
56 | obj-$(CONFIG_NETLABEL) += cipso_ipv4.o | 58 | obj-$(CONFIG_NETLABEL) += cipso_ipv4.o |
59 | obj-$(CONFIG_GENEVE) += geneve.o | ||
57 | 60 | ||
58 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 61 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
59 | xfrm4_output.o xfrm4_protocol.o | 62 | xfrm4_output.o xfrm4_protocol.o |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d156b3c5f363..92db7a69f2b9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -418,10 +418,6 @@ int inet_release(struct socket *sock) | |||
418 | } | 418 | } |
419 | EXPORT_SYMBOL(inet_release); | 419 | EXPORT_SYMBOL(inet_release); |
420 | 420 | ||
421 | /* It is off by default, see below. */ | ||
422 | int sysctl_ip_nonlocal_bind __read_mostly; | ||
423 | EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); | ||
424 | |||
425 | int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | 421 | int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) |
426 | { | 422 | { |
427 | struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; | 423 | struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; |
@@ -461,7 +457,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
461 | * is temporarily down) | 457 | * is temporarily down) |
462 | */ | 458 | */ |
463 | err = -EADDRNOTAVAIL; | 459 | err = -EADDRNOTAVAIL; |
464 | if (!sysctl_ip_nonlocal_bind && | 460 | if (!net->ipv4.sysctl_ip_nonlocal_bind && |
465 | !(inet->freebind || inet->transparent) && | 461 | !(inet->freebind || inet->transparent) && |
466 | addr->sin_addr.s_addr != htonl(INADDR_ANY) && | 462 | addr->sin_addr.s_addr != htonl(INADDR_ANY) && |
467 | chk_addr_ret != RTN_LOCAL && | 463 | chk_addr_ret != RTN_LOCAL && |
@@ -1201,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk) | |||
1201 | } | 1197 | } |
1202 | EXPORT_SYMBOL(inet_sk_rebuild_header); | 1198 | EXPORT_SYMBOL(inet_sk_rebuild_header); |
1203 | 1199 | ||
1204 | static int inet_gso_send_check(struct sk_buff *skb) | ||
1205 | { | ||
1206 | const struct net_offload *ops; | ||
1207 | const struct iphdr *iph; | ||
1208 | int proto; | ||
1209 | int ihl; | ||
1210 | int err = -EINVAL; | ||
1211 | |||
1212 | if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) | ||
1213 | goto out; | ||
1214 | |||
1215 | iph = ip_hdr(skb); | ||
1216 | ihl = iph->ihl * 4; | ||
1217 | if (ihl < sizeof(*iph)) | ||
1218 | goto out; | ||
1219 | |||
1220 | proto = iph->protocol; | ||
1221 | |||
1222 | /* Warning: after this point, iph might be no longer valid */ | ||
1223 | if (unlikely(!pskb_may_pull(skb, ihl))) | ||
1224 | goto out; | ||
1225 | __skb_pull(skb, ihl); | ||
1226 | |||
1227 | skb_reset_transport_header(skb); | ||
1228 | err = -EPROTONOSUPPORT; | ||
1229 | |||
1230 | ops = rcu_dereference(inet_offloads[proto]); | ||
1231 | if (likely(ops && ops->callbacks.gso_send_check)) | ||
1232 | err = ops->callbacks.gso_send_check(skb); | ||
1233 | |||
1234 | out: | ||
1235 | return err; | ||
1236 | } | ||
1237 | |||
1238 | static struct sk_buff *inet_gso_segment(struct sk_buff *skb, | 1200 | static struct sk_buff *inet_gso_segment(struct sk_buff *skb, |
1239 | netdev_features_t features) | 1201 | netdev_features_t features) |
1240 | { | 1202 | { |
@@ -1407,6 +1369,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, | |||
1407 | * immediately following this IP hdr. | 1369 | * immediately following this IP hdr. |
1408 | */ | 1370 | */ |
1409 | 1371 | ||
1372 | /* Note : No need to call skb_gro_postpull_rcsum() here, | ||
1373 | * as we already checked checksum over ipv4 header was 0 | ||
1374 | */ | ||
1410 | skb_gro_pull(skb, sizeof(*iph)); | 1375 | skb_gro_pull(skb, sizeof(*iph)); |
1411 | skb_set_transport_header(skb, skb_gro_offset(skb)); | 1376 | skb_set_transport_header(skb, skb_gro_offset(skb)); |
1412 | 1377 | ||
@@ -1659,7 +1624,6 @@ static int ipv4_proc_init(void); | |||
1659 | static struct packet_offload ip_packet_offload __read_mostly = { | 1624 | static struct packet_offload ip_packet_offload __read_mostly = { |
1660 | .type = cpu_to_be16(ETH_P_IP), | 1625 | .type = cpu_to_be16(ETH_P_IP), |
1661 | .callbacks = { | 1626 | .callbacks = { |
1662 | .gso_send_check = inet_gso_send_check, | ||
1663 | .gso_segment = inet_gso_segment, | 1627 | .gso_segment = inet_gso_segment, |
1664 | .gro_receive = inet_gro_receive, | 1628 | .gro_receive = inet_gro_receive, |
1665 | .gro_complete = inet_gro_complete, | 1629 | .gro_complete = inet_gro_complete, |
@@ -1668,8 +1632,9 @@ static struct packet_offload ip_packet_offload __read_mostly = { | |||
1668 | 1632 | ||
1669 | static const struct net_offload ipip_offload = { | 1633 | static const struct net_offload ipip_offload = { |
1670 | .callbacks = { | 1634 | .callbacks = { |
1671 | .gso_send_check = inet_gso_send_check, | ||
1672 | .gso_segment = inet_gso_segment, | 1635 | .gso_segment = inet_gso_segment, |
1636 | .gro_receive = inet_gro_receive, | ||
1637 | .gro_complete = inet_gro_complete, | ||
1673 | }, | 1638 | }, |
1674 | }; | 1639 | }; |
1675 | 1640 | ||
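With sysctl_ip_nonlocal_bind moved into struct netns_ipv4, the bind() check above now honours the setting per network namespace. For illustration only (not part of this patch), here is a hedged userspace sketch of the two ways past the EADDRNOTAVAIL check: enabling the per-netns net.ipv4.ip_nonlocal_bind sysctl, or setting IP_FREEBIND on the socket as below; the address and port are placeholders.

/* Sketch: bind to an address that is not (yet) configured locally. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in sin = { .sin_family = AF_INET,
				   .sin_port = htons(8080) };
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	inet_pton(AF_INET, "192.0.2.10", &sin.sin_addr);	/* example address */
	/* Per-socket override; the per-netns sysctl achieves the same. */
	setsockopt(fd, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one));
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		perror("bind");	/* EADDRNOTAVAIL without either override */
	return 0;
}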
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a2afa89513a0..ac9a32ec3ee4 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c | |||
@@ -505,8 +505,6 @@ static int ah_init_state(struct xfrm_state *x) | |||
505 | ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; | 505 | ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; |
506 | ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; | 506 | ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; |
507 | 507 | ||
508 | BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); | ||
509 | |||
510 | if (x->props.flags & XFRM_STATE_ALIGN4) | 508 | if (x->props.flags & XFRM_STATE_ALIGN4) |
511 | x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + | 509 | x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + |
512 | ahp->icv_trunc_len); | 510 | ahp->icv_trunc_len); |
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1a9b99e04465..16acb59d665e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c | |||
@@ -953,10 +953,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, | |||
953 | { | 953 | { |
954 | const struct arphdr *arp; | 954 | const struct arphdr *arp; |
955 | 955 | ||
956 | /* do not tweak dropwatch on an ARP we will ignore */ | ||
956 | if (dev->flags & IFF_NOARP || | 957 | if (dev->flags & IFF_NOARP || |
957 | skb->pkt_type == PACKET_OTHERHOST || | 958 | skb->pkt_type == PACKET_OTHERHOST || |
958 | skb->pkt_type == PACKET_LOOPBACK) | 959 | skb->pkt_type == PACKET_LOOPBACK) |
959 | goto freeskb; | 960 | goto consumeskb; |
960 | 961 | ||
961 | skb = skb_share_check(skb, GFP_ATOMIC); | 962 | skb = skb_share_check(skb, GFP_ATOMIC); |
962 | if (!skb) | 963 | if (!skb) |
@@ -974,6 +975,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, | |||
974 | 975 | ||
975 | return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); | 976 | return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); |
976 | 977 | ||
978 | consumeskb: | ||
979 | consume_skb(skb); | ||
980 | return 0; | ||
977 | freeskb: | 981 | freeskb: |
978 | kfree_skb(skb); | 982 | kfree_skb(skb); |
979 | out_of_mem: | 983 | out_of_mem: |
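The new consumeskb label exists so that ARP frames the stack deliberately ignores are freed with consume_skb() and therefore no longer register as drops in dropwatch or the kfree_skb tracepoint. A small kernel-style sketch of the convention this hunk follows (illustrative only, hypothetical function):

#include <linux/skbuff.h>

/* Hypothetical receive helper showing the two free paths. */
static void example_rcv(struct sk_buff *skb, bool uninteresting, bool malformed)
{
	if (uninteresting) {
		consume_skb(skb);	/* intentional discard: invisible to drop monitors */
		return;
	}
	if (malformed) {
		kfree_skb(skb);		/* real error: counted as a drop */
		return;
	}
	/* ... otherwise process the packet ... */
	consume_skb(skb);
}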
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 05b708bbdb0d..4715f25dfe03 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c | |||
@@ -246,7 +246,7 @@ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) | |||
246 | * success, negative values on error. | 246 | * success, negative values on error. |
247 | * | 247 | * |
248 | */ | 248 | */ |
249 | static int cipso_v4_cache_init(void) | 249 | static int __init cipso_v4_cache_init(void) |
250 | { | 250 | { |
251 | u32 iter; | 251 | u32 iter; |
252 | 252 | ||
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 255aa9946fe7..23104a3f2924 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c | |||
@@ -243,7 +243,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | |||
243 | u8 tos, int oif, struct net_device *dev, | 243 | u8 tos, int oif, struct net_device *dev, |
244 | int rpf, struct in_device *idev, u32 *itag) | 244 | int rpf, struct in_device *idev, u32 *itag) |
245 | { | 245 | { |
246 | int ret, no_addr, accept_local; | 246 | int ret, no_addr; |
247 | struct fib_result res; | 247 | struct fib_result res; |
248 | struct flowi4 fl4; | 248 | struct flowi4 fl4; |
249 | struct net *net; | 249 | struct net *net; |
@@ -258,16 +258,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | |||
258 | 258 | ||
259 | no_addr = idev->ifa_list == NULL; | 259 | no_addr = idev->ifa_list == NULL; |
260 | 260 | ||
261 | accept_local = IN_DEV_ACCEPT_LOCAL(idev); | ||
262 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; | 261 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; |
263 | 262 | ||
264 | net = dev_net(dev); | 263 | net = dev_net(dev); |
265 | if (fib_lookup(net, &fl4, &res)) | 264 | if (fib_lookup(net, &fl4, &res)) |
266 | goto last_resort; | 265 | goto last_resort; |
267 | if (res.type != RTN_UNICAST) { | 266 | if (res.type != RTN_UNICAST && |
268 | if (res.type != RTN_LOCAL || !accept_local) | 267 | (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) |
269 | goto e_inval; | 268 | goto e_inval; |
270 | } | 269 | if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && |
270 | (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) | ||
271 | goto last_resort; | ||
271 | fib_combine_itag(itag, &res); | 272 | fib_combine_itag(itag, &res); |
272 | dev_match = false; | 273 | dev_match = false; |
273 | 274 | ||
@@ -321,6 +322,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | |||
321 | int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); | 322 | int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); |
322 | 323 | ||
323 | if (!r && !fib_num_tclassid_users(dev_net(dev)) && | 324 | if (!r && !fib_num_tclassid_users(dev_net(dev)) && |
325 | IN_DEV_ACCEPT_LOCAL(idev) && | ||
324 | (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { | 326 | (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { |
325 | *itag = 0; | 327 | *itag = 0; |
326 | return 0; | 328 | return 0; |
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b10cd43a4722..5b6efb3d2308 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c | |||
@@ -157,9 +157,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) | |||
157 | 157 | ||
158 | static void free_nh_exceptions(struct fib_nh *nh) | 158 | static void free_nh_exceptions(struct fib_nh *nh) |
159 | { | 159 | { |
160 | struct fnhe_hash_bucket *hash = nh->nh_exceptions; | 160 | struct fnhe_hash_bucket *hash; |
161 | int i; | 161 | int i; |
162 | 162 | ||
163 | hash = rcu_dereference_protected(nh->nh_exceptions, 1); | ||
164 | if (!hash) | ||
165 | return; | ||
163 | for (i = 0; i < FNHE_HASH_SIZE; i++) { | 166 | for (i = 0; i < FNHE_HASH_SIZE; i++) { |
164 | struct fib_nh_exception *fnhe; | 167 | struct fib_nh_exception *fnhe; |
165 | 168 | ||
@@ -205,8 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head) | |||
205 | change_nexthops(fi) { | 208 | change_nexthops(fi) { |
206 | if (nexthop_nh->nh_dev) | 209 | if (nexthop_nh->nh_dev) |
207 | dev_put(nexthop_nh->nh_dev); | 210 | dev_put(nexthop_nh->nh_dev); |
208 | if (nexthop_nh->nh_exceptions) | 211 | free_nh_exceptions(nexthop_nh); |
209 | free_nh_exceptions(nexthop_nh); | ||
210 | rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); | 212 | rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); |
211 | rt_fibinfo_free(&nexthop_nh->nh_rth_input); | 213 | rt_fibinfo_free(&nexthop_nh->nh_rth_input); |
212 | } endfor_nexthops(fi); | 214 | } endfor_nexthops(fi); |
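The change above follows the conversion of nh_exceptions to an __rcu pointer: readers use rcu_dereference(), while this teardown path, which cannot race with updaters, uses rcu_dereference_protected() with a constant-true condition. A stripped-down, hypothetical sketch of that pattern:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_bucket { int dummy; };		/* hypothetical payload */

struct demo {
	struct demo_bucket __rcu *buckets;	/* written by updaters, read under RCU */
};

/* Teardown: no updater can run concurrently, so pass "1" as the condition. */
static void demo_free(struct demo *d)
{
	struct demo_bucket *b = rcu_dereference_protected(d->buckets, 1);

	if (!b)
		return;
	kfree(b);
}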
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c new file mode 100644 index 000000000000..efa70ad44906 --- /dev/null +++ b/net/ipv4/fou.c | |||
@@ -0,0 +1,514 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/socket.h> | ||
4 | #include <linux/skbuff.h> | ||
5 | #include <linux/ip.h> | ||
6 | #include <linux/udp.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <net/genetlink.h> | ||
10 | #include <net/gue.h> | ||
11 | #include <net/ip.h> | ||
12 | #include <net/protocol.h> | ||
13 | #include <net/udp.h> | ||
14 | #include <net/udp_tunnel.h> | ||
15 | #include <net/xfrm.h> | ||
16 | #include <uapi/linux/fou.h> | ||
17 | #include <uapi/linux/genetlink.h> | ||
18 | |||
19 | static DEFINE_SPINLOCK(fou_lock); | ||
20 | static LIST_HEAD(fou_list); | ||
21 | |||
22 | struct fou { | ||
23 | struct socket *sock; | ||
24 | u8 protocol; | ||
25 | u16 port; | ||
26 | struct udp_offload udp_offloads; | ||
27 | struct list_head list; | ||
28 | }; | ||
29 | |||
30 | struct fou_cfg { | ||
31 | u16 type; | ||
32 | u8 protocol; | ||
33 | struct udp_port_cfg udp_config; | ||
34 | }; | ||
35 | |||
36 | static inline struct fou *fou_from_sock(struct sock *sk) | ||
37 | { | ||
38 | return sk->sk_user_data; | ||
39 | } | ||
40 | |||
41 | static int fou_udp_encap_recv_deliver(struct sk_buff *skb, | ||
42 | u8 protocol, size_t len) | ||
43 | { | ||
44 | struct iphdr *iph = ip_hdr(skb); | ||
45 | |||
46 | /* Remove 'len' bytes from the packet (UDP header and | ||
47 | * FOU header if present), modify the protocol to the one | ||
48 | * we found, and then call rcv_encap. | ||
49 | */ | ||
50 | iph->tot_len = htons(ntohs(iph->tot_len) - len); | ||
51 | __skb_pull(skb, len); | ||
52 | skb_postpull_rcsum(skb, udp_hdr(skb), len); | ||
53 | skb_reset_transport_header(skb); | ||
54 | |||
55 | return -protocol; | ||
56 | } | ||
57 | |||
58 | static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) | ||
59 | { | ||
60 | struct fou *fou = fou_from_sock(sk); | ||
61 | |||
62 | if (!fou) | ||
63 | return 1; | ||
64 | |||
65 | return fou_udp_encap_recv_deliver(skb, fou->protocol, | ||
66 | sizeof(struct udphdr)); | ||
67 | } | ||
68 | |||
69 | static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) | ||
70 | { | ||
71 | struct fou *fou = fou_from_sock(sk); | ||
72 | size_t len; | ||
73 | struct guehdr *guehdr; | ||
74 | struct udphdr *uh; | ||
75 | |||
76 | if (!fou) | ||
77 | return 1; | ||
78 | |||
79 | len = sizeof(struct udphdr) + sizeof(struct guehdr); | ||
80 | if (!pskb_may_pull(skb, len)) | ||
81 | goto drop; | ||
82 | |||
83 | uh = udp_hdr(skb); | ||
84 | guehdr = (struct guehdr *)&uh[1]; | ||
85 | |||
86 | len += guehdr->hlen << 2; | ||
87 | if (!pskb_may_pull(skb, len)) | ||
88 | goto drop; | ||
89 | |||
90 | if (guehdr->version != 0) | ||
91 | goto drop; | ||
92 | |||
93 | if (guehdr->flags) { | ||
94 | /* No support yet */ | ||
95 | goto drop; | ||
96 | } | ||
97 | |||
98 | return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); | ||
99 | drop: | ||
100 | kfree_skb(skb); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static struct sk_buff **fou_gro_receive(struct sk_buff **head, | ||
105 | struct sk_buff *skb) | ||
106 | { | ||
107 | const struct net_offload *ops; | ||
108 | struct sk_buff **pp = NULL; | ||
109 | u8 proto = NAPI_GRO_CB(skb)->proto; | ||
110 | const struct net_offload **offloads; | ||
111 | |||
112 | rcu_read_lock(); | ||
113 | offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; | ||
114 | ops = rcu_dereference(offloads[proto]); | ||
115 | if (!ops || !ops->callbacks.gro_receive) | ||
116 | goto out_unlock; | ||
117 | |||
118 | pp = ops->callbacks.gro_receive(head, skb); | ||
119 | |||
120 | out_unlock: | ||
121 | rcu_read_unlock(); | ||
122 | |||
123 | return pp; | ||
124 | } | ||
125 | |||
126 | static int fou_gro_complete(struct sk_buff *skb, int nhoff) | ||
127 | { | ||
128 | const struct net_offload *ops; | ||
129 | u8 proto = NAPI_GRO_CB(skb)->proto; | ||
130 | int err = -ENOSYS; | ||
131 | const struct net_offload **offloads; | ||
132 | |||
133 | rcu_read_lock(); | ||
134 | offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; | ||
135 | ops = rcu_dereference(offloads[proto]); | ||
136 | if (WARN_ON(!ops || !ops->callbacks.gro_complete)) | ||
137 | goto out_unlock; | ||
138 | |||
139 | err = ops->callbacks.gro_complete(skb, nhoff); | ||
140 | |||
141 | out_unlock: | ||
142 | rcu_read_unlock(); | ||
143 | |||
144 | return err; | ||
145 | } | ||
146 | |||
147 | static struct sk_buff **gue_gro_receive(struct sk_buff **head, | ||
148 | struct sk_buff *skb) | ||
149 | { | ||
150 | const struct net_offload **offloads; | ||
151 | const struct net_offload *ops; | ||
152 | struct sk_buff **pp = NULL; | ||
153 | struct sk_buff *p; | ||
154 | u8 proto; | ||
155 | struct guehdr *guehdr; | ||
156 | unsigned int hlen, guehlen; | ||
157 | unsigned int off; | ||
158 | int flush = 1; | ||
159 | |||
160 | off = skb_gro_offset(skb); | ||
161 | hlen = off + sizeof(*guehdr); | ||
162 | guehdr = skb_gro_header_fast(skb, off); | ||
163 | if (skb_gro_header_hard(skb, hlen)) { | ||
164 | guehdr = skb_gro_header_slow(skb, hlen, off); | ||
165 | if (unlikely(!guehdr)) | ||
166 | goto out; | ||
167 | } | ||
168 | |||
169 | proto = guehdr->next_hdr; | ||
170 | |||
171 | rcu_read_lock(); | ||
172 | offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; | ||
173 | ops = rcu_dereference(offloads[proto]); | ||
174 | if (WARN_ON(!ops || !ops->callbacks.gro_receive)) | ||
175 | goto out_unlock; | ||
176 | |||
177 | guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); | ||
178 | |||
179 | hlen = off + guehlen; | ||
180 | if (skb_gro_header_hard(skb, hlen)) { | ||
181 | guehdr = skb_gro_header_slow(skb, hlen, off); | ||
182 | if (unlikely(!guehdr)) | ||
183 | goto out_unlock; | ||
184 | } | ||
185 | |||
186 | flush = 0; | ||
187 | |||
188 | for (p = *head; p; p = p->next) { | ||
189 | const struct guehdr *guehdr2; | ||
190 | |||
191 | if (!NAPI_GRO_CB(p)->same_flow) | ||
192 | continue; | ||
193 | |||
194 | guehdr2 = (struct guehdr *)(p->data + off); | ||
195 | |||
196 | /* Check that the base GUE headers are equal (covers | ||
197 | * hlen, version, next_hdr, and flags). | ||
198 | */ | ||
199 | if (guehdr->word != guehdr2->word) { | ||
200 | NAPI_GRO_CB(p)->same_flow = 0; | ||
201 | continue; | ||
202 | } | ||
203 | |||
204 | /* Check that the optional fields are the same. */ | ||
205 | if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], | ||
206 | guehdr->hlen << 2)) { | ||
207 | NAPI_GRO_CB(p)->same_flow = 0; | ||
208 | continue; | ||
209 | } | ||
210 | } | ||
211 | |||
212 | skb_gro_pull(skb, guehlen); | ||
213 | |||
214 | /* Adjust NAPI_GRO_CB(skb)->csum after skb_gro_pull() */ | ||
215 | skb_gro_postpull_rcsum(skb, guehdr, guehlen); | ||
216 | |||
217 | pp = ops->callbacks.gro_receive(head, skb); | ||
218 | |||
219 | out_unlock: | ||
220 | rcu_read_unlock(); | ||
221 | out: | ||
222 | NAPI_GRO_CB(skb)->flush |= flush; | ||
223 | |||
224 | return pp; | ||
225 | } | ||
226 | |||
227 | static int gue_gro_complete(struct sk_buff *skb, int nhoff) | ||
228 | { | ||
229 | const struct net_offload **offloads; | ||
230 | struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); | ||
231 | const struct net_offload *ops; | ||
232 | unsigned int guehlen; | ||
233 | u8 proto; | ||
234 | int err = -ENOENT; | ||
235 | |||
236 | proto = guehdr->next_hdr; | ||
237 | |||
238 | guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); | ||
239 | |||
240 | rcu_read_lock(); | ||
241 | offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; | ||
242 | ops = rcu_dereference(offloads[proto]); | ||
243 | if (WARN_ON(!ops || !ops->callbacks.gro_complete)) | ||
244 | goto out_unlock; | ||
245 | |||
246 | err = ops->callbacks.gro_complete(skb, nhoff + guehlen); | ||
247 | |||
248 | out_unlock: | ||
249 | rcu_read_unlock(); | ||
250 | return err; | ||
251 | } | ||
252 | |||
253 | static int fou_add_to_port_list(struct fou *fou) | ||
254 | { | ||
255 | struct fou *fout; | ||
256 | |||
257 | spin_lock(&fou_lock); | ||
258 | list_for_each_entry(fout, &fou_list, list) { | ||
259 | if (fou->port == fout->port) { | ||
260 | spin_unlock(&fou_lock); | ||
261 | return -EALREADY; | ||
262 | } | ||
263 | } | ||
264 | |||
265 | list_add(&fou->list, &fou_list); | ||
266 | spin_unlock(&fou_lock); | ||
267 | |||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | static void fou_release(struct fou *fou) | ||
272 | { | ||
273 | struct socket *sock = fou->sock; | ||
274 | struct sock *sk = sock->sk; | ||
275 | |||
276 | udp_del_offload(&fou->udp_offloads); | ||
277 | |||
278 | list_del(&fou->list); | ||
279 | |||
280 | /* Remove hooks into tunnel socket */ | ||
281 | sk->sk_user_data = NULL; | ||
282 | |||
283 | sock_release(sock); | ||
284 | |||
285 | kfree(fou); | ||
286 | } | ||
287 | |||
288 | static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) | ||
289 | { | ||
290 | udp_sk(sk)->encap_rcv = fou_udp_recv; | ||
291 | fou->protocol = cfg->protocol; | ||
292 | fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; | ||
293 | fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; | ||
294 | fou->udp_offloads.port = cfg->udp_config.local_udp_port; | ||
295 | fou->udp_offloads.ipproto = cfg->protocol; | ||
296 | |||
297 | return 0; | ||
298 | } | ||
299 | |||
300 | static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) | ||
301 | { | ||
302 | udp_sk(sk)->encap_rcv = gue_udp_recv; | ||
303 | fou->udp_offloads.callbacks.gro_receive = gue_gro_receive; | ||
304 | fou->udp_offloads.callbacks.gro_complete = gue_gro_complete; | ||
305 | fou->udp_offloads.port = cfg->udp_config.local_udp_port; | ||
306 | |||
307 | return 0; | ||
308 | } | ||
309 | |||
310 | static int fou_create(struct net *net, struct fou_cfg *cfg, | ||
311 | struct socket **sockp) | ||
312 | { | ||
313 | struct fou *fou = NULL; | ||
314 | int err; | ||
315 | struct socket *sock = NULL; | ||
316 | struct sock *sk; | ||
317 | |||
318 | /* Open UDP socket */ | ||
319 | err = udp_sock_create(net, &cfg->udp_config, &sock); | ||
320 | if (err < 0) | ||
321 | goto error; | ||
322 | |||
323 | /* Allocate FOU port structure */ | ||
324 | fou = kzalloc(sizeof(*fou), GFP_KERNEL); | ||
325 | if (!fou) { | ||
326 | err = -ENOMEM; | ||
327 | goto error; | ||
328 | } | ||
329 | |||
330 | sk = sock->sk; | ||
331 | |||
332 | fou->port = cfg->udp_config.local_udp_port; | ||
333 | |||
334 | /* Initialize for the fou type */ | ||
335 | switch (cfg->type) { | ||
336 | case FOU_ENCAP_DIRECT: | ||
337 | err = fou_encap_init(sk, fou, cfg); | ||
338 | if (err) | ||
339 | goto error; | ||
340 | break; | ||
341 | case FOU_ENCAP_GUE: | ||
342 | err = gue_encap_init(sk, fou, cfg); | ||
343 | if (err) | ||
344 | goto error; | ||
345 | break; | ||
346 | default: | ||
347 | err = -EINVAL; | ||
348 | goto error; | ||
349 | } | ||
350 | |||
351 | udp_sk(sk)->encap_type = 1; | ||
352 | udp_encap_enable(); | ||
353 | |||
354 | sk->sk_user_data = fou; | ||
355 | fou->sock = sock; | ||
356 | |||
357 | udp_set_convert_csum(sk, true); | ||
358 | |||
359 | sk->sk_allocation = GFP_ATOMIC; | ||
360 | |||
361 | if (cfg->udp_config.family == AF_INET) { | ||
362 | err = udp_add_offload(&fou->udp_offloads); | ||
363 | if (err) | ||
364 | goto error; | ||
365 | } | ||
366 | |||
367 | err = fou_add_to_port_list(fou); | ||
368 | if (err) | ||
369 | goto error; | ||
370 | |||
371 | if (sockp) | ||
372 | *sockp = sock; | ||
373 | |||
374 | return 0; | ||
375 | |||
376 | error: | ||
377 | kfree(fou); | ||
378 | if (sock) | ||
379 | sock_release(sock); | ||
380 | |||
381 | return err; | ||
382 | } | ||
383 | |||
384 | static int fou_destroy(struct net *net, struct fou_cfg *cfg) | ||
385 | { | ||
386 | struct fou *fou; | ||
387 | u16 port = cfg->udp_config.local_udp_port; | ||
388 | int err = -EINVAL; | ||
389 | |||
390 | spin_lock(&fou_lock); | ||
391 | list_for_each_entry(fou, &fou_list, list) { | ||
392 | if (fou->port == port) { | ||
393 | udp_del_offload(&fou->udp_offloads); | ||
394 | fou_release(fou); | ||
395 | err = 0; | ||
396 | break; | ||
397 | } | ||
398 | } | ||
399 | spin_unlock(&fou_lock); | ||
400 | |||
401 | return err; | ||
402 | } | ||
403 | |||
404 | static struct genl_family fou_nl_family = { | ||
405 | .id = GENL_ID_GENERATE, | ||
406 | .hdrsize = 0, | ||
407 | .name = FOU_GENL_NAME, | ||
408 | .version = FOU_GENL_VERSION, | ||
409 | .maxattr = FOU_ATTR_MAX, | ||
410 | .netnsok = true, | ||
411 | }; | ||
412 | |||
413 | static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { | ||
414 | [FOU_ATTR_PORT] = { .type = NLA_U16, }, | ||
415 | [FOU_ATTR_AF] = { .type = NLA_U8, }, | ||
416 | [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, | ||
417 | [FOU_ATTR_TYPE] = { .type = NLA_U8, }, | ||
418 | }; | ||
419 | |||
420 | static int parse_nl_config(struct genl_info *info, | ||
421 | struct fou_cfg *cfg) | ||
422 | { | ||
423 | memset(cfg, 0, sizeof(*cfg)); | ||
424 | |||
425 | cfg->udp_config.family = AF_INET; | ||
426 | |||
427 | if (info->attrs[FOU_ATTR_AF]) { | ||
428 | u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); | ||
429 | |||
430 | if (family != AF_INET && family != AF_INET6) | ||
431 | return -EINVAL; | ||
432 | |||
433 | cfg->udp_config.family = family; | ||
434 | } | ||
435 | |||
436 | if (info->attrs[FOU_ATTR_PORT]) { | ||
437 | u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]); | ||
438 | |||
439 | cfg->udp_config.local_udp_port = port; | ||
440 | } | ||
441 | |||
442 | if (info->attrs[FOU_ATTR_IPPROTO]) | ||
443 | cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); | ||
444 | |||
445 | if (info->attrs[FOU_ATTR_TYPE]) | ||
446 | cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); | ||
447 | |||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) | ||
452 | { | ||
453 | struct fou_cfg cfg; | ||
454 | int err; | ||
455 | |||
456 | err = parse_nl_config(info, &cfg); | ||
457 | if (err) | ||
458 | return err; | ||
459 | |||
460 | return fou_create(&init_net, &cfg, NULL); | ||
461 | } | ||
462 | |||
463 | static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) | ||
464 | { | ||
465 | struct fou_cfg cfg; | ||
466 | |||
467 | parse_nl_config(info, &cfg); | ||
468 | |||
469 | return fou_destroy(&init_net, &cfg); | ||
470 | } | ||
471 | |||
472 | static const struct genl_ops fou_nl_ops[] = { | ||
473 | { | ||
474 | .cmd = FOU_CMD_ADD, | ||
475 | .doit = fou_nl_cmd_add_port, | ||
476 | .policy = fou_nl_policy, | ||
477 | .flags = GENL_ADMIN_PERM, | ||
478 | }, | ||
479 | { | ||
480 | .cmd = FOU_CMD_DEL, | ||
481 | .doit = fou_nl_cmd_rm_port, | ||
482 | .policy = fou_nl_policy, | ||
483 | .flags = GENL_ADMIN_PERM, | ||
484 | }, | ||
485 | }; | ||
486 | |||
487 | static int __init fou_init(void) | ||
488 | { | ||
489 | int ret; | ||
490 | |||
491 | ret = genl_register_family_with_ops(&fou_nl_family, | ||
492 | fou_nl_ops); | ||
493 | |||
494 | return ret; | ||
495 | } | ||
496 | |||
497 | static void __exit fou_fini(void) | ||
498 | { | ||
499 | struct fou *fou, *next; | ||
500 | |||
501 | genl_unregister_family(&fou_nl_family); | ||
502 | |||
503 | /* Close all the FOU sockets */ | ||
504 | |||
505 | spin_lock(&fou_lock); | ||
506 | list_for_each_entry_safe(fou, next, &fou_list, list) | ||
507 | fou_release(fou); | ||
508 | spin_unlock(&fou_lock); | ||
509 | } | ||
510 | |||
511 | module_init(fou_init); | ||
512 | module_exit(fou_fini); | ||
513 | MODULE_AUTHOR("Tom Herbert <therbert@google.com>"); | ||
514 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c new file mode 100644 index 000000000000..065cd94c640c --- /dev/null +++ b/net/ipv4/geneve.c | |||
@@ -0,0 +1,373 @@ | |||
1 | /* | ||
2 | * Geneve: Generic Network Virtualization Encapsulation | ||
3 | * | ||
4 | * Copyright (c) 2014 Nicira, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
13 | |||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/errno.h> | ||
18 | #include <linux/slab.h> | ||
19 | #include <linux/skbuff.h> | ||
20 | #include <linux/rculist.h> | ||
21 | #include <linux/netdevice.h> | ||
22 | #include <linux/in.h> | ||
23 | #include <linux/ip.h> | ||
24 | #include <linux/udp.h> | ||
25 | #include <linux/igmp.h> | ||
26 | #include <linux/etherdevice.h> | ||
27 | #include <linux/if_ether.h> | ||
28 | #include <linux/if_vlan.h> | ||
29 | #include <linux/hash.h> | ||
30 | #include <linux/ethtool.h> | ||
31 | #include <net/arp.h> | ||
32 | #include <net/ndisc.h> | ||
33 | #include <net/ip.h> | ||
34 | #include <net/ip_tunnels.h> | ||
35 | #include <net/icmp.h> | ||
36 | #include <net/udp.h> | ||
37 | #include <net/rtnetlink.h> | ||
38 | #include <net/route.h> | ||
39 | #include <net/dsfield.h> | ||
40 | #include <net/inet_ecn.h> | ||
41 | #include <net/net_namespace.h> | ||
42 | #include <net/netns/generic.h> | ||
43 | #include <net/geneve.h> | ||
44 | #include <net/protocol.h> | ||
45 | #include <net/udp_tunnel.h> | ||
46 | #if IS_ENABLED(CONFIG_IPV6) | ||
47 | #include <net/ipv6.h> | ||
48 | #include <net/addrconf.h> | ||
49 | #include <net/ip6_tunnel.h> | ||
50 | #include <net/ip6_checksum.h> | ||
51 | #endif | ||
52 | |||
53 | #define PORT_HASH_BITS 8 | ||
54 | #define PORT_HASH_SIZE (1<<PORT_HASH_BITS) | ||
55 | |||
56 | /* per-network namespace private data for this module */ | ||
57 | struct geneve_net { | ||
58 | struct hlist_head sock_list[PORT_HASH_SIZE]; | ||
59 | spinlock_t sock_lock; /* Protects sock_list */ | ||
60 | }; | ||
61 | |||
62 | static int geneve_net_id; | ||
63 | |||
64 | static struct workqueue_struct *geneve_wq; | ||
65 | |||
66 | static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) | ||
67 | { | ||
68 | return (struct genevehdr *)(udp_hdr(skb) + 1); | ||
69 | } | ||
70 | |||
71 | static struct hlist_head *gs_head(struct net *net, __be16 port) | ||
72 | { | ||
73 | struct geneve_net *gn = net_generic(net, geneve_net_id); | ||
74 | |||
75 | return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; | ||
76 | } | ||
77 | |||
78 | /* Find geneve socket based on network namespace and UDP port */ | ||
79 | static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port) | ||
80 | { | ||
81 | struct geneve_sock *gs; | ||
82 | |||
83 | hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) { | ||
84 | if (inet_sk(gs->sock->sk)->inet_sport == port) | ||
85 | return gs; | ||
86 | } | ||
87 | |||
88 | return NULL; | ||
89 | } | ||
90 | |||
91 | static void geneve_build_header(struct genevehdr *geneveh, | ||
92 | __be16 tun_flags, u8 vni[3], | ||
93 | u8 options_len, u8 *options) | ||
94 | { | ||
95 | geneveh->ver = GENEVE_VER; | ||
96 | geneveh->opt_len = options_len / 4; | ||
97 | geneveh->oam = !!(tun_flags & TUNNEL_OAM); | ||
98 | geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); | ||
99 | geneveh->rsvd1 = 0; | ||
100 | memcpy(geneveh->vni, vni, 3); | ||
101 | geneveh->proto_type = htons(ETH_P_TEB); | ||
102 | geneveh->rsvd2 = 0; | ||
103 | |||
104 | memcpy(geneveh->options, options, options_len); | ||
105 | } | ||
106 | |||
107 | /* Transmit a fully formatted Geneve frame. | ||
108 | * | ||
109 | * When calling this function, skb->data should point | ||
110 | * to the Geneve header, which must already be fully formed. | ||
111 | * | ||
112 | * This function will add other UDP tunnel headers. | ||
113 | */ | ||
114 | int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, | ||
115 | struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, | ||
116 | __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, | ||
117 | __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, | ||
118 | bool xnet) | ||
119 | { | ||
120 | struct genevehdr *gnvh; | ||
121 | int min_headroom; | ||
122 | int err; | ||
123 | |||
124 | skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); | ||
125 | |||
126 | min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len | ||
127 | + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) | ||
128 | + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); | ||
129 | |||
130 | err = skb_cow_head(skb, min_headroom); | ||
131 | if (unlikely(err)) | ||
132 | return err; | ||
133 | |||
134 | if (vlan_tx_tag_present(skb)) { | ||
135 | if (unlikely(!__vlan_put_tag(skb, | ||
136 | skb->vlan_proto, | ||
137 | vlan_tx_tag_get(skb)))) { | ||
138 | err = -ENOMEM; | ||
139 | return err; | ||
140 | } | ||
141 | skb->vlan_tci = 0; | ||
142 | } | ||
143 | |||
144 | gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); | ||
145 | geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); | ||
146 | |||
147 | return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, | ||
148 | tos, ttl, df, src_port, dst_port, xnet); | ||
149 | } | ||
150 | EXPORT_SYMBOL_GPL(geneve_xmit_skb); | ||
151 | |||
152 | static void geneve_notify_add_rx_port(struct geneve_sock *gs) | ||
153 | { | ||
154 | struct sock *sk = gs->sock->sk; | ||
155 | sa_family_t sa_family = sk->sk_family; | ||
156 | int err; | ||
157 | |||
158 | if (sa_family == AF_INET) { | ||
159 | err = udp_add_offload(&gs->udp_offloads); | ||
160 | if (err) | ||
161 | pr_warn("geneve: udp_add_offload failed with status %d\n", | ||
162 | err); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | /* Callback from net/ipv4/udp.c to receive packets */ | ||
167 | static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) | ||
168 | { | ||
169 | struct genevehdr *geneveh; | ||
170 | struct geneve_sock *gs; | ||
171 | int opts_len; | ||
172 | |||
173 | /* Need Geneve and inner Ethernet header to be present */ | ||
174 | if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) | ||
175 | goto error; | ||
176 | |||
177 | /* Return packets with reserved bits set */ | ||
178 | geneveh = geneve_hdr(skb); | ||
179 | |||
180 | if (unlikely(geneveh->ver != GENEVE_VER)) | ||
181 | goto error; | ||
182 | |||
183 | if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) | ||
184 | goto error; | ||
185 | |||
186 | opts_len = geneveh->opt_len * 4; | ||
187 | if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, | ||
188 | htons(ETH_P_TEB))) | ||
189 | goto drop; | ||
190 | |||
191 | gs = rcu_dereference_sk_user_data(sk); | ||
192 | if (!gs) | ||
193 | goto drop; | ||
194 | |||
195 | gs->rcv(gs, skb); | ||
196 | return 0; | ||
197 | |||
198 | drop: | ||
199 | /* Consume bad packet */ | ||
200 | kfree_skb(skb); | ||
201 | return 0; | ||
202 | |||
203 | error: | ||
204 | /* Let the UDP layer deal with the skb */ | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | static void geneve_del_work(struct work_struct *work) | ||
209 | { | ||
210 | struct geneve_sock *gs = container_of(work, struct geneve_sock, | ||
211 | del_work); | ||
212 | |||
213 | udp_tunnel_sock_release(gs->sock); | ||
214 | kfree_rcu(gs, rcu); | ||
215 | } | ||
216 | |||
217 | static struct socket *geneve_create_sock(struct net *net, bool ipv6, | ||
218 | __be16 port) | ||
219 | { | ||
220 | struct socket *sock; | ||
221 | struct udp_port_cfg udp_conf; | ||
222 | int err; | ||
223 | |||
224 | memset(&udp_conf, 0, sizeof(udp_conf)); | ||
225 | |||
226 | if (ipv6) { | ||
227 | udp_conf.family = AF_INET6; | ||
228 | } else { | ||
229 | udp_conf.family = AF_INET; | ||
230 | udp_conf.local_ip.s_addr = htonl(INADDR_ANY); | ||
231 | } | ||
232 | |||
233 | udp_conf.local_udp_port = port; | ||
234 | |||
235 | /* Open UDP socket */ | ||
236 | err = udp_sock_create(net, &udp_conf, &sock); | ||
237 | if (err < 0) | ||
238 | return ERR_PTR(err); | ||
239 | |||
240 | return sock; | ||
241 | } | ||
242 | |||
243 | /* Create new listen socket if needed */ | ||
244 | static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, | ||
245 | geneve_rcv_t *rcv, void *data, | ||
246 | bool ipv6) | ||
247 | { | ||
248 | struct geneve_net *gn = net_generic(net, geneve_net_id); | ||
249 | struct geneve_sock *gs; | ||
250 | struct socket *sock; | ||
251 | struct udp_tunnel_sock_cfg tunnel_cfg; | ||
252 | |||
253 | gs = kzalloc(sizeof(*gs), GFP_KERNEL); | ||
254 | if (!gs) | ||
255 | return ERR_PTR(-ENOMEM); | ||
256 | |||
257 | INIT_WORK(&gs->del_work, geneve_del_work); | ||
258 | |||
259 | sock = geneve_create_sock(net, ipv6, port); | ||
260 | if (IS_ERR(sock)) { | ||
261 | kfree(gs); | ||
262 | return ERR_CAST(sock); | ||
263 | } | ||
264 | |||
265 | gs->sock = sock; | ||
266 | atomic_set(&gs->refcnt, 1); | ||
267 | gs->rcv = rcv; | ||
268 | gs->rcv_data = data; | ||
269 | |||
270 | /* Initialize the geneve udp offloads structure */ | ||
271 | gs->udp_offloads.port = port; | ||
272 | gs->udp_offloads.callbacks.gro_receive = NULL; | ||
273 | gs->udp_offloads.callbacks.gro_complete = NULL; | ||
274 | |||
275 | spin_lock(&gn->sock_lock); | ||
276 | hlist_add_head_rcu(&gs->hlist, gs_head(net, port)); | ||
277 | geneve_notify_add_rx_port(gs); | ||
278 | spin_unlock(&gn->sock_lock); | ||
279 | |||
280 | /* Mark socket as an encapsulation socket */ | ||
281 | tunnel_cfg.sk_user_data = gs; | ||
282 | tunnel_cfg.encap_type = 1; | ||
283 | tunnel_cfg.encap_rcv = geneve_udp_encap_recv; | ||
284 | tunnel_cfg.encap_destroy = NULL; | ||
285 | setup_udp_tunnel_sock(net, sock, &tunnel_cfg); | ||
286 | |||
287 | return gs; | ||
288 | } | ||
289 | |||
290 | struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, | ||
291 | geneve_rcv_t *rcv, void *data, | ||
292 | bool no_share, bool ipv6) | ||
293 | { | ||
294 | struct geneve_sock *gs; | ||
295 | |||
296 | gs = geneve_socket_create(net, port, rcv, data, ipv6); | ||
297 | if (!IS_ERR(gs)) | ||
298 | return gs; | ||
299 | |||
300 | if (no_share) /* Return error if sharing is not allowed. */ | ||
301 | return ERR_PTR(-EINVAL); | ||
302 | |||
303 | gs = geneve_find_sock(net, port); | ||
304 | if (gs) { | ||
305 | if (gs->rcv == rcv) | ||
306 | atomic_inc(&gs->refcnt); | ||
307 | else | ||
308 | gs = ERR_PTR(-EBUSY); | ||
309 | } else { | ||
310 | gs = ERR_PTR(-EINVAL); | ||
311 | } | ||
312 | |||
313 | return gs; | ||
314 | } | ||
315 | EXPORT_SYMBOL_GPL(geneve_sock_add); | ||
316 | |||
317 | void geneve_sock_release(struct geneve_sock *gs) | ||
318 | { | ||
319 | if (!atomic_dec_and_test(&gs->refcnt)) | ||
320 | return; | ||
321 | |||
322 | queue_work(geneve_wq, &gs->del_work); | ||
323 | } | ||
324 | EXPORT_SYMBOL_GPL(geneve_sock_release); | ||
325 | |||
326 | static __net_init int geneve_init_net(struct net *net) | ||
327 | { | ||
328 | struct geneve_net *gn = net_generic(net, geneve_net_id); | ||
329 | unsigned int h; | ||
330 | |||
331 | spin_lock_init(&gn->sock_lock); | ||
332 | |||
333 | for (h = 0; h < PORT_HASH_SIZE; ++h) | ||
334 | INIT_HLIST_HEAD(&gn->sock_list[h]); | ||
335 | |||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | static struct pernet_operations geneve_net_ops = { | ||
340 | .init = geneve_init_net, | ||
341 | .exit = NULL, | ||
342 | .id = &geneve_net_id, | ||
343 | .size = sizeof(struct geneve_net), | ||
344 | }; | ||
345 | |||
346 | static int __init geneve_init_module(void) | ||
347 | { | ||
348 | int rc; | ||
349 | |||
350 | geneve_wq = alloc_workqueue("geneve", 0, 0); | ||
351 | if (!geneve_wq) | ||
352 | return -ENOMEM; | ||
353 | |||
354 | rc = register_pernet_subsys(&geneve_net_ops); | ||
355 | if (rc) | ||
356 | return rc; | ||
357 | |||
358 | pr_info("Geneve driver\n"); | ||
359 | |||
360 | return 0; | ||
361 | } | ||
362 | late_initcall(geneve_init_module); | ||
363 | |||
364 | static void __exit geneve_cleanup_module(void) | ||
365 | { | ||
366 | destroy_workqueue(geneve_wq); | ||
367 | } | ||
368 | module_exit(geneve_cleanup_module); | ||
369 | |||
370 | MODULE_LICENSE("GPL"); | ||
371 | MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>"); | ||
372 | MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); | ||
373 | MODULE_ALIAS_RTNL_LINK("geneve"); | ||
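geneve_sock_add() and geneve_xmit_skb() are exported for use by tunnel drivers (an Open vSwitch Geneve vport is the expected first consumer). Below is a hypothetical caller sketch, assuming geneve_rcv_t is the (gs, skb) callback invoked by the receive path above; the port is the IANA-assigned 6081 and error handling is minimal.

#include <linux/err.h>
#include <linux/skbuff.h>
#include <net/geneve.h>

/* Hypothetical handler for decapsulated frames on our tunnel device. */
static void demo_geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
{
	/* hand the inner Ethernet frame to the owning netdevice */
	kfree_skb(skb);
}

static int demo_geneve_open(struct net *net)
{
	struct geneve_sock *gs;

	gs = geneve_sock_add(net, htons(6081), demo_geneve_rcv, NULL,
			     false /* allow sharing */, false /* IPv4 */);
	return PTR_ERR_OR_ZERO(gs);
}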
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 0485bf7f8f03..4a7b5b2a1ce3 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c | |||
@@ -98,7 +98,6 @@ EXPORT_SYMBOL_GPL(gre_build_header); | |||
98 | static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, | 98 | static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, |
99 | bool *csum_err) | 99 | bool *csum_err) |
100 | { | 100 | { |
101 | unsigned int ip_hlen = ip_hdrlen(skb); | ||
102 | const struct gre_base_hdr *greh; | 101 | const struct gre_base_hdr *greh; |
103 | __be32 *options; | 102 | __be32 *options; |
104 | int hdr_len; | 103 | int hdr_len; |
@@ -106,7 +105,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, | |||
106 | if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) | 105 | if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) |
107 | return -EINVAL; | 106 | return -EINVAL; |
108 | 107 | ||
109 | greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); | 108 | greh = (struct gre_base_hdr *)skb_transport_header(skb); |
110 | if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) | 109 | if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) |
111 | return -EINVAL; | 110 | return -EINVAL; |
112 | 111 | ||
@@ -116,7 +115,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, | |||
116 | if (!pskb_may_pull(skb, hdr_len)) | 115 | if (!pskb_may_pull(skb, hdr_len)) |
117 | return -EINVAL; | 116 | return -EINVAL; |
118 | 117 | ||
119 | greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); | 118 | greh = (struct gre_base_hdr *)skb_transport_header(skb); |
120 | tpi->proto = greh->protocol; | 119 | tpi->proto = greh->protocol; |
121 | 120 | ||
122 | options = (__be32 *)(greh + 1); | 121 | options = (__be32 *)(greh + 1); |
@@ -125,6 +124,10 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, | |||
125 | *csum_err = true; | 124 | *csum_err = true; |
126 | return -EINVAL; | 125 | return -EINVAL; |
127 | } | 126 | } |
127 | |||
128 | skb_checksum_try_convert(skb, IPPROTO_GRE, 0, | ||
129 | null_compute_pseudo); | ||
130 | |||
128 | options++; | 131 | options++; |
129 | } | 132 | } |
130 | 133 | ||
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 6556263c8fa5..a77729503071 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c | |||
@@ -15,13 +15,6 @@ | |||
15 | #include <net/protocol.h> | 15 | #include <net/protocol.h> |
16 | #include <net/gre.h> | 16 | #include <net/gre.h> |
17 | 17 | ||
18 | static int gre_gso_send_check(struct sk_buff *skb) | ||
19 | { | ||
20 | if (!skb->encapsulation) | ||
21 | return -EINVAL; | ||
22 | return 0; | ||
23 | } | ||
24 | |||
25 | static struct sk_buff *gre_gso_segment(struct sk_buff *skb, | 18 | static struct sk_buff *gre_gso_segment(struct sk_buff *skb, |
26 | netdev_features_t features) | 19 | netdev_features_t features) |
27 | { | 20 | { |
@@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, | |||
46 | SKB_GSO_IPIP))) | 39 | SKB_GSO_IPIP))) |
47 | goto out; | 40 | goto out; |
48 | 41 | ||
42 | if (!skb->encapsulation) | ||
43 | goto out; | ||
44 | |||
49 | if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) | 45 | if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) |
50 | goto out; | 46 | goto out; |
51 | 47 | ||
@@ -119,28 +115,6 @@ out: | |||
119 | return segs; | 115 | return segs; |
120 | } | 116 | } |
121 | 117 | ||
122 | /* Compute the whole skb csum in s/w and store it, then verify GRO csum | ||
123 | * starting from gro_offset. | ||
124 | */ | ||
125 | static __sum16 gro_skb_checksum(struct sk_buff *skb) | ||
126 | { | ||
127 | __sum16 sum; | ||
128 | |||
129 | skb->csum = skb_checksum(skb, 0, skb->len, 0); | ||
130 | NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, | ||
131 | csum_partial(skb->data, skb_gro_offset(skb), 0)); | ||
132 | sum = csum_fold(NAPI_GRO_CB(skb)->csum); | ||
133 | if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { | ||
134 | if (unlikely(!sum) && !skb->csum_complete_sw) | ||
135 | netdev_rx_csum_fault(skb->dev); | ||
136 | } else { | ||
137 | skb->ip_summed = CHECKSUM_COMPLETE; | ||
138 | skb->csum_complete_sw = 1; | ||
139 | } | ||
140 | |||
141 | return sum; | ||
142 | } | ||
143 | |||
144 | static struct sk_buff **gre_gro_receive(struct sk_buff **head, | 118 | static struct sk_buff **gre_gro_receive(struct sk_buff **head, |
145 | struct sk_buff *skb) | 119 | struct sk_buff *skb) |
146 | { | 120 | { |
@@ -192,22 +166,16 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, | |||
192 | if (unlikely(!greh)) | 166 | if (unlikely(!greh)) |
193 | goto out_unlock; | 167 | goto out_unlock; |
194 | } | 168 | } |
195 | if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ | 169 | |
196 | __sum16 csum = 0; | 170 | /* Don't bother verifying checksum if we're going to flush anyway. */ |
197 | 171 | if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) { | |
198 | if (skb->ip_summed == CHECKSUM_COMPLETE) | 172 | if (skb_gro_checksum_simple_validate(skb)) |
199 | csum = csum_fold(NAPI_GRO_CB(skb)->csum); | ||
200 | /* Don't trust csum error calculated/reported by h/w */ | ||
201 | if (skb->ip_summed == CHECKSUM_NONE || csum != 0) | ||
202 | csum = gro_skb_checksum(skb); | ||
203 | |||
204 | /* GRE CSUM is the 1's complement of the 1's complement sum | ||
205 | * of the GRE hdr plus payload so it should add up to 0xffff | ||
206 | * (and 0 after csum_fold()) just like the IPv4 hdr csum. | ||
207 | */ | ||
208 | if (csum) | ||
209 | goto out_unlock; | 173 | goto out_unlock; |
174 | |||
175 | skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, | ||
176 | null_compute_pseudo); | ||
210 | } | 177 | } |
178 | |||
211 | flush = 0; | 179 | flush = 0; |
212 | 180 | ||
213 | for (p = *head; p; p = p->next) { | 181 | for (p = *head; p; p = p->next) { |
@@ -284,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) | |||
284 | 252 | ||
285 | static const struct net_offload gre_offload = { | 253 | static const struct net_offload gre_offload = { |
286 | .callbacks = { | 254 | .callbacks = { |
287 | .gso_send_check = gre_gso_send_check, | ||
288 | .gso_segment = gre_gso_segment, | 255 | .gso_segment = gre_gso_segment, |
289 | .gro_receive = gre_gro_receive, | 256 | .gro_receive = gre_gro_receive, |
290 | .gro_complete = gre_gro_complete, | 257 | .gro_complete = gre_gro_complete, |
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index ea7d4afe8205..5882f584910e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk) | |||
231 | spin_unlock_bh(&sk->sk_lock.slock); | 231 | spin_unlock_bh(&sk->sk_lock.slock); |
232 | } | 232 | } |
233 | 233 | ||
234 | int sysctl_icmp_msgs_per_sec __read_mostly = 1000; | ||
235 | int sysctl_icmp_msgs_burst __read_mostly = 50; | ||
236 | |||
237 | static struct { | ||
238 | spinlock_t lock; | ||
239 | u32 credit; | ||
240 | u32 stamp; | ||
241 | } icmp_global = { | ||
242 | .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), | ||
243 | }; | ||
244 | |||
245 | /** | ||
246 | * icmp_global_allow - Are we allowed to send one more ICMP message ? | ||
247 | * | ||
248 | * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. | ||
249 | * Returns false if we reached the limit and can not send another packet. | ||
250 | * Note: called with BH disabled | ||
251 | */ | ||
252 | bool icmp_global_allow(void) | ||
253 | { | ||
254 | u32 credit, delta, incr = 0, now = (u32)jiffies; | ||
255 | bool rc = false; | ||
256 | |||
257 | /* Check if token bucket is empty and cannot be refilled | ||
258 | * without taking the spinlock. | ||
259 | */ | ||
260 | if (!icmp_global.credit) { | ||
261 | delta = min_t(u32, now - icmp_global.stamp, HZ); | ||
262 | if (delta < HZ / 50) | ||
263 | return false; | ||
264 | } | ||
265 | |||
266 | spin_lock(&icmp_global.lock); | ||
267 | delta = min_t(u32, now - icmp_global.stamp, HZ); | ||
268 | if (delta >= HZ / 50) { | ||
269 | incr = sysctl_icmp_msgs_per_sec * delta / HZ ; | ||
270 | if (incr) | ||
271 | icmp_global.stamp = now; | ||
272 | } | ||
273 | credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); | ||
274 | if (credit) { | ||
275 | credit--; | ||
276 | rc = true; | ||
277 | } | ||
278 | icmp_global.credit = credit; | ||
279 | spin_unlock(&icmp_global.lock); | ||
280 | return rc; | ||
281 | } | ||
282 | EXPORT_SYMBOL(icmp_global_allow); | ||
283 | |||
234 | /* | 284 | /* |
235 | * Send an ICMP frame. | 285 | * Send an ICMP frame. |
236 | */ | 286 | */ |
237 | 287 | ||
238 | static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, | 288 | static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, |
239 | struct flowi4 *fl4, int type, int code) | 289 | struct flowi4 *fl4, int type, int code) |
240 | { | 290 | { |
241 | struct dst_entry *dst = &rt->dst; | 291 | struct dst_entry *dst = &rt->dst; |
242 | bool rc = true; | 292 | bool rc = true; |
@@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, | |||
253 | goto out; | 303 | goto out; |
254 | 304 | ||
255 | /* Limit if icmp type is enabled in ratemask. */ | 305 | /* Limit if icmp type is enabled in ratemask. */ |
256 | if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { | 306 | if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) |
257 | struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); | 307 | goto out; |
308 | |||
309 | rc = false; | ||
310 | if (icmp_global_allow()) { | ||
311 | struct inet_peer *peer; | ||
312 | |||
313 | peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); | ||
258 | rc = inet_peer_xrlim_allow(peer, | 314 | rc = inet_peer_xrlim_allow(peer, |
259 | net->ipv4.sysctl_icmp_ratelimit); | 315 | net->ipv4.sysctl_icmp_ratelimit); |
260 | if (peer) | 316 | if (peer) |
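icmp_global_allow() is a plain token bucket: credits refill at sysctl_icmp_msgs_per_sec per second (computed in jiffies, with the elapsed time clamped to one HZ) and are capped at sysctl_icmp_msgs_burst; every allowed ICMP message consumes one credit. A userspace re-implementation of just the arithmetic, for illustration (locking and the lock-free empty-bucket fast path are omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HZ 1000				/* assumed jiffies rate */
static unsigned int msgs_per_sec = 1000, msgs_burst = 50;
static uint32_t credit, stamp;

static bool global_allow(uint32_t now)
{
	uint32_t delta = now - stamp, incr = 0;

	if (delta > HZ)
		delta = HZ;		/* never refill more than one second's worth */
	if (delta >= HZ / 50) {		/* refill granularity: 20 ms */
		incr = msgs_per_sec * delta / HZ;
		if (incr)
			stamp = now;
	}
	credit = credit + incr;
	if (credit > msgs_burst)
		credit = msgs_burst;
	if (!credit)
		return false;
	credit--;
	return true;
}

int main(void)
{
	/* 20 ms after the epoch: 1000 * 20 / 1000 = 20 credits, so allowed. */
	printf("allowed: %d\n", global_allow(20));
	return 0;
}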
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f10eab462282..fb70e3ecc3e4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -117,7 +117,7 @@ | |||
117 | #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) | 117 | #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) |
118 | #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) | 118 | #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) |
119 | #define IGMP_Query_Response_Interval (10*HZ) | 119 | #define IGMP_Query_Response_Interval (10*HZ) |
120 | #define IGMP_Unsolicited_Report_Count 2 | 120 | #define IGMP_Query_Robustness_Variable 2 |
121 | 121 | ||
122 | 122 | ||
123 | #define IGMP_Initial_Report_Delay (1) | 123 | #define IGMP_Initial_Report_Delay (1) |
@@ -756,8 +756,7 @@ static void igmp_ifc_event(struct in_device *in_dev) | |||
756 | { | 756 | { |
757 | if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) | 757 | if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) |
758 | return; | 758 | return; |
759 | in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : | 759 | in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; |
760 | IGMP_Unsolicited_Report_Count; | ||
761 | igmp_ifc_start_timer(in_dev, 1); | 760 | igmp_ifc_start_timer(in_dev, 1); |
762 | } | 761 | } |
763 | 762 | ||
@@ -932,7 +931,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | |||
932 | in_dev->mr_qrv = ih3->qrv; | 931 | in_dev->mr_qrv = ih3->qrv; |
933 | if (!group) { /* general query */ | 932 | if (!group) { /* general query */ |
934 | if (ih3->nsrcs) | 933 | if (ih3->nsrcs) |
935 | return false; /* no sources allowed */ | 934 | return true; /* no sources allowed */ |
936 | igmp_gq_start_timer(in_dev); | 935 | igmp_gq_start_timer(in_dev); |
937 | return false; | 936 | return false; |
938 | } | 937 | } |
@@ -1086,8 +1085,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) | |||
1086 | pmc->interface = im->interface; | 1085 | pmc->interface = im->interface; |
1087 | in_dev_hold(in_dev); | 1086 | in_dev_hold(in_dev); |
1088 | pmc->multiaddr = im->multiaddr; | 1087 | pmc->multiaddr = im->multiaddr; |
1089 | pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : | 1088 | pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; |
1090 | IGMP_Unsolicited_Report_Count; | ||
1091 | pmc->sfmode = im->sfmode; | 1089 | pmc->sfmode = im->sfmode; |
1092 | if (pmc->sfmode == MCAST_INCLUDE) { | 1090 | if (pmc->sfmode == MCAST_INCLUDE) { |
1093 | struct ip_sf_list *psf; | 1091 | struct ip_sf_list *psf; |
@@ -1226,8 +1224,7 @@ static void igmp_group_added(struct ip_mc_list *im) | |||
1226 | } | 1224 | } |
1227 | /* else, v3 */ | 1225 | /* else, v3 */ |
1228 | 1226 | ||
1229 | im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : | 1227 | im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; |
1230 | IGMP_Unsolicited_Report_Count; | ||
1231 | igmp_ifc_event(in_dev); | 1228 | igmp_ifc_event(in_dev); |
1232 | #endif | 1229 | #endif |
1233 | } | 1230 | } |
@@ -1322,7 +1319,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) | |||
1322 | spin_lock_init(&im->lock); | 1319 | spin_lock_init(&im->lock); |
1323 | #ifdef CONFIG_IP_MULTICAST | 1320 | #ifdef CONFIG_IP_MULTICAST |
1324 | setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); | 1321 | setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); |
1325 | im->unsolicit_count = IGMP_Unsolicited_Report_Count; | 1322 | im->unsolicit_count = sysctl_igmp_qrv; |
1326 | #endif | 1323 | #endif |
1327 | 1324 | ||
1328 | im->next_rcu = in_dev->mc_list; | 1325 | im->next_rcu = in_dev->mc_list; |
@@ -1460,7 +1457,7 @@ void ip_mc_init_dev(struct in_device *in_dev) | |||
1460 | (unsigned long)in_dev); | 1457 | (unsigned long)in_dev); |
1461 | setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, | 1458 | setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, |
1462 | (unsigned long)in_dev); | 1459 | (unsigned long)in_dev); |
1463 | in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; | 1460 | in_dev->mr_qrv = sysctl_igmp_qrv; |
1464 | #endif | 1461 | #endif |
1465 | 1462 | ||
1466 | spin_lock_init(&in_dev->mc_tomb_lock); | 1463 | spin_lock_init(&in_dev->mc_tomb_lock); |
@@ -1474,6 +1471,9 @@ void ip_mc_up(struct in_device *in_dev) | |||
1474 | 1471 | ||
1475 | ASSERT_RTNL(); | 1472 | ASSERT_RTNL(); |
1476 | 1473 | ||
1474 | #ifdef CONFIG_IP_MULTICAST | ||
1475 | in_dev->mr_qrv = sysctl_igmp_qrv; | ||
1476 | #endif | ||
1477 | ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); | 1477 | ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); |
1478 | 1478 | ||
1479 | for_each_pmc_rtnl(in_dev, pmc) | 1479 | for_each_pmc_rtnl(in_dev, pmc) |
@@ -1540,7 +1540,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) | |||
1540 | */ | 1540 | */ |
1541 | int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; | 1541 | int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; |
1542 | int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; | 1542 | int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; |
1543 | 1543 | #ifdef CONFIG_IP_MULTICAST | |
1544 | int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; | ||
1545 | #endif | ||
1544 | 1546 | ||
1545 | static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, | 1547 | static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, |
1546 | __be32 *psfsrc) | 1548 | __be32 *psfsrc) |
@@ -1575,8 +1577,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, | |||
1575 | #ifdef CONFIG_IP_MULTICAST | 1577 | #ifdef CONFIG_IP_MULTICAST |
1576 | if (psf->sf_oldin && | 1578 | if (psf->sf_oldin && |
1577 | !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { | 1579 | !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { |
1578 | psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : | 1580 | psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; |
1579 | IGMP_Unsolicited_Report_Count; | ||
1580 | psf->sf_next = pmc->tomb; | 1581 | psf->sf_next = pmc->tomb; |
1581 | pmc->tomb = psf; | 1582 | pmc->tomb = psf; |
1582 | rv = 1; | 1583 | rv = 1; |
@@ -1639,8 +1640,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, | |||
1639 | /* filter mode change */ | 1640 | /* filter mode change */ |
1640 | pmc->sfmode = MCAST_INCLUDE; | 1641 | pmc->sfmode = MCAST_INCLUDE; |
1641 | #ifdef CONFIG_IP_MULTICAST | 1642 | #ifdef CONFIG_IP_MULTICAST |
1642 | pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : | 1643 | pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; |
1643 | IGMP_Unsolicited_Report_Count; | ||
1644 | in_dev->mr_ifc_count = pmc->crcount; | 1644 | in_dev->mr_ifc_count = pmc->crcount; |
1645 | for (psf = pmc->sources; psf; psf = psf->sf_next) | 1645 | for (psf = pmc->sources; psf; psf = psf->sf_next) |
1646 | psf->sf_crcount = 0; | 1646 | psf->sf_crcount = 0; |
@@ -1818,8 +1818,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, | |||
1818 | #ifdef CONFIG_IP_MULTICAST | 1818 | #ifdef CONFIG_IP_MULTICAST |
1819 | /* else no filters; keep old mode for reports */ | 1819 | /* else no filters; keep old mode for reports */ |
1820 | 1820 | ||
1821 | pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : | 1821 | pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; |
1822 | IGMP_Unsolicited_Report_Count; | ||
1823 | in_dev->mr_ifc_count = pmc->crcount; | 1822 | in_dev->mr_ifc_count = pmc->crcount; |
1824 | for (psf = pmc->sources; psf; psf = psf->sf_next) | 1823 | for (psf = pmc->sources; psf; psf = psf->sf_next) |
1825 | psf->sf_crcount = 0; | 1824 | psf->sf_crcount = 0; |
@@ -2539,7 +2538,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) | |||
2539 | querier = "NONE"; | 2538 | querier = "NONE"; |
2540 | #endif | 2539 | #endif |
2541 | 2540 | ||
2542 | if (rcu_dereference(state->in_dev->mc_list) == im) { | 2541 | if (rcu_access_pointer(state->in_dev->mc_list) == im) { |
2543 | seq_printf(seq, "%d\t%-10s: %5d %7s\n", | 2542 | seq_printf(seq, "%d\t%-10s: %5d %7s\n", |
2544 | state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); | 2543 | state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); |
2545 | } | 2544 | } |
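Most of the igmp.c hunks replace the hard-coded IGMP_Unsolicited_Report_Count with the per-interface robustness value, falling back to the new sysctl_igmp_qrv (default IGMP_Query_Robustness_Variable) through GCC's conditional-with-omitted-middle-operand extension, x ?: y. The short sketch below only illustrates that extension, which evaluates x once and yields it when non-zero; it builds with GCC or Clang and is not standard C.

#include <stdio.h>

/* Stand-ins for the per-interface value and the sysctl default. */
static int mr_qrv;                 /* learned from the querier, 0 if unknown */
static int sysctl_igmp_qrv = 2;

int main(void)
{
	/* GNU extension: "a ?: b" is "a ? a : b" with "a" evaluated once. */
	int count;

	mr_qrv = 0;
	count = mr_qrv ?: sysctl_igmp_qrv;
	printf("qrv unknown -> crcount = %d\n", count);   /* 2 */

	mr_qrv = 7;
	count = mr_qrv ?: sysctl_igmp_qrv;
	printf("qrv learned -> crcount = %d\n", count);   /* 7 */
	return 0;
}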
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 43116e8c8e13..9111a4e22155 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -229,7 +229,7 @@ begin: | |||
229 | } | 229 | } |
230 | } else if (score == hiscore && reuseport) { | 230 | } else if (score == hiscore && reuseport) { |
231 | matches++; | 231 | matches++; |
232 | if (((u64)phash * matches) >> 32 == 0) | 232 | if (reciprocal_scale(phash, matches) == 0) |
233 | result = sk; | 233 | result = sk; |
234 | phash = next_pseudo_random32(phash); | 234 | phash = next_pseudo_random32(phash); |
235 | } | 235 | } |
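Here, and again in ipt_CLUSTERIP.c further down, the open-coded ((u64)hash * n) >> 32 is replaced by the reciprocal_scale() helper, which wraps the same multiply-and-shift: it maps a uniformly distributed 32-bit value into the range [0, n) without a division. A user-space sketch of the mapping:

#include <stdint.h>
#include <stdio.h>

/* Same multiply-and-shift trick as the kernel helper: scales a
 * uniformly distributed 32-bit value into the range [0, ep_ro). */
static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	/* e.g. pick one of 4 reuseport sockets from a packet hash */
	uint32_t hash = 0x9e3779b9;

	printf("bucket = %u of 4\n", reciprocal_scale(hash, 4));
	return 0;
}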
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bd5f5928167d..241afd743d2c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -72,29 +72,10 @@ void inet_peer_base_init(struct inet_peer_base *bp) | |||
72 | { | 72 | { |
73 | bp->root = peer_avl_empty_rcu; | 73 | bp->root = peer_avl_empty_rcu; |
74 | seqlock_init(&bp->lock); | 74 | seqlock_init(&bp->lock); |
75 | bp->flush_seq = ~0U; | ||
76 | bp->total = 0; | 75 | bp->total = 0; |
77 | } | 76 | } |
78 | EXPORT_SYMBOL_GPL(inet_peer_base_init); | 77 | EXPORT_SYMBOL_GPL(inet_peer_base_init); |
79 | 78 | ||
80 | static atomic_t v4_seq = ATOMIC_INIT(0); | ||
81 | static atomic_t v6_seq = ATOMIC_INIT(0); | ||
82 | |||
83 | static atomic_t *inetpeer_seq_ptr(int family) | ||
84 | { | ||
85 | return (family == AF_INET ? &v4_seq : &v6_seq); | ||
86 | } | ||
87 | |||
88 | static inline void flush_check(struct inet_peer_base *base, int family) | ||
89 | { | ||
90 | atomic_t *fp = inetpeer_seq_ptr(family); | ||
91 | |||
92 | if (unlikely(base->flush_seq != atomic_read(fp))) { | ||
93 | inetpeer_invalidate_tree(base); | ||
94 | base->flush_seq = atomic_read(fp); | ||
95 | } | ||
96 | } | ||
97 | |||
98 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ | 79 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ |
99 | 80 | ||
100 | /* Exported for sysctl_net_ipv4. */ | 81 | /* Exported for sysctl_net_ipv4. */ |
@@ -444,8 +425,6 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, | |||
444 | unsigned int sequence; | 425 | unsigned int sequence; |
445 | int invalidated, gccnt = 0; | 426 | int invalidated, gccnt = 0; |
446 | 427 | ||
447 | flush_check(base, daddr->family); | ||
448 | |||
449 | /* Attempt a lockless lookup first. | 428 | /* Attempt a lockless lookup first. |
450 | * Because of a concurrent writer, we might not find an existing entry. | 429 | * Because of a concurrent writer, we might not find an existing entry. |
451 | */ | 430 | */ |
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 15f0e2bad7ad..2811cc18701a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
@@ -790,7 +790,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) | |||
790 | kfree(table); | 790 | kfree(table); |
791 | } | 791 | } |
792 | 792 | ||
793 | static void ip4_frags_ctl_register(void) | 793 | static void __init ip4_frags_ctl_register(void) |
794 | { | 794 | { |
795 | register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); | 795 | register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); |
796 | } | 796 | } |
@@ -804,7 +804,7 @@ static inline void ip4_frags_ns_ctl_unregister(struct net *net) | |||
804 | { | 804 | { |
805 | } | 805 | } |
806 | 806 | ||
807 | static inline void ip4_frags_ctl_register(void) | 807 | static inline void __init ip4_frags_ctl_register(void) |
808 | { | 808 | { |
809 | } | 809 | } |
810 | #endif | 810 | #endif |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9b842544aea3..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -239,7 +239,9 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, | |||
239 | tpi.seq = htonl(tunnel->o_seqno); | 239 | tpi.seq = htonl(tunnel->o_seqno); |
240 | 240 | ||
241 | /* Push GRE header. */ | 241 | /* Push GRE header. */ |
242 | gre_build_header(skb, &tpi, tunnel->hlen); | 242 | gre_build_header(skb, &tpi, tunnel->tun_hlen); |
243 | |||
244 | skb_set_inner_protocol(skb, tpi.proto); | ||
243 | 245 | ||
244 | ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); | 246 | ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); |
245 | } | 247 | } |
@@ -310,7 +312,7 @@ out: | |||
310 | static int ipgre_tunnel_ioctl(struct net_device *dev, | 312 | static int ipgre_tunnel_ioctl(struct net_device *dev, |
311 | struct ifreq *ifr, int cmd) | 313 | struct ifreq *ifr, int cmd) |
312 | { | 314 | { |
313 | int err = 0; | 315 | int err; |
314 | struct ip_tunnel_parm p; | 316 | struct ip_tunnel_parm p; |
315 | 317 | ||
316 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | 318 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
@@ -470,13 +472,18 @@ static void ipgre_tunnel_setup(struct net_device *dev) | |||
470 | static void __gre_tunnel_init(struct net_device *dev) | 472 | static void __gre_tunnel_init(struct net_device *dev) |
471 | { | 473 | { |
472 | struct ip_tunnel *tunnel; | 474 | struct ip_tunnel *tunnel; |
475 | int t_hlen; | ||
473 | 476 | ||
474 | tunnel = netdev_priv(dev); | 477 | tunnel = netdev_priv(dev); |
475 | tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); | 478 | tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); |
476 | tunnel->parms.iph.protocol = IPPROTO_GRE; | 479 | tunnel->parms.iph.protocol = IPPROTO_GRE; |
477 | 480 | ||
478 | dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; | 481 | tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; |
479 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; | 482 | |
483 | t_hlen = tunnel->hlen + sizeof(struct iphdr); | ||
484 | |||
485 | dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; | ||
486 | dev->mtu = ETH_DATA_LEN - t_hlen - 4; | ||
480 | 487 | ||
481 | dev->features |= GRE_FEATURES; | 488 | dev->features |= GRE_FEATURES; |
482 | dev->hw_features |= GRE_FEATURES; | 489 | dev->hw_features |= GRE_FEATURES; |
@@ -503,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) | |||
503 | memcpy(dev->broadcast, &iph->daddr, 4); | 510 | memcpy(dev->broadcast, &iph->daddr, 4); |
504 | 511 | ||
505 | dev->flags = IFF_NOARP; | 512 | dev->flags = IFF_NOARP; |
506 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 513 | netif_keep_dst(dev); |
507 | dev->addr_len = 4; | 514 | dev->addr_len = 4; |
508 | 515 | ||
509 | if (iph->daddr) { | 516 | if (iph->daddr) { |
@@ -628,6 +635,40 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], | |||
628 | parms->iph.frag_off = htons(IP_DF); | 635 | parms->iph.frag_off = htons(IP_DF); |
629 | } | 636 | } |
630 | 637 | ||
638 | /* This function returns true when ENCAP attributes are present in the nl msg */ | ||
639 | static bool ipgre_netlink_encap_parms(struct nlattr *data[], | ||
640 | struct ip_tunnel_encap *ipencap) | ||
641 | { | ||
642 | bool ret = false; | ||
643 | |||
644 | memset(ipencap, 0, sizeof(*ipencap)); | ||
645 | |||
646 | if (!data) | ||
647 | return ret; | ||
648 | |||
649 | if (data[IFLA_GRE_ENCAP_TYPE]) { | ||
650 | ret = true; | ||
651 | ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); | ||
652 | } | ||
653 | |||
654 | if (data[IFLA_GRE_ENCAP_FLAGS]) { | ||
655 | ret = true; | ||
656 | ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); | ||
657 | } | ||
658 | |||
659 | if (data[IFLA_GRE_ENCAP_SPORT]) { | ||
660 | ret = true; | ||
661 | ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]); | ||
662 | } | ||
663 | |||
664 | if (data[IFLA_GRE_ENCAP_DPORT]) { | ||
665 | ret = true; | ||
666 | ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]); | ||
667 | } | ||
668 | |||
669 | return ret; | ||
670 | } | ||
671 | |||
631 | static int gre_tap_init(struct net_device *dev) | 672 | static int gre_tap_init(struct net_device *dev) |
632 | { | 673 | { |
633 | __gre_tunnel_init(dev); | 674 | __gre_tunnel_init(dev); |
@@ -657,6 +698,15 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, | |||
657 | struct nlattr *tb[], struct nlattr *data[]) | 698 | struct nlattr *tb[], struct nlattr *data[]) |
658 | { | 699 | { |
659 | struct ip_tunnel_parm p; | 700 | struct ip_tunnel_parm p; |
701 | struct ip_tunnel_encap ipencap; | ||
702 | |||
703 | if (ipgre_netlink_encap_parms(data, &ipencap)) { | ||
704 | struct ip_tunnel *t = netdev_priv(dev); | ||
705 | int err = ip_tunnel_encap_setup(t, &ipencap); | ||
706 | |||
707 | if (err < 0) | ||
708 | return err; | ||
709 | } | ||
660 | 710 | ||
661 | ipgre_netlink_parms(data, tb, &p); | 711 | ipgre_netlink_parms(data, tb, &p); |
662 | return ip_tunnel_newlink(dev, tb, &p); | 712 | return ip_tunnel_newlink(dev, tb, &p); |
@@ -666,6 +716,15 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], | |||
666 | struct nlattr *data[]) | 716 | struct nlattr *data[]) |
667 | { | 717 | { |
668 | struct ip_tunnel_parm p; | 718 | struct ip_tunnel_parm p; |
719 | struct ip_tunnel_encap ipencap; | ||
720 | |||
721 | if (ipgre_netlink_encap_parms(data, &ipencap)) { | ||
722 | struct ip_tunnel *t = netdev_priv(dev); | ||
723 | int err = ip_tunnel_encap_setup(t, &ipencap); | ||
724 | |||
725 | if (err < 0) | ||
726 | return err; | ||
727 | } | ||
669 | 728 | ||
670 | ipgre_netlink_parms(data, tb, &p); | 729 | ipgre_netlink_parms(data, tb, &p); |
671 | return ip_tunnel_changelink(dev, tb, &p); | 730 | return ip_tunnel_changelink(dev, tb, &p); |
@@ -694,6 +753,14 @@ static size_t ipgre_get_size(const struct net_device *dev) | |||
694 | nla_total_size(1) + | 753 | nla_total_size(1) + |
695 | /* IFLA_GRE_PMTUDISC */ | 754 | /* IFLA_GRE_PMTUDISC */ |
696 | nla_total_size(1) + | 755 | nla_total_size(1) + |
756 | /* IFLA_GRE_ENCAP_TYPE */ | ||
757 | nla_total_size(2) + | ||
758 | /* IFLA_GRE_ENCAP_FLAGS */ | ||
759 | nla_total_size(2) + | ||
760 | /* IFLA_GRE_ENCAP_SPORT */ | ||
761 | nla_total_size(2) + | ||
762 | /* IFLA_GRE_ENCAP_DPORT */ | ||
763 | nla_total_size(2) + | ||
697 | 0; | 764 | 0; |
698 | } | 765 | } |
699 | 766 | ||
@@ -714,6 +781,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) | |||
714 | nla_put_u8(skb, IFLA_GRE_PMTUDISC, | 781 | nla_put_u8(skb, IFLA_GRE_PMTUDISC, |
715 | !!(p->iph.frag_off & htons(IP_DF)))) | 782 | !!(p->iph.frag_off & htons(IP_DF)))) |
716 | goto nla_put_failure; | 783 | goto nla_put_failure; |
784 | |||
785 | if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, | ||
786 | t->encap.type) || | ||
787 | nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT, | ||
788 | t->encap.sport) || | ||
789 | nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, | ||
790 | t->encap.dport) || | ||
791 | nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, | ||
792 | t->encap.flags)) | ||
793 | goto nla_put_failure; | ||
794 | |||
717 | return 0; | 795 | return 0; |
718 | 796 | ||
719 | nla_put_failure: | 797 | nla_put_failure: |
@@ -731,6 +809,10 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { | |||
731 | [IFLA_GRE_TTL] = { .type = NLA_U8 }, | 809 | [IFLA_GRE_TTL] = { .type = NLA_U8 }, |
732 | [IFLA_GRE_TOS] = { .type = NLA_U8 }, | 810 | [IFLA_GRE_TOS] = { .type = NLA_U8 }, |
733 | [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, | 811 | [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, |
812 | [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, | ||
813 | [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, | ||
814 | [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, | ||
815 | [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, | ||
734 | }; | 816 | }; |
735 | 817 | ||
736 | static struct rtnl_link_ops ipgre_link_ops __read_mostly = { | 818 | static struct rtnl_link_ops ipgre_link_ops __read_mostly = { |
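Beyond the tun_hlen/hlen split, the ip_gre.c hunks add IFLA_GRE_ENCAP_{TYPE,FLAGS,SPORT,DPORT} netlink attributes; when any of them is supplied on newlink or changelink, the values are handed to ip_tunnel_encap_setup() so the GRE device can be wrapped in an outer UDP (FOU or GUE) header. A rough sketch of the encapsulation parameters as implied by the parsing code above; the exact layout of struct ip_tunnel_encap and the numeric value of TUNNEL_ENCAP_GUE are assumptions here, not something this diff shows.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Assumed shape, mirroring the four u16 netlink attributes parsed by
 * ipgre_netlink_encap_parms() above. */
struct ip_tunnel_encap_sketch {
	uint16_t type;    /* TUNNEL_ENCAP_NONE / _FOU / _GUE */
	uint16_t flags;   /* e.g. a checksum-request flag */
	uint16_t sport;   /* 0 lets the kernel pick a per-flow source port */
	uint16_t dport;   /* UDP port the FOU/GUE receiver listens on */
};

int main(void)
{
	/* Hypothetical "GRE over GUE" parameters: leave sport at 0 so the
	 * transmit path derives it from the flow hash (udp_flow_src_port). */
	struct ip_tunnel_encap_sketch e;

	memset(&e, 0, sizeof(e));
	e.type = 2;       /* assumed numeric value of TUNNEL_ENCAP_GUE */
	e.dport = 5555;   /* byte order ignored for this sketch */

	printf("encap type %u, sport %u (auto), dport %u\n",
	       (unsigned)e.type, (unsigned)e.sport, (unsigned)e.dport);
	return 0;
}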
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ad382499bace..5b3d91be2db0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
@@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, | |||
87 | * NOTE: dopt cannot point to skb. | 87 | * NOTE: dopt cannot point to skb. |
88 | */ | 88 | */ |
89 | 89 | ||
90 | int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | 90 | int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, |
91 | const struct ip_options *sopt) | ||
91 | { | 92 | { |
92 | const struct ip_options *sopt; | ||
93 | unsigned char *sptr, *dptr; | 93 | unsigned char *sptr, *dptr; |
94 | int soffset, doffset; | 94 | int soffset, doffset; |
95 | int optlen; | 95 | int optlen; |
96 | 96 | ||
97 | memset(dopt, 0, sizeof(struct ip_options)); | 97 | memset(dopt, 0, sizeof(struct ip_options)); |
98 | 98 | ||
99 | sopt = &(IPCB(skb)->opt); | ||
100 | |||
101 | if (sopt->optlen == 0) | 99 | if (sopt->optlen == 0) |
102 | return 0; | 100 | return 0; |
103 | 101 | ||
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 215af2b155cb..e35b71289156 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -516,7 +516,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) | |||
516 | 516 | ||
517 | hlen = iph->ihl * 4; | 517 | hlen = iph->ihl * 4; |
518 | mtu = mtu - hlen; /* Size of data space */ | 518 | mtu = mtu - hlen; /* Size of data space */ |
519 | #ifdef CONFIG_BRIDGE_NETFILTER | 519 | #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) |
520 | if (skb->nf_bridge) | 520 | if (skb->nf_bridge) |
521 | mtu -= nf_bridge_mtu_reduction(skb); | 521 | mtu -= nf_bridge_mtu_reduction(skb); |
522 | #endif | 522 | #endif |
@@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { | |||
1522 | .uc_ttl = -1, | 1522 | .uc_ttl = -1, |
1523 | }; | 1523 | }; |
1524 | 1524 | ||
1525 | void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, | 1525 | void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, |
1526 | __be32 saddr, const struct ip_reply_arg *arg, | 1526 | const struct ip_options *sopt, |
1527 | __be32 daddr, __be32 saddr, | ||
1528 | const struct ip_reply_arg *arg, | ||
1527 | unsigned int len) | 1529 | unsigned int len) |
1528 | { | 1530 | { |
1529 | struct ip_options_data replyopts; | 1531 | struct ip_options_data replyopts; |
@@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, | |||
1534 | struct sock *sk; | 1536 | struct sock *sk; |
1535 | struct inet_sock *inet; | 1537 | struct inet_sock *inet; |
1536 | 1538 | ||
1537 | if (ip_options_echo(&replyopts.opt.opt, skb)) | 1539 | if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) |
1538 | return; | 1540 | return; |
1539 | 1541 | ||
1540 | ipc.addr = daddr; | 1542 | ipc.addr = daddr; |
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5cb830c78990..c373a9ad4555 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -303,7 +303,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, | |||
303 | } | 303 | } |
304 | /* dont let ip_call_ra_chain() use sk again */ | 304 | /* dont let ip_call_ra_chain() use sk again */ |
305 | ra->sk = NULL; | 305 | ra->sk = NULL; |
306 | rcu_assign_pointer(*rap, ra->next); | 306 | RCU_INIT_POINTER(*rap, ra->next); |
307 | spin_unlock_bh(&ip_ra_lock); | 307 | spin_unlock_bh(&ip_ra_lock); |
308 | 308 | ||
309 | if (ra->destructor) | 309 | if (ra->destructor) |
@@ -325,7 +325,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, | |||
325 | new_ra->sk = sk; | 325 | new_ra->sk = sk; |
326 | new_ra->destructor = destructor; | 326 | new_ra->destructor = destructor; |
327 | 327 | ||
328 | new_ra->next = ra; | 328 | RCU_INIT_POINTER(new_ra->next, ra); |
329 | rcu_assign_pointer(*rap, new_ra); | 329 | rcu_assign_pointer(*rap, new_ra); |
330 | sock_hold(sk); | 330 | sock_hold(sk); |
331 | spin_unlock_bh(&ip_ra_lock); | 331 | spin_unlock_bh(&ip_ra_lock); |
@@ -405,7 +405,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf | |||
405 | int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) | 405 | int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) |
406 | { | 406 | { |
407 | struct sock_exterr_skb *serr; | 407 | struct sock_exterr_skb *serr; |
408 | struct sk_buff *skb, *skb2; | 408 | struct sk_buff *skb; |
409 | DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); | 409 | DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); |
410 | struct { | 410 | struct { |
411 | struct sock_extended_err ee; | 411 | struct sock_extended_err ee; |
@@ -415,7 +415,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) | |||
415 | int copied; | 415 | int copied; |
416 | 416 | ||
417 | err = -EAGAIN; | 417 | err = -EAGAIN; |
418 | skb = skb_dequeue(&sk->sk_error_queue); | 418 | skb = sock_dequeue_err_skb(sk); |
419 | if (skb == NULL) | 419 | if (skb == NULL) |
420 | goto out; | 420 | goto out; |
421 | 421 | ||
@@ -462,17 +462,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) | |||
462 | msg->msg_flags |= MSG_ERRQUEUE; | 462 | msg->msg_flags |= MSG_ERRQUEUE; |
463 | err = copied; | 463 | err = copied; |
464 | 464 | ||
465 | /* Reset and regenerate socket error */ | ||
466 | spin_lock_bh(&sk->sk_error_queue.lock); | ||
467 | sk->sk_err = 0; | ||
468 | skb2 = skb_peek(&sk->sk_error_queue); | ||
469 | if (skb2 != NULL) { | ||
470 | sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; | ||
471 | spin_unlock_bh(&sk->sk_error_queue.lock); | ||
472 | sk->sk_error_report(sk); | ||
473 | } else | ||
474 | spin_unlock_bh(&sk->sk_error_queue.lock); | ||
475 | |||
476 | out_free_skb: | 465 | out_free_skb: |
477 | kfree_skb(skb); | 466 | kfree_skb(skb); |
478 | out: | 467 | out: |
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index bda4bb8ae260..0bb8e141eacc 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c | |||
@@ -55,6 +55,8 @@ | |||
55 | #include <net/net_namespace.h> | 55 | #include <net/net_namespace.h> |
56 | #include <net/netns/generic.h> | 56 | #include <net/netns/generic.h> |
57 | #include <net/rtnetlink.h> | 57 | #include <net/rtnetlink.h> |
58 | #include <net/udp.h> | ||
59 | #include <net/gue.h> | ||
58 | 60 | ||
59 | #if IS_ENABLED(CONFIG_IPV6) | 61 | #if IS_ENABLED(CONFIG_IPV6) |
60 | #include <net/ipv6.h> | 62 | #include <net/ipv6.h> |
@@ -487,6 +489,103 @@ drop: | |||
487 | } | 489 | } |
488 | EXPORT_SYMBOL_GPL(ip_tunnel_rcv); | 490 | EXPORT_SYMBOL_GPL(ip_tunnel_rcv); |
489 | 491 | ||
492 | static int ip_encap_hlen(struct ip_tunnel_encap *e) | ||
493 | { | ||
494 | switch (e->type) { | ||
495 | case TUNNEL_ENCAP_NONE: | ||
496 | return 0; | ||
497 | case TUNNEL_ENCAP_FOU: | ||
498 | return sizeof(struct udphdr); | ||
499 | case TUNNEL_ENCAP_GUE: | ||
500 | return sizeof(struct udphdr) + sizeof(struct guehdr); | ||
501 | default: | ||
502 | return -EINVAL; | ||
503 | } | ||
504 | } | ||
505 | |||
506 | int ip_tunnel_encap_setup(struct ip_tunnel *t, | ||
507 | struct ip_tunnel_encap *ipencap) | ||
508 | { | ||
509 | int hlen; | ||
510 | |||
511 | memset(&t->encap, 0, sizeof(t->encap)); | ||
512 | |||
513 | hlen = ip_encap_hlen(ipencap); | ||
514 | if (hlen < 0) | ||
515 | return hlen; | ||
516 | |||
517 | t->encap.type = ipencap->type; | ||
518 | t->encap.sport = ipencap->sport; | ||
519 | t->encap.dport = ipencap->dport; | ||
520 | t->encap.flags = ipencap->flags; | ||
521 | |||
522 | t->encap_hlen = hlen; | ||
523 | t->hlen = t->encap_hlen + t->tun_hlen; | ||
524 | |||
525 | return 0; | ||
526 | } | ||
527 | EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); | ||
528 | |||
529 | static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, | ||
530 | size_t hdr_len, u8 *protocol, struct flowi4 *fl4) | ||
531 | { | ||
532 | struct udphdr *uh; | ||
533 | __be16 sport; | ||
534 | bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); | ||
535 | int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; | ||
536 | |||
537 | skb = iptunnel_handle_offloads(skb, csum, type); | ||
538 | |||
539 | if (IS_ERR(skb)) | ||
540 | return PTR_ERR(skb); | ||
541 | |||
542 | /* Get length and hash before making space in skb */ | ||
543 | |||
544 | sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), | ||
545 | skb, 0, 0, false); | ||
546 | |||
547 | skb_push(skb, hdr_len); | ||
548 | |||
549 | skb_reset_transport_header(skb); | ||
550 | uh = udp_hdr(skb); | ||
551 | |||
552 | if (e->type == TUNNEL_ENCAP_GUE) { | ||
553 | struct guehdr *guehdr = (struct guehdr *)&uh[1]; | ||
554 | |||
555 | guehdr->version = 0; | ||
556 | guehdr->hlen = 0; | ||
557 | guehdr->flags = 0; | ||
558 | guehdr->next_hdr = *protocol; | ||
559 | } | ||
560 | |||
561 | uh->dest = e->dport; | ||
562 | uh->source = sport; | ||
563 | uh->len = htons(skb->len); | ||
564 | uh->check = 0; | ||
565 | udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, | ||
566 | fl4->saddr, fl4->daddr, skb->len); | ||
567 | |||
568 | *protocol = IPPROTO_UDP; | ||
569 | |||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, | ||
574 | u8 *protocol, struct flowi4 *fl4) | ||
575 | { | ||
576 | switch (t->encap.type) { | ||
577 | case TUNNEL_ENCAP_NONE: | ||
578 | return 0; | ||
579 | case TUNNEL_ENCAP_FOU: | ||
580 | case TUNNEL_ENCAP_GUE: | ||
581 | return fou_build_header(skb, &t->encap, t->encap_hlen, | ||
582 | protocol, fl4); | ||
583 | default: | ||
584 | return -EINVAL; | ||
585 | } | ||
586 | } | ||
587 | EXPORT_SYMBOL(ip_tunnel_encap); | ||
588 | |||
490 | static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, | 589 | static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, |
491 | struct rtable *rt, __be16 df) | 590 | struct rtable *rt, __be16 df) |
492 | { | 591 | { |
@@ -536,7 +635,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, | |||
536 | } | 635 | } |
537 | 636 | ||
538 | void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | 637 | void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, |
539 | const struct iphdr *tnl_params, const u8 protocol) | 638 | const struct iphdr *tnl_params, u8 protocol) |
540 | { | 639 | { |
541 | struct ip_tunnel *tunnel = netdev_priv(dev); | 640 | struct ip_tunnel *tunnel = netdev_priv(dev); |
542 | const struct iphdr *inner_iph; | 641 | const struct iphdr *inner_iph; |
@@ -617,6 +716,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | |||
617 | init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, | 716 | init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, |
618 | tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); | 717 | tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); |
619 | 718 | ||
719 | if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) | ||
720 | goto tx_error; | ||
721 | |||
620 | rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; | 722 | rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; |
621 | 723 | ||
622 | if (!rt) { | 724 | if (!rt) { |
@@ -670,7 +772,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | |||
670 | df |= (inner_iph->frag_off&htons(IP_DF)); | 772 | df |= (inner_iph->frag_off&htons(IP_DF)); |
671 | 773 | ||
672 | max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) | 774 | max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) |
673 | + rt->dst.header_len; | 775 | + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); |
674 | if (max_headroom > dev->needed_headroom) | 776 | if (max_headroom > dev->needed_headroom) |
675 | dev->needed_headroom = max_headroom; | 777 | dev->needed_headroom = max_headroom; |
676 | 778 | ||
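The new ip_tunnel_encap_setup()/ip_tunnel_encap() pair tracks two header-length components: tun_hlen for the tunnel's own header and encap_hlen for the optional outer UDP (FOU) or UDP plus GUE wrapper, with hlen being their sum. __gre_tunnel_init() above then derives needed_headroom and the device MTU from hlen plus the outer IP header. A quick arithmetic sketch, under the assumption of a 4-byte base GUE header (the other sizes are standard):

#include <stdio.h>

#define ETH_DATA_LEN  1500
#define IPHDR_LEN       20
#define UDPHDR_LEN       8
#define GUEHDR_LEN       4   /* assumed base GUE header size */

enum encap_type { ENCAP_NONE, ENCAP_FOU, ENCAP_GUE };

/* Mirrors ip_encap_hlen() above: extra bytes added by the UDP wrapper. */
static int encap_hlen(enum encap_type t)
{
	switch (t) {
	case ENCAP_NONE: return 0;
	case ENCAP_FOU:  return UDPHDR_LEN;
	case ENCAP_GUE:  return UDPHDR_LEN + GUEHDR_LEN;
	}
	return -1;
}

int main(void)
{
	/* GRE carrying a 4-byte key -> tun_hlen = 8, wrapped in GUE. */
	int tun_hlen = 8;
	int t_hlen = tun_hlen + encap_hlen(ENCAP_GUE) + IPHDR_LEN;

	/* Mirrors __gre_tunnel_init(): dev->mtu = ETH_DATA_LEN - t_hlen - 4 */
	printf("tunnel dev mtu = %d\n", ETH_DATA_LEN - t_hlen - 4);
	return 0;
}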
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e453cb724a95..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c | |||
@@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) | |||
364 | dev->iflink = 0; | 364 | dev->iflink = 0; |
365 | dev->addr_len = 4; | 365 | dev->addr_len = 4; |
366 | dev->features |= NETIF_F_LLTX; | 366 | dev->features |= NETIF_F_LLTX; |
367 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 367 | netif_keep_dst(dev); |
368 | 368 | ||
369 | return ip_tunnel_init(dev); | 369 | return ip_tunnel_init(dev); |
370 | } | 370 | } |
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 5bbef4fdcb43..648fa1490ea7 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c | |||
@@ -262,7 +262,8 @@ static int __init ic_open_devs(void) | |||
262 | /* wait for a carrier on at least one device */ | 262 | /* wait for a carrier on at least one device */ |
263 | start = jiffies; | 263 | start = jiffies; |
264 | next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); | 264 | next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); |
265 | while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { | 265 | while (time_before(jiffies, start + |
266 | msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) { | ||
266 | int wait, elapsed; | 267 | int wait, elapsed; |
267 | 268 | ||
268 | for_each_netdev(&init_net, dev) | 269 | for_each_netdev(&init_net, dev) |
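The ipconfig.c change swaps an open-coded jiffies comparison for the standard time_before() idiom. The helpers compare the signed difference of the two values, so the test keeps working even when jiffies, or the precomputed deadline, wraps around. A user-space sketch; the macro bodies mirror the usual linux/jiffies.h definitions, minus the type checking:

#include <stdio.h>

typedef unsigned long ulong;

/* Comparison on the signed difference stays correct across wraparound. */
#define time_after(a, b)   ((long)((b) - (a)) < 0)
#define time_before(a, b)  time_after(b, a)

int main(void)
{
	ulong start = (ulong)-5;            /* counter about to wrap */
	ulong timeout = 100;
	ulong deadline = start + timeout;   /* already wrapped to 95 */
	ulong now = 50;                     /* 55 ticks after start */

	printf("still waiting: %s\n",
	       time_before(now, deadline) ? "yes" : "no");   /* yes */
	return 0;
}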
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 62eaa005e146..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -224,6 +224,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
224 | if (IS_ERR(skb)) | 224 | if (IS_ERR(skb)) |
225 | goto out; | 225 | goto out; |
226 | 226 | ||
227 | skb_set_inner_ipproto(skb, IPPROTO_IPIP); | ||
228 | |||
227 | ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); | 229 | ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); |
228 | return NETDEV_TX_OK; | 230 | return NETDEV_TX_OK; |
229 | 231 | ||
@@ -287,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
287 | dev->iflink = 0; | 289 | dev->iflink = 0; |
288 | dev->addr_len = 4; | 290 | dev->addr_len = 4; |
289 | dev->features |= NETIF_F_LLTX; | 291 | dev->features |= NETIF_F_LLTX; |
290 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 292 | netif_keep_dst(dev); |
291 | 293 | ||
292 | dev->features |= IPIP_FEATURES; | 294 | dev->features |= IPIP_FEATURES; |
293 | dev->hw_features |= IPIP_FEATURES; | 295 | dev->hw_features |= IPIP_FEATURES; |
@@ -301,7 +303,8 @@ static int ipip_tunnel_init(struct net_device *dev) | |||
301 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | 303 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); |
302 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | 304 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); |
303 | 305 | ||
304 | tunnel->hlen = 0; | 306 | tunnel->tun_hlen = 0; |
307 | tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; | ||
305 | tunnel->parms.iph.protocol = IPPROTO_IPIP; | 308 | tunnel->parms.iph.protocol = IPPROTO_IPIP; |
306 | return ip_tunnel_init(dev); | 309 | return ip_tunnel_init(dev); |
307 | } | 310 | } |
@@ -340,10 +343,53 @@ static void ipip_netlink_parms(struct nlattr *data[], | |||
340 | parms->iph.frag_off = htons(IP_DF); | 343 | parms->iph.frag_off = htons(IP_DF); |
341 | } | 344 | } |
342 | 345 | ||
346 | /* This function returns true when ENCAP attributes are present in the nl msg */ | ||
347 | static bool ipip_netlink_encap_parms(struct nlattr *data[], | ||
348 | struct ip_tunnel_encap *ipencap) | ||
349 | { | ||
350 | bool ret = false; | ||
351 | |||
352 | memset(ipencap, 0, sizeof(*ipencap)); | ||
353 | |||
354 | if (!data) | ||
355 | return ret; | ||
356 | |||
357 | if (data[IFLA_IPTUN_ENCAP_TYPE]) { | ||
358 | ret = true; | ||
359 | ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); | ||
360 | } | ||
361 | |||
362 | if (data[IFLA_IPTUN_ENCAP_FLAGS]) { | ||
363 | ret = true; | ||
364 | ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); | ||
365 | } | ||
366 | |||
367 | if (data[IFLA_IPTUN_ENCAP_SPORT]) { | ||
368 | ret = true; | ||
369 | ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); | ||
370 | } | ||
371 | |||
372 | if (data[IFLA_IPTUN_ENCAP_DPORT]) { | ||
373 | ret = true; | ||
374 | ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); | ||
375 | } | ||
376 | |||
377 | return ret; | ||
378 | } | ||
379 | |||
343 | static int ipip_newlink(struct net *src_net, struct net_device *dev, | 380 | static int ipip_newlink(struct net *src_net, struct net_device *dev, |
344 | struct nlattr *tb[], struct nlattr *data[]) | 381 | struct nlattr *tb[], struct nlattr *data[]) |
345 | { | 382 | { |
346 | struct ip_tunnel_parm p; | 383 | struct ip_tunnel_parm p; |
384 | struct ip_tunnel_encap ipencap; | ||
385 | |||
386 | if (ipip_netlink_encap_parms(data, &ipencap)) { | ||
387 | struct ip_tunnel *t = netdev_priv(dev); | ||
388 | int err = ip_tunnel_encap_setup(t, &ipencap); | ||
389 | |||
390 | if (err < 0) | ||
391 | return err; | ||
392 | } | ||
347 | 393 | ||
348 | ipip_netlink_parms(data, &p); | 394 | ipip_netlink_parms(data, &p); |
349 | return ip_tunnel_newlink(dev, tb, &p); | 395 | return ip_tunnel_newlink(dev, tb, &p); |
@@ -353,6 +399,15 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], | |||
353 | struct nlattr *data[]) | 399 | struct nlattr *data[]) |
354 | { | 400 | { |
355 | struct ip_tunnel_parm p; | 401 | struct ip_tunnel_parm p; |
402 | struct ip_tunnel_encap ipencap; | ||
403 | |||
404 | if (ipip_netlink_encap_parms(data, &ipencap)) { | ||
405 | struct ip_tunnel *t = netdev_priv(dev); | ||
406 | int err = ip_tunnel_encap_setup(t, &ipencap); | ||
407 | |||
408 | if (err < 0) | ||
409 | return err; | ||
410 | } | ||
356 | 411 | ||
357 | ipip_netlink_parms(data, &p); | 412 | ipip_netlink_parms(data, &p); |
358 | 413 | ||
@@ -378,6 +433,14 @@ static size_t ipip_get_size(const struct net_device *dev) | |||
378 | nla_total_size(1) + | 433 | nla_total_size(1) + |
379 | /* IFLA_IPTUN_PMTUDISC */ | 434 | /* IFLA_IPTUN_PMTUDISC */ |
380 | nla_total_size(1) + | 435 | nla_total_size(1) + |
436 | /* IFLA_IPTUN_ENCAP_TYPE */ | ||
437 | nla_total_size(2) + | ||
438 | /* IFLA_IPTUN_ENCAP_FLAGS */ | ||
439 | nla_total_size(2) + | ||
440 | /* IFLA_IPTUN_ENCAP_SPORT */ | ||
441 | nla_total_size(2) + | ||
442 | /* IFLA_IPTUN_ENCAP_DPORT */ | ||
443 | nla_total_size(2) + | ||
381 | 0; | 444 | 0; |
382 | } | 445 | } |
383 | 446 | ||
@@ -394,6 +457,17 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) | |||
394 | nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, | 457 | nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, |
395 | !!(parm->iph.frag_off & htons(IP_DF)))) | 458 | !!(parm->iph.frag_off & htons(IP_DF)))) |
396 | goto nla_put_failure; | 459 | goto nla_put_failure; |
460 | |||
461 | if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, | ||
462 | tunnel->encap.type) || | ||
463 | nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, | ||
464 | tunnel->encap.sport) || | ||
465 | nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, | ||
466 | tunnel->encap.dport) || | ||
467 | nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, | ||
468 | tunnel->encap.flags)) | ||
469 | goto nla_put_failure; | ||
470 | |||
397 | return 0; | 471 | return 0; |
398 | 472 | ||
399 | nla_put_failure: | 473 | nla_put_failure: |
@@ -407,6 +481,10 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { | |||
407 | [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, | 481 | [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, |
408 | [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, | 482 | [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, |
409 | [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, | 483 | [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, |
484 | [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, | ||
485 | [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, | ||
486 | [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, | ||
487 | [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, | ||
410 | }; | 488 | }; |
411 | 489 | ||
412 | static struct rtnl_link_ops ipip_link_ops __read_mostly = { | 490 | static struct rtnl_link_ops ipip_link_ops __read_mostly = { |
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 7cbcaf4f0194..4c019d5c3f57 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -61,18 +61,13 @@ config NFT_CHAIN_ROUTE_IPV4 | |||
61 | fields such as the source, destination, type of service and | 61 | fields such as the source, destination, type of service and |
62 | the packet mark. | 62 | the packet mark. |
63 | 63 | ||
64 | config NFT_CHAIN_NAT_IPV4 | 64 | config NF_REJECT_IPV4 |
65 | depends on NF_TABLES_IPV4 | 65 | tristate "IPv4 packet rejection" |
66 | depends on NF_NAT_IPV4 && NFT_NAT | 66 | default m if NETFILTER_ADVANCED=n |
67 | tristate "IPv4 nf_tables nat chain support" | ||
68 | help | ||
69 | This option enables the "nat" chain for IPv4 in nf_tables. This | ||
70 | chain type is used to perform Network Address Translation (NAT) | ||
71 | packet transformations such as the source, destination address and | ||
72 | source and destination ports. | ||
73 | 67 | ||
74 | config NFT_REJECT_IPV4 | 68 | config NFT_REJECT_IPV4 |
75 | depends on NF_TABLES_IPV4 | 69 | depends on NF_TABLES_IPV4 |
70 | select NF_REJECT_IPV4 | ||
76 | default NFT_REJECT | 71 | default NFT_REJECT |
77 | tristate | 72 | tristate |
78 | 73 | ||
@@ -94,6 +89,30 @@ config NF_NAT_IPV4 | |||
94 | 89 | ||
95 | if NF_NAT_IPV4 | 90 | if NF_NAT_IPV4 |
96 | 91 | ||
92 | config NFT_CHAIN_NAT_IPV4 | ||
93 | depends on NF_TABLES_IPV4 | ||
94 | tristate "IPv4 nf_tables nat chain support" | ||
95 | help | ||
96 | This option enables the "nat" chain for IPv4 in nf_tables. This | ||
97 | chain type is used to perform Network Address Translation (NAT) | ||
98 | packet transformations such as the source, destination address and | ||
99 | source and destination ports. | ||
100 | |||
101 | config NF_NAT_MASQUERADE_IPV4 | ||
102 | tristate "IPv4 masquerade support" | ||
103 | help | ||
104 | This is the kernel functionality to provide NAT in the masquerade | ||
105 | flavour (automatic source address selection). | ||
106 | |||
107 | config NFT_MASQ_IPV4 | ||
108 | tristate "IPv4 masquerading support for nf_tables" | ||
109 | depends on NF_TABLES_IPV4 | ||
110 | depends on NFT_MASQ | ||
111 | select NF_NAT_MASQUERADE_IPV4 | ||
112 | help | ||
113 | This is the expression that provides IPv4 masquerading support for | ||
114 | nf_tables. | ||
115 | |||
97 | config NF_NAT_SNMP_BASIC | 116 | config NF_NAT_SNMP_BASIC |
98 | tristate "Basic SNMP-ALG support" | 117 | tristate "Basic SNMP-ALG support" |
99 | depends on NF_CONNTRACK_SNMP | 118 | depends on NF_CONNTRACK_SNMP |
@@ -194,6 +213,7 @@ config IP_NF_FILTER | |||
194 | config IP_NF_TARGET_REJECT | 213 | config IP_NF_TARGET_REJECT |
195 | tristate "REJECT target support" | 214 | tristate "REJECT target support" |
196 | depends on IP_NF_FILTER | 215 | depends on IP_NF_FILTER |
216 | select NF_REJECT_IPV4 | ||
197 | default m if NETFILTER_ADVANCED=n | 217 | default m if NETFILTER_ADVANCED=n |
198 | help | 218 | help |
199 | The REJECT target allows a filtering rule to specify that an ICMP | 219 | The REJECT target allows a filtering rule to specify that an ICMP |
@@ -234,6 +254,7 @@ if IP_NF_NAT | |||
234 | 254 | ||
235 | config IP_NF_TARGET_MASQUERADE | 255 | config IP_NF_TARGET_MASQUERADE |
236 | tristate "MASQUERADE target support" | 256 | tristate "MASQUERADE target support" |
257 | select NF_NAT_MASQUERADE_IPV4 | ||
237 | default m if NETFILTER_ADVANCED=n | 258 | default m if NETFILTER_ADVANCED=n |
238 | help | 259 | help |
239 | Masquerading is a special case of NAT: all outgoing connections are | 260 | Masquerading is a special case of NAT: all outgoing connections are |
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index edf4af32e9f2..f4cef5af0969 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -23,10 +23,14 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o | |||
23 | obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o | 23 | obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o |
24 | obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o | 24 | obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o |
25 | 25 | ||
26 | # reject | ||
27 | obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o | ||
28 | |||
26 | # NAT helpers (nf_conntrack) | 29 | # NAT helpers (nf_conntrack) |
27 | obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o | 30 | obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o |
28 | obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o | 31 | obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o |
29 | obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o | 32 | obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o |
33 | obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o | ||
30 | 34 | ||
31 | # NAT protocols (nf_nat) | 35 | # NAT protocols (nf_nat) |
32 | obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o | 36 | obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o |
@@ -35,6 +39,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o | |||
35 | obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o | 39 | obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o |
36 | obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o | 40 | obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o |
37 | obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o | 41 | obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o |
42 | obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o | ||
38 | obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o | 43 | obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o |
39 | 44 | ||
40 | # generic IP tables | 45 | # generic IP tables |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2510c02c2d21..e90f83a3415b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
@@ -285,7 +285,7 @@ clusterip_hashfn(const struct sk_buff *skb, | |||
285 | } | 285 | } |
286 | 286 | ||
287 | /* node numbers are 1..n, not 0..n */ | 287 | /* node numbers are 1..n, not 0..n */ |
288 | return (((u64)hashval * config->num_total_nodes) >> 32) + 1; | 288 | return reciprocal_scale(hashval, config->num_total_nodes) + 1; |
289 | } | 289 | } |
290 | 290 | ||
291 | static inline int | 291 | static inline int |
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 00352ce0f0de..da7f02a0b868 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/netfilter_ipv4.h> | 22 | #include <linux/netfilter_ipv4.h> |
23 | #include <linux/netfilter/x_tables.h> | 23 | #include <linux/netfilter/x_tables.h> |
24 | #include <net/netfilter/nf_nat.h> | 24 | #include <net/netfilter/nf_nat.h> |
25 | #include <net/netfilter/ipv4/nf_nat_masquerade.h> | ||
25 | 26 | ||
26 | MODULE_LICENSE("GPL"); | 27 | MODULE_LICENSE("GPL"); |
27 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); | 28 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); |
@@ -46,103 +47,17 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) | |||
46 | static unsigned int | 47 | static unsigned int |
47 | masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) | 48 | masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) |
48 | { | 49 | { |
49 | struct nf_conn *ct; | 50 | struct nf_nat_range range; |
50 | struct nf_conn_nat *nat; | ||
51 | enum ip_conntrack_info ctinfo; | ||
52 | struct nf_nat_range newrange; | ||
53 | const struct nf_nat_ipv4_multi_range_compat *mr; | 51 | const struct nf_nat_ipv4_multi_range_compat *mr; |
54 | const struct rtable *rt; | ||
55 | __be32 newsrc, nh; | ||
56 | |||
57 | NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); | ||
58 | |||
59 | ct = nf_ct_get(skb, &ctinfo); | ||
60 | nat = nfct_nat(ct); | ||
61 | |||
62 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || | ||
63 | ctinfo == IP_CT_RELATED_REPLY)); | ||
64 | |||
65 | /* Source address is 0.0.0.0 - locally generated packet that is | ||
66 | * probably not supposed to be masqueraded. | ||
67 | */ | ||
68 | if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) | ||
69 | return NF_ACCEPT; | ||
70 | 52 | ||
71 | mr = par->targinfo; | 53 | mr = par->targinfo; |
72 | rt = skb_rtable(skb); | 54 | range.flags = mr->range[0].flags; |
73 | nh = rt_nexthop(rt, ip_hdr(skb)->daddr); | 55 | range.min_proto = mr->range[0].min; |
74 | newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); | 56 | range.max_proto = mr->range[0].max; |
75 | if (!newsrc) { | ||
76 | pr_info("%s ate my IP address\n", par->out->name); | ||
77 | return NF_DROP; | ||
78 | } | ||
79 | |||
80 | nat->masq_index = par->out->ifindex; | ||
81 | |||
82 | /* Transfer from original range. */ | ||
83 | memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); | ||
84 | memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); | ||
85 | newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; | ||
86 | newrange.min_addr.ip = newsrc; | ||
87 | newrange.max_addr.ip = newsrc; | ||
88 | newrange.min_proto = mr->range[0].min; | ||
89 | newrange.max_proto = mr->range[0].max; | ||
90 | 57 | ||
91 | /* Hand modified range to generic setup. */ | 58 | return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out); |
92 | return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); | ||
93 | } | 59 | } |
94 | 60 | ||
95 | static int | ||
96 | device_cmp(struct nf_conn *i, void *ifindex) | ||
97 | { | ||
98 | const struct nf_conn_nat *nat = nfct_nat(i); | ||
99 | |||
100 | if (!nat) | ||
101 | return 0; | ||
102 | if (nf_ct_l3num(i) != NFPROTO_IPV4) | ||
103 | return 0; | ||
104 | return nat->masq_index == (int)(long)ifindex; | ||
105 | } | ||
106 | |||
107 | static int masq_device_event(struct notifier_block *this, | ||
108 | unsigned long event, | ||
109 | void *ptr) | ||
110 | { | ||
111 | const struct net_device *dev = netdev_notifier_info_to_dev(ptr); | ||
112 | struct net *net = dev_net(dev); | ||
113 | |||
114 | if (event == NETDEV_DOWN) { | ||
115 | /* Device was downed. Search entire table for | ||
116 | conntracks which were associated with that device, | ||
117 | and forget them. */ | ||
118 | NF_CT_ASSERT(dev->ifindex != 0); | ||
119 | |||
120 | nf_ct_iterate_cleanup(net, device_cmp, | ||
121 | (void *)(long)dev->ifindex, 0, 0); | ||
122 | } | ||
123 | |||
124 | return NOTIFY_DONE; | ||
125 | } | ||
126 | |||
127 | static int masq_inet_event(struct notifier_block *this, | ||
128 | unsigned long event, | ||
129 | void *ptr) | ||
130 | { | ||
131 | struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; | ||
132 | struct netdev_notifier_info info; | ||
133 | |||
134 | netdev_notifier_info_init(&info, dev); | ||
135 | return masq_device_event(this, event, &info); | ||
136 | } | ||
137 | |||
138 | static struct notifier_block masq_dev_notifier = { | ||
139 | .notifier_call = masq_device_event, | ||
140 | }; | ||
141 | |||
142 | static struct notifier_block masq_inet_notifier = { | ||
143 | .notifier_call = masq_inet_event, | ||
144 | }; | ||
145 | |||
146 | static struct xt_target masquerade_tg_reg __read_mostly = { | 61 | static struct xt_target masquerade_tg_reg __read_mostly = { |
147 | .name = "MASQUERADE", | 62 | .name = "MASQUERADE", |
148 | .family = NFPROTO_IPV4, | 63 | .family = NFPROTO_IPV4, |
@@ -160,12 +75,8 @@ static int __init masquerade_tg_init(void) | |||
160 | 75 | ||
161 | ret = xt_register_target(&masquerade_tg_reg); | 76 | ret = xt_register_target(&masquerade_tg_reg); |
162 | 77 | ||
163 | if (ret == 0) { | 78 | if (ret == 0) |
164 | /* Register for device down reports */ | 79 | nf_nat_masquerade_ipv4_register_notifier(); |
165 | register_netdevice_notifier(&masq_dev_notifier); | ||
166 | /* Register IP address change reports */ | ||
167 | register_inetaddr_notifier(&masq_inet_notifier); | ||
168 | } | ||
169 | 80 | ||
170 | return ret; | 81 | return ret; |
171 | } | 82 | } |
@@ -173,8 +84,7 @@ static int __init masquerade_tg_init(void) | |||
173 | static void __exit masquerade_tg_exit(void) | 84 | static void __exit masquerade_tg_exit(void) |
174 | { | 85 | { |
175 | xt_unregister_target(&masquerade_tg_reg); | 86 | xt_unregister_target(&masquerade_tg_reg); |
176 | unregister_netdevice_notifier(&masq_dev_notifier); | 87 | nf_nat_masquerade_ipv4_unregister_notifier(); |
177 | unregister_inetaddr_notifier(&masq_inet_notifier); | ||
178 | } | 88 | } |
179 | 89 | ||
180 | module_init(masquerade_tg_init); | 90 | module_init(masquerade_tg_init); |
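The MASQUERADE target is reduced to building an nf_nat_range from the first entry of the legacy multi-range and handing it to the new shared nf_nat_masquerade_ipv4() helper; the netdevice and inetaddr notifier plumbing that used to live here moves behind nf_nat_masquerade_ipv4_register_notifier() and its unregister counterpart. Below is a conceptual, user-space sketch of the source selection the helper is expected to perform, based on the logic removed above; all types and the address lookup are simplified stand-ins, not kernel API.

#include <stdint.h>
#include <stdio.h>

struct nat_range_sketch {
	uint32_t min_ip, max_ip;       /* both pinned to the oif address */
	uint16_t min_port, max_port;   /* copied from the rule's first range */
	int map_ips;                   /* stands in for NF_NAT_RANGE_MAP_IPS */
};

static uint32_t outgoing_if_addr(const char *oif)
{
	(void)oif;
	return 0xC0A80001u;            /* pretend the oif owns 192.168.0.1 */
}

/* Rewrite the source to the outgoing interface's address (the removed
 * code used inet_select_addr() for this) and keep the port range. */
static struct nat_range_sketch masquerade(const char *oif,
					  uint16_t min_port, uint16_t max_port)
{
	struct nat_range_sketch r = {0};
	uint32_t src = outgoing_if_addr(oif);

	r.min_ip = r.max_ip = src;
	r.min_port = min_port;
	r.max_port = max_port;
	r.map_ips = 1;
	return r;
}

int main(void)
{
	struct nat_range_sketch r = masquerade("eth0", 1024, 65535);

	printf("masquerade to %u.%u.%u.%u, ports %u-%u\n",
	       (unsigned)(r.min_ip >> 24), (unsigned)((r.min_ip >> 16) & 0xff),
	       (unsigned)((r.min_ip >> 8) & 0xff), (unsigned)(r.min_ip & 0xff),
	       (unsigned)r.min_port, (unsigned)r.max_port);
	return 0;
}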
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 5b6e0df4ccff..8f48f5517e33 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
@@ -20,7 +20,7 @@ | |||
20 | #include <linux/netfilter/x_tables.h> | 20 | #include <linux/netfilter/x_tables.h> |
21 | #include <linux/netfilter_ipv4/ip_tables.h> | 21 | #include <linux/netfilter_ipv4/ip_tables.h> |
22 | #include <linux/netfilter_ipv4/ipt_REJECT.h> | 22 | #include <linux/netfilter_ipv4/ipt_REJECT.h> |
23 | #ifdef CONFIG_BRIDGE_NETFILTER | 23 | #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) |
24 | #include <linux/netfilter_bridge.h> | 24 | #include <linux/netfilter_bridge.h> |
25 | #endif | 25 | #endif |
26 | 26 | ||
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index f1787c04a4dd..6b67d7e9a75d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c | |||
@@ -28,222 +28,57 @@ static const struct xt_table nf_nat_ipv4_table = { | |||
28 | .af = NFPROTO_IPV4, | 28 | .af = NFPROTO_IPV4, |
29 | }; | 29 | }; |
30 | 30 | ||
31 | static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) | 31 | static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, |
32 | { | 32 | struct sk_buff *skb, |
33 | /* Force range to this IP; let proto decide mapping for | 33 | const struct net_device *in, |
34 | * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). | 34 | const struct net_device *out, |
35 | */ | 35 | struct nf_conn *ct) |
36 | struct nf_nat_range range; | ||
37 | |||
38 | range.flags = 0; | ||
39 | pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, | ||
40 | HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? | ||
41 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : | ||
42 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); | ||
43 | |||
44 | return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); | ||
45 | } | ||
46 | |||
47 | static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, | ||
48 | const struct net_device *in, | ||
49 | const struct net_device *out, | ||
50 | struct nf_conn *ct) | ||
51 | { | 36 | { |
52 | struct net *net = nf_ct_net(ct); | 37 | struct net *net = nf_ct_net(ct); |
53 | unsigned int ret; | ||
54 | 38 | ||
55 | ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); | 39 | return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table); |
56 | if (ret == NF_ACCEPT) { | ||
57 | if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) | ||
58 | ret = alloc_null_binding(ct, hooknum); | ||
59 | } | ||
60 | return ret; | ||
61 | } | 40 | } |
62 | 41 | ||
63 | static unsigned int | 42 | static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, |
64 | nf_nat_ipv4_fn(const struct nf_hook_ops *ops, | 43 | struct sk_buff *skb, |
65 | struct sk_buff *skb, | 44 | const struct net_device *in, |
66 | const struct net_device *in, | 45 | const struct net_device *out, |
67 | const struct net_device *out, | 46 | int (*okfn)(struct sk_buff *)) |
68 | int (*okfn)(struct sk_buff *)) | ||
69 | { | 47 | { |
70 | struct nf_conn *ct; | 48 | return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain); |
71 | enum ip_conntrack_info ctinfo; | ||
72 | struct nf_conn_nat *nat; | ||
73 | /* maniptype == SRC for postrouting. */ | ||
74 | enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); | ||
75 | |||
76 | /* We never see fragments: conntrack defrags on pre-routing | ||
77 | * and local-out, and nf_nat_out protects post-routing. | ||
78 | */ | ||
79 | NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); | ||
80 | |||
81 | ct = nf_ct_get(skb, &ctinfo); | ||
82 | /* Can't track? It's not due to stress, or conntrack would | ||
83 | * have dropped it. Hence it's the user's responsibilty to | ||
84 | * packet filter it out, or implement conntrack/NAT for that | ||
85 | * protocol. 8) --RR | ||
86 | */ | ||
87 | if (!ct) | ||
88 | return NF_ACCEPT; | ||
89 | |||
90 | /* Don't try to NAT if this packet is not conntracked */ | ||
91 | if (nf_ct_is_untracked(ct)) | ||
92 | return NF_ACCEPT; | ||
93 | |||
94 | nat = nf_ct_nat_ext_add(ct); | ||
95 | if (nat == NULL) | ||
96 | return NF_ACCEPT; | ||
97 | |||
98 | switch (ctinfo) { | ||
99 | case IP_CT_RELATED: | ||
100 | case IP_CT_RELATED_REPLY: | ||
101 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { | ||
102 | if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, | ||
103 | ops->hooknum)) | ||
104 | return NF_DROP; | ||
105 | else | ||
106 | return NF_ACCEPT; | ||
107 | } | ||
108 | /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ | ||
109 | case IP_CT_NEW: | ||
110 | /* Seen it before? This can happen for loopback, retrans, | ||
111 | * or local packets. | ||
112 | */ | ||
113 | if (!nf_nat_initialized(ct, maniptype)) { | ||
114 | unsigned int ret; | ||
115 | |||
116 | ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); | ||
117 | if (ret != NF_ACCEPT) | ||
118 | return ret; | ||
119 | } else { | ||
120 | pr_debug("Already setup manip %s for ct %p\n", | ||
121 | maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", | ||
122 | ct); | ||
123 | if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) | ||
124 | goto oif_changed; | ||
125 | } | ||
126 | break; | ||
127 | |||
128 | default: | ||
129 | /* ESTABLISHED */ | ||
130 | NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || | ||
131 | ctinfo == IP_CT_ESTABLISHED_REPLY); | ||
132 | if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) | ||
133 | goto oif_changed; | ||
134 | } | ||
135 | |||
136 | return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); | ||
137 | |||
138 | oif_changed: | ||
139 | nf_ct_kill_acct(ct, ctinfo, skb); | ||
140 | return NF_DROP; | ||
141 | } | 49 | } |
142 | 50 | ||
143 | static unsigned int | 51 | static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, |
144 | nf_nat_ipv4_in(const struct nf_hook_ops *ops, | 52 | struct sk_buff *skb, |
145 | struct sk_buff *skb, | 53 | const struct net_device *in, |
146 | const struct net_device *in, | 54 | const struct net_device *out, |
147 | const struct net_device *out, | 55 | int (*okfn)(struct sk_buff *)) |
148 | int (*okfn)(struct sk_buff *)) | ||
149 | { | 56 | { |
150 | unsigned int ret; | 57 | return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain); |
151 | __be32 daddr = ip_hdr(skb)->daddr; | ||
152 | |||
153 | ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); | ||
154 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
155 | daddr != ip_hdr(skb)->daddr) | ||
156 | skb_dst_drop(skb); | ||
157 | |||
158 | return ret; | ||
159 | } | 58 | } |
160 | 59 | ||
161 | static unsigned int | 60 | static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, |
162 | nf_nat_ipv4_out(const struct nf_hook_ops *ops, | 61 | struct sk_buff *skb, |
163 | struct sk_buff *skb, | 62 | const struct net_device *in, |
164 | const struct net_device *in, | 63 | const struct net_device *out, |
165 | const struct net_device *out, | 64 | int (*okfn)(struct sk_buff *)) |
166 | int (*okfn)(struct sk_buff *)) | ||
167 | { | 65 | { |
168 | #ifdef CONFIG_XFRM | 66 | return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain); |
169 | const struct nf_conn *ct; | ||
170 | enum ip_conntrack_info ctinfo; | ||
171 | int err; | ||
172 | #endif | ||
173 | unsigned int ret; | ||
174 | |||
175 | /* root is playing with raw sockets. */ | ||
176 | if (skb->len < sizeof(struct iphdr) || | ||
177 | ip_hdrlen(skb) < sizeof(struct iphdr)) | ||
178 | return NF_ACCEPT; | ||
179 | |||
180 | ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); | ||
181 | #ifdef CONFIG_XFRM | ||
182 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
183 | !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | ||
184 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
185 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
186 | |||
187 | if ((ct->tuplehash[dir].tuple.src.u3.ip != | ||
188 | ct->tuplehash[!dir].tuple.dst.u3.ip) || | ||
189 | (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && | ||
190 | ct->tuplehash[dir].tuple.src.u.all != | ||
191 | ct->tuplehash[!dir].tuple.dst.u.all)) { | ||
192 | err = nf_xfrm_me_harder(skb, AF_INET); | ||
193 | if (err < 0) | ||
194 | ret = NF_DROP_ERR(err); | ||
195 | } | ||
196 | } | ||
197 | #endif | ||
198 | return ret; | ||
199 | } | 67 | } |
200 | 68 | ||
201 | static unsigned int | 69 | static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, |
202 | nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, | 70 | struct sk_buff *skb, |
203 | struct sk_buff *skb, | 71 | const struct net_device *in, |
204 | const struct net_device *in, | 72 | const struct net_device *out, |
205 | const struct net_device *out, | 73 | int (*okfn)(struct sk_buff *)) |
206 | int (*okfn)(struct sk_buff *)) | ||
207 | { | 74 | { |
208 | const struct nf_conn *ct; | 75 | return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain); |
209 | enum ip_conntrack_info ctinfo; | ||
210 | unsigned int ret; | ||
211 | int err; | ||
212 | |||
213 | /* root is playing with raw sockets. */ | ||
214 | if (skb->len < sizeof(struct iphdr) || | ||
215 | ip_hdrlen(skb) < sizeof(struct iphdr)) | ||
216 | return NF_ACCEPT; | ||
217 | |||
218 | ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); | ||
219 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
220 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
221 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
222 | |||
223 | if (ct->tuplehash[dir].tuple.dst.u3.ip != | ||
224 | ct->tuplehash[!dir].tuple.src.u3.ip) { | ||
225 | err = ip_route_me_harder(skb, RTN_UNSPEC); | ||
226 | if (err < 0) | ||
227 | ret = NF_DROP_ERR(err); | ||
228 | } | ||
229 | #ifdef CONFIG_XFRM | ||
230 | else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | ||
231 | ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && | ||
232 | ct->tuplehash[dir].tuple.dst.u.all != | ||
233 | ct->tuplehash[!dir].tuple.src.u.all) { | ||
234 | err = nf_xfrm_me_harder(skb, AF_INET); | ||
235 | if (err < 0) | ||
236 | ret = NF_DROP_ERR(err); | ||
237 | } | ||
238 | #endif | ||
239 | } | ||
240 | return ret; | ||
241 | } | 76 | } |
242 | 77 | ||
243 | static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { | 78 | static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { |
244 | /* Before packet filtering, change destination */ | 79 | /* Before packet filtering, change destination */ |
245 | { | 80 | { |
246 | .hook = nf_nat_ipv4_in, | 81 | .hook = iptable_nat_ipv4_in, |
247 | .owner = THIS_MODULE, | 82 | .owner = THIS_MODULE, |
248 | .pf = NFPROTO_IPV4, | 83 | .pf = NFPROTO_IPV4, |
249 | .hooknum = NF_INET_PRE_ROUTING, | 84 | .hooknum = NF_INET_PRE_ROUTING, |
@@ -251,7 +86,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { | |||
251 | }, | 86 | }, |
252 | /* After packet filtering, change source */ | 87 | /* After packet filtering, change source */ |
253 | { | 88 | { |
254 | .hook = nf_nat_ipv4_out, | 89 | .hook = iptable_nat_ipv4_out, |
255 | .owner = THIS_MODULE, | 90 | .owner = THIS_MODULE, |
256 | .pf = NFPROTO_IPV4, | 91 | .pf = NFPROTO_IPV4, |
257 | .hooknum = NF_INET_POST_ROUTING, | 92 | .hooknum = NF_INET_POST_ROUTING, |
@@ -259,7 +94,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { | |||
259 | }, | 94 | }, |
260 | /* Before packet filtering, change destination */ | 95 | /* Before packet filtering, change destination */ |
261 | { | 96 | { |
262 | .hook = nf_nat_ipv4_local_fn, | 97 | .hook = iptable_nat_ipv4_local_fn, |
263 | .owner = THIS_MODULE, | 98 | .owner = THIS_MODULE, |
264 | .pf = NFPROTO_IPV4, | 99 | .pf = NFPROTO_IPV4, |
265 | .hooknum = NF_INET_LOCAL_OUT, | 100 | .hooknum = NF_INET_LOCAL_OUT, |
@@ -267,7 +102,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { | |||
267 | }, | 102 | }, |
268 | /* After packet filtering, change source */ | 103 | /* After packet filtering, change source */ |
269 | { | 104 | { |
270 | .hook = nf_nat_ipv4_fn, | 105 | .hook = iptable_nat_ipv4_fn, |
271 | .owner = THIS_MODULE, | 106 | .owner = THIS_MODULE, |
272 | .pf = NFPROTO_IPV4, | 107 | .pf = NFPROTO_IPV4, |
273 | .hooknum = NF_INET_LOCAL_IN, | 108 | .hooknum = NF_INET_LOCAL_IN, |
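The iptable_nat hooks above are reduced to thin wrappers: the table-specific work collapses into the single iptable_nat_do_chain callback, while conntrack lookup, ICMP error translation, null-binding allocation and oif tracking stay in the shared IPv4 NAT core exported from nf_nat_l3proto_ipv4.c further down. A minimal sketch of the wrapper shape, using hypothetical example_* names and assuming the usual netfilter headers:

    static unsigned int example_do_chain(const struct nf_hook_ops *ops,
                                         struct sk_buff *skb,
                                         const struct net_device *in,
                                         const struct net_device *out,
                                         struct nf_conn *ct)
    {
            /* walk whatever NAT rule set this backend owns */
            return NF_ACCEPT;
    }

    static unsigned int example_hook(const struct nf_hook_ops *ops,
                                     struct sk_buff *skb,
                                     const struct net_device *in,
                                     const struct net_device *out,
                                     int (*okfn)(struct sk_buff *))
    {
            /* shared core handles conntrack, ICMP errors and null bindings */
            return nf_nat_ipv4_fn(ops, skb, in, out, example_do_chain);
    }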
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 76bd1aef257f..7e5ca6f2d0cd 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c | |||
@@ -50,7 +50,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, | |||
50 | zone = nf_ct_zone((struct nf_conn *)skb->nfct); | 50 | zone = nf_ct_zone((struct nf_conn *)skb->nfct); |
51 | #endif | 51 | #endif |
52 | 52 | ||
53 | #ifdef CONFIG_BRIDGE_NETFILTER | 53 | #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) |
54 | if (skb->nf_bridge && | 54 | if (skb->nf_bridge && |
55 | skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) | 55 | skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) |
56 | return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; | 56 | return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; |
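Switching from #ifdef CONFIG_BRIDGE_NETFILTER to IS_ENABLED(CONFIG_BRIDGE_NETFILTER) keeps this branch compiled when the option is either built-in or modular; the plain #ifdef only covers the built-in case. Purely as an illustration of what the macro test is equivalent to:

    /* IS_ENABLED(CONFIG_FOO) is true for both CONFIG_FOO=y and CONFIG_FOO=m */
    #if defined(CONFIG_BRIDGE_NETFILTER) || defined(CONFIG_BRIDGE_NETFILTER_MODULE)
            /* bridge-netfilter specific defrag-user selection */
    #endif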
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 14f5ccd06337..fc37711e11f3 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | |||
@@ -254,6 +254,205 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, | |||
254 | } | 254 | } |
255 | EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); | 255 | EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); |
256 | 256 | ||
257 | unsigned int | ||
258 | nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, | ||
259 | const struct net_device *in, const struct net_device *out, | ||
260 | unsigned int (*do_chain)(const struct nf_hook_ops *ops, | ||
261 | struct sk_buff *skb, | ||
262 | const struct net_device *in, | ||
263 | const struct net_device *out, | ||
264 | struct nf_conn *ct)) | ||
265 | { | ||
266 | struct nf_conn *ct; | ||
267 | enum ip_conntrack_info ctinfo; | ||
268 | struct nf_conn_nat *nat; | ||
269 | /* maniptype == SRC for postrouting. */ | ||
270 | enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); | ||
271 | |||
272 | /* We never see fragments: conntrack defrags on pre-routing | ||
273 | * and local-out, and nf_nat_out protects post-routing. | ||
274 | */ | ||
275 | NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); | ||
276 | |||
277 | ct = nf_ct_get(skb, &ctinfo); | ||
278 | /* Can't track? It's not due to stress, or conntrack would | ||
279 | * have dropped it. Hence it's the user's responsibility to | ||
280 | * packet filter it out, or implement conntrack/NAT for that | ||
281 | * protocol. 8) --RR | ||
282 | */ | ||
283 | if (!ct) | ||
284 | return NF_ACCEPT; | ||
285 | |||
286 | /* Don't try to NAT if this packet is not conntracked */ | ||
287 | if (nf_ct_is_untracked(ct)) | ||
288 | return NF_ACCEPT; | ||
289 | |||
290 | nat = nf_ct_nat_ext_add(ct); | ||
291 | if (nat == NULL) | ||
292 | return NF_ACCEPT; | ||
293 | |||
294 | switch (ctinfo) { | ||
295 | case IP_CT_RELATED: | ||
296 | case IP_CT_RELATED_REPLY: | ||
297 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { | ||
298 | if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, | ||
299 | ops->hooknum)) | ||
300 | return NF_DROP; | ||
301 | else | ||
302 | return NF_ACCEPT; | ||
303 | } | ||
304 | /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ | ||
305 | case IP_CT_NEW: | ||
306 | /* Seen it before? This can happen for loopback, retrans, | ||
307 | * or local packets. | ||
308 | */ | ||
309 | if (!nf_nat_initialized(ct, maniptype)) { | ||
310 | unsigned int ret; | ||
311 | |||
312 | ret = do_chain(ops, skb, in, out, ct); | ||
313 | if (ret != NF_ACCEPT) | ||
314 | return ret; | ||
315 | |||
316 | if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) | ||
317 | break; | ||
318 | |||
319 | ret = nf_nat_alloc_null_binding(ct, ops->hooknum); | ||
320 | if (ret != NF_ACCEPT) | ||
321 | return ret; | ||
322 | } else { | ||
323 | pr_debug("Already setup manip %s for ct %p\n", | ||
324 | maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", | ||
325 | ct); | ||
326 | if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) | ||
327 | goto oif_changed; | ||
328 | } | ||
329 | break; | ||
330 | |||
331 | default: | ||
332 | /* ESTABLISHED */ | ||
333 | NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || | ||
334 | ctinfo == IP_CT_ESTABLISHED_REPLY); | ||
335 | if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) | ||
336 | goto oif_changed; | ||
337 | } | ||
338 | |||
339 | return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); | ||
340 | |||
341 | oif_changed: | ||
342 | nf_ct_kill_acct(ct, ctinfo, skb); | ||
343 | return NF_DROP; | ||
344 | } | ||
345 | EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); | ||
346 | |||
347 | unsigned int | ||
348 | nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, | ||
349 | const struct net_device *in, const struct net_device *out, | ||
350 | unsigned int (*do_chain)(const struct nf_hook_ops *ops, | ||
351 | struct sk_buff *skb, | ||
352 | const struct net_device *in, | ||
353 | const struct net_device *out, | ||
354 | struct nf_conn *ct)) | ||
355 | { | ||
356 | unsigned int ret; | ||
357 | __be32 daddr = ip_hdr(skb)->daddr; | ||
358 | |||
359 | ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); | ||
360 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
361 | daddr != ip_hdr(skb)->daddr) | ||
362 | skb_dst_drop(skb); | ||
363 | |||
364 | return ret; | ||
365 | } | ||
366 | EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); | ||
367 | |||
368 | unsigned int | ||
369 | nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, | ||
370 | const struct net_device *in, const struct net_device *out, | ||
371 | unsigned int (*do_chain)(const struct nf_hook_ops *ops, | ||
372 | struct sk_buff *skb, | ||
373 | const struct net_device *in, | ||
374 | const struct net_device *out, | ||
375 | struct nf_conn *ct)) | ||
376 | { | ||
377 | #ifdef CONFIG_XFRM | ||
378 | const struct nf_conn *ct; | ||
379 | enum ip_conntrack_info ctinfo; | ||
380 | int err; | ||
381 | #endif | ||
382 | unsigned int ret; | ||
383 | |||
384 | /* root is playing with raw sockets. */ | ||
385 | if (skb->len < sizeof(struct iphdr) || | ||
386 | ip_hdrlen(skb) < sizeof(struct iphdr)) | ||
387 | return NF_ACCEPT; | ||
388 | |||
389 | ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); | ||
390 | #ifdef CONFIG_XFRM | ||
391 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
392 | !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | ||
393 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
394 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
395 | |||
396 | if ((ct->tuplehash[dir].tuple.src.u3.ip != | ||
397 | ct->tuplehash[!dir].tuple.dst.u3.ip) || | ||
398 | (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && | ||
399 | ct->tuplehash[dir].tuple.src.u.all != | ||
400 | ct->tuplehash[!dir].tuple.dst.u.all)) { | ||
401 | err = nf_xfrm_me_harder(skb, AF_INET); | ||
402 | if (err < 0) | ||
403 | ret = NF_DROP_ERR(err); | ||
404 | } | ||
405 | } | ||
406 | #endif | ||
407 | return ret; | ||
408 | } | ||
409 | EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); | ||
410 | |||
411 | unsigned int | ||
412 | nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, | ||
413 | const struct net_device *in, const struct net_device *out, | ||
414 | unsigned int (*do_chain)(const struct nf_hook_ops *ops, | ||
415 | struct sk_buff *skb, | ||
416 | const struct net_device *in, | ||
417 | const struct net_device *out, | ||
418 | struct nf_conn *ct)) | ||
419 | { | ||
420 | const struct nf_conn *ct; | ||
421 | enum ip_conntrack_info ctinfo; | ||
422 | unsigned int ret; | ||
423 | int err; | ||
424 | |||
425 | /* root is playing with raw sockets. */ | ||
426 | if (skb->len < sizeof(struct iphdr) || | ||
427 | ip_hdrlen(skb) < sizeof(struct iphdr)) | ||
428 | return NF_ACCEPT; | ||
429 | |||
430 | ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); | ||
431 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
432 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
433 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
434 | |||
435 | if (ct->tuplehash[dir].tuple.dst.u3.ip != | ||
436 | ct->tuplehash[!dir].tuple.src.u3.ip) { | ||
437 | err = ip_route_me_harder(skb, RTN_UNSPEC); | ||
438 | if (err < 0) | ||
439 | ret = NF_DROP_ERR(err); | ||
440 | } | ||
441 | #ifdef CONFIG_XFRM | ||
442 | else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | ||
443 | ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && | ||
444 | ct->tuplehash[dir].tuple.dst.u.all != | ||
445 | ct->tuplehash[!dir].tuple.src.u.all) { | ||
446 | err = nf_xfrm_me_harder(skb, AF_INET); | ||
447 | if (err < 0) | ||
448 | ret = NF_DROP_ERR(err); | ||
449 | } | ||
450 | #endif | ||
451 | } | ||
452 | return ret; | ||
453 | } | ||
454 | EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn); | ||
455 | |||
257 | static int __init nf_nat_l3proto_ipv4_init(void) | 456 | static int __init nf_nat_l3proto_ipv4_init(void) |
258 | { | 457 | { |
259 | int err; | 458 | int err; |
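The exported helpers key their behaviour off HOOK2MANIP(ops->hooknum): hooks that run before the routing decision manipulate the destination (DNAT), hooks that run after it manipulate the source (SNAT), which is what the "maniptype == SRC for postrouting" comment alludes to. A rough, illustrative-only restatement of that mapping (not the actual macro):

    static inline enum nf_nat_manip_type example_hook2manip(unsigned int hooknum)
    {
            /* POST_ROUTING and LOCAL_IN rewrite the source address (SNAT),
             * PRE_ROUTING and LOCAL_OUT rewrite the destination (DNAT).
             */
            if (hooknum == NF_INET_POST_ROUTING || hooknum == NF_INET_LOCAL_IN)
                    return NF_NAT_MANIP_SRC;
            return NF_NAT_MANIP_DST;
    }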
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c new file mode 100644 index 000000000000..c6eb42100e9a --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | */ | ||
8 | |||
9 | #include <linux/types.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/atomic.h> | ||
12 | #include <linux/inetdevice.h> | ||
13 | #include <linux/ip.h> | ||
14 | #include <linux/timer.h> | ||
15 | #include <linux/netfilter.h> | ||
16 | #include <net/protocol.h> | ||
17 | #include <net/ip.h> | ||
18 | #include <net/checksum.h> | ||
19 | #include <net/route.h> | ||
20 | #include <linux/netfilter_ipv4.h> | ||
21 | #include <linux/netfilter/x_tables.h> | ||
22 | #include <net/netfilter/nf_nat.h> | ||
23 | #include <net/netfilter/ipv4/nf_nat_masquerade.h> | ||
24 | |||
25 | unsigned int | ||
26 | nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, | ||
27 | const struct nf_nat_range *range, | ||
28 | const struct net_device *out) | ||
29 | { | ||
30 | struct nf_conn *ct; | ||
31 | struct nf_conn_nat *nat; | ||
32 | enum ip_conntrack_info ctinfo; | ||
33 | struct nf_nat_range newrange; | ||
34 | const struct rtable *rt; | ||
35 | __be32 newsrc, nh; | ||
36 | |||
37 | NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); | ||
38 | |||
39 | ct = nf_ct_get(skb, &ctinfo); | ||
40 | nat = nfct_nat(ct); | ||
41 | |||
42 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || | ||
43 | ctinfo == IP_CT_RELATED_REPLY)); | ||
44 | |||
45 | /* Source address is 0.0.0.0 - locally generated packet that is | ||
46 | * probably not supposed to be masqueraded. | ||
47 | */ | ||
48 | if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) | ||
49 | return NF_ACCEPT; | ||
50 | |||
51 | rt = skb_rtable(skb); | ||
52 | nh = rt_nexthop(rt, ip_hdr(skb)->daddr); | ||
53 | newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); | ||
54 | if (!newsrc) { | ||
55 | pr_info("%s ate my IP address\n", out->name); | ||
56 | return NF_DROP; | ||
57 | } | ||
58 | |||
59 | nat->masq_index = out->ifindex; | ||
60 | |||
61 | /* Transfer from original range. */ | ||
62 | memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); | ||
63 | memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); | ||
64 | newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; | ||
65 | newrange.min_addr.ip = newsrc; | ||
66 | newrange.max_addr.ip = newsrc; | ||
67 | newrange.min_proto = range->min_proto; | ||
68 | newrange.max_proto = range->max_proto; | ||
69 | |||
70 | /* Hand modified range to generic setup. */ | ||
71 | return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); | ||
72 | } | ||
73 | EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); | ||
74 | |||
75 | static int device_cmp(struct nf_conn *i, void *ifindex) | ||
76 | { | ||
77 | const struct nf_conn_nat *nat = nfct_nat(i); | ||
78 | |||
79 | if (!nat) | ||
80 | return 0; | ||
81 | if (nf_ct_l3num(i) != NFPROTO_IPV4) | ||
82 | return 0; | ||
83 | return nat->masq_index == (int)(long)ifindex; | ||
84 | } | ||
85 | |||
86 | static int masq_device_event(struct notifier_block *this, | ||
87 | unsigned long event, | ||
88 | void *ptr) | ||
89 | { | ||
90 | const struct net_device *dev = netdev_notifier_info_to_dev(ptr); | ||
91 | struct net *net = dev_net(dev); | ||
92 | |||
93 | if (event == NETDEV_DOWN) { | ||
94 | /* Device was downed. Search entire table for | ||
95 | * conntracks which were associated with that device, | ||
96 | * and forget them. | ||
97 | */ | ||
98 | NF_CT_ASSERT(dev->ifindex != 0); | ||
99 | |||
100 | nf_ct_iterate_cleanup(net, device_cmp, | ||
101 | (void *)(long)dev->ifindex, 0, 0); | ||
102 | } | ||
103 | |||
104 | return NOTIFY_DONE; | ||
105 | } | ||
106 | |||
107 | static int masq_inet_event(struct notifier_block *this, | ||
108 | unsigned long event, | ||
109 | void *ptr) | ||
110 | { | ||
111 | struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; | ||
112 | struct netdev_notifier_info info; | ||
113 | |||
114 | netdev_notifier_info_init(&info, dev); | ||
115 | return masq_device_event(this, event, &info); | ||
116 | } | ||
117 | |||
118 | static struct notifier_block masq_dev_notifier = { | ||
119 | .notifier_call = masq_device_event, | ||
120 | }; | ||
121 | |||
122 | static struct notifier_block masq_inet_notifier = { | ||
123 | .notifier_call = masq_inet_event, | ||
124 | }; | ||
125 | |||
126 | static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); | ||
127 | |||
128 | void nf_nat_masquerade_ipv4_register_notifier(void) | ||
129 | { | ||
130 | /* check if the notifier was already set */ | ||
131 | if (atomic_inc_return(&masquerade_notifier_refcount) > 1) | ||
132 | return; | ||
133 | |||
134 | /* Register for device down reports */ | ||
135 | register_netdevice_notifier(&masq_dev_notifier); | ||
136 | /* Register IP address change reports */ | ||
137 | register_inetaddr_notifier(&masq_inet_notifier); | ||
138 | } | ||
139 | EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); | ||
140 | |||
141 | void nf_nat_masquerade_ipv4_unregister_notifier(void) | ||
142 | { | ||
143 | /* check if the notifier still has clients */ | ||
144 | if (atomic_dec_return(&masquerade_notifier_refcount) > 0) | ||
145 | return; | ||
146 | |||
147 | unregister_netdevice_notifier(&masq_dev_notifier); | ||
148 | unregister_inetaddr_notifier(&masq_inet_notifier); | ||
149 | } | ||
150 | EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); | ||
151 | |||
152 | MODULE_LICENSE("GPL"); | ||
153 | MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); | ||
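nf_nat_masquerade_ipv4() becomes a library call shared by the iptables MASQUERADE target and the nft masq expression added below: it resolves the outgoing interface's address for the packet's next hop and hands nf_nat_setup_info() a range pinned to that single address, with the caller only supplying policy flags (port randomisation and the like). A hypothetical POST_ROUTING caller might look like this; illustrative only, the real call sites are the xt target and the nft expression:

    static unsigned int example_masq_eval(struct sk_buff *skb,
                                          const struct net_device *out)
    {
            struct nf_nat_range range = {
                    /* optional policy flags carried over into the new range */
                    .flags = NF_NAT_RANGE_PROTO_RANDOM_FULLY,
            };

            /* assumes a conntracked packet in NF_INET_POST_ROUTING */
            return nf_nat_masquerade_ipv4(skb, NF_INET_POST_ROUTING, &range, out);
    }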
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c new file mode 100644 index 000000000000..b023b4eb1a96 --- /dev/null +++ b/net/ipv4/netfilter/nf_reject_ipv4.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | */ | ||
8 | |||
9 | #include <net/ip.h> | ||
10 | #include <net/tcp.h> | ||
11 | #include <net/route.h> | ||
12 | #include <net/dst.h> | ||
13 | #include <linux/netfilter_ipv4.h> | ||
14 | |||
15 | /* Send RST reply */ | ||
16 | void nf_send_reset(struct sk_buff *oldskb, int hook) | ||
17 | { | ||
18 | struct sk_buff *nskb; | ||
19 | const struct iphdr *oiph; | ||
20 | struct iphdr *niph; | ||
21 | const struct tcphdr *oth; | ||
22 | struct tcphdr _otcph, *tcph; | ||
23 | |||
24 | /* IP header checks: fragment. */ | ||
25 | if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) | ||
26 | return; | ||
27 | |||
28 | oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), | ||
29 | sizeof(_otcph), &_otcph); | ||
30 | if (oth == NULL) | ||
31 | return; | ||
32 | |||
33 | /* No RST for RST. */ | ||
34 | if (oth->rst) | ||
35 | return; | ||
36 | |||
37 | if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) | ||
38 | return; | ||
39 | |||
40 | /* Check checksum */ | ||
41 | if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) | ||
42 | return; | ||
43 | oiph = ip_hdr(oldskb); | ||
44 | |||
45 | nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + | ||
46 | LL_MAX_HEADER, GFP_ATOMIC); | ||
47 | if (!nskb) | ||
48 | return; | ||
49 | |||
50 | skb_reserve(nskb, LL_MAX_HEADER); | ||
51 | |||
52 | skb_reset_network_header(nskb); | ||
53 | niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); | ||
54 | niph->version = 4; | ||
55 | niph->ihl = sizeof(struct iphdr) / 4; | ||
56 | niph->tos = 0; | ||
57 | niph->id = 0; | ||
58 | niph->frag_off = htons(IP_DF); | ||
59 | niph->protocol = IPPROTO_TCP; | ||
60 | niph->check = 0; | ||
61 | niph->saddr = oiph->daddr; | ||
62 | niph->daddr = oiph->saddr; | ||
63 | |||
64 | skb_reset_transport_header(nskb); | ||
65 | tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); | ||
66 | memset(tcph, 0, sizeof(*tcph)); | ||
67 | tcph->source = oth->dest; | ||
68 | tcph->dest = oth->source; | ||
69 | tcph->doff = sizeof(struct tcphdr) / 4; | ||
70 | |||
71 | if (oth->ack) | ||
72 | tcph->seq = oth->ack_seq; | ||
73 | else { | ||
74 | tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + | ||
75 | oldskb->len - ip_hdrlen(oldskb) - | ||
76 | (oth->doff << 2)); | ||
77 | tcph->ack = 1; | ||
78 | } | ||
79 | |||
80 | tcph->rst = 1; | ||
81 | tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, | ||
82 | niph->daddr, 0); | ||
83 | nskb->ip_summed = CHECKSUM_PARTIAL; | ||
84 | nskb->csum_start = (unsigned char *)tcph - nskb->head; | ||
85 | nskb->csum_offset = offsetof(struct tcphdr, check); | ||
86 | |||
87 | /* ip_route_me_harder expects skb->dst to be set */ | ||
88 | skb_dst_set_noref(nskb, skb_dst(oldskb)); | ||
89 | |||
90 | nskb->protocol = htons(ETH_P_IP); | ||
91 | if (ip_route_me_harder(nskb, RTN_UNSPEC)) | ||
92 | goto free_nskb; | ||
93 | |||
94 | niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); | ||
95 | |||
96 | /* "Never happens" */ | ||
97 | if (nskb->len > dst_mtu(skb_dst(nskb))) | ||
98 | goto free_nskb; | ||
99 | |||
100 | nf_ct_attach(nskb, oldskb); | ||
101 | |||
102 | #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) | ||
103 | /* If we use ip_local_out for bridged traffic, the MAC source on | ||
104 | * the RST will be ours, instead of the destination's. This confuses | ||
105 | * some routers/firewalls, and they drop the packet. So we need to | ||
106 | * build the eth header using the original destination's MAC as the | ||
107 | * source, and send the RST packet directly. | ||
108 | */ | ||
109 | if (oldskb->nf_bridge) { | ||
110 | struct ethhdr *oeth = eth_hdr(oldskb); | ||
111 | nskb->dev = oldskb->nf_bridge->physindev; | ||
112 | niph->tot_len = htons(nskb->len); | ||
113 | ip_send_check(niph); | ||
114 | if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), | ||
115 | oeth->h_source, oeth->h_dest, nskb->len) < 0) | ||
116 | goto free_nskb; | ||
117 | dev_queue_xmit(nskb); | ||
118 | } else | ||
119 | #endif | ||
120 | ip_local_out(nskb); | ||
121 | |||
122 | return; | ||
123 | |||
124 | free_nskb: | ||
125 | kfree_skb(nskb); | ||
126 | } | ||
127 | EXPORT_SYMBOL_GPL(nf_send_reset); | ||
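The seq/ack selection above follows the RFC 793 reset-generation rule, restated here for reference; the code counts SYN and FIN as one octet each when computing the segment length:

    /*
     * If the offending segment had ACK set:
     *         <SEQ = SEG.ACK>                      <CTL = RST>
     * otherwise:
     *         <SEQ = 0> <ACK = SEG.SEQ + SEG.LEN>  <CTL = RST, ACK>
     */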
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 3964157d826c..df547bf50078 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c | |||
@@ -26,136 +26,53 @@ | |||
26 | #include <net/netfilter/nf_nat_l3proto.h> | 26 | #include <net/netfilter/nf_nat_l3proto.h> |
27 | #include <net/ip.h> | 27 | #include <net/ip.h> |
28 | 28 | ||
29 | /* | 29 | static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, |
30 | * NAT chains | 30 | struct sk_buff *skb, |
31 | */ | 31 | const struct net_device *in, |
32 | 32 | const struct net_device *out, | |
33 | static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, | 33 | struct nf_conn *ct) |
34 | struct sk_buff *skb, | ||
35 | const struct net_device *in, | ||
36 | const struct net_device *out, | ||
37 | int (*okfn)(struct sk_buff *)) | ||
38 | { | 34 | { |
39 | enum ip_conntrack_info ctinfo; | ||
40 | struct nf_conn *ct = nf_ct_get(skb, &ctinfo); | ||
41 | struct nf_conn_nat *nat; | ||
42 | enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); | ||
43 | struct nft_pktinfo pkt; | 35 | struct nft_pktinfo pkt; |
44 | unsigned int ret; | ||
45 | |||
46 | if (ct == NULL || nf_ct_is_untracked(ct)) | ||
47 | return NF_ACCEPT; | ||
48 | |||
49 | NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); | ||
50 | |||
51 | nat = nf_ct_nat_ext_add(ct); | ||
52 | if (nat == NULL) | ||
53 | return NF_ACCEPT; | ||
54 | |||
55 | switch (ctinfo) { | ||
56 | case IP_CT_RELATED: | ||
57 | case IP_CT_RELATED + IP_CT_IS_REPLY: | ||
58 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { | ||
59 | if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, | ||
60 | ops->hooknum)) | ||
61 | return NF_DROP; | ||
62 | else | ||
63 | return NF_ACCEPT; | ||
64 | } | ||
65 | /* Fall through */ | ||
66 | case IP_CT_NEW: | ||
67 | if (nf_nat_initialized(ct, maniptype)) | ||
68 | break; | ||
69 | 36 | ||
70 | nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); | 37 | nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); |
71 | 38 | ||
72 | ret = nft_do_chain(&pkt, ops); | 39 | return nft_do_chain(&pkt, ops); |
73 | if (ret != NF_ACCEPT) | ||
74 | return ret; | ||
75 | if (!nf_nat_initialized(ct, maniptype)) { | ||
76 | ret = nf_nat_alloc_null_binding(ct, ops->hooknum); | ||
77 | if (ret != NF_ACCEPT) | ||
78 | return ret; | ||
79 | } | ||
80 | default: | ||
81 | break; | ||
82 | } | ||
83 | |||
84 | return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); | ||
85 | } | 40 | } |
86 | 41 | ||
87 | static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, | 42 | static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, |
88 | struct sk_buff *skb, | 43 | struct sk_buff *skb, |
89 | const struct net_device *in, | 44 | const struct net_device *in, |
90 | const struct net_device *out, | 45 | const struct net_device *out, |
91 | int (*okfn)(struct sk_buff *)) | 46 | int (*okfn)(struct sk_buff *)) |
92 | { | 47 | { |
93 | __be32 daddr = ip_hdr(skb)->daddr; | 48 | return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain); |
94 | unsigned int ret; | ||
95 | |||
96 | ret = nf_nat_fn(ops, skb, in, out, okfn); | ||
97 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
98 | ip_hdr(skb)->daddr != daddr) { | ||
99 | skb_dst_drop(skb); | ||
100 | } | ||
101 | return ret; | ||
102 | } | 49 | } |
103 | 50 | ||
104 | static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, | 51 | static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, |
105 | struct sk_buff *skb, | 52 | struct sk_buff *skb, |
106 | const struct net_device *in, | 53 | const struct net_device *in, |
107 | const struct net_device *out, | 54 | const struct net_device *out, |
108 | int (*okfn)(struct sk_buff *)) | 55 | int (*okfn)(struct sk_buff *)) |
109 | { | 56 | { |
110 | enum ip_conntrack_info ctinfo __maybe_unused; | 57 | return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain); |
111 | const struct nf_conn *ct __maybe_unused; | ||
112 | unsigned int ret; | ||
113 | |||
114 | ret = nf_nat_fn(ops, skb, in, out, okfn); | ||
115 | #ifdef CONFIG_XFRM | ||
116 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
117 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
118 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
119 | |||
120 | if (ct->tuplehash[dir].tuple.src.u3.ip != | ||
121 | ct->tuplehash[!dir].tuple.dst.u3.ip || | ||
122 | ct->tuplehash[dir].tuple.src.u.all != | ||
123 | ct->tuplehash[!dir].tuple.dst.u.all) | ||
124 | return nf_xfrm_me_harder(skb, AF_INET) == 0 ? | ||
125 | ret : NF_DROP; | ||
126 | } | ||
127 | #endif | ||
128 | return ret; | ||
129 | } | 58 | } |
130 | 59 | ||
131 | static unsigned int nf_nat_output(const struct nf_hook_ops *ops, | 60 | static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, |
132 | struct sk_buff *skb, | 61 | struct sk_buff *skb, |
133 | const struct net_device *in, | 62 | const struct net_device *in, |
134 | const struct net_device *out, | 63 | const struct net_device *out, |
135 | int (*okfn)(struct sk_buff *)) | 64 | int (*okfn)(struct sk_buff *)) |
136 | { | 65 | { |
137 | enum ip_conntrack_info ctinfo; | 66 | return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain); |
138 | const struct nf_conn *ct; | 67 | } |
139 | unsigned int ret; | ||
140 | |||
141 | ret = nf_nat_fn(ops, skb, in, out, okfn); | ||
142 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
143 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
144 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
145 | 68 | ||
146 | if (ct->tuplehash[dir].tuple.dst.u3.ip != | 69 | static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, |
147 | ct->tuplehash[!dir].tuple.src.u3.ip) { | 70 | struct sk_buff *skb, |
148 | if (ip_route_me_harder(skb, RTN_UNSPEC)) | 71 | const struct net_device *in, |
149 | ret = NF_DROP; | 72 | const struct net_device *out, |
150 | } | 73 | int (*okfn)(struct sk_buff *)) |
151 | #ifdef CONFIG_XFRM | 74 | { |
152 | else if (ct->tuplehash[dir].tuple.dst.u.all != | 75 | return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain); |
153 | ct->tuplehash[!dir].tuple.src.u.all) | ||
154 | if (nf_xfrm_me_harder(skb, AF_INET)) | ||
155 | ret = NF_DROP; | ||
156 | #endif | ||
157 | } | ||
158 | return ret; | ||
159 | } | 76 | } |
160 | 77 | ||
161 | static const struct nf_chain_type nft_chain_nat_ipv4 = { | 78 | static const struct nf_chain_type nft_chain_nat_ipv4 = { |
@@ -168,10 +85,10 @@ static const struct nf_chain_type nft_chain_nat_ipv4 = { | |||
168 | (1 << NF_INET_LOCAL_OUT) | | 85 | (1 << NF_INET_LOCAL_OUT) | |
169 | (1 << NF_INET_LOCAL_IN), | 86 | (1 << NF_INET_LOCAL_IN), |
170 | .hooks = { | 87 | .hooks = { |
171 | [NF_INET_PRE_ROUTING] = nf_nat_prerouting, | 88 | [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in, |
172 | [NF_INET_POST_ROUTING] = nf_nat_postrouting, | 89 | [NF_INET_POST_ROUTING] = nft_nat_ipv4_out, |
173 | [NF_INET_LOCAL_OUT] = nf_nat_output, | 90 | [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, |
174 | [NF_INET_LOCAL_IN] = nf_nat_fn, | 91 | [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, |
175 | }, | 92 | }, |
176 | }; | 93 | }; |
177 | 94 | ||
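With the shared helpers in place, the nftables NAT chain type differs from the iptables nat table only in its do_chain step: nft_nat_do_chain just builds the pktinfo and runs nft_do_chain. The chain type itself is registered from the module init, which sits outside this hunk; only as a sketch of the usual registration pattern:

    static int __init example_chain_init(void)
    {
            return nft_register_chain_type(&nft_chain_nat_ipv4);
    }

    static void __exit example_chain_exit(void)
    {
            nft_unregister_chain_type(&nft_chain_nat_ipv4);
    }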
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c new file mode 100644 index 000000000000..1c636d6b5b50 --- /dev/null +++ b/net/ipv4/netfilter/nft_masq_ipv4.c | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | */ | ||
8 | |||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/netlink.h> | ||
13 | #include <linux/netfilter.h> | ||
14 | #include <linux/netfilter/nf_tables.h> | ||
15 | #include <net/netfilter/nf_tables.h> | ||
16 | #include <net/netfilter/nft_masq.h> | ||
17 | #include <net/netfilter/ipv4/nf_nat_masquerade.h> | ||
18 | |||
19 | static void nft_masq_ipv4_eval(const struct nft_expr *expr, | ||
20 | struct nft_data data[NFT_REG_MAX + 1], | ||
21 | const struct nft_pktinfo *pkt) | ||
22 | { | ||
23 | struct nft_masq *priv = nft_expr_priv(expr); | ||
24 | struct nf_nat_range range; | ||
25 | unsigned int verdict; | ||
26 | |||
27 | range.flags = priv->flags; | ||
28 | |||
29 | verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, | ||
30 | &range, pkt->out); | ||
31 | |||
32 | data[NFT_REG_VERDICT].verdict = verdict; | ||
33 | } | ||
34 | |||
35 | static struct nft_expr_type nft_masq_ipv4_type; | ||
36 | static const struct nft_expr_ops nft_masq_ipv4_ops = { | ||
37 | .type = &nft_masq_ipv4_type, | ||
38 | .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), | ||
39 | .eval = nft_masq_ipv4_eval, | ||
40 | .init = nft_masq_init, | ||
41 | .dump = nft_masq_dump, | ||
42 | }; | ||
43 | |||
44 | static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { | ||
45 | .family = NFPROTO_IPV4, | ||
46 | .name = "masq", | ||
47 | .ops = &nft_masq_ipv4_ops, | ||
48 | .policy = nft_masq_policy, | ||
49 | .maxattr = NFTA_MASQ_MAX, | ||
50 | .owner = THIS_MODULE, | ||
51 | }; | ||
52 | |||
53 | static int __init nft_masq_ipv4_module_init(void) | ||
54 | { | ||
55 | int ret; | ||
56 | |||
57 | ret = nft_register_expr(&nft_masq_ipv4_type); | ||
58 | if (ret < 0) | ||
59 | return ret; | ||
60 | |||
61 | nf_nat_masquerade_ipv4_register_notifier(); | ||
62 | |||
63 | return ret; | ||
64 | } | ||
65 | |||
66 | static void __exit nft_masq_ipv4_module_exit(void) | ||
67 | { | ||
68 | nft_unregister_expr(&nft_masq_ipv4_type); | ||
69 | nf_nat_masquerade_ipv4_unregister_notifier(); | ||
70 | } | ||
71 | |||
72 | module_init(nft_masq_ipv4_module_init); | ||
73 | module_exit(nft_masq_ipv4_module_exit); | ||
74 | |||
75 | MODULE_LICENSE("GPL"); | ||
76 | MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); | ||
77 | MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); | ||
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e79718a382f2..ed33299c56d1 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/netfilter.h> | 16 | #include <linux/netfilter.h> |
17 | #include <linux/netfilter/nf_tables.h> | 17 | #include <linux/netfilter/nf_tables.h> |
18 | #include <net/netfilter/nf_tables.h> | 18 | #include <net/netfilter/nf_tables.h> |
19 | #include <net/icmp.h> | ||
20 | #include <net/netfilter/ipv4/nf_reject.h> | 19 | #include <net/netfilter/ipv4/nf_reject.h> |
21 | #include <net/netfilter/nft_reject.h> | 20 | #include <net/netfilter/nft_reject.h> |
22 | 21 | ||
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index a3c59a077a5f..57f7c9804139 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c | |||
@@ -311,7 +311,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, | |||
311 | if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) | 311 | if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) |
312 | chk_addr_ret = RTN_LOCAL; | 312 | chk_addr_ret = RTN_LOCAL; |
313 | 313 | ||
314 | if ((sysctl_ip_nonlocal_bind == 0 && | 314 | if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && |
315 | isk->freebind == 0 && isk->transparent == 0 && | 315 | isk->freebind == 0 && isk->transparent == 0 && |
316 | chk_addr_ret != RTN_LOCAL) || | 316 | chk_addr_ret != RTN_LOCAL) || |
317 | chk_addr_ret == RTN_MULTICAST || | 317 | chk_addr_ret == RTN_MULTICAST || |
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 46d6a1c923a8..4b7c0ec65251 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
@@ -30,6 +30,7 @@ | |||
30 | 30 | ||
31 | const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; | 31 | const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; |
32 | const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; | 32 | const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; |
33 | EXPORT_SYMBOL(inet_offloads); | ||
33 | 34 | ||
34 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) | 35 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) |
35 | { | 36 | { |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cbadb942c332..793c0bb8c4fd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -596,12 +596,12 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) | |||
596 | 596 | ||
597 | static inline u32 fnhe_hashfun(__be32 daddr) | 597 | static inline u32 fnhe_hashfun(__be32 daddr) |
598 | { | 598 | { |
599 | static u32 fnhe_hashrnd __read_mostly; | ||
599 | u32 hval; | 600 | u32 hval; |
600 | 601 | ||
601 | hval = (__force u32) daddr; | 602 | net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); |
602 | hval ^= (hval >> 11) ^ (hval >> 22); | 603 | hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); |
603 | 604 | return hash_32(hval, FNHE_HASH_SHIFT); | |
604 | return hval & (FNHE_HASH_SIZE - 1); | ||
605 | } | 605 | } |
606 | 606 | ||
607 | static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) | 607 | static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) |
@@ -628,12 +628,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, | |||
628 | 628 | ||
629 | spin_lock_bh(&fnhe_lock); | 629 | spin_lock_bh(&fnhe_lock); |
630 | 630 | ||
631 | hash = nh->nh_exceptions; | 631 | hash = rcu_dereference(nh->nh_exceptions); |
632 | if (!hash) { | 632 | if (!hash) { |
633 | hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); | 633 | hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); |
634 | if (!hash) | 634 | if (!hash) |
635 | goto out_unlock; | 635 | goto out_unlock; |
636 | nh->nh_exceptions = hash; | 636 | rcu_assign_pointer(nh->nh_exceptions, hash); |
637 | } | 637 | } |
638 | 638 | ||
639 | hash += hval; | 639 | hash += hval; |
@@ -1242,7 +1242,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) | |||
1242 | 1242 | ||
1243 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) | 1243 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) |
1244 | { | 1244 | { |
1245 | struct fnhe_hash_bucket *hash = nh->nh_exceptions; | 1245 | struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); |
1246 | struct fib_nh_exception *fnhe; | 1246 | struct fib_nh_exception *fnhe; |
1247 | u32 hval; | 1247 | u32 hval; |
1248 | 1248 | ||
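Two hardening changes in route.c: fnhe_hashfun() now mixes the destination with a per-boot random key (net_get_random_once plus jhash_1word) instead of a fixed shift-and-xor fold, so remote peers cannot steer next-hop exceptions into a single bucket, and nh_exceptions is published with rcu_assign_pointer() and read with rcu_dereference(). Assuming FNHE_HASH_SIZE is 1 << FNHE_HASH_SHIFT, the keyed-hash pattern boils down to this sketch:

    /* Sketch only: lazily initialise a random key, then use a keyed hash so
     * bucket placement is not predictable from the destination alone.
     */
    static u32 example_keyed_hash(__be32 daddr)
    {
            static u32 key __read_mostly;

            net_get_random_once(&key, sizeof(key));
            return hash_32(jhash_1word((__force u32)daddr, key), FNHE_HASH_SHIFT);
    }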
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c0c75688896e..0431a8f3c8f4 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -25,7 +25,7 @@ | |||
25 | 25 | ||
26 | extern int sysctl_tcp_syncookies; | 26 | extern int sysctl_tcp_syncookies; |
27 | 27 | ||
28 | static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; | 28 | static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; |
29 | 29 | ||
30 | #define COOKIEBITS 24 /* Upper bits store count */ | 30 | #define COOKIEBITS 24 /* Upper bits store count */ |
31 | #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) | 31 | #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a9fde0eef77c..b3c53c8b331e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = { | |||
286 | .extra2 = &ip_ttl_max, | 286 | .extra2 = &ip_ttl_max, |
287 | }, | 287 | }, |
288 | { | 288 | { |
289 | .procname = "ip_nonlocal_bind", | ||
290 | .data = &sysctl_ip_nonlocal_bind, | ||
291 | .maxlen = sizeof(int), | ||
292 | .mode = 0644, | ||
293 | .proc_handler = proc_dointvec | ||
294 | }, | ||
295 | { | ||
296 | .procname = "tcp_syn_retries", | 289 | .procname = "tcp_syn_retries", |
297 | .data = &sysctl_tcp_syn_retries, | 290 | .data = &sysctl_tcp_syn_retries, |
298 | .maxlen = sizeof(int), | 291 | .maxlen = sizeof(int), |
@@ -450,6 +443,16 @@ static struct ctl_table ipv4_table[] = { | |||
450 | .mode = 0644, | 443 | .mode = 0644, |
451 | .proc_handler = proc_dointvec | 444 | .proc_handler = proc_dointvec |
452 | }, | 445 | }, |
446 | #ifdef CONFIG_IP_MULTICAST | ||
447 | { | ||
448 | .procname = "igmp_qrv", | ||
449 | .data = &sysctl_igmp_qrv, | ||
450 | .maxlen = sizeof(int), | ||
451 | .mode = 0644, | ||
452 | .proc_handler = proc_dointvec_minmax, | ||
453 | .extra1 = &one | ||
454 | }, | ||
455 | #endif | ||
453 | { | 456 | { |
454 | .procname = "inet_peer_threshold", | 457 | .procname = "inet_peer_threshold", |
455 | .data = &inet_peer_threshold, | 458 | .data = &inet_peer_threshold, |
@@ -719,6 +722,22 @@ static struct ctl_table ipv4_table[] = { | |||
719 | .extra2 = &one, | 722 | .extra2 = &one, |
720 | }, | 723 | }, |
721 | { | 724 | { |
725 | .procname = "icmp_msgs_per_sec", | ||
726 | .data = &sysctl_icmp_msgs_per_sec, | ||
727 | .maxlen = sizeof(int), | ||
728 | .mode = 0644, | ||
729 | .proc_handler = proc_dointvec_minmax, | ||
730 | .extra1 = &zero, | ||
731 | }, | ||
732 | { | ||
733 | .procname = "icmp_msgs_burst", | ||
734 | .data = &sysctl_icmp_msgs_burst, | ||
735 | .maxlen = sizeof(int), | ||
736 | .mode = 0644, | ||
737 | .proc_handler = proc_dointvec_minmax, | ||
738 | .extra1 = &zero, | ||
739 | }, | ||
740 | { | ||
722 | .procname = "udp_mem", | 741 | .procname = "udp_mem", |
723 | .data = &sysctl_udp_mem, | 742 | .data = &sysctl_udp_mem, |
724 | .maxlen = sizeof(sysctl_udp_mem), | 743 | .maxlen = sizeof(sysctl_udp_mem), |
@@ -830,6 +849,13 @@ static struct ctl_table ipv4_net_table[] = { | |||
830 | .proc_handler = proc_dointvec, | 849 | .proc_handler = proc_dointvec, |
831 | }, | 850 | }, |
832 | { | 851 | { |
852 | .procname = "ip_nonlocal_bind", | ||
853 | .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, | ||
854 | .maxlen = sizeof(int), | ||
855 | .mode = 0644, | ||
856 | .proc_handler = proc_dointvec | ||
857 | }, | ||
858 | { | ||
833 | .procname = "fwmark_reflect", | 859 | .procname = "fwmark_reflect", |
834 | .data = &init_net.ipv4.sysctl_fwmark_reflect, | 860 | .data = &init_net.ipv4.sysctl_fwmark_reflect, |
835 | .maxlen = sizeof(int), | 861 | .maxlen = sizeof(int), |
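ip_nonlocal_bind moves from the global ipv4_table to the per-namespace ipv4_net_table, so each netns can decide independently whether sockets may bind to non-local addresses; readers such as the ping.c hunk above now consult net->ipv4.sysctl_ip_nonlocal_bind. An illustrative-only reader after the conversion:

    static bool example_nonlocal_bind_allowed(const struct sock *sk)
    {
            struct net *net = sock_net(sk);

            /* per-netns sysctl replaces the old global sysctl_ip_nonlocal_bind */
            return net->ipv4.sysctl_ip_nonlocal_bind ||
                   inet_sk(sk)->freebind || inet_sk(sk)->transparent;
    }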
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8ee43ae90396..461003d258ba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -404,7 +404,7 @@ void tcp_init_sock(struct sock *sk) | |||
404 | 404 | ||
405 | tp->reordering = sysctl_tcp_reordering; | 405 | tp->reordering = sysctl_tcp_reordering; |
406 | tcp_enable_early_retrans(tp); | 406 | tcp_enable_early_retrans(tp); |
407 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; | 407 | tcp_assign_congestion_control(sk); |
408 | 408 | ||
409 | tp->tsoffset = 0; | 409 | tp->tsoffset = 0; |
410 | 410 | ||
@@ -608,7 +608,7 @@ static inline bool forced_push(const struct tcp_sock *tp) | |||
608 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); | 608 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); |
609 | } | 609 | } |
610 | 610 | ||
611 | static inline void skb_entail(struct sock *sk, struct sk_buff *skb) | 611 | static void skb_entail(struct sock *sk, struct sk_buff *skb) |
612 | { | 612 | { |
613 | struct tcp_sock *tp = tcp_sk(sk); | 613 | struct tcp_sock *tp = tcp_sk(sk); |
614 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | 614 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
@@ -617,7 +617,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) | |||
617 | tcb->seq = tcb->end_seq = tp->write_seq; | 617 | tcb->seq = tcb->end_seq = tp->write_seq; |
618 | tcb->tcp_flags = TCPHDR_ACK; | 618 | tcb->tcp_flags = TCPHDR_ACK; |
619 | tcb->sacked = 0; | 619 | tcb->sacked = 0; |
620 | skb_header_release(skb); | 620 | __skb_header_release(skb); |
621 | tcp_add_write_queue_tail(sk, skb); | 621 | tcp_add_write_queue_tail(sk, skb); |
622 | sk->sk_wmem_queued += skb->truesize; | 622 | sk->sk_wmem_queued += skb->truesize; |
623 | sk_mem_charge(sk, skb->truesize); | 623 | sk_mem_charge(sk, skb->truesize); |
@@ -962,7 +962,7 @@ new_segment: | |||
962 | skb->ip_summed = CHECKSUM_PARTIAL; | 962 | skb->ip_summed = CHECKSUM_PARTIAL; |
963 | tp->write_seq += copy; | 963 | tp->write_seq += copy; |
964 | TCP_SKB_CB(skb)->end_seq += copy; | 964 | TCP_SKB_CB(skb)->end_seq += copy; |
965 | skb_shinfo(skb)->gso_segs = 0; | 965 | tcp_skb_pcount_set(skb, 0); |
966 | 966 | ||
967 | if (!copied) | 967 | if (!copied) |
968 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; | 968 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; |
@@ -1260,7 +1260,7 @@ new_segment: | |||
1260 | 1260 | ||
1261 | tp->write_seq += copy; | 1261 | tp->write_seq += copy; |
1262 | TCP_SKB_CB(skb)->end_seq += copy; | 1262 | TCP_SKB_CB(skb)->end_seq += copy; |
1263 | skb_shinfo(skb)->gso_segs = 0; | 1263 | tcp_skb_pcount_set(skb, 0); |
1264 | 1264 | ||
1265 | from += copy; | 1265 | from += copy; |
1266 | copied += copy; | 1266 | copied += copy; |
@@ -1476,9 +1476,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) | |||
1476 | 1476 | ||
1477 | while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { | 1477 | while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { |
1478 | offset = seq - TCP_SKB_CB(skb)->seq; | 1478 | offset = seq - TCP_SKB_CB(skb)->seq; |
1479 | if (tcp_hdr(skb)->syn) | 1479 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) |
1480 | offset--; | 1480 | offset--; |
1481 | if (offset < skb->len || tcp_hdr(skb)->fin) { | 1481 | if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { |
1482 | *off = offset; | 1482 | *off = offset; |
1483 | return skb; | 1483 | return skb; |
1484 | } | 1484 | } |
@@ -1551,7 +1551,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
1551 | if (offset + 1 != skb->len) | 1551 | if (offset + 1 != skb->len) |
1552 | continue; | 1552 | continue; |
1553 | } | 1553 | } |
1554 | if (tcp_hdr(skb)->fin) { | 1554 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { |
1555 | sk_eat_skb(sk, skb); | 1555 | sk_eat_skb(sk, skb); |
1556 | ++seq; | 1556 | ++seq; |
1557 | break; | 1557 | break; |
@@ -1665,11 +1665,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1665 | break; | 1665 | break; |
1666 | 1666 | ||
1667 | offset = *seq - TCP_SKB_CB(skb)->seq; | 1667 | offset = *seq - TCP_SKB_CB(skb)->seq; |
1668 | if (tcp_hdr(skb)->syn) | 1668 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) |
1669 | offset--; | 1669 | offset--; |
1670 | if (offset < skb->len) | 1670 | if (offset < skb->len) |
1671 | goto found_ok_skb; | 1671 | goto found_ok_skb; |
1672 | if (tcp_hdr(skb)->fin) | 1672 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
1673 | goto found_fin_ok; | 1673 | goto found_fin_ok; |
1674 | WARN(!(flags & MSG_PEEK), | 1674 | WARN(!(flags & MSG_PEEK), |
1675 | "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", | 1675 | "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", |
@@ -1857,7 +1857,7 @@ skip_copy: | |||
1857 | if (used + offset < skb->len) | 1857 | if (used + offset < skb->len) |
1858 | continue; | 1858 | continue; |
1859 | 1859 | ||
1860 | if (tcp_hdr(skb)->fin) | 1860 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
1861 | goto found_fin_ok; | 1861 | goto found_fin_ok; |
1862 | if (!(flags & MSG_PEEK)) | 1862 | if (!(flags & MSG_PEEK)) |
1863 | sk_eat_skb(sk, skb); | 1863 | sk_eat_skb(sk, skb); |
@@ -2044,8 +2044,10 @@ void tcp_close(struct sock *sk, long timeout) | |||
2044 | * reader process may not have drained the data yet! | 2044 | * reader process may not have drained the data yet! |
2045 | */ | 2045 | */ |
2046 | while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { | 2046 | while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { |
2047 | u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - | 2047 | u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; |
2048 | tcp_hdr(skb)->fin; | 2048 | |
2049 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) | ||
2050 | len--; | ||
2049 | data_was_unread += len; | 2051 | data_was_unread += len; |
2050 | __kfree_skb(skb); | 2052 | __kfree_skb(skb); |
2051 | } | 2053 | } |
@@ -2572,7 +2574,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2572 | break; | 2574 | break; |
2573 | #endif | 2575 | #endif |
2574 | case TCP_USER_TIMEOUT: | 2576 | case TCP_USER_TIMEOUT: |
2575 | /* Cap the max timeout in ms TCP will retry/retrans | 2577 | /* Cap the max time in ms TCP will retry or probe the window |
2576 | * before giving up and aborting (ETIMEDOUT) a connection. | 2578 | * before giving up and aborting (ETIMEDOUT) a connection. |
2577 | */ | 2579 | */ |
2578 | if (val < 0) | 2580 | if (val < 0) |
@@ -3051,7 +3053,7 @@ static int __init set_thash_entries(char *str) | |||
3051 | } | 3053 | } |
3052 | __setup("thash_entries=", set_thash_entries); | 3054 | __setup("thash_entries=", set_thash_entries); |
3053 | 3055 | ||
3054 | static void tcp_init_mem(void) | 3056 | static void __init tcp_init_mem(void) |
3055 | { | 3057 | { |
3056 | unsigned long limit = nr_free_buffer_pages() / 8; | 3058 | unsigned long limit = nr_free_buffer_pages() / 8; |
3057 | limit = max(limit, 128UL); | 3059 | limit = max(limit, 128UL); |
@@ -3137,8 +3139,6 @@ void __init tcp_init(void) | |||
3137 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); | 3139 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); |
3138 | 3140 | ||
3139 | tcp_metrics_init(); | 3141 | tcp_metrics_init(); |
3140 | 3142 | BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); | |
3141 | tcp_register_congestion_control(&tcp_reno); | ||
3142 | |||
3143 | tcp_tasklet_init(); | 3143 | tcp_tasklet_init(); |
3144 | } | 3144 | } |
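Throughout tcp.c, receive-path checks for SYN/FIN stop dereferencing tcp_hdr(skb) and instead test the flags that the input path recorded in TCP_SKB_CB(skb)->tcp_flags, presumably so queued skbs no longer depend on the TCP header still being accessible when they are consumed. The pattern reduces to something like this illustrative helper:

    /* For skbs sitting on the receive queue, the flags captured in the
     * control block are treated as the authoritative copy.
     */
    static inline bool example_skb_has_fin(const struct sk_buff *skb)
    {
            return TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
    }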
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index d5de69bc04f5..bb395d46a389 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <net/tcp.h> | 18 | #include <net/tcp.h> |
19 | 19 | ||
20 | |||
21 | #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation | 20 | #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation |
22 | * max_cwnd = snd_cwnd * beta | 21 | * max_cwnd = snd_cwnd * beta |
23 | */ | 22 | */ |
@@ -46,11 +45,10 @@ MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); | |||
46 | module_param(smooth_part, int, 0644); | 45 | module_param(smooth_part, int, 0644); |
47 | MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); | 46 | MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); |
48 | 47 | ||
49 | |||
50 | /* BIC TCP Parameters */ | 48 | /* BIC TCP Parameters */ |
51 | struct bictcp { | 49 | struct bictcp { |
52 | u32 cnt; /* increase cwnd by 1 after ACKs */ | 50 | u32 cnt; /* increase cwnd by 1 after ACKs */ |
53 | u32 last_max_cwnd; /* last maximum snd_cwnd */ | 51 | u32 last_max_cwnd; /* last maximum snd_cwnd */ |
54 | u32 loss_cwnd; /* congestion window at last loss */ | 52 | u32 loss_cwnd; /* congestion window at last loss */ |
55 | u32 last_cwnd; /* the last snd_cwnd */ | 53 | u32 last_cwnd; /* the last snd_cwnd */ |
56 | u32 last_time; /* time when updated last_cwnd */ | 54 | u32 last_time; /* time when updated last_cwnd */ |
@@ -103,7 +101,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
103 | 101 | ||
104 | /* binary increase */ | 102 | /* binary increase */ |
105 | if (cwnd < ca->last_max_cwnd) { | 103 | if (cwnd < ca->last_max_cwnd) { |
106 | __u32 dist = (ca->last_max_cwnd - cwnd) | 104 | __u32 dist = (ca->last_max_cwnd - cwnd) |
107 | / BICTCP_B; | 105 | / BICTCP_B; |
108 | 106 | ||
109 | if (dist > max_increment) | 107 | if (dist > max_increment) |
@@ -154,7 +152,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
154 | bictcp_update(ca, tp->snd_cwnd); | 152 | bictcp_update(ca, tp->snd_cwnd); |
155 | tcp_cong_avoid_ai(tp, ca->cnt); | 153 | tcp_cong_avoid_ai(tp, ca->cnt); |
156 | } | 154 | } |
157 | |||
158 | } | 155 | } |
159 | 156 | ||
160 | /* | 157 | /* |
@@ -177,7 +174,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) | |||
177 | 174 | ||
178 | ca->loss_cwnd = tp->snd_cwnd; | 175 | ca->loss_cwnd = tp->snd_cwnd; |
179 | 176 | ||
180 | |||
181 | if (tp->snd_cwnd <= low_window) | 177 | if (tp->snd_cwnd <= low_window) |
182 | return max(tp->snd_cwnd >> 1U, 2U); | 178 | return max(tp->snd_cwnd >> 1U, 2U); |
183 | else | 179 | else |
@@ -188,6 +184,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) | |||
188 | { | 184 | { |
189 | const struct tcp_sock *tp = tcp_sk(sk); | 185 | const struct tcp_sock *tp = tcp_sk(sk); |
190 | const struct bictcp *ca = inet_csk_ca(sk); | 186 | const struct bictcp *ca = inet_csk_ca(sk); |
187 | |||
191 | return max(tp->snd_cwnd, ca->loss_cwnd); | 188 | return max(tp->snd_cwnd, ca->loss_cwnd); |
192 | } | 189 | } |
193 | 190 | ||
@@ -206,12 +203,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) | |||
206 | 203 | ||
207 | if (icsk->icsk_ca_state == TCP_CA_Open) { | 204 | if (icsk->icsk_ca_state == TCP_CA_Open) { |
208 | struct bictcp *ca = inet_csk_ca(sk); | 205 | struct bictcp *ca = inet_csk_ca(sk); |
206 | |||
209 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | 207 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; |
210 | ca->delayed_ack += cnt; | 208 | ca->delayed_ack += cnt; |
211 | } | 209 | } |
212 | } | 210 | } |
213 | 211 | ||
214 | |||
215 | static struct tcp_congestion_ops bictcp __read_mostly = { | 212 | static struct tcp_congestion_ops bictcp __read_mostly = { |
216 | .init = bictcp_init, | 213 | .init = bictcp_init, |
217 | .ssthresh = bictcp_recalc_ssthresh, | 214 | .ssthresh = bictcp_recalc_ssthresh, |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 7b09d8b49fa5..b1c5970d47a1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | |||
74 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | 74 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); |
75 | 75 | ||
76 | /* Assign choice of congestion control. */ | 76 | /* Assign choice of congestion control. */ |
77 | void tcp_init_congestion_control(struct sock *sk) | 77 | void tcp_assign_congestion_control(struct sock *sk) |
78 | { | 78 | { |
79 | struct inet_connection_sock *icsk = inet_csk(sk); | 79 | struct inet_connection_sock *icsk = inet_csk(sk); |
80 | struct tcp_congestion_ops *ca; | 80 | struct tcp_congestion_ops *ca; |
81 | 81 | ||
82 | /* if no choice made yet assign the current value set as default */ | 82 | rcu_read_lock(); |
83 | if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { | 83 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { |
84 | rcu_read_lock(); | 84 | if (likely(try_module_get(ca->owner))) { |
85 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | 85 | icsk->icsk_ca_ops = ca; |
86 | if (try_module_get(ca->owner)) { | 86 | goto out; |
87 | icsk->icsk_ca_ops = ca; | ||
88 | break; | ||
89 | } | ||
90 | |||
91 | /* fallback to next available */ | ||
92 | } | 87 | } |
93 | rcu_read_unlock(); | 88 | /* Fall back to the next available one. Reno,
89 | * the last entry in this list, is the guaranteed fallback. | ||
90 | */ | ||
94 | } | 91 | } |
92 | out: | ||
93 | rcu_read_unlock(); | ||
94 | |||
95 | /* Clear out private data before diag gets it, since | ||
96 | * the ca has not been initialized yet. | ||
97 | */ | ||
98 | if (ca->get_info) | ||
99 | memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); | ||
100 | } | ||
101 | |||
102 | void tcp_init_congestion_control(struct sock *sk) | ||
103 | { | ||
104 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
95 | 105 | ||
96 | if (icsk->icsk_ca_ops->init) | 106 | if (icsk->icsk_ca_ops->init) |
97 | icsk->icsk_ca_ops->init(sk); | 107 | icsk->icsk_ca_ops->init(sk); |
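The hunk above splits congestion-control setup into two phases: tcp_assign_congestion_control() picks the ops from the RCU list (Reno, the last entry, is the guaranteed fallback) and zeroes icsk_ca_priv when the ops provide get_info, so diag never reads stale private state, while tcp_init_congestion_control() only invokes the ops' init hook. A minimal sketch of the intended ordering; the actual call sites are outside this hunk, so their placement here is an assumption:

    /* Sketch only: where the two calls are expected to sit in the socket
     * lifecycle; the real callers are not part of this hunk.
     */
    static void ca_setup_sketch(struct sock *sk)
    {
    	/* At socket creation: choose ops and clear icsk_ca_priv so that
    	 * diag cannot observe uninitialized private data.
    	 */
    	tcp_assign_congestion_control(sk);

    	/* Later, once the connection will actually use the CA: */
    	tcp_init_congestion_control(sk);
    }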
@@ -142,7 +152,6 @@ static int __init tcp_congestion_default(void) | |||
142 | } | 152 | } |
143 | late_initcall(tcp_congestion_default); | 153 | late_initcall(tcp_congestion_default); |
144 | 154 | ||
145 | |||
146 | /* Build string with list of available congestion control values */ | 155 | /* Build string with list of available congestion control values */ |
147 | void tcp_get_available_congestion_control(char *buf, size_t maxlen) | 156 | void tcp_get_available_congestion_control(char *buf, size_t maxlen) |
148 | { | 157 | { |
@@ -154,7 +163,6 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) | |||
154 | offs += snprintf(buf + offs, maxlen - offs, | 163 | offs += snprintf(buf + offs, maxlen - offs, |
155 | "%s%s", | 164 | "%s%s", |
156 | offs == 0 ? "" : " ", ca->name); | 165 | offs == 0 ? "" : " ", ca->name); |
157 | |||
158 | } | 166 | } |
159 | rcu_read_unlock(); | 167 | rcu_read_unlock(); |
160 | } | 168 | } |
@@ -186,7 +194,6 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) | |||
186 | offs += snprintf(buf + offs, maxlen - offs, | 194 | offs += snprintf(buf + offs, maxlen - offs, |
187 | "%s%s", | 195 | "%s%s", |
188 | offs == 0 ? "" : " ", ca->name); | 196 | offs == 0 ? "" : " ", ca->name); |
189 | |||
190 | } | 197 | } |
191 | rcu_read_unlock(); | 198 | rcu_read_unlock(); |
192 | } | 199 | } |
@@ -230,7 +237,6 @@ out: | |||
230 | return ret; | 237 | return ret; |
231 | } | 238 | } |
232 | 239 | ||
233 | |||
234 | /* Change congestion control for socket */ | 240 | /* Change congestion control for socket */ |
235 | int tcp_set_congestion_control(struct sock *sk, const char *name) | 241 | int tcp_set_congestion_control(struct sock *sk, const char *name) |
236 | { | 242 | { |
@@ -285,15 +291,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) | |||
285 | * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and | 291 | * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and |
286 | * returns the leftover acks to adjust cwnd in congestion avoidance mode. | 292 | * returns the leftover acks to adjust cwnd in congestion avoidance mode. |
287 | */ | 293 | */ |
288 | int tcp_slow_start(struct tcp_sock *tp, u32 acked) | 294 | void tcp_slow_start(struct tcp_sock *tp, u32 acked) |
289 | { | 295 | { |
290 | u32 cwnd = tp->snd_cwnd + acked; | 296 | u32 cwnd = tp->snd_cwnd + acked; |
291 | 297 | ||
292 | if (cwnd > tp->snd_ssthresh) | 298 | if (cwnd > tp->snd_ssthresh) |
293 | cwnd = tp->snd_ssthresh + 1; | 299 | cwnd = tp->snd_ssthresh + 1; |
294 | acked -= cwnd - tp->snd_cwnd; | ||
295 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); | 300 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); |
296 | return acked; | ||
297 | } | 301 | } |
298 | EXPORT_SYMBOL_GPL(tcp_slow_start); | 302 | EXPORT_SYMBOL_GPL(tcp_slow_start); |
299 | 303 | ||
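With the change above, tcp_slow_start() no longer returns the leftover ACK count; it simply grows cwnd by the number of newly acked segments and caps the result one above ssthresh (and at snd_cwnd_clamp). A small stand-alone check of that arithmetic, with the tcp_sock reduced to the three fields the function touches (a sketch, not the kernel function itself):

    #include <stdio.h>

    struct tp_sketch {
    	unsigned int snd_cwnd;
    	unsigned int snd_ssthresh;
    	unsigned int snd_cwnd_clamp;
    };

    /* Mirrors the capped growth shown in the hunk above. */
    static void slow_start_sketch(struct tp_sketch *tp, unsigned int acked)
    {
    	unsigned int cwnd = tp->snd_cwnd + acked;

    	if (cwnd > tp->snd_ssthresh)
    		cwnd = tp->snd_ssthresh + 1;
    	tp->snd_cwnd = cwnd < tp->snd_cwnd_clamp ? cwnd : tp->snd_cwnd_clamp;
    }

    int main(void)
    {
    	struct tp_sketch tp = { 10, 12, 65535 };

    	slow_start_sketch(&tp, 5);		/* 10 + 5 = 15, capped at 12 + 1 */
    	printf("cwnd = %u\n", tp.snd_cwnd);	/* prints: cwnd = 13 */
    	return 0;
    }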
@@ -337,6 +341,7 @@ EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | |||
337 | u32 tcp_reno_ssthresh(struct sock *sk) | 341 | u32 tcp_reno_ssthresh(struct sock *sk) |
338 | { | 342 | { |
339 | const struct tcp_sock *tp = tcp_sk(sk); | 343 | const struct tcp_sock *tp = tcp_sk(sk); |
344 | |||
340 | return max(tp->snd_cwnd >> 1U, 2U); | 345 | return max(tp->snd_cwnd >> 1U, 2U); |
341 | } | 346 | } |
342 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | 347 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); |
@@ -348,15 +353,3 @@ struct tcp_congestion_ops tcp_reno = { | |||
348 | .ssthresh = tcp_reno_ssthresh, | 353 | .ssthresh = tcp_reno_ssthresh, |
349 | .cong_avoid = tcp_reno_cong_avoid, | 354 | .cong_avoid = tcp_reno_cong_avoid, |
350 | }; | 355 | }; |
351 | |||
352 | /* Initial congestion control used (until SYN) | ||
353 | * really reno under another name so we can tell difference | ||
354 | * during tcp_set_default_congestion_control | ||
355 | */ | ||
356 | struct tcp_congestion_ops tcp_init_congestion_ops = { | ||
357 | .name = "", | ||
358 | .owner = THIS_MODULE, | ||
359 | .ssthresh = tcp_reno_ssthresh, | ||
360 | .cong_avoid = tcp_reno_cong_avoid, | ||
361 | }; | ||
362 | EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); | ||
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9bd8a4828a9..20de0118c98e 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
@@ -82,12 +82,13 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse | |||
82 | /* BIC TCP Parameters */ | 82 | /* BIC TCP Parameters */ |
83 | struct bictcp { | 83 | struct bictcp { |
84 | u32 cnt; /* increase cwnd by 1 after ACKs */ | 84 | u32 cnt; /* increase cwnd by 1 after ACKs */ |
85 | u32 last_max_cwnd; /* last maximum snd_cwnd */ | 85 | u32 last_max_cwnd; /* last maximum snd_cwnd */ |
86 | u32 loss_cwnd; /* congestion window at last loss */ | 86 | u32 loss_cwnd; /* congestion window at last loss */ |
87 | u32 last_cwnd; /* the last snd_cwnd */ | 87 | u32 last_cwnd; /* the last snd_cwnd */ |
88 | u32 last_time; /* time when updated last_cwnd */ | 88 | u32 last_time; /* time when updated last_cwnd */ |
89 | u32 bic_origin_point;/* origin point of bic function */ | 89 | u32 bic_origin_point;/* origin point of bic function */ |
90 | u32 bic_K; /* time to origin point from the beginning of the current epoch */ | 90 | u32 bic_K; /* time to origin point |
91 | from the beginning of the current epoch */ | ||
91 | u32 delay_min; /* min delay (msec << 3) */ | 92 | u32 delay_min; /* min delay (msec << 3) */ |
92 | u32 epoch_start; /* beginning of an epoch */ | 93 | u32 epoch_start; /* beginning of an epoch */ |
93 | u32 ack_cnt; /* number of acks */ | 94 | u32 ack_cnt; /* number of acks */ |
@@ -219,7 +220,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
219 | ca->last_time = tcp_time_stamp; | 220 | ca->last_time = tcp_time_stamp; |
220 | 221 | ||
221 | if (ca->epoch_start == 0) { | 222 | if (ca->epoch_start == 0) { |
222 | ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ | 223 | ca->epoch_start = tcp_time_stamp; /* record beginning */ |
223 | ca->ack_cnt = 1; /* start counting */ | 224 | ca->ack_cnt = 1; /* start counting */ |
224 | ca->tcp_cwnd = cwnd; /* syn with cubic */ | 225 | ca->tcp_cwnd = cwnd; /* syn with cubic */ |
225 | 226 | ||
@@ -263,9 +264,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
263 | 264 | ||
264 | /* c/rtt * (t-K)^3 */ | 265 | /* c/rtt * (t-K)^3 */ |
265 | delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); | 266 | delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); |
266 | if (t < ca->bic_K) /* below origin*/ | 267 | if (t < ca->bic_K) /* below origin*/ |
267 | bic_target = ca->bic_origin_point - delta; | 268 | bic_target = ca->bic_origin_point - delta; |
268 | else /* above origin*/ | 269 | else /* above origin*/ |
269 | bic_target = ca->bic_origin_point + delta; | 270 | bic_target = ca->bic_origin_point + delta; |
270 | 271 | ||
271 | /* cubic function - calc bictcp_cnt*/ | 272 | /* cubic function - calc bictcp_cnt*/ |
@@ -285,13 +286,14 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
285 | /* TCP Friendly */ | 286 | /* TCP Friendly */ |
286 | if (tcp_friendliness) { | 287 | if (tcp_friendliness) { |
287 | u32 scale = beta_scale; | 288 | u32 scale = beta_scale; |
289 | |||
288 | delta = (cwnd * scale) >> 3; | 290 | delta = (cwnd * scale) >> 3; |
289 | while (ca->ack_cnt > delta) { /* update tcp cwnd */ | 291 | while (ca->ack_cnt > delta) { /* update tcp cwnd */ |
290 | ca->ack_cnt -= delta; | 292 | ca->ack_cnt -= delta; |
291 | ca->tcp_cwnd++; | 293 | ca->tcp_cwnd++; |
292 | } | 294 | } |
293 | 295 | ||
294 | if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ | 296 | if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ |
295 | delta = ca->tcp_cwnd - cwnd; | 297 | delta = ca->tcp_cwnd - cwnd; |
296 | max_cnt = cwnd / delta; | 298 | max_cnt = cwnd / delta; |
297 | if (ca->cnt > max_cnt) | 299 | if (ca->cnt > max_cnt) |
@@ -320,7 +322,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
320 | bictcp_update(ca, tp->snd_cwnd); | 322 | bictcp_update(ca, tp->snd_cwnd); |
321 | tcp_cong_avoid_ai(tp, ca->cnt); | 323 | tcp_cong_avoid_ai(tp, ca->cnt); |
322 | } | 324 | } |
323 | |||
324 | } | 325 | } |
325 | 326 | ||
326 | static u32 bictcp_recalc_ssthresh(struct sock *sk) | 327 | static u32 bictcp_recalc_ssthresh(struct sock *sk) |
@@ -452,7 +453,8 @@ static int __init cubictcp_register(void) | |||
452 | * based on SRTT of 100ms | 453 | * based on SRTT of 100ms |
453 | */ | 454 | */ |
454 | 455 | ||
455 | beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); | 456 | beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 |
457 | / (BICTCP_BETA_SCALE - beta); | ||
456 | 458 | ||
457 | cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ | 459 | cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ |
458 | 460 | ||
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c new file mode 100644 index 000000000000..b504371af742 --- /dev/null +++ b/net/ipv4/tcp_dctcp.c | |||
@@ -0,0 +1,344 @@ | |||
1 | /* DataCenter TCP (DCTCP) congestion control. | ||
2 | * | ||
3 | * http://simula.stanford.edu/~alizade/Site/DCTCP.html | ||
4 | * | ||
5 | * This is an implementation of DCTCP over Reno, an enhancement to the | ||
6 | * TCP congestion control algorithm designed for data centers. DCTCP | ||
7 | * leverages Explicit Congestion Notification (ECN) in the network to | ||
8 | * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet | ||
9 | * the following three data center transport requirements: | ||
10 | * | ||
11 | * - High burst tolerance (incast due to partition/aggregate) | ||
12 | * - Low latency (short flows, queries) | ||
13 | * - High throughput (continuous data updates, large file transfers) | ||
14 | * with commodity shallow buffered switches | ||
15 | * | ||
16 | * The algorithm is described in detail in the following two papers: | ||
17 | * | ||
18 | * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, | ||
19 | * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: | ||
20 | * "Data Center TCP (DCTCP)", Data Center Networks session | ||
21 | * Proc. ACM SIGCOMM, New Delhi, 2010. | ||
22 | * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
23 | * | ||
24 | * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: | ||
25 | * "Analysis of DCTCP: Stability, Convergence, and Fairness" | ||
26 | * Proc. ACM SIGMETRICS, San Jose, 2011. | ||
27 | * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf | ||
28 | * | ||
29 | * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. | ||
30 | * | ||
31 | * Authors: | ||
32 | * | ||
33 | * Daniel Borkmann <dborkman@redhat.com> | ||
34 | * Florian Westphal <fw@strlen.de> | ||
35 | * Glenn Judd <glenn.judd@morganstanley.com> | ||
36 | * | ||
37 | * This program is free software; you can redistribute it and/or modify | ||
38 | * it under the terms of the GNU General Public License as published by | ||
39 | * the Free Software Foundation; either version 2 of the License, or (at | ||
40 | * your option) any later version. | ||
41 | */ | ||
42 | |||
43 | #include <linux/module.h> | ||
44 | #include <linux/mm.h> | ||
45 | #include <net/tcp.h> | ||
46 | #include <linux/inet_diag.h> | ||
47 | |||
48 | #define DCTCP_MAX_ALPHA 1024U | ||
49 | |||
50 | struct dctcp { | ||
51 | u32 acked_bytes_ecn; | ||
52 | u32 acked_bytes_total; | ||
53 | u32 prior_snd_una; | ||
54 | u32 prior_rcv_nxt; | ||
55 | u32 dctcp_alpha; | ||
56 | u32 next_seq; | ||
57 | u32 ce_state; | ||
58 | u32 delayed_ack_reserved; | ||
59 | }; | ||
60 | |||
61 | static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ | ||
62 | module_param(dctcp_shift_g, uint, 0644); | ||
63 | MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); | ||
64 | |||
65 | static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; | ||
66 | module_param(dctcp_alpha_on_init, uint, 0644); | ||
67 | MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); | ||
68 | |||
69 | static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; | ||
70 | module_param(dctcp_clamp_alpha_on_loss, uint, 0644); | ||
71 | MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, | ||
72 | "parameter for clamping alpha on loss"); | ||
73 | |||
74 | static struct tcp_congestion_ops dctcp_reno; | ||
75 | |||
76 | static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) | ||
77 | { | ||
78 | ca->next_seq = tp->snd_nxt; | ||
79 | |||
80 | ca->acked_bytes_ecn = 0; | ||
81 | ca->acked_bytes_total = 0; | ||
82 | } | ||
83 | |||
84 | static void dctcp_init(struct sock *sk) | ||
85 | { | ||
86 | const struct tcp_sock *tp = tcp_sk(sk); | ||
87 | |||
88 | if ((tp->ecn_flags & TCP_ECN_OK) || | ||
89 | (sk->sk_state == TCP_LISTEN || | ||
90 | sk->sk_state == TCP_CLOSE)) { | ||
91 | struct dctcp *ca = inet_csk_ca(sk); | ||
92 | |||
93 | ca->prior_snd_una = tp->snd_una; | ||
94 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
95 | |||
96 | ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); | ||
97 | |||
98 | ca->delayed_ack_reserved = 0; | ||
99 | ca->ce_state = 0; | ||
100 | |||
101 | dctcp_reset(tp, ca); | ||
102 | return; | ||
103 | } | ||
104 | |||
105 | /* No ECN support? Fall back to Reno. Also need to clear | ||
106 | * ECT from sk since it is set during 3WHS for DCTCP. | ||
107 | */ | ||
108 | inet_csk(sk)->icsk_ca_ops = &dctcp_reno; | ||
109 | INET_ECN_dontxmit(sk); | ||
110 | } | ||
111 | |||
112 | static u32 dctcp_ssthresh(struct sock *sk) | ||
113 | { | ||
114 | const struct dctcp *ca = inet_csk_ca(sk); | ||
115 | struct tcp_sock *tp = tcp_sk(sk); | ||
116 | |||
117 | return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); | ||
118 | } | ||
119 | |||
120 | /* Minimal DCTCP CE state machine: | ||
121 | * | ||
122 | * S: 0 <- last pkt was non-CE | ||
123 | * 1 <- last pkt was CE | ||
124 | */ | ||
125 | |||
126 | static void dctcp_ce_state_0_to_1(struct sock *sk) | ||
127 | { | ||
128 | struct dctcp *ca = inet_csk_ca(sk); | ||
129 | struct tcp_sock *tp = tcp_sk(sk); | ||
130 | |||
131 | /* State has changed from CE=0 to CE=1 and delayed | ||
132 | * ACK has not been sent yet. | ||
133 | */ | ||
134 | if (!ca->ce_state && ca->delayed_ack_reserved) { | ||
135 | u32 tmp_rcv_nxt; | ||
136 | |||
137 | /* Save current rcv_nxt. */ | ||
138 | tmp_rcv_nxt = tp->rcv_nxt; | ||
139 | |||
140 | /* Generate previous ack with CE=0. */ | ||
141 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | ||
142 | tp->rcv_nxt = ca->prior_rcv_nxt; | ||
143 | |||
144 | tcp_send_ack(sk); | ||
145 | |||
146 | /* Recover current rcv_nxt. */ | ||
147 | tp->rcv_nxt = tmp_rcv_nxt; | ||
148 | } | ||
149 | |||
150 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
151 | ca->ce_state = 1; | ||
152 | |||
153 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
154 | } | ||
155 | |||
156 | static void dctcp_ce_state_1_to_0(struct sock *sk) | ||
157 | { | ||
158 | struct dctcp *ca = inet_csk_ca(sk); | ||
159 | struct tcp_sock *tp = tcp_sk(sk); | ||
160 | |||
161 | /* State has changed from CE=1 to CE=0 and delayed | ||
162 | * ACK has not been sent yet. | ||
163 | */ | ||
164 | if (ca->ce_state && ca->delayed_ack_reserved) { | ||
165 | u32 tmp_rcv_nxt; | ||
166 | |||
167 | /* Save current rcv_nxt. */ | ||
168 | tmp_rcv_nxt = tp->rcv_nxt; | ||
169 | |||
170 | /* Generate previous ack with CE=1. */ | ||
171 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
172 | tp->rcv_nxt = ca->prior_rcv_nxt; | ||
173 | |||
174 | tcp_send_ack(sk); | ||
175 | |||
176 | /* Recover current rcv_nxt. */ | ||
177 | tp->rcv_nxt = tmp_rcv_nxt; | ||
178 | } | ||
179 | |||
180 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
181 | ca->ce_state = 0; | ||
182 | |||
183 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | ||
184 | } | ||
185 | |||
186 | static void dctcp_update_alpha(struct sock *sk, u32 flags) | ||
187 | { | ||
188 | const struct tcp_sock *tp = tcp_sk(sk); | ||
189 | struct dctcp *ca = inet_csk_ca(sk); | ||
190 | u32 acked_bytes = tp->snd_una - ca->prior_snd_una; | ||
191 | |||
192 | /* If ack did not advance snd_una, count dupack as MSS size. | ||
193 | * If ack did update window, do not count it at all. | ||
194 | */ | ||
195 | if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) | ||
196 | acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; | ||
197 | if (acked_bytes) { | ||
198 | ca->acked_bytes_total += acked_bytes; | ||
199 | ca->prior_snd_una = tp->snd_una; | ||
200 | |||
201 | if (flags & CA_ACK_ECE) | ||
202 | ca->acked_bytes_ecn += acked_bytes; | ||
203 | } | ||
204 | |||
205 | /* Expired RTT */ | ||
206 | if (!before(tp->snd_una, ca->next_seq)) { | ||
207 | /* Avoid a zero denominator. */ | ||
208 | if (ca->acked_bytes_total == 0) | ||
209 | ca->acked_bytes_total = 1; | ||
210 | |||
211 | /* alpha = (1 - g) * alpha + g * F */ | ||
212 | ca->dctcp_alpha = ca->dctcp_alpha - | ||
213 | (ca->dctcp_alpha >> dctcp_shift_g) + | ||
214 | (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / | ||
215 | ca->acked_bytes_total; | ||
216 | |||
217 | if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) | ||
218 | /* Clamp dctcp_alpha to max. */ | ||
219 | ca->dctcp_alpha = DCTCP_MAX_ALPHA; | ||
220 | |||
221 | dctcp_reset(tp, ca); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | static void dctcp_state(struct sock *sk, u8 new_state) | ||
226 | { | ||
227 | if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { | ||
228 | struct dctcp *ca = inet_csk_ca(sk); | ||
229 | |||
230 | /* If this extension is enabled, we clamp dctcp_alpha to | ||
231 | * max on packet loss; the motivation is that dctcp_alpha | ||
232 | * is an indicator of the extent of congestion and packet | ||
233 | * loss is an indicator of extreme congestion; setting | ||
234 | * this in practice turned out to be beneficial, and | ||
235 | * effectively assumes total congestion which reduces the | ||
236 | * window by half. | ||
237 | */ | ||
238 | ca->dctcp_alpha = DCTCP_MAX_ALPHA; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) | ||
243 | { | ||
244 | struct dctcp *ca = inet_csk_ca(sk); | ||
245 | |||
246 | switch (ev) { | ||
247 | case CA_EVENT_DELAYED_ACK: | ||
248 | if (!ca->delayed_ack_reserved) | ||
249 | ca->delayed_ack_reserved = 1; | ||
250 | break; | ||
251 | case CA_EVENT_NON_DELAYED_ACK: | ||
252 | if (ca->delayed_ack_reserved) | ||
253 | ca->delayed_ack_reserved = 0; | ||
254 | break; | ||
255 | default: | ||
256 | /* Don't care for the rest. */ | ||
257 | break; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) | ||
262 | { | ||
263 | switch (ev) { | ||
264 | case CA_EVENT_ECN_IS_CE: | ||
265 | dctcp_ce_state_0_to_1(sk); | ||
266 | break; | ||
267 | case CA_EVENT_ECN_NO_CE: | ||
268 | dctcp_ce_state_1_to_0(sk); | ||
269 | break; | ||
270 | case CA_EVENT_DELAYED_ACK: | ||
271 | case CA_EVENT_NON_DELAYED_ACK: | ||
272 | dctcp_update_ack_reserved(sk, ev); | ||
273 | break; | ||
274 | default: | ||
275 | /* Don't care for the rest. */ | ||
276 | break; | ||
277 | } | ||
278 | } | ||
279 | |||
280 | static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) | ||
281 | { | ||
282 | const struct dctcp *ca = inet_csk_ca(sk); | ||
283 | |||
284 | /* Fill it also in case of VEGASINFO due to req struct limits. | ||
285 | * We can still correctly retrieve it later. | ||
286 | */ | ||
287 | if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || | ||
288 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { | ||
289 | struct tcp_dctcp_info info; | ||
290 | |||
291 | memset(&info, 0, sizeof(info)); | ||
292 | if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { | ||
293 | info.dctcp_enabled = 1; | ||
294 | info.dctcp_ce_state = (u16) ca->ce_state; | ||
295 | info.dctcp_alpha = ca->dctcp_alpha; | ||
296 | info.dctcp_ab_ecn = ca->acked_bytes_ecn; | ||
297 | info.dctcp_ab_tot = ca->acked_bytes_total; | ||
298 | } | ||
299 | |||
300 | nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); | ||
301 | } | ||
302 | } | ||
303 | |||
304 | static struct tcp_congestion_ops dctcp __read_mostly = { | ||
305 | .init = dctcp_init, | ||
306 | .in_ack_event = dctcp_update_alpha, | ||
307 | .cwnd_event = dctcp_cwnd_event, | ||
308 | .ssthresh = dctcp_ssthresh, | ||
309 | .cong_avoid = tcp_reno_cong_avoid, | ||
310 | .set_state = dctcp_state, | ||
311 | .get_info = dctcp_get_info, | ||
312 | .flags = TCP_CONG_NEEDS_ECN, | ||
313 | .owner = THIS_MODULE, | ||
314 | .name = "dctcp", | ||
315 | }; | ||
316 | |||
317 | static struct tcp_congestion_ops dctcp_reno __read_mostly = { | ||
318 | .ssthresh = tcp_reno_ssthresh, | ||
319 | .cong_avoid = tcp_reno_cong_avoid, | ||
320 | .get_info = dctcp_get_info, | ||
321 | .owner = THIS_MODULE, | ||
322 | .name = "dctcp-reno", | ||
323 | }; | ||
324 | |||
325 | static int __init dctcp_register(void) | ||
326 | { | ||
327 | BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); | ||
328 | return tcp_register_congestion_control(&dctcp); | ||
329 | } | ||
330 | |||
331 | static void __exit dctcp_unregister(void) | ||
332 | { | ||
333 | tcp_unregister_congestion_control(&dctcp); | ||
334 | } | ||
335 | |||
336 | module_init(dctcp_register); | ||
337 | module_exit(dctcp_unregister); | ||
338 | |||
339 | MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); | ||
340 | MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); | ||
341 | MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); | ||
342 | |||
343 | MODULE_LICENSE("GPL v2"); | ||
344 | MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); | ||
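To make the fixed-point arithmetic in dctcp_update_alpha() and dctcp_ssthresh() concrete, here is a self-contained check using only values visible in the new file (dctcp_shift_g = 4, DCTCP_MAX_ALPHA = 1024) and made-up byte counts for one RTT; with half of the acked bytes ECN-marked, alpha decays from 1024 to 992 and a cwnd of 100 is cut to an ssthresh of 52 (full marking, alpha = 1024, halves the window, Reno-style):

    #include <stdio.h>

    int main(void)
    {
    	unsigned int shift_g = 4, alpha = 1024;	/* defaults from the patch */
    	unsigned int ecn_bytes = 50000, total_bytes = 100000; /* one RTT, 50% CE */
    	unsigned int cwnd = 100, ssthresh;

    	/* alpha = (1 - g) * alpha + g * F, with F scaled by 1024 */
    	alpha = alpha - (alpha >> shift_g) +
    		(ecn_bytes << (10U - shift_g)) / total_bytes;
    	if (alpha > 1024)
    		alpha = 1024;			/* DCTCP_MAX_ALPHA clamp */

    	/* cwnd is reduced in proportion to alpha/2 instead of always halved */
    	ssthresh = cwnd - ((cwnd * alpha) >> 11U);
    	if (ssthresh < 2)
    		ssthresh = 2;

    	printf("alpha = %u, ssthresh = %u\n", alpha, ssthresh);
    	/* prints: alpha = 992, ssthresh = 52 */
    	return 0;
    }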
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index ed3f2ad42e0f..0d73f9ddb55b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
@@ -9,7 +9,6 @@ | |||
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | |||
13 | #include <linux/module.h> | 12 | #include <linux/module.h> |
14 | #include <linux/inet_diag.h> | 13 | #include <linux/inet_diag.h> |
15 | 14 | ||
@@ -35,13 +34,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, | |||
35 | } | 34 | } |
36 | 35 | ||
37 | static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, | 36 | static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, |
38 | struct inet_diag_req_v2 *r, struct nlattr *bc) | 37 | struct inet_diag_req_v2 *r, struct nlattr *bc) |
39 | { | 38 | { |
40 | inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); | 39 | inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); |
41 | } | 40 | } |
42 | 41 | ||
43 | static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, | 42 | static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, |
44 | struct inet_diag_req_v2 *req) | 43 | struct inet_diag_req_v2 *req) |
45 | { | 44 | { |
46 | return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); | 45 | return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); |
47 | } | 46 | } |
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9771563ab564..815c85e3b1e0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c | |||
@@ -115,7 +115,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, | |||
115 | 115 | ||
116 | if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { | 116 | if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { |
117 | struct in6_addr *buf = (struct in6_addr *) tmp.val; | 117 | struct in6_addr *buf = (struct in6_addr *) tmp.val; |
118 | int i = 4; | 118 | int i; |
119 | 119 | ||
120 | for (i = 0; i < 4; i++) | 120 | for (i = 0; i < 4; i++) |
121 | buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; | 121 | buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; |
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 1c4908280d92..882c08aae2f5 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <net/tcp.h> | 10 | #include <net/tcp.h> |
11 | 11 | ||
12 | |||
13 | /* From AIMD tables from RFC 3649 appendix B, | 12 | /* From AIMD tables from RFC 3649 appendix B, |
14 | * with fixed-point MD scaled <<8. | 13 | * with fixed-point MD scaled <<8. |
15 | */ | 14 | */ |
@@ -17,78 +16,78 @@ static const struct hstcp_aimd_val { | |||
17 | unsigned int cwnd; | 16 | unsigned int cwnd; |
18 | unsigned int md; | 17 | unsigned int md; |
19 | } hstcp_aimd_vals[] = { | 18 | } hstcp_aimd_vals[] = { |
20 | { 38, 128, /* 0.50 */ }, | 19 | { 38, 128, /* 0.50 */ }, |
21 | { 118, 112, /* 0.44 */ }, | 20 | { 118, 112, /* 0.44 */ }, |
22 | { 221, 104, /* 0.41 */ }, | 21 | { 221, 104, /* 0.41 */ }, |
23 | { 347, 98, /* 0.38 */ }, | 22 | { 347, 98, /* 0.38 */ }, |
24 | { 495, 93, /* 0.37 */ }, | 23 | { 495, 93, /* 0.37 */ }, |
25 | { 663, 89, /* 0.35 */ }, | 24 | { 663, 89, /* 0.35 */ }, |
26 | { 851, 86, /* 0.34 */ }, | 25 | { 851, 86, /* 0.34 */ }, |
27 | { 1058, 83, /* 0.33 */ }, | 26 | { 1058, 83, /* 0.33 */ }, |
28 | { 1284, 81, /* 0.32 */ }, | 27 | { 1284, 81, /* 0.32 */ }, |
29 | { 1529, 78, /* 0.31 */ }, | 28 | { 1529, 78, /* 0.31 */ }, |
30 | { 1793, 76, /* 0.30 */ }, | 29 | { 1793, 76, /* 0.30 */ }, |
31 | { 2076, 74, /* 0.29 */ }, | 30 | { 2076, 74, /* 0.29 */ }, |
32 | { 2378, 72, /* 0.28 */ }, | 31 | { 2378, 72, /* 0.28 */ }, |
33 | { 2699, 71, /* 0.28 */ }, | 32 | { 2699, 71, /* 0.28 */ }, |
34 | { 3039, 69, /* 0.27 */ }, | 33 | { 3039, 69, /* 0.27 */ }, |
35 | { 3399, 68, /* 0.27 */ }, | 34 | { 3399, 68, /* 0.27 */ }, |
36 | { 3778, 66, /* 0.26 */ }, | 35 | { 3778, 66, /* 0.26 */ }, |
37 | { 4177, 65, /* 0.26 */ }, | 36 | { 4177, 65, /* 0.26 */ }, |
38 | { 4596, 64, /* 0.25 */ }, | 37 | { 4596, 64, /* 0.25 */ }, |
39 | { 5036, 62, /* 0.25 */ }, | 38 | { 5036, 62, /* 0.25 */ }, |
40 | { 5497, 61, /* 0.24 */ }, | 39 | { 5497, 61, /* 0.24 */ }, |
41 | { 5979, 60, /* 0.24 */ }, | 40 | { 5979, 60, /* 0.24 */ }, |
42 | { 6483, 59, /* 0.23 */ }, | 41 | { 6483, 59, /* 0.23 */ }, |
43 | { 7009, 58, /* 0.23 */ }, | 42 | { 7009, 58, /* 0.23 */ }, |
44 | { 7558, 57, /* 0.22 */ }, | 43 | { 7558, 57, /* 0.22 */ }, |
45 | { 8130, 56, /* 0.22 */ }, | 44 | { 8130, 56, /* 0.22 */ }, |
46 | { 8726, 55, /* 0.22 */ }, | 45 | { 8726, 55, /* 0.22 */ }, |
47 | { 9346, 54, /* 0.21 */ }, | 46 | { 9346, 54, /* 0.21 */ }, |
48 | { 9991, 53, /* 0.21 */ }, | 47 | { 9991, 53, /* 0.21 */ }, |
49 | { 10661, 52, /* 0.21 */ }, | 48 | { 10661, 52, /* 0.21 */ }, |
50 | { 11358, 52, /* 0.20 */ }, | 49 | { 11358, 52, /* 0.20 */ }, |
51 | { 12082, 51, /* 0.20 */ }, | 50 | { 12082, 51, /* 0.20 */ }, |
52 | { 12834, 50, /* 0.20 */ }, | 51 | { 12834, 50, /* 0.20 */ }, |
53 | { 13614, 49, /* 0.19 */ }, | 52 | { 13614, 49, /* 0.19 */ }, |
54 | { 14424, 48, /* 0.19 */ }, | 53 | { 14424, 48, /* 0.19 */ }, |
55 | { 15265, 48, /* 0.19 */ }, | 54 | { 15265, 48, /* 0.19 */ }, |
56 | { 16137, 47, /* 0.19 */ }, | 55 | { 16137, 47, /* 0.19 */ }, |
57 | { 17042, 46, /* 0.18 */ }, | 56 | { 17042, 46, /* 0.18 */ }, |
58 | { 17981, 45, /* 0.18 */ }, | 57 | { 17981, 45, /* 0.18 */ }, |
59 | { 18955, 45, /* 0.18 */ }, | 58 | { 18955, 45, /* 0.18 */ }, |
60 | { 19965, 44, /* 0.17 */ }, | 59 | { 19965, 44, /* 0.17 */ }, |
61 | { 21013, 43, /* 0.17 */ }, | 60 | { 21013, 43, /* 0.17 */ }, |
62 | { 22101, 43, /* 0.17 */ }, | 61 | { 22101, 43, /* 0.17 */ }, |
63 | { 23230, 42, /* 0.17 */ }, | 62 | { 23230, 42, /* 0.17 */ }, |
64 | { 24402, 41, /* 0.16 */ }, | 63 | { 24402, 41, /* 0.16 */ }, |
65 | { 25618, 41, /* 0.16 */ }, | 64 | { 25618, 41, /* 0.16 */ }, |
66 | { 26881, 40, /* 0.16 */ }, | 65 | { 26881, 40, /* 0.16 */ }, |
67 | { 28193, 39, /* 0.16 */ }, | 66 | { 28193, 39, /* 0.16 */ }, |
68 | { 29557, 39, /* 0.15 */ }, | 67 | { 29557, 39, /* 0.15 */ }, |
69 | { 30975, 38, /* 0.15 */ }, | 68 | { 30975, 38, /* 0.15 */ }, |
70 | { 32450, 38, /* 0.15 */ }, | 69 | { 32450, 38, /* 0.15 */ }, |
71 | { 33986, 37, /* 0.15 */ }, | 70 | { 33986, 37, /* 0.15 */ }, |
72 | { 35586, 36, /* 0.14 */ }, | 71 | { 35586, 36, /* 0.14 */ }, |
73 | { 37253, 36, /* 0.14 */ }, | 72 | { 37253, 36, /* 0.14 */ }, |
74 | { 38992, 35, /* 0.14 */ }, | 73 | { 38992, 35, /* 0.14 */ }, |
75 | { 40808, 35, /* 0.14 */ }, | 74 | { 40808, 35, /* 0.14 */ }, |
76 | { 42707, 34, /* 0.13 */ }, | 75 | { 42707, 34, /* 0.13 */ }, |
77 | { 44694, 33, /* 0.13 */ }, | 76 | { 44694, 33, /* 0.13 */ }, |
78 | { 46776, 33, /* 0.13 */ }, | 77 | { 46776, 33, /* 0.13 */ }, |
79 | { 48961, 32, /* 0.13 */ }, | 78 | { 48961, 32, /* 0.13 */ }, |
80 | { 51258, 32, /* 0.13 */ }, | 79 | { 51258, 32, /* 0.13 */ }, |
81 | { 53677, 31, /* 0.12 */ }, | 80 | { 53677, 31, /* 0.12 */ }, |
82 | { 56230, 30, /* 0.12 */ }, | 81 | { 56230, 30, /* 0.12 */ }, |
83 | { 58932, 30, /* 0.12 */ }, | 82 | { 58932, 30, /* 0.12 */ }, |
84 | { 61799, 29, /* 0.12 */ }, | 83 | { 61799, 29, /* 0.12 */ }, |
85 | { 64851, 28, /* 0.11 */ }, | 84 | { 64851, 28, /* 0.11 */ }, |
86 | { 68113, 28, /* 0.11 */ }, | 85 | { 68113, 28, /* 0.11 */ }, |
87 | { 71617, 27, /* 0.11 */ }, | 86 | { 71617, 27, /* 0.11 */ }, |
88 | { 75401, 26, /* 0.10 */ }, | 87 | { 75401, 26, /* 0.10 */ }, |
89 | { 79517, 26, /* 0.10 */ }, | 88 | { 79517, 26, /* 0.10 */ }, |
90 | { 84035, 25, /* 0.10 */ }, | 89 | { 84035, 25, /* 0.10 */ }, |
91 | { 89053, 24, /* 0.10 */ }, | 90 | { 89053, 24, /* 0.10 */ }, |
92 | }; | 91 | }; |
93 | 92 | ||
94 | #define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) | 93 | #define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) |
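The table above maps a cwnd range to a multiplicative-decrease factor in <<8 fixed point (the md column), per RFC 3649 appendix B. A hedged sketch of how such a table is consulted; the real tcp_highspeed.c caches the current row index rather than searching on every event, and that code is outside this hunk, so treat the lookup below as illustrative only:

    struct hstcp_aimd_val_sketch { unsigned int cwnd, md; };

    /* Pick the first row whose cwnd bound covers the current window and
     * apply its decrease factor: new_cwnd = cwnd - (cwnd * md >> 8).
     */
    static unsigned int hstcp_backoff_sketch(const struct hstcp_aimd_val_sketch *tbl,
    					 unsigned int rows, unsigned int cwnd)
    {
    	unsigned int i = 0;

    	while (i + 1 < rows && cwnd > tbl[i].cwnd)
    		i++;

    	return cwnd - ((cwnd * tbl[i].md) >> 8);
    }

For a cwnd of 1000 this lands on the { 1058, 83 } row and backs the window off to 676, roughly the 0.33 decrease annotated there.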
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 031361311a8b..58469fff6c18 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
@@ -98,7 +98,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) | |||
98 | } | 98 | } |
99 | } | 99 | } |
100 | 100 | ||
101 | static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt) | 101 | static void measure_achieved_throughput(struct sock *sk, |
102 | u32 pkts_acked, s32 rtt) | ||
102 | { | 103 | { |
103 | const struct inet_connection_sock *icsk = inet_csk(sk); | 104 | const struct inet_connection_sock *icsk = inet_csk(sk); |
104 | const struct tcp_sock *tp = tcp_sk(sk); | 105 | const struct tcp_sock *tp = tcp_sk(sk); |
@@ -148,8 +149,8 @@ static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) | |||
148 | if (use_bandwidth_switch) { | 149 | if (use_bandwidth_switch) { |
149 | u32 maxB = ca->maxB; | 150 | u32 maxB = ca->maxB; |
150 | u32 old_maxB = ca->old_maxB; | 151 | u32 old_maxB = ca->old_maxB; |
151 | ca->old_maxB = ca->maxB; | ||
152 | 152 | ||
153 | ca->old_maxB = ca->maxB; | ||
153 | if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { | 154 | if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { |
154 | ca->beta = BETA_MIN; | 155 | ca->beta = BETA_MIN; |
155 | ca->modeswitch = 0; | 156 | ca->modeswitch = 0; |
@@ -270,6 +271,7 @@ static void htcp_state(struct sock *sk, u8 new_state) | |||
270 | case TCP_CA_Open: | 271 | case TCP_CA_Open: |
271 | { | 272 | { |
272 | struct htcp *ca = inet_csk_ca(sk); | 273 | struct htcp *ca = inet_csk_ca(sk); |
274 | |||
273 | if (ca->undo_last_cong) { | 275 | if (ca->undo_last_cong) { |
274 | ca->last_cong = jiffies; | 276 | ca->last_cong = jiffies; |
275 | ca->undo_last_cong = 0; | 277 | ca->undo_last_cong = 0; |
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index d8f8f05a4951..f963b274f2b0 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c | |||
@@ -29,7 +29,6 @@ static int rtt0 = 25; | |||
29 | module_param(rtt0, int, 0644); | 29 | module_param(rtt0, int, 0644); |
30 | MODULE_PARM_DESC(rtt0, "reference round trip time (ms)"); | 30 | MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
31 | 31 | ||
32 | |||
33 | /* This is called to refresh values for hybla parameters */ | 32 | /* This is called to refresh values for hybla parameters */ |
34 | static inline void hybla_recalc_param (struct sock *sk) | 33 | static inline void hybla_recalc_param (struct sock *sk) |
35 | { | 34 | { |
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 5999b3972e64..1d5a30a90adf 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c | |||
@@ -284,7 +284,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
284 | delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; | 284 | delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; |
285 | if (delta >= tp->snd_cwnd) { | 285 | if (delta >= tp->snd_cwnd) { |
286 | tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, | 286 | tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, |
287 | (u32) tp->snd_cwnd_clamp); | 287 | (u32)tp->snd_cwnd_clamp); |
288 | tp->snd_cwnd_cnt = 0; | 288 | tp->snd_cwnd_cnt = 0; |
289 | } | 289 | } |
290 | } | 290 | } |
@@ -299,7 +299,6 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) | |||
299 | return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); | 299 | return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); |
300 | } | 300 | } |
301 | 301 | ||
302 | |||
303 | /* Extract info for Tcp socket info provided via netlink. */ | 302 | /* Extract info for Tcp socket info provided via netlink. */ |
304 | static void tcp_illinois_info(struct sock *sk, u32 ext, | 303 | static void tcp_illinois_info(struct sock *sk, u32 ext, |
305 | struct sk_buff *skb) | 304 | struct sk_buff *skb) |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0185eea59342..00a41499d52c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -200,28 +200,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk) | |||
200 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; | 200 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; |
201 | } | 201 | } |
202 | 202 | ||
203 | static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) | 203 | static void tcp_ecn_queue_cwr(struct tcp_sock *tp) |
204 | { | 204 | { |
205 | if (tp->ecn_flags & TCP_ECN_OK) | 205 | if (tp->ecn_flags & TCP_ECN_OK) |
206 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; | 206 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; |
207 | } | 207 | } |
208 | 208 | ||
209 | static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) | 209 | static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) |
210 | { | 210 | { |
211 | if (tcp_hdr(skb)->cwr) | 211 | if (tcp_hdr(skb)->cwr) |
212 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 212 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
213 | } | 213 | } |
214 | 214 | ||
215 | static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) | 215 | static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) |
216 | { | 216 | { |
217 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 217 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
218 | } | 218 | } |
219 | 219 | ||
220 | static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) | 220 | static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) |
221 | { | 221 | { |
222 | if (!(tp->ecn_flags & TCP_ECN_OK)) | ||
223 | return; | ||
224 | |||
225 | switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { | 222 | switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { |
226 | case INET_ECN_NOT_ECT: | 223 | case INET_ECN_NOT_ECT: |
227 | /* Funny extension: if ECT is not set on a segment, | 224 | /* Funny extension: if ECT is not set on a segment, |
@@ -232,30 +229,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s | |||
232 | tcp_enter_quickack_mode((struct sock *)tp); | 229 | tcp_enter_quickack_mode((struct sock *)tp); |
233 | break; | 230 | break; |
234 | case INET_ECN_CE: | 231 | case INET_ECN_CE: |
232 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
233 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); | ||
234 | |||
235 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { | 235 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { |
236 | /* Better not delay acks, sender can have a very low cwnd */ | 236 | /* Better not delay acks, sender can have a very low cwnd */ |
237 | tcp_enter_quickack_mode((struct sock *)tp); | 237 | tcp_enter_quickack_mode((struct sock *)tp); |
238 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | 238 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; |
239 | } | 239 | } |
240 | /* fallinto */ | 240 | tp->ecn_flags |= TCP_ECN_SEEN; |
241 | break; | ||
241 | default: | 242 | default: |
243 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
244 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); | ||
242 | tp->ecn_flags |= TCP_ECN_SEEN; | 245 | tp->ecn_flags |= TCP_ECN_SEEN; |
246 | break; | ||
243 | } | 247 | } |
244 | } | 248 | } |
245 | 249 | ||
246 | static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) | 250 | static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) |
251 | { | ||
252 | if (tp->ecn_flags & TCP_ECN_OK) | ||
253 | __tcp_ecn_check_ce(tp, skb); | ||
254 | } | ||
255 | |||
256 | static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) | ||
247 | { | 257 | { |
248 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) | 258 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) |
249 | tp->ecn_flags &= ~TCP_ECN_OK; | 259 | tp->ecn_flags &= ~TCP_ECN_OK; |
250 | } | 260 | } |
251 | 261 | ||
252 | static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) | 262 | static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) |
253 | { | 263 | { |
254 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) | 264 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) |
255 | tp->ecn_flags &= ~TCP_ECN_OK; | 265 | tp->ecn_flags &= ~TCP_ECN_OK; |
256 | } | 266 | } |
257 | 267 | ||
258 | static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) | 268 | static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) |
259 | { | 269 | { |
260 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) | 270 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) |
261 | return true; | 271 | return true; |
@@ -652,7 +662,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) | |||
652 | } | 662 | } |
653 | icsk->icsk_ack.lrcvtime = now; | 663 | icsk->icsk_ack.lrcvtime = now; |
654 | 664 | ||
655 | TCP_ECN_check_ce(tp, skb); | 665 | tcp_ecn_check_ce(tp, skb); |
656 | 666 | ||
657 | if (skb->len >= 128) | 667 | if (skb->len >= 128) |
658 | tcp_grow_window(sk, skb); | 668 | tcp_grow_window(sk, skb); |
@@ -1294,9 +1304,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1294 | TCP_SKB_CB(prev)->end_seq += shifted; | 1304 | TCP_SKB_CB(prev)->end_seq += shifted; |
1295 | TCP_SKB_CB(skb)->seq += shifted; | 1305 | TCP_SKB_CB(skb)->seq += shifted; |
1296 | 1306 | ||
1297 | skb_shinfo(prev)->gso_segs += pcount; | 1307 | tcp_skb_pcount_add(prev, pcount); |
1298 | BUG_ON(skb_shinfo(skb)->gso_segs < pcount); | 1308 | BUG_ON(tcp_skb_pcount(skb) < pcount); |
1299 | skb_shinfo(skb)->gso_segs -= pcount; | 1309 | tcp_skb_pcount_add(skb, -pcount); |
1300 | 1310 | ||
1301 | /* When we're adding to gso_segs == 1, gso_size will be zero, | 1311 | /* When we're adding to gso_segs == 1, gso_size will be zero, |
1302 | * in theory this shouldn't be necessary but as long as DSACK | 1312 | * in theory this shouldn't be necessary but as long as DSACK |
@@ -1309,7 +1319,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1309 | } | 1319 | } |
1310 | 1320 | ||
1311 | /* CHECKME: To clear or not to clear? Mimics normal skb currently */ | 1321 | /* CHECKME: To clear or not to clear? Mimics normal skb currently */ |
1312 | if (skb_shinfo(skb)->gso_segs <= 1) { | 1322 | if (tcp_skb_pcount(skb) <= 1) { |
1313 | skb_shinfo(skb)->gso_size = 0; | 1323 | skb_shinfo(skb)->gso_size = 0; |
1314 | skb_shinfo(skb)->gso_type = 0; | 1324 | skb_shinfo(skb)->gso_type = 0; |
1315 | } | 1325 | } |
@@ -1887,21 +1897,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) | |||
1887 | tp->sacked_out = 0; | 1897 | tp->sacked_out = 0; |
1888 | } | 1898 | } |
1889 | 1899 | ||
1890 | static void tcp_clear_retrans_partial(struct tcp_sock *tp) | 1900 | void tcp_clear_retrans(struct tcp_sock *tp) |
1891 | { | 1901 | { |
1892 | tp->retrans_out = 0; | 1902 | tp->retrans_out = 0; |
1893 | tp->lost_out = 0; | 1903 | tp->lost_out = 0; |
1894 | |||
1895 | tp->undo_marker = 0; | 1904 | tp->undo_marker = 0; |
1896 | tp->undo_retrans = -1; | 1905 | tp->undo_retrans = -1; |
1906 | tp->fackets_out = 0; | ||
1907 | tp->sacked_out = 0; | ||
1897 | } | 1908 | } |
1898 | 1909 | ||
1899 | void tcp_clear_retrans(struct tcp_sock *tp) | 1910 | static inline void tcp_init_undo(struct tcp_sock *tp) |
1900 | { | 1911 | { |
1901 | tcp_clear_retrans_partial(tp); | 1912 | tp->undo_marker = tp->snd_una; |
1902 | 1913 | /* Retransmission still in flight may cause DSACKs later. */ | |
1903 | tp->fackets_out = 0; | 1914 | tp->undo_retrans = tp->retrans_out ? : -1; |
1904 | tp->sacked_out = 0; | ||
1905 | } | 1915 | } |
1906 | 1916 | ||
1907 | /* Enter Loss state. If we detect SACK reneging, forget all SACK information | 1917 | /* Enter Loss state. If we detect SACK reneging, forget all SACK information |
@@ -1924,18 +1934,18 @@ void tcp_enter_loss(struct sock *sk) | |||
1924 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | 1934 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
1925 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | 1935 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
1926 | tcp_ca_event(sk, CA_EVENT_LOSS); | 1936 | tcp_ca_event(sk, CA_EVENT_LOSS); |
1937 | tcp_init_undo(tp); | ||
1927 | } | 1938 | } |
1928 | tp->snd_cwnd = 1; | 1939 | tp->snd_cwnd = 1; |
1929 | tp->snd_cwnd_cnt = 0; | 1940 | tp->snd_cwnd_cnt = 0; |
1930 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1941 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1931 | 1942 | ||
1932 | tcp_clear_retrans_partial(tp); | 1943 | tp->retrans_out = 0; |
1944 | tp->lost_out = 0; | ||
1933 | 1945 | ||
1934 | if (tcp_is_reno(tp)) | 1946 | if (tcp_is_reno(tp)) |
1935 | tcp_reset_reno_sack(tp); | 1947 | tcp_reset_reno_sack(tp); |
1936 | 1948 | ||
1937 | tp->undo_marker = tp->snd_una; | ||
1938 | |||
1939 | skb = tcp_write_queue_head(sk); | 1949 | skb = tcp_write_queue_head(sk); |
1940 | is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); | 1950 | is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); |
1941 | if (is_reneg) { | 1951 | if (is_reneg) { |
@@ -1949,9 +1959,6 @@ void tcp_enter_loss(struct sock *sk) | |||
1949 | if (skb == tcp_send_head(sk)) | 1959 | if (skb == tcp_send_head(sk)) |
1950 | break; | 1960 | break; |
1951 | 1961 | ||
1952 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) | ||
1953 | tp->undo_marker = 0; | ||
1954 | |||
1955 | TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; | 1962 | TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; |
1956 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { | 1963 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { |
1957 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; | 1964 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; |
@@ -1971,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk) | |||
1971 | sysctl_tcp_reordering); | 1978 | sysctl_tcp_reordering); |
1972 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1979 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1973 | tp->high_seq = tp->snd_nxt; | 1980 | tp->high_seq = tp->snd_nxt; |
1974 | TCP_ECN_queue_cwr(tp); | 1981 | tcp_ecn_queue_cwr(tp); |
1975 | 1982 | ||
1976 | /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous | 1983 | /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous |
1977 | * loss recovery is underway except recurring timeout(s) on | 1984 | * loss recovery is underway except recurring timeout(s) on |
@@ -2363,7 +2370,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) | |||
2363 | 2370 | ||
2364 | if (tp->prior_ssthresh > tp->snd_ssthresh) { | 2371 | if (tp->prior_ssthresh > tp->snd_ssthresh) { |
2365 | tp->snd_ssthresh = tp->prior_ssthresh; | 2372 | tp->snd_ssthresh = tp->prior_ssthresh; |
2366 | TCP_ECN_withdraw_cwr(tp); | 2373 | tcp_ecn_withdraw_cwr(tp); |
2367 | } | 2374 | } |
2368 | } else { | 2375 | } else { |
2369 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); | 2376 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); |
@@ -2493,7 +2500,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) | |||
2493 | tp->prr_delivered = 0; | 2500 | tp->prr_delivered = 0; |
2494 | tp->prr_out = 0; | 2501 | tp->prr_out = 0; |
2495 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); | 2502 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); |
2496 | TCP_ECN_queue_cwr(tp); | 2503 | tcp_ecn_queue_cwr(tp); |
2497 | } | 2504 | } |
2498 | 2505 | ||
2499 | static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, | 2506 | static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, |
@@ -2670,8 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | |||
2670 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | 2677 | NET_INC_STATS_BH(sock_net(sk), mib_idx); |
2671 | 2678 | ||
2672 | tp->prior_ssthresh = 0; | 2679 | tp->prior_ssthresh = 0; |
2673 | tp->undo_marker = tp->snd_una; | 2680 | tcp_init_undo(tp); |
2674 | tp->undo_retrans = tp->retrans_out ? : -1; | ||
2675 | 2681 | ||
2676 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | 2682 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { |
2677 | if (!ece_ack) | 2683 | if (!ece_ack) |
@@ -2970,7 +2976,8 @@ void tcp_rearm_rto(struct sock *sk) | |||
2970 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || | 2976 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || |
2971 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | 2977 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { |
2972 | struct sk_buff *skb = tcp_write_queue_head(sk); | 2978 | struct sk_buff *skb = tcp_write_queue_head(sk); |
2973 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; | 2979 | const u32 rto_time_stamp = |
2980 | tcp_skb_timestamp(skb) + rto; | ||
2974 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); | 2981 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); |
2975 | /* delta may not be positive if the socket is locked | 2982 | /* delta may not be positive if the socket is locked |
2976 | * when the retrans timer fires and is rescheduled. | 2983 | * when the retrans timer fires and is rescheduled. |
@@ -3210,9 +3217,10 @@ static void tcp_ack_probe(struct sock *sk) | |||
3210 | * This function is not for random using! | 3217 | * This function is not for random using! |
3211 | */ | 3218 | */ |
3212 | } else { | 3219 | } else { |
3220 | unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); | ||
3221 | |||
3213 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, | 3222 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
3214 | min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), | 3223 | when, TCP_RTO_MAX); |
3215 | TCP_RTO_MAX); | ||
3216 | } | 3224 | } |
3217 | } | 3225 | } |
3218 | 3226 | ||
@@ -3363,6 +3371,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) | |||
3363 | } | 3371 | } |
3364 | } | 3372 | } |
3365 | 3373 | ||
3374 | static inline void tcp_in_ack_event(struct sock *sk, u32 flags) | ||
3375 | { | ||
3376 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
3377 | |||
3378 | if (icsk->icsk_ca_ops->in_ack_event) | ||
3379 | icsk->icsk_ca_ops->in_ack_event(sk, flags); | ||
3380 | } | ||
3381 | |||
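The helper above gives congestion-control modules a dedicated per-ACK hook: in_ack_event() is invoked with CA_ACK_WIN_UPDATE on the fast path and with CA_ACK_SLOWPATH (plus CA_ACK_ECE and/or CA_ACK_WIN_UPDATE) otherwise, replacing the old CA_EVENT_FAST_ACK/CA_EVENT_SLOW_ACK cwnd events. A hedged sketch of a module-side consumer, modeled on dctcp_update_alpha() earlier in this patch; struct sample_ca and its fields are hypothetical, not kernel state:

    /* Hypothetical CA private area, kept within ICSK_CA_PRIV_SIZE. */
    struct sample_ca {
    	u32 prior_snd_una;
    	u32 ecn_bytes;
    	u32 total_bytes;
    };

    static void sample_in_ack_event(struct sock *sk, u32 flags)
    {
    	const struct tcp_sock *tp = tcp_sk(sk);
    	struct sample_ca *ca = inet_csk_ca(sk);
    	u32 acked = tp->snd_una - ca->prior_snd_una;

    	/* Count a pure dupack as one MSS, ignore bare window updates. */
    	if (acked == 0 && !(flags & CA_ACK_WIN_UPDATE))
    		acked = inet_csk(sk)->icsk_ack.rcv_mss;

    	ca->total_bytes += acked;
    	if (flags & CA_ACK_ECE)		/* this ACK carried ECN-Echo */
    		ca->ecn_bytes += acked;
    	ca->prior_snd_una = tp->snd_una;
    }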
3366 | /* This routine deals with incoming acks, but not outgoing ones. */ | 3382 | /* This routine deals with incoming acks, but not outgoing ones. */ |
3367 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | 3383 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) |
3368 | { | 3384 | { |
@@ -3422,10 +3438,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3422 | tp->snd_una = ack; | 3438 | tp->snd_una = ack; |
3423 | flag |= FLAG_WIN_UPDATE; | 3439 | flag |= FLAG_WIN_UPDATE; |
3424 | 3440 | ||
3425 | tcp_ca_event(sk, CA_EVENT_FAST_ACK); | 3441 | tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); |
3426 | 3442 | ||
3427 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); | 3443 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); |
3428 | } else { | 3444 | } else { |
3445 | u32 ack_ev_flags = CA_ACK_SLOWPATH; | ||
3446 | |||
3429 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 3447 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
3430 | flag |= FLAG_DATA; | 3448 | flag |= FLAG_DATA; |
3431 | else | 3449 | else |
@@ -3437,10 +3455,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3437 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3455 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3438 | &sack_rtt_us); | 3456 | &sack_rtt_us); |
3439 | 3457 | ||
3440 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) | 3458 | if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { |
3441 | flag |= FLAG_ECE; | 3459 | flag |= FLAG_ECE; |
3460 | ack_ev_flags |= CA_ACK_ECE; | ||
3461 | } | ||
3462 | |||
3463 | if (flag & FLAG_WIN_UPDATE) | ||
3464 | ack_ev_flags |= CA_ACK_WIN_UPDATE; | ||
3442 | 3465 | ||
3443 | tcp_ca_event(sk, CA_EVENT_SLOW_ACK); | 3466 | tcp_in_ack_event(sk, ack_ev_flags); |
3444 | } | 3467 | } |
3445 | 3468 | ||
3446 | /* We passed data and got it acked, remove any soft error | 3469 | /* We passed data and got it acked, remove any soft error |
@@ -4062,6 +4085,44 @@ static void tcp_sack_remove(struct tcp_sock *tp) | |||
4062 | tp->rx_opt.num_sacks = num_sacks; | 4085 | tp->rx_opt.num_sacks = num_sacks; |
4063 | } | 4086 | } |
4064 | 4087 | ||
4088 | /** | ||
4089 | * tcp_try_coalesce - try to merge skb to prior one | ||
4090 | * @sk: socket | ||
4091 | * @to: prior buffer | ||
4092 | * @from: buffer to add in queue | ||
4093 | * @fragstolen: pointer to boolean | ||
4094 | * | ||
4095 | * Before queueing skb @from after @to, try to merge them | ||
4096 | * to reduce overall memory use and queue lengths, if cost is small. | ||
4097 | * Packets in ofo or receive queues can stay a long time. | ||
4098 | * Better try to coalesce them right now to avoid future collapses. | ||
4099 | * Returns true if caller should free @from instead of queueing it | ||
4100 | */ | ||
4101 | static bool tcp_try_coalesce(struct sock *sk, | ||
4102 | struct sk_buff *to, | ||
4103 | struct sk_buff *from, | ||
4104 | bool *fragstolen) | ||
4105 | { | ||
4106 | int delta; | ||
4107 | |||
4108 | *fragstolen = false; | ||
4109 | |||
4110 | /* It's possible this segment overlaps with prior segment in queue */ | ||
4111 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) | ||
4112 | return false; | ||
4113 | |||
4114 | if (!skb_try_coalesce(to, from, fragstolen, &delta)) | ||
4115 | return false; | ||
4116 | |||
4117 | atomic_add(delta, &sk->sk_rmem_alloc); | ||
4118 | sk_mem_charge(sk, delta); | ||
4119 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); | ||
4120 | TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; | ||
4121 | TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; | ||
4122 | TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; | ||
4123 | return true; | ||
4124 | } | ||
4125 | |||
4065 | /* This one checks to see if we can put data from the | 4126 | /* This one checks to see if we can put data from the |
4066 | * out_of_order queue into the receive_queue. | 4127 | * out_of_order queue into the receive_queue. |
4067 | */ | 4128 | */ |
@@ -4069,7 +4130,8 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4069 | { | 4130 | { |
4070 | struct tcp_sock *tp = tcp_sk(sk); | 4131 | struct tcp_sock *tp = tcp_sk(sk); |
4071 | __u32 dsack_high = tp->rcv_nxt; | 4132 | __u32 dsack_high = tp->rcv_nxt; |
4072 | struct sk_buff *skb; | 4133 | struct sk_buff *skb, *tail; |
4134 | bool fragstolen, eaten; | ||
4073 | 4135 | ||
4074 | while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { | 4136 | while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { |
4075 | if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) | 4137 | if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
@@ -4082,9 +4144,9 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4082 | tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); | 4144 | tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); |
4083 | } | 4145 | } |
4084 | 4146 | ||
4147 | __skb_unlink(skb, &tp->out_of_order_queue); | ||
4085 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { | 4148 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
4086 | SOCK_DEBUG(sk, "ofo packet was already received\n"); | 4149 | SOCK_DEBUG(sk, "ofo packet was already received\n"); |
4087 | __skb_unlink(skb, &tp->out_of_order_queue); | ||
4088 | __kfree_skb(skb); | 4150 | __kfree_skb(skb); |
4089 | continue; | 4151 | continue; |
4090 | } | 4152 | } |
@@ -4092,11 +4154,15 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4092 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, | 4154 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
4093 | TCP_SKB_CB(skb)->end_seq); | 4155 | TCP_SKB_CB(skb)->end_seq); |
4094 | 4156 | ||
4095 | __skb_unlink(skb, &tp->out_of_order_queue); | 4157 | tail = skb_peek_tail(&sk->sk_receive_queue); |
4096 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 4158 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
4097 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4159 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
4098 | if (tcp_hdr(skb)->fin) | 4160 | if (!eaten) |
4161 | __skb_queue_tail(&sk->sk_receive_queue, skb); | ||
4162 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) | ||
4099 | tcp_fin(sk); | 4163 | tcp_fin(sk); |
4164 | if (eaten) | ||
4165 | kfree_skb_partial(skb, fragstolen); | ||
4100 | } | 4166 | } |
4101 | } | 4167 | } |
4102 | 4168 | ||
@@ -4123,53 +4189,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, | |||
4123 | return 0; | 4189 | return 0; |
4124 | } | 4190 | } |
4125 | 4191 | ||
4126 | /** | ||
4127 | * tcp_try_coalesce - try to merge skb to prior one | ||
4128 | * @sk: socket | ||
4129 | * @to: prior buffer | ||
4130 | * @from: buffer to add in queue | ||
4131 | * @fragstolen: pointer to boolean | ||
4132 | * | ||
4133 | * Before queueing skb @from after @to, try to merge them | ||
4134 | * to reduce overall memory use and queue lengths, if cost is small. | ||
4135 | * Packets in ofo or receive queues can stay a long time. | ||
4136 | * Better try to coalesce them right now to avoid future collapses. | ||
4137 | * Returns true if caller should free @from instead of queueing it | ||
4138 | */ | ||
4139 | static bool tcp_try_coalesce(struct sock *sk, | ||
4140 | struct sk_buff *to, | ||
4141 | struct sk_buff *from, | ||
4142 | bool *fragstolen) | ||
4143 | { | ||
4144 | int delta; | ||
4145 | |||
4146 | *fragstolen = false; | ||
4147 | |||
4148 | if (tcp_hdr(from)->fin) | ||
4149 | return false; | ||
4150 | |||
4151 | /* Its possible this segment overlaps with prior segment in queue */ | ||
4152 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) | ||
4153 | return false; | ||
4154 | |||
4155 | if (!skb_try_coalesce(to, from, fragstolen, &delta)) | ||
4156 | return false; | ||
4157 | |||
4158 | atomic_add(delta, &sk->sk_rmem_alloc); | ||
4159 | sk_mem_charge(sk, delta); | ||
4160 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); | ||
4161 | TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; | ||
4162 | TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; | ||
4163 | return true; | ||
4164 | } | ||
4165 | |||
4166 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | 4192 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) |
4167 | { | 4193 | { |
4168 | struct tcp_sock *tp = tcp_sk(sk); | 4194 | struct tcp_sock *tp = tcp_sk(sk); |
4169 | struct sk_buff *skb1; | 4195 | struct sk_buff *skb1; |
4170 | u32 seq, end_seq; | 4196 | u32 seq, end_seq; |
4171 | 4197 | ||
4172 | TCP_ECN_check_ce(tp, skb); | 4198 | tcp_ecn_check_ce(tp, skb); |
4173 | 4199 | ||
4174 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { | 4200 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { |
4175 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); | 4201 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); |
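tcp_try_coalesce() and its kerneldoc disappear from this spot; the function has presumably been moved earlier in the file so the new tcp_ofo_queue() call site can reach it. Its logic, reconstructed from the removed body, is small enough to restate; the only assumption is that the FIN test now uses the cached tcp_flags instead of tcp_hdr():

/* Sketch of the coalescing rules, mirroring the body removed above. */
static bool try_coalesce_sketch(struct sock *sk, struct sk_buff *to,
				struct sk_buff *from, bool *fragstolen)
{
	int delta;

	*fragstolen = false;

	if (TCP_SKB_CB(from)->tcp_flags & TCPHDR_FIN)	/* keep FIN segments separate */
		return false;
	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)	/* must be contiguous */
		return false;
	if (!skb_try_coalesce(to, from, fragstolen, &delta))
		return false;

	atomic_add(delta, &sk->sk_rmem_alloc);		/* charge the extra truesize */
	sk_mem_charge(sk, delta);
	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
	return true;
}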
@@ -4308,24 +4334,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int | |||
4308 | 4334 | ||
4309 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) | 4335 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) |
4310 | { | 4336 | { |
4311 | struct sk_buff *skb = NULL; | 4337 | struct sk_buff *skb; |
4312 | struct tcphdr *th; | ||
4313 | bool fragstolen; | 4338 | bool fragstolen; |
4314 | 4339 | ||
4315 | if (size == 0) | 4340 | if (size == 0) |
4316 | return 0; | 4341 | return 0; |
4317 | 4342 | ||
4318 | skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); | 4343 | skb = alloc_skb(size, sk->sk_allocation); |
4319 | if (!skb) | 4344 | if (!skb) |
4320 | goto err; | 4345 | goto err; |
4321 | 4346 | ||
4322 | if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) | 4347 | if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) |
4323 | goto err_free; | 4348 | goto err_free; |
4324 | 4349 | ||
4325 | th = (struct tcphdr *)skb_put(skb, sizeof(*th)); | ||
4326 | skb_reset_transport_header(skb); | ||
4327 | memset(th, 0, sizeof(*th)); | ||
4328 | |||
4329 | if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) | 4350 | if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) |
4330 | goto err_free; | 4351 | goto err_free; |
4331 | 4352 | ||
@@ -4333,7 +4354,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) | |||
4333 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; | 4354 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; |
4334 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; | 4355 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; |
4335 | 4356 | ||
4336 | if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { | 4357 | if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { |
4337 | WARN_ON_ONCE(fragstolen); /* should not happen */ | 4358 | WARN_ON_ONCE(fragstolen); /* should not happen */ |
4338 | __kfree_skb(skb); | 4359 | __kfree_skb(skb); |
4339 | } | 4360 | } |
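With the fake TCP header gone, tcp_send_rcvq() only needs to fill the sequence fields carried in TCP_SKB_CB(); memory accounting is done against the skb's real truesize and tcp_queue_rcv() is told there is no header to pull. A condensed view of the new body (a sketch, not compilable in isolation):

/* Sketch of the slimmed-down repair-queue receive path. */
skb = alloc_skb(size, sk->sk_allocation);		/* no room reserved for a tcphdr */
if (!skb)
	goto err;
if (tcp_try_rmem_schedule(sk, skb, skb->truesize))	/* charge the actual truesize */
	goto err_free;
if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
	goto err_free;

TCP_SKB_CB(skb)->seq     = tcp_sk(sk)->rcv_nxt;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;

if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {		/* hdrlen is now 0 */
	WARN_ON_ONCE(fragstolen);			/* should not happen */
	__kfree_skb(skb);
}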
@@ -4347,7 +4368,6 @@ err: | |||
4347 | 4368 | ||
4348 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | 4369 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) |
4349 | { | 4370 | { |
4350 | const struct tcphdr *th = tcp_hdr(skb); | ||
4351 | struct tcp_sock *tp = tcp_sk(sk); | 4371 | struct tcp_sock *tp = tcp_sk(sk); |
4352 | int eaten = -1; | 4372 | int eaten = -1; |
4353 | bool fragstolen = false; | 4373 | bool fragstolen = false; |
@@ -4356,9 +4376,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | |||
4356 | goto drop; | 4376 | goto drop; |
4357 | 4377 | ||
4358 | skb_dst_drop(skb); | 4378 | skb_dst_drop(skb); |
4359 | __skb_pull(skb, th->doff * 4); | 4379 | __skb_pull(skb, tcp_hdr(skb)->doff * 4); |
4360 | 4380 | ||
4361 | TCP_ECN_accept_cwr(tp, skb); | 4381 | tcp_ecn_accept_cwr(tp, skb); |
4362 | 4382 | ||
4363 | tp->rx_opt.dsack = 0; | 4383 | tp->rx_opt.dsack = 0; |
4364 | 4384 | ||
@@ -4400,7 +4420,7 @@ queue_and_out: | |||
4400 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4420 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
4401 | if (skb->len) | 4421 | if (skb->len) |
4402 | tcp_event_data_recv(sk, skb); | 4422 | tcp_event_data_recv(sk, skb); |
4403 | if (th->fin) | 4423 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
4404 | tcp_fin(sk); | 4424 | tcp_fin(sk); |
4405 | 4425 | ||
4406 | if (!skb_queue_empty(&tp->out_of_order_queue)) { | 4426 | if (!skb_queue_empty(&tp->out_of_order_queue)) { |
@@ -4515,7 +4535,7 @@ restart: | |||
4515 | * - bloated or contains data before "start" or | 4535 | * - bloated or contains data before "start" or |
4516 | * overlaps to the next one. | 4536 | * overlaps to the next one. |
4517 | */ | 4537 | */ |
4518 | if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && | 4538 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
4519 | (tcp_win_from_space(skb->truesize) > skb->len || | 4539 | (tcp_win_from_space(skb->truesize) > skb->len || |
4520 | before(TCP_SKB_CB(skb)->seq, start))) { | 4540 | before(TCP_SKB_CB(skb)->seq, start))) { |
4521 | end_of_skbs = false; | 4541 | end_of_skbs = false; |
@@ -4534,30 +4554,18 @@ restart: | |||
4534 | /* Decided to skip this, advance start seq. */ | 4554 | /* Decided to skip this, advance start seq. */ |
4535 | start = TCP_SKB_CB(skb)->end_seq; | 4555 | start = TCP_SKB_CB(skb)->end_seq; |
4536 | } | 4556 | } |
4537 | if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) | 4557 | if (end_of_skbs || |
4558 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) | ||
4538 | return; | 4559 | return; |
4539 | 4560 | ||
4540 | while (before(start, end)) { | 4561 | while (before(start, end)) { |
4562 | int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); | ||
4541 | struct sk_buff *nskb; | 4563 | struct sk_buff *nskb; |
4542 | unsigned int header = skb_headroom(skb); | ||
4543 | int copy = SKB_MAX_ORDER(header, 0); | ||
4544 | 4564 | ||
4545 | /* Too big header? This can happen with IPv6. */ | 4565 | nskb = alloc_skb(copy, GFP_ATOMIC); |
4546 | if (copy < 0) | ||
4547 | return; | ||
4548 | if (end - start < copy) | ||
4549 | copy = end - start; | ||
4550 | nskb = alloc_skb(copy + header, GFP_ATOMIC); | ||
4551 | if (!nskb) | 4566 | if (!nskb) |
4552 | return; | 4567 | return; |
4553 | 4568 | ||
4554 | skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); | ||
4555 | skb_set_network_header(nskb, (skb_network_header(skb) - | ||
4556 | skb->head)); | ||
4557 | skb_set_transport_header(nskb, (skb_transport_header(skb) - | ||
4558 | skb->head)); | ||
4559 | skb_reserve(nskb, header); | ||
4560 | memcpy(nskb->head, skb->head, header); | ||
4561 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); | 4569 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); |
4562 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; | 4570 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; |
4563 | __skb_queue_before(list, skb, nskb); | 4571 | __skb_queue_before(list, skb, nskb); |
@@ -4581,8 +4589,7 @@ restart: | |||
4581 | skb = tcp_collapse_one(sk, skb, list); | 4589 | skb = tcp_collapse_one(sk, skb, list); |
4582 | if (!skb || | 4590 | if (!skb || |
4583 | skb == tail || | 4591 | skb == tail || |
4584 | tcp_hdr(skb)->syn || | 4592 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
4585 | tcp_hdr(skb)->fin) | ||
4586 | return; | 4593 | return; |
4587 | } | 4594 | } |
4588 | } | 4595 | } |
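Segments sitting on the receive or out-of-order queues never leave the socket again, so the collapsed copies no longer need the original link/network/transport headers. The copy loop therefore shrinks to a plain data-sized allocation; a sketch of the new per-iteration setup (the data copy that follows is unchanged and omitted):

/* Sketch of the simplified allocation inside tcp_collapse(). */
while (before(start, end)) {
	int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
	struct sk_buff *nskb;

	nskb = alloc_skb(copy, GFP_ATOMIC);	/* headers are not preserved any more */
	if (!nskb)
		return;

	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));	/* keep TCP_SKB_CB() state */
	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
	__skb_queue_before(list, skb, nskb);

	/* ... data is then copied from skb into nskb, advancing 'start' ... */
}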
@@ -5386,7 +5393,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5386 | * state to ESTABLISHED..." | 5393 | * state to ESTABLISHED..." |
5387 | */ | 5394 | */ |
5388 | 5395 | ||
5389 | TCP_ECN_rcv_synack(tp, th); | 5396 | tcp_ecn_rcv_synack(tp, th); |
5390 | 5397 | ||
5391 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | 5398 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
5392 | tcp_ack(sk, skb, FLAG_SLOWPATH); | 5399 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
@@ -5505,7 +5512,7 @@ discard: | |||
5505 | tp->snd_wl1 = TCP_SKB_CB(skb)->seq; | 5512 | tp->snd_wl1 = TCP_SKB_CB(skb)->seq; |
5506 | tp->max_window = tp->snd_wnd; | 5513 | tp->max_window = tp->snd_wnd; |
5507 | 5514 | ||
5508 | TCP_ECN_rcv_syn(tp, th); | 5515 | tcp_ecn_rcv_syn(tp, th); |
5509 | 5516 | ||
5510 | tcp_mtup_init(sk); | 5517 | tcp_mtup_init(sk); |
5511 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 5518 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
@@ -5835,6 +5842,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) | |||
5835 | #endif | 5842 | #endif |
5836 | } | 5843 | } |
5837 | 5844 | ||
5845 | /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | ||
5846 | * | ||
5847 | * If we receive a SYN packet with these bits set, it means a | ||
5848 | * network is playing bad games with TOS bits. In order to | ||
5849 | * avoid possible false congestion notifications, we disable | ||
5850 | * TCP ECN negotiation. | ||
5851 | * | ||
5852 | * Exception: tcp_ca wants ECN. This is required for DCTCP | ||
5853 | * congestion control; it requires setting ECT on all packets, | ||
5854 | * including SYN. We invert the test in this case: if our | ||
5855 | * local socket wants ECN, but the peer only set ECE/CWR (but not | ||
5856 | * ECT in the IP header) it is probably a non-DCTCP-aware sender. | ||
5857 | */ | ||
5858 | static void tcp_ecn_create_request(struct request_sock *req, | ||
5859 | const struct sk_buff *skb, | ||
5860 | const struct sock *listen_sk) | ||
5861 | { | ||
5862 | const struct tcphdr *th = tcp_hdr(skb); | ||
5863 | const struct net *net = sock_net(listen_sk); | ||
5864 | bool th_ecn = th->ece && th->cwr; | ||
5865 | bool ect, need_ecn; | ||
5866 | |||
5867 | if (!th_ecn) | ||
5868 | return; | ||
5869 | |||
5870 | ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); | ||
5871 | need_ecn = tcp_ca_needs_ecn(listen_sk); | ||
5872 | |||
5873 | if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) | ||
5874 | inet_rsk(req)->ecn_ok = 1; | ||
5875 | else if (ect && need_ecn) | ||
5876 | inet_rsk(req)->ecn_ok = 1; | ||
5877 | } | ||
5878 | |||
5838 | int tcp_conn_request(struct request_sock_ops *rsk_ops, | 5879 | int tcp_conn_request(struct request_sock_ops *rsk_ops, |
5839 | const struct tcp_request_sock_ops *af_ops, | 5880 | const struct tcp_request_sock_ops *af_ops, |
5840 | struct sock *sk, struct sk_buff *skb) | 5881 | struct sock *sk, struct sk_buff *skb) |
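The rule implemented by tcp_ecn_create_request() boils down to two accept cases. A compact restatement (assuming tcp_ca_needs_ecn() reports whether the listener's congestion-control module, e.g. DCTCP, insists on ECN):

/* Restatement of the ecn_ok decision above (sketch). */
bool ect      = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
bool need_ecn = tcp_ca_needs_ecn(listen_sk);

if (th->ece && th->cwr) {
	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
		inet_rsk(req)->ecn_ok = 1;	/* classic RFC 3168 negotiation */
	else if (ect && need_ecn)
		inet_rsk(req)->ecn_ok = 1;	/* ECT-marked SYN from a DCTCP-style peer */
}

Everything else, in particular an ECT-marked SYN arriving at a listener whose congestion control does not need ECN, leaves ecn_ok at zero.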
@@ -5843,7 +5884,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
5843 | struct request_sock *req; | 5884 | struct request_sock *req; |
5844 | struct tcp_sock *tp = tcp_sk(sk); | 5885 | struct tcp_sock *tp = tcp_sk(sk); |
5845 | struct dst_entry *dst = NULL; | 5886 | struct dst_entry *dst = NULL; |
5846 | __u32 isn = TCP_SKB_CB(skb)->when; | 5887 | __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; |
5847 | bool want_cookie = false, fastopen; | 5888 | bool want_cookie = false, fastopen; |
5848 | struct flowi fl; | 5889 | struct flowi fl; |
5849 | struct tcp_fastopen_cookie foc = { .len = -1 }; | 5890 | struct tcp_fastopen_cookie foc = { .len = -1 }; |
@@ -5895,7 +5936,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
5895 | goto drop_and_free; | 5936 | goto drop_and_free; |
5896 | 5937 | ||
5897 | if (!want_cookie || tmp_opt.tstamp_ok) | 5938 | if (!want_cookie || tmp_opt.tstamp_ok) |
5898 | TCP_ECN_create_request(req, skb, sock_net(sk)); | 5939 | tcp_ecn_create_request(req, skb, sk); |
5899 | 5940 | ||
5900 | if (want_cookie) { | 5941 | if (want_cookie) { |
5901 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); | 5942 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fbea536cf5c0..552e87e3c269 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -89,7 +89,6 @@ int sysctl_tcp_tw_reuse __read_mostly; | |||
89 | int sysctl_tcp_low_latency __read_mostly; | 89 | int sysctl_tcp_low_latency __read_mostly; |
90 | EXPORT_SYMBOL(sysctl_tcp_low_latency); | 90 | EXPORT_SYMBOL(sysctl_tcp_low_latency); |
91 | 91 | ||
92 | |||
93 | #ifdef CONFIG_TCP_MD5SIG | 92 | #ifdef CONFIG_TCP_MD5SIG |
94 | static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, | 93 | static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, |
95 | __be32 daddr, __be32 saddr, const struct tcphdr *th); | 94 | __be32 daddr, __be32 saddr, const struct tcphdr *th); |
@@ -430,15 +429,16 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
430 | break; | 429 | break; |
431 | 430 | ||
432 | icsk->icsk_backoff--; | 431 | icsk->icsk_backoff--; |
433 | inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : | 432 | icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : |
434 | TCP_TIMEOUT_INIT) << icsk->icsk_backoff; | 433 | TCP_TIMEOUT_INIT; |
435 | tcp_bound_rto(sk); | 434 | icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); |
436 | 435 | ||
437 | skb = tcp_write_queue_head(sk); | 436 | skb = tcp_write_queue_head(sk); |
438 | BUG_ON(!skb); | 437 | BUG_ON(!skb); |
439 | 438 | ||
440 | remaining = icsk->icsk_rto - min(icsk->icsk_rto, | 439 | remaining = icsk->icsk_rto - |
441 | tcp_time_stamp - TCP_SKB_CB(skb)->when); | 440 | min(icsk->icsk_rto, |
441 | tcp_time_stamp - tcp_skb_timestamp(skb)); | ||
442 | 442 | ||
443 | if (remaining) { | 443 | if (remaining) { |
444 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 444 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
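This error path and tcp_send_probe0() further down both hand the "shift and clamp" computation to inet_csk_rto_backoff(). Assuming the helper simply packages the expression it replaces, it would look roughly like:

/* Assumed shape of the helper replacing the open-coded min(rto << backoff, max). */
static inline unsigned long inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
						 unsigned long max_when)
{
	u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff;

	return (unsigned long)min_t(u64, when, max_when);	/* never exceed the cap */
}

Doing the shift in 64 bits would also guard against overflow for large backoff values.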
@@ -680,8 +680,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
680 | 680 | ||
681 | net = dev_net(skb_dst(skb)->dev); | 681 | net = dev_net(skb_dst(skb)->dev); |
682 | arg.tos = ip_hdr(skb)->tos; | 682 | arg.tos = ip_hdr(skb)->tos; |
683 | ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, | 683 | ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, |
684 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); | 684 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
685 | &arg, arg.iov[0].iov_len); | ||
685 | 686 | ||
686 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 687 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
687 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); | 688 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); |
@@ -763,8 +764,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
763 | if (oif) | 764 | if (oif) |
764 | arg.bound_dev_if = oif; | 765 | arg.bound_dev_if = oif; |
765 | arg.tos = tos; | 766 | arg.tos = tos; |
766 | ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, | 767 | ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, |
767 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); | 768 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
769 | &arg, arg.iov[0].iov_len); | ||
768 | 770 | ||
769 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 771 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
770 | } | 772 | } |
@@ -883,18 +885,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action); | |||
883 | */ | 885 | */ |
884 | static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) | 886 | static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) |
885 | { | 887 | { |
886 | const struct ip_options *opt = &(IPCB(skb)->opt); | 888 | const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; |
887 | struct ip_options_rcu *dopt = NULL; | 889 | struct ip_options_rcu *dopt = NULL; |
888 | 890 | ||
889 | if (opt && opt->optlen) { | 891 | if (opt && opt->optlen) { |
890 | int opt_size = sizeof(*dopt) + opt->optlen; | 892 | int opt_size = sizeof(*dopt) + opt->optlen; |
891 | 893 | ||
892 | dopt = kmalloc(opt_size, GFP_ATOMIC); | 894 | dopt = kmalloc(opt_size, GFP_ATOMIC); |
893 | if (dopt) { | 895 | if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { |
894 | if (ip_options_echo(&dopt->opt, skb)) { | 896 | kfree(dopt); |
895 | kfree(dopt); | 897 | dopt = NULL; |
896 | dopt = NULL; | ||
897 | } | ||
898 | } | 898 | } |
899 | } | 899 | } |
900 | return dopt; | 900 | return dopt; |
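IPCB(skb) is no longer a safe place to read from once TCP owns skb->cb (see the tcp_v4_rcv() hunk below), so the options are taken from the copy stashed in TCP_SKB_CB(skb)->header.h4.opt and passed explicitly. __ip_options_echo() is assumed to be the ip_options_echo() variant that takes the source options as an argument and returns non-zero on failure:

/* Sketch: echoing IP options from the copy saved in the TCP control block. */
const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
struct ip_options_rcu *dopt = NULL;

if (opt && opt->optlen) {
	dopt = kmalloc(sizeof(*dopt) + opt->optlen, GFP_ATOMIC);
	if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
		kfree(dopt);	/* echo failed, fall back to no options */
		dopt = NULL;
	}
}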
@@ -1268,7 +1268,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { | |||
1268 | .send_ack = tcp_v4_reqsk_send_ack, | 1268 | .send_ack = tcp_v4_reqsk_send_ack, |
1269 | .destructor = tcp_v4_reqsk_destructor, | 1269 | .destructor = tcp_v4_reqsk_destructor, |
1270 | .send_reset = tcp_v4_send_reset, | 1270 | .send_reset = tcp_v4_send_reset, |
1271 | .syn_ack_timeout = tcp_syn_ack_timeout, | 1271 | .syn_ack_timeout = tcp_syn_ack_timeout, |
1272 | }; | 1272 | }; |
1273 | 1273 | ||
1274 | static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { | 1274 | static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { |
@@ -1428,7 +1428,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
1428 | 1428 | ||
1429 | #ifdef CONFIG_SYN_COOKIES | 1429 | #ifdef CONFIG_SYN_COOKIES |
1430 | if (!th->syn) | 1430 | if (!th->syn) |
1431 | sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); | 1431 | sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); |
1432 | #endif | 1432 | #endif |
1433 | return sk; | 1433 | return sk; |
1434 | } | 1434 | } |
@@ -1558,7 +1558,17 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) | |||
1558 | skb_queue_len(&tp->ucopy.prequeue) == 0) | 1558 | skb_queue_len(&tp->ucopy.prequeue) == 0) |
1559 | return false; | 1559 | return false; |
1560 | 1560 | ||
1561 | skb_dst_force(skb); | 1561 | /* Before escaping RCU protected region, we need to take care of skb |
1562 | * dst. Prequeue is only enabled for established sockets. | ||
1563 | * For such sockets, we might need the skb dst only to set sk->sk_rx_dst | ||
1564 | * For such sockets, we might need the skb dst only to set sk->sk_rx_dst. | ||
1565 | * Instead of doing a full sk_rx_dst validity check here, let's perform | ||
1566 | */ | ||
1567 | if (likely(sk->sk_rx_dst)) | ||
1568 | skb_dst_drop(skb); | ||
1569 | else | ||
1570 | skb_dst_force(skb); | ||
1571 | |||
1562 | __skb_queue_tail(&tp->ucopy.prequeue, skb); | 1572 | __skb_queue_tail(&tp->ucopy.prequeue, skb); |
1563 | tp->ucopy.memory += skb->truesize; | 1573 | tp->ucopy.memory += skb->truesize; |
1564 | if (tp->ucopy.memory > sk->sk_rcvbuf) { | 1574 | if (tp->ucopy.memory > sk->sk_rcvbuf) { |
@@ -1623,11 +1633,19 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
1623 | 1633 | ||
1624 | th = tcp_hdr(skb); | 1634 | th = tcp_hdr(skb); |
1625 | iph = ip_hdr(skb); | 1635 | iph = ip_hdr(skb); |
1636 | /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB(); | ||
1637 | * barrier() makes sure the compiler won't play fool^Waliasing games. | ||
1638 | */ | ||
1639 | memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), | ||
1640 | sizeof(struct inet_skb_parm)); | ||
1641 | barrier(); | ||
1642 | |||
1626 | TCP_SKB_CB(skb)->seq = ntohl(th->seq); | 1643 | TCP_SKB_CB(skb)->seq = ntohl(th->seq); |
1627 | TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + | 1644 | TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + |
1628 | skb->len - th->doff * 4); | 1645 | skb->len - th->doff * 4); |
1629 | TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); | 1646 | TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); |
1630 | TCP_SKB_CB(skb)->when = 0; | 1647 | TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); |
1648 | TCP_SKB_CB(skb)->tcp_tw_isn = 0; | ||
1631 | TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); | 1649 | TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); |
1632 | TCP_SKB_CB(skb)->sacked = 0; | 1650 | TCP_SKB_CB(skb)->sacked = 0; |
1633 | 1651 | ||
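The memmove()/barrier() pair is only legal if struct tcp_skb_cb starts with a union that aliases the IPv4/IPv6 control-block state, so the inet_skb_parm data keeps living at offset 0 of skb->cb while TCP's own fields follow. The assumed layout (field order is a sketch, not authoritative):

/* Assumed tcp_skb_cb layout that makes the memmove() into skb->cb work. */
struct tcp_skb_cb {
	union {
		struct inet_skb_parm	h4;	/* what IPCB(skb) used to point at */
#if IS_ENABLED(CONFIG_IPV6)
		struct inet6_skb_parm	h6;
#endif
	} header;
	__u32	seq;		/* starting sequence number */
	__u32	end_seq;	/* seq + FIN + SYN + datalen */
	__u32	tcp_tw_isn;	/* ISN chosen by tcp_timewait_state_process() */
	__u32	ack_seq;	/* sequence number ACK'd */
	__u8	tcp_flags;	/* TCP header flag byte, tcp_flag_byte(th) */
	__u8	ip_dsfield;	/* IPv4 tos or IPv6 dsfield */
	/* sacked and the remaining fields follow */
};

This is also why tcp_transmit_skb() now memsets skb->cb before handing the packet to the IP layer (the "Cleanup our debris" hunk in tcp_output.c below).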
@@ -1754,9 +1772,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) | |||
1754 | { | 1772 | { |
1755 | struct dst_entry *dst = skb_dst(skb); | 1773 | struct dst_entry *dst = skb_dst(skb); |
1756 | 1774 | ||
1757 | dst_hold(dst); | 1775 | if (dst) { |
1758 | sk->sk_rx_dst = dst; | 1776 | dst_hold(dst); |
1759 | inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; | 1777 | sk->sk_rx_dst = dst; |
1778 | inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; | ||
1779 | } | ||
1760 | } | 1780 | } |
1761 | EXPORT_SYMBOL(inet_sk_rx_dst_set); | 1781 | EXPORT_SYMBOL(inet_sk_rx_dst_set); |
1762 | 1782 | ||
@@ -2167,7 +2187,7 @@ int tcp_seq_open(struct inode *inode, struct file *file) | |||
2167 | 2187 | ||
2168 | s = ((struct seq_file *)file->private_data)->private; | 2188 | s = ((struct seq_file *)file->private_data)->private; |
2169 | s->family = afinfo->family; | 2189 | s->family = afinfo->family; |
2170 | s->last_pos = 0; | 2190 | s->last_pos = 0; |
2171 | return 0; | 2191 | return 0; |
2172 | } | 2192 | } |
2173 | EXPORT_SYMBOL(tcp_seq_open); | 2193 | EXPORT_SYMBOL(tcp_seq_open); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1649988bd1b6..63d2680b65db 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -232,7 +232,7 @@ kill: | |||
232 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; | 232 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; |
233 | if (isn == 0) | 233 | if (isn == 0) |
234 | isn++; | 234 | isn++; |
235 | TCP_SKB_CB(skb)->when = isn; | 235 | TCP_SKB_CB(skb)->tcp_tw_isn = isn; |
236 | return TCP_TW_SYN; | 236 | return TCP_TW_SYN; |
237 | } | 237 | } |
238 | 238 | ||
@@ -393,8 +393,8 @@ void tcp_openreq_init_rwin(struct request_sock *req, | |||
393 | } | 393 | } |
394 | EXPORT_SYMBOL(tcp_openreq_init_rwin); | 394 | EXPORT_SYMBOL(tcp_openreq_init_rwin); |
395 | 395 | ||
396 | static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, | 396 | static void tcp_ecn_openreq_child(struct tcp_sock *tp, |
397 | struct request_sock *req) | 397 | const struct request_sock *req) |
398 | { | 398 | { |
399 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; | 399 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; |
400 | } | 400 | } |
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
451 | newtp->snd_cwnd = TCP_INIT_CWND; | 451 | newtp->snd_cwnd = TCP_INIT_CWND; |
452 | newtp->snd_cwnd_cnt = 0; | 452 | newtp->snd_cwnd_cnt = 0; |
453 | 453 | ||
454 | if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && | 454 | if (!try_module_get(newicsk->icsk_ca_ops->owner)) |
455 | !try_module_get(newicsk->icsk_ca_ops->owner)) | 455 | tcp_assign_congestion_control(newsk); |
456 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; | ||
457 | 456 | ||
458 | tcp_set_ca_state(newsk, TCP_CA_Open); | 457 | tcp_set_ca_state(newsk, TCP_CA_Open); |
459 | tcp_init_xmit_timers(newsk); | 458 | tcp_init_xmit_timers(newsk); |
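Instead of parking the child socket on a tcp_init_congestion_ops placeholder when the parent's congestion-control module cannot be pinned, the child now simply re-runs default selection via tcp_assign_congestion_control(). The assumed intent of that helper (sketch only): walk the registered list and keep the first ops whose module reference can be taken, with built-in Reno as the natural fallback.

/* Assumed behaviour of tcp_assign_congestion_control() (sketch). */
void tcp_assign_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (try_module_get(ca->owner)) {
			icsk->icsk_ca_ops = ca;		/* first usable entry wins */
			break;
		}
	}
	rcu_read_unlock();
}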
@@ -508,7 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
508 | if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) | 507 | if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) |
509 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 508 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
510 | newtp->rx_opt.mss_clamp = req->mss; | 509 | newtp->rx_opt.mss_clamp = req->mss; |
511 | TCP_ECN_openreq_child(newtp, req); | 510 | tcp_ecn_openreq_child(newtp, req); |
512 | newtp->fastopen_rsk = NULL; | 511 | newtp->fastopen_rsk = NULL; |
513 | newtp->syn_data_acked = 0; | 512 | newtp->syn_data_acked = 0; |
514 | 513 | ||
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index bc1b83cb8309..5b90f2f447a5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c | |||
@@ -29,6 +29,28 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, | |||
29 | } | 29 | } |
30 | } | 30 | } |
31 | 31 | ||
32 | struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, | ||
33 | netdev_features_t features) | ||
34 | { | ||
35 | if (!pskb_may_pull(skb, sizeof(struct tcphdr))) | ||
36 | return ERR_PTR(-EINVAL); | ||
37 | |||
38 | if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { | ||
39 | const struct iphdr *iph = ip_hdr(skb); | ||
40 | struct tcphdr *th = tcp_hdr(skb); | ||
41 | |||
42 | /* Set up the checksum pseudo header; usually the stack is expected | ||
43 | * to have done this already. | ||
44 | */ | ||
45 | |||
46 | th->check = 0; | ||
47 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
48 | __tcp_v4_send_check(skb, iph->saddr, iph->daddr); | ||
49 | } | ||
50 | |||
51 | return tcp_gso_segment(skb, features); | ||
52 | } | ||
53 | |||
32 | struct sk_buff *tcp_gso_segment(struct sk_buff *skb, | 54 | struct sk_buff *tcp_gso_segment(struct sk_buff *skb, |
33 | netdev_features_t features) | 55 | netdev_features_t features) |
34 | { | 56 | { |
@@ -44,9 +66,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, | |||
44 | __sum16 newcheck; | 66 | __sum16 newcheck; |
45 | bool ooo_okay, copy_destructor; | 67 | bool ooo_okay, copy_destructor; |
46 | 68 | ||
47 | if (!pskb_may_pull(skb, sizeof(*th))) | ||
48 | goto out; | ||
49 | |||
50 | th = tcp_hdr(skb); | 69 | th = tcp_hdr(skb); |
51 | thlen = th->doff * 4; | 70 | thlen = th->doff * 4; |
52 | if (thlen < sizeof(*th)) | 71 | if (thlen < sizeof(*th)) |
@@ -269,54 +288,16 @@ int tcp_gro_complete(struct sk_buff *skb) | |||
269 | } | 288 | } |
270 | EXPORT_SYMBOL(tcp_gro_complete); | 289 | EXPORT_SYMBOL(tcp_gro_complete); |
271 | 290 | ||
272 | static int tcp_v4_gso_send_check(struct sk_buff *skb) | ||
273 | { | ||
274 | const struct iphdr *iph; | ||
275 | struct tcphdr *th; | ||
276 | |||
277 | if (!pskb_may_pull(skb, sizeof(*th))) | ||
278 | return -EINVAL; | ||
279 | |||
280 | iph = ip_hdr(skb); | ||
281 | th = tcp_hdr(skb); | ||
282 | |||
283 | th->check = 0; | ||
284 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
285 | __tcp_v4_send_check(skb, iph->saddr, iph->daddr); | ||
286 | return 0; | ||
287 | } | ||
288 | |||
289 | static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) | 291 | static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) |
290 | { | 292 | { |
291 | /* Use the IP hdr immediately proceeding for this transport */ | ||
292 | const struct iphdr *iph = skb_gro_network_header(skb); | ||
293 | __wsum wsum; | ||
294 | |||
295 | /* Don't bother verifying checksum if we're going to flush anyway. */ | 293 | /* Don't bother verifying checksum if we're going to flush anyway. */ |
296 | if (NAPI_GRO_CB(skb)->flush) | 294 | if (!NAPI_GRO_CB(skb)->flush && |
297 | goto skip_csum; | 295 | skb_gro_checksum_validate(skb, IPPROTO_TCP, |
298 | 296 | inet_gro_compute_pseudo)) { | |
299 | wsum = NAPI_GRO_CB(skb)->csum; | ||
300 | |||
301 | switch (skb->ip_summed) { | ||
302 | case CHECKSUM_NONE: | ||
303 | wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), | ||
304 | 0); | ||
305 | |||
306 | /* fall through */ | ||
307 | |||
308 | case CHECKSUM_COMPLETE: | ||
309 | if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, | ||
310 | wsum)) { | ||
311 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
312 | break; | ||
313 | } | ||
314 | |||
315 | NAPI_GRO_CB(skb)->flush = 1; | 297 | NAPI_GRO_CB(skb)->flush = 1; |
316 | return NULL; | 298 | return NULL; |
317 | } | 299 | } |
318 | 300 | ||
319 | skip_csum: | ||
320 | return tcp_gro_receive(head, skb); | 301 | return tcp_gro_receive(head, skb); |
321 | } | 302 | } |
322 | 303 | ||
@@ -334,8 +315,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) | |||
334 | 315 | ||
335 | static const struct net_offload tcpv4_offload = { | 316 | static const struct net_offload tcpv4_offload = { |
336 | .callbacks = { | 317 | .callbacks = { |
337 | .gso_send_check = tcp_v4_gso_send_check, | 318 | .gso_segment = tcp4_gso_segment, |
338 | .gso_segment = tcp_gso_segment, | ||
339 | .gro_receive = tcp4_gro_receive, | 319 | .gro_receive = tcp4_gro_receive, |
340 | .gro_complete = tcp4_gro_complete, | 320 | .gro_complete = tcp4_gro_complete, |
341 | }, | 321 | }, |
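Two related cleanups land in tcp_offload.c: the per-protocol gso_send_check callback goes away because tcp4_gso_segment() (added at the top of the file) now fixes up the pseudo-header checksum itself when ip_summed is not CHECKSUM_PARTIAL, and tcp4_gro_receive() relies on skb_gro_checksum_validate() instead of open-coding the CHECKSUM_NONE/CHECKSUM_COMPLETE handling. The resulting offload registration, as in the hunk above, reduces to:

/* TCP/IPv4 offload callbacks after the cleanup (no gso_send_check slot). */
static const struct net_offload tcpv4_offload = {
	.callbacks = {
		.gso_segment	= tcp4_gso_segment,	/* checksum fixup now lives here */
		.gro_receive	= tcp4_gro_receive,	/* uses skb_gro_checksum_validate() */
		.gro_complete	= tcp4_gro_complete,
	},
};

skb_gro_checksum_validate() is assumed to return zero when the TCP checksum verifies against the pseudo-header computed by inet_gro_compute_pseudo(), which is why a non-zero result sets the flush flag.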
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5a7c41fbc6d3..8d4eac793700 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -318,36 +318,47 @@ static u16 tcp_select_window(struct sock *sk) | |||
318 | } | 318 | } |
319 | 319 | ||
320 | /* Packet ECN state for a SYN-ACK */ | 320 | /* Packet ECN state for a SYN-ACK */ |
321 | static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) | 321 | static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) |
322 | { | 322 | { |
323 | const struct tcp_sock *tp = tcp_sk(sk); | ||
324 | |||
323 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; | 325 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; |
324 | if (!(tp->ecn_flags & TCP_ECN_OK)) | 326 | if (!(tp->ecn_flags & TCP_ECN_OK)) |
325 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; | 327 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; |
328 | else if (tcp_ca_needs_ecn(sk)) | ||
329 | INET_ECN_xmit(sk); | ||
326 | } | 330 | } |
327 | 331 | ||
328 | /* Packet ECN state for a SYN. */ | 332 | /* Packet ECN state for a SYN. */ |
329 | static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) | 333 | static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) |
330 | { | 334 | { |
331 | struct tcp_sock *tp = tcp_sk(sk); | 335 | struct tcp_sock *tp = tcp_sk(sk); |
332 | 336 | ||
333 | tp->ecn_flags = 0; | 337 | tp->ecn_flags = 0; |
334 | if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { | 338 | if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || |
339 | tcp_ca_needs_ecn(sk)) { | ||
335 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; | 340 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; |
336 | tp->ecn_flags = TCP_ECN_OK; | 341 | tp->ecn_flags = TCP_ECN_OK; |
342 | if (tcp_ca_needs_ecn(sk)) | ||
343 | INET_ECN_xmit(sk); | ||
337 | } | 344 | } |
338 | } | 345 | } |
339 | 346 | ||
340 | static __inline__ void | 347 | static void |
341 | TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) | 348 | tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, |
349 | struct sock *sk) | ||
342 | { | 350 | { |
343 | if (inet_rsk(req)->ecn_ok) | 351 | if (inet_rsk(req)->ecn_ok) { |
344 | th->ece = 1; | 352 | th->ece = 1; |
353 | if (tcp_ca_needs_ecn(sk)) | ||
354 | INET_ECN_xmit(sk); | ||
355 | } | ||
345 | } | 356 | } |
346 | 357 | ||
347 | /* Set up ECN state for a packet on a ESTABLISHED socket that is about to | 358 | /* Set up ECN state for a packet on a ESTABLISHED socket that is about to |
348 | * be sent. | 359 | * be sent. |
349 | */ | 360 | */ |
350 | static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, | 361 | static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, |
351 | int tcp_header_len) | 362 | int tcp_header_len) |
352 | { | 363 | { |
353 | struct tcp_sock *tp = tcp_sk(sk); | 364 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, | |||
362 | tcp_hdr(skb)->cwr = 1; | 373 | tcp_hdr(skb)->cwr = 1; |
363 | skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; | 374 | skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; |
364 | } | 375 | } |
365 | } else { | 376 | } else if (!tcp_ca_needs_ecn(sk)) { |
366 | /* ACK or retransmitted segment: clear ECT|CE */ | 377 | /* ACK or retransmitted segment: clear ECT|CE */ |
367 | INET_ECN_dontxmit(sk); | 378 | INET_ECN_dontxmit(sk); |
368 | } | 379 | } |
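The rewritten tcp_ecn_send_syn()/tcp_ecn_send_synack()/tcp_ecn_send() helpers all branch on tcp_ca_needs_ecn(). The accessor is assumed to test a capability flag advertised by the attached congestion-control module (DCTCP being the motivating user):

/* Assumed accessor: does the attached CC module insist on ECT-marked traffic? */
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
	return inet_csk(sk)->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
}

When it returns true, the SYN and SYN-ACK paths call INET_ECN_xmit() so even handshake packets go out ECT-marked, and tcp_ecn_send() stops force-clearing ECT on pure ACKs and retransmits.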
@@ -384,7 +395,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) | |||
384 | TCP_SKB_CB(skb)->tcp_flags = flags; | 395 | TCP_SKB_CB(skb)->tcp_flags = flags; |
385 | TCP_SKB_CB(skb)->sacked = 0; | 396 | TCP_SKB_CB(skb)->sacked = 0; |
386 | 397 | ||
387 | shinfo->gso_segs = 1; | 398 | tcp_skb_pcount_set(skb, 1); |
388 | shinfo->gso_size = 0; | 399 | shinfo->gso_size = 0; |
389 | shinfo->gso_type = 0; | 400 | shinfo->gso_type = 0; |
390 | 401 | ||
@@ -550,7 +561,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
550 | 561 | ||
551 | if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { | 562 | if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { |
552 | opts->options |= OPTION_TS; | 563 | opts->options |= OPTION_TS; |
553 | opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; | 564 | opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; |
554 | opts->tsecr = tp->rx_opt.ts_recent; | 565 | opts->tsecr = tp->rx_opt.ts_recent; |
555 | remaining -= TCPOLEN_TSTAMP_ALIGNED; | 566 | remaining -= TCPOLEN_TSTAMP_ALIGNED; |
556 | } | 567 | } |
@@ -618,7 +629,7 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
618 | } | 629 | } |
619 | if (likely(ireq->tstamp_ok)) { | 630 | if (likely(ireq->tstamp_ok)) { |
620 | opts->options |= OPTION_TS; | 631 | opts->options |= OPTION_TS; |
621 | opts->tsval = TCP_SKB_CB(skb)->when; | 632 | opts->tsval = tcp_skb_timestamp(skb); |
622 | opts->tsecr = req->ts_recent; | 633 | opts->tsecr = req->ts_recent; |
623 | remaining -= TCPOLEN_TSTAMP_ALIGNED; | 634 | remaining -= TCPOLEN_TSTAMP_ALIGNED; |
624 | } | 635 | } |
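Every former reader of TCP_SKB_CB(skb)->when now calls tcp_skb_timestamp(). The accessor is assumed to translate the skb_mstamp taken when the skb is sent back into the jiffies-based tcp_time_stamp clock, so timestamp options and RTO bookkeeping keep their old units:

/* Assumed shape of the accessor replacing the old ->when field. */
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
	return skb->skb_mstamp.stamp_jiffies;	/* same units as tcp_time_stamp */
}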
@@ -647,7 +658,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |||
647 | struct tcp_out_options *opts, | 658 | struct tcp_out_options *opts, |
648 | struct tcp_md5sig_key **md5) | 659 | struct tcp_md5sig_key **md5) |
649 | { | 660 | { |
650 | struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; | ||
651 | struct tcp_sock *tp = tcp_sk(sk); | 661 | struct tcp_sock *tp = tcp_sk(sk); |
652 | unsigned int size = 0; | 662 | unsigned int size = 0; |
653 | unsigned int eff_sacks; | 663 | unsigned int eff_sacks; |
@@ -666,7 +676,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |||
666 | 676 | ||
667 | if (likely(tp->rx_opt.tstamp_ok)) { | 677 | if (likely(tp->rx_opt.tstamp_ok)) { |
668 | opts->options |= OPTION_TS; | 678 | opts->options |= OPTION_TS; |
669 | opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; | 679 | opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; |
670 | opts->tsecr = tp->rx_opt.ts_recent; | 680 | opts->tsecr = tp->rx_opt.ts_recent; |
671 | size += TCPOLEN_TSTAMP_ALIGNED; | 681 | size += TCPOLEN_TSTAMP_ALIGNED; |
672 | } | 682 | } |
@@ -886,8 +896,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
886 | skb = skb_clone(skb, gfp_mask); | 896 | skb = skb_clone(skb, gfp_mask); |
887 | if (unlikely(!skb)) | 897 | if (unlikely(!skb)) |
888 | return -ENOBUFS; | 898 | return -ENOBUFS; |
889 | /* Our usage of tstamp should remain private */ | ||
890 | skb->tstamp.tv64 = 0; | ||
891 | } | 899 | } |
892 | 900 | ||
893 | inet = inet_sk(sk); | 901 | inet = inet_sk(sk); |
@@ -952,7 +960,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
952 | 960 | ||
953 | tcp_options_write((__be32 *)(th + 1), tp, &opts); | 961 | tcp_options_write((__be32 *)(th + 1), tp, &opts); |
954 | if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) | 962 | if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) |
955 | TCP_ECN_send(sk, skb, tcp_header_size); | 963 | tcp_ecn_send(sk, skb, tcp_header_size); |
956 | 964 | ||
957 | #ifdef CONFIG_TCP_MD5SIG | 965 | #ifdef CONFIG_TCP_MD5SIG |
958 | /* Calculate the MD5 hash, as we have all we need now */ | 966 | /* Calculate the MD5 hash, as we have all we need now */ |
@@ -975,7 +983,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
975 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, | 983 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, |
976 | tcp_skb_pcount(skb)); | 984 | tcp_skb_pcount(skb)); |
977 | 985 | ||
986 | /* OK, its time to fill skb_shinfo(skb)->gso_segs */ | ||
987 | skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); | ||
988 | |||
989 | /* Our usage of tstamp should remain private */ | ||
990 | skb->tstamp.tv64 = 0; | ||
991 | |||
992 | /* Cleanup our debris for IP stacks */ | ||
993 | memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), | ||
994 | sizeof(struct inet6_skb_parm))); | ||
995 | |||
978 | err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); | 996 | err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); |
997 | |||
979 | if (likely(err <= 0)) | 998 | if (likely(err <= 0)) |
980 | return err; | 999 | return err; |
981 | 1000 | ||
@@ -995,7 +1014,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) | |||
995 | 1014 | ||
996 | /* Advance write_seq and place onto the write_queue. */ | 1015 | /* Advance write_seq and place onto the write_queue. */ |
997 | tp->write_seq = TCP_SKB_CB(skb)->end_seq; | 1016 | tp->write_seq = TCP_SKB_CB(skb)->end_seq; |
998 | skb_header_release(skb); | 1017 | __skb_header_release(skb); |
999 | tcp_add_write_queue_tail(sk, skb); | 1018 | tcp_add_write_queue_tail(sk, skb); |
1000 | sk->sk_wmem_queued += skb->truesize; | 1019 | sk->sk_wmem_queued += skb->truesize; |
1001 | sk_mem_charge(sk, skb->truesize); | 1020 | sk_mem_charge(sk, skb->truesize); |
@@ -1014,11 +1033,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, | |||
1014 | /* Avoid the costly divide in the normal | 1033 | /* Avoid the costly divide in the normal |
1015 | * non-TSO case. | 1034 | * non-TSO case. |
1016 | */ | 1035 | */ |
1017 | shinfo->gso_segs = 1; | 1036 | tcp_skb_pcount_set(skb, 1); |
1018 | shinfo->gso_size = 0; | 1037 | shinfo->gso_size = 0; |
1019 | shinfo->gso_type = 0; | 1038 | shinfo->gso_type = 0; |
1020 | } else { | 1039 | } else { |
1021 | shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); | 1040 | tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); |
1022 | shinfo->gso_size = mss_now; | 1041 | shinfo->gso_size = mss_now; |
1023 | shinfo->gso_type = sk->sk_gso_type; | 1042 | shinfo->gso_type = sk->sk_gso_type; |
1024 | } | 1043 | } |
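tcp_set_skb_tso_segs() keeps its structure, but the segment count is now stored through tcp_skb_pcount_set() rather than written straight into skb_shinfo(); the shinfo copy only happens late, in tcp_transmit_skb() ("OK, its time to fill skb_shinfo(skb)->gso_segs" above). The accessor pair is assumed to park the value in the TCP control block:

/* Assumed accessors backing tcp_skb_pcount()/tcp_skb_pcount_set() (sketch). */
static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->tcp_gso_segs;
}

static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
{
	TCP_SKB_CB(skb)->tcp_gso_segs = segs;
}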
@@ -1146,10 +1165,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, | |||
1146 | 1165 | ||
1147 | buff->ip_summed = skb->ip_summed; | 1166 | buff->ip_summed = skb->ip_summed; |
1148 | 1167 | ||
1149 | /* Looks stupid, but our code really uses when of | ||
1150 | * skbs, which it never sent before. --ANK | ||
1151 | */ | ||
1152 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; | ||
1153 | buff->tstamp = skb->tstamp; | 1168 | buff->tstamp = skb->tstamp; |
1154 | tcp_fragment_tstamp(skb, buff); | 1169 | tcp_fragment_tstamp(skb, buff); |
1155 | 1170 | ||
@@ -1171,7 +1186,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, | |||
1171 | } | 1186 | } |
1172 | 1187 | ||
1173 | /* Link BUFF into the send queue. */ | 1188 | /* Link BUFF into the send queue. */ |
1174 | skb_header_release(buff); | 1189 | __skb_header_release(buff); |
1175 | tcp_insert_write_queue_after(skb, buff, sk); | 1190 | tcp_insert_write_queue_after(skb, buff, sk); |
1176 | 1191 | ||
1177 | return 0; | 1192 | return 0; |
@@ -1675,7 +1690,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
1675 | tcp_set_skb_tso_segs(sk, buff, mss_now); | 1690 | tcp_set_skb_tso_segs(sk, buff, mss_now); |
1676 | 1691 | ||
1677 | /* Link BUFF into the send queue. */ | 1692 | /* Link BUFF into the send queue. */ |
1678 | skb_header_release(buff); | 1693 | __skb_header_release(buff); |
1679 | tcp_insert_write_queue_after(skb, buff, sk); | 1694 | tcp_insert_write_queue_after(skb, buff, sk); |
1680 | 1695 | ||
1681 | return 0; | 1696 | return 0; |
@@ -1874,8 +1889,8 @@ static int tcp_mtu_probe(struct sock *sk) | |||
1874 | tcp_init_tso_segs(sk, nskb, nskb->len); | 1889 | tcp_init_tso_segs(sk, nskb, nskb->len); |
1875 | 1890 | ||
1876 | /* We're ready to send. If this fails, the probe will | 1891 | /* We're ready to send. If this fails, the probe will |
1877 | * be resegmented into mss-sized pieces by tcp_write_xmit(). */ | 1892 | * be resegmented into mss-sized pieces by tcp_write_xmit(). |
1878 | TCP_SKB_CB(nskb)->when = tcp_time_stamp; | 1893 | */ |
1879 | if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { | 1894 | if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { |
1880 | /* Decrement cwnd here because we are sending | 1895 | /* Decrement cwnd here because we are sending |
1881 | * effectively two packets. */ | 1896 | * effectively two packets. */ |
@@ -1935,8 +1950,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1935 | BUG_ON(!tso_segs); | 1950 | BUG_ON(!tso_segs); |
1936 | 1951 | ||
1937 | if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { | 1952 | if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { |
1938 | /* "when" is used as a start point for the retransmit timer */ | 1953 | /* "skb_mstamp" is used as a start point for the retransmit timer */ |
1939 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 1954 | skb_mstamp_get(&skb->skb_mstamp); |
1940 | goto repair; /* Skip network transmission */ | 1955 | goto repair; /* Skip network transmission */ |
1941 | } | 1956 | } |
1942 | 1957 | ||
@@ -2000,8 +2015,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
2000 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) | 2015 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) |
2001 | break; | 2016 | break; |
2002 | 2017 | ||
2003 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | ||
2004 | |||
2005 | if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) | 2018 | if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) |
2006 | break; | 2019 | break; |
2007 | 2020 | ||
@@ -2097,10 +2110,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) | |||
2097 | static bool skb_still_in_host_queue(const struct sock *sk, | 2110 | static bool skb_still_in_host_queue(const struct sock *sk, |
2098 | const struct sk_buff *skb) | 2111 | const struct sk_buff *skb) |
2099 | { | 2112 | { |
2100 | const struct sk_buff *fclone = skb + 1; | 2113 | if (unlikely(skb_fclone_busy(skb))) { |
2101 | |||
2102 | if (unlikely(skb->fclone == SKB_FCLONE_ORIG && | ||
2103 | fclone->fclone == SKB_FCLONE_CLONE)) { | ||
2104 | NET_INC_STATS_BH(sock_net(sk), | 2114 | NET_INC_STATS_BH(sock_net(sk), |
2105 | LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); | 2115 | LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); |
2106 | return true; | 2116 | return true; |
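The open-coded peek at the skb fast-clone state is folded into skb_fclone_busy(); the helper is assumed to encapsulate exactly the two-condition test that the removed lines performed:

/* Assumed equivalent of the removed open-coded test (sketch). */
static inline bool skb_fclone_busy(const struct sk_buff *skb)
{
	const struct sk_buff *fclone = skb + 1;	/* fast-clone companion lives right after */

	return skb->fclone == SKB_FCLONE_ORIG &&
	       fclone->fclone == SKB_FCLONE_CLONE;
}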
@@ -2499,7 +2509,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
2499 | /* Make a copy, if the first transmission SKB clone we made | 2509 | /* Make a copy, if the first transmission SKB clone we made |
2500 | * is still in somebody's hands, else make a clone. | 2510 | * is still in somebody's hands, else make a clone. |
2501 | */ | 2511 | */ |
2502 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | ||
2503 | 2512 | ||
2504 | /* make sure skb->data is aligned on arches that require it | 2513 | /* make sure skb->data is aligned on arches that require it |
2505 | * and check if ack-trimming & collapsing extended the headroom | 2514 | * and check if ack-trimming & collapsing extended the headroom |
@@ -2544,7 +2553,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
2544 | 2553 | ||
2545 | /* Save stamp of the first retransmit. */ | 2554 | /* Save stamp of the first retransmit. */ |
2546 | if (!tp->retrans_stamp) | 2555 | if (!tp->retrans_stamp) |
2547 | tp->retrans_stamp = TCP_SKB_CB(skb)->when; | 2556 | tp->retrans_stamp = tcp_skb_timestamp(skb); |
2548 | 2557 | ||
2549 | /* snd_nxt is stored to detect loss of retransmitted segment, | 2558 | /* snd_nxt is stored to detect loss of retransmitted segment, |
2550 | * see tcp_input.c tcp_sacktag_write_queue(). | 2559 | * see tcp_input.c tcp_sacktag_write_queue(). |
@@ -2752,7 +2761,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) | |||
2752 | tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), | 2761 | tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), |
2753 | TCPHDR_ACK | TCPHDR_RST); | 2762 | TCPHDR_ACK | TCPHDR_RST); |
2754 | /* Send it off. */ | 2763 | /* Send it off. */ |
2755 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | ||
2756 | if (tcp_transmit_skb(sk, skb, 0, priority)) | 2764 | if (tcp_transmit_skb(sk, skb, 0, priority)) |
2757 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); | 2765 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); |
2758 | 2766 | ||
@@ -2780,7 +2788,7 @@ int tcp_send_synack(struct sock *sk) | |||
2780 | if (nskb == NULL) | 2788 | if (nskb == NULL) |
2781 | return -ENOMEM; | 2789 | return -ENOMEM; |
2782 | tcp_unlink_write_queue(skb, sk); | 2790 | tcp_unlink_write_queue(skb, sk); |
2783 | skb_header_release(nskb); | 2791 | __skb_header_release(nskb); |
2784 | __tcp_add_write_queue_head(sk, nskb); | 2792 | __tcp_add_write_queue_head(sk, nskb); |
2785 | sk_wmem_free_skb(sk, skb); | 2793 | sk_wmem_free_skb(sk, skb); |
2786 | sk->sk_wmem_queued += nskb->truesize; | 2794 | sk->sk_wmem_queued += nskb->truesize; |
@@ -2789,9 +2797,8 @@ int tcp_send_synack(struct sock *sk) | |||
2789 | } | 2797 | } |
2790 | 2798 | ||
2791 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; | 2799 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; |
2792 | TCP_ECN_send_synack(tcp_sk(sk), skb); | 2800 | tcp_ecn_send_synack(sk, skb); |
2793 | } | 2801 | } |
2794 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | ||
2795 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2802 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2796 | } | 2803 | } |
2797 | 2804 | ||
@@ -2835,10 +2842,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2835 | memset(&opts, 0, sizeof(opts)); | 2842 | memset(&opts, 0, sizeof(opts)); |
2836 | #ifdef CONFIG_SYN_COOKIES | 2843 | #ifdef CONFIG_SYN_COOKIES |
2837 | if (unlikely(req->cookie_ts)) | 2844 | if (unlikely(req->cookie_ts)) |
2838 | TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); | 2845 | skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); |
2839 | else | 2846 | else |
2840 | #endif | 2847 | #endif |
2841 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2848 | skb_mstamp_get(&skb->skb_mstamp); |
2842 | tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, | 2849 | tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, |
2843 | foc) + sizeof(*th); | 2850 | foc) + sizeof(*th); |
2844 | 2851 | ||
@@ -2849,7 +2856,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2849 | memset(th, 0, sizeof(struct tcphdr)); | 2856 | memset(th, 0, sizeof(struct tcphdr)); |
2850 | th->syn = 1; | 2857 | th->syn = 1; |
2851 | th->ack = 1; | 2858 | th->ack = 1; |
2852 | TCP_ECN_make_synack(req, th); | 2859 | tcp_ecn_make_synack(req, th, sk); |
2853 | th->source = htons(ireq->ir_num); | 2860 | th->source = htons(ireq->ir_num); |
2854 | th->dest = ireq->ir_rmt_port; | 2861 | th->dest = ireq->ir_rmt_port; |
2855 | /* Setting of flags are superfluous here for callers (and ECE is | 2862 | /* Setting of flags are superfluous here for callers (and ECE is |
@@ -2956,7 +2963,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) | |||
2956 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | 2963 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
2957 | 2964 | ||
2958 | tcb->end_seq += skb->len; | 2965 | tcb->end_seq += skb->len; |
2959 | skb_header_release(skb); | 2966 | __skb_header_release(skb); |
2960 | __tcp_add_write_queue_tail(sk, skb); | 2967 | __tcp_add_write_queue_tail(sk, skb); |
2961 | sk->sk_wmem_queued += skb->truesize; | 2968 | sk->sk_wmem_queued += skb->truesize; |
2962 | sk_mem_charge(sk, skb->truesize); | 2969 | sk_mem_charge(sk, skb->truesize); |
@@ -3086,9 +3093,9 @@ int tcp_connect(struct sock *sk) | |||
3086 | skb_reserve(buff, MAX_TCP_HEADER); | 3093 | skb_reserve(buff, MAX_TCP_HEADER); |
3087 | 3094 | ||
3088 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); | 3095 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); |
3089 | tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; | 3096 | tp->retrans_stamp = tcp_time_stamp; |
3090 | tcp_connect_queue_skb(sk, buff); | 3097 | tcp_connect_queue_skb(sk, buff); |
3091 | TCP_ECN_send_syn(sk, buff); | 3098 | tcp_ecn_send_syn(sk, buff); |
3092 | 3099 | ||
3093 | /* Send off SYN; include data in Fast Open. */ | 3100 | /* Send off SYN; include data in Fast Open. */ |
3094 | err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : | 3101 | err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : |
@@ -3120,6 +3127,8 @@ void tcp_send_delayed_ack(struct sock *sk) | |||
3120 | int ato = icsk->icsk_ack.ato; | 3127 | int ato = icsk->icsk_ack.ato; |
3121 | unsigned long timeout; | 3128 | unsigned long timeout; |
3122 | 3129 | ||
3130 | tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); | ||
3131 | |||
3123 | if (ato > TCP_DELACK_MIN) { | 3132 | if (ato > TCP_DELACK_MIN) { |
3124 | const struct tcp_sock *tp = tcp_sk(sk); | 3133 | const struct tcp_sock *tp = tcp_sk(sk); |
3125 | int max_ato = HZ / 2; | 3134 | int max_ato = HZ / 2; |
@@ -3176,6 +3185,8 @@ void tcp_send_ack(struct sock *sk) | |||
3176 | if (sk->sk_state == TCP_CLOSE) | 3185 | if (sk->sk_state == TCP_CLOSE) |
3177 | return; | 3186 | return; |
3178 | 3187 | ||
3188 | tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); | ||
3189 | |||
3179 | /* We are not putting this on the write queue, so | 3190 | /* We are not putting this on the write queue, so |
3180 | * tcp_transmit_skb() will set the ownership to this | 3191 | * tcp_transmit_skb() will set the ownership to this |
3181 | * sock. | 3192 | * sock. |
@@ -3194,9 +3205,10 @@ void tcp_send_ack(struct sock *sk) | |||
3194 | tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); | 3205 | tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); |
3195 | 3206 | ||
3196 | /* Send it off, this clears delayed acks for us. */ | 3207 | /* Send it off, this clears delayed acks for us. */ |
3197 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 3208 | skb_mstamp_get(&buff->skb_mstamp); |
3198 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); | 3209 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); |
3199 | } | 3210 | } |
3211 | EXPORT_SYMBOL_GPL(tcp_send_ack); | ||
3200 | 3212 | ||
3201 | /* This routine sends a packet with an out of date sequence | 3213 | /* This routine sends a packet with an out of date sequence |
3202 | * number. It assumes the other end will try to ack it. | 3214 | * number. It assumes the other end will try to ack it. |
@@ -3226,7 +3238,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) | |||
3226 | * send it. | 3238 | * send it. |
3227 | */ | 3239 | */ |
3228 | tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); | 3240 | tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); |
3229 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 3241 | skb_mstamp_get(&skb->skb_mstamp); |
3230 | return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); | 3242 | return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); |
3231 | } | 3243 | } |
3232 | 3244 | ||
@@ -3270,7 +3282,6 @@ int tcp_write_wakeup(struct sock *sk) | |||
3270 | tcp_set_skb_tso_segs(sk, skb, mss); | 3282 | tcp_set_skb_tso_segs(sk, skb, mss); |
3271 | 3283 | ||
3272 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; | 3284 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; |
3273 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | ||
3274 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 3285 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
3275 | if (!err) | 3286 | if (!err) |
3276 | tcp_event_new_data_sent(sk, skb); | 3287 | tcp_event_new_data_sent(sk, skb); |
@@ -3289,6 +3300,7 @@ void tcp_send_probe0(struct sock *sk) | |||
3289 | { | 3300 | { |
3290 | struct inet_connection_sock *icsk = inet_csk(sk); | 3301 | struct inet_connection_sock *icsk = inet_csk(sk); |
3291 | struct tcp_sock *tp = tcp_sk(sk); | 3302 | struct tcp_sock *tp = tcp_sk(sk); |
3303 | unsigned long probe_max; | ||
3292 | int err; | 3304 | int err; |
3293 | 3305 | ||
3294 | err = tcp_write_wakeup(sk); | 3306 | err = tcp_write_wakeup(sk); |
@@ -3304,9 +3316,7 @@ void tcp_send_probe0(struct sock *sk) | |||
3304 | if (icsk->icsk_backoff < sysctl_tcp_retries2) | 3316 | if (icsk->icsk_backoff < sysctl_tcp_retries2) |
3305 | icsk->icsk_backoff++; | 3317 | icsk->icsk_backoff++; |
3306 | icsk->icsk_probes_out++; | 3318 | icsk->icsk_probes_out++; |
3307 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, | 3319 | probe_max = TCP_RTO_MAX; |
3308 | min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), | ||
3309 | TCP_RTO_MAX); | ||
3310 | } else { | 3320 | } else { |
3311 | /* If packet was not sent due to local congestion, | 3321 | /* If packet was not sent due to local congestion, |
3312 | * do not backoff and do not remember icsk_probes_out. | 3322 | * do not backoff and do not remember icsk_probes_out. |
@@ -3316,11 +3326,11 @@ void tcp_send_probe0(struct sock *sk) | |||
3316 | */ | 3326 | */ |
3317 | if (!icsk->icsk_probes_out) | 3327 | if (!icsk->icsk_probes_out) |
3318 | icsk->icsk_probes_out = 1; | 3328 | icsk->icsk_probes_out = 1; |
3319 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, | 3329 | probe_max = TCP_RESOURCE_PROBE_INTERVAL; |
3320 | min(icsk->icsk_rto << icsk->icsk_backoff, | ||
3321 | TCP_RESOURCE_PROBE_INTERVAL), | ||
3322 | TCP_RTO_MAX); | ||
3323 | } | 3330 | } |
3331 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, | ||
3332 | inet_csk_rto_backoff(icsk, probe_max), | ||
3333 | TCP_RTO_MAX); | ||
3324 | } | 3334 | } |
3325 | 3335 | ||
3326 | int tcp_rtx_synack(struct sock *sk, struct request_sock *req) | 3336 | int tcp_rtx_synack(struct sock *sk, struct request_sock *req) |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 3b66610d4156..ebf5ff57526e 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c | |||
@@ -83,7 +83,6 @@ static struct { | |||
83 | struct tcp_log *log; | 83 | struct tcp_log *log; |
84 | } tcp_probe; | 84 | } tcp_probe; |
85 | 85 | ||
86 | |||
87 | static inline int tcp_probe_used(void) | 86 | static inline int tcp_probe_used(void) |
88 | { | 87 | { |
89 | return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); | 88 | return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); |
@@ -101,7 +100,6 @@ static inline int tcp_probe_avail(void) | |||
101 | si4.sin_addr.s_addr = inet->inet_##mem##addr; \ | 100 | si4.sin_addr.s_addr = inet->inet_##mem##addr; \ |
102 | } while (0) \ | 101 | } while (0) \ |
103 | 102 | ||
104 | |||
105 | /* | 103 | /* |
106 | * Hook inserted to be called before each receive packet. | 104 | * Hook inserted to be called before each receive packet. |
107 | * Note: arguments must match tcp_rcv_established()! | 105 | * Note: arguments must match tcp_rcv_established()! |
@@ -194,8 +192,8 @@ static int tcpprobe_sprint(char *tbuf, int n) | |||
194 | 192 | ||
195 | return scnprintf(tbuf, n, | 193 | return scnprintf(tbuf, n, |
196 | "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", | 194 | "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", |
197 | (unsigned long) tv.tv_sec, | 195 | (unsigned long)tv.tv_sec, |
198 | (unsigned long) tv.tv_nsec, | 196 | (unsigned long)tv.tv_nsec, |
199 | &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, | 197 | &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, |
200 | p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); | 198 | p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); |
201 | } | 199 | } |
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 8250949b8853..6824afb65d93 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
@@ -31,10 +31,10 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
31 | static u32 tcp_scalable_ssthresh(struct sock *sk) | 31 | static u32 tcp_scalable_ssthresh(struct sock *sk) |
32 | { | 32 | { |
33 | const struct tcp_sock *tp = tcp_sk(sk); | 33 | const struct tcp_sock *tp = tcp_sk(sk); |
34 | |||
34 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); | 35 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); |
35 | } | 36 | } |
36 | 37 | ||
37 | |||
38 | static struct tcp_congestion_ops tcp_scalable __read_mostly = { | 38 | static struct tcp_congestion_ops tcp_scalable __read_mostly = { |
39 | .ssthresh = tcp_scalable_ssthresh, | 39 | .ssthresh = tcp_scalable_ssthresh, |
40 | .cong_avoid = tcp_scalable_cong_avoid, | 40 | .cong_avoid = tcp_scalable_cong_avoid, |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index df90cd1ce37f..9b21ae8b2e31 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk) | |||
52 | * limit. | 52 | * limit. |
53 | * 2. If we have strong memory pressure. | 53 | * 2. If we have strong memory pressure. |
54 | */ | 54 | */ |
55 | static int tcp_out_of_resources(struct sock *sk, int do_reset) | 55 | static int tcp_out_of_resources(struct sock *sk, bool do_reset) |
56 | { | 56 | { |
57 | struct tcp_sock *tp = tcp_sk(sk); | 57 | struct tcp_sock *tp = tcp_sk(sk); |
58 | int shift = 0; | 58 | int shift = 0; |
@@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) | |||
72 | if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || | 72 | if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || |
73 | /* 2. Window is closed. */ | 73 | /* 2. Window is closed. */ |
74 | (!tp->snd_wnd && !tp->packets_out)) | 74 | (!tp->snd_wnd && !tp->packets_out)) |
75 | do_reset = 1; | 75 | do_reset = true; |
76 | if (do_reset) | 76 | if (do_reset) |
77 | tcp_send_active_reset(sk, GFP_ATOMIC); | 77 | tcp_send_active_reset(sk, GFP_ATOMIC); |
78 | tcp_done(sk); | 78 | tcp_done(sk); |
@@ -135,10 +135,9 @@ static bool retransmits_timed_out(struct sock *sk, | |||
135 | if (!inet_csk(sk)->icsk_retransmits) | 135 | if (!inet_csk(sk)->icsk_retransmits) |
136 | return false; | 136 | return false; |
137 | 137 | ||
138 | if (unlikely(!tcp_sk(sk)->retrans_stamp)) | 138 | start_ts = tcp_sk(sk)->retrans_stamp; |
139 | start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; | 139 | if (unlikely(!start_ts)) |
140 | else | 140 | start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); |
141 | start_ts = tcp_sk(sk)->retrans_stamp; | ||
142 | 141 | ||
143 | if (likely(timeout == 0)) { | 142 | if (likely(timeout == 0)) { |
144 | linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); | 143 | linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); |
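The timeout that retransmits_timed_out() measures against grows with exponential RTO backoff until the RTO saturates at TCP_RTO_MAX, then grows linearly. A rough userspace model of that arithmetic, assuming the conventional TCP_RTO_MIN of 200 ms, TCP_RTO_MAX of 120 s, and the default tcp_retries2 boundary of 15:

/* Sketch of the timeout computation around the hunk above; constants and
 * units are assumptions stated in the lead-in, not read from a live system. */
#include <stdio.h>
#include <math.h>

int main(void)
{
	const double rto_base = 0.2;	/* TCP_RTO_MIN, seconds */
	const double rto_max  = 120.0;	/* TCP_RTO_MAX, seconds */
	int linear_backoff_thresh = (int)floor(log2(rto_max / rto_base));
	int boundary = 15;		/* default sysctl_tcp_retries2 */
	double timeout;

	if (boundary <= linear_backoff_thresh)
		timeout = ((2 << boundary) - 1) * rto_base;
	else
		timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
			  (boundary - linear_backoff_thresh) * rto_max;

	printf("thresh=%d, total timeout ~ %.1f s\n",
	       linear_backoff_thresh, timeout);
	return 0;
}

With these defaults the result is roughly 924.6 seconds, the familiar ~15-minute write timeout.
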
@@ -181,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk) | |||
181 | 180 | ||
182 | retry_until = sysctl_tcp_retries2; | 181 | retry_until = sysctl_tcp_retries2; |
183 | if (sock_flag(sk, SOCK_DEAD)) { | 182 | if (sock_flag(sk, SOCK_DEAD)) { |
184 | const int alive = (icsk->icsk_rto < TCP_RTO_MAX); | 183 | const int alive = icsk->icsk_rto < TCP_RTO_MAX; |
185 | 184 | ||
186 | retry_until = tcp_orphan_retries(sk, alive); | 185 | retry_until = tcp_orphan_retries(sk, alive); |
187 | do_reset = alive || | 186 | do_reset = alive || |
@@ -271,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk) | |||
271 | struct inet_connection_sock *icsk = inet_csk(sk); | 270 | struct inet_connection_sock *icsk = inet_csk(sk); |
272 | struct tcp_sock *tp = tcp_sk(sk); | 271 | struct tcp_sock *tp = tcp_sk(sk); |
273 | int max_probes; | 272 | int max_probes; |
273 | u32 start_ts; | ||
274 | 274 | ||
275 | if (tp->packets_out || !tcp_send_head(sk)) { | 275 | if (tp->packets_out || !tcp_send_head(sk)) { |
276 | icsk->icsk_probes_out = 0; | 276 | icsk->icsk_probes_out = 0; |
277 | return; | 277 | return; |
278 | } | 278 | } |
279 | 279 | ||
280 | /* *WARNING* RFC 1122 forbids this | 280 | /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as |
281 | * | 281 | * long as the receiver continues to respond to probes. We support this by |
282 | * It doesn't AFAIK, because we kill the retransmit timer -AK | 282 | * default and reset icsk_probes_out with incoming ACKs. But if the |
283 | * | 283 | * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we |
284 | * FIXME: We ought not to do it, Solaris 2.5 actually has fixing | 284 | * kill the socket when the retry count and the time exceed the |
285 | * this behaviour in Solaris down as a bug fix. [AC] | 285 | * corresponding system limit. We also implement similar policy when |
286 | * | 286 | * we use RTO to probe window in tcp_retransmit_timer(). |
287 | * Let me to explain. icsk_probes_out is zeroed by incoming ACKs | ||
288 | * even if they advertise zero window. Hence, connection is killed only | ||
289 | * if we received no ACKs for normal connection timeout. It is not killed | ||
290 | * only because window stays zero for some time, window may be zero | ||
291 | * until armageddon and even later. We are in full accordance | ||
292 | * with RFCs, only probe timer combines both retransmission timeout | ||
293 | * and probe timeout in one bottle. --ANK | ||
294 | */ | 287 | */ |
295 | max_probes = sysctl_tcp_retries2; | 288 | start_ts = tcp_skb_timestamp(tcp_send_head(sk)); |
289 | if (!start_ts) | ||
290 | skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); | ||
291 | else if (icsk->icsk_user_timeout && | ||
292 | (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) | ||
293 | goto abort; | ||
296 | 294 | ||
295 | max_probes = sysctl_tcp_retries2; | ||
297 | if (sock_flag(sk, SOCK_DEAD)) { | 296 | if (sock_flag(sk, SOCK_DEAD)) { |
298 | const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); | 297 | const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; |
299 | 298 | ||
300 | max_probes = tcp_orphan_retries(sk, alive); | 299 | max_probes = tcp_orphan_retries(sk, alive); |
301 | 300 | if (!alive && icsk->icsk_backoff >= max_probes) | |
302 | if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) | 301 | goto abort; |
302 | if (tcp_out_of_resources(sk, true)) | ||
303 | return; | 303 | return; |
304 | } | 304 | } |
305 | 305 | ||
306 | if (icsk->icsk_probes_out > max_probes) { | 306 | if (icsk->icsk_probes_out > max_probes) { |
307 | tcp_write_err(sk); | 307 | abort: tcp_write_err(sk); |
308 | } else { | 308 | } else { |
309 | /* Only send another probe if we didn't close things up. */ | 309 | /* Only send another probe if we didn't close things up. */ |
310 | tcp_send_probe0(sk); | 310 | tcp_send_probe0(sk); |
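The new user-timeout check in tcp_probe_timer() relies on casting an unsigned timestamp difference to a signed value so the comparison stays correct across jiffies wrap-around. A small self-contained sketch of that comparison pattern (the values are made up for illustration):

/* Wrap-safe elapsed-time check in the style of
 * (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout. */
#include <stdio.h>
#include <stdint.h>

static int elapsed_exceeds(uint32_t now, uint32_t start, uint32_t limit)
{
	return (int32_t)(now - start) > (int32_t)limit;
}

int main(void)
{
	uint32_t start = 0xFFFFFF00u;	/* just before the counter wraps */
	uint32_t now   = 0x00000200u;	/* just after the wrap */
	/* now - start == 0x300 == 768 ticks, despite the wrap. */
	printf("exceeds 500?  %d\n", elapsed_exceeds(now, start, 500));
	printf("exceeds 1000? %d\n", elapsed_exceeds(now, start, 1000));
	return 0;
}
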
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index b40ad897f945..a6afde666ab1 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
@@ -51,7 +51,6 @@ MODULE_PARM_DESC(beta, "upper bound of packets in network"); | |||
51 | module_param(gamma, int, 0644); | 51 | module_param(gamma, int, 0644); |
52 | MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); | 52 | MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); |
53 | 53 | ||
54 | |||
55 | /* There are several situations when we must "re-start" Vegas: | 54 | /* There are several situations when we must "re-start" Vegas: |
56 | * | 55 | * |
57 | * o when a connection is established | 56 | * o when a connection is established |
@@ -133,7 +132,6 @@ EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); | |||
133 | 132 | ||
134 | void tcp_vegas_state(struct sock *sk, u8 ca_state) | 133 | void tcp_vegas_state(struct sock *sk, u8 ca_state) |
135 | { | 134 | { |
136 | |||
137 | if (ca_state == TCP_CA_Open) | 135 | if (ca_state == TCP_CA_Open) |
138 | vegas_enable(sk); | 136 | vegas_enable(sk); |
139 | else | 137 | else |
@@ -285,7 +283,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
285 | /* Use normal slow start */ | 283 | /* Use normal slow start */ |
286 | else if (tp->snd_cwnd <= tp->snd_ssthresh) | 284 | else if (tp->snd_cwnd <= tp->snd_ssthresh) |
287 | tcp_slow_start(tp, acked); | 285 | tcp_slow_start(tp, acked); |
288 | |||
289 | } | 286 | } |
290 | 287 | ||
291 | /* Extract info for Tcp socket info provided via netlink. */ | 288 | /* Extract info for Tcp socket info provided via netlink. */ |
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 8276977d2c85..a4d2d2d88dca 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c | |||
@@ -175,7 +175,6 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
175 | } else | 175 | } else |
176 | tp->snd_cwnd_cnt++; | 176 | tp->snd_cwnd_cnt++; |
177 | } | 177 | } |
178 | |||
179 | } | 178 | } |
180 | if (tp->snd_cwnd < 2) | 179 | if (tp->snd_cwnd < 2) |
181 | tp->snd_cwnd = 2; | 180 | tp->snd_cwnd = 2; |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b94a04ae2ed5..bb63fba47d47 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
@@ -42,7 +42,6 @@ struct westwood { | |||
42 | u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ | 42 | u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ |
43 | }; | 43 | }; |
44 | 44 | ||
45 | |||
46 | /* TCP Westwood functions and constants */ | 45 | /* TCP Westwood functions and constants */ |
47 | #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ | 46 | #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ |
48 | #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ | 47 | #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ |
@@ -153,7 +152,6 @@ static inline void update_rtt_min(struct westwood *w) | |||
153 | w->rtt_min = min(w->rtt, w->rtt_min); | 152 | w->rtt_min = min(w->rtt, w->rtt_min); |
154 | } | 153 | } |
155 | 154 | ||
156 | |||
157 | /* | 155 | /* |
158 | * @westwood_fast_bw | 156 | * @westwood_fast_bw |
159 | * It is called when we are in fast path. In particular it is called when | 157 | * It is called when we are in fast path. In particular it is called when |
@@ -208,7 +206,6 @@ static inline u32 westwood_acked_count(struct sock *sk) | |||
208 | return w->cumul_ack; | 206 | return w->cumul_ack; |
209 | } | 207 | } |
210 | 208 | ||
211 | |||
212 | /* | 209 | /* |
213 | * TCP Westwood | 210 | * TCP Westwood |
214 | * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it | 211 | * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it |
@@ -219,47 +216,51 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) | |||
219 | { | 216 | { |
220 | const struct tcp_sock *tp = tcp_sk(sk); | 217 | const struct tcp_sock *tp = tcp_sk(sk); |
221 | const struct westwood *w = inet_csk_ca(sk); | 218 | const struct westwood *w = inet_csk_ca(sk); |
219 | |||
222 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | 220 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); |
223 | } | 221 | } |
224 | 222 | ||
223 | static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) | ||
224 | { | ||
225 | if (ack_flags & CA_ACK_SLOWPATH) { | ||
226 | struct westwood *w = inet_csk_ca(sk); | ||
227 | |||
228 | westwood_update_window(sk); | ||
229 | w->bk += westwood_acked_count(sk); | ||
230 | |||
231 | update_rtt_min(w); | ||
232 | return; | ||
233 | } | ||
234 | |||
235 | westwood_fast_bw(sk); | ||
236 | } | ||
237 | |||
225 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | 238 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) |
226 | { | 239 | { |
227 | struct tcp_sock *tp = tcp_sk(sk); | 240 | struct tcp_sock *tp = tcp_sk(sk); |
228 | struct westwood *w = inet_csk_ca(sk); | 241 | struct westwood *w = inet_csk_ca(sk); |
229 | 242 | ||
230 | switch (event) { | 243 | switch (event) { |
231 | case CA_EVENT_FAST_ACK: | ||
232 | westwood_fast_bw(sk); | ||
233 | break; | ||
234 | |||
235 | case CA_EVENT_COMPLETE_CWR: | 244 | case CA_EVENT_COMPLETE_CWR: |
236 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 245 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
237 | break; | 246 | break; |
238 | |||
239 | case CA_EVENT_LOSS: | 247 | case CA_EVENT_LOSS: |
240 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 248 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
241 | /* Update RTT_min when next ack arrives */ | 249 | /* Update RTT_min when next ack arrives */ |
242 | w->reset_rtt_min = 1; | 250 | w->reset_rtt_min = 1; |
243 | break; | 251 | break; |
244 | |||
245 | case CA_EVENT_SLOW_ACK: | ||
246 | westwood_update_window(sk); | ||
247 | w->bk += westwood_acked_count(sk); | ||
248 | update_rtt_min(w); | ||
249 | break; | ||
250 | |||
251 | default: | 252 | default: |
252 | /* don't care */ | 253 | /* don't care */ |
253 | break; | 254 | break; |
254 | } | 255 | } |
255 | } | 256 | } |
256 | 257 | ||
257 | |||
258 | /* Extract info for Tcp socket info provided via netlink. */ | 258 | /* Extract info for Tcp socket info provided via netlink. */ |
259 | static void tcp_westwood_info(struct sock *sk, u32 ext, | 259 | static void tcp_westwood_info(struct sock *sk, u32 ext, |
260 | struct sk_buff *skb) | 260 | struct sk_buff *skb) |
261 | { | 261 | { |
262 | const struct westwood *ca = inet_csk_ca(sk); | 262 | const struct westwood *ca = inet_csk_ca(sk); |
263 | |||
263 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { | 264 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { |
264 | struct tcpvegas_info info = { | 265 | struct tcpvegas_info info = { |
265 | .tcpv_enabled = 1, | 266 | .tcpv_enabled = 1, |
@@ -271,12 +272,12 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, | |||
271 | } | 272 | } |
272 | } | 273 | } |
273 | 274 | ||
274 | |||
275 | static struct tcp_congestion_ops tcp_westwood __read_mostly = { | 275 | static struct tcp_congestion_ops tcp_westwood __read_mostly = { |
276 | .init = tcp_westwood_init, | 276 | .init = tcp_westwood_init, |
277 | .ssthresh = tcp_reno_ssthresh, | 277 | .ssthresh = tcp_reno_ssthresh, |
278 | .cong_avoid = tcp_reno_cong_avoid, | 278 | .cong_avoid = tcp_reno_cong_avoid, |
279 | .cwnd_event = tcp_westwood_event, | 279 | .cwnd_event = tcp_westwood_event, |
280 | .in_ack_event = tcp_westwood_ack, | ||
280 | .get_info = tcp_westwood_info, | 281 | .get_info = tcp_westwood_info, |
281 | .pkts_acked = tcp_westwood_pkts_acked, | 282 | .pkts_acked = tcp_westwood_pkts_acked, |
282 | 283 | ||
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 599b79b8eac0..cd7273218598 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c | |||
@@ -54,10 +54,8 @@ static void tcp_yeah_init(struct sock *sk) | |||
54 | /* Ensure the MD arithmetic works. This is somewhat pedantic, | 54 | /* Ensure the MD arithmetic works. This is somewhat pedantic, |
55 | * since I don't think we will see a cwnd this large. :) */ | 55 | * since I don't think we will see a cwnd this large. :) */ |
56 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); | 56 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); |
57 | |||
58 | } | 57 | } |
59 | 58 | ||
60 | |||
61 | static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) | 59 | static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) |
62 | { | 60 | { |
63 | const struct inet_connection_sock *icsk = inet_csk(sk); | 61 | const struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -84,7 +82,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
84 | /* Scalable */ | 82 | /* Scalable */ |
85 | 83 | ||
86 | tp->snd_cwnd_cnt += yeah->pkts_acked; | 84 | tp->snd_cwnd_cnt += yeah->pkts_acked; |
87 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | 85 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) { |
88 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 86 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
89 | tp->snd_cwnd++; | 87 | tp->snd_cwnd++; |
90 | tp->snd_cwnd_cnt = 0; | 88 | tp->snd_cwnd_cnt = 0; |
@@ -120,7 +118,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
120 | */ | 118 | */ |
121 | 119 | ||
122 | if (after(ack, yeah->vegas.beg_snd_nxt)) { | 120 | if (after(ack, yeah->vegas.beg_snd_nxt)) { |
123 | |||
124 | /* We do the Vegas calculations only if we got enough RTT | 121 | /* We do the Vegas calculations only if we got enough RTT |
125 | * samples that we can be reasonably sure that we got | 122 | * samples that we can be reasonably sure that we got |
126 | * at least one RTT sample that wasn't from a delayed ACK. | 123 | * at least one RTT sample that wasn't from a delayed ACK. |
@@ -189,7 +186,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
189 | } | 186 | } |
190 | 187 | ||
191 | yeah->lastQ = queue; | 188 | yeah->lastQ = queue; |
192 | |||
193 | } | 189 | } |
194 | 190 | ||
195 | /* Save the extent of the current window so we can use this | 191 | /* Save the extent of the current window so we can use this |
@@ -205,7 +201,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
205 | } | 201 | } |
206 | } | 202 | } |
207 | 203 | ||
208 | static u32 tcp_yeah_ssthresh(struct sock *sk) { | 204 | static u32 tcp_yeah_ssthresh(struct sock *sk) |
205 | { | ||
209 | const struct tcp_sock *tp = tcp_sk(sk); | 206 | const struct tcp_sock *tp = tcp_sk(sk); |
210 | struct yeah *yeah = inet_csk_ca(sk); | 207 | struct yeah *yeah = inet_csk_ca(sk); |
211 | u32 reduction; | 208 | u32 reduction; |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f57c0e4c2326..cd0db5471bb5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -99,6 +99,7 @@ | |||
99 | #include <linux/slab.h> | 99 | #include <linux/slab.h> |
100 | #include <net/tcp_states.h> | 100 | #include <net/tcp_states.h> |
101 | #include <linux/skbuff.h> | 101 | #include <linux/skbuff.h> |
102 | #include <linux/netdevice.h> | ||
102 | #include <linux/proc_fs.h> | 103 | #include <linux/proc_fs.h> |
103 | #include <linux/seq_file.h> | 104 | #include <linux/seq_file.h> |
104 | #include <net/net_namespace.h> | 105 | #include <net/net_namespace.h> |
@@ -224,7 +225,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
224 | remaining = (high - low) + 1; | 225 | remaining = (high - low) + 1; |
225 | 226 | ||
226 | rand = prandom_u32(); | 227 | rand = prandom_u32(); |
227 | first = (((u64)rand * remaining) >> 32) + low; | 228 | first = reciprocal_scale(rand, remaining) + low; |
228 | /* | 229 | /* |
229 | * force rand to be an odd multiple of UDP_HTABLE_SIZE | 230 | * force rand to be an odd multiple of UDP_HTABLE_SIZE |
230 | */ | 231 | */ |
@@ -448,7 +449,7 @@ begin: | |||
448 | } | 449 | } |
449 | } else if (score == badness && reuseport) { | 450 | } else if (score == badness && reuseport) { |
450 | matches++; | 451 | matches++; |
451 | if (((u64)hash * matches) >> 32 == 0) | 452 | if (reciprocal_scale(hash, matches) == 0) |
452 | result = sk; | 453 | result = sk; |
453 | hash = next_pseudo_random32(hash); | 454 | hash = next_pseudo_random32(hash); |
454 | } | 455 | } |
@@ -529,7 +530,7 @@ begin: | |||
529 | } | 530 | } |
530 | } else if (score == badness && reuseport) { | 531 | } else if (score == badness && reuseport) { |
531 | matches++; | 532 | matches++; |
532 | if (((u64)hash * matches) >> 32 == 0) | 533 | if (reciprocal_scale(hash, matches) == 0) |
533 | result = sk; | 534 | result = sk; |
534 | hash = next_pseudo_random32(hash); | 535 | hash = next_pseudo_random32(hash); |
535 | } | 536 | } |
@@ -1787,6 +1788,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, | |||
1787 | if (sk != NULL) { | 1788 | if (sk != NULL) { |
1788 | int ret; | 1789 | int ret; |
1789 | 1790 | ||
1791 | if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) | ||
1792 | skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, | ||
1793 | inet_compute_pseudo); | ||
1794 | |||
1790 | ret = udp_queue_rcv_skb(sk, skb); | 1795 | ret = udp_queue_rcv_skb(sk, skb); |
1791 | sock_put(sk); | 1796 | sock_put(sk); |
1792 | 1797 | ||
@@ -1967,7 +1972,7 @@ void udp_v4_early_demux(struct sk_buff *skb) | |||
1967 | return; | 1972 | return; |
1968 | 1973 | ||
1969 | skb->sk = sk; | 1974 | skb->sk = sk; |
1970 | skb->destructor = sock_edemux; | 1975 | skb->destructor = sock_efree; |
1971 | dst = sk->sk_rx_dst; | 1976 | dst = sk->sk_rx_dst; |
1972 | 1977 | ||
1973 | if (dst) | 1978 | if (dst) |
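The udp.c hunks above replace the open-coded ((u64)x * range) >> 32 with reciprocal_scale(), which maps a full-range 32-bit value uniformly onto [0, range) using a multiply and shift instead of a modulo. An equivalent userspace sketch (the port-span value is just an example); this mirrors how the kernel helper is defined:

/* reciprocal_scale(): scale a 32-bit value into [0, ep_ro) without division. */
#include <stdio.h>
#include <stdint.h>

static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	uint32_t range = 28232;		/* e.g. an ephemeral port span */
	printf("%u\n", reciprocal_scale(0x00000000u, range));	/* 0        */
	printf("%u\n", reciprocal_scale(0x80000000u, range));	/* ~range/2 */
	printf("%u\n", reciprocal_scale(0xFFFFFFFFu, range));	/* range-1  */
	return 0;
}
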
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 59035bc3008d..507310ef4b56 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c | |||
@@ -25,30 +25,11 @@ struct udp_offload_priv { | |||
25 | struct udp_offload_priv __rcu *next; | 25 | struct udp_offload_priv __rcu *next; |
26 | }; | 26 | }; |
27 | 27 | ||
28 | static int udp4_ufo_send_check(struct sk_buff *skb) | 28 | static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, |
29 | { | 29 | netdev_features_t features, |
30 | if (!pskb_may_pull(skb, sizeof(struct udphdr))) | 30 | struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, |
31 | return -EINVAL; | 31 | netdev_features_t features), |
32 | 32 | __be16 new_protocol) | |
33 | if (likely(!skb->encapsulation)) { | ||
34 | const struct iphdr *iph; | ||
35 | struct udphdr *uh; | ||
36 | |||
37 | iph = ip_hdr(skb); | ||
38 | uh = udp_hdr(skb); | ||
39 | |||
40 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, | ||
41 | IPPROTO_UDP, 0); | ||
42 | skb->csum_start = skb_transport_header(skb) - skb->head; | ||
43 | skb->csum_offset = offsetof(struct udphdr, check); | ||
44 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
45 | } | ||
46 | |||
47 | return 0; | ||
48 | } | ||
49 | |||
50 | struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | ||
51 | netdev_features_t features) | ||
52 | { | 33 | { |
53 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 34 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
54 | u16 mac_offset = skb->mac_header; | 35 | u16 mac_offset = skb->mac_header; |
@@ -70,7 +51,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | |||
70 | skb_reset_mac_header(skb); | 51 | skb_reset_mac_header(skb); |
71 | skb_set_network_header(skb, skb_inner_network_offset(skb)); | 52 | skb_set_network_header(skb, skb_inner_network_offset(skb)); |
72 | skb->mac_len = skb_inner_network_offset(skb); | 53 | skb->mac_len = skb_inner_network_offset(skb); |
73 | skb->protocol = htons(ETH_P_TEB); | 54 | skb->protocol = new_protocol; |
74 | 55 | ||
75 | need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); | 56 | need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); |
76 | if (need_csum) | 57 | if (need_csum) |
@@ -78,7 +59,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | |||
78 | 59 | ||
79 | /* segment inner packet. */ | 60 | /* segment inner packet. */ |
80 | enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); | 61 | enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); |
81 | segs = skb_mac_gso_segment(skb, enc_features); | 62 | segs = gso_inner_segment(skb, enc_features); |
82 | if (IS_ERR_OR_NULL(segs)) { | 63 | if (IS_ERR_OR_NULL(segs)) { |
83 | skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, | 64 | skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, |
84 | mac_len); | 65 | mac_len); |
@@ -123,21 +104,63 @@ out: | |||
123 | return segs; | 104 | return segs; |
124 | } | 105 | } |
125 | 106 | ||
107 | struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | ||
108 | netdev_features_t features, | ||
109 | bool is_ipv6) | ||
110 | { | ||
111 | __be16 protocol = skb->protocol; | ||
112 | const struct net_offload **offloads; | ||
113 | const struct net_offload *ops; | ||
114 | struct sk_buff *segs = ERR_PTR(-EINVAL); | ||
115 | struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, | ||
116 | netdev_features_t features); | ||
117 | |||
118 | rcu_read_lock(); | ||
119 | |||
120 | switch (skb->inner_protocol_type) { | ||
121 | case ENCAP_TYPE_ETHER: | ||
122 | protocol = skb->inner_protocol; | ||
123 | gso_inner_segment = skb_mac_gso_segment; | ||
124 | break; | ||
125 | case ENCAP_TYPE_IPPROTO: | ||
126 | offloads = is_ipv6 ? inet6_offloads : inet_offloads; | ||
127 | ops = rcu_dereference(offloads[skb->inner_ipproto]); | ||
128 | if (!ops || !ops->callbacks.gso_segment) | ||
129 | goto out_unlock; | ||
130 | gso_inner_segment = ops->callbacks.gso_segment; | ||
131 | break; | ||
132 | default: | ||
133 | goto out_unlock; | ||
134 | } | ||
135 | |||
136 | segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, | ||
137 | protocol); | ||
138 | |||
139 | out_unlock: | ||
140 | rcu_read_unlock(); | ||
141 | |||
142 | return segs; | ||
143 | } | ||
144 | |||
126 | static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, | 145 | static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, |
127 | netdev_features_t features) | 146 | netdev_features_t features) |
128 | { | 147 | { |
129 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 148 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
130 | unsigned int mss; | 149 | unsigned int mss; |
131 | int offset; | ||
132 | __wsum csum; | 150 | __wsum csum; |
151 | struct udphdr *uh; | ||
152 | struct iphdr *iph; | ||
133 | 153 | ||
134 | if (skb->encapsulation && | 154 | if (skb->encapsulation && |
135 | (skb_shinfo(skb)->gso_type & | 155 | (skb_shinfo(skb)->gso_type & |
136 | (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { | 156 | (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { |
137 | segs = skb_udp_tunnel_segment(skb, features); | 157 | segs = skb_udp_tunnel_segment(skb, features, false); |
138 | goto out; | 158 | goto out; |
139 | } | 159 | } |
140 | 160 | ||
161 | if (!pskb_may_pull(skb, sizeof(struct udphdr))) | ||
162 | goto out; | ||
163 | |||
141 | mss = skb_shinfo(skb)->gso_size; | 164 | mss = skb_shinfo(skb)->gso_size; |
142 | if (unlikely(skb->len <= mss)) | 165 | if (unlikely(skb->len <= mss)) |
143 | goto out; | 166 | goto out; |
@@ -165,10 +188,16 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, | |||
165 | * HW cannot do checksum of UDP packets sent as multiple | 188 | * HW cannot do checksum of UDP packets sent as multiple |
166 | * IP fragments. | 189 | * IP fragments. |
167 | */ | 190 | */ |
168 | offset = skb_checksum_start_offset(skb); | 191 | |
169 | csum = skb_checksum(skb, offset, skb->len - offset, 0); | 192 | uh = udp_hdr(skb); |
170 | offset += skb->csum_offset; | 193 | iph = ip_hdr(skb); |
171 | *(__sum16 *)(skb->data + offset) = csum_fold(csum); | 194 | |
195 | uh->check = 0; | ||
196 | csum = skb_checksum(skb, 0, skb->len, 0); | ||
197 | uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); | ||
198 | if (uh->check == 0) | ||
199 | uh->check = CSUM_MANGLED_0; | ||
200 | |||
172 | skb->ip_summed = CHECKSUM_NONE; | 201 | skb->ip_summed = CHECKSUM_NONE; |
173 | 202 | ||
174 | /* Fragment the skb. IP headers of the fragments are updated in | 203 | /* Fragment the skb. IP headers of the fragments are updated in |
@@ -228,30 +257,24 @@ unlock: | |||
228 | } | 257 | } |
229 | EXPORT_SYMBOL(udp_del_offload); | 258 | EXPORT_SYMBOL(udp_del_offload); |
230 | 259 | ||
231 | static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) | 260 | struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, |
261 | struct udphdr *uh) | ||
232 | { | 262 | { |
233 | struct udp_offload_priv *uo_priv; | 263 | struct udp_offload_priv *uo_priv; |
234 | struct sk_buff *p, **pp = NULL; | 264 | struct sk_buff *p, **pp = NULL; |
235 | struct udphdr *uh, *uh2; | 265 | struct udphdr *uh2; |
236 | unsigned int hlen, off; | 266 | unsigned int off = skb_gro_offset(skb); |
237 | int flush = 1; | 267 | int flush = 1; |
238 | 268 | ||
239 | if (NAPI_GRO_CB(skb)->udp_mark || | 269 | if (NAPI_GRO_CB(skb)->udp_mark || |
240 | (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) | 270 | (skb->ip_summed != CHECKSUM_PARTIAL && |
271 | NAPI_GRO_CB(skb)->csum_cnt == 0 && | ||
272 | !NAPI_GRO_CB(skb)->csum_valid)) | ||
241 | goto out; | 273 | goto out; |
242 | 274 | ||
243 | /* mark that this skb passed once through the udp gro layer */ | 275 | /* mark that this skb passed once through the udp gro layer */ |
244 | NAPI_GRO_CB(skb)->udp_mark = 1; | 276 | NAPI_GRO_CB(skb)->udp_mark = 1; |
245 | 277 | ||
246 | off = skb_gro_offset(skb); | ||
247 | hlen = off + sizeof(*uh); | ||
248 | uh = skb_gro_header_fast(skb, off); | ||
249 | if (skb_gro_header_hard(skb, hlen)) { | ||
250 | uh = skb_gro_header_slow(skb, hlen, off); | ||
251 | if (unlikely(!uh)) | ||
252 | goto out; | ||
253 | } | ||
254 | |||
255 | rcu_read_lock(); | 278 | rcu_read_lock(); |
256 | uo_priv = rcu_dereference(udp_offload_base); | 279 | uo_priv = rcu_dereference(udp_offload_base); |
257 | for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { | 280 | for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { |
@@ -269,7 +292,12 @@ unflush: | |||
269 | continue; | 292 | continue; |
270 | 293 | ||
271 | uh2 = (struct udphdr *)(p->data + off); | 294 | uh2 = (struct udphdr *)(p->data + off); |
272 | if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { | 295 | |
296 | /* Match ports only if checksums are either both zero | ||
297 | * or both nonzero. | ||
298 | */ | ||
299 | if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) || | ||
300 | (!uh->check ^ !uh2->check)) { | ||
273 | NAPI_GRO_CB(p)->same_flow = 0; | 301 | NAPI_GRO_CB(p)->same_flow = 0; |
274 | continue; | 302 | continue; |
275 | } | 303 | } |
@@ -277,6 +305,7 @@ unflush: | |||
277 | 305 | ||
278 | skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ | 306 | skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ |
279 | skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); | 307 | skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); |
308 | NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; | ||
280 | pp = uo_priv->offload->callbacks.gro_receive(head, skb); | 309 | pp = uo_priv->offload->callbacks.gro_receive(head, skb); |
281 | 310 | ||
282 | out_unlock: | 311 | out_unlock: |
@@ -286,7 +315,34 @@ out: | |||
286 | return pp; | 315 | return pp; |
287 | } | 316 | } |
288 | 317 | ||
289 | static int udp_gro_complete(struct sk_buff *skb, int nhoff) | 318 | static struct sk_buff **udp4_gro_receive(struct sk_buff **head, |
319 | struct sk_buff *skb) | ||
320 | { | ||
321 | struct udphdr *uh = udp_gro_udphdr(skb); | ||
322 | |||
323 | if (unlikely(!uh)) | ||
324 | goto flush; | ||
325 | |||
326 | /* Don't bother verifying checksum if we're going to flush anyway. */ | ||
327 | if (NAPI_GRO_CB(skb)->flush) | ||
328 | goto skip; | ||
329 | |||
330 | if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, | ||
331 | inet_gro_compute_pseudo)) | ||
332 | goto flush; | ||
333 | else if (uh->check) | ||
334 | skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, | ||
335 | inet_gro_compute_pseudo); | ||
336 | skip: | ||
337 | NAPI_GRO_CB(skb)->is_ipv6 = 0; | ||
338 | return udp_gro_receive(head, skb, uh); | ||
339 | |||
340 | flush: | ||
341 | NAPI_GRO_CB(skb)->flush = 1; | ||
342 | return NULL; | ||
343 | } | ||
344 | |||
345 | int udp_gro_complete(struct sk_buff *skb, int nhoff) | ||
290 | { | 346 | { |
291 | struct udp_offload_priv *uo_priv; | 347 | struct udp_offload_priv *uo_priv; |
292 | __be16 newlen = htons(skb->len - nhoff); | 348 | __be16 newlen = htons(skb->len - nhoff); |
@@ -304,19 +360,32 @@ static int udp_gro_complete(struct sk_buff *skb, int nhoff) | |||
304 | break; | 360 | break; |
305 | } | 361 | } |
306 | 362 | ||
307 | if (uo_priv != NULL) | 363 | if (uo_priv != NULL) { |
364 | NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; | ||
308 | err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); | 365 | err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); |
366 | } | ||
309 | 367 | ||
310 | rcu_read_unlock(); | 368 | rcu_read_unlock(); |
311 | return err; | 369 | return err; |
312 | } | 370 | } |
313 | 371 | ||
372 | static int udp4_gro_complete(struct sk_buff *skb, int nhoff) | ||
373 | { | ||
374 | const struct iphdr *iph = ip_hdr(skb); | ||
375 | struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); | ||
376 | |||
377 | if (uh->check) | ||
378 | uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, | ||
379 | iph->daddr, 0); | ||
380 | |||
381 | return udp_gro_complete(skb, nhoff); | ||
382 | } | ||
383 | |||
314 | static const struct net_offload udpv4_offload = { | 384 | static const struct net_offload udpv4_offload = { |
315 | .callbacks = { | 385 | .callbacks = { |
316 | .gso_send_check = udp4_ufo_send_check, | ||
317 | .gso_segment = udp4_ufo_fragment, | 386 | .gso_segment = udp4_ufo_fragment, |
318 | .gro_receive = udp_gro_receive, | 387 | .gro_receive = udp4_gro_receive, |
319 | .gro_complete = udp_gro_complete, | 388 | .gro_complete = udp4_gro_complete, |
320 | }, | 389 | }, |
321 | }; | 390 | }; |
322 | 391 | ||
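The rewritten udp4_ufo_fragment() computes the UDP checksum over the IPv4 pseudo-header plus the whole datagram and substitutes CSUM_MANGLED_0 (0xFFFF) for an all-zero result, since zero means "no checksum" for UDP over IPv4. A self-contained userspace sketch of that calculation; the addresses, ports, and payload are illustrative only:

/* Internet checksum over an IPv4 pseudo-header plus a small UDP datagram. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint32_t csum_add(uint32_t sum, const void *buf, size_t len)
{
	const uint8_t *p = buf;
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t udp[] = { 0x30, 0x39, 0x00, 0x35,	/* src 12345, dst 53 */
			  0x00, 0x0c, 0x00, 0x00,	/* len 12, check 0   */
			  'p', 'i', 'n', 'g' };
	uint8_t pseudo[12] = { 10, 0, 0, 1,		/* saddr             */
			       10, 0, 0, 2,		/* daddr             */
			       0, 17,			/* zero, IPPROTO_UDP */
			       0x00, 0x0c };		/* UDP length 12     */
	uint16_t check = csum_fold(csum_add(csum_add(0, pseudo, sizeof(pseudo)),
					    udp, sizeof(udp)));
	if (check == 0)
		check = 0xFFFF;				/* CSUM_MANGLED_0    */
	printf("udp checksum: 0x%04x\n", check);
	return 0;
}
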
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 61ec1a65207e..1671263e5fa0 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c | |||
@@ -8,83 +8,40 @@ | |||
8 | #include <net/udp_tunnel.h> | 8 | #include <net/udp_tunnel.h> |
9 | #include <net/net_namespace.h> | 9 | #include <net/net_namespace.h> |
10 | 10 | ||
11 | int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, | 11 | int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, |
12 | struct socket **sockp) | 12 | struct socket **sockp) |
13 | { | 13 | { |
14 | int err = -EINVAL; | 14 | int err; |
15 | struct socket *sock = NULL; | 15 | struct socket *sock = NULL; |
16 | struct sockaddr_in udp_addr; | ||
16 | 17 | ||
17 | #if IS_ENABLED(CONFIG_IPV6) | 18 | err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); |
18 | if (cfg->family == AF_INET6) { | 19 | if (err < 0) |
19 | struct sockaddr_in6 udp6_addr; | 20 | goto error; |
20 | 21 | ||
21 | err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); | 22 | sk_change_net(sock->sk, net); |
22 | if (err < 0) | ||
23 | goto error; | ||
24 | |||
25 | sk_change_net(sock->sk, net); | ||
26 | |||
27 | udp6_addr.sin6_family = AF_INET6; | ||
28 | memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, | ||
29 | sizeof(udp6_addr.sin6_addr)); | ||
30 | udp6_addr.sin6_port = cfg->local_udp_port; | ||
31 | err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, | ||
32 | sizeof(udp6_addr)); | ||
33 | if (err < 0) | ||
34 | goto error; | ||
35 | |||
36 | if (cfg->peer_udp_port) { | ||
37 | udp6_addr.sin6_family = AF_INET6; | ||
38 | memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, | ||
39 | sizeof(udp6_addr.sin6_addr)); | ||
40 | udp6_addr.sin6_port = cfg->peer_udp_port; | ||
41 | err = kernel_connect(sock, | ||
42 | (struct sockaddr *)&udp6_addr, | ||
43 | sizeof(udp6_addr), 0); | ||
44 | } | ||
45 | if (err < 0) | ||
46 | goto error; | ||
47 | 23 | ||
48 | udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); | 24 | udp_addr.sin_family = AF_INET; |
49 | udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); | 25 | udp_addr.sin_addr = cfg->local_ip; |
50 | } else | 26 | udp_addr.sin_port = cfg->local_udp_port; |
51 | #endif | 27 | err = kernel_bind(sock, (struct sockaddr *)&udp_addr, |
52 | if (cfg->family == AF_INET) { | 28 | sizeof(udp_addr)); |
53 | struct sockaddr_in udp_addr; | 29 | if (err < 0) |
54 | 30 | goto error; | |
55 | err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); | ||
56 | if (err < 0) | ||
57 | goto error; | ||
58 | |||
59 | sk_change_net(sock->sk, net); | ||
60 | 31 | ||
32 | if (cfg->peer_udp_port) { | ||
61 | udp_addr.sin_family = AF_INET; | 33 | udp_addr.sin_family = AF_INET; |
62 | udp_addr.sin_addr = cfg->local_ip; | 34 | udp_addr.sin_addr = cfg->peer_ip; |
63 | udp_addr.sin_port = cfg->local_udp_port; | 35 | udp_addr.sin_port = cfg->peer_udp_port; |
64 | err = kernel_bind(sock, (struct sockaddr *)&udp_addr, | 36 | err = kernel_connect(sock, (struct sockaddr *)&udp_addr, |
65 | sizeof(udp_addr)); | 37 | sizeof(udp_addr), 0); |
66 | if (err < 0) | 38 | if (err < 0) |
67 | goto error; | 39 | goto error; |
68 | |||
69 | if (cfg->peer_udp_port) { | ||
70 | udp_addr.sin_family = AF_INET; | ||
71 | udp_addr.sin_addr = cfg->peer_ip; | ||
72 | udp_addr.sin_port = cfg->peer_udp_port; | ||
73 | err = kernel_connect(sock, | ||
74 | (struct sockaddr *)&udp_addr, | ||
75 | sizeof(udp_addr), 0); | ||
76 | if (err < 0) | ||
77 | goto error; | ||
78 | } | ||
79 | |||
80 | sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; | ||
81 | } else { | ||
82 | return -EPFNOSUPPORT; | ||
83 | } | 40 | } |
84 | 41 | ||
42 | sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; | ||
85 | 43 | ||
86 | *sockp = sock; | 44 | *sockp = sock; |
87 | |||
88 | return 0; | 45 | return 0; |
89 | 46 | ||
90 | error: | 47 | error: |
@@ -95,6 +52,57 @@ error: | |||
95 | *sockp = NULL; | 52 | *sockp = NULL; |
96 | return err; | 53 | return err; |
97 | } | 54 | } |
98 | EXPORT_SYMBOL(udp_sock_create); | 55 | EXPORT_SYMBOL(udp_sock_create4); |
56 | |||
57 | void setup_udp_tunnel_sock(struct net *net, struct socket *sock, | ||
58 | struct udp_tunnel_sock_cfg *cfg) | ||
59 | { | ||
60 | struct sock *sk = sock->sk; | ||
61 | |||
62 | /* Disable multicast loopback */ | ||
63 | inet_sk(sk)->mc_loop = 0; | ||
64 | |||
65 | /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ | ||
66 | udp_set_convert_csum(sk, true); | ||
67 | |||
68 | rcu_assign_sk_user_data(sk, cfg->sk_user_data); | ||
69 | |||
70 | udp_sk(sk)->encap_type = cfg->encap_type; | ||
71 | udp_sk(sk)->encap_rcv = cfg->encap_rcv; | ||
72 | udp_sk(sk)->encap_destroy = cfg->encap_destroy; | ||
73 | |||
74 | udp_tunnel_encap_enable(sock); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); | ||
77 | |||
78 | int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, | ||
79 | struct sk_buff *skb, __be32 src, __be32 dst, | ||
80 | __u8 tos, __u8 ttl, __be16 df, __be16 src_port, | ||
81 | __be16 dst_port, bool xnet) | ||
82 | { | ||
83 | struct udphdr *uh; | ||
84 | |||
85 | __skb_push(skb, sizeof(*uh)); | ||
86 | skb_reset_transport_header(skb); | ||
87 | uh = udp_hdr(skb); | ||
88 | |||
89 | uh->dest = dst_port; | ||
90 | uh->source = src_port; | ||
91 | uh->len = htons(skb->len); | ||
92 | |||
93 | udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); | ||
94 | |||
95 | return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, | ||
96 | tos, ttl, df, xnet); | ||
97 | } | ||
98 | EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); | ||
99 | |||
100 | void udp_tunnel_sock_release(struct socket *sock) | ||
101 | { | ||
102 | rcu_assign_sk_user_data(sock->sk, NULL); | ||
103 | kernel_sock_shutdown(sock, SHUT_RDWR); | ||
104 | sk_release_kernel(sock->sk); | ||
105 | } | ||
106 | EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); | ||
99 | 107 | ||
100 | MODULE_LICENSE("GPL"); | 108 | MODULE_LICENSE("GPL"); |
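udp_sock_create4() above boils down to creating a kernel UDP socket, binding it to the configured local address, optionally connecting it to the peer, and setting the transmit checksum policy. A userspace analogue of those steps, shown only to make the flow concrete; the port and peer address are placeholders, and the kernel helper does the equivalent with sock_create_kern(), kernel_bind() and kernel_connect():

/* Userspace sketch of the create/bind/connect sequence for a tunnel socket. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in local = { 0 }, peer = { 0 };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	local.sin_family = AF_INET;
	local.sin_addr.s_addr = htonl(INADDR_ANY);
	local.sin_port = htons(4789);		/* e.g. a VXLAN-style port */
	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
		perror("bind");
		close(fd);
		return 1;
	}

	peer.sin_family = AF_INET;
	peer.sin_port = htons(4789);
	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
		perror("connect");
		close(fd);
		return 1;
	}

	puts("tunnel-style UDP socket bound and connected");
	close(fd);
	return 0;
}
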