Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                 |   11
-rw-r--r--  net/ipv4/Makefile                                |    3
-rw-r--r--  net/ipv4/af_inet.c                               |   75
-rw-r--r--  net/ipv4/ah4.c                                   |   17
-rw-r--r--  net/ipv4/arp.c                                   |    8
-rw-r--r--  net/ipv4/devinet.c                               |    5
-rw-r--r--  net/ipv4/esp4.c                                  |   17
-rw-r--r--  net/ipv4/fib_frontend.c                          |  130
-rw-r--r--  net/ipv4/fib_rules.c                             |   39
-rw-r--r--  net/ipv4/fib_semantics.c                         |   46
-rw-r--r--  net/ipv4/fib_trie.c                              |   13
-rw-r--r--  net/ipv4/icmp.c                                  |  191
-rw-r--r--  net/ipv4/inet_connection_sock.c                  |   53
-rw-r--r--  net/ipv4/inet_diag.c                             |  146
-rw-r--r--  net/ipv4/inet_fragment.c                         |    2
-rw-r--r--  net/ipv4/inetpeer.c                              |   99
-rw-r--r--  net/ipv4/ip_fragment.c                           |   10
-rw-r--r--  net/ipv4/ip_gre.c                                |   25
-rw-r--r--  net/ipv4/ip_input.c                              |   32
-rw-r--r--  net/ipv4/ip_options.c                            |   29
-rw-r--r--  net/ipv4/ip_output.c                             |   93
-rw-r--r--  net/ipv4/ip_sockglue.c                           |   12
-rw-r--r--  net/ipv4/ip_vti.c                                |  956
-rw-r--r--  net/ipv4/ipcomp.c                                |   17
-rw-r--r--  net/ipv4/ipip.c                                  |   28
-rw-r--r--  net/ipv4/ipmr.c                                  |   41
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c              |    5
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c                    |   23
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c   |  172
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c     |   81
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c              |    4
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c               |    4
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c                 |    6
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c                 |    8
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c               |   13
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c                 |    6
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c           |    4
-rw-r--r--  net/ipv4/netfilter/nf_nat_tftp.c                 |    4
-rw-r--r--  net/ipv4/ping.c                                  |    2
-rw-r--r--  net/ipv4/proc.c                                  |    7
-rw-r--r--  net/ipv4/protocol.c                              |    8
-rw-r--r--  net/ipv4/raw.c                                   |    5
-rw-r--r--  net/ipv4/route.c                                 | 2146
-rw-r--r--  net/ipv4/syncookies.c                            |    2
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                       |   28
-rw-r--r--  net/ipv4/tcp.c                                   |   72
-rw-r--r--  net/ipv4/tcp_cong.c                              |    5
-rw-r--r--  net/ipv4/tcp_fastopen.c                          |   11
-rw-r--r--  net/ipv4/tcp_input.c                             |  375
-rw-r--r--  net/ipv4/tcp_ipv4.c                              |  186
-rw-r--r--  net/ipv4/tcp_metrics.c                           |  745
-rw-r--r--  net/ipv4/tcp_minisocks.c                         |   61
-rw-r--r--  net/ipv4/tcp_output.c                            |  343
-rw-r--r--  net/ipv4/tcp_timer.c                             |   70
-rw-r--r--  net/ipv4/udp.c                                   |    9
-rw-r--r--  net/ipv4/udp_diag.c                              |   10
-rw-r--r--  net/ipv4/xfrm4_input.c                           |    4
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c                     |   68
-rw-r--r--  net/ipv4/xfrm4_policy.c                          |   33
59 files changed, 4048 insertions(+), 2570 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 20f1cb5c8aba..5a19aeb86094 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -310,6 +310,17 @@ config SYN_COOKIES
310 310
311 If unsure, say N. 311 If unsure, say N.
312 312
313config NET_IPVTI
314 tristate "Virtual (secure) IP: tunneling"
315 select INET_TUNNEL
316 depends on INET_XFRM_MODE_TUNNEL
317 ---help---
318 Tunneling means encapsulating data of one protocol type within
319 another protocol and sending it over a channel that understands the
320 encapsulating protocol. This can be used with xfrm mode tunnel to give
321 the notion of a secure tunnel for IPSEC and then use routing protocol
322 on top.
323
313config INET_AH 324config INET_AH
314 tristate "IP: AH transformation" 325 tristate "IP: AH transformation"
315 select XFRM_ALGO 326 select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..ae2ccf2890e4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
7 ip_output.o ip_sockglue.o inet_hashtables.o \ 7 ip_output.o ip_sockglue.o inet_hashtables.o \
8 inet_timewait_sock.o inet_connection_sock.o \ 8 inet_timewait_sock.o inet_connection_sock.o \
9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
10 tcp_minisocks.o tcp_cong.o \ 10 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
11 datagram.o raw.o udp.o udplite.o \ 11 datagram.o raw.o udp.o udplite.o \
12 arp.o icmp.o devinet.o af_inet.o igmp.o \ 12 arp.o icmp.o devinet.o af_inet.o igmp.o \
13 fib_frontend.o fib_semantics.o fib_trie.o \ 13 fib_frontend.o fib_semantics.o fib_trie.o \
@@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
20obj-$(CONFIG_NET_IPIP) += ipip.o 20obj-$(CONFIG_NET_IPIP) += ipip.o
21obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o 21obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
22obj-$(CONFIG_NET_IPGRE) += ip_gre.o 22obj-$(CONFIG_NET_IPGRE) += ip_gre.o
23obj-$(CONFIG_NET_IPVTI) += ip_vti.o
23obj-$(CONFIG_SYN_COOKIES) += syncookies.o 24obj-$(CONFIG_SYN_COOKIES) += syncookies.o
24obj-$(CONFIG_INET_AH) += ah4.o 25obj-$(CONFIG_INET_AH) += ah4.o
25obj-$(CONFIG_INET_ESP) += esp4.o 26obj-$(CONFIG_INET_ESP) += esp4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c8f7aee587d1..fe4582ca969a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
157 157
158 kfree(rcu_dereference_protected(inet->inet_opt, 1)); 158 kfree(rcu_dereference_protected(inet->inet_opt, 1));
159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); 159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
160 dst_release(sk->sk_rx_dst);
160 sk_refcnt_debug_dec(sk); 161 sk_refcnt_debug_dec(sk);
161} 162}
162EXPORT_SYMBOL(inet_sock_destruct); 163EXPORT_SYMBOL(inet_sock_destruct);
@@ -242,20 +243,18 @@ void build_ehash_secret(void)
242} 243}
243EXPORT_SYMBOL(build_ehash_secret); 244EXPORT_SYMBOL(build_ehash_secret);
244 245
245static inline int inet_netns_ok(struct net *net, int protocol) 246static inline int inet_netns_ok(struct net *net, __u8 protocol)
246{ 247{
247 int hash;
248 const struct net_protocol *ipprot; 248 const struct net_protocol *ipprot;
249 249
250 if (net_eq(net, &init_net)) 250 if (net_eq(net, &init_net))
251 return 1; 251 return 1;
252 252
253 hash = protocol & (MAX_INET_PROTOS - 1); 253 ipprot = rcu_dereference(inet_protos[protocol]);
254 ipprot = rcu_dereference(inet_protos[hash]); 254 if (ipprot == NULL) {
255
256 if (ipprot == NULL)
257 /* raw IP is OK */ 255 /* raw IP is OK */
258 return 1; 256 return 1;
257 }
259 return ipprot->netns_ok; 258 return ipprot->netns_ok;
260} 259}
261 260
@@ -553,15 +552,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
553 552
554 if (!inet_sk(sk)->inet_num && inet_autobind(sk)) 553 if (!inet_sk(sk)->inet_num && inet_autobind(sk))
555 return -EAGAIN; 554 return -EAGAIN;
556 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); 555 return sk->sk_prot->connect(sk, uaddr, addr_len);
557} 556}
558EXPORT_SYMBOL(inet_dgram_connect); 557EXPORT_SYMBOL(inet_dgram_connect);
559 558
560static long inet_wait_for_connect(struct sock *sk, long timeo) 559static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
561{ 560{
562 DEFINE_WAIT(wait); 561 DEFINE_WAIT(wait);
563 562
564 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 563 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
564 sk->sk_write_pending += writebias;
565 565
566 /* Basic assumption: if someone sets sk->sk_err, he _must_ 566 /* Basic assumption: if someone sets sk->sk_err, he _must_
567 * change state of the socket from TCP_SYN_*. 567 * change state of the socket from TCP_SYN_*.
@@ -577,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
577 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 577 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
578 } 578 }
579 finish_wait(sk_sleep(sk), &wait); 579 finish_wait(sk_sleep(sk), &wait);
580 sk->sk_write_pending -= writebias;
580 return timeo; 581 return timeo;
581} 582}
582 583
@@ -584,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
584 * Connect to a remote host. There is regrettably still a little 585 * Connect to a remote host. There is regrettably still a little
585 * TCP 'magic' in here. 586 * TCP 'magic' in here.
586 */ 587 */
587int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 588int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
588 int addr_len, int flags) 589 int addr_len, int flags)
589{ 590{
590 struct sock *sk = sock->sk; 591 struct sock *sk = sock->sk;
591 int err; 592 int err;
@@ -594,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
594 if (addr_len < sizeof(uaddr->sa_family)) 595 if (addr_len < sizeof(uaddr->sa_family))
595 return -EINVAL; 596 return -EINVAL;
596 597
597 lock_sock(sk);
598
599 if (uaddr->sa_family == AF_UNSPEC) { 598 if (uaddr->sa_family == AF_UNSPEC) {
600 err = sk->sk_prot->disconnect(sk, flags); 599 err = sk->sk_prot->disconnect(sk, flags);
601 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 600 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -635,8 +634,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
635 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 634 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
636 635
637 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 636 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
637 int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
638 tcp_sk(sk)->fastopen_req &&
639 tcp_sk(sk)->fastopen_req->data ? 1 : 0;
640
638 /* Error code is set above */ 641 /* Error code is set above */
639 if (!timeo || !inet_wait_for_connect(sk, timeo)) 642 if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
640 goto out; 643 goto out;
641 644
642 err = sock_intr_errno(timeo); 645 err = sock_intr_errno(timeo);
@@ -658,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
658 sock->state = SS_CONNECTED; 661 sock->state = SS_CONNECTED;
659 err = 0; 662 err = 0;
660out: 663out:
661 release_sock(sk);
662 return err; 664 return err;
663 665
664sock_error: 666sock_error:
@@ -668,6 +670,18 @@ sock_error:
668 sock->state = SS_DISCONNECTING; 670 sock->state = SS_DISCONNECTING;
669 goto out; 671 goto out;
670} 672}
673EXPORT_SYMBOL(__inet_stream_connect);
674
675int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
676 int addr_len, int flags)
677{
678 int err;
679
680 lock_sock(sock->sk);
681 err = __inet_stream_connect(sock, uaddr, addr_len, flags);
682 release_sock(sock->sk);
683 return err;
684}
671EXPORT_SYMBOL(inet_stream_connect); 685EXPORT_SYMBOL(inet_stream_connect);
672 686
673/* 687/*
@@ -1216,8 +1230,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
1216 1230
1217static int inet_gso_send_check(struct sk_buff *skb) 1231static int inet_gso_send_check(struct sk_buff *skb)
1218{ 1232{
1219 const struct iphdr *iph;
1220 const struct net_protocol *ops; 1233 const struct net_protocol *ops;
1234 const struct iphdr *iph;
1221 int proto; 1235 int proto;
1222 int ihl; 1236 int ihl;
1223 int err = -EINVAL; 1237 int err = -EINVAL;
@@ -1236,7 +1250,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
1236 __skb_pull(skb, ihl); 1250 __skb_pull(skb, ihl);
1237 skb_reset_transport_header(skb); 1251 skb_reset_transport_header(skb);
1238 iph = ip_hdr(skb); 1252 iph = ip_hdr(skb);
1239 proto = iph->protocol & (MAX_INET_PROTOS - 1); 1253 proto = iph->protocol;
1240 err = -EPROTONOSUPPORT; 1254 err = -EPROTONOSUPPORT;
1241 1255
1242 rcu_read_lock(); 1256 rcu_read_lock();
@@ -1253,8 +1267,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1253 netdev_features_t features) 1267 netdev_features_t features)
1254{ 1268{
1255 struct sk_buff *segs = ERR_PTR(-EINVAL); 1269 struct sk_buff *segs = ERR_PTR(-EINVAL);
1256 struct iphdr *iph;
1257 const struct net_protocol *ops; 1270 const struct net_protocol *ops;
1271 struct iphdr *iph;
1258 int proto; 1272 int proto;
1259 int ihl; 1273 int ihl;
1260 int id; 1274 int id;
@@ -1286,7 +1300,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1286 skb_reset_transport_header(skb); 1300 skb_reset_transport_header(skb);
1287 iph = ip_hdr(skb); 1301 iph = ip_hdr(skb);
1288 id = ntohs(iph->id); 1302 id = ntohs(iph->id);
1289 proto = iph->protocol & (MAX_INET_PROTOS - 1); 1303 proto = iph->protocol;
1290 segs = ERR_PTR(-EPROTONOSUPPORT); 1304 segs = ERR_PTR(-EPROTONOSUPPORT);
1291 1305
1292 rcu_read_lock(); 1306 rcu_read_lock();
@@ -1340,7 +1354,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1340 goto out; 1354 goto out;
1341 } 1355 }
1342 1356
1343 proto = iph->protocol & (MAX_INET_PROTOS - 1); 1357 proto = iph->protocol;
1344 1358
1345 rcu_read_lock(); 1359 rcu_read_lock();
1346 ops = rcu_dereference(inet_protos[proto]); 1360 ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1412,11 @@ out:
1398 1412
1399static int inet_gro_complete(struct sk_buff *skb) 1413static int inet_gro_complete(struct sk_buff *skb)
1400{ 1414{
1401 const struct net_protocol *ops; 1415 __be16 newlen = htons(skb->len - skb_network_offset(skb));
1402 struct iphdr *iph = ip_hdr(skb); 1416 struct iphdr *iph = ip_hdr(skb);
1403 int proto = iph->protocol & (MAX_INET_PROTOS - 1); 1417 const struct net_protocol *ops;
1418 int proto = iph->protocol;
1404 int err = -ENOSYS; 1419 int err = -ENOSYS;
1405 __be16 newlen = htons(skb->len - skb_network_offset(skb));
1406 1420
1407 csum_replace2(&iph->check, iph->tot_len, newlen); 1421 csum_replace2(&iph->check, iph->tot_len, newlen);
1408 iph->tot_len = newlen; 1422 iph->tot_len = newlen;
@@ -1520,14 +1534,15 @@ static const struct net_protocol igmp_protocol = {
1520#endif 1534#endif
1521 1535
1522static const struct net_protocol tcp_protocol = { 1536static const struct net_protocol tcp_protocol = {
1523 .handler = tcp_v4_rcv, 1537 .early_demux = tcp_v4_early_demux,
1524 .err_handler = tcp_v4_err, 1538 .handler = tcp_v4_rcv,
1525 .gso_send_check = tcp_v4_gso_send_check, 1539 .err_handler = tcp_v4_err,
1526 .gso_segment = tcp_tso_segment, 1540 .gso_send_check = tcp_v4_gso_send_check,
1527 .gro_receive = tcp4_gro_receive, 1541 .gso_segment = tcp_tso_segment,
1528 .gro_complete = tcp4_gro_complete, 1542 .gro_receive = tcp4_gro_receive,
1529 .no_policy = 1, 1543 .gro_complete = tcp4_gro_complete,
1530 .netns_ok = 1, 1544 .no_policy = 1,
1545 .netns_ok = 1,
1531}; 1546};
1532 1547
1533static const struct net_protocol udp_protocol = { 1548static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e8f2617ecd47..a0d8392491c3 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -398,16 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
398 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); 398 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
399 struct xfrm_state *x; 399 struct xfrm_state *x;
400 400
401 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 401 switch (icmp_hdr(skb)->type) {
402 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 402 case ICMP_DEST_UNREACH:
403 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
404 return;
405 case ICMP_REDIRECT:
406 break;
407 default:
403 return; 408 return;
409 }
404 410
405 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 411 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
406 ah->spi, IPPROTO_AH, AF_INET); 412 ah->spi, IPPROTO_AH, AF_INET);
407 if (!x) 413 if (!x)
408 return; 414 return;
409 pr_debug("pmtu discovery on SA AH/%08x/%08x\n", 415
410 ntohl(ah->spi), ntohl(iph->daddr)); 416 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
417 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
418 else
419 ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
411 xfrm_state_put(x); 420 xfrm_state_put(x);
412} 421}
413 422
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index cda37be02f8d..a0124eb7dbea 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
475 return 1; 475 return 1;
476 } 476 }
477 477
478 paddr = skb_rtable(skb)->rt_gateway; 478 paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
479
480 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, 479 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
481 paddr, dev)) 480 paddr, dev))
482 return 0; 481 return 0;
@@ -790,7 +789,8 @@ static int arp_process(struct sk_buff *skb)
790 * Check for bad requests for 127.x.x.x and requests for multicast 789 * Check for bad requests for 127.x.x.x and requests for multicast
791 * addresses. If this is one such, delete it. 790 * addresses. If this is one such, delete it.
792 */ 791 */
793 if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) 792 if (ipv4_is_multicast(tip) ||
793 (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
794 goto out; 794 goto out;
795 795
796/* 796/*
@@ -827,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
827 } 827 }
828 828
829 if (arp->ar_op == htons(ARPOP_REQUEST) && 829 if (arp->ar_op == htons(ARPOP_REQUEST) &&
830 ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { 830 ip_route_input(skb, tip, sip, 0, dev) == 0) {
831 831
832 rt = skb_rtable(skb); 832 rt = skb_rtable(skb);
833 addr_type = rt->rt_type; 833 addr_type = rt->rt_type;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 10e15a144e95..44bf82e3aef7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1500,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
1500 1500
1501 if (cnf == net->ipv4.devconf_dflt) 1501 if (cnf == net->ipv4.devconf_dflt)
1502 devinet_copy_dflt_conf(net, i); 1502 devinet_copy_dflt_conf(net, i);
1503 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) 1503 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
1504 i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
1504 if ((new_value == 0) && (old_value != 0)) 1505 if ((new_value == 0) && (old_value != 0))
1505 rt_cache_flush(net, 0); 1506 rt_cache_flush(net, 0);
1506 } 1507 }
@@ -1617,6 +1618,8 @@ static struct devinet_sysctl_table {
1617 "force_igmp_version"), 1618 "force_igmp_version"),
1618 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, 1619 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
1619 "promote_secondaries"), 1620 "promote_secondaries"),
1621 DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
1622 "route_localnet"),
1620 }, 1623 },
1621}; 1624};
1622 1625
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index cb982a61536f..b61e9deb7c7e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -484,16 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
484 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); 484 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
485 struct xfrm_state *x; 485 struct xfrm_state *x;
486 486
487 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 487 switch (icmp_hdr(skb)->type) {
488 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 488 case ICMP_DEST_UNREACH:
489 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
490 return;
491 case ICMP_REDIRECT:
492 break;
493 default:
489 return; 494 return;
495 }
490 496
491 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 497 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
492 esph->spi, IPPROTO_ESP, AF_INET); 498 esph->spi, IPPROTO_ESP, AF_INET);
493 if (!x) 499 if (!x)
494 return; 500 return;
495 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 501
496 ntohl(esph->spi), ntohl(iph->daddr)); 502 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
503 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
504 else
505 ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
497 xfrm_state_put(x); 506 xfrm_state_put(x);
498} 507}
499 508
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411fa37c..8732cc7920ed 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -31,6 +31,7 @@
31#include <linux/if_addr.h> 31#include <linux/if_addr.h>
32#include <linux/if_arp.h> 32#include <linux/if_arp.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/cache.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/list.h> 36#include <linux/list.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
@@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
85 tb = fib_trie_table(id); 86 tb = fib_trie_table(id);
86 if (!tb) 87 if (!tb)
87 return NULL; 88 return NULL;
89
90 switch (id) {
91 case RT_TABLE_LOCAL:
92 net->ipv4.fib_local = tb;
93 break;
94
95 case RT_TABLE_MAIN:
96 net->ipv4.fib_main = tb;
97 break;
98
99 case RT_TABLE_DEFAULT:
100 net->ipv4.fib_default = tb;
101 break;
102
103 default:
104 break;
105 }
106
88 h = id & (FIB_TABLE_HASHSZ - 1); 107 h = id & (FIB_TABLE_HASHSZ - 1);
89 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); 108 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
90 return tb; 109 return tb;
@@ -150,10 +169,6 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
150 if (ipv4_is_multicast(addr)) 169 if (ipv4_is_multicast(addr))
151 return RTN_MULTICAST; 170 return RTN_MULTICAST;
152 171
153#ifdef CONFIG_IP_MULTIPLE_TABLES
154 res.r = NULL;
155#endif
156
157 local_table = fib_get_table(net, RT_TABLE_LOCAL); 172 local_table = fib_get_table(net, RT_TABLE_LOCAL);
158 if (local_table) { 173 if (local_table) {
159 ret = RTN_UNICAST; 174 ret = RTN_UNICAST;
@@ -180,6 +195,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
180} 195}
181EXPORT_SYMBOL(inet_dev_addr_type); 196EXPORT_SYMBOL(inet_dev_addr_type);
182 197
198__be32 fib_compute_spec_dst(struct sk_buff *skb)
199{
200 struct net_device *dev = skb->dev;
201 struct in_device *in_dev;
202 struct fib_result res;
203 struct rtable *rt;
204 struct flowi4 fl4;
205 struct net *net;
206 int scope;
207
208 rt = skb_rtable(skb);
209 if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
210 RTCF_LOCAL)
211 return ip_hdr(skb)->daddr;
212
213 in_dev = __in_dev_get_rcu(dev);
214 BUG_ON(!in_dev);
215
216 net = dev_net(dev);
217
218 scope = RT_SCOPE_UNIVERSE;
219 if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
220 fl4.flowi4_oif = 0;
221 fl4.flowi4_iif = net->loopback_dev->ifindex;
222 fl4.daddr = ip_hdr(skb)->saddr;
223 fl4.saddr = 0;
224 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
225 fl4.flowi4_scope = scope;
226 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
227 if (!fib_lookup(net, &fl4, &res))
228 return FIB_RES_PREFSRC(net, res);
229 } else {
230 scope = RT_SCOPE_LINK;
231 }
232
233 return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
234}
235
183/* Given (packet source, input interface) and optional (dst, oif, tos): 236/* Given (packet source, input interface) and optional (dst, oif, tos):
184 * - (main) check, that source is valid i.e. not broadcast or our local 237 * - (main) check, that source is valid i.e. not broadcast or our local
185 * address. 238 * address.
@@ -188,17 +241,15 @@ EXPORT_SYMBOL(inet_dev_addr_type);
188 * - check, that packet arrived from expected physical interface. 241 * - check, that packet arrived from expected physical interface.
189 * called with rcu_read_lock() 242 * called with rcu_read_lock()
190 */ 243 */
191int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, 244static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
192 int oif, struct net_device *dev, __be32 *spec_dst, 245 u8 tos, int oif, struct net_device *dev,
193 u32 *itag) 246 int rpf, struct in_device *idev, u32 *itag)
194{ 247{
195 struct in_device *in_dev; 248 int ret, no_addr, accept_local;
196 struct flowi4 fl4;
197 struct fib_result res; 249 struct fib_result res;
198 int no_addr, rpf, accept_local; 250 struct flowi4 fl4;
199 bool dev_match;
200 int ret;
201 struct net *net; 251 struct net *net;
252 bool dev_match;
202 253
203 fl4.flowi4_oif = 0; 254 fl4.flowi4_oif = 0;
204 fl4.flowi4_iif = oif; 255 fl4.flowi4_iif = oif;
@@ -207,20 +258,10 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
207 fl4.flowi4_tos = tos; 258 fl4.flowi4_tos = tos;
208 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 259 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
209 260
210 no_addr = rpf = accept_local = 0; 261 no_addr = idev->ifa_list == NULL;
211 in_dev = __in_dev_get_rcu(dev);
212 if (in_dev) {
213 no_addr = in_dev->ifa_list == NULL;
214
215 /* Ignore rp_filter for packets protected by IPsec. */
216 rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
217
218 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
219 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
220 }
221 262
222 if (in_dev == NULL) 263 accept_local = IN_DEV_ACCEPT_LOCAL(idev);
223 goto e_inval; 264 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
224 265
225 net = dev_net(dev); 266 net = dev_net(dev);
226 if (fib_lookup(net, &fl4, &res)) 267 if (fib_lookup(net, &fl4, &res))
@@ -229,7 +270,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
229 if (res.type != RTN_LOCAL || !accept_local) 270 if (res.type != RTN_LOCAL || !accept_local)
230 goto e_inval; 271 goto e_inval;
231 } 272 }
232 *spec_dst = FIB_RES_PREFSRC(net, res);
233 fib_combine_itag(itag, &res); 273 fib_combine_itag(itag, &res);
234 dev_match = false; 274 dev_match = false;
235 275
@@ -258,17 +298,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
258 298
259 ret = 0; 299 ret = 0;
260 if (fib_lookup(net, &fl4, &res) == 0) { 300 if (fib_lookup(net, &fl4, &res) == 0) {
261 if (res.type == RTN_UNICAST) { 301 if (res.type == RTN_UNICAST)
262 *spec_dst = FIB_RES_PREFSRC(net, res);
263 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 302 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
264 }
265 } 303 }
266 return ret; 304 return ret;
267 305
268last_resort: 306last_resort:
269 if (rpf) 307 if (rpf)
270 goto e_rpf; 308 goto e_rpf;
271 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
272 *itag = 0; 309 *itag = 0;
273 return 0; 310 return 0;
274 311
@@ -278,6 +315,20 @@ e_rpf:
278 return -EXDEV; 315 return -EXDEV;
279} 316}
280 317
318/* Ignore rp_filter for packets protected by IPsec. */
319int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
320 u8 tos, int oif, struct net_device *dev,
321 struct in_device *idev, u32 *itag)
322{
323 int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
324
325 if (!r && !fib_num_tclassid_users(dev_net(dev))) {
326 *itag = 0;
327 return 0;
328 }
329 return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
330}
331
281static inline __be32 sk_extract_addr(struct sockaddr *addr) 332static inline __be32 sk_extract_addr(struct sockaddr *addr)
282{ 333{
283 return ((struct sockaddr_in *) addr)->sin_addr.s_addr; 334 return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -879,10 +930,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
879 .flowi4_scope = frn->fl_scope, 930 .flowi4_scope = frn->fl_scope,
880 }; 931 };
881 932
882#ifdef CONFIG_IP_MULTIPLE_TABLES
883 res.r = NULL;
884#endif
885
886 frn->err = -ENOENT; 933 frn->err = -ENOENT;
887 if (tb) { 934 if (tb) {
888 local_bh_disable(); 935 local_bh_disable();
@@ -935,8 +982,11 @@ static void nl_fib_input(struct sk_buff *skb)
935static int __net_init nl_fib_lookup_init(struct net *net) 982static int __net_init nl_fib_lookup_init(struct net *net)
936{ 983{
937 struct sock *sk; 984 struct sock *sk;
938 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, 985 struct netlink_kernel_cfg cfg = {
939 nl_fib_input, NULL, THIS_MODULE); 986 .input = nl_fib_input,
987 };
988
989 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
940 if (sk == NULL) 990 if (sk == NULL)
941 return -EAFNOSUPPORT; 991 return -EAFNOSUPPORT;
942 net->ipv4.fibnl = sk; 992 net->ipv4.fibnl = sk;
@@ -1021,11 +1071,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
1021 rt_cache_flush(dev_net(dev), 0); 1071 rt_cache_flush(dev_net(dev), 0);
1022 break; 1072 break;
1023 case NETDEV_UNREGISTER_BATCH: 1073 case NETDEV_UNREGISTER_BATCH:
1024 /* The batch unregister is only called on the first
1025 * device in the list of devices being unregistered.
1026 * Therefore we should not pass dev_net(dev) in here.
1027 */
1028 rt_cache_flush_batch(NULL);
1029 break; 1074 break;
1030 } 1075 }
1031 return NOTIFY_DONE; 1076 return NOTIFY_DONE;
@@ -1090,6 +1135,9 @@ static int __net_init fib_net_init(struct net *net)
1090{ 1135{
1091 int error; 1136 int error;
1092 1137
1138#ifdef CONFIG_IP_ROUTE_CLASSID
1139 net->ipv4.fib_num_tclassid_users = 0;
1140#endif
1093 error = ip_fib_net_init(net); 1141 error = ip_fib_net_init(net);
1094 if (error < 0) 1142 if (error < 0)
1095 goto out; 1143 goto out;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2d043f71ef70..a83d74e498d2 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,14 +47,7 @@ struct fib4_rule {
47#endif 47#endif
48}; 48};
49 49
50#ifdef CONFIG_IP_ROUTE_CLASSID 50int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
51u32 fib_rules_tclass(const struct fib_result *res)
52{
53 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
54}
55#endif
56
57int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
58{ 51{
59 struct fib_lookup_arg arg = { 52 struct fib_lookup_arg arg = {
60 .result = res, 53 .result = res,
@@ -63,11 +56,15 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
63 int err; 56 int err;
64 57
65 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); 58 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
66 res->r = arg.rule; 59#ifdef CONFIG_IP_ROUTE_CLASSID
67 60 if (arg.rule)
61 res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
62 else
63 res->tclassid = 0;
64#endif
68 return err; 65 return err;
69} 66}
70EXPORT_SYMBOL_GPL(fib_lookup); 67EXPORT_SYMBOL_GPL(__fib_lookup);
71 68
72static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, 69static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
73 int flags, struct fib_lookup_arg *arg) 70 int flags, struct fib_lookup_arg *arg)
@@ -169,8 +166,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
169 rule4->dst = nla_get_be32(tb[FRA_DST]); 166 rule4->dst = nla_get_be32(tb[FRA_DST]);
170 167
171#ifdef CONFIG_IP_ROUTE_CLASSID 168#ifdef CONFIG_IP_ROUTE_CLASSID
172 if (tb[FRA_FLOW]) 169 if (tb[FRA_FLOW]) {
173 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171 if (rule4->tclassid)
172 net->ipv4.fib_num_tclassid_users++;
173 }
174#endif 174#endif
175 175
176 rule4->src_len = frh->src_len; 176 rule4->src_len = frh->src_len;
@@ -179,11 +179,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
179 rule4->dstmask = inet_make_mask(rule4->dst_len); 179 rule4->dstmask = inet_make_mask(rule4->dst_len);
180 rule4->tos = frh->tos; 180 rule4->tos = frh->tos;
181 181
182 net->ipv4.fib_has_custom_rules = true;
182 err = 0; 183 err = 0;
183errout: 184errout:
184 return err; 185 return err;
185} 186}
186 187
188static void fib4_rule_delete(struct fib_rule *rule)
189{
190 struct net *net = rule->fr_net;
191#ifdef CONFIG_IP_ROUTE_CLASSID
192 struct fib4_rule *rule4 = (struct fib4_rule *) rule;
193
194 if (rule4->tclassid)
195 net->ipv4.fib_num_tclassid_users--;
196#endif
197 net->ipv4.fib_has_custom_rules = true;
198}
199
187static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, 200static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
188 struct nlattr **tb) 201 struct nlattr **tb)
189{ 202{
@@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
256 .action = fib4_rule_action, 269 .action = fib4_rule_action,
257 .match = fib4_rule_match, 270 .match = fib4_rule_match,
258 .configure = fib4_rule_configure, 271 .configure = fib4_rule_configure,
272 .delete = fib4_rule_delete,
259 .compare = fib4_rule_compare, 273 .compare = fib4_rule_compare,
260 .fill = fib4_rule_fill, 274 .fill = fib4_rule_fill,
261 .default_pref = fib_default_rule_pref, 275 .default_pref = fib_default_rule_pref,
@@ -295,6 +309,7 @@ int __net_init fib4_rules_init(struct net *net)
295 if (err < 0) 309 if (err < 0)
296 goto fail; 310 goto fail;
297 net->ipv4.rules_ops = ops; 311 net->ipv4.rules_ops = ops;
312 net->ipv4.fib_has_custom_rules = false;
298 return 0; 313 return 0;
299 314
300fail: 315fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e5b7182fa099..e55171f184f9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 }, 140 },
141}; 141};
142 142
143static void free_nh_exceptions(struct fib_nh *nh)
144{
145 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
146 int i;
147
148 for (i = 0; i < FNHE_HASH_SIZE; i++) {
149 struct fib_nh_exception *fnhe;
150
151 fnhe = rcu_dereference_protected(hash[i].chain, 1);
152 while (fnhe) {
153 struct fib_nh_exception *next;
154
155 next = rcu_dereference_protected(fnhe->fnhe_next, 1);
156 kfree(fnhe);
157
158 fnhe = next;
159 }
160 }
161 kfree(hash);
162}
163
143/* Release a nexthop info record */ 164/* Release a nexthop info record */
144static void free_fib_info_rcu(struct rcu_head *head) 165static void free_fib_info_rcu(struct rcu_head *head)
145{ 166{
@@ -148,6 +169,12 @@ static void free_fib_info_rcu(struct rcu_head *head)
148 change_nexthops(fi) { 169 change_nexthops(fi) {
149 if (nexthop_nh->nh_dev) 170 if (nexthop_nh->nh_dev)
150 dev_put(nexthop_nh->nh_dev); 171 dev_put(nexthop_nh->nh_dev);
172 if (nexthop_nh->nh_exceptions)
173 free_nh_exceptions(nexthop_nh);
174 if (nexthop_nh->nh_rth_output)
175 dst_release(&nexthop_nh->nh_rth_output->dst);
176 if (nexthop_nh->nh_rth_input)
177 dst_release(&nexthop_nh->nh_rth_input->dst);
151 } endfor_nexthops(fi); 178 } endfor_nexthops(fi);
152 179
153 release_net(fi->fib_net); 180 release_net(fi->fib_net);
@@ -163,6 +190,12 @@ void free_fib_info(struct fib_info *fi)
163 return; 190 return;
164 } 191 }
165 fib_info_cnt--; 192 fib_info_cnt--;
193#ifdef CONFIG_IP_ROUTE_CLASSID
194 change_nexthops(fi) {
195 if (nexthop_nh->nh_tclassid)
196 fi->fib_net->ipv4.fib_num_tclassid_users--;
197 } endfor_nexthops(fi);
198#endif
166 call_rcu(&fi->rcu, free_fib_info_rcu); 199 call_rcu(&fi->rcu, free_fib_info_rcu);
167} 200}
168 201
@@ -421,6 +454,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
421#ifdef CONFIG_IP_ROUTE_CLASSID 454#ifdef CONFIG_IP_ROUTE_CLASSID
422 nla = nla_find(attrs, attrlen, RTA_FLOW); 455 nla = nla_find(attrs, attrlen, RTA_FLOW);
423 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 456 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
457 if (nexthop_nh->nh_tclassid)
458 fi->fib_net->ipv4.fib_num_tclassid_users++;
424#endif 459#endif
425 } 460 }
426 461
@@ -779,9 +814,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
779 int type = nla_type(nla); 814 int type = nla_type(nla);
780 815
781 if (type) { 816 if (type) {
817 u32 val;
818
782 if (type > RTAX_MAX) 819 if (type > RTAX_MAX)
783 goto err_inval; 820 goto err_inval;
784 fi->fib_metrics[type - 1] = nla_get_u32(nla); 821 val = nla_get_u32(nla);
822 if (type == RTAX_ADVMSS && val > 65535 - 40)
823 val = 65535 - 40;
824 if (type == RTAX_MTU && val > 65535 - 15)
825 val = 65535 - 15;
826 fi->fib_metrics[type - 1] = val;
785 } 827 }
786 } 828 }
787 } 829 }
@@ -810,6 +852,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
810 nh->nh_flags = cfg->fc_flags; 852 nh->nh_flags = cfg->fc_flags;
811#ifdef CONFIG_IP_ROUTE_CLASSID 853#ifdef CONFIG_IP_ROUTE_CLASSID
812 nh->nh_tclassid = cfg->fc_flow; 854 nh->nh_tclassid = cfg->fc_flow;
855 if (nh->nh_tclassid)
856 fi->fib_net->ipv4.fib_num_tclassid_users++;
813#endif 857#endif
814#ifdef CONFIG_IP_ROUTE_MULTIPATH 858#ifdef CONFIG_IP_ROUTE_MULTIPATH
815 nh->nh_weight = 1; 859 nh->nh_weight = 1;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 30b88d7b4bd6..18cbc15b20d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1007 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { 1007 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1008 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1008 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1009 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1009 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1010 tn = (struct tnode *) resize(t, (struct tnode *)tn); 1010 tn = (struct tnode *)resize(t, tn);
1011 1011
1012 tnode_put_child_reorg((struct tnode *)tp, cindex, 1012 tnode_put_child_reorg(tp, cindex,
1013 (struct rt_trie_node *)tn, wasfull); 1013 (struct rt_trie_node *)tn, wasfull);
1014 1014
1015 tp = node_parent((struct rt_trie_node *) tn); 1015 tp = node_parent((struct rt_trie_node *) tn);
@@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1024 1024
1025 /* Handle last (top) tnode */ 1025 /* Handle last (top) tnode */
1026 if (IS_TNODE(tn)) 1026 if (IS_TNODE(tn))
1027 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1027 tn = (struct tnode *)resize(t, tn);
1028 1028
1029 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1029 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1030 tnode_free_flush(); 1030 tnode_free_flush();
@@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1125 node_set_parent((struct rt_trie_node *)l, tp); 1125 node_set_parent((struct rt_trie_node *)l, tp);
1126 1126
1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1128 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); 1128 put_child(t, tp, cindex, (struct rt_trie_node *)l);
1129 } else { 1129 } else {
1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1131 /* 1131 /*
@@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1160 1160
1161 if (tp) { 1161 if (tp) {
1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163 put_child(t, (struct tnode *)tp, cindex, 1163 put_child(t, tp, cindex, (struct rt_trie_node *)tn);
1164 (struct rt_trie_node *)tn);
1165 } else { 1164 } else {
1166 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1165 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1167 tp = tn; 1166 tp = tn;
@@ -1620,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1620 1619
1621 if (tp) { 1620 if (tp) {
1622 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1621 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1623 put_child(t, (struct tnode *)tp, cindex, NULL); 1622 put_child(t, tp, cindex, NULL);
1624 trie_rebalance(t, tp); 1623 trie_rebalance(t, tp);
1625 } else 1624 } else
1626 RCU_INIT_POINTER(t->trie, NULL); 1625 RCU_INIT_POINTER(t->trie, NULL);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c75efbdc71cb..f2eccd531746 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
95#include <net/checksum.h> 95#include <net/checksum.h>
96#include <net/xfrm.h> 96#include <net/xfrm.h>
97#include <net/inet_common.h> 97#include <net/inet_common.h>
98#include <net/ip_fib.h>
98 99
99/* 100/*
100 * Build xmit assembly blocks 101 * Build xmit assembly blocks
@@ -253,10 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
253 254
254 /* Limit if icmp type is enabled in ratemask. */ 255 /* Limit if icmp type is enabled in ratemask. */
255 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { 256 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
256 if (!rt->peer) 257 struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
257 rt_bind_peer(rt, fl4->daddr, 1); 258 rc = inet_peer_xrlim_allow(peer,
258 rc = inet_peer_xrlim_allow(rt->peer,
259 net->ipv4.sysctl_icmp_ratelimit); 259 net->ipv4.sysctl_icmp_ratelimit);
260 inet_putpeer(peer);
260 } 261 }
261out: 262out:
262 return rc; 263 return rc;
@@ -334,7 +335,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
334 struct flowi4 fl4; 335 struct flowi4 fl4;
335 struct sock *sk; 336 struct sock *sk;
336 struct inet_sock *inet; 337 struct inet_sock *inet;
337 __be32 daddr; 338 __be32 daddr, saddr;
338 339
339 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) 340 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
340 return; 341 return;
@@ -348,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
348 349
349 inet->tos = ip_hdr(skb)->tos; 350 inet->tos = ip_hdr(skb)->tos;
350 daddr = ipc.addr = ip_hdr(skb)->saddr; 351 daddr = ipc.addr = ip_hdr(skb)->saddr;
352 saddr = fib_compute_spec_dst(skb);
351 ipc.opt = NULL; 353 ipc.opt = NULL;
352 ipc.tx_flags = 0; 354 ipc.tx_flags = 0;
353 if (icmp_param->replyopts.opt.opt.optlen) { 355 if (icmp_param->replyopts.opt.opt.optlen) {
@@ -357,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
357 } 359 }
358 memset(&fl4, 0, sizeof(fl4)); 360 memset(&fl4, 0, sizeof(fl4));
359 fl4.daddr = daddr; 361 fl4.daddr = daddr;
360 fl4.saddr = rt->rt_spec_dst; 362 fl4.saddr = saddr;
361 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 363 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
362 fl4.flowi4_proto = IPPROTO_ICMP; 364 fl4.flowi4_proto = IPPROTO_ICMP;
363 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 365 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -569,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
569 rcu_read_lock(); 571 rcu_read_lock();
570 if (rt_is_input_route(rt) && 572 if (rt_is_input_route(rt) &&
571 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 573 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
572 dev = dev_get_by_index_rcu(net, rt->rt_iif); 574 dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
573 575
574 if (dev) 576 if (dev)
575 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 577 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -632,6 +634,27 @@ out:;
632EXPORT_SYMBOL(icmp_send); 634EXPORT_SYMBOL(icmp_send);
633 635
634 636
637static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
638{
639 const struct iphdr *iph = (const struct iphdr *) skb->data;
640 const struct net_protocol *ipprot;
641 int protocol = iph->protocol;
642
643 /* Checkin full IP header plus 8 bytes of protocol to
644 * avoid additional coding at protocol handlers.
645 */
646 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
647 return;
648
649 raw_icmp_error(skb, protocol, info);
650
651 rcu_read_lock();
652 ipprot = rcu_dereference(inet_protos[protocol]);
653 if (ipprot && ipprot->err_handler)
654 ipprot->err_handler(skb, info);
655 rcu_read_unlock();
656}
657
635/* 658/*
636 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. 659 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
637 */ 660 */
@@ -640,10 +663,8 @@ static void icmp_unreach(struct sk_buff *skb)
640{ 663{
641 const struct iphdr *iph; 664 const struct iphdr *iph;
642 struct icmphdr *icmph; 665 struct icmphdr *icmph;
643 int hash, protocol;
644 const struct net_protocol *ipprot;
645 u32 info = 0;
646 struct net *net; 666 struct net *net;
667 u32 info = 0;
647 668
648 net = dev_net(skb_dst(skb)->dev); 669 net = dev_net(skb_dst(skb)->dev);
649 670
@@ -674,9 +695,7 @@ static void icmp_unreach(struct sk_buff *skb)
674 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), 695 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
675 &iph->daddr); 696 &iph->daddr);
676 } else { 697 } else {
677 info = ip_rt_frag_needed(net, iph, 698 info = ntohs(icmph->un.frag.mtu);
678 ntohs(icmph->un.frag.mtu),
679 skb->dev);
680 if (!info) 699 if (!info)
681 goto out; 700 goto out;
682 } 701 }
@@ -720,26 +739,7 @@ static void icmp_unreach(struct sk_buff *skb)
720 goto out; 739 goto out;
721 } 740 }
722 741
723 /* Checkin full IP header plus 8 bytes of protocol to 742 icmp_socket_deliver(skb, info);
724 * avoid additional coding at protocol handlers.
725 */
726 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
727 goto out;
728
729 iph = (const struct iphdr *)skb->data;
730 protocol = iph->protocol;
731
732 /*
733 * Deliver ICMP message to raw sockets. Pretty useless feature?
734 */
735 raw_icmp_error(skb, protocol, info);
736
737 hash = protocol & (MAX_INET_PROTOS - 1);
738 rcu_read_lock();
739 ipprot = rcu_dereference(inet_protos[hash]);
740 if (ipprot && ipprot->err_handler)
741 ipprot->err_handler(skb, info);
742 rcu_read_unlock();
743 743
744out: 744out:
745 return; 745 return;
@@ -755,46 +755,15 @@ out_err:
755 755
756static void icmp_redirect(struct sk_buff *skb) 756static void icmp_redirect(struct sk_buff *skb)
757{ 757{
758 const struct iphdr *iph; 758 if (skb->len < sizeof(struct iphdr)) {
759 759 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
760 if (skb->len < sizeof(struct iphdr)) 760 return;
761 goto out_err;
762
763 /*
764 * Get the copied header of the packet that caused the redirect
765 */
766 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
767 goto out;
768
769 iph = (const struct iphdr *)skb->data;
770
771 switch (icmp_hdr(skb)->code & 7) {
772 case ICMP_REDIR_NET:
773 case ICMP_REDIR_NETTOS:
774 /*
775 * As per RFC recommendations now handle it as a host redirect.
776 */
777 case ICMP_REDIR_HOST:
778 case ICMP_REDIR_HOSTTOS:
779 ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
780 icmp_hdr(skb)->un.gateway,
781 iph->saddr, skb->dev);
782 break;
783 } 761 }
784 762
785 /* Ping wants to see redirects. 763 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
786 * Let's pretend they are errors of sorts... */ 764 return;
787 if (iph->protocol == IPPROTO_ICMP &&
788 iph->ihl >= 5 &&
789 pskb_may_pull(skb, (iph->ihl<<2)+8)) {
790 ping_err(skb, icmp_hdr(skb)->un.gateway);
791 }
792 765
793out: 766 icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
794 return;
795out_err:
796 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
797 goto out;
798} 767}
799 768
800/* 769/*
@@ -868,86 +837,6 @@ out_err:
868 goto out; 837 goto out;
869} 838}
870 839
871
872/*
873 * Handle ICMP_ADDRESS_MASK requests. (RFC950)
874 *
875 * RFC1122 (3.2.2.9). A host MUST only send replies to
876 * ADDRESS_MASK requests if it's been configured as an address mask
877 * agent. Receiving a request doesn't constitute implicit permission to
878 * act as one. Of course, implementing this correctly requires (SHOULD)
879 * a way to turn the functionality on and off. Another one for sysctl(),
880 * I guess. -- MS
881 *
882 * RFC1812 (4.3.3.9). A router MUST implement it.
883 * A router SHOULD have switch turning it on/off.
884 * This switch MUST be ON by default.
885 *
886 * Gratuitous replies, zero-source replies are not implemented,
887 * that complies with RFC. DO NOT implement them!!! All the idea
888 * of broadcast addrmask replies as specified in RFC950 is broken.
889 * The problem is that it is not uncommon to have several prefixes
890 * on one physical interface. Moreover, addrmask agent can even be
891 * not aware of existing another prefixes.
892 * If source is zero, addrmask agent cannot choose correct prefix.
893 * Gratuitous mask announcements suffer from the same problem.
894 * RFC1812 explains it, but still allows to use ADDRMASK,
895 * that is pretty silly. --ANK
896 *
897 * All these rules are so bizarre, that I removed kernel addrmask
898 * support at all. It is wrong, it is obsolete, nobody uses it in
899 * any case. --ANK
900 *
901 * Furthermore you can do it with a usermode address agent program
902 * anyway...
903 */
904
905static void icmp_address(struct sk_buff *skb)
906{
907#if 0
908 net_dbg_ratelimited("a guy asks for address mask. Who is it?\n");
909#endif
910}
911
912/*
913 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
914 * loudly if an inconsistency is found.
915 * called with rcu_read_lock()
916 */
917
918static void icmp_address_reply(struct sk_buff *skb)
919{
920 struct rtable *rt = skb_rtable(skb);
921 struct net_device *dev = skb->dev;
922 struct in_device *in_dev;
923 struct in_ifaddr *ifa;
924
925 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
926 return;
927
928 in_dev = __in_dev_get_rcu(dev);
929 if (!in_dev)
930 return;
931
932 if (in_dev->ifa_list &&
933 IN_DEV_LOG_MARTIANS(in_dev) &&
934 IN_DEV_FORWARD(in_dev)) {
935 __be32 _mask, *mp;
936
937 mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
938 BUG_ON(mp == NULL);
939 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
940 if (*mp == ifa->ifa_mask &&
941 inet_ifa_match(ip_hdr(skb)->saddr, ifa))
942 break;
943 }
944 if (!ifa)
945 net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n",
946 mp,
947 dev->name, &ip_hdr(skb)->saddr);
948 }
949}
950
951static void icmp_discard(struct sk_buff *skb) 840static void icmp_discard(struct sk_buff *skb)
952{ 841{
953} 842}
@@ -1111,10 +1000,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
1111 .handler = icmp_discard, 1000 .handler = icmp_discard,
1112 }, 1001 },
1113 [ICMP_ADDRESS] = { 1002 [ICMP_ADDRESS] = {
1114 .handler = icmp_address, 1003 .handler = icmp_discard,
1115 }, 1004 },
1116 [ICMP_ADDRESSREPLY] = { 1005 [ICMP_ADDRESSREPLY] = {
1117 .handler = icmp_address_reply, 1006 .handler = icmp_discard,
1118 }, 1007 },
1119}; 1008};
1120 1009
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f9ee7417f6a0..db0cf17c00f7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -374,18 +374,19 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
374 const struct inet_request_sock *ireq = inet_rsk(req); 374 const struct inet_request_sock *ireq = inet_rsk(req);
375 struct ip_options_rcu *opt = inet_rsk(req)->opt; 375 struct ip_options_rcu *opt = inet_rsk(req)->opt;
376 struct net *net = sock_net(sk); 376 struct net *net = sock_net(sk);
377 int flags = inet_sk_flowi_flags(sk);
377 378
378 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 379 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
379 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 380 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
380 sk->sk_protocol, 381 sk->sk_protocol,
381 inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS, 382 flags,
382 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, 383 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
383 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); 384 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
384 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 385 security_req_classify_flow(req, flowi4_to_flowi(fl4));
385 rt = ip_route_output_flow(net, fl4, sk); 386 rt = ip_route_output_flow(net, fl4, sk);
386 if (IS_ERR(rt)) 387 if (IS_ERR(rt))
387 goto no_route; 388 goto no_route;
388 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 389 if (opt && opt->opt.is_strictroute && rt->rt_gateway)
389 goto route_err; 390 goto route_err;
390 return &rt->dst; 391 return &rt->dst;
391 392
@@ -418,7 +419,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
418 rt = ip_route_output_flow(net, fl4, sk); 419 rt = ip_route_output_flow(net, fl4, sk);
419 if (IS_ERR(rt)) 420 if (IS_ERR(rt))
420 goto no_route; 421 goto no_route;
421 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 422 if (opt && opt->opt.is_strictroute && rt->rt_gateway)
422 goto route_err; 423 goto route_err;
423 return &rt->dst; 424 return &rt->dst;
424 425
@@ -799,3 +800,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
799} 800}
800EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); 801EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
801#endif 802#endif
803
804static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
805{
806 const struct inet_sock *inet = inet_sk(sk);
807 const struct ip_options_rcu *inet_opt;
808 __be32 daddr = inet->inet_daddr;
809 struct flowi4 *fl4;
810 struct rtable *rt;
811
812 rcu_read_lock();
813 inet_opt = rcu_dereference(inet->inet_opt);
814 if (inet_opt && inet_opt->opt.srr)
815 daddr = inet_opt->opt.faddr;
816 fl4 = &fl->u.ip4;
817 rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
818 inet->inet_saddr, inet->inet_dport,
819 inet->inet_sport, sk->sk_protocol,
820 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
821 if (IS_ERR(rt))
822 rt = NULL;
823 if (rt)
824 sk_setup_caps(sk, &rt->dst);
825 rcu_read_unlock();
826
827 return &rt->dst;
828}
829
830struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
831{
832 struct dst_entry *dst = __sk_dst_check(sk, 0);
833 struct inet_sock *inet = inet_sk(sk);
834
835 if (!dst) {
836 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
837 if (!dst)
838 goto out;
839 }
840 dst->ops->update_pmtu(dst, sk, NULL, mtu);
841
842 dst = __sk_dst_check(sk, 0);
843 if (!dst)
844 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
845out:
846 return dst;
847}
848EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 46d1e7199a8c..570e61f9611f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -46,9 +46,6 @@ struct inet_diag_entry {
46 u16 userlocks; 46 u16 userlocks;
47}; 47};
48 48
49#define INET_DIAG_PUT(skb, attrtype, attrlen) \
50 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
51
52static DEFINE_MUTEX(inet_diag_table_mutex); 49static DEFINE_MUTEX(inet_diag_table_mutex);
53 50
54static const struct inet_diag_handler *inet_diag_lock_handler(int proto) 51static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
@@ -78,24 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
78 const struct inet_sock *inet = inet_sk(sk); 75 const struct inet_sock *inet = inet_sk(sk);
79 struct inet_diag_msg *r; 76 struct inet_diag_msg *r;
80 struct nlmsghdr *nlh; 77 struct nlmsghdr *nlh;
78 struct nlattr *attr;
81 void *info = NULL; 79 void *info = NULL;
82 struct inet_diag_meminfo *minfo = NULL;
83 unsigned char *b = skb_tail_pointer(skb);
84 const struct inet_diag_handler *handler; 80 const struct inet_diag_handler *handler;
85 int ext = req->idiag_ext; 81 int ext = req->idiag_ext;
86 82
87 handler = inet_diag_table[req->sdiag_protocol]; 83 handler = inet_diag_table[req->sdiag_protocol];
88 BUG_ON(handler == NULL); 84 BUG_ON(handler == NULL);
89 85
90 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); 86 nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
91 nlh->nlmsg_flags = nlmsg_flags; 87 nlmsg_flags);
88 if (!nlh)
89 return -EMSGSIZE;
92 90
93 r = NLMSG_DATA(nlh); 91 r = nlmsg_data(nlh);
94 BUG_ON(sk->sk_state == TCP_TIME_WAIT); 92 BUG_ON(sk->sk_state == TCP_TIME_WAIT);
95 93
96 if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
97 minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
98
99 r->idiag_family = sk->sk_family; 94 r->idiag_family = sk->sk_family;
100 r->idiag_state = sk->sk_state; 95 r->idiag_state = sk->sk_state;
101 r->idiag_timer = 0; 96 r->idiag_timer = 0;
@@ -113,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
113 * hence this needs to be included regardless of socket family. 108 * hence this needs to be included regardless of socket family.
114 */ 109 */
115 if (ext & (1 << (INET_DIAG_TOS - 1))) 110 if (ext & (1 << (INET_DIAG_TOS - 1)))
116 RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); 111 if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
112 goto errout;
117 113
118#if IS_ENABLED(CONFIG_IPV6) 114#if IS_ENABLED(CONFIG_IPV6)
119 if (r->idiag_family == AF_INET6) { 115 if (r->idiag_family == AF_INET6) {
@@ -121,24 +117,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
121 117
122 *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; 118 *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
123 *(struct in6_addr *)r->id.idiag_dst = np->daddr; 119 *(struct in6_addr *)r->id.idiag_dst = np->daddr;
120
124 if (ext & (1 << (INET_DIAG_TCLASS - 1))) 121 if (ext & (1 << (INET_DIAG_TCLASS - 1)))
125 RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass); 122 if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
123 goto errout;
126 } 124 }
127#endif 125#endif
128 126
129 r->idiag_uid = sock_i_uid(sk); 127 r->idiag_uid = sock_i_uid(sk);
130 r->idiag_inode = sock_i_ino(sk); 128 r->idiag_inode = sock_i_ino(sk);
131 129
132 if (minfo) { 130 if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
133 minfo->idiag_rmem = sk_rmem_alloc_get(sk); 131 struct inet_diag_meminfo minfo = {
134 minfo->idiag_wmem = sk->sk_wmem_queued; 132 .idiag_rmem = sk_rmem_alloc_get(sk),
135 minfo->idiag_fmem = sk->sk_forward_alloc; 133 .idiag_wmem = sk->sk_wmem_queued,
136 minfo->idiag_tmem = sk_wmem_alloc_get(sk); 134 .idiag_fmem = sk->sk_forward_alloc,
135 .idiag_tmem = sk_wmem_alloc_get(sk),
136 };
137
138 if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
139 goto errout;
137 } 140 }
138 141
139 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) 142 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
140 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) 143 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
141 goto rtattr_failure; 144 goto errout;
142 145
143 if (icsk == NULL) { 146 if (icsk == NULL) {
144 handler->idiag_get_info(sk, r, NULL); 147 handler->idiag_get_info(sk, r, NULL);
@@ -165,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
165 } 168 }
166#undef EXPIRES_IN_MS 169#undef EXPIRES_IN_MS
167 170
168 if (ext & (1 << (INET_DIAG_INFO - 1))) 171 if (ext & (1 << (INET_DIAG_INFO - 1))) {
169 info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info)); 172 attr = nla_reserve(skb, INET_DIAG_INFO,
170 173 sizeof(struct tcp_info));
171 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { 174 if (!attr)
172 const size_t len = strlen(icsk->icsk_ca_ops->name); 175 goto errout;
173 176
174 strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), 177 info = nla_data(attr);
175 icsk->icsk_ca_ops->name);
176 } 178 }
177 179
180 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
181 if (nla_put_string(skb, INET_DIAG_CONG,
182 icsk->icsk_ca_ops->name) < 0)
183 goto errout;
184
178 handler->idiag_get_info(sk, r, info); 185 handler->idiag_get_info(sk, r, info);
179 186
180 if (sk->sk_state < TCP_TIME_WAIT && 187 if (sk->sk_state < TCP_TIME_WAIT &&
@@ -182,12 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
182 icsk->icsk_ca_ops->get_info(sk, ext, skb); 189 icsk->icsk_ca_ops->get_info(sk, ext, skb);
183 190
184out: 191out:
185 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 192 return nlmsg_end(skb, nlh);
186 return skb->len;
187 193
188rtattr_failure: 194errout:
189nlmsg_failure: 195 nlmsg_cancel(skb, nlh);
190 nlmsg_trim(skb, b);
191 return -EMSGSIZE; 196 return -EMSGSIZE;
192} 197}
193EXPORT_SYMBOL_GPL(inet_sk_diag_fill); 198EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
@@ -208,14 +213,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
208{ 213{
209 long tmo; 214 long tmo;
210 struct inet_diag_msg *r; 215 struct inet_diag_msg *r;
211 const unsigned char *previous_tail = skb_tail_pointer(skb); 216 struct nlmsghdr *nlh;
212 struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
213 unlh->nlmsg_type, sizeof(*r));
214 217
215 r = NLMSG_DATA(nlh); 218 nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
216 BUG_ON(tw->tw_state != TCP_TIME_WAIT); 219 nlmsg_flags);
220 if (!nlh)
221 return -EMSGSIZE;
217 222
218 nlh->nlmsg_flags = nlmsg_flags; 223 r = nlmsg_data(nlh);
224 BUG_ON(tw->tw_state != TCP_TIME_WAIT);
219 225
220 tmo = tw->tw_ttd - jiffies; 226 tmo = tw->tw_ttd - jiffies;
221 if (tmo < 0) 227 if (tmo < 0)
@@ -245,11 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
245 *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; 251 *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
246 } 252 }
247#endif 253#endif
248 nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; 254
249 return skb->len; 255 return nlmsg_end(skb, nlh);
250nlmsg_failure:
251 nlmsg_trim(skb, previous_tail);
252 return -EMSGSIZE;
253} 256}
254 257
255static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, 258static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -269,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
269 int err; 272 int err;
270 struct sock *sk; 273 struct sock *sk;
271 struct sk_buff *rep; 274 struct sk_buff *rep;
275 struct net *net = sock_net(in_skb->sk);
272 276
273 err = -EINVAL; 277 err = -EINVAL;
274 if (req->sdiag_family == AF_INET) { 278 if (req->sdiag_family == AF_INET) {
275 sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], 279 sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
276 req->id.idiag_dport, req->id.idiag_src[0], 280 req->id.idiag_dport, req->id.idiag_src[0],
277 req->id.idiag_sport, req->id.idiag_if); 281 req->id.idiag_sport, req->id.idiag_if);
278 } 282 }
279#if IS_ENABLED(CONFIG_IPV6) 283#if IS_ENABLED(CONFIG_IPV6)
280 else if (req->sdiag_family == AF_INET6) { 284 else if (req->sdiag_family == AF_INET6) {
281 sk = inet6_lookup(&init_net, hashinfo, 285 sk = inet6_lookup(net, hashinfo,
282 (struct in6_addr *)req->id.idiag_dst, 286 (struct in6_addr *)req->id.idiag_dst,
283 req->id.idiag_dport, 287 req->id.idiag_dport,
284 (struct in6_addr *)req->id.idiag_src, 288 (struct in6_addr *)req->id.idiag_src,
@@ -298,23 +302,23 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
298 if (err) 302 if (err)
299 goto out; 303 goto out;
300 304
301 err = -ENOMEM; 305 rep = nlmsg_new(sizeof(struct inet_diag_msg) +
302 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + 306 sizeof(struct inet_diag_meminfo) +
303 sizeof(struct inet_diag_meminfo) + 307 sizeof(struct tcp_info) + 64, GFP_KERNEL);
304 sizeof(struct tcp_info) + 64)), 308 if (!rep) {
305 GFP_KERNEL); 309 err = -ENOMEM;
306 if (!rep)
307 goto out; 310 goto out;
311 }
308 312
309 err = sk_diag_fill(sk, rep, req, 313 err = sk_diag_fill(sk, rep, req,
310 NETLINK_CB(in_skb).pid, 314 NETLINK_CB(in_skb).pid,
311 nlh->nlmsg_seq, 0, nlh); 315 nlh->nlmsg_seq, 0, nlh);
312 if (err < 0) { 316 if (err < 0) {
313 WARN_ON(err == -EMSGSIZE); 317 WARN_ON(err == -EMSGSIZE);
314 kfree_skb(rep); 318 nlmsg_free(rep);
315 goto out; 319 goto out;
316 } 320 }
317 err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, 321 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
318 MSG_DONTWAIT); 322 MSG_DONTWAIT);
319 if (err > 0) 323 if (err > 0)
320 err = 0; 324 err = 0;
@@ -592,15 +596,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
592{ 596{
593 const struct inet_request_sock *ireq = inet_rsk(req); 597 const struct inet_request_sock *ireq = inet_rsk(req);
594 struct inet_sock *inet = inet_sk(sk); 598 struct inet_sock *inet = inet_sk(sk);
595 unsigned char *b = skb_tail_pointer(skb);
596 struct inet_diag_msg *r; 599 struct inet_diag_msg *r;
597 struct nlmsghdr *nlh; 600 struct nlmsghdr *nlh;
598 long tmo; 601 long tmo;
599 602
600 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); 603 nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
601 nlh->nlmsg_flags = NLM_F_MULTI; 604 NLM_F_MULTI);
602 r = NLMSG_DATA(nlh); 605 if (!nlh)
606 return -EMSGSIZE;
603 607
608 r = nlmsg_data(nlh);
604 r->idiag_family = sk->sk_family; 609 r->idiag_family = sk->sk_family;
605 r->idiag_state = TCP_SYN_RECV; 610 r->idiag_state = TCP_SYN_RECV;
606 r->idiag_timer = 1; 611 r->idiag_timer = 1;
@@ -628,13 +633,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
628 *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; 633 *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
629 } 634 }
630#endif 635#endif
631 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
632
633 return skb->len;
634 636
635nlmsg_failure: 637 return nlmsg_end(skb, nlh);
636 nlmsg_trim(skb, b);
637 return -1;
638} 638}
639 639
640static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, 640static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
@@ -725,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
725{ 725{
726 int i, num; 726 int i, num;
727 int s_i, s_num; 727 int s_i, s_num;
728 struct net *net = sock_net(skb->sk);
728 729
729 s_i = cb->args[1]; 730 s_i = cb->args[1];
730 s_num = num = cb->args[2]; 731 s_num = num = cb->args[2];
@@ -744,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
744 sk_nulls_for_each(sk, node, &ilb->head) { 745 sk_nulls_for_each(sk, node, &ilb->head) {
745 struct inet_sock *inet = inet_sk(sk); 746 struct inet_sock *inet = inet_sk(sk);
746 747
748 if (!net_eq(sock_net(sk), net))
749 continue;
750
747 if (num < s_num) { 751 if (num < s_num) {
748 num++; 752 num++;
749 continue; 753 continue;
@@ -814,6 +818,8 @@ skip_listen_ht:
814 sk_nulls_for_each(sk, node, &head->chain) { 818 sk_nulls_for_each(sk, node, &head->chain) {
815 struct inet_sock *inet = inet_sk(sk); 819 struct inet_sock *inet = inet_sk(sk);
816 820
821 if (!net_eq(sock_net(sk), net))
822 continue;
817 if (num < s_num) 823 if (num < s_num)
818 goto next_normal; 824 goto next_normal;
819 if (!(r->idiag_states & (1 << sk->sk_state))) 825 if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -840,6 +846,8 @@ next_normal:
840 846
841 inet_twsk_for_each(tw, node, 847 inet_twsk_for_each(tw, node,
842 &head->twchain) { 848 &head->twchain) {
849 if (!net_eq(twsk_net(tw), net))
850 continue;
843 851
844 if (num < s_num) 852 if (num < s_num)
845 goto next_dying; 853 goto next_dying;
@@ -892,7 +900,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
892 if (nlmsg_attrlen(cb->nlh, hdrlen)) 900 if (nlmsg_attrlen(cb->nlh, hdrlen))
893 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); 901 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
894 902
895 return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc); 903 return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
896} 904}
897 905
898static inline int inet_diag_type2proto(int type) 906static inline int inet_diag_type2proto(int type)
@@ -909,7 +917,7 @@ static inline int inet_diag_type2proto(int type)
909 917
910static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) 918static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
911{ 919{
912 struct inet_diag_req *rc = NLMSG_DATA(cb->nlh); 920 struct inet_diag_req *rc = nlmsg_data(cb->nlh);
913 struct inet_diag_req_v2 req; 921 struct inet_diag_req_v2 req;
914 struct nlattr *bc = NULL; 922 struct nlattr *bc = NULL;
915 int hdrlen = sizeof(struct inet_diag_req); 923 int hdrlen = sizeof(struct inet_diag_req);
@@ -929,7 +937,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
929static int inet_diag_get_exact_compat(struct sk_buff *in_skb, 937static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
930 const struct nlmsghdr *nlh) 938 const struct nlmsghdr *nlh)
931{ 939{
932 struct inet_diag_req *rc = NLMSG_DATA(nlh); 940 struct inet_diag_req *rc = nlmsg_data(nlh);
933 struct inet_diag_req_v2 req; 941 struct inet_diag_req_v2 req;
934 942
935 req.sdiag_family = rc->idiag_family; 943 req.sdiag_family = rc->idiag_family;
@@ -944,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
944static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) 952static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
945{ 953{
946 int hdrlen = sizeof(struct inet_diag_req); 954 int hdrlen = sizeof(struct inet_diag_req);
955 struct net *net = sock_net(skb->sk);
947 956
948 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || 957 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
949 nlmsg_len(nlh) < hdrlen) 958 nlmsg_len(nlh) < hdrlen)
@@ -964,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
964 struct netlink_dump_control c = { 973 struct netlink_dump_control c = {
965 .dump = inet_diag_dump_compat, 974 .dump = inet_diag_dump_compat,
966 }; 975 };
967 return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c); 976 return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
968 } 977 }
969 } 978 }
970 979
@@ -974,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
974static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) 983static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
975{ 984{
976 int hdrlen = sizeof(struct inet_diag_req_v2); 985 int hdrlen = sizeof(struct inet_diag_req_v2);
986 struct net *net = sock_net(skb->sk);
977 987
978 if (nlmsg_len(h) < hdrlen) 988 if (nlmsg_len(h) < hdrlen)
979 return -EINVAL; 989 return -EINVAL;
@@ -992,11 +1002,11 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
992 struct netlink_dump_control c = { 1002 struct netlink_dump_control c = {
993 .dump = inet_diag_dump, 1003 .dump = inet_diag_dump,
994 }; 1004 };
995 return netlink_dump_start(sock_diag_nlsk, skb, h, &c); 1005 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
996 } 1006 }
997 } 1007 }
998 1008
999 return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h)); 1009 return inet_diag_get_exact(skb, h, nlmsg_data(h));
1000} 1010}
1001 1011
1002static const struct sock_diag_handler inet_diag_handler = { 1012static const struct sock_diag_handler inet_diag_handler = {
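
The inet_diag.c changes above are a mechanical conversion from the old overflow-goto macros (NLMSG_PUT, RTA_PUT_U8, INET_DIAG_PUT) to the nlmsg_*/nla_* helpers, which report overflow through return values and let the caller cancel a half-built message. The resulting fill pattern recurs throughout the file; a minimal sketch of it follows (the message type, attribute id and payload struct are placeholders, not part of this patch):

#include <net/netlink.h>

#define DEMO_MSG_TYPE	0x10	/* placeholder; must be >= NLMSG_MIN_TYPE */
#define DEMO_ATTR_FLAG	1	/* placeholder attribute id */

struct demo_msg {
	__u32	demo_field;
};

static int demo_fill(struct sk_buff *skb, u32 pid, u32 seq, int flags)
{
	struct nlmsghdr *nlh;
	struct demo_msg *r;

	nlh = nlmsg_put(skb, pid, seq, DEMO_MSG_TYPE, sizeof(*r), flags);
	if (!nlh)			/* header did not fit */
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->demo_field = 0;

	if (nla_put_u8(skb, DEMO_ATTR_FLAG, 1) < 0)
		goto errout;		/* attribute did not fit */

	return nlmsg_end(skb, nlh);	/* finalizes nlmsg_len */

errout:
	nlmsg_cancel(skb, nlh);		/* trims everything added since nlmsg_put() */
	return -EMSGSIZE;
}
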
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51b6d0c..85190e69297b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
243 if (q == NULL) 243 if (q == NULL)
244 return NULL; 244 return NULL;
245 245
246 q->net = nf;
246 f->constructor(q, arg); 247 f->constructor(q, arg);
247 atomic_add(f->qsize, &nf->mem); 248 atomic_add(f->qsize, &nf->mem);
248 setup_timer(&q->timer, f->frag_expire, (unsigned long)q); 249 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
249 spin_lock_init(&q->lock); 250 spin_lock_init(&q->lock);
250 atomic_set(&q->refcnt, 1); 251 atomic_set(&q->refcnt, 1);
251 q->net = nf;
252 252
253 return q; 253 return q;
254} 254}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index dfba343b2509..e1e0a4e8fd34 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -82,23 +82,39 @@ static const struct inet_peer peer_fake_node = {
82 .avl_height = 0 82 .avl_height = 0
83}; 83};
84 84
85struct inet_peer_base { 85void inet_peer_base_init(struct inet_peer_base *bp)
86 struct inet_peer __rcu *root; 86{
87 seqlock_t lock; 87 bp->root = peer_avl_empty_rcu;
88 int total; 88 seqlock_init(&bp->lock);
89}; 89 bp->flush_seq = ~0U;
90 bp->total = 0;
91}
92EXPORT_SYMBOL_GPL(inet_peer_base_init);
90 93
91static struct inet_peer_base v4_peers = { 94static atomic_t v4_seq = ATOMIC_INIT(0);
92 .root = peer_avl_empty_rcu, 95static atomic_t v6_seq = ATOMIC_INIT(0);
93 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
94 .total = 0,
95};
96 96
97static struct inet_peer_base v6_peers = { 97static atomic_t *inetpeer_seq_ptr(int family)
98 .root = peer_avl_empty_rcu, 98{
99 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), 99 return (family == AF_INET ? &v4_seq : &v6_seq);
100 .total = 0, 100}
101}; 101
102static inline void flush_check(struct inet_peer_base *base, int family)
103{
104 atomic_t *fp = inetpeer_seq_ptr(family);
105
106 if (unlikely(base->flush_seq != atomic_read(fp))) {
107 inetpeer_invalidate_tree(base);
108 base->flush_seq = atomic_read(fp);
109 }
110}
111
112void inetpeer_invalidate_family(int family)
113{
114 atomic_t *fp = inetpeer_seq_ptr(family);
115
116 atomic_inc(fp);
117}
102 118
103#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 119#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
104 120
@@ -110,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
110 126
111static void inetpeer_gc_worker(struct work_struct *work) 127static void inetpeer_gc_worker(struct work_struct *work)
112{ 128{
113 struct inet_peer *p, *n; 129 struct inet_peer *p, *n, *c;
114 LIST_HEAD(list); 130 LIST_HEAD(list);
115 131
116 spin_lock_bh(&gc_lock); 132 spin_lock_bh(&gc_lock);
@@ -122,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work)
122 138
123 list_for_each_entry_safe(p, n, &list, gc_list) { 139 list_for_each_entry_safe(p, n, &list, gc_list) {
124 140
125 if(need_resched()) 141 if (need_resched())
126 cond_resched(); 142 cond_resched();
127 143
128 if (p->avl_left != peer_avl_empty) { 144 c = rcu_dereference_protected(p->avl_left, 1);
129 list_add_tail(&p->avl_left->gc_list, &list); 145 if (c != peer_avl_empty) {
130 p->avl_left = peer_avl_empty; 146 list_add_tail(&c->gc_list, &list);
147 p->avl_left = peer_avl_empty_rcu;
131 } 148 }
132 149
133 if (p->avl_right != peer_avl_empty) { 150 c = rcu_dereference_protected(p->avl_right, 1);
134 list_add_tail(&p->avl_right->gc_list, &list); 151 if (c != peer_avl_empty) {
135 p->avl_right = peer_avl_empty; 152 list_add_tail(&c->gc_list, &list);
153 p->avl_right = peer_avl_empty_rcu;
136 } 154 }
137 155
138 n = list_entry(p->gc_list.next, struct inet_peer, gc_list); 156 n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
@@ -401,11 +419,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
401 call_rcu(&p->rcu, inetpeer_free_rcu); 419 call_rcu(&p->rcu, inetpeer_free_rcu);
402} 420}
403 421
404static struct inet_peer_base *family_to_base(int family)
405{
406 return family == AF_INET ? &v4_peers : &v6_peers;
407}
408
409/* perform garbage collect on all items stacked during a lookup */ 422/* perform garbage collect on all items stacked during a lookup */
410static int inet_peer_gc(struct inet_peer_base *base, 423static int inet_peer_gc(struct inet_peer_base *base,
411 struct inet_peer __rcu **stack[PEER_MAXDEPTH], 424 struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -443,14 +456,17 @@ static int inet_peer_gc(struct inet_peer_base *base,
443 return cnt; 456 return cnt;
444} 457}
445 458
446struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) 459struct inet_peer *inet_getpeer(struct inet_peer_base *base,
460 const struct inetpeer_addr *daddr,
461 int create)
447{ 462{
448 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 463 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
449 struct inet_peer_base *base = family_to_base(daddr->family);
450 struct inet_peer *p; 464 struct inet_peer *p;
451 unsigned int sequence; 465 unsigned int sequence;
452 int invalidated, gccnt = 0; 466 int invalidated, gccnt = 0;
453 467
468 flush_check(base, daddr->family);
469
454 /* Attempt a lockless lookup first. 470 /* Attempt a lockless lookup first.
455 * Because of a concurrent writer, we might not find an existing entry. 471 * Because of a concurrent writer, we might not find an existing entry.
456 */ 472 */
@@ -492,13 +508,9 @@ relookup:
492 (daddr->family == AF_INET) ? 508 (daddr->family == AF_INET) ?
493 secure_ip_id(daddr->addr.a4) : 509 secure_ip_id(daddr->addr.a4) :
494 secure_ipv6_id(daddr->addr.a6)); 510 secure_ipv6_id(daddr->addr.a6));
495 p->tcp_ts_stamp = 0;
496 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 511 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
497 p->rate_tokens = 0; 512 p->rate_tokens = 0;
498 p->rate_last = 0; 513 p->rate_last = 0;
499 p->pmtu_expires = 0;
500 p->pmtu_orig = 0;
501 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
502 INIT_LIST_HEAD(&p->gc_list); 514 INIT_LIST_HEAD(&p->gc_list);
503 515
504 /* Link the node. */ 516 /* Link the node. */
@@ -571,26 +583,19 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
571 schedule_delayed_work(&gc_work, gc_delay); 583 schedule_delayed_work(&gc_work, gc_delay);
572} 584}
573 585
574void inetpeer_invalidate_tree(int family) 586void inetpeer_invalidate_tree(struct inet_peer_base *base)
575{ 587{
576 struct inet_peer *old, *new, *prev; 588 struct inet_peer *root;
577 struct inet_peer_base *base = family_to_base(family);
578 589
579 write_seqlock_bh(&base->lock); 590 write_seqlock_bh(&base->lock);
580 591
581 old = base->root; 592 root = rcu_deref_locked(base->root, base);
582 if (old == peer_avl_empty_rcu) 593 if (root != peer_avl_empty) {
583 goto out; 594 base->root = peer_avl_empty_rcu;
584
585 new = peer_avl_empty_rcu;
586
587 prev = cmpxchg(&base->root, old, new);
588 if (prev == old) {
589 base->total = 0; 595 base->total = 0;
590 call_rcu(&prev->gc_rcu, inetpeer_inval_rcu); 596 call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
591 } 597 }
592 598
593out:
594 write_sequnlock_bh(&base->lock); 599 write_sequnlock_bh(&base->lock);
595} 600}
596EXPORT_SYMBOL(inetpeer_invalidate_tree); 601EXPORT_SYMBOL(inetpeer_invalidate_tree);
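
With family_to_base() removed, every caller now names the inet_peer_base it wants to search, and inetpeer_invalidate_family() merely bumps a per-family sequence counter; the tree behind a base is torn down lazily by flush_check() on the next inet_getpeer() call. A sketch of how the per-netns API fits together, assuming a base hung off net->ipv4.peers as the ip_fragment.c hunk below uses it (the allocation strategy here is illustrative, not necessarily what this series adopts verbatim):

/* Sketch only: give a namespace its own peer base and look a peer up. */
static int demo_peers_init(struct net *net)
{
	struct inet_peer_base *bp = kzalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);	/* empty root, fresh seqlock and flush_seq */
	net->ipv4.peers = bp;
	return 0;
}

static void demo_peer_touch(struct net *net, __be32 daddr)
{
	struct inet_peer *peer;

	/* flush_check() inside inet_getpeer() rebuilds the tree if
	 * inetpeer_invalidate_family(AF_INET) ran since the last lookup.
	 */
	peer = inet_getpeer_v4(net->ipv4.peers, daddr, 1);
	if (peer)
		inet_putpeer(peer);
}
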
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9dbd3dd6022d..7ad88e5e7110 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
171static void ip4_frag_init(struct inet_frag_queue *q, void *a) 171static void ip4_frag_init(struct inet_frag_queue *q, void *a)
172{ 172{
173 struct ipq *qp = container_of(q, struct ipq, q); 173 struct ipq *qp = container_of(q, struct ipq, q);
174 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
175 frags);
176 struct net *net = container_of(ipv4, struct net, ipv4);
177
174 struct ip4_create_arg *arg = a; 178 struct ip4_create_arg *arg = a;
175 179
176 qp->protocol = arg->iph->protocol; 180 qp->protocol = arg->iph->protocol;
@@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
180 qp->daddr = arg->iph->daddr; 184 qp->daddr = arg->iph->daddr;
181 qp->user = arg->user; 185 qp->user = arg->user;
182 qp->peer = sysctl_ipfrag_max_dist ? 186 qp->peer = sysctl_ipfrag_max_dist ?
183 inet_getpeer_v4(arg->iph->saddr, 1) : NULL; 187 inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
184} 188}
185 189
186static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 190static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -254,8 +258,8 @@ static void ip_expire(unsigned long arg)
254 /* skb dst is stale, drop it, and perform route lookup again */ 258 /* skb dst is stale, drop it, and perform route lookup again */
255 skb_dst_drop(head); 259 skb_dst_drop(head);
256 iph = ip_hdr(head); 260 iph = ip_hdr(head);
257 err = ip_route_input_noref(head, iph->daddr, iph->saddr, 261 err = ip_route_input(head, iph->daddr, iph->saddr,
258 iph->tos, head->dev); 262 iph->tos, head->dev);
259 if (err) 263 if (err)
260 goto out_rcu_unlock; 264 goto out_rcu_unlock;
261 265
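
ip4_frag_init() can only take this per-namespace detour because q->net is already valid when the constructor runs, which is exactly what the one-line reordering in inet_fragment.c above guarantees. The derivation itself is two container_of() steps; as a standalone helper it would look like this (the helper name is illustrative):

/* Recover the owning struct net from an IPv4 fragment queue:
 * q->net points into the netns_frags embedded in netns_ipv4, which is
 * itself embedded in struct net.
 */
static struct net *demo_frag_net(struct inet_frag_queue *q)
{
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags);

	return container_of(ipv4, struct net, ipv4);
}
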
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f49047b79609..b062a98574f2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -516,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
516 case ICMP_PORT_UNREACH: 516 case ICMP_PORT_UNREACH:
517 /* Impossible event. */ 517 /* Impossible event. */
518 return; 518 return;
519 case ICMP_FRAG_NEEDED:
520 /* Soft state for pmtu is maintained by IP core. */
521 return;
522 default: 519 default:
523 /* All others are translated to HOST_UNREACH. 520 /* All others are translated to HOST_UNREACH.
524 rfc2003 contains "deep thoughts" about NET_UNREACH, 521 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -531,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
531 if (code != ICMP_EXC_TTL) 528 if (code != ICMP_EXC_TTL)
532 return; 529 return;
533 break; 530 break;
531
532 case ICMP_REDIRECT:
533 break;
534 } 534 }
535 535
536 rcu_read_lock(); 536 rcu_read_lock();
@@ -538,7 +538,20 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
538 flags & GRE_KEY ? 538 flags & GRE_KEY ?
539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
540 p[1]); 540 p[1]);
541 if (t == NULL || t->parms.iph.daddr == 0 || 541 if (t == NULL)
542 goto out;
543
544 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
545 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
546 t->parms.link, 0, IPPROTO_GRE, 0);
547 goto out;
548 }
549 if (type == ICMP_REDIRECT) {
550 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
551 IPPROTO_GRE, 0);
552 goto out;
553 }
554 if (t->parms.iph.daddr == 0 ||
542 ipv4_is_multicast(t->parms.iph.daddr)) 555 ipv4_is_multicast(t->parms.iph.daddr))
543 goto out; 556 goto out;
544 557
@@ -753,7 +766,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
753 766
754 if (skb->protocol == htons(ETH_P_IP)) { 767 if (skb->protocol == htons(ETH_P_IP)) {
755 rt = skb_rtable(skb); 768 rt = skb_rtable(skb);
756 dst = rt->rt_gateway; 769 dst = rt_nexthop(rt, old_iph->daddr);
757 } 770 }
758#if IS_ENABLED(CONFIG_IPV6) 771#if IS_ENABLED(CONFIG_IPV6)
759 else if (skb->protocol == htons(ETH_P_IPV6)) { 772 else if (skb->protocol == htons(ETH_P_IPV6)) {
@@ -820,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
820 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 833 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
821 834
822 if (skb_dst(skb)) 835 if (skb_dst(skb))
823 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 836 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
824 837
825 if (skb->protocol == htons(ETH_P_IP)) { 838 if (skb->protocol == htons(ETH_P_IP)) {
826 df |= (old_iph->frag_off&htons(IP_DF)); 839 df |= (old_iph->frag_off&htons(IP_DF));
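
The routing core no longer maintains tunnel PMTU state behind the driver's back, so ipgre_err() now handles ICMP_FRAG_NEEDED and ICMP_REDIRECT itself through the new ipv4_update_pmtu()/ipv4_redirect() helpers, keyed on the tunnel's own link and protocol. The same skeleton shows up in the other encapsulations touched by this series; condensed (tunnel lookup and locking elided):

/* Condensed sketch of the per-tunnel ICMP error handling used above;
 * "t" is whatever tunnel the error was matched to.
 */
static void demo_tunnel_err(struct sk_buff *skb, struct ip_tunnel *t,
			    int type, int code, u32 info)
{
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		/* shrink the cached path MTU for this tunnel's flow */
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT)
		/* relearn the next hop for this tunnel's flow */
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
}
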
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144ca330..4ebc6feee250 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
198 rcu_read_lock(); 198 rcu_read_lock();
199 { 199 {
200 int protocol = ip_hdr(skb)->protocol; 200 int protocol = ip_hdr(skb)->protocol;
201 int hash, raw;
202 const struct net_protocol *ipprot; 201 const struct net_protocol *ipprot;
202 int raw;
203 203
204 resubmit: 204 resubmit:
205 raw = raw_local_deliver(skb, protocol); 205 raw = raw_local_deliver(skb, protocol);
206 206
207 hash = protocol & (MAX_INET_PROTOS - 1); 207 ipprot = rcu_dereference(inet_protos[protocol]);
208 ipprot = rcu_dereference(inet_protos[hash]);
209 if (ipprot != NULL) { 208 if (ipprot != NULL) {
210 int ret; 209 int ret;
211 210
@@ -314,26 +313,33 @@ drop:
314 return true; 313 return true;
315} 314}
316 315
316int sysctl_ip_early_demux __read_mostly = 1;
317
317static int ip_rcv_finish(struct sk_buff *skb) 318static int ip_rcv_finish(struct sk_buff *skb)
318{ 319{
319 const struct iphdr *iph = ip_hdr(skb); 320 const struct iphdr *iph = ip_hdr(skb);
320 struct rtable *rt; 321 struct rtable *rt;
321 322
323 if (sysctl_ip_early_demux && !skb_dst(skb)) {
324 const struct net_protocol *ipprot;
325 int protocol = iph->protocol;
326
327 rcu_read_lock();
328 ipprot = rcu_dereference(inet_protos[protocol]);
329 if (ipprot && ipprot->early_demux)
330 ipprot->early_demux(skb);
331 rcu_read_unlock();
332 }
333
322 /* 334 /*
323 * Initialise the virtual path cache for the packet. It describes 335 * Initialise the virtual path cache for the packet. It describes
324 * how the packet travels inside Linux networking. 336 * how the packet travels inside Linux networking.
325 */ 337 */
326 if (skb_dst(skb) == NULL) { 338 if (!skb_dst(skb)) {
327 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 339 int err = ip_route_input(skb, iph->daddr, iph->saddr,
328 iph->tos, skb->dev); 340 iph->tos, skb->dev);
329 if (unlikely(err)) { 341 if (unlikely(err)) {
330 if (err == -EHOSTUNREACH) 342 if (err == -EXDEV)
331 IP_INC_STATS_BH(dev_net(skb->dev),
332 IPSTATS_MIB_INADDRERRORS);
333 else if (err == -ENETUNREACH)
334 IP_INC_STATS_BH(dev_net(skb->dev),
335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
337 NET_INC_STATS_BH(dev_net(skb->dev), 343 NET_INC_STATS_BH(dev_net(skb->dev),
338 LINUX_MIB_IPRPFILTER); 344 LINUX_MIB_IPRPFILTER);
339 goto drop; 345 goto drop;
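
Two independent things happen in the ip_input.c hunks: the inet_protos[] lookup drops the MAX_INET_PROTOS masking (the array is simply indexed by the 8-bit protocol number), and ip_rcv_finish() gains an optional early-demux pass, gated by sysctl_ip_early_demux, that lets a transport protocol attach a socket and cached dst to the skb before the route lookup. A protocol opts in through the early_demux member of its struct net_protocol; a sketch assuming the int-returning form of the hook (ip_rcv_finish() ignores the return value either way, as the hunk shows):

/* Placeholder protocol: names and bodies are illustrative only. */
static int demo_early_demux(struct sk_buff *skb)
{
	/* e.g. look up an established socket, set skb->sk and its cached dst */
	return 0;
}

static int demo_rcv(struct sk_buff *skb)
{
	kfree_skb(skb);			/* normal per-packet input path */
	return 0;
}

static const struct net_protocol demo_protocol = {
	.early_demux	= demo_early_demux,
	.handler	= demo_rcv,
	.no_policy	= 1,
	.netns_ok	= 1,
};
/* registered with inet_add_protocol(&demo_protocol, <protocol number>) */
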
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 708b99494e23..1dc01f9793d5 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -27,6 +27,7 @@
27#include <net/icmp.h> 27#include <net/icmp.h>
28#include <net/route.h> 28#include <net/route.h>
29#include <net/cipso_ipv4.h> 29#include <net/cipso_ipv4.h>
30#include <net/ip_fib.h>
30 31
31/* 32/*
32 * Write options to IP header, record destination address to 33 * Write options to IP header, record destination address to
@@ -92,7 +93,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
92 unsigned char *sptr, *dptr; 93 unsigned char *sptr, *dptr;
93 int soffset, doffset; 94 int soffset, doffset;
94 int optlen; 95 int optlen;
95 __be32 daddr;
96 96
97 memset(dopt, 0, sizeof(struct ip_options)); 97 memset(dopt, 0, sizeof(struct ip_options));
98 98
@@ -104,8 +104,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
104 sptr = skb_network_header(skb); 104 sptr = skb_network_header(skb);
105 dptr = dopt->__data; 105 dptr = dopt->__data;
106 106
107 daddr = skb_rtable(skb)->rt_spec_dst;
108
109 if (sopt->rr) { 107 if (sopt->rr) {
110 optlen = sptr[sopt->rr+1]; 108 optlen = sptr[sopt->rr+1];
111 soffset = sptr[sopt->rr+2]; 109 soffset = sptr[sopt->rr+2];
@@ -179,6 +177,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
179 doffset -= 4; 177 doffset -= 4;
180 } 178 }
181 if (doffset > 3) { 179 if (doffset > 3) {
180 __be32 daddr = fib_compute_spec_dst(skb);
181
182 memcpy(&start[doffset-1], &daddr, 4); 182 memcpy(&start[doffset-1], &daddr, 4);
183 dopt->faddr = faddr; 183 dopt->faddr = faddr;
184 dptr[0] = start[0]; 184 dptr[0] = start[0];
@@ -241,6 +241,15 @@ void ip_options_fragment(struct sk_buff *skb)
241 opt->ts_needtime = 0; 241 opt->ts_needtime = 0;
242} 242}
243 243
244/* helper used by ip_options_compile() to call fib_compute_spec_dst()
245 * at most one time.
246 */
247static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
248{
249 if (*spec_dst == htonl(INADDR_ANY))
250 *spec_dst = fib_compute_spec_dst(skb);
251}
252
244/* 253/*
245 * Verify options and fill pointers in struct options. 254 * Verify options and fill pointers in struct options.
246 * Caller should clear *opt, and set opt->data. 255 * Caller should clear *opt, and set opt->data.
@@ -250,12 +259,12 @@ void ip_options_fragment(struct sk_buff *skb)
250int ip_options_compile(struct net *net, 259int ip_options_compile(struct net *net,
251 struct ip_options *opt, struct sk_buff *skb) 260 struct ip_options *opt, struct sk_buff *skb)
252{ 261{
253 int l; 262 __be32 spec_dst = htonl(INADDR_ANY);
254 unsigned char *iph;
255 unsigned char *optptr;
256 int optlen;
257 unsigned char *pp_ptr = NULL; 263 unsigned char *pp_ptr = NULL;
258 struct rtable *rt = NULL; 264 struct rtable *rt = NULL;
265 unsigned char *optptr;
266 unsigned char *iph;
267 int optlen, l;
259 268
260 if (skb != NULL) { 269 if (skb != NULL) {
261 rt = skb_rtable(skb); 270 rt = skb_rtable(skb);
@@ -331,7 +340,8 @@ int ip_options_compile(struct net *net,
331 goto error; 340 goto error;
332 } 341 }
333 if (rt) { 342 if (rt) {
334 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 343 spec_dst_fill(&spec_dst, skb);
344 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
335 opt->is_changed = 1; 345 opt->is_changed = 1;
336 } 346 }
337 optptr[2] += 4; 347 optptr[2] += 4;
@@ -373,7 +383,8 @@ int ip_options_compile(struct net *net,
373 } 383 }
374 opt->ts = optptr - iph; 384 opt->ts = optptr - iph;
375 if (rt) { 385 if (rt) {
376 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 386 spec_dst_fill(&spec_dst, skb);
387 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
377 timeptr = &optptr[optptr[2]+3]; 388 timeptr = &optptr[optptr[2]+3];
378 } 389 }
379 opt->ts_needaddr = 1; 390 opt->ts_needaddr = 1;
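
Since rt_spec_dst no longer exists on the cached route, ip_options_compile() derives the specific destination on demand: spec_dst starts as INADDR_ANY and spec_dst_fill() resolves it through fib_compute_spec_dst() at most once per packet, however many record-route or timestamp slots ask for it. The same compute-once idiom in isolation (names illustrative):

/* Resolve the specific destination lazily, caching it in *cached. */
static __be32 demo_spec_dst(struct sk_buff *skb, __be32 *cached)
{
	if (*cached == htonl(INADDR_ANY))
		*cached = fib_compute_spec_dst(skb);	/* one FIB lookup at most */
	return *cached;
}
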
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 451f97c42eb4..ba39a52d18c1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb)
113} 113}
114EXPORT_SYMBOL_GPL(ip_local_out); 114EXPORT_SYMBOL_GPL(ip_local_out);
115 115
116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
119 skb_reset_mac_header(newskb);
120 __skb_pull(newskb, skb_network_offset(newskb));
121 newskb->pkt_type = PACKET_LOOPBACK;
122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
123 WARN_ON(!skb_dst(newskb));
124 skb_dst_force(newskb);
125 netif_rx_ni(newskb);
126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 116static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{ 117{
131 int ttl = inet->uc_ttl; 118 int ttl = inet->uc_ttl;
@@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
183 struct net_device *dev = dst->dev; 170 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev); 171 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh; 172 struct neighbour *neigh;
173 u32 nexthop;
186 174
187 if (rt->rt_type == RTN_MULTICAST) { 175 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -200,19 +188,22 @@ static inline int ip_finish_output2(struct sk_buff *skb)
200 } 188 }
201 if (skb->sk) 189 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk); 190 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb); 191 consume_skb(skb);
204 skb = skb2; 192 skb = skb2;
205 } 193 }
206 194
207 rcu_read_lock(); 195 rcu_read_lock_bh();
208 neigh = dst_get_neighbour_noref(dst); 196 nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
198 if (unlikely(!neigh))
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
209 if (neigh) { 200 if (neigh) {
210 int res = neigh_output(neigh, skb); 201 int res = dst_neigh_output(dst, neigh, skb);
211 202
212 rcu_read_unlock(); 203 rcu_read_unlock_bh();
213 return res; 204 return res;
214 } 205 }
215 rcu_read_unlock(); 206 rcu_read_unlock_bh();
216 207
217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n", 208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 __func__); 209 __func__);
@@ -281,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb)
281 if (newskb) 272 if (newskb)
282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 273 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev, 274 newskb, NULL, newskb->dev,
284 ip_dev_loopback_xmit); 275 dev_loopback_xmit);
285 } 276 }
286 277
287 /* Multicasts with ttl 0 must not go beyond the host */ 278 /* Multicasts with ttl 0 must not go beyond the host */
@@ -296,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb)
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 287 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb) 288 if (newskb)
298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 289 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit); 290 NULL, newskb->dev, dev_loopback_xmit);
300 } 291 }
301 292
302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 293 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -380,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
380 skb_dst_set_noref(skb, &rt->dst); 371 skb_dst_set_noref(skb, &rt->dst);
381 372
382packet_routed: 373packet_routed:
383 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 374 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
384 goto no_route; 375 goto no_route;
385 376
386 /* OK, we know where to send it, allocate and build IP header. */ 377 /* OK, we know where to send it, allocate and build IP header. */
@@ -709,7 +700,7 @@ slow_path:
709 700
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 701 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
711 } 702 }
712 kfree_skb(skb); 703 consume_skb(skb);
713 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 704 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
714 return err; 705 return err;
715 706
@@ -1472,19 +1463,34 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1472 1463
1473/* 1464/*
1474 * Generic function to send a packet as reply to another packet. 1465 * Generic function to send a packet as reply to another packet.
1475 * Used to send TCP resets so far. ICMP should use this function too. 1466 * Used to send some TCP resets/acks so far.
1476 * 1467 *
1477 * Should run single threaded per socket because it uses the sock 1468 * Use a fake percpu inet socket to avoid false sharing and contention.
1478 * structure to pass arguments.
1479 */ 1469 */
1480void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 1470static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
1481 const struct ip_reply_arg *arg, unsigned int len) 1471 .sk = {
1472 .__sk_common = {
1473 .skc_refcnt = ATOMIC_INIT(1),
1474 },
1475 .sk_wmem_alloc = ATOMIC_INIT(1),
1476 .sk_allocation = GFP_ATOMIC,
1477 .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
1478 },
1479 .pmtudisc = IP_PMTUDISC_WANT,
1480 .uc_ttl = -1,
1481};
1482
1483void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1484 __be32 saddr, const struct ip_reply_arg *arg,
1485 unsigned int len)
1482{ 1486{
1483 struct inet_sock *inet = inet_sk(sk);
1484 struct ip_options_data replyopts; 1487 struct ip_options_data replyopts;
1485 struct ipcm_cookie ipc; 1488 struct ipcm_cookie ipc;
1486 struct flowi4 fl4; 1489 struct flowi4 fl4;
1487 struct rtable *rt = skb_rtable(skb); 1490 struct rtable *rt = skb_rtable(skb);
1491 struct sk_buff *nskb;
1492 struct sock *sk;
1493 struct inet_sock *inet;
1488 1494
1489 if (ip_options_echo(&replyopts.opt.opt, skb)) 1495 if (ip_options_echo(&replyopts.opt.opt, skb))
1490 return; 1496 return;
@@ -1502,38 +1508,39 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1502 1508
1503 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1509 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1504 RT_TOS(arg->tos), 1510 RT_TOS(arg->tos),
1505 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1511 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1506 ip_reply_arg_flowi_flags(arg), 1512 ip_reply_arg_flowi_flags(arg),
1507 daddr, rt->rt_spec_dst, 1513 daddr, saddr,
1508 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1514 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1515 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 rt = ip_route_output_key(sock_net(sk), &fl4); 1516 rt = ip_route_output_key(net, &fl4);
1511 if (IS_ERR(rt)) 1517 if (IS_ERR(rt))
1512 return; 1518 return;
1513 1519
1514 /* And let IP do all the hard work. 1520 inet = &get_cpu_var(unicast_sock);
1515 1521
1516 This chunk is not reenterable, hence spinlock.
1517 Note that it uses the fact, that this function is called
1518 with locally disabled BH and that sk cannot be already spinlocked.
1519 */
1520 bh_lock_sock(sk);
1521 inet->tos = arg->tos; 1522 inet->tos = arg->tos;
1523 sk = &inet->sk;
1522 sk->sk_priority = skb->priority; 1524 sk->sk_priority = skb->priority;
1523 sk->sk_protocol = ip_hdr(skb)->protocol; 1525 sk->sk_protocol = ip_hdr(skb)->protocol;
1524 sk->sk_bound_dev_if = arg->bound_dev_if; 1526 sk->sk_bound_dev_if = arg->bound_dev_if;
1527 sock_net_set(sk, net);
1528 __skb_queue_head_init(&sk->sk_write_queue);
1529 sk->sk_sndbuf = sysctl_wmem_default;
1525 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1530 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1526 &ipc, &rt, MSG_DONTWAIT); 1531 &ipc, &rt, MSG_DONTWAIT);
1527 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1532 nskb = skb_peek(&sk->sk_write_queue);
1533 if (nskb) {
1528 if (arg->csumoffset >= 0) 1534 if (arg->csumoffset >= 0)
1529 *((__sum16 *)skb_transport_header(skb) + 1535 *((__sum16 *)skb_transport_header(nskb) +
1530 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1536 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1531 arg->csum)); 1537 arg->csum));
1532 skb->ip_summed = CHECKSUM_NONE; 1538 nskb->ip_summed = CHECKSUM_NONE;
1539 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1533 ip_push_pending_frames(sk, &fl4); 1540 ip_push_pending_frames(sk, &fl4);
1534 } 1541 }
1535 1542
1536 bh_unlock_sock(sk); 1543 put_cpu_var(unicast_sock);
1537 1544
1538 ip_rt_put(rt); 1545 ip_rt_put(rt);
1539} 1546}
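
ip_send_reply() used to push every TCP reset/ack reply through the caller's socket under bh_lock_sock(); ip_send_unicast_reply() instead borrows a statically initialised per-cpu inet_sock, so replies issued on different CPUs never contend on a lock or share cache lines. The borrow/return discipline it leans on is plain get_cpu_var()/put_cpu_var(); in isolation (the struct and the work done on it are placeholders):

#include <linux/percpu.h>

struct demo_ctx {
	int	scratch;
};

static DEFINE_PER_CPU(struct demo_ctx, demo_ctx);

static void demo_use_percpu_ctx(void)
{
	/* get_cpu_var() disables preemption, so this CPU's instance is
	 * ours alone until put_cpu_var().
	 */
	struct demo_ctx *ctx = &get_cpu_var(demo_ctx);

	ctx->scratch++;
	put_cpu_var(demo_ctx);
}
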
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0d11f234d615..5eea4a811042 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -40,6 +40,7 @@
40#if IS_ENABLED(CONFIG_IPV6) 40#if IS_ENABLED(CONFIG_IPV6)
41#include <net/transp_v6.h> 41#include <net/transp_v6.h>
42#endif 42#endif
43#include <net/ip_fib.h>
43 44
44#include <linux/errqueue.h> 45#include <linux/errqueue.h>
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
@@ -1019,18 +1020,17 @@ e_inval:
1019 * @sk: socket 1020 * @sk: socket
1020 * @skb: buffer 1021 * @skb: buffer
1021 * 1022 *
1022 * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst 1023 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
1023 * in skb->cb[] before dst drop. 1024 * destination in skb->cb[] before dst drop.
1024 * This way, receiver doesnt make cache line misses to read rtable. 1025 * This way, receiver doesnt make cache line misses to read rtable.
1025 */ 1026 */
1026void ipv4_pktinfo_prepare(struct sk_buff *skb) 1027void ipv4_pktinfo_prepare(struct sk_buff *skb)
1027{ 1028{
1028 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); 1029 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
1029 const struct rtable *rt = skb_rtable(skb);
1030 1030
1031 if (rt) { 1031 if (skb_rtable(skb)) {
1032 pktinfo->ipi_ifindex = rt->rt_iif; 1032 pktinfo->ipi_ifindex = inet_iif(skb);
1033 pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; 1033 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1034 } else { 1034 } else {
1035 pktinfo->ipi_ifindex = 0; 1035 pktinfo->ipi_ifindex = 0;
1036 pktinfo->ipi_spec_dst.s_addr = 0; 1036 pktinfo->ipi_spec_dst.s_addr = 0;
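
ipv4_pktinfo_prepare() still hands IP_CMSG_PKTINFO the same two values as before, the incoming interface and the specific destination; it just computes them with inet_iif() and fib_compute_spec_dst() now that rt_iif/rt_spec_dst are gone from the rtable. Nothing changes for userspace; for reference, this is the consumer those fields reach (a small, self-contained receive loop; the port number is an arbitrary example):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0), on = 1;
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_port = htons(5000),		/* example port */
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	char data[1500], cbuf[256], dst[INET_ADDRSTRLEN];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
	if (recvmsg(fd, &msg, 0) < 0)
		return 1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(cmsg);

			/* ipi_ifindex <- inet_iif(), ipi_spec_dst <- fib_compute_spec_dst() */
			inet_ntop(AF_INET, &pi->ipi_spec_dst, dst, sizeof(dst));
			printf("ifindex %d spec_dst %s\n", pi->ipi_ifindex, dst);
		}
	}
	close(fd);
	return 0;
}
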
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 000000000000..3511ffba7bd4
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,956 @@
1/*
2 * Linux NET3: IP/IP protocol decoder modified to support
3 * virtual tunnel interface
4 *
5 * Authors:
6 * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 */
14
15/*
16 This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
17
18 For comments look at net/ipv4/ip_gre.c --ANK
19 */
20
21
22#include <linux/capability.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/netfilter_ipv4.h>
36#include <linux/if_ether.h>
37
38#include <net/sock.h>
39#include <net/ip.h>
40#include <net/icmp.h>
41#include <net/ipip.h>
42#include <net/inet_ecn.h>
43#include <net/xfrm.h>
44#include <net/net_namespace.h>
45#include <net/netns/generic.h>
46
47#define HASH_SIZE 16
48#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
49
50static struct rtnl_link_ops vti_link_ops __read_mostly;
51
52static int vti_net_id __read_mostly;
53struct vti_net {
54 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
55 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
56 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
57 struct ip_tunnel __rcu *tunnels_wc[1];
58 struct ip_tunnel __rcu **tunnels[4];
59
60 struct net_device *fb_tunnel_dev;
61};
62
63static int vti_fb_tunnel_init(struct net_device *dev);
64static int vti_tunnel_init(struct net_device *dev);
65static void vti_tunnel_setup(struct net_device *dev);
66static void vti_dev_free(struct net_device *dev);
67static int vti_tunnel_bind_dev(struct net_device *dev);
68
69/* Locking : hash tables are protected by RCU and RTNL */
70
71#define for_each_ip_tunnel_rcu(start) \
72 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
73
74/* often modified stats are per cpu, other are shared (netdev->stats) */
75struct pcpu_tstats {
76 u64 rx_packets;
77 u64 rx_bytes;
78 u64 tx_packets;
79 u64 tx_bytes;
80 struct u64_stats_sync syncp;
81};
82
83#define VTI_XMIT(stats1, stats2) do { \
84 int err; \
85 int pkt_len = skb->len; \
86 err = dst_output(skb); \
87 if (net_xmit_eval(err) == 0) { \
88 u64_stats_update_begin(&(stats1)->syncp); \
89 (stats1)->tx_bytes += pkt_len; \
90 (stats1)->tx_packets++; \
91 u64_stats_update_end(&(stats1)->syncp); \
92 } else { \
93 (stats2)->tx_errors++; \
94 (stats2)->tx_aborted_errors++; \
95 } \
96} while (0)
97
98
99static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
100 struct rtnl_link_stats64 *tot)
101{
102 int i;
103
104 for_each_possible_cpu(i) {
105 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
106 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
107 unsigned int start;
108
109 do {
110 start = u64_stats_fetch_begin_bh(&tstats->syncp);
111 rx_packets = tstats->rx_packets;
112 tx_packets = tstats->tx_packets;
113 rx_bytes = tstats->rx_bytes;
114 tx_bytes = tstats->tx_bytes;
115 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
116
117 tot->rx_packets += rx_packets;
118 tot->tx_packets += tx_packets;
119 tot->rx_bytes += rx_bytes;
120 tot->tx_bytes += tx_bytes;
121 }
122
123 tot->multicast = dev->stats.multicast;
124 tot->rx_crc_errors = dev->stats.rx_crc_errors;
125 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
126 tot->rx_length_errors = dev->stats.rx_length_errors;
127 tot->rx_errors = dev->stats.rx_errors;
128 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
129 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
130 tot->tx_dropped = dev->stats.tx_dropped;
131 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
132 tot->tx_errors = dev->stats.tx_errors;
133
134 return tot;
135}
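
vti_get_stats64() is the reader half of the usual per-cpu 64-bit stats scheme: each CPU bumps its own pcpu_tstats inside u64_stats_update_begin()/end(), and the aggregator loops on u64_stats_fetch_begin_bh()/u64_stats_fetch_retry_bh() until it observes a consistent snapshot (the seqcount only does real work on 32-bit, where a u64 cannot be read atomically). The matching writer side, as vti_rcv() uses further down, reduced to its essentials:

/* Writer-side sketch for the per-cpu tstats aggregated above. */
static void demo_count_rx(struct net_device *dev, unsigned int len)
{
	struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += len;
	u64_stats_update_end(&tstats->syncp);
}
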
136
137static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
138 __be32 remote, __be32 local)
139{
140 unsigned h0 = HASH(remote);
141 unsigned h1 = HASH(local);
142 struct ip_tunnel *t;
143 struct vti_net *ipn = net_generic(net, vti_net_id);
144
145 for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
146 if (local == t->parms.iph.saddr &&
147 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
148 return t;
149 for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
150 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
151 return t;
152
153 for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
154 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
155 return t;
156
157 for_each_ip_tunnel_rcu(ipn->tunnels_wc[0])
158 if (t && (t->dev->flags&IFF_UP))
159 return t;
160 return NULL;
161}
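
Like ipip and GRE, VTI keeps four hash tables selected by which endpoint addresses are configured (prio 0: both wildcarded, 1: local only, 2: remote only, 3: both set) and hashes each address by xoring it with itself shifted right four bits and keeping the low four bits. A tiny standalone program reproducing the bucket selection done by __vti_bucket() just below (the addresses are documentation examples, not a real configuration):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define HASH_SIZE 16
#define HASH(addr) ((((uint32_t)(addr)) ^ (((uint32_t)(addr)) >> 4)) & (HASH_SIZE - 1))

int main(void)
{
	/* network-byte-order values, as the kernel hashes them raw */
	uint32_t remote = inet_addr("192.0.2.1");	/* 0 if wildcarded */
	uint32_t local  = inet_addr("198.51.100.2");	/* 0 if wildcarded */
	unsigned int h = 0, prio = 0;

	if (remote) { prio |= 2; h ^= HASH(remote); }
	if (local)  { prio |= 1; h ^= HASH(local);  }

	printf("table %u, bucket %u of %u\n", prio, h, HASH_SIZE);
	return 0;
}
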
162
163static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
164 struct ip_tunnel_parm *parms)
165{
166 __be32 remote = parms->iph.daddr;
167 __be32 local = parms->iph.saddr;
168 unsigned h = 0;
169 int prio = 0;
170
171 if (remote) {
172 prio |= 2;
173 h ^= HASH(remote);
174 }
175 if (local) {
176 prio |= 1;
177 h ^= HASH(local);
178 }
179 return &ipn->tunnels[prio][h];
180}
181
182static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
183 struct ip_tunnel *t)
184{
185 return __vti_bucket(ipn, &t->parms);
186}
187
188static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
189{
190 struct ip_tunnel __rcu **tp;
191 struct ip_tunnel *iter;
192
193 for (tp = vti_bucket(ipn, t);
194 (iter = rtnl_dereference(*tp)) != NULL;
195 tp = &iter->next) {
196 if (t == iter) {
197 rcu_assign_pointer(*tp, t->next);
198 break;
199 }
200 }
201}
202
203static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
204{
205 struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
206
207 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
208 rcu_assign_pointer(*tp, t);
209}
210
211static struct ip_tunnel *vti_tunnel_locate(struct net *net,
212 struct ip_tunnel_parm *parms,
213 int create)
214{
215 __be32 remote = parms->iph.daddr;
216 __be32 local = parms->iph.saddr;
217 struct ip_tunnel *t, *nt;
218 struct ip_tunnel __rcu **tp;
219 struct net_device *dev;
220 char name[IFNAMSIZ];
221 struct vti_net *ipn = net_generic(net, vti_net_id);
222
223 for (tp = __vti_bucket(ipn, parms);
224 (t = rtnl_dereference(*tp)) != NULL;
225 tp = &t->next) {
226 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
227 return t;
228 }
229 if (!create)
230 return NULL;
231
232 if (parms->name[0])
233 strlcpy(name, parms->name, IFNAMSIZ);
234 else
235 strcpy(name, "vti%d");
236
237 dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
238 if (dev == NULL)
239 return NULL;
240
241 dev_net_set(dev, net);
242
243 nt = netdev_priv(dev);
244 nt->parms = *parms;
245 dev->rtnl_link_ops = &vti_link_ops;
246
247 vti_tunnel_bind_dev(dev);
248
249 if (register_netdevice(dev) < 0)
250 goto failed_free;
251
252 dev_hold(dev);
253 vti_tunnel_link(ipn, nt);
254 return nt;
255
256failed_free:
257 free_netdev(dev);
258 return NULL;
259}
260
261static void vti_tunnel_uninit(struct net_device *dev)
262{
263 struct net *net = dev_net(dev);
264 struct vti_net *ipn = net_generic(net, vti_net_id);
265
266 vti_tunnel_unlink(ipn, netdev_priv(dev));
267 dev_put(dev);
268}
269
270static int vti_err(struct sk_buff *skb, u32 info)
271{
272
273 /* All the routers (except for Linux) return only
274 * 8 bytes of packet payload. It means, that precise relaying of
275 * ICMP in the real Internet is absolutely infeasible.
276 */
277 struct iphdr *iph = (struct iphdr *)skb->data;
278 const int type = icmp_hdr(skb)->type;
279 const int code = icmp_hdr(skb)->code;
280 struct ip_tunnel *t;
281 int err;
282
283 switch (type) {
284 default:
285 case ICMP_PARAMETERPROB:
286 return 0;
287
288 case ICMP_DEST_UNREACH:
289 switch (code) {
290 case ICMP_SR_FAILED:
291 case ICMP_PORT_UNREACH:
292 /* Impossible event. */
293 return 0;
294 default:
295 /* All others are translated to HOST_UNREACH. */
296 break;
297 }
298 break;
299 case ICMP_TIME_EXCEEDED:
300 if (code != ICMP_EXC_TTL)
301 return 0;
302 break;
303 }
304
305 err = -ENOENT;
306
307 rcu_read_lock();
308 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
309 if (t == NULL)
310 goto out;
311
312 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
313 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
314 t->parms.link, 0, IPPROTO_IPIP, 0);
315 err = 0;
316 goto out;
317 }
318
319 err = 0;
320 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
321 goto out;
322
323 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
324 t->err_count++;
325 else
326 t->err_count = 1;
327 t->err_time = jiffies;
328out:
329 rcu_read_unlock();
330 return err;
331}
332
333/* We don't digest the packet, therefore let the packet pass */
334static int vti_rcv(struct sk_buff *skb)
335{
336 struct ip_tunnel *tunnel;
337 const struct iphdr *iph = ip_hdr(skb);
338
339 rcu_read_lock();
340 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
341 if (tunnel != NULL) {
342 struct pcpu_tstats *tstats;
343
344 tstats = this_cpu_ptr(tunnel->dev->tstats);
345 u64_stats_update_begin(&tstats->syncp);
346 tstats->rx_packets++;
347 tstats->rx_bytes += skb->len;
348 u64_stats_update_end(&tstats->syncp);
349
350 skb->dev = tunnel->dev;
351 rcu_read_unlock();
352 return 1;
353 }
354 rcu_read_unlock();
355
356 return -1;
357}
358
359/* This function assumes it is being called from dev_queue_xmit()
360 * and that skb is filled properly by that function.
361 */
362
363static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
364{
365 struct ip_tunnel *tunnel = netdev_priv(dev);
366 struct pcpu_tstats *tstats;
367 struct iphdr *tiph = &tunnel->parms.iph;
368 u8 tos;
369 struct rtable *rt; /* Route to the other host */
370 struct net_device *tdev; /* Device to other host */
371 struct iphdr *old_iph = ip_hdr(skb);
372 __be32 dst = tiph->daddr;
373 struct flowi4 fl4;
374
375 if (skb->protocol != htons(ETH_P_IP))
376 goto tx_error;
377
378 tos = old_iph->tos;
379
380 memset(&fl4, 0, sizeof(fl4));
381 flowi4_init_output(&fl4, tunnel->parms.link,
382 htonl(tunnel->parms.i_key), RT_TOS(tos),
383 RT_SCOPE_UNIVERSE,
384 IPPROTO_IPIP, 0,
385 dst, tiph->saddr, 0, 0);
386 rt = ip_route_output_key(dev_net(dev), &fl4);
387 if (IS_ERR(rt)) {
388 dev->stats.tx_carrier_errors++;
389 goto tx_error_icmp;
390 }
391 /* if there is no transform then this tunnel is not functional.
392 * Or if the xfrm is not mode tunnel.
393 */
394 if (!rt->dst.xfrm ||
395 rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
396 dev->stats.tx_carrier_errors++;
397 goto tx_error_icmp;
398 }
399 tdev = rt->dst.dev;
400
401 if (tdev == dev) {
402 ip_rt_put(rt);
403 dev->stats.collisions++;
404 goto tx_error;
405 }
406
407 if (tunnel->err_count > 0) {
408 if (time_before(jiffies,
409 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
410 tunnel->err_count--;
411 dst_link_failure(skb);
412 } else
413 tunnel->err_count = 0;
414 }
415
416 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
417 IPSKB_REROUTED);
418 skb_dst_drop(skb);
419 skb_dst_set(skb, &rt->dst);
420 nf_reset(skb);
421 skb->dev = skb_dst(skb)->dev;
422
423 tstats = this_cpu_ptr(dev->tstats);
424 VTI_XMIT(tstats, &dev->stats);
425 return NETDEV_TX_OK;
426
427tx_error_icmp:
428 dst_link_failure(skb);
429tx_error:
430 dev->stats.tx_errors++;
431 dev_kfree_skb(skb);
432 return NETDEV_TX_OK;
433}
434
435static int vti_tunnel_bind_dev(struct net_device *dev)
436{
437 struct net_device *tdev = NULL;
438 struct ip_tunnel *tunnel;
439 struct iphdr *iph;
440
441 tunnel = netdev_priv(dev);
442 iph = &tunnel->parms.iph;
443
444 if (iph->daddr) {
445 struct rtable *rt;
446 struct flowi4 fl4;
447 memset(&fl4, 0, sizeof(fl4));
448 flowi4_init_output(&fl4, tunnel->parms.link,
449 htonl(tunnel->parms.i_key),
450 RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
451 IPPROTO_IPIP, 0,
452 iph->daddr, iph->saddr, 0, 0);
453 rt = ip_route_output_key(dev_net(dev), &fl4);
454 if (!IS_ERR(rt)) {
455 tdev = rt->dst.dev;
456 ip_rt_put(rt);
457 }
458 dev->flags |= IFF_POINTOPOINT;
459 }
460
461 if (!tdev && tunnel->parms.link)
462 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
463
464 if (tdev) {
465 dev->hard_header_len = tdev->hard_header_len +
466 sizeof(struct iphdr);
467 dev->mtu = tdev->mtu;
468 }
469 dev->iflink = tunnel->parms.link;
470 return dev->mtu;
471}
472
473static int
474vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
475{
476 int err = 0;
477 struct ip_tunnel_parm p;
478 struct ip_tunnel *t;
479 struct net *net = dev_net(dev);
480 struct vti_net *ipn = net_generic(net, vti_net_id);
481
482 switch (cmd) {
483 case SIOCGETTUNNEL:
484 t = NULL;
485 if (dev == ipn->fb_tunnel_dev) {
486 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
487 sizeof(p))) {
488 err = -EFAULT;
489 break;
490 }
491 t = vti_tunnel_locate(net, &p, 0);
492 }
493 if (t == NULL)
494 t = netdev_priv(dev);
495 memcpy(&p, &t->parms, sizeof(p));
496 p.i_flags |= GRE_KEY | VTI_ISVTI;
497 p.o_flags |= GRE_KEY;
498 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
499 err = -EFAULT;
500 break;
501
502 case SIOCADDTUNNEL:
503 case SIOCCHGTUNNEL:
504 err = -EPERM;
505 if (!capable(CAP_NET_ADMIN))
506 goto done;
507
508 err = -EFAULT;
509 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
510 goto done;
511
512 err = -EINVAL;
513 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
514 p.iph.ihl != 5)
515 goto done;
516
517 t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
518
519 if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
520 if (t != NULL) {
521 if (t->dev != dev) {
522 err = -EEXIST;
523 break;
524 }
525 } else {
526 if (((dev->flags&IFF_POINTOPOINT) &&
527 !p.iph.daddr) ||
528 (!(dev->flags&IFF_POINTOPOINT) &&
529 p.iph.daddr)) {
530 err = -EINVAL;
531 break;
532 }
533 t = netdev_priv(dev);
534 vti_tunnel_unlink(ipn, t);
535 synchronize_net();
536 t->parms.iph.saddr = p.iph.saddr;
537 t->parms.iph.daddr = p.iph.daddr;
538 t->parms.i_key = p.i_key;
539 t->parms.o_key = p.o_key;
540 t->parms.iph.protocol = IPPROTO_IPIP;
541 memcpy(dev->dev_addr, &p.iph.saddr, 4);
542 memcpy(dev->broadcast, &p.iph.daddr, 4);
543 vti_tunnel_link(ipn, t);
544 netdev_state_change(dev);
545 }
546 }
547
548 if (t) {
549 err = 0;
550 if (cmd == SIOCCHGTUNNEL) {
551 t->parms.i_key = p.i_key;
552 t->parms.o_key = p.o_key;
553 if (t->parms.link != p.link) {
554 t->parms.link = p.link;
555 vti_tunnel_bind_dev(dev);
556 netdev_state_change(dev);
557 }
558 }
559 p.i_flags |= GRE_KEY | VTI_ISVTI;
560 p.o_flags |= GRE_KEY;
561 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
562 sizeof(p)))
563 err = -EFAULT;
564 } else
565 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
566 break;
567
568 case SIOCDELTUNNEL:
569 err = -EPERM;
570 if (!capable(CAP_NET_ADMIN))
571 goto done;
572
573 if (dev == ipn->fb_tunnel_dev) {
574 err = -EFAULT;
575 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
576 sizeof(p)))
577 goto done;
578 err = -ENOENT;
579
580 t = vti_tunnel_locate(net, &p, 0);
581 if (t == NULL)
582 goto done;
583 err = -EPERM;
584 if (t->dev == ipn->fb_tunnel_dev)
585 goto done;
586 dev = t->dev;
587 }
588 unregister_netdevice(dev);
589 err = 0;
590 break;
591
592 default:
593 err = -EINVAL;
594 }
595
596done:
597 return err;
598}
599
600static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
601{
602 if (new_mtu < 68 || new_mtu > 0xFFF8)
603 return -EINVAL;
604 dev->mtu = new_mtu;
605 return 0;
606}
607
608static const struct net_device_ops vti_netdev_ops = {
609 .ndo_init = vti_tunnel_init,
610 .ndo_uninit = vti_tunnel_uninit,
611 .ndo_start_xmit = vti_tunnel_xmit,
612 .ndo_do_ioctl = vti_tunnel_ioctl,
613 .ndo_change_mtu = vti_tunnel_change_mtu,
614 .ndo_get_stats64 = vti_get_stats64,
615};
616
617static void vti_dev_free(struct net_device *dev)
618{
619 free_percpu(dev->tstats);
620 free_netdev(dev);
621}
622
623static void vti_tunnel_setup(struct net_device *dev)
624{
625 dev->netdev_ops = &vti_netdev_ops;
626 dev->destructor = vti_dev_free;
627
628 dev->type = ARPHRD_TUNNEL;
629 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
630 dev->mtu = ETH_DATA_LEN;
631 dev->flags = IFF_NOARP;
632 dev->iflink = 0;
633 dev->addr_len = 4;
634 dev->features |= NETIF_F_NETNS_LOCAL;
635 dev->features |= NETIF_F_LLTX;
636 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
637}
638
639static int vti_tunnel_init(struct net_device *dev)
640{
641 struct ip_tunnel *tunnel = netdev_priv(dev);
642
643 tunnel->dev = dev;
644 strcpy(tunnel->parms.name, dev->name);
645
646 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
647 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
648
649 dev->tstats = alloc_percpu(struct pcpu_tstats);
650 if (!dev->tstats)
651 return -ENOMEM;
652
653 return 0;
654}
655
656static int __net_init vti_fb_tunnel_init(struct net_device *dev)
657{
658 struct ip_tunnel *tunnel = netdev_priv(dev);
659 struct iphdr *iph = &tunnel->parms.iph;
660 struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
661
662 tunnel->dev = dev;
663 strcpy(tunnel->parms.name, dev->name);
664
665 iph->version = 4;
666 iph->protocol = IPPROTO_IPIP;
667 iph->ihl = 5;
668
669 dev->tstats = alloc_percpu(struct pcpu_tstats);
670 if (!dev->tstats)
671 return -ENOMEM;
672
673 dev_hold(dev);
674 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
675 return 0;
676}
677
678static struct xfrm_tunnel vti_handler __read_mostly = {
679 .handler = vti_rcv,
680 .err_handler = vti_err,
681 .priority = 1,
682};
683
684static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
685{
686 int prio;
687
688 for (prio = 1; prio < 4; prio++) {
689 int h;
690 for (h = 0; h < HASH_SIZE; h++) {
691 struct ip_tunnel *t;
692
693 t = rtnl_dereference(ipn->tunnels[prio][h]);
694 while (t != NULL) {
695 unregister_netdevice_queue(t->dev, head);
696 t = rtnl_dereference(t->next);
697 }
698 }
699 }
700}
701
702static int __net_init vti_init_net(struct net *net)
703{
704 int err;
705 struct vti_net *ipn = net_generic(net, vti_net_id);
706
707 ipn->tunnels[0] = ipn->tunnels_wc;
708 ipn->tunnels[1] = ipn->tunnels_l;
709 ipn->tunnels[2] = ipn->tunnels_r;
710 ipn->tunnels[3] = ipn->tunnels_r_l;
711
712 ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
713 "ip_vti0",
714 vti_tunnel_setup);
715 if (!ipn->fb_tunnel_dev) {
716 err = -ENOMEM;
717 goto err_alloc_dev;
718 }
719 dev_net_set(ipn->fb_tunnel_dev, net);
720
721 err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
722 if (err)
723 goto err_reg_dev;
724 ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
725
726 err = register_netdev(ipn->fb_tunnel_dev);
727 if (err)
728 goto err_reg_dev;
729 return 0;
730
731err_reg_dev:
732 vti_dev_free(ipn->fb_tunnel_dev);
733err_alloc_dev:
734 /* nothing */
735 return err;
736}
737
738static void __net_exit vti_exit_net(struct net *net)
739{
740 struct vti_net *ipn = net_generic(net, vti_net_id);
741 LIST_HEAD(list);
742
743 rtnl_lock();
744 vti_destroy_tunnels(ipn, &list);
745 unregister_netdevice_many(&list);
746 rtnl_unlock();
747}
748
749static struct pernet_operations vti_net_ops = {
750 .init = vti_init_net,
751 .exit = vti_exit_net,
752 .id = &vti_net_id,
753 .size = sizeof(struct vti_net),
754};
755
756static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
757{
758 return 0;
759}
760
761static void vti_netlink_parms(struct nlattr *data[],
762 struct ip_tunnel_parm *parms)
763{
764 memset(parms, 0, sizeof(*parms));
765
766 parms->iph.protocol = IPPROTO_IPIP;
767
768 if (!data)
769 return;
770
771 if (data[IFLA_VTI_LINK])
772 parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
773
774 if (data[IFLA_VTI_IKEY])
775 parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
776
777 if (data[IFLA_VTI_OKEY])
778 parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
779
780 if (data[IFLA_VTI_LOCAL])
781 parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);
782
783 if (data[IFLA_VTI_REMOTE])
784 parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
785
786}
787
788static int vti_newlink(struct net *src_net, struct net_device *dev,
789 struct nlattr *tb[], struct nlattr *data[])
790{
791 struct ip_tunnel *nt;
792 struct net *net = dev_net(dev);
793 struct vti_net *ipn = net_generic(net, vti_net_id);
794 int mtu;
795 int err;
796
797 nt = netdev_priv(dev);
798 vti_netlink_parms(data, &nt->parms);
799
800 if (vti_tunnel_locate(net, &nt->parms, 0))
801 return -EEXIST;
802
803 mtu = vti_tunnel_bind_dev(dev);
804 if (!tb[IFLA_MTU])
805 dev->mtu = mtu;
806
807 err = register_netdevice(dev);
808 if (err)
809 goto out;
810
811 dev_hold(dev);
812 vti_tunnel_link(ipn, nt);
813
814out:
815 return err;
816}
817
818static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
819 struct nlattr *data[])
820{
821 struct ip_tunnel *t, *nt;
822 struct net *net = dev_net(dev);
823 struct vti_net *ipn = net_generic(net, vti_net_id);
824 struct ip_tunnel_parm p;
825 int mtu;
826
827 if (dev == ipn->fb_tunnel_dev)
828 return -EINVAL;
829
830 nt = netdev_priv(dev);
831 vti_netlink_parms(data, &p);
832
833 t = vti_tunnel_locate(net, &p, 0);
834
835 if (t) {
836 if (t->dev != dev)
837 return -EEXIST;
838 } else {
839 t = nt;
840
841 vti_tunnel_unlink(ipn, t);
842 t->parms.iph.saddr = p.iph.saddr;
843 t->parms.iph.daddr = p.iph.daddr;
844 t->parms.i_key = p.i_key;
845 t->parms.o_key = p.o_key;
846 if (dev->type != ARPHRD_ETHER) {
847 memcpy(dev->dev_addr, &p.iph.saddr, 4);
848 memcpy(dev->broadcast, &p.iph.daddr, 4);
849 }
850 vti_tunnel_link(ipn, t);
851 netdev_state_change(dev);
852 }
853
854 if (t->parms.link != p.link) {
855 t->parms.link = p.link;
856 mtu = vti_tunnel_bind_dev(dev);
857 if (!tb[IFLA_MTU])
858 dev->mtu = mtu;
859 netdev_state_change(dev);
860 }
861
862 return 0;
863}
864
865static size_t vti_get_size(const struct net_device *dev)
866{
867 return
868 /* IFLA_VTI_LINK */
869 nla_total_size(4) +
870 /* IFLA_VTI_IKEY */
871 nla_total_size(4) +
872 /* IFLA_VTI_OKEY */
873 nla_total_size(4) +
874 /* IFLA_VTI_LOCAL */
875 nla_total_size(4) +
876 /* IFLA_VTI_REMOTE */
877 nla_total_size(4) +
878 0;
879}
880
881static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
882{
883 struct ip_tunnel *t = netdev_priv(dev);
884 struct ip_tunnel_parm *p = &t->parms;
885
886 nla_put_u32(skb, IFLA_VTI_LINK, p->link);
887 nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
888 nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
889 nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
890 nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
891
892 return 0;
893}
894
895static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
896 [IFLA_VTI_LINK] = { .type = NLA_U32 },
897 [IFLA_VTI_IKEY] = { .type = NLA_U32 },
898 [IFLA_VTI_OKEY] = { .type = NLA_U32 },
899 [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
900 [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
901};
902
903static struct rtnl_link_ops vti_link_ops __read_mostly = {
904 .kind = "vti",
905 .maxtype = IFLA_VTI_MAX,
906 .policy = vti_policy,
907 .priv_size = sizeof(struct ip_tunnel),
908 .setup = vti_tunnel_setup,
909 .validate = vti_tunnel_validate,
910 .newlink = vti_newlink,
911 .changelink = vti_changelink,
912 .get_size = vti_get_size,
913 .fill_info = vti_fill_info,
914};
915
916static int __init vti_init(void)
917{
918 int err;
919
920 pr_info("IPv4 over IPSec tunneling driver\n");
921
922 err = register_pernet_device(&vti_net_ops);
923 if (err < 0)
924 return err;
925 err = xfrm4_mode_tunnel_input_register(&vti_handler);
926 if (err < 0) {
927 unregister_pernet_device(&vti_net_ops);
928		pr_info("vti init: can't register tunnel\n");
929 }
930
931 err = rtnl_link_register(&vti_link_ops);
932 if (err < 0)
933 goto rtnl_link_failed;
934
935 return err;
936
937rtnl_link_failed:
938 xfrm4_mode_tunnel_input_deregister(&vti_handler);
939 unregister_pernet_device(&vti_net_ops);
940 return err;
941}
942
943static void __exit vti_fini(void)
944{
945 rtnl_link_unregister(&vti_link_ops);
946 if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
947 pr_info("vti close: can't deregister tunnel\n");
948
949 unregister_pernet_device(&vti_net_ops);
950}
951
952module_init(vti_init);
953module_exit(vti_fini);
954MODULE_LICENSE("GPL");
955MODULE_ALIAS_RTNL_LINK("vti");
956MODULE_ALIAS_NETDEV("ip_vti0");
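
Editor's note: the SIOCADDTUNNEL path served by vti_tunnel_ioctl() above is normally driven from userspace the same way as the other ipv4 tunnel drivers. The sketch below is not part of the patch; the device name, addresses and key are illustrative assumptions, and the call requires CAP_NET_ADMIN as the handler checks.

/* Hypothetical userspace sketch: create a vti tunnel through the
 * SIOCADDTUNNEL ioctl served by vti_tunnel_ioctl() above.
 * Addresses, key and device names are illustrative only. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_tunnel.h>

int main(void)
{
	struct ip_tunnel_parm p;
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&p, 0, sizeof(p));
	strncpy(p.name, "vti1", IFNAMSIZ - 1);	/* device to create */
	p.iph.version = 4;			/* vti_tunnel_ioctl() insists on these */
	p.iph.ihl = 5;
	p.iph.protocol = IPPROTO_IPIP;
	p.iph.saddr = inet_addr("192.0.2.1");
	p.iph.daddr = inet_addr("198.51.100.1");
	p.i_key = p.o_key = htonl(42);		/* ties the tunnel to its IPsec policy */

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "ip_vti0", IFNAMSIZ - 1);	/* fallback device */
	ifr.ifr_ifru.ifru_data = (void *)&p;

	if (ioctl(fd, SIOCADDTUNNEL, &ifr) < 0)	/* needs CAP_NET_ADMIN */
		perror("SIOCADDTUNNEL");
	close(fd);
	return 0;
}
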
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 63b64c45a826..d3ab47e19a89 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -31,17 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
32 struct xfrm_state *x; 32 struct xfrm_state *x;
33 33
34 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 34 switch (icmp_hdr(skb)->type) {
35 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 35 case ICMP_DEST_UNREACH:
36 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
37 return;
38 case ICMP_REDIRECT:
39 break;
40 default:
36 return; 41 return;
42 }
37 43
38 spi = htonl(ntohs(ipch->cpi)); 44 spi = htonl(ntohs(ipch->cpi));
39 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 45 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
40 spi, IPPROTO_COMP, AF_INET); 46 spi, IPPROTO_COMP, AF_INET);
41 if (!x) 47 if (!x)
42 return; 48 return;
43 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", 49
44 spi, &iph->daddr); 50 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
51 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
52 else
53 ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
45 xfrm_state_put(x); 54 xfrm_state_put(x);
46} 55}
47 56
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2d0f99bf61b3..99af1f0cc658 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -348,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
348 case ICMP_PORT_UNREACH: 348 case ICMP_PORT_UNREACH:
349 /* Impossible event. */ 349 /* Impossible event. */
350 return 0; 350 return 0;
351 case ICMP_FRAG_NEEDED:
352 /* Soft state for pmtu is maintained by IP core. */
353 return 0;
354 default: 351 default:
355 /* All others are translated to HOST_UNREACH. 352 /* All others are translated to HOST_UNREACH.
356 rfc2003 contains "deep thoughts" about NET_UNREACH, 353 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -363,13 +360,32 @@ static int ipip_err(struct sk_buff *skb, u32 info)
363 if (code != ICMP_EXC_TTL) 360 if (code != ICMP_EXC_TTL)
364 return 0; 361 return 0;
365 break; 362 break;
363 case ICMP_REDIRECT:
364 break;
366 } 365 }
367 366
368 err = -ENOENT; 367 err = -ENOENT;
369 368
370 rcu_read_lock(); 369 rcu_read_lock();
371 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 370 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
372 if (t == NULL || t->parms.iph.daddr == 0) 371 if (t == NULL)
372 goto out;
373
374 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
375 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
376 t->dev->ifindex, 0, IPPROTO_IPIP, 0);
377 err = 0;
378 goto out;
379 }
380
381 if (type == ICMP_REDIRECT) {
382 ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
383 IPPROTO_IPIP, 0);
384 err = 0;
385 goto out;
386 }
387
388 if (t->parms.iph.daddr == 0)
373 goto out; 389 goto out;
374 390
375 err = 0; 391 err = 0;
@@ -471,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
471 dev->stats.tx_fifo_errors++; 487 dev->stats.tx_fifo_errors++;
472 goto tx_error; 488 goto tx_error;
473 } 489 }
474 dst = rt->rt_gateway; 490 dst = rt_nexthop(rt, old_iph->daddr);
475 } 491 }
476 492
477 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, 493 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
@@ -503,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
503 } 519 }
504 520
505 if (skb_dst(skb)) 521 if (skb_dst(skb))
506 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 522 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
507 523
508 if ((old_iph->frag_off & htons(IP_DF)) && 524 if ((old_iph->frag_off & htons(IP_DF)) &&
509 mtu < ntohs(old_iph->tot_len)) { 525 mtu < ntohs(old_iph->tot_len)) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c94bbc6f2ba3..8eec8f4a0536 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -524,8 +524,8 @@ failure:
524} 524}
525#endif 525#endif
526 526
527/* 527/**
528 * Delete a VIF entry 528 * vif_delete - Delete a VIF entry
529 * @notify: Set to 1, if the caller is a notifier_call 529 * @notify: Set to 1, if the caller is a notifier_call
530 */ 530 */
531 531
@@ -1795,9 +1795,12 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1795 .daddr = iph->daddr, 1795 .daddr = iph->daddr,
1796 .saddr = iph->saddr, 1796 .saddr = iph->saddr,
1797 .flowi4_tos = RT_TOS(iph->tos), 1797 .flowi4_tos = RT_TOS(iph->tos),
1798 .flowi4_oif = rt->rt_oif, 1798 .flowi4_oif = (rt_is_output_route(rt) ?
1799 .flowi4_iif = rt->rt_iif, 1799 skb->dev->ifindex : 0),
1800 .flowi4_mark = rt->rt_mark, 1800 .flowi4_iif = (rt_is_output_route(rt) ?
1801 net->loopback_dev->ifindex :
1802 skb->dev->ifindex),
1803 .flowi4_mark = skb->mark,
1801 }; 1804 };
1802 struct mr_table *mrt; 1805 struct mr_table *mrt;
1803 int err; 1806 int err;
@@ -2006,37 +2009,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2006{ 2009{
2007 int ct; 2010 int ct;
2008 struct rtnexthop *nhp; 2011 struct rtnexthop *nhp;
2009 u8 *b = skb_tail_pointer(skb); 2012 struct nlattr *mp_attr;
2010 struct rtattr *mp_head;
2011 2013
2012 /* If cache is unresolved, don't try to parse IIF and OIF */ 2014 /* If cache is unresolved, don't try to parse IIF and OIF */
2013 if (c->mfc_parent >= MAXVIFS) 2015 if (c->mfc_parent >= MAXVIFS)
2014 return -ENOENT; 2016 return -ENOENT;
2015 2017
2016 if (VIF_EXISTS(mrt, c->mfc_parent)) 2018 if (VIF_EXISTS(mrt, c->mfc_parent) &&
2017 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); 2019 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
2020 return -EMSGSIZE;
2018 2021
2019 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); 2022 if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
2023 return -EMSGSIZE;
2020 2024
2021 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 2025 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
2022 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { 2026 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
2023 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 2027 if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
2024 goto rtattr_failure; 2028 nla_nest_cancel(skb, mp_attr);
2025 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 2029 return -EMSGSIZE;
2030 }
2031
2026 nhp->rtnh_flags = 0; 2032 nhp->rtnh_flags = 0;
2027 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 2033 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
2028 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; 2034 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
2029 nhp->rtnh_len = sizeof(*nhp); 2035 nhp->rtnh_len = sizeof(*nhp);
2030 } 2036 }
2031 } 2037 }
2032 mp_head->rta_type = RTA_MULTIPATH; 2038
2033 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; 2039 nla_nest_end(skb, mp_attr);
2040
2034 rtm->rtm_type = RTN_MULTICAST; 2041 rtm->rtm_type = RTN_MULTICAST;
2035 return 1; 2042 return 1;
2036
2037rtattr_failure:
2038 nlmsg_trim(skb, b);
2039 return -EMSGSIZE;
2040} 2043}
2041 2044
2042int ipmr_get_route(struct net *net, struct sk_buff *skb, 2045int ipmr_get_route(struct net *net, struct sk_buff *skb,
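
Editor's note: the __ipmr_fill_mroute() rewrite above is one instance of the tree-wide move from RTA_PUT()/rtattr_failure (a macro hiding a goto) to the nla_* helpers, which report failure through return values. A minimal sketch of the nesting idiom it now uses; the wrapper function and its arguments are illustrative, the helper calls are the same ones used in the hunk.

/* Sketch: build a nested RTA_MULTIPATH attribute with the nla_* helpers,
 * backing out cleanly if the skb runs out of tail room. */
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>

static int fill_one_nexthop(struct sk_buff *skb, int ifindex, u8 hops)
{
	struct nlattr *mp_attr;
	struct rtnexthop *nhp;

	mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
	if (!mp_attr)
		return -EMSGSIZE;

	nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
	if (!nhp) {
		nla_nest_cancel(skb, mp_attr);	/* trim the partial nest */
		return -EMSGSIZE;
	}
	nhp->rtnh_flags = 0;
	nhp->rtnh_hops = hops;
	nhp->rtnh_ifindex = ifindex;
	nhp->rtnh_len = sizeof(*nhp);

	nla_nest_end(skb, mp_attr);		/* fixes up the nest length */
	return 0;
}
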
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 2f210c79dc87..cbb6a1a6f6f7 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
52 struct nf_nat_ipv4_range newrange; 52 struct nf_nat_ipv4_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc; 55 __be32 newsrc, nh;
56 56
57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
58 58
@@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
70 70
71 mr = par->targinfo; 71 mr = par->targinfo;
72 rt = skb_rtable(skb); 72 rt = skb_rtable(skb);
73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); 73 nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
74 newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
74 if (!newsrc) { 75 if (!newsrc) {
75 pr_info("%s ate my IP address\n", par->out->name); 76 pr_info("%s ate my IP address\n", par->out->name);
76 return NF_DROP; 77 return NF_DROP;
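
Editor's note: rt_nexthop(), used above instead of reading rt->rt_gateway directly, is a helper added elsewhere in this series: it returns the route's gateway when one is set and otherwise falls back to the packet's own destination, which on-link routes need now that rt_gateway may be left unset. A behavioural sketch; the real definition lives in include/net/route.h, so treat this rendering as an approximation.

/* Approximate behaviour of rt_nexthop() as used by the hunk above. */
#include <linux/types.h>
#include <net/route.h>

static inline __be32 rt_nexthop_sketch(const struct rtable *rt, __be32 daddr)
{
	if (rt->rt_gateway)
		return rt->rt_gateway;	/* explicit gateway on the route */
	return daddr;			/* on-link: next hop is the destination */
}
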
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index ba5756d20165..1109f7f6c254 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum,
196 196
197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); 197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
198 198
199 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ 199 nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
200 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 200 sizeof(*pm)+copy_len, 0);
201 sizeof(*pm)+copy_len); 201 if (!nlh) {
202 pr_debug("error during nlmsg_put\n");
203 goto out_unlock;
204 }
202 ub->qlen++; 205 ub->qlen++;
203 206
204 pm = NLMSG_DATA(nlh); 207 pm = nlmsg_data(nlh);
205 208
206 /* We might not have a timestamp, get one */ 209 /* We might not have a timestamp, get one */
207 if (skb->tstamp.tv64 == 0) 210 if (skb->tstamp.tv64 == 0)
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
261 nlh->nlmsg_type = NLMSG_DONE; 264 nlh->nlmsg_type = NLMSG_DONE;
262 ulog_send(groupnum); 265 ulog_send(groupnum);
263 } 266 }
264 267out_unlock:
265 spin_unlock_bh(&ulog_lock); 268 spin_unlock_bh(&ulog_lock);
266 269
267 return; 270 return;
268 271
269nlmsg_failure:
270 pr_debug("error during NLMSG_PUT\n");
271alloc_failure: 272alloc_failure:
272 pr_debug("Error building netlink message\n"); 273 pr_debug("Error building netlink message\n");
273 spin_unlock_bh(&ulog_lock); 274 spin_unlock_bh(&ulog_lock);
@@ -380,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
380static int __init ulog_tg_init(void) 381static int __init ulog_tg_init(void)
381{ 382{
382 int ret, i; 383 int ret, i;
384 struct netlink_kernel_cfg cfg = {
385 .groups = ULOG_MAXNLGROUPS,
386 };
383 387
384 pr_debug("init module\n"); 388 pr_debug("init module\n");
385 389
@@ -392,9 +396,8 @@ static int __init ulog_tg_init(void)
392 for (i = 0; i < ULOG_MAXNLGROUPS; i++) 396 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
393 setup_timer(&ulog_buffers[i].timer, ulog_timer, i); 397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
394 398
395 nflognl = netlink_kernel_create(&init_net, 399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
396 NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, 400 THIS_MODULE, &cfg);
397 NULL, THIS_MODULE);
398 if (!nflognl) 401 if (!nflognl)
399 return -ENOMEM; 402 return -ENOMEM;
400 403
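
Editor's note: two interface changes meet in the ipt_ULOG hunks above. NLMSG_PUT(), whose failure path was a hidden goto nlmsg_failure, becomes a plain nlmsg_put() call with an explicit NULL check, and netlink_kernel_create() now takes a struct netlink_kernel_cfg rather than a growing argument list. A kernel-style sketch of the new calls, reduced to the pieces the hunks exercise; the wrapper names are illustrative.

/* Sketch of the new netlink kernel socket + message setup used above. */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/netlink.h>

static struct sock *ulog_socket_sketch(void)
{
	struct netlink_kernel_cfg cfg = {
		.groups = ULOG_MAXNLGROUPS,	/* multicast groups to serve */
	};

	/* cfg replaces the old input/mutex/groups argument list */
	return netlink_kernel_create(&init_net, NETLINK_NFLOG,
				     THIS_MODULE, &cfg);
}

static struct nlmsghdr *ulog_start_msg_sketch(struct sk_buff *skb,
					      int seq, size_t payload)
{
	/* returns NULL on failure; no hidden goto as with NLMSG_PUT() */
	return nlmsg_put(skb, 0, seq, ULOG_NL_EVENT, payload, 0);
}
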
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 91747d4ebc26..e7ff2dcab6ce 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -95,11 +95,11 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
95 return NF_ACCEPT; 95 return NF_ACCEPT;
96} 96}
97 97
98static unsigned int ipv4_confirm(unsigned int hooknum, 98static unsigned int ipv4_helper(unsigned int hooknum,
99 struct sk_buff *skb, 99 struct sk_buff *skb,
100 const struct net_device *in, 100 const struct net_device *in,
101 const struct net_device *out, 101 const struct net_device *out,
102 int (*okfn)(struct sk_buff *)) 102 int (*okfn)(struct sk_buff *))
103{ 103{
104 struct nf_conn *ct; 104 struct nf_conn *ct;
105 enum ip_conntrack_info ctinfo; 105 enum ip_conntrack_info ctinfo;
@@ -110,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
110 /* This is where we call the helper: as the packet goes out. */ 110 /* This is where we call the helper: as the packet goes out. */
111 ct = nf_ct_get(skb, &ctinfo); 111 ct = nf_ct_get(skb, &ctinfo);
112 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 112 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
113 goto out; 113 return NF_ACCEPT;
114 114
115 help = nfct_help(ct); 115 help = nfct_help(ct);
116 if (!help) 116 if (!help)
117 goto out; 117 return NF_ACCEPT;
118 118
119 /* rcu_read_lock()ed by nf_hook_slow */ 119 /* rcu_read_lock()ed by nf_hook_slow */
120 helper = rcu_dereference(help->helper); 120 helper = rcu_dereference(help->helper);
121 if (!helper) 121 if (!helper)
122 goto out; 122 return NF_ACCEPT;
123 123
124 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), 124 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
125 ct, ctinfo); 125 ct, ctinfo);
126 if (ret != NF_ACCEPT) { 126 if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
127 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, 127 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
128 "nf_ct_%s: dropping packet", helper->name); 128 "nf_ct_%s: dropping packet", helper->name);
129 return ret;
130 } 129 }
130 return ret;
131}
132
133static unsigned int ipv4_confirm(unsigned int hooknum,
134 struct sk_buff *skb,
135 const struct net_device *in,
136 const struct net_device *out,
137 int (*okfn)(struct sk_buff *))
138{
139 struct nf_conn *ct;
140 enum ip_conntrack_info ctinfo;
141
142 ct = nf_ct_get(skb, &ctinfo);
143 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
144 goto out;
131 145
132 /* adjust seqs for loopback traffic only in outgoing direction */ 146 /* adjust seqs for loopback traffic only in outgoing direction */
133 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 147 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
@@ -185,6 +199,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
185 .priority = NF_IP_PRI_CONNTRACK, 199 .priority = NF_IP_PRI_CONNTRACK,
186 }, 200 },
187 { 201 {
202 .hook = ipv4_helper,
203 .owner = THIS_MODULE,
204 .pf = NFPROTO_IPV4,
205 .hooknum = NF_INET_POST_ROUTING,
206 .priority = NF_IP_PRI_CONNTRACK_HELPER,
207 },
208 {
188 .hook = ipv4_confirm, 209 .hook = ipv4_confirm,
189 .owner = THIS_MODULE, 210 .owner = THIS_MODULE,
190 .pf = NFPROTO_IPV4, 211 .pf = NFPROTO_IPV4,
@@ -192,6 +213,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
192 .priority = NF_IP_PRI_CONNTRACK_CONFIRM, 213 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
193 }, 214 },
194 { 215 {
216 .hook = ipv4_helper,
217 .owner = THIS_MODULE,
218 .pf = NFPROTO_IPV4,
219 .hooknum = NF_INET_LOCAL_IN,
220 .priority = NF_IP_PRI_CONNTRACK_HELPER,
221 },
222 {
195 .hook = ipv4_confirm, 223 .hook = ipv4_confirm,
196 .owner = THIS_MODULE, 224 .owner = THIS_MODULE,
197 .pf = NFPROTO_IPV4, 225 .pf = NFPROTO_IPV4,
@@ -207,35 +235,30 @@ static int log_invalid_proto_max = 255;
207static ctl_table ip_ct_sysctl_table[] = { 235static ctl_table ip_ct_sysctl_table[] = {
208 { 236 {
209 .procname = "ip_conntrack_max", 237 .procname = "ip_conntrack_max",
210 .data = &nf_conntrack_max,
211 .maxlen = sizeof(int), 238 .maxlen = sizeof(int),
212 .mode = 0644, 239 .mode = 0644,
213 .proc_handler = proc_dointvec, 240 .proc_handler = proc_dointvec,
214 }, 241 },
215 { 242 {
216 .procname = "ip_conntrack_count", 243 .procname = "ip_conntrack_count",
217 .data = &init_net.ct.count,
218 .maxlen = sizeof(int), 244 .maxlen = sizeof(int),
219 .mode = 0444, 245 .mode = 0444,
220 .proc_handler = proc_dointvec, 246 .proc_handler = proc_dointvec,
221 }, 247 },
222 { 248 {
223 .procname = "ip_conntrack_buckets", 249 .procname = "ip_conntrack_buckets",
224 .data = &init_net.ct.htable_size,
225 .maxlen = sizeof(unsigned int), 250 .maxlen = sizeof(unsigned int),
226 .mode = 0444, 251 .mode = 0444,
227 .proc_handler = proc_dointvec, 252 .proc_handler = proc_dointvec,
228 }, 253 },
229 { 254 {
230 .procname = "ip_conntrack_checksum", 255 .procname = "ip_conntrack_checksum",
231 .data = &init_net.ct.sysctl_checksum,
232 .maxlen = sizeof(int), 256 .maxlen = sizeof(int),
233 .mode = 0644, 257 .mode = 0644,
234 .proc_handler = proc_dointvec, 258 .proc_handler = proc_dointvec,
235 }, 259 },
236 { 260 {
237 .procname = "ip_conntrack_log_invalid", 261 .procname = "ip_conntrack_log_invalid",
238 .data = &init_net.ct.sysctl_log_invalid,
239 .maxlen = sizeof(unsigned int), 262 .maxlen = sizeof(unsigned int),
240 .mode = 0644, 263 .mode = 0644,
241 .proc_handler = proc_dointvec_minmax, 264 .proc_handler = proc_dointvec_minmax,
@@ -351,6 +374,25 @@ static struct nf_sockopt_ops so_getorigdst = {
351 .owner = THIS_MODULE, 374 .owner = THIS_MODULE,
352}; 375};
353 376
377static int ipv4_init_net(struct net *net)
378{
379#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
380 struct nf_ip_net *in = &net->ct.nf_ct_proto;
381 in->ctl_table = kmemdup(ip_ct_sysctl_table,
382 sizeof(ip_ct_sysctl_table),
383 GFP_KERNEL);
384 if (!in->ctl_table)
385 return -ENOMEM;
386
387 in->ctl_table[0].data = &nf_conntrack_max;
388 in->ctl_table[1].data = &net->ct.count;
389 in->ctl_table[2].data = &net->ct.htable_size;
390 in->ctl_table[3].data = &net->ct.sysctl_checksum;
391 in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
392#endif
393 return 0;
394}
395
354struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { 396struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
355 .l3proto = PF_INET, 397 .l3proto = PF_INET,
356 .name = "ipv4", 398 .name = "ipv4",
@@ -366,8 +408,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
366#endif 408#endif
367#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 409#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
368 .ctl_table_path = "net/ipv4/netfilter", 410 .ctl_table_path = "net/ipv4/netfilter",
369 .ctl_table = ip_ct_sysctl_table,
370#endif 411#endif
412 .init_net = ipv4_init_net,
371 .me = THIS_MODULE, 413 .me = THIS_MODULE,
372}; 414};
373 415
@@ -378,6 +420,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
378MODULE_ALIAS("ip_conntrack"); 420MODULE_ALIAS("ip_conntrack");
379MODULE_LICENSE("GPL"); 421MODULE_LICENSE("GPL");
380 422
423static int ipv4_net_init(struct net *net)
424{
425 int ret = 0;
426
427 ret = nf_conntrack_l4proto_register(net,
428 &nf_conntrack_l4proto_tcp4);
429 if (ret < 0) {
430 pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
431 goto out_tcp;
432 }
433 ret = nf_conntrack_l4proto_register(net,
434 &nf_conntrack_l4proto_udp4);
435 if (ret < 0) {
436 pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
437 goto out_udp;
438 }
439 ret = nf_conntrack_l4proto_register(net,
440 &nf_conntrack_l4proto_icmp);
441 if (ret < 0) {
442 pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
443 goto out_icmp;
444 }
445 ret = nf_conntrack_l3proto_register(net,
446 &nf_conntrack_l3proto_ipv4);
447 if (ret < 0) {
448 pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
449 goto out_ipv4;
450 }
451 return 0;
452out_ipv4:
453 nf_conntrack_l4proto_unregister(net,
454 &nf_conntrack_l4proto_icmp);
455out_icmp:
456 nf_conntrack_l4proto_unregister(net,
457 &nf_conntrack_l4proto_udp4);
458out_udp:
459 nf_conntrack_l4proto_unregister(net,
460 &nf_conntrack_l4proto_tcp4);
461out_tcp:
462 return ret;
463}
464
465static void ipv4_net_exit(struct net *net)
466{
467 nf_conntrack_l3proto_unregister(net,
468 &nf_conntrack_l3proto_ipv4);
469 nf_conntrack_l4proto_unregister(net,
470 &nf_conntrack_l4proto_icmp);
471 nf_conntrack_l4proto_unregister(net,
472 &nf_conntrack_l4proto_udp4);
473 nf_conntrack_l4proto_unregister(net,
474 &nf_conntrack_l4proto_tcp4);
475}
476
477static struct pernet_operations ipv4_net_ops = {
478 .init = ipv4_net_init,
479 .exit = ipv4_net_exit,
480};
481
381static int __init nf_conntrack_l3proto_ipv4_init(void) 482static int __init nf_conntrack_l3proto_ipv4_init(void)
382{ 483{
383 int ret = 0; 484 int ret = 0;
@@ -391,35 +492,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
391 return ret; 492 return ret;
392 } 493 }
393 494
394 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); 495 ret = register_pernet_subsys(&ipv4_net_ops);
395 if (ret < 0) { 496 if (ret < 0) {
396 pr_err("nf_conntrack_ipv4: can't register tcp.\n"); 497 pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
397 goto cleanup_sockopt; 498 goto cleanup_sockopt;
398 } 499 }
399 500
400 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
401 if (ret < 0) {
402 pr_err("nf_conntrack_ipv4: can't register udp.\n");
403 goto cleanup_tcp;
404 }
405
406 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
407 if (ret < 0) {
408 pr_err("nf_conntrack_ipv4: can't register icmp.\n");
409 goto cleanup_udp;
410 }
411
412 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
413 if (ret < 0) {
414 pr_err("nf_conntrack_ipv4: can't register ipv4\n");
415 goto cleanup_icmp;
416 }
417
418 ret = nf_register_hooks(ipv4_conntrack_ops, 501 ret = nf_register_hooks(ipv4_conntrack_ops,
419 ARRAY_SIZE(ipv4_conntrack_ops)); 502 ARRAY_SIZE(ipv4_conntrack_ops));
420 if (ret < 0) { 503 if (ret < 0) {
421 pr_err("nf_conntrack_ipv4: can't register hooks.\n"); 504 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
422 goto cleanup_ipv4; 505 goto cleanup_pernet;
423 } 506 }
424#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 507#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
425 ret = nf_conntrack_ipv4_compat_init(); 508 ret = nf_conntrack_ipv4_compat_init();
@@ -431,14 +514,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
431 cleanup_hooks: 514 cleanup_hooks:
432 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 515 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
433#endif 516#endif
434 cleanup_ipv4: 517 cleanup_pernet:
435 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 518 unregister_pernet_subsys(&ipv4_net_ops);
436 cleanup_icmp:
437 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
438 cleanup_udp:
439 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
440 cleanup_tcp:
441 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
442 cleanup_sockopt: 519 cleanup_sockopt:
443 nf_unregister_sockopt(&so_getorigdst); 520 nf_unregister_sockopt(&so_getorigdst);
444 return ret; 521 return ret;
@@ -451,10 +528,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
451 nf_conntrack_ipv4_compat_fini(); 528 nf_conntrack_ipv4_compat_fini();
452#endif 529#endif
453 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 530 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
454 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 531 unregister_pernet_subsys(&ipv4_net_ops);
455 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
456 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
457 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
458 nf_unregister_sockopt(&so_getorigdst); 532 nf_unregister_sockopt(&so_getorigdst);
459} 533}
460 534
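
Editor's note: the conntrack l3/l4 registration above moves from one-shot calls in module_init() to a struct pernet_operations whose init/exit run for every network namespace. The generic shape of that conversion, with hypothetical names standing in for the real registrations:

/* Minimal sketch of the pernet_operations pattern adopted above. */
#include <linux/init.h>
#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init example_net_init(struct net *net)
{
	/* register per-namespace protocol trackers, sysctls, proc files */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* undo whatever example_net_init() registered for this netns */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

static int __init example_module_init(void)
{
	/* runs init for existing namespaces and for each new one created */
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_module_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");
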
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 0847e373d33c..5241d997ab75 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -23,6 +23,11 @@
23 23
24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
25 25
26static inline struct nf_icmp_net *icmp_pernet(struct net *net)
27{
28 return &net->ct.nf_ct_proto.icmp;
29}
30
26static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 31static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
27 struct nf_conntrack_tuple *tuple) 32 struct nf_conntrack_tuple *tuple)
28{ 33{
@@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s,
77 82
78static unsigned int *icmp_get_timeouts(struct net *net) 83static unsigned int *icmp_get_timeouts(struct net *net)
79{ 84{
80 return &nf_ct_icmp_timeout; 85 return &icmp_pernet(net)->timeout;
81} 86}
82 87
83/* Returns verdict for packet, or -1 for invalid. */ 88/* Returns verdict for packet, or -1 for invalid. */
@@ -274,16 +279,18 @@ static int icmp_nlattr_tuple_size(void)
274#include <linux/netfilter/nfnetlink.h> 279#include <linux/netfilter/nfnetlink.h>
275#include <linux/netfilter/nfnetlink_cttimeout.h> 280#include <linux/netfilter/nfnetlink_cttimeout.h>
276 281
277static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) 282static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
283 struct net *net, void *data)
278{ 284{
279 unsigned int *timeout = data; 285 unsigned int *timeout = data;
286 struct nf_icmp_net *in = icmp_pernet(net);
280 287
281 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { 288 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
282 *timeout = 289 *timeout =
283 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; 290 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
284 } else { 291 } else {
285 /* Set default ICMP timeout. */ 292 /* Set default ICMP timeout. */
286 *timeout = nf_ct_icmp_timeout; 293 *timeout = in->timeout;
287 } 294 }
288 return 0; 295 return 0;
289} 296}
@@ -308,11 +315,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
308#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 315#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
309 316
310#ifdef CONFIG_SYSCTL 317#ifdef CONFIG_SYSCTL
311static struct ctl_table_header *icmp_sysctl_header;
312static struct ctl_table icmp_sysctl_table[] = { 318static struct ctl_table icmp_sysctl_table[] = {
313 { 319 {
314 .procname = "nf_conntrack_icmp_timeout", 320 .procname = "nf_conntrack_icmp_timeout",
315 .data = &nf_ct_icmp_timeout,
316 .maxlen = sizeof(unsigned int), 321 .maxlen = sizeof(unsigned int),
317 .mode = 0644, 322 .mode = 0644,
318 .proc_handler = proc_dointvec_jiffies, 323 .proc_handler = proc_dointvec_jiffies,
@@ -323,7 +328,6 @@ static struct ctl_table icmp_sysctl_table[] = {
323static struct ctl_table icmp_compat_sysctl_table[] = { 328static struct ctl_table icmp_compat_sysctl_table[] = {
324 { 329 {
325 .procname = "ip_conntrack_icmp_timeout", 330 .procname = "ip_conntrack_icmp_timeout",
326 .data = &nf_ct_icmp_timeout,
327 .maxlen = sizeof(unsigned int), 331 .maxlen = sizeof(unsigned int),
328 .mode = 0644, 332 .mode = 0644,
329 .proc_handler = proc_dointvec_jiffies, 333 .proc_handler = proc_dointvec_jiffies,
@@ -333,6 +337,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
333#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ 337#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
334#endif /* CONFIG_SYSCTL */ 338#endif /* CONFIG_SYSCTL */
335 339
340static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
341 struct nf_icmp_net *in)
342{
343#ifdef CONFIG_SYSCTL
344 pn->ctl_table = kmemdup(icmp_sysctl_table,
345 sizeof(icmp_sysctl_table),
346 GFP_KERNEL);
347 if (!pn->ctl_table)
348 return -ENOMEM;
349
350 pn->ctl_table[0].data = &in->timeout;
351#endif
352 return 0;
353}
354
355static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
356 struct nf_icmp_net *in)
357{
358#ifdef CONFIG_SYSCTL
359#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
360 pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
361 sizeof(icmp_compat_sysctl_table),
362 GFP_KERNEL);
363 if (!pn->ctl_compat_table)
364 return -ENOMEM;
365
366 pn->ctl_compat_table[0].data = &in->timeout;
367#endif
368#endif
369 return 0;
370}
371
372static int icmp_init_net(struct net *net, u_int16_t proto)
373{
374 int ret;
375 struct nf_icmp_net *in = icmp_pernet(net);
376 struct nf_proto_net *pn = &in->pn;
377
378 in->timeout = nf_ct_icmp_timeout;
379
380 ret = icmp_kmemdup_compat_sysctl_table(pn, in);
381 if (ret < 0)
382 return ret;
383
384 ret = icmp_kmemdup_sysctl_table(pn, in);
385 if (ret < 0)
386 nf_ct_kfree_compat_sysctl_table(pn);
387
388 return ret;
389}
390
391static struct nf_proto_net *icmp_get_net_proto(struct net *net)
392{
393 return &net->ct.nf_ct_proto.icmp.pn;
394}
395
336struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = 396struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
337{ 397{
338 .l3proto = PF_INET, 398 .l3proto = PF_INET,
@@ -362,11 +422,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
362 .nla_policy = icmp_timeout_nla_policy, 422 .nla_policy = icmp_timeout_nla_policy,
363 }, 423 },
364#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 424#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
365#ifdef CONFIG_SYSCTL 425 .init_net = icmp_init_net,
366 .ctl_table_header = &icmp_sysctl_header, 426 .get_net_proto = icmp_get_net_proto,
367 .ctl_table = icmp_sysctl_table,
368#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
369 .ctl_compat_table = icmp_compat_sysctl_table,
370#endif
371#endif
372}; 427};
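
Editor's note: icmp_kmemdup_sysctl_table() above shows the idiom used throughout this conversion: the ctl_table stays a static template with empty .data fields, each namespace kmemdup()s its own copy, and the copy's .data pointers are aimed at the per-netns state. A stripped-down sketch with a hypothetical per-netns structure:

/* Sketch of the per-netns sysctl duplication idiom used above. */
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysctl.h>

struct example_net {			/* hypothetical per-netns state */
	unsigned int timeout;
};

static struct ctl_table example_sysctl_template[] = {
	{
		.procname	= "example_timeout",
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
		/* .data is filled in per namespace below */
	},
	{ }
};

static int example_init_sysctl(struct example_net *en, struct ctl_table **tbl)
{
	*tbl = kmemdup(example_sysctl_template,
		       sizeof(example_sysctl_template), GFP_KERNEL);
	if (!*tbl)
		return -ENOMEM;

	(*tbl)[0].data = &en->timeout;	/* point at this netns, not a global */
	return 0;
}
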
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 9bb1b8a37a22..742815518b0f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
94 { 94 {
95 .hook = ipv4_conntrack_defrag, 95 .hook = ipv4_conntrack_defrag,
96 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
97 .pf = PF_INET, 97 .pf = NFPROTO_IPV4,
98 .hooknum = NF_INET_PRE_ROUTING, 98 .hooknum = NF_INET_PRE_ROUTING,
99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
100 }, 100 },
101 { 101 {
102 .hook = ipv4_conntrack_defrag, 102 .hook = ipv4_conntrack_defrag,
103 .owner = THIS_MODULE, 103 .owner = THIS_MODULE,
104 .pf = PF_INET, 104 .pf = NFPROTO_IPV4,
105 .hooknum = NF_INET_LOCAL_OUT, 105 .hooknum = NF_INET_LOCAL_OUT,
106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
107 }, 107 },
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 7b22382ff0e9..3c04d24e2976 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -13,10 +13,10 @@
13#include <linux/skbuff.h> 13#include <linux/skbuff.h>
14#include <linux/udp.h> 14#include <linux/udp.h>
15 15
16#include <net/netfilter/nf_nat_helper.h>
17#include <net/netfilter/nf_nat_rule.h>
18#include <net/netfilter/nf_conntrack_helper.h> 16#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_expect.h> 17#include <net/netfilter/nf_conntrack_expect.h>
18#include <net/netfilter/nf_nat_helper.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <linux/netfilter/nf_conntrack_amanda.h> 20#include <linux/netfilter/nf_conntrack_amanda.h>
21 21
22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); 22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index abb52adf5acd..44b082fd48ab 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
691 .expectfn = nf_nat_follow_master, 691 .expectfn = nf_nat_follow_master,
692}; 692};
693 693
694static struct nfq_ct_nat_hook nfq_ct_nat = {
695 .seq_adjust = nf_nat_tcp_seq_adjust,
696};
697
694static int __init nf_nat_init(void) 698static int __init nf_nat_init(void)
695{ 699{
696 size_t i; 700 size_t i;
@@ -731,6 +735,7 @@ static int __init nf_nat_init(void)
731 nfnetlink_parse_nat_setup); 735 nfnetlink_parse_nat_setup);
732 BUG_ON(nf_ct_nat_offset != NULL); 736 BUG_ON(nf_ct_nat_offset != NULL);
733 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); 737 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
738 RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
734 return 0; 739 return 0;
735 740
736 cleanup_extend: 741 cleanup_extend:
@@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void)
747 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); 752 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
748 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); 753 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
749 RCU_INIT_POINTER(nf_ct_nat_offset, NULL); 754 RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
755 RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
750 synchronize_net(); 756 synchronize_net();
751} 757}
752 758
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index cad29c121318..c6784a18c1c4 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -95,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
95 unsigned char **data, 95 unsigned char **data,
96 TransportAddress *taddr, int count) 96 TransportAddress *taddr, int count)
97{ 97{
98 const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 98 const struct nf_ct_h323_master *info = nfct_help_data(ct);
99 int dir = CTINFO2DIR(ctinfo); 99 int dir = CTINFO2DIR(ctinfo);
100 int i; 100 int i;
101 __be16 port; 101 __be16 port;
@@ -178,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
178 struct nf_conntrack_expect *rtp_exp, 178 struct nf_conntrack_expect *rtp_exp,
179 struct nf_conntrack_expect *rtcp_exp) 179 struct nf_conntrack_expect *rtcp_exp)
180{ 180{
181 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 181 struct nf_ct_h323_master *info = nfct_help_data(ct);
182 int dir = CTINFO2DIR(ctinfo); 182 int dir = CTINFO2DIR(ctinfo);
183 int i; 183 int i;
184 u_int16_t nated_port; 184 u_int16_t nated_port;
@@ -330,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
330 TransportAddress *taddr, __be16 port, 330 TransportAddress *taddr, __be16 port,
331 struct nf_conntrack_expect *exp) 331 struct nf_conntrack_expect *exp)
332{ 332{
333 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 333 struct nf_ct_h323_master *info = nfct_help_data(ct);
334 int dir = CTINFO2DIR(ctinfo); 334 int dir = CTINFO2DIR(ctinfo);
335 u_int16_t nated_port = ntohs(port); 335 u_int16_t nated_port = ntohs(port);
336 336
@@ -419,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
419 unsigned char **data, TransportAddress *taddr, int idx, 419 unsigned char **data, TransportAddress *taddr, int idx,
420 __be16 port, struct nf_conntrack_expect *exp) 420 __be16 port, struct nf_conntrack_expect *exp)
421{ 421{
422 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 422 struct nf_ct_h323_master *info = nfct_help_data(ct);
423 int dir = CTINFO2DIR(ctinfo); 423 int dir = CTINFO2DIR(ctinfo);
424 u_int16_t nated_port = ntohs(port); 424 u_int16_t nated_port = ntohs(port);
425 union nf_inet_addr addr; 425 union nf_inet_addr addr;
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index af65958f6308..2e59ad0b90ca 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
153} 153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); 154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155 155
156void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
157 u32 ctinfo, int off)
158{
159 const struct tcphdr *th;
160
161 if (nf_ct_protonum(ct) != IPPROTO_TCP)
162 return;
163
164 th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
165 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
166}
167EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
168
156static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, 169static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen) 170 int datalen, __sum16 *check, int oldlen)
158{ 171{
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index c273d58980ae..388140881ebe 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
49 const struct nf_nat_pptp *nat_pptp_info; 49 const struct nf_nat_pptp *nat_pptp_info;
50 struct nf_nat_ipv4_range range; 50 struct nf_nat_ipv4_range range;
51 51
52 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; 52 ct_pptp_info = nfct_help_data(master);
53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
54 54
55 /* And here goes the grand finale of corrosion... */ 55 /* And here goes the grand finale of corrosion... */
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
123 __be16 new_callid; 123 __be16 new_callid;
124 unsigned int cid_off; 124 unsigned int cid_off;
125 125
126 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; 126 ct_pptp_info = nfct_help_data(ct);
127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
128 128
129 new_callid = ct_pptp_info->pns_call_id; 129 new_callid = ct_pptp_info->pns_call_id;
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
192 struct nf_ct_pptp_master *ct_pptp_info; 192 struct nf_ct_pptp_master *ct_pptp_info;
193 struct nf_nat_pptp *nat_pptp_info; 193 struct nf_nat_pptp *nat_pptp_info;
194 194
195 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; 195 ct_pptp_info = nfct_help_data(ct);
196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
197 197
198 /* save original PAC call ID in nat_info */ 198 /* save original PAC call ID in nat_info */
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 746edec8b86e..bac712293fd6 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
405 405
406 ptr = *octets; 406 ptr = *octets;
407 while (ctx->pointer < eoc) { 407 while (ctx->pointer < eoc) {
408 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { 408 if (!asn1_octet_decode(ctx, ptr++)) {
409 kfree(*octets); 409 kfree(*octets);
410 *octets = NULL; 410 *octets = NULL;
411 return 0; 411 return 0;
@@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
759 } 759 }
760 break; 760 break;
761 case SNMP_OBJECTID: 761 case SNMP_OBJECTID:
762 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { 762 if (!asn1_oid_decode(ctx, end, &lp, &len)) {
763 kfree(id); 763 kfree(id);
764 return 0; 764 return 0;
765 } 765 }
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index a2901bf829c0..9dbb8d284f99 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -8,10 +8,10 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/udp.h> 9#include <linux/udp.h>
10 10
11#include <net/netfilter/nf_nat_helper.h>
12#include <net/netfilter/nf_nat_rule.h>
13#include <net/netfilter/nf_conntrack_helper.h> 11#include <net/netfilter/nf_conntrack_helper.h>
14#include <net/netfilter/nf_conntrack_expect.h> 12#include <net/netfilter/nf_conntrack_expect.h>
13#include <net/netfilter/nf_nat_helper.h>
14#include <net/netfilter/nf_nat_rule.h>
15#include <linux/netfilter/nf_conntrack_tftp.h> 15#include <linux/netfilter/nf_conntrack_tftp.h>
16 16
17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); 17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2c00e8bf684d..6232d476f37e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -371,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info)
371 break; 371 break;
372 case ICMP_DEST_UNREACH: 372 case ICMP_DEST_UNREACH:
373 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 373 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
374 ipv4_sk_update_pmtu(skb, sk, info);
374 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { 375 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
375 err = EMSGSIZE; 376 err = EMSGSIZE;
376 harderr = 1; 377 harderr = 1;
@@ -386,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info)
386 break; 387 break;
387 case ICMP_REDIRECT: 388 case ICMP_REDIRECT:
388 /* See ICMP_SOURCE_QUENCH */ 389 /* See ICMP_SOURCE_QUENCH */
390 ipv4_sk_redirect(skb, sk);
389 err = EREMOTEIO; 391 err = EREMOTEIO;
390 break; 392 break;
391 } 393 }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8af0d44e4e22..957acd12250b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = {
232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), 232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), 233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), 234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
235 SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
236 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), 235 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
237 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), 236 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
238 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), 237 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -258,6 +257,12 @@ static const struct snmp_mib snmp4_net_list[] = {
258 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), 257 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
259 SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), 258 SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
260 SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), 259 SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
260 SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
261 SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
262 SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
261 SNMP_MIB_SENTINEL 266 SNMP_MIB_SENTINEL
262}; 267};
263 268
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01cd0b2..8918eff1426d 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
36 36
37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
38{ 38{
39 int hash = protocol & (MAX_INET_PROTOS - 1); 39 return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
40
41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
42 NULL, prot) ? 0 : -1; 40 NULL, prot) ? 0 : -1;
43} 41}
44EXPORT_SYMBOL(inet_add_protocol); 42EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
49 47
50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 48int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
51{ 49{
52 int ret, hash = protocol & (MAX_INET_PROTOS - 1); 50 int ret;
53 51
54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], 52 ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
55 prot, NULL) == prot) ? 0 : -1; 53 prot, NULL) == prot) ? 0 : -1;
56 54
57 synchronize_net(); 55 synchronize_net();
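
Editor's note: inet_add_protocol()/inet_del_protocol() above drop the "& (MAX_INET_PROTOS - 1)" masking and index the table directly by protocol number; the registration itself keeps the lockless cmpxchg() idiom, which succeeds only if the slot is still empty (or, on removal, still holds the expected handler). A small kernel-context sketch of that idiom with a hypothetical table:

/* Sketch of the lockless slot-claim idiom kept by the hunk above. */
#include <linux/atomic.h>
#include <linux/stddef.h>

#define EXAMPLE_SLOTS 256

static void *example_handlers[EXAMPLE_SLOTS];	/* hypothetical table */

static int example_register(unsigned char num, void *handler)
{
	/* succeeds only if the slot was still NULL */
	return !cmpxchg(&example_handlers[num], NULL, handler) ? 0 : -1;
}

static int example_unregister(unsigned char num, void *handler)
{
	/* succeeds only if the slot still holds the handler being removed */
	return cmpxchg(&example_handlers[num], handler, NULL) == handler ? 0 : -1;
}
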
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4032b818f3e4..ff0f071969ea 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -216,6 +216,11 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
216 int err = 0; 216 int err = 0;
217 int harderr = 0; 217 int harderr = 0;
218 218
219 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
220 ipv4_sk_update_pmtu(skb, sk, info);
221 else if (type == ICMP_REDIRECT)
222 ipv4_sk_redirect(skb, sk);
223
219 /* Report error on raw socket, if: 224 /* Report error on raw socket, if:
220 1. User requested ip_recverr. 225 1. User requested ip_recverr.
221 2. Socket is connected (otherwise the error indication 226 2. Socket is connected (otherwise the error indication
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 98b30d08efe9..6bcb8fc71cbc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256; 135static int ip_rt_min_advmss __read_mostly = 256;
136static int rt_chain_length_max __read_mostly = 20;
137
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140 136
141/* 137/*
142 * Interface to generic destination cache. 138 * Interface to generic destination cache.
@@ -145,11 +141,12 @@ static unsigned long expires_ljiffies;
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146static unsigned int ipv4_default_advmss(const struct dst_entry *dst); 142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
147static unsigned int ipv4_mtu(const struct dst_entry *dst); 143static unsigned int ipv4_mtu(const struct dst_entry *dst);
148static void ipv4_dst_destroy(struct dst_entry *dst);
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb); 145static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 146static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
152static int rt_garbage_collect(struct dst_ops *ops); 147 struct sk_buff *skb, u32 mtu);
148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 struct sk_buff *skb);
153 150
154static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 151static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155 int how) 152 int how)
@@ -158,54 +155,26 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158 155
159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 156static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160{ 157{
161 struct rtable *rt = (struct rtable *) dst; 158 WARN_ON(1);
162 struct inet_peer *peer; 159 return NULL;
163 u32 *p = NULL;
164
165 if (!rt->peer)
166 rt_bind_peer(rt, rt->rt_dst, 1);
167
168 peer = rt->peer;
169 if (peer) {
170 u32 *old_p = __DST_METRICS_PTR(old);
171 unsigned long prev, new;
172
173 p = peer->metrics;
174 if (inet_metrics_new(peer))
175 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177 new = (unsigned long) p;
178 prev = cmpxchg(&dst->_metrics, old, new);
179
180 if (prev != old) {
181 p = __DST_METRICS_PTR(prev);
182 if (prev & DST_METRICS_READ_ONLY)
183 p = NULL;
184 } else {
185 if (rt->fi) {
186 fib_info_put(rt->fi);
187 rt->fi = NULL;
188 }
189 }
190 }
191 return p;
192} 160}
193 161
194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); 162static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
163 struct sk_buff *skb,
164 const void *daddr);
195 165
196static struct dst_ops ipv4_dst_ops = { 166static struct dst_ops ipv4_dst_ops = {
197 .family = AF_INET, 167 .family = AF_INET,
198 .protocol = cpu_to_be16(ETH_P_IP), 168 .protocol = cpu_to_be16(ETH_P_IP),
199 .gc = rt_garbage_collect,
200 .check = ipv4_dst_check, 169 .check = ipv4_dst_check,
201 .default_advmss = ipv4_default_advmss, 170 .default_advmss = ipv4_default_advmss,
202 .mtu = ipv4_mtu, 171 .mtu = ipv4_mtu,
203 .cow_metrics = ipv4_cow_metrics, 172 .cow_metrics = ipv4_cow_metrics,
204 .destroy = ipv4_dst_destroy,
205 .ifdown = ipv4_dst_ifdown, 173 .ifdown = ipv4_dst_ifdown,
206 .negative_advice = ipv4_negative_advice, 174 .negative_advice = ipv4_negative_advice,
207 .link_failure = ipv4_link_failure, 175 .link_failure = ipv4_link_failure,
208 .update_pmtu = ip_rt_update_pmtu, 176 .update_pmtu = ip_rt_update_pmtu,
177 .redirect = ip_do_redirect,
209 .local_out = __ip_local_out, 178 .local_out = __ip_local_out,
210 .neigh_lookup = ipv4_neigh_lookup, 179 .neigh_lookup = ipv4_neigh_lookup,
211}; 180};
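
The table above re-wires the dst_ops callbacks: update_pmtu now takes the socket, skb and mtu, and a new redirect hook is added. As a rough illustration only (the wrapper name below is hypothetical; the member names and signatures are the ones registered in this table, assuming the usual struct dst_entry / struct dst_ops layout):

	/* Illustrative only, not part of this patch: how a caller can reach
	 * the two callbacks registered above through a dst_entry. */
	static void example_dst_error_event(struct dst_entry *dst, struct sock *sk,
					    struct sk_buff *skb, u32 mtu)
	{
		if (mtu && dst->ops->update_pmtu)
			dst->ops->update_pmtu(dst, sk, skb, mtu);	/* -> ip_rt_update_pmtu() */
		else if (dst->ops->redirect)
			dst->ops->redirect(dst, sk, skb);		/* -> ip_do_redirect() */
	}
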
@@ -232,184 +201,30 @@ const __u8 ip_tos2prio[16] = {
232}; 201};
233EXPORT_SYMBOL(ip_tos2prio); 202EXPORT_SYMBOL(ip_tos2prio);
234 203
235/*
236 * Route cache.
237 */
238
239/* The locking scheme is rather straight forward:
240 *
241 * 1) Read-Copy Update protects the buckets of the central route hash.
242 * 2) Only writers remove entries, and they hold the lock
243 * as they look at rtable reference counts.
244 * 3) Only readers acquire references to rtable entries,
245 * they do so with atomic increments and with the
246 * lock held.
247 */
248
249struct rt_hash_bucket {
250 struct rtable __rcu *chain;
251};
252
253#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254 defined(CONFIG_PROVE_LOCKING)
255/*
256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
257 * The size of this table is a power of two and depends on the number of CPUS.
258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259 */
260#ifdef CONFIG_LOCKDEP
261# define RT_HASH_LOCK_SZ 256
262#else
263# if NR_CPUS >= 32
264# define RT_HASH_LOCK_SZ 4096
265# elif NR_CPUS >= 16
266# define RT_HASH_LOCK_SZ 2048
267# elif NR_CPUS >= 8
268# define RT_HASH_LOCK_SZ 1024
269# elif NR_CPUS >= 4
270# define RT_HASH_LOCK_SZ 512
271# else
272# define RT_HASH_LOCK_SZ 256
273# endif
274#endif
275
276static spinlock_t *rt_hash_locks;
277# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278
279static __init void rt_hash_lock_init(void)
280{
281 int i;
282
283 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284 GFP_KERNEL);
285 if (!rt_hash_locks)
286 panic("IP: failed to allocate rt_hash_locks\n");
287
288 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289 spin_lock_init(&rt_hash_locks[i]);
290}
291#else
292# define rt_hash_lock_addr(slot) NULL
293
294static inline void rt_hash_lock_init(void)
295{
296}
297#endif
298
299static struct rt_hash_bucket *rt_hash_table __read_mostly;
300static unsigned int rt_hash_mask __read_mostly;
301static unsigned int rt_hash_log __read_mostly;
302
303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 204static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
304#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 205#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305 206
306static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307 int genid)
308{
309 return jhash_3words((__force u32)daddr, (__force u32)saddr,
310 idx, genid)
311 & rt_hash_mask;
312}
313
314static inline int rt_genid(struct net *net) 207static inline int rt_genid(struct net *net)
315{ 208{
316 return atomic_read(&net->ipv4.rt_genid); 209 return atomic_read(&net->ipv4.rt_genid);
317} 210}
318 211
319#ifdef CONFIG_PROC_FS 212#ifdef CONFIG_PROC_FS
320struct rt_cache_iter_state {
321 struct seq_net_private p;
322 int bucket;
323 int genid;
324};
325
326static struct rtable *rt_cache_get_first(struct seq_file *seq)
327{
328 struct rt_cache_iter_state *st = seq->private;
329 struct rtable *r = NULL;
330
331 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
333 continue;
334 rcu_read_lock_bh();
335 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336 while (r) {
337 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338 r->rt_genid == st->genid)
339 return r;
340 r = rcu_dereference_bh(r->dst.rt_next);
341 }
342 rcu_read_unlock_bh();
343 }
344 return r;
345}
346
347static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348 struct rtable *r)
349{
350 struct rt_cache_iter_state *st = seq->private;
351
352 r = rcu_dereference_bh(r->dst.rt_next);
353 while (!r) {
354 rcu_read_unlock_bh();
355 do {
356 if (--st->bucket < 0)
357 return NULL;
358 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359 rcu_read_lock_bh();
360 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361 }
362 return r;
363}
364
365static struct rtable *rt_cache_get_next(struct seq_file *seq,
366 struct rtable *r)
367{
368 struct rt_cache_iter_state *st = seq->private;
369 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370 if (dev_net(r->dst.dev) != seq_file_net(seq))
371 continue;
372 if (r->rt_genid == st->genid)
373 break;
374 }
375 return r;
376}
377
378static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379{
380 struct rtable *r = rt_cache_get_first(seq);
381
382 if (r)
383 while (pos && (r = rt_cache_get_next(seq, r)))
384 --pos;
385 return pos ? NULL : r;
386}
387
388static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 213static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389{ 214{
390 struct rt_cache_iter_state *st = seq->private;
391 if (*pos) 215 if (*pos)
392 return rt_cache_get_idx(seq, *pos - 1); 216 return NULL;
393 st->genid = rt_genid(seq_file_net(seq));
394 return SEQ_START_TOKEN; 217 return SEQ_START_TOKEN;
395} 218}
396 219
397static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 220static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398{ 221{
399 struct rtable *r;
400
401 if (v == SEQ_START_TOKEN)
402 r = rt_cache_get_first(seq);
403 else
404 r = rt_cache_get_next(seq, v);
405 ++*pos; 222 ++*pos;
406 return r; 223 return NULL;
407} 224}
408 225
409static void rt_cache_seq_stop(struct seq_file *seq, void *v) 226static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410{ 227{
411 if (v && v != SEQ_START_TOKEN)
412 rcu_read_unlock_bh();
413} 228}
414 229
415static int rt_cache_seq_show(struct seq_file *seq, void *v) 230static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -419,34 +234,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
419 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 234 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
420 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 235 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
421 "HHUptod\tSpecDst"); 236 "HHUptod\tSpecDst");
422 else {
423 struct rtable *r = v;
424 struct neighbour *n;
425 int len, HHUptod;
426
427 rcu_read_lock();
428 n = dst_get_neighbour_noref(&r->dst);
429 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 rcu_read_unlock();
431
432 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434 r->dst.dev ? r->dst.dev->name : "*",
435 (__force u32)r->rt_dst,
436 (__force u32)r->rt_gateway,
437 r->rt_flags, atomic_read(&r->dst.__refcnt),
438 r->dst.__use, 0, (__force u32)r->rt_src,
439 dst_metric_advmss(&r->dst) + 40,
440 dst_metric(&r->dst, RTAX_WINDOW),
441 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442 dst_metric(&r->dst, RTAX_RTTVAR)),
443 r->rt_key_tos,
444 -1,
445 HHUptod,
446 r->rt_spec_dst, &len);
447
448 seq_printf(seq, "%*s\n", 127 - len, "");
449 }
450 return 0; 237 return 0;
451} 238}
452 239
@@ -459,8 +246,7 @@ static const struct seq_operations rt_cache_seq_ops = {
459 246
460static int rt_cache_seq_open(struct inode *inode, struct file *file) 247static int rt_cache_seq_open(struct inode *inode, struct file *file)
461{ 248{
462 return seq_open_net(inode, file, &rt_cache_seq_ops, 249 return seq_open(file, &rt_cache_seq_ops);
463 sizeof(struct rt_cache_iter_state));
464} 250}
465 251
466static const struct file_operations rt_cache_seq_fops = { 252static const struct file_operations rt_cache_seq_fops = {
@@ -468,7 +254,7 @@ static const struct file_operations rt_cache_seq_fops = {
468 .open = rt_cache_seq_open, 254 .open = rt_cache_seq_open,
469 .read = seq_read, 255 .read = seq_read,
470 .llseek = seq_lseek, 256 .llseek = seq_lseek,
471 .release = seq_release_net, 257 .release = seq_release,
472}; 258};
473 259
474 260
@@ -658,275 +444,12 @@ static inline int ip_rt_proc_init(void)
658} 444}
659#endif /* CONFIG_PROC_FS */ 445#endif /* CONFIG_PROC_FS */
660 446
661static inline void rt_free(struct rtable *rt)
662{
663 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
664}
665
666static inline void rt_drop(struct rtable *rt)
667{
668 ip_rt_put(rt);
669 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
670}
671
672static inline int rt_fast_clean(struct rtable *rth)
673{
674 /* Kill broadcast/multicast entries very aggresively, if they
675 collide in hash table with more useful entries */
676 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
677 rt_is_input_route(rth) && rth->dst.rt_next;
678}
679
680static inline int rt_valuable(struct rtable *rth)
681{
682 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683 (rth->peer && rth->peer->pmtu_expires);
684}
685
686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687{
688 unsigned long age;
689 int ret = 0;
690
691 if (atomic_read(&rth->dst.__refcnt))
692 goto out;
693
694 age = jiffies - rth->dst.lastuse;
695 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696 (age <= tmo2 && rt_valuable(rth)))
697 goto out;
698 ret = 1;
699out: return ret;
700}
701
702/* Bits of score are:
703 * 31: very valuable
704 * 30: not quite useless
705 * 29..0: usage counter
706 */
707static inline u32 rt_score(struct rtable *rt)
708{
709 u32 score = jiffies - rt->dst.lastuse;
710
711 score = ~score & ~(3<<30);
712
713 if (rt_valuable(rt))
714 score |= (1<<31);
715
716 if (rt_is_output_route(rt) ||
717 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718 score |= (1<<30);
719
720 return score;
721}
722
723static inline bool rt_caching(const struct net *net)
724{
725 return net->ipv4.current_rt_cache_rebuild_count <=
726 net->ipv4.sysctl_rt_cache_rebuild_count;
727}
728
729static inline bool compare_hash_inputs(const struct rtable *rt1,
730 const struct rtable *rt2)
731{
732 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
733 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
734 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735}
736
737static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738{
739 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741 (rt1->rt_mark ^ rt2->rt_mark) |
742 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
745}
746
747static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748{
749 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
750}
751
752static inline int rt_is_expired(struct rtable *rth) 447static inline int rt_is_expired(struct rtable *rth)
753{ 448{
754 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 449 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755} 450}
756 451
757/* 452/*
758 * Perform a full scan of hash table and free all entries.
759 * Can be called by a softirq or a process.
760 * In the later case, we want to be reschedule if necessary
761 */
762static void rt_do_flush(struct net *net, int process_context)
763{
764 unsigned int i;
765 struct rtable *rth, *next;
766
767 for (i = 0; i <= rt_hash_mask; i++) {
768 struct rtable __rcu **pprev;
769 struct rtable *list;
770
771 if (process_context && need_resched())
772 cond_resched();
773 rth = rcu_access_pointer(rt_hash_table[i].chain);
774 if (!rth)
775 continue;
776
777 spin_lock_bh(rt_hash_lock_addr(i));
778
779 list = NULL;
780 pprev = &rt_hash_table[i].chain;
781 rth = rcu_dereference_protected(*pprev,
782 lockdep_is_held(rt_hash_lock_addr(i)));
783
784 while (rth) {
785 next = rcu_dereference_protected(rth->dst.rt_next,
786 lockdep_is_held(rt_hash_lock_addr(i)));
787
788 if (!net ||
789 net_eq(dev_net(rth->dst.dev), net)) {
790 rcu_assign_pointer(*pprev, next);
791 rcu_assign_pointer(rth->dst.rt_next, list);
792 list = rth;
793 } else {
794 pprev = &rth->dst.rt_next;
795 }
796 rth = next;
797 }
798
799 spin_unlock_bh(rt_hash_lock_addr(i));
800
801 for (; list; list = next) {
802 next = rcu_dereference_protected(list->dst.rt_next, 1);
803 rt_free(list);
804 }
805 }
806}
807
808/*
809 * While freeing expired entries, we compute average chain length
810 * and standard deviation, using fixed-point arithmetic.
811 * This to have an estimation of rt_chain_length_max
812 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
813 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
814 */
815
816#define FRACT_BITS 3
817#define ONE (1UL << FRACT_BITS)
818
819/*
820 * Given a hash chain and an item in this hash chain,
821 * find if a previous entry has the same hash_inputs
822 * (but differs on tos, mark or oif)
823 * Returns 0 if an alias is found.
824 * Returns ONE if rth has no alias before itself.
825 */
826static int has_noalias(const struct rtable *head, const struct rtable *rth)
827{
828 const struct rtable *aux = head;
829
830 while (aux != rth) {
831 if (compare_hash_inputs(aux, rth))
832 return 0;
833 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
834 }
835 return ONE;
836}
837
838static void rt_check_expire(void)
839{
840 static unsigned int rover;
841 unsigned int i = rover, goal;
842 struct rtable *rth;
843 struct rtable __rcu **rthp;
844 unsigned long samples = 0;
845 unsigned long sum = 0, sum2 = 0;
846 unsigned long delta;
847 u64 mult;
848
849 delta = jiffies - expires_ljiffies;
850 expires_ljiffies = jiffies;
851 mult = ((u64)delta) << rt_hash_log;
852 if (ip_rt_gc_timeout > 1)
853 do_div(mult, ip_rt_gc_timeout);
854 goal = (unsigned int)mult;
855 if (goal > rt_hash_mask)
856 goal = rt_hash_mask + 1;
857 for (; goal > 0; goal--) {
858 unsigned long tmo = ip_rt_gc_timeout;
859 unsigned long length;
860
861 i = (i + 1) & rt_hash_mask;
862 rthp = &rt_hash_table[i].chain;
863
864 if (need_resched())
865 cond_resched();
866
867 samples++;
868
869 if (rcu_dereference_raw(*rthp) == NULL)
870 continue;
871 length = 0;
872 spin_lock_bh(rt_hash_lock_addr(i));
873 while ((rth = rcu_dereference_protected(*rthp,
874 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875 prefetch(rth->dst.rt_next);
876 if (rt_is_expired(rth)) {
877 *rthp = rth->dst.rt_next;
878 rt_free(rth);
879 continue;
880 }
881 if (rth->dst.expires) {
882 /* Entry is expired even if it is in use */
883 if (time_before_eq(jiffies, rth->dst.expires)) {
884nofree:
885 tmo >>= 1;
886 rthp = &rth->dst.rt_next;
887 /*
888 * We only count entries on
889 * a chain with equal hash inputs once
890 * so that entries for different QOS
891 * levels, and other non-hash input
892 * attributes don't unfairly skew
893 * the length computation
894 */
895 length += has_noalias(rt_hash_table[i].chain, rth);
896 continue;
897 }
898 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899 goto nofree;
900
901 /* Cleanup aged off entries. */
902 *rthp = rth->dst.rt_next;
903 rt_free(rth);
904 }
905 spin_unlock_bh(rt_hash_lock_addr(i));
906 sum += length;
907 sum2 += length*length;
908 }
909 if (samples) {
910 unsigned long avg = sum / samples;
911 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912 rt_chain_length_max = max_t(unsigned long,
913 ip_rt_gc_elasticity,
914 (avg + 4*sd) >> FRACT_BITS);
915 }
916 rover = i;
917}
918
919/*
920 * rt_worker_func() is run in process context.
921 * we call rt_check_expire() to scan part of the hash table
922 */
923static void rt_worker_func(struct work_struct *work)
924{
925 rt_check_expire();
926 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
927}
928
929/*
930 * Perturbation of rt_genid by a small quantity [1..256] 453 * Perturbation of rt_genid by a small quantity [1..256]
931 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() 454 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
932 * many times (2^24) without giving recent rt_genid. 455 * many times (2^24) without giving recent rt_genid.
@@ -938,7 +461,6 @@ static void rt_cache_invalidate(struct net *net)
938 461
939 get_random_bytes(&shuffle, sizeof(shuffle)); 462 get_random_bytes(&shuffle, sizeof(shuffle));
940 atomic_add(shuffle + 1U, &net->ipv4.rt_genid); 463 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941 inetpeer_invalidate_tree(AF_INET);
942} 464}
943 465
944/* 466/*
@@ -948,183 +470,22 @@ static void rt_cache_invalidate(struct net *net)
948void rt_cache_flush(struct net *net, int delay) 470void rt_cache_flush(struct net *net, int delay)
949{ 471{
950 rt_cache_invalidate(net); 472 rt_cache_invalidate(net);
951 if (delay >= 0)
952 rt_do_flush(net, !in_softirq());
953}
954
955/* Flush previous cache invalidated entries from the cache */
956void rt_cache_flush_batch(struct net *net)
957{
958 rt_do_flush(net, !in_softirq());
959}
960
961static void rt_emergency_hash_rebuild(struct net *net)
962{
963 net_warn_ratelimited("Route hash chain too long!\n");
964 rt_cache_invalidate(net);
965}
966
967/*
968 Short description of GC goals.
969
970 We want to build algorithm, which will keep routing cache
971 at some equilibrium point, when number of aged off entries
972 is kept approximately equal to newly generated ones.
973
974 Current expiration strength is variable "expire".
975 We try to adjust it dynamically, so that if networking
976 is idle expires is large enough to keep enough of warm entries,
977 and when load increases it reduces to limit cache size.
978 */
979
980static int rt_garbage_collect(struct dst_ops *ops)
981{
982 static unsigned long expire = RT_GC_TIMEOUT;
983 static unsigned long last_gc;
984 static int rover;
985 static int equilibrium;
986 struct rtable *rth;
987 struct rtable __rcu **rthp;
988 unsigned long now = jiffies;
989 int goal;
990 int entries = dst_entries_get_fast(&ipv4_dst_ops);
991
992 /*
993 * Garbage collection is pretty expensive,
994 * do not make it too frequently.
995 */
996
997 RT_CACHE_STAT_INC(gc_total);
998
999 if (now - last_gc < ip_rt_gc_min_interval &&
1000 entries < ip_rt_max_size) {
1001 RT_CACHE_STAT_INC(gc_ignored);
1002 goto out;
1003 }
1004
1005 entries = dst_entries_get_slow(&ipv4_dst_ops);
1006 /* Calculate number of entries, which we want to expire now. */
1007 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008 if (goal <= 0) {
1009 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 equilibrium = ipv4_dst_ops.gc_thresh;
1011 goal = entries - equilibrium;
1012 if (goal > 0) {
1013 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014 goal = entries - equilibrium;
1015 }
1016 } else {
1017 /* We are in dangerous area. Try to reduce cache really
1018 * aggressively.
1019 */
1020 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021 equilibrium = entries - goal;
1022 }
1023
1024 if (now - last_gc >= ip_rt_gc_min_interval)
1025 last_gc = now;
1026
1027 if (goal <= 0) {
1028 equilibrium += goal;
1029 goto work_done;
1030 }
1031
1032 do {
1033 int i, k;
1034
1035 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 unsigned long tmo = expire;
1037
1038 k = (k + 1) & rt_hash_mask;
1039 rthp = &rt_hash_table[k].chain;
1040 spin_lock_bh(rt_hash_lock_addr(k));
1041 while ((rth = rcu_dereference_protected(*rthp,
1042 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043 if (!rt_is_expired(rth) &&
1044 !rt_may_expire(rth, tmo, expire)) {
1045 tmo >>= 1;
1046 rthp = &rth->dst.rt_next;
1047 continue;
1048 }
1049 *rthp = rth->dst.rt_next;
1050 rt_free(rth);
1051 goal--;
1052 }
1053 spin_unlock_bh(rt_hash_lock_addr(k));
1054 if (goal <= 0)
1055 break;
1056 }
1057 rover = k;
1058
1059 if (goal <= 0)
1060 goto work_done;
1061
1062 /* Goal is not achieved. We stop process if:
1063
1064 - if expire reduced to zero. Otherwise, expire is halfed.
1065 - if table is not full.
1066 - if we are called from interrupt.
1067 - jiffies check is just fallback/debug loop breaker.
1068 We will not spin here for long time in any case.
1069 */
1070
1071 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073 if (expire == 0)
1074 break;
1075
1076 expire >>= 1;
1077
1078 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079 goto out;
1080 } while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083 goto out;
1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085 goto out;
1086 net_warn_ratelimited("dst cache overflow\n");
1087 RT_CACHE_STAT_INC(gc_dst_overflow);
1088 return 1;
1089
1090work_done:
1091 expire += ip_rt_gc_min_interval;
1092 if (expire > ip_rt_gc_timeout ||
1093 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095 expire = ip_rt_gc_timeout;
1096out: return 0;
1097}
1098
1099/*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102static int slow_chain_length(const struct rtable *head)
1103{
1104 int length = 0;
1105 const struct rtable *rth = head;
1106
1107 while (rth) {
1108 length += has_noalias(head, rth);
1109 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110 }
1111 return length >> FRACT_BITS;
1112} 473}
1113 474
1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) 475static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
476 struct sk_buff *skb,
477 const void *daddr)
1115{ 478{
1116 static const __be32 inaddr_any = 0;
1117 struct net_device *dev = dst->dev; 479 struct net_device *dev = dst->dev;
1118 const __be32 *pkey = daddr; 480 const __be32 *pkey = daddr;
1119 const struct rtable *rt; 481 const struct rtable *rt;
1120 struct neighbour *n; 482 struct neighbour *n;
1121 483
1122 rt = (const struct rtable *) dst; 484 rt = (const struct rtable *) dst;
1123 485 if (rt->rt_gateway)
1124 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125 pkey = &inaddr_any;
1126 else if (rt->rt_gateway)
1127 pkey = (const __be32 *) &rt->rt_gateway; 486 pkey = (const __be32 *) &rt->rt_gateway;
487 else if (skb)
488 pkey = &ip_hdr(skb)->daddr;
1128 489
1129 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); 490 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130 if (n) 491 if (n)
@@ -1132,212 +493,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
1132 return neigh_create(&arp_tbl, pkey, dev); 493 return neigh_create(&arp_tbl, pkey, dev);
1133} 494}
1134 495
1135static int rt_bind_neighbour(struct rtable *rt)
1136{
1137 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138 if (IS_ERR(n))
1139 return PTR_ERR(n);
1140 dst_set_neighbour(&rt->dst, n);
1141
1142 return 0;
1143}
1144
1145static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1146 struct sk_buff *skb, int ifindex)
1147{
1148 struct rtable *rth, *cand;
1149 struct rtable __rcu **rthp, **candp;
1150 unsigned long now;
1151 u32 min_score;
1152 int chain_length;
1153 int attempts = !in_softirq();
1154
1155restart:
1156 chain_length = 0;
1157 min_score = ~(u32)0;
1158 cand = NULL;
1159 candp = NULL;
1160 now = jiffies;
1161
1162 if (!rt_caching(dev_net(rt->dst.dev))) {
1163 /*
1164 * If we're not caching, just tell the caller we
1165 * were successful and don't touch the route. The
1166 * caller hold the sole reference to the cache entry, and
1167 * it will be released when the caller is done with it.
1168 * If we drop it here, the callers have no way to resolve routes
1169 * when we're not caching. Instead, just point *rp at rt, so
1170 * the caller gets a single use out of the route
1171 * Note that we do rt_free on this new route entry, so that
1172 * once its refcount hits zero, we are still able to reap it
1173 * (Thanks Alexey)
1174 * Note: To avoid expensive rcu stuff for this uncached dst,
1175 * we set DST_NOCACHE so that dst_release() can free dst without
1176 * waiting a grace period.
1177 */
1178
1179 rt->dst.flags |= DST_NOCACHE;
1180 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181 int err = rt_bind_neighbour(rt);
1182 if (err) {
1183 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1184 ip_rt_put(rt);
1185 return ERR_PTR(err);
1186 }
1187 }
1188
1189 goto skip_hashing;
1190 }
1191
1192 rthp = &rt_hash_table[hash].chain;
1193
1194 spin_lock_bh(rt_hash_lock_addr(hash));
1195 while ((rth = rcu_dereference_protected(*rthp,
1196 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1197 if (rt_is_expired(rth)) {
1198 *rthp = rth->dst.rt_next;
1199 rt_free(rth);
1200 continue;
1201 }
1202 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1203 /* Put it first */
1204 *rthp = rth->dst.rt_next;
1205 /*
1206 * Since lookup is lockfree, the deletion
1207 * must be visible to another weakly ordered CPU before
1208 * the insertion at the start of the hash chain.
1209 */
1210 rcu_assign_pointer(rth->dst.rt_next,
1211 rt_hash_table[hash].chain);
1212 /*
1213 * Since lookup is lockfree, the update writes
1214 * must be ordered for consistency on SMP.
1215 */
1216 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1217
1218 dst_use(&rth->dst, now);
1219 spin_unlock_bh(rt_hash_lock_addr(hash));
1220
1221 rt_drop(rt);
1222 if (skb)
1223 skb_dst_set(skb, &rth->dst);
1224 return rth;
1225 }
1226
1227 if (!atomic_read(&rth->dst.__refcnt)) {
1228 u32 score = rt_score(rth);
1229
1230 if (score <= min_score) {
1231 cand = rth;
1232 candp = rthp;
1233 min_score = score;
1234 }
1235 }
1236
1237 chain_length++;
1238
1239 rthp = &rth->dst.rt_next;
1240 }
1241
1242 if (cand) {
1243 /* ip_rt_gc_elasticity used to be average length of chain
1244 * length, when exceeded gc becomes really aggressive.
1245 *
1246 * The second limit is less certain. At the moment it allows
1247 * only 2 entries per bucket. We will see.
1248 */
1249 if (chain_length > ip_rt_gc_elasticity) {
1250 *candp = cand->dst.rt_next;
1251 rt_free(cand);
1252 }
1253 } else {
1254 if (chain_length > rt_chain_length_max &&
1255 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1256 struct net *net = dev_net(rt->dst.dev);
1257 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1258 if (!rt_caching(net)) {
1259 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1260 rt->dst.dev->name, num);
1261 }
1262 rt_emergency_hash_rebuild(net);
1263 spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1266 ifindex, rt_genid(net));
1267 goto restart;
1268 }
1269 }
1270
1271 /* Try to bind route to arp only if it is output
1272 route or unicast forwarding path.
1273 */
1274 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1275 int err = rt_bind_neighbour(rt);
1276 if (err) {
1277 spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279 if (err != -ENOBUFS) {
1280 rt_drop(rt);
1281 return ERR_PTR(err);
1282 }
1283
1284 /* Neighbour tables are full and nothing
1285 can be released. Try to shrink route cache,
1286 it is most likely it holds some neighbour records.
1287 */
1288 if (attempts-- > 0) {
1289 int saved_elasticity = ip_rt_gc_elasticity;
1290 int saved_int = ip_rt_gc_min_interval;
1291 ip_rt_gc_elasticity = 1;
1292 ip_rt_gc_min_interval = 0;
1293 rt_garbage_collect(&ipv4_dst_ops);
1294 ip_rt_gc_min_interval = saved_int;
1295 ip_rt_gc_elasticity = saved_elasticity;
1296 goto restart;
1297 }
1298
1299 net_warn_ratelimited("Neighbour table overflow\n");
1300 rt_drop(rt);
1301 return ERR_PTR(-ENOBUFS);
1302 }
1303 }
1304
1305 rt->dst.rt_next = rt_hash_table[hash].chain;
1306
1307 /*
1308 * Since lookup is lockfree, we must make sure
1309 * previous writes to rt are committed to memory
1310 * before making rt visible to other CPUS.
1311 */
1312 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1313
1314 spin_unlock_bh(rt_hash_lock_addr(hash));
1315
1316skip_hashing:
1317 if (skb)
1318 skb_dst_set(skb, &rt->dst);
1319 return rt;
1320}
1321
1322static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
1324static u32 rt_peer_genid(void)
1325{
1326 return atomic_read(&__rt_peer_genid);
1327}
1328
1329void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330{
1331 struct inet_peer *peer;
1332
1333 peer = inet_getpeer_v4(daddr, create);
1334
1335 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1336 inet_putpeer(peer);
1337 else
1338 rt->rt_peer_genid = rt_peer_genid();
1339}
1340
1341/* 496/*
1342 * Peer allocation may fail only in serious out-of-memory conditions. However 497 * Peer allocation may fail only in serious out-of-memory conditions. However
1343 * we still can generate some output. 498 * we still can generate some output.
@@ -1360,83 +515,188 @@ static void ip_select_fb_ident(struct iphdr *iph)
1360 515
1361void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 516void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1362{ 517{
1363 struct rtable *rt = (struct rtable *) dst; 518 struct net *net = dev_net(dst->dev);
1364 519 struct inet_peer *peer;
1365 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1366 if (rt->peer == NULL)
1367 rt_bind_peer(rt, rt->rt_dst, 1);
1368 520
1369 /* If peer is attached to destination, it is never detached, 521 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1370 so that we need not to grab a lock to dereference it. 522 if (peer) {
1371 */ 523 iph->id = htons(inet_getid(peer, more));
1372 if (rt->peer) { 524 inet_putpeer(peer);
1373 iph->id = htons(inet_getid(rt->peer, more)); 525 return;
1374 return; 526 }
1375 }
1376 } else if (!rt)
1377 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1378 527
1379 ip_select_fb_ident(iph); 528 ip_select_fb_ident(iph);
1380} 529}
1381EXPORT_SYMBOL(__ip_select_ident); 530EXPORT_SYMBOL(__ip_select_ident);
1382 531
1383static void rt_del(unsigned int hash, struct rtable *rt) 532static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
533 const struct iphdr *iph,
534 int oif, u8 tos,
535 u8 prot, u32 mark, int flow_flags)
1384{ 536{
1385 struct rtable __rcu **rthp; 537 if (sk) {
1386 struct rtable *aux; 538 const struct inet_sock *inet = inet_sk(sk);
1387 539
1388 rthp = &rt_hash_table[hash].chain; 540 oif = sk->sk_bound_dev_if;
1389 spin_lock_bh(rt_hash_lock_addr(hash)); 541 mark = sk->sk_mark;
1390 ip_rt_put(rt); 542 tos = RT_CONN_FLAGS(sk);
1391 while ((aux = rcu_dereference_protected(*rthp, 543 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1392 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393 if (aux == rt || rt_is_expired(aux)) {
1394 *rthp = aux->dst.rt_next;
1395 rt_free(aux);
1396 continue;
1397 }
1398 rthp = &aux->dst.rt_next;
1399 } 544 }
1400 spin_unlock_bh(rt_hash_lock_addr(hash)); 545 flowi4_init_output(fl4, oif, mark, tos,
546 RT_SCOPE_UNIVERSE, prot,
547 flow_flags,
548 iph->daddr, iph->saddr, 0, 0);
1401} 549}
1402 550
1403static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) 551static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
552 const struct sock *sk)
1404{ 553{
1405 struct rtable *rt = (struct rtable *) dst; 554 const struct iphdr *iph = ip_hdr(skb);
1406 __be32 orig_gw = rt->rt_gateway; 555 int oif = skb->dev->ifindex;
1407 struct neighbour *n, *old_n; 556 u8 tos = RT_TOS(iph->tos);
557 u8 prot = iph->protocol;
558 u32 mark = skb->mark;
1408 559
1409 dst_confirm(&rt->dst); 560 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
561}
1410 562
1411 rt->rt_gateway = peer->redirect_learned.a4; 563static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564{
565 const struct inet_sock *inet = inet_sk(sk);
566 const struct ip_options_rcu *inet_opt;
567 __be32 daddr = inet->inet_daddr;
1412 568
1413 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); 569 rcu_read_lock();
1414 if (IS_ERR(n)) { 570 inet_opt = rcu_dereference(inet->inet_opt);
1415 rt->rt_gateway = orig_gw; 571 if (inet_opt && inet_opt->opt.srr)
1416 return; 572 daddr = inet_opt->opt.faddr;
573 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 inet_sk_flowi_flags(sk),
577 daddr, inet->inet_saddr, 0, 0);
578 rcu_read_unlock();
579}
580
581static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 const struct sk_buff *skb)
583{
584 if (skb)
585 build_skb_flow_key(fl4, skb, sk);
586 else
587 build_sk_flow_key(fl4, sk);
588}
589
590static DEFINE_SEQLOCK(fnhe_seqlock);
591
592static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
593{
594 struct fib_nh_exception *fnhe, *oldest;
595
596 oldest = rcu_dereference(hash->chain);
597 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
598 fnhe = rcu_dereference(fnhe->fnhe_next)) {
599 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
600 oldest = fnhe;
1417 } 601 }
1418 old_n = xchg(&rt->dst._neighbour, n); 602 return oldest;
1419 if (old_n) 603}
1420 neigh_release(old_n); 604
1421 if (!(n->nud_state & NUD_VALID)) { 605static inline u32 fnhe_hashfun(__be32 daddr)
1422 neigh_event_send(n, NULL); 606{
607 u32 hval;
608
609 hval = (__force u32) daddr;
610 hval ^= (hval >> 11) ^ (hval >> 22);
611
612 return hval & (FNHE_HASH_SIZE - 1);
613}
614
615static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
616 u32 pmtu, unsigned long expires)
617{
618 struct fnhe_hash_bucket *hash;
619 struct fib_nh_exception *fnhe;
620 int depth;
621 u32 hval = fnhe_hashfun(daddr);
622
623 write_seqlock_bh(&fnhe_seqlock);
624
625 hash = nh->nh_exceptions;
626 if (!hash) {
627 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
628 if (!hash)
629 goto out_unlock;
630 nh->nh_exceptions = hash;
631 }
632
633 hash += hval;
634
635 depth = 0;
636 for (fnhe = rcu_dereference(hash->chain); fnhe;
637 fnhe = rcu_dereference(fnhe->fnhe_next)) {
638 if (fnhe->fnhe_daddr == daddr)
639 break;
640 depth++;
641 }
642
643 if (fnhe) {
644 if (gw)
645 fnhe->fnhe_gw = gw;
646 if (pmtu) {
647 fnhe->fnhe_pmtu = pmtu;
648 fnhe->fnhe_expires = expires;
649 }
1423 } else { 650 } else {
1424 rt->rt_flags |= RTCF_REDIRECTED; 651 if (depth > FNHE_RECLAIM_DEPTH)
1425 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); 652 fnhe = fnhe_oldest(hash);
653 else {
654 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
655 if (!fnhe)
656 goto out_unlock;
657
658 fnhe->fnhe_next = hash->chain;
659 rcu_assign_pointer(hash->chain, fnhe);
660 }
661 fnhe->fnhe_daddr = daddr;
662 fnhe->fnhe_gw = gw;
663 fnhe->fnhe_pmtu = pmtu;
664 fnhe->fnhe_expires = expires;
1426 } 665 }
666
667 fnhe->fnhe_stamp = jiffies;
668
669out_unlock:
670 write_sequnlock_bh(&fnhe_seqlock);
671 return;
1427} 672}
1428 673
1429/* called in rcu_read_lock() section */ 674static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
1430void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 675 bool kill_route)
1431 __be32 saddr, struct net_device *dev)
1432{ 676{
1433 int s, i; 677 __be32 new_gw = icmp_hdr(skb)->un.gateway;
1434 struct in_device *in_dev = __in_dev_get_rcu(dev); 678 __be32 old_gw = ip_hdr(skb)->saddr;
1435 __be32 skeys[2] = { saddr, 0 }; 679 struct net_device *dev = skb->dev;
1436 int ikeys[2] = { dev->ifindex, 0 }; 680 struct in_device *in_dev;
1437 struct inet_peer *peer; 681 struct fib_result res;
682 struct neighbour *n;
1438 struct net *net; 683 struct net *net;
1439 684
685 switch (icmp_hdr(skb)->code & 7) {
686 case ICMP_REDIR_NET:
687 case ICMP_REDIR_NETTOS:
688 case ICMP_REDIR_HOST:
689 case ICMP_REDIR_HOSTTOS:
690 break;
691
692 default:
693 return;
694 }
695
696 if (rt->rt_gateway != old_gw)
697 return;
698
699 in_dev = __in_dev_get_rcu(dev);
1440 if (!in_dev) 700 if (!in_dev)
1441 return; 701 return;
1442 702
@@ -1456,72 +716,50 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1456 goto reject_redirect; 716 goto reject_redirect;
1457 } 717 }
1458 718
1459 for (s = 0; s < 2; s++) { 719 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1460 for (i = 0; i < 2; i++) { 720 if (n) {
1461 unsigned int hash; 721 if (!(n->nud_state & NUD_VALID)) {
1462 struct rtable __rcu **rthp; 722 neigh_event_send(n, NULL);
1463 struct rtable *rt; 723 } else {
1464 724 if (fib_lookup(net, fl4, &res) == 0) {
1465 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); 725 struct fib_nh *nh = &FIB_RES_NH(res);
1466 726
1467 rthp = &rt_hash_table[hash].chain; 727 update_or_create_fnhe(nh, fl4->daddr, new_gw,
1468 728 0, 0);
1469 while ((rt = rcu_dereference(*rthp)) != NULL) {
1470 rthp = &rt->dst.rt_next;
1471
1472 if (rt->rt_key_dst != daddr ||
1473 rt->rt_key_src != skeys[s] ||
1474 rt->rt_oif != ikeys[i] ||
1475 rt_is_input_route(rt) ||
1476 rt_is_expired(rt) ||
1477 !net_eq(dev_net(rt->dst.dev), net) ||
1478 rt->dst.error ||
1479 rt->dst.dev != dev ||
1480 rt->rt_gateway != old_gw)
1481 continue;
1482
1483 if (!rt->peer)
1484 rt_bind_peer(rt, rt->rt_dst, 1);
1485
1486 peer = rt->peer;
1487 if (peer) {
1488 if (peer->redirect_learned.a4 != new_gw) {
1489 peer->redirect_learned.a4 = new_gw;
1490 atomic_inc(&__rt_peer_genid);
1491 }
1492 check_peer_redir(&rt->dst, peer);
1493 }
1494 } 729 }
730 if (kill_route)
731 rt->dst.obsolete = DST_OBSOLETE_KILL;
732 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1495 } 733 }
734 neigh_release(n);
1496 } 735 }
1497 return; 736 return;
1498 737
1499reject_redirect: 738reject_redirect:
1500#ifdef CONFIG_IP_ROUTE_VERBOSE 739#ifdef CONFIG_IP_ROUTE_VERBOSE
1501 if (IN_DEV_LOG_MARTIANS(in_dev)) 740 if (IN_DEV_LOG_MARTIANS(in_dev)) {
741 const struct iphdr *iph = (const struct iphdr *) skb->data;
742 __be32 daddr = iph->daddr;
743 __be32 saddr = iph->saddr;
744
1502 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" 745 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1503 " Advised path = %pI4 -> %pI4\n", 746 " Advised path = %pI4 -> %pI4\n",
1504 &old_gw, dev->name, &new_gw, 747 &old_gw, dev->name, &new_gw,
1505 &saddr, &daddr); 748 &saddr, &daddr);
749 }
1506#endif 750#endif
1507 ; 751 ;
1508} 752}
1509 753
1510static bool peer_pmtu_expired(struct inet_peer *peer) 754static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1511{ 755{
1512 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 756 struct rtable *rt;
757 struct flowi4 fl4;
1513 758
1514 return orig && 759 rt = (struct rtable *) dst;
1515 time_after_eq(jiffies, orig) &&
1516 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1517}
1518 760
1519static bool peer_pmtu_cleaned(struct inet_peer *peer) 761 ip_rt_build_flow_key(&fl4, sk, skb);
1520{ 762 __ip_do_redirect(rt, skb, &fl4, true);
1521 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522
1523 return orig &&
1524 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525} 763}
1526 764
1527static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 765static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1533,14 +771,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1533 if (dst->obsolete > 0) { 771 if (dst->obsolete > 0) {
1534 ip_rt_put(rt); 772 ip_rt_put(rt);
1535 ret = NULL; 773 ret = NULL;
1536 } else if (rt->rt_flags & RTCF_REDIRECTED) { 774 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1537 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 775 rt->dst.expires) {
1538 rt->rt_oif, 776 ip_rt_put(rt);
1539 rt_genid(dev_net(dst->dev)));
1540 rt_del(hash, rt);
1541 ret = NULL; 777 ret = NULL;
1542 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1543 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1544 } 778 }
1545 } 779 }
1546 return ret; 780 return ret;
@@ -1567,6 +801,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1567 struct rtable *rt = skb_rtable(skb); 801 struct rtable *rt = skb_rtable(skb);
1568 struct in_device *in_dev; 802 struct in_device *in_dev;
1569 struct inet_peer *peer; 803 struct inet_peer *peer;
804 struct net *net;
1570 int log_martians; 805 int log_martians;
1571 806
1572 rcu_read_lock(); 807 rcu_read_lock();
@@ -1578,9 +813,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1578 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 813 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579 rcu_read_unlock(); 814 rcu_read_unlock();
1580 815
1581 if (!rt->peer) 816 net = dev_net(rt->dst.dev);
1582 rt_bind_peer(rt, rt->rt_dst, 1); 817 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1583 peer = rt->peer;
1584 if (!peer) { 818 if (!peer) {
1585 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 819 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1586 return; 820 return;
@@ -1597,7 +831,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1597 */ 831 */
1598 if (peer->rate_tokens >= ip_rt_redirect_number) { 832 if (peer->rate_tokens >= ip_rt_redirect_number) {
1599 peer->rate_last = jiffies; 833 peer->rate_last = jiffies;
1600 return; 834 goto out_put_peer;
1601 } 835 }
1602 836
1603 /* Check for load limit; set rate_last to the latest sent 837 /* Check for load limit; set rate_last to the latest sent
@@ -1614,20 +848,38 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1614 if (log_martians && 848 if (log_martians &&
1615 peer->rate_tokens == ip_rt_redirect_number) 849 peer->rate_tokens == ip_rt_redirect_number)
1616 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", 850 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1617 &ip_hdr(skb)->saddr, rt->rt_iif, 851 &ip_hdr(skb)->saddr, inet_iif(skb),
1618 &rt->rt_dst, &rt->rt_gateway); 852 &ip_hdr(skb)->daddr, &rt->rt_gateway);
1619#endif 853#endif
1620 } 854 }
855out_put_peer:
856 inet_putpeer(peer);
1621} 857}
1622 858
1623static int ip_error(struct sk_buff *skb) 859static int ip_error(struct sk_buff *skb)
1624{ 860{
861 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1625 struct rtable *rt = skb_rtable(skb); 862 struct rtable *rt = skb_rtable(skb);
1626 struct inet_peer *peer; 863 struct inet_peer *peer;
1627 unsigned long now; 864 unsigned long now;
865 struct net *net;
1628 bool send; 866 bool send;
1629 int code; 867 int code;
1630 868
869 net = dev_net(rt->dst.dev);
870 if (!IN_DEV_FORWARD(in_dev)) {
871 switch (rt->dst.error) {
872 case EHOSTUNREACH:
873 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
874 break;
875
876 case ENETUNREACH:
877 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
878 break;
879 }
880 goto out;
881 }
882
1631 switch (rt->dst.error) { 883 switch (rt->dst.error) {
1632 case EINVAL: 884 case EINVAL:
1633 default: 885 default:
@@ -1637,17 +889,14 @@ static int ip_error(struct sk_buff *skb)
1637 break; 889 break;
1638 case ENETUNREACH: 890 case ENETUNREACH:
1639 code = ICMP_NET_UNREACH; 891 code = ICMP_NET_UNREACH;
1640 IP_INC_STATS_BH(dev_net(rt->dst.dev), 892 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1641 IPSTATS_MIB_INNOROUTES);
1642 break; 893 break;
1643 case EACCES: 894 case EACCES:
1644 code = ICMP_PKT_FILTERED; 895 code = ICMP_PKT_FILTERED;
1645 break; 896 break;
1646 } 897 }
1647 898
1648 if (!rt->peer) 899 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1649 rt_bind_peer(rt, rt->rt_dst, 1);
1650 peer = rt->peer;
1651 900
1652 send = true; 901 send = true;
1653 if (peer) { 902 if (peer) {
@@ -1660,6 +909,7 @@ static int ip_error(struct sk_buff *skb)
1660 peer->rate_tokens -= ip_rt_error_cost; 909 peer->rate_tokens -= ip_rt_error_cost;
1661 else 910 else
1662 send = false; 911 send = false;
912 inet_putpeer(peer);
1663 } 913 }
1664 if (send) 914 if (send)
1665 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 915 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -1668,163 +918,120 @@ out: kfree_skb(skb);
1668 return 0; 918 return 0;
1669} 919}
1670 920
1671/* 921static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1672 * The last two values are not from the RFC but 922{
1673 * are needed for AMPRnet AX.25 paths. 923 struct fib_result res;
1674 */
1675 924
1676static const unsigned short mtu_plateau[] = 925 if (mtu < ip_rt_min_pmtu)
1677{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 926 mtu = ip_rt_min_pmtu;
1678 927
1679static inline unsigned short guess_mtu(unsigned short old_mtu) 928 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1680{ 929 struct fib_nh *nh = &FIB_RES_NH(res);
1681 int i;
1682 930
1683 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) 931 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1684 if (old_mtu > mtu_plateau[i]) 932 jiffies + ip_rt_mtu_expires);
1685 return mtu_plateau[i]; 933 }
1686 return 68; 934 return mtu;
1687} 935}
1688 936
1689unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, 937static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1690 unsigned short new_mtu, 938 struct sk_buff *skb, u32 mtu)
1691 struct net_device *dev)
1692{ 939{
1693 unsigned short old_mtu = ntohs(iph->tot_len); 940 struct rtable *rt = (struct rtable *) dst;
1694 unsigned short est_mtu = 0; 941 struct flowi4 fl4;
1695 struct inet_peer *peer;
1696
1697 peer = inet_getpeer_v4(iph->daddr, 1);
1698 if (peer) {
1699 unsigned short mtu = new_mtu;
1700
1701 if (new_mtu < 68 || new_mtu >= old_mtu) {
1702 /* BSD 4.2 derived systems incorrectly adjust
1703 * tot_len by the IP header length, and report
1704 * a zero MTU in the ICMP message.
1705 */
1706 if (mtu == 0 &&
1707 old_mtu >= 68 + (iph->ihl << 2))
1708 old_mtu -= iph->ihl << 2;
1709 mtu = guess_mtu(old_mtu);
1710 }
1711
1712 if (mtu < ip_rt_min_pmtu)
1713 mtu = ip_rt_min_pmtu;
1714 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715 unsigned long pmtu_expires;
1716
1717 pmtu_expires = jiffies + ip_rt_mtu_expires;
1718 if (!pmtu_expires)
1719 pmtu_expires = 1UL;
1720 942
1721 est_mtu = mtu; 943 ip_rt_build_flow_key(&fl4, sk, skb);
1722 peer->pmtu_learned = mtu; 944 mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
1723 peer->pmtu_expires = pmtu_expires;
1724 atomic_inc(&__rt_peer_genid);
1725 }
1726 945
1727 inet_putpeer(peer); 946 if (!rt->rt_pmtu) {
947 dst->obsolete = DST_OBSOLETE_KILL;
948 } else {
949 rt->rt_pmtu = mtu;
950 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1728 } 951 }
1729 return est_mtu ? : new_mtu;
1730} 952}
1731 953
1732static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) 954void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
955 int oif, u32 mark, u8 protocol, int flow_flags)
1733{ 956{
1734 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); 957 const struct iphdr *iph = (const struct iphdr *) skb->data;
958 struct flowi4 fl4;
959 struct rtable *rt;
1735 960
1736 if (!expires) 961 __build_flow_key(&fl4, NULL, iph, oif,
1737 return; 962 RT_TOS(iph->tos), protocol, mark, flow_flags);
1738 if (time_before(jiffies, expires)) { 963 rt = __ip_route_output_key(net, &fl4);
1739 u32 orig_dst_mtu = dst_mtu(dst); 964 if (!IS_ERR(rt)) {
1740 if (peer->pmtu_learned < orig_dst_mtu) { 965 __ip_rt_update_pmtu(rt, &fl4, mtu);
1741 if (!peer->pmtu_orig) 966 ip_rt_put(rt);
1742 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); 967 }
1743 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744 }
1745 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747} 968}
969EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1748 970
1749static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 971void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1750{ 972{
1751 struct rtable *rt = (struct rtable *) dst; 973 const struct iphdr *iph = (const struct iphdr *) skb->data;
1752 struct inet_peer *peer; 974 struct flowi4 fl4;
1753 975 struct rtable *rt;
1754 dst_confirm(dst);
1755
1756 if (!rt->peer)
1757 rt_bind_peer(rt, rt->rt_dst, 1);
1758 peer = rt->peer;
1759 if (peer) {
1760 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762 if (mtu < ip_rt_min_pmtu)
1763 mtu = ip_rt_min_pmtu;
1764 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765
1766 pmtu_expires = jiffies + ip_rt_mtu_expires;
1767 if (!pmtu_expires)
1768 pmtu_expires = 1UL;
1769
1770 peer->pmtu_learned = mtu;
1771 peer->pmtu_expires = pmtu_expires;
1772 976
1773 atomic_inc(&__rt_peer_genid); 977 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1774 rt->rt_peer_genid = rt_peer_genid(); 978 rt = __ip_route_output_key(sock_net(sk), &fl4);
1775 } 979 if (!IS_ERR(rt)) {
1776 check_peer_pmtu(dst, peer); 980 __ip_rt_update_pmtu(rt, &fl4, mtu);
981 ip_rt_put(rt);
1777 } 982 }
1778} 983}
984EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1779 985
1780 986void ipv4_redirect(struct sk_buff *skb, struct net *net,
1781static void ipv4_validate_peer(struct rtable *rt) 987 int oif, u32 mark, u8 protocol, int flow_flags)
1782{ 988{
1783 if (rt->rt_peer_genid != rt_peer_genid()) { 989 const struct iphdr *iph = (const struct iphdr *) skb->data;
1784 struct inet_peer *peer; 990 struct flowi4 fl4;
1785 991 struct rtable *rt;
1786 if (!rt->peer)
1787 rt_bind_peer(rt, rt->rt_dst, 0);
1788 992
1789 peer = rt->peer; 993 __build_flow_key(&fl4, NULL, iph, oif,
1790 if (peer) { 994 RT_TOS(iph->tos), protocol, mark, flow_flags);
1791 check_peer_pmtu(&rt->dst, peer); 995 rt = __ip_route_output_key(net, &fl4);
996 if (!IS_ERR(rt)) {
997 __ip_do_redirect(rt, skb, &fl4, false);
998 ip_rt_put(rt);
999 }
1000}
1001EXPORT_SYMBOL_GPL(ipv4_redirect);
1792 1002
1793 if (peer->redirect_learned.a4 && 1003void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1794 peer->redirect_learned.a4 != rt->rt_gateway) 1004{
1795 check_peer_redir(&rt->dst, peer); 1005 const struct iphdr *iph = (const struct iphdr *) skb->data;
1796 } 1006 struct flowi4 fl4;
1007 struct rtable *rt;
1797 1008
1798 rt->rt_peer_genid = rt_peer_genid(); 1009 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1010 rt = __ip_route_output_key(sock_net(sk), &fl4);
1011 if (!IS_ERR(rt)) {
1012 __ip_do_redirect(rt, skb, &fl4, false);
1013 ip_rt_put(rt);
1799 } 1014 }
1800} 1015}
1016EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1801 1017
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1018static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{ 1019{
1804 struct rtable *rt = (struct rtable *) dst; 1020 struct rtable *rt = (struct rtable *) dst;
1805 1021
1806 if (rt_is_expired(rt)) 1022 /* All IPV4 dsts are created with ->obsolete set to the value
1023 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1024 * into this function always.
1025 *
1026 * When a PMTU/redirect information update invalidates a
1027 * route, this is indicated by setting obsolete to
1028 * DST_OBSOLETE_KILL.
1029 */
1030 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1807 return NULL; 1031 return NULL;
1808 ipv4_validate_peer(rt);
1809 return dst; 1032 return dst;
1810} 1033}
1811 1034
1812static void ipv4_dst_destroy(struct dst_entry *dst)
1813{
1814 struct rtable *rt = (struct rtable *) dst;
1815 struct inet_peer *peer = rt->peer;
1816
1817 if (rt->fi) {
1818 fib_info_put(rt->fi);
1819 rt->fi = NULL;
1820 }
1821 if (peer) {
1822 rt->peer = NULL;
1823 inet_putpeer(peer);
1824 }
1825}
1826
1827
1828static void ipv4_link_failure(struct sk_buff *skb) 1035static void ipv4_link_failure(struct sk_buff *skb)
1829{ 1036{
1830 struct rtable *rt; 1037 struct rtable *rt;
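
ipv4_update_pmtu(), ipv4_sk_update_pmtu(), ipv4_redirect() and ipv4_sk_redirect() in the hunk above are the new exported entry points for feeding ICMP-learned PMTU and redirect information into nexthop exceptions. A hedged usage sketch follows; the handler name is hypothetical, and it assumes skb->data points at the offending inner IP header, which is what the helpers themselves expect:

	/* Hypothetical protocol error handler (illustrative, not from this
	 * patch).  On a fragmentation-needed report it hands the advertised
	 * MTU to ipv4_update_pmtu(), which rebuilds the flow from the inner
	 * IP header at skb->data and records a fib_nh_exception. */
	static void example_err_handler(struct sk_buff *skb, struct net *net,
					u32 info, u8 protocol)
	{
		const int type = icmp_hdr(skb)->type;
		const int code = icmp_hdr(skb)->code;

		if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
			/* info carries the next-hop MTU reported by the router */
			ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
		else if (type == ICMP_REDIRECT)
			ipv4_redirect(skb, net, 0, 0, protocol, 0);
	}
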
@@ -1832,8 +1039,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
1832 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1039 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833 1040
1834 rt = skb_rtable(skb); 1041 rt = skb_rtable(skb);
1835 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) 1042 if (rt)
1836 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); 1043 dst_set_expires(&rt->dst, 0);
1837} 1044}
1838 1045
1839static int ip_rt_bug(struct sk_buff *skb) 1046static int ip_rt_bug(struct sk_buff *skb)
@@ -1880,8 +1087,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1880 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1087 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1088 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882 else 1089 else
1883 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1090 src = inet_select_addr(rt->dst.dev,
1884 RT_SCOPE_UNIVERSE); 1091 rt_nexthop(rt, iph->daddr),
1092 RT_SCOPE_UNIVERSE);
1885 rcu_read_unlock(); 1093 rcu_read_unlock();
1886 } 1094 }
1887 memcpy(addr, &src, 4); 1095 memcpy(addr, &src, 4);
@@ -1913,7 +1121,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1913static unsigned int ipv4_mtu(const struct dst_entry *dst) 1121static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914{ 1122{
1915 const struct rtable *rt = (const struct rtable *) dst; 1123 const struct rtable *rt = (const struct rtable *) dst;
1916 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 1124 unsigned int mtu = rt->rt_pmtu;
1125
1126 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1127 mtu = 0;
1128
1129 if (!mtu)
1130 mtu = dst_metric_raw(dst, RTAX_MTU);
1917 1131
1918 if (mtu && rt_is_output_route(rt)) 1132 if (mtu && rt_is_output_route(rt))
1919 return mtu; 1133 return mtu;
@@ -1921,8 +1135,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1921 mtu = dst->dev->mtu; 1135 mtu = dst->dev->mtu;
1922 1136
1923 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1137 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924 1138 if (rt->rt_gateway && mtu > 576)
1925 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926 mtu = 576; 1139 mtu = 576;
1927 } 1140 }
1928 1141
@@ -1932,76 +1145,121 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1932 return mtu; 1145 return mtu;
1933} 1146}
1934 1147
1935static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1148static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1936 struct fib_info *fi)
1937{ 1149{
1938 struct inet_peer *peer; 1150 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1939 int create = 0; 1151 struct fib_nh_exception *fnhe;
1152 u32 hval;
1940 1153
1941 /* If a peer entry exists for this destination, we must hook 1154 if (!hash)
1942 * it up in order to get at cached metrics. 1155 return NULL;
1943 */
1944 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945 create = 1;
1946 1156
1947 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); 1157 hval = fnhe_hashfun(daddr);
1948 if (peer) { 1158
1949 rt->rt_peer_genid = rt_peer_genid(); 1159 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1950 if (inet_metrics_new(peer)) 1160 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1951 memcpy(peer->metrics, fi->fib_metrics, 1161 if (fnhe->fnhe_daddr == daddr)
1952 sizeof(u32) * RTAX_MAX); 1162 return fnhe;
1953 dst_init_metrics(&rt->dst, peer->metrics, false); 1163 }
1954 1164 return NULL;
1955 check_peer_pmtu(&rt->dst, peer); 1165}
1956 1166
1957 if (peer->redirect_learned.a4 && 1167static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1958 peer->redirect_learned.a4 != rt->rt_gateway) { 1168 __be32 daddr)
1959 rt->rt_gateway = peer->redirect_learned.a4; 1169{
1960 rt->rt_flags |= RTCF_REDIRECTED; 1170 __be32 fnhe_daddr, gw;
1961 } 1171 unsigned long expires;
1962 } else { 1172 unsigned int seq;
1963 if (fi->fib_metrics != (u32 *) dst_default_metrics) { 1173 u32 pmtu;
1964 rt->fi = fi; 1174
1965 atomic_inc(&fi->fib_clntref); 1175restart:
1176 seq = read_seqbegin(&fnhe_seqlock);
1177 fnhe_daddr = fnhe->fnhe_daddr;
1178 gw = fnhe->fnhe_gw;
1179 pmtu = fnhe->fnhe_pmtu;
1180 expires = fnhe->fnhe_expires;
1181 if (read_seqretry(&fnhe_seqlock, seq))
1182 goto restart;
1183
1184 if (daddr != fnhe_daddr)
1185 return;
1186
1187 if (pmtu) {
1188 unsigned long diff = expires - jiffies;
1189
1190 if (time_before(jiffies, expires)) {
1191 rt->rt_pmtu = pmtu;
1192 dst_set_expires(&rt->dst, diff);
1966 } 1193 }
1967 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968 } 1194 }
1195 if (gw) {
1196 rt->rt_flags |= RTCF_REDIRECTED;
1197 rt->rt_gateway = gw;
1198 }
1199 fnhe->fnhe_stamp = jiffies;
1200}
1201
1202static inline void rt_release_rcu(struct rcu_head *head)
1203{
1204 struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
1205 dst_release(dst);
1206}
1207
1208static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1209{
1210 struct rtable *orig, *prev, **p = &nh->nh_rth_output;
1211
1212 if (rt_is_input_route(rt))
1213 p = &nh->nh_rth_input;
1214
1215 orig = *p;
1216
1217 prev = cmpxchg(p, orig, rt);
1218 if (prev == orig) {
1219 dst_clone(&rt->dst);
1220 if (orig)
1221 call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
1222 }
1223}
1224
1225static bool rt_cache_valid(struct rtable *rt)
1226{
1227 return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK);
1969} 1228}
1970 1229
1971static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1230static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1972 const struct fib_result *res, 1231 const struct fib_result *res,
1232 struct fib_nh_exception *fnhe,
1973 struct fib_info *fi, u16 type, u32 itag) 1233 struct fib_info *fi, u16 type, u32 itag)
1974{ 1234{
1975 struct dst_entry *dst = &rt->dst;
1976
1977 if (fi) { 1235 if (fi) {
1978 if (FIB_RES_GW(*res) && 1236 struct fib_nh *nh = &FIB_RES_NH(*res);
1979 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1237
1980 rt->rt_gateway = FIB_RES_GW(*res); 1238 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1981 rt_init_metrics(rt, fl4, fi); 1239 rt->rt_gateway = nh->nh_gw;
1240 if (unlikely(fnhe))
1241 rt_bind_exception(rt, fnhe, daddr);
1242 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1982#ifdef CONFIG_IP_ROUTE_CLASSID 1243#ifdef CONFIG_IP_ROUTE_CLASSID
1983 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1244 rt->dst.tclassid = nh->nh_tclassid;
1984#endif 1245#endif
1246 if (!(rt->dst.flags & DST_HOST))
1247 rt_cache_route(nh, rt);
1985 } 1248 }
1986 1249
1987 if (dst_mtu(dst) > IP_MAX_MTU)
1988 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992#ifdef CONFIG_IP_ROUTE_CLASSID 1250#ifdef CONFIG_IP_ROUTE_CLASSID
1993#ifdef CONFIG_IP_MULTIPLE_TABLES 1251#ifdef CONFIG_IP_MULTIPLE_TABLES
1994 set_class_tag(rt, fib_rules_tclass(res)); 1252 set_class_tag(rt, res->tclassid);
1995#endif 1253#endif
1996 set_class_tag(rt, itag); 1254 set_class_tag(rt, itag);
1997#endif 1255#endif
1998} 1256}
1999 1257
2000static struct rtable *rt_dst_alloc(struct net_device *dev, 1258static struct rtable *rt_dst_alloc(struct net_device *dev,
2001 bool nopolicy, bool noxfrm) 1259 bool nopolicy, bool noxfrm, bool will_cache)
2002{ 1260{
2003 return dst_alloc(&ipv4_dst_ops, dev, 1, -1, 1261 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
2004 DST_HOST | 1262 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
2005 (nopolicy ? DST_NOPOLICY : 0) | 1263 (nopolicy ? DST_NOPOLICY : 0) |
2006 (noxfrm ? DST_NOXFRM : 0)); 1264 (noxfrm ? DST_NOXFRM : 0));
2007} 1265}
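
find_exception() in the hunk above searches a small per-nexthop hash of fib_nh_exception entries keyed by destination; rt_bind_exception() then copies the learned PMTU and gateway into the route under a seqlock. A rough userspace sketch of just the bucket-and-chain lookup, assuming a simple multiplicative hash is close enough to fnhe_hashfun() for illustration (all names below are invented):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define DEMO_HASH_SIZE 16

struct demo_exception {
	struct demo_exception *next;
	uint32_t daddr;
	uint32_t pmtu;
};

static struct demo_exception *table[DEMO_HASH_SIZE];

static unsigned int hash_daddr(uint32_t daddr)
{
	return (daddr * 2654435761u) >> 28;  /* cheap multiplicative hash, 0..15 */
}

static struct demo_exception *demo_find(uint32_t daddr)
{
	struct demo_exception *e;

	for (e = table[hash_daddr(daddr)]; e; e = e->next)
		if (e->daddr == daddr)
			return e;
	return NULL;
}

int main(void)
{
	struct demo_exception *e = calloc(1, sizeof(*e));

	e->daddr = 0x0a000001;
	e->pmtu  = 1400;
	e->next  = table[hash_daddr(e->daddr)];
	table[hash_daddr(e->daddr)] = e;     /* insert at the bucket head */

	printf("pmtu=%u\n", demo_find(0x0a000001)->pmtu);
	return 0;
}
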
@@ -2010,9 +1268,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
2010static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1268static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011 u8 tos, struct net_device *dev, int our) 1269 u8 tos, struct net_device *dev, int our)
2012{ 1270{
2013 unsigned int hash;
2014 struct rtable *rth; 1271 struct rtable *rth;
2015 __be32 spec_dst;
2016 struct in_device *in_dev = __in_dev_get_rcu(dev); 1272 struct in_device *in_dev = __in_dev_get_rcu(dev);
2017 u32 itag = 0; 1273 u32 itag = 0;
2018 int err; 1274 int err;
@@ -2023,21 +1279,24 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2023 return -EINVAL; 1279 return -EINVAL;
2024 1280
2025 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1281 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) 1282 skb->protocol != htons(ETH_P_IP))
2027 goto e_inval; 1283 goto e_inval;
2028 1284
1285 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1286 if (ipv4_is_loopback(saddr))
1287 goto e_inval;
1288
2029 if (ipv4_is_zeronet(saddr)) { 1289 if (ipv4_is_zeronet(saddr)) {
2030 if (!ipv4_is_local_multicast(daddr)) 1290 if (!ipv4_is_local_multicast(daddr))
2031 goto e_inval; 1291 goto e_inval;
2032 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033 } else { 1292 } else {
2034 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, 1293 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2035 &itag); 1294 in_dev, &itag);
2036 if (err < 0) 1295 if (err < 0)
2037 goto e_err; 1296 goto e_err;
2038 } 1297 }
2039 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, 1298 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2040 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1299 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
2041 if (!rth) 1300 if (!rth)
2042 goto e_nobufs; 1301 goto e_nobufs;
2043 1302
@@ -2046,23 +1305,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2046#endif 1305#endif
2047 rth->dst.output = ip_rt_bug; 1306 rth->dst.output = ip_rt_bug;
2048 1307
2049 rth->rt_key_dst = daddr;
2050 rth->rt_key_src = saddr;
2051 rth->rt_genid = rt_genid(dev_net(dev)); 1308 rth->rt_genid = rt_genid(dev_net(dev));
2052 rth->rt_flags = RTCF_MULTICAST; 1309 rth->rt_flags = RTCF_MULTICAST;
2053 rth->rt_type = RTN_MULTICAST; 1310 rth->rt_type = RTN_MULTICAST;
2054 rth->rt_key_tos = tos; 1311 rth->rt_is_input= 1;
2055 rth->rt_dst = daddr; 1312 rth->rt_iif = 0;
2056 rth->rt_src = saddr; 1313 rth->rt_pmtu = 0;
2057 rth->rt_route_iif = dev->ifindex; 1314 rth->rt_gateway = 0;
2058 rth->rt_iif = dev->ifindex;
2059 rth->rt_oif = 0;
2060 rth->rt_mark = skb->mark;
2061 rth->rt_gateway = daddr;
2062 rth->rt_spec_dst= spec_dst;
2063 rth->rt_peer_genid = 0;
2064 rth->peer = NULL;
2065 rth->fi = NULL;
2066 if (our) { 1315 if (our) {
2067 rth->dst.input= ip_local_deliver; 1316 rth->dst.input= ip_local_deliver;
2068 rth->rt_flags |= RTCF_LOCAL; 1317 rth->rt_flags |= RTCF_LOCAL;
@@ -2074,9 +1323,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2074#endif 1323#endif
2075 RT_CACHE_STAT_INC(in_slow_mc); 1324 RT_CACHE_STAT_INC(in_slow_mc);
2076 1325
2077 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1326 skb_dst_set(skb, &rth->dst);
2078 rth = rt_intern_hash(hash, rth, skb, dev->ifindex); 1327 return 0;
2079 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080 1328
2081e_nobufs: 1329e_nobufs:
2082 return -ENOBUFS; 1330 return -ENOBUFS;
@@ -2123,7 +1371,7 @@ static int __mkroute_input(struct sk_buff *skb,
2123 int err; 1371 int err;
2124 struct in_device *out_dev; 1372 struct in_device *out_dev;
2125 unsigned int flags = 0; 1373 unsigned int flags = 0;
2126 __be32 spec_dst; 1374 bool do_cache;
2127 u32 itag; 1375 u32 itag;
2128 1376
2129 /* get a working reference to the output device */ 1377 /* get a working reference to the output device */
@@ -2135,7 +1383,7 @@ static int __mkroute_input(struct sk_buff *skb,
2135 1383
2136 1384
2137 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 1385 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2138 in_dev->dev, &spec_dst, &itag); 1386 in_dev->dev, in_dev, &itag);
2139 if (err < 0) { 1387 if (err < 0) {
2140 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1388 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2141 saddr); 1389 saddr);
@@ -2143,9 +1391,6 @@ static int __mkroute_input(struct sk_buff *skb,
2143 goto cleanup; 1391 goto cleanup;
2144 } 1392 }
2145 1393
2146 if (err)
2147 flags |= RTCF_DIRECTSRC;
2148
2149 if (out_dev == in_dev && err && 1394 if (out_dev == in_dev && err &&
2150 (IN_DEV_SHARED_MEDIA(out_dev) || 1395 (IN_DEV_SHARED_MEDIA(out_dev) ||
2151 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) 1396 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
@@ -2166,37 +1411,39 @@ static int __mkroute_input(struct sk_buff *skb,
2166 } 1411 }
2167 } 1412 }
2168 1413
1414 do_cache = false;
1415 if (res->fi) {
1416 if (!itag) {
1417 rth = FIB_RES_NH(*res).nh_rth_input;
1418 if (rt_cache_valid(rth)) {
1419 dst_hold(&rth->dst);
1420 goto out;
1421 }
1422 do_cache = true;
1423 }
1424 }
1425
2169 rth = rt_dst_alloc(out_dev->dev, 1426 rth = rt_dst_alloc(out_dev->dev,
2170 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1427 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2171 IN_DEV_CONF_GET(out_dev, NOXFRM)); 1428 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
2172 if (!rth) { 1429 if (!rth) {
2173 err = -ENOBUFS; 1430 err = -ENOBUFS;
2174 goto cleanup; 1431 goto cleanup;
2175 } 1432 }
2176 1433
2177 rth->rt_key_dst = daddr;
2178 rth->rt_key_src = saddr;
2179 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 1434 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2180 rth->rt_flags = flags; 1435 rth->rt_flags = flags;
2181 rth->rt_type = res->type; 1436 rth->rt_type = res->type;
2182 rth->rt_key_tos = tos; 1437 rth->rt_is_input = 1;
2183 rth->rt_dst = daddr; 1438 rth->rt_iif = 0;
2184 rth->rt_src = saddr; 1439 rth->rt_pmtu = 0;
2185 rth->rt_route_iif = in_dev->dev->ifindex; 1440 rth->rt_gateway = 0;
2186 rth->rt_iif = in_dev->dev->ifindex;
2187 rth->rt_oif = 0;
2188 rth->rt_mark = skb->mark;
2189 rth->rt_gateway = daddr;
2190 rth->rt_spec_dst= spec_dst;
2191 rth->rt_peer_genid = 0;
2192 rth->peer = NULL;
2193 rth->fi = NULL;
2194 1441
2195 rth->dst.input = ip_forward; 1442 rth->dst.input = ip_forward;
2196 rth->dst.output = ip_output; 1443 rth->dst.output = ip_output;
2197 1444
2198 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); 1445 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
2199 1446out:
2200 *result = rth; 1447 *result = rth;
2201 err = 0; 1448 err = 0;
2202 cleanup: 1449 cleanup:
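
In the hunk above __mkroute_input() first tries the per-nexthop nh_rth_input entry (rt_cache_valid() plus dst_hold()) and only allocates a new rtable when that misses; rt_set_nexthop() later publishes the new route through rt_cache_route()'s cmpxchg(). A simplified userspace sketch of that single-slot publish, with the RCU-deferred dst_release() reduced to a plain refcount decrement for the demo (names are invented):

/* Illustrative sketch, not part of the patch. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cached_route {
	int refcnt;            /* stands in for dst_hold()/dst_release() */
	unsigned int daddr;
};

static _Atomic(struct cached_route *) slot;  /* like nh->nh_rth_input */

static void cache_publish(struct cached_route *rt)
{
	struct cached_route *orig = atomic_load(&slot);

	/* Install only if nobody raced us; the loser keeps its own copy. */
	if (atomic_compare_exchange_strong(&slot, &orig, rt)) {
		rt->refcnt++;          /* the cache now holds a reference  */
		if (orig)
			orig->refcnt--;  /* kernel defers this via call_rcu_bh() */
	}
}

int main(void)
{
	struct cached_route *rt = calloc(1, sizeof(*rt)), *hit;

	rt->refcnt = 1;                /* caller's own reference */
	rt->daddr  = 0x0a000001;
	cache_publish(rt);

	hit = atomic_load(&slot);      /* later lookup: reuse the cached copy */
	if (hit) {
		hit->refcnt++;         /* counterpart of dst_hold() */
		printf("reused route to %#x, refcnt=%d\n", hit->daddr, hit->refcnt);
	}
	return 0;
}
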
@@ -2211,7 +1458,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
2211{ 1458{
2212 struct rtable *rth = NULL; 1459 struct rtable *rth = NULL;
2213 int err; 1460 int err;
2214 unsigned int hash;
2215 1461
2216#ifdef CONFIG_IP_ROUTE_MULTIPATH 1462#ifdef CONFIG_IP_ROUTE_MULTIPATH
2217 if (res->fi && res->fi->fib_nhs > 1) 1463 if (res->fi && res->fi->fib_nhs > 1)
@@ -2223,12 +1469,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2223 if (err) 1469 if (err)
2224 return err; 1470 return err;
2225 1471
2226 /* put it into the cache */ 1472 skb_dst_set(skb, &rth->dst);
2227 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2228 rt_genid(dev_net(rth->dst.dev)));
2229 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2230 if (IS_ERR(rth))
2231 return PTR_ERR(rth);
2232 return 0; 1473 return 0;
2233} 1474}
2234 1475
@@ -2252,10 +1493,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2252 unsigned int flags = 0; 1493 unsigned int flags = 0;
2253 u32 itag = 0; 1494 u32 itag = 0;
2254 struct rtable *rth; 1495 struct rtable *rth;
2255 unsigned int hash;
2256 __be32 spec_dst;
2257 int err = -EINVAL; 1496 int err = -EINVAL;
2258 struct net *net = dev_net(dev); 1497 struct net *net = dev_net(dev);
1498 bool do_cache;
2259 1499
2260 /* IP on this device is disabled. */ 1500 /* IP on this device is disabled. */
2261 1501
@@ -2266,10 +1506,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2266 by fib_lookup. 1506 by fib_lookup.
2267 */ 1507 */
2268 1508
2269 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1509 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2270 ipv4_is_loopback(saddr))
2271 goto martian_source; 1510 goto martian_source;
2272 1511
1512 res.fi = NULL;
2273 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 1513 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2274 goto brd_input; 1514 goto brd_input;
2275 1515
@@ -2279,9 +1519,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2279 if (ipv4_is_zeronet(saddr)) 1519 if (ipv4_is_zeronet(saddr))
2280 goto martian_source; 1520 goto martian_source;
2281 1521
2282 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) 1522 if (ipv4_is_zeronet(daddr))
2283 goto martian_destination; 1523 goto martian_destination;
2284 1524
1525 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1526 if (ipv4_is_loopback(daddr))
1527 goto martian_destination;
1528
1529 if (ipv4_is_loopback(saddr))
1530 goto martian_source;
1531 }
1532
2285 /* 1533 /*
2286 * Now we are ready to route packet. 1534 * Now we are ready to route packet.
2287 */ 1535 */
@@ -2293,11 +1541,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2293 fl4.daddr = daddr; 1541 fl4.daddr = daddr;
2294 fl4.saddr = saddr; 1542 fl4.saddr = saddr;
2295 err = fib_lookup(net, &fl4, &res); 1543 err = fib_lookup(net, &fl4, &res);
2296 if (err != 0) { 1544 if (err != 0)
2297 if (!IN_DEV_FORWARD(in_dev))
2298 goto e_hostunreach;
2299 goto no_route; 1545 goto no_route;
2300 }
2301 1546
2302 RT_CACHE_STAT_INC(in_slow_tot); 1547 RT_CACHE_STAT_INC(in_slow_tot);
2303 1548
@@ -2307,17 +1552,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2307 if (res.type == RTN_LOCAL) { 1552 if (res.type == RTN_LOCAL) {
2308 err = fib_validate_source(skb, saddr, daddr, tos, 1553 err = fib_validate_source(skb, saddr, daddr, tos,
2309 net->loopback_dev->ifindex, 1554 net->loopback_dev->ifindex,
2310 dev, &spec_dst, &itag); 1555 dev, in_dev, &itag);
2311 if (err < 0) 1556 if (err < 0)
2312 goto martian_source_keep_err; 1557 goto martian_source_keep_err;
2313 if (err)
2314 flags |= RTCF_DIRECTSRC;
2315 spec_dst = daddr;
2316 goto local_input; 1558 goto local_input;
2317 } 1559 }
2318 1560
2319 if (!IN_DEV_FORWARD(in_dev)) 1561 if (!IN_DEV_FORWARD(in_dev))
2320 goto e_hostunreach; 1562 goto no_route;
2321 if (res.type != RTN_UNICAST) 1563 if (res.type != RTN_UNICAST)
2322 goto martian_destination; 1564 goto martian_destination;
2323 1565
@@ -2328,23 +1570,31 @@ brd_input:
2328 if (skb->protocol != htons(ETH_P_IP)) 1570 if (skb->protocol != htons(ETH_P_IP))
2329 goto e_inval; 1571 goto e_inval;
2330 1572
2331 if (ipv4_is_zeronet(saddr)) 1573 if (!ipv4_is_zeronet(saddr)) {
2332 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1574 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2333 else { 1575 in_dev, &itag);
2334 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2335 &itag);
2336 if (err < 0) 1576 if (err < 0)
2337 goto martian_source_keep_err; 1577 goto martian_source_keep_err;
2338 if (err)
2339 flags |= RTCF_DIRECTSRC;
2340 } 1578 }
2341 flags |= RTCF_BROADCAST; 1579 flags |= RTCF_BROADCAST;
2342 res.type = RTN_BROADCAST; 1580 res.type = RTN_BROADCAST;
2343 RT_CACHE_STAT_INC(in_brd); 1581 RT_CACHE_STAT_INC(in_brd);
2344 1582
2345local_input: 1583local_input:
1584 do_cache = false;
1585 if (res.fi) {
1586 if (!itag) {
1587 rth = FIB_RES_NH(res).nh_rth_input;
1588 if (rt_cache_valid(rth)) {
1589 dst_hold(&rth->dst);
1590 goto set_and_out;
1591 }
1592 do_cache = true;
1593 }
1594 }
1595
2346 rth = rt_dst_alloc(net->loopback_dev, 1596 rth = rt_dst_alloc(net->loopback_dev,
2347 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1597 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2348 if (!rth) 1598 if (!rth)
2349 goto e_nobufs; 1599 goto e_nobufs;
2350 1600
@@ -2354,41 +1604,27 @@ local_input:
2354 rth->dst.tclassid = itag; 1604 rth->dst.tclassid = itag;
2355#endif 1605#endif
2356 1606
2357 rth->rt_key_dst = daddr;
2358 rth->rt_key_src = saddr;
2359 rth->rt_genid = rt_genid(net); 1607 rth->rt_genid = rt_genid(net);
2360 rth->rt_flags = flags|RTCF_LOCAL; 1608 rth->rt_flags = flags|RTCF_LOCAL;
2361 rth->rt_type = res.type; 1609 rth->rt_type = res.type;
2362 rth->rt_key_tos = tos; 1610 rth->rt_is_input = 1;
2363 rth->rt_dst = daddr; 1611 rth->rt_iif = 0;
2364 rth->rt_src = saddr; 1612 rth->rt_pmtu = 0;
2365#ifdef CONFIG_IP_ROUTE_CLASSID 1613 rth->rt_gateway = 0;
2366 rth->dst.tclassid = itag;
2367#endif
2368 rth->rt_route_iif = dev->ifindex;
2369 rth->rt_iif = dev->ifindex;
2370 rth->rt_oif = 0;
2371 rth->rt_mark = skb->mark;
2372 rth->rt_gateway = daddr;
2373 rth->rt_spec_dst= spec_dst;
2374 rth->rt_peer_genid = 0;
2375 rth->peer = NULL;
2376 rth->fi = NULL;
2377 if (res.type == RTN_UNREACHABLE) { 1614 if (res.type == RTN_UNREACHABLE) {
2378 rth->dst.input= ip_error; 1615 rth->dst.input= ip_error;
2379 rth->dst.error= -err; 1616 rth->dst.error= -err;
2380 rth->rt_flags &= ~RTCF_LOCAL; 1617 rth->rt_flags &= ~RTCF_LOCAL;
2381 } 1618 }
2382 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); 1619 if (do_cache)
2383 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); 1620 rt_cache_route(&FIB_RES_NH(res), rth);
1621set_and_out:
1622 skb_dst_set(skb, &rth->dst);
2384 err = 0; 1623 err = 0;
2385 if (IS_ERR(rth))
2386 err = PTR_ERR(rth);
2387 goto out; 1624 goto out;
2388 1625
2389no_route: 1626no_route:
2390 RT_CACHE_STAT_INC(in_no_route); 1627 RT_CACHE_STAT_INC(in_no_route);
2391 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2392 res.type = RTN_UNREACHABLE; 1628 res.type = RTN_UNREACHABLE;
2393 if (err == -ESRCH) 1629 if (err == -ESRCH)
2394 err = -ENETUNREACH; 1630 err = -ENETUNREACH;
@@ -2405,10 +1641,6 @@ martian_destination:
2405 &daddr, &saddr, dev->name); 1641 &daddr, &saddr, dev->name);
2406#endif 1642#endif
2407 1643
2408e_hostunreach:
2409 err = -EHOSTUNREACH;
2410 goto out;
2411
2412e_inval: 1644e_inval:
2413 err = -EINVAL; 1645 err = -EINVAL;
2414 goto out; 1646 goto out;
@@ -2424,50 +1656,13 @@ martian_source_keep_err:
2424 goto out; 1656 goto out;
2425} 1657}
2426 1658
2427int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1659int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2428 u8 tos, struct net_device *dev, bool noref) 1660 u8 tos, struct net_device *dev)
2429{ 1661{
2430 struct rtable *rth;
2431 unsigned int hash;
2432 int iif = dev->ifindex;
2433 struct net *net;
2434 int res; 1662 int res;
2435 1663
2436 net = dev_net(dev);
2437
2438 rcu_read_lock(); 1664 rcu_read_lock();
2439 1665
2440 if (!rt_caching(net))
2441 goto skip_cache;
2442
2443 tos &= IPTOS_RT_MASK;
2444 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2445
2446 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2447 rth = rcu_dereference(rth->dst.rt_next)) {
2448 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2449 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2450 (rth->rt_route_iif ^ iif) |
2451 (rth->rt_key_tos ^ tos)) == 0 &&
2452 rth->rt_mark == skb->mark &&
2453 net_eq(dev_net(rth->dst.dev), net) &&
2454 !rt_is_expired(rth)) {
2455 ipv4_validate_peer(rth);
2456 if (noref) {
2457 dst_use_noref(&rth->dst, jiffies);
2458 skb_dst_set_noref(skb, &rth->dst);
2459 } else {
2460 dst_use(&rth->dst, jiffies);
2461 skb_dst_set(skb, &rth->dst);
2462 }
2463 RT_CACHE_STAT_INC(in_hit);
2464 rcu_read_unlock();
2465 return 0;
2466 }
2467 RT_CACHE_STAT_INC(in_hlist_search);
2468 }
2469
2470skip_cache:
2471 /* Multicast recognition logic is moved from route cache to here. 1666 /* Multicast recognition logic is moved from route cache to here.
2472 The problem was that too many Ethernet cards have broken/missing 1667 The problem was that too many Ethernet cards have broken/missing
2473 hardware multicast filters :-( As result the host on multicasting 1668 hardware multicast filters :-( As result the host on multicasting
@@ -2505,24 +1700,28 @@ skip_cache:
2505 rcu_read_unlock(); 1700 rcu_read_unlock();
2506 return res; 1701 return res;
2507} 1702}
2508EXPORT_SYMBOL(ip_route_input_common); 1703EXPORT_SYMBOL(ip_route_input);
2509 1704
2510/* called with rcu_read_lock() */ 1705/* called with rcu_read_lock() */
2511static struct rtable *__mkroute_output(const struct fib_result *res, 1706static struct rtable *__mkroute_output(const struct fib_result *res,
2512 const struct flowi4 *fl4, 1707 const struct flowi4 *fl4, int orig_oif,
2513 __be32 orig_daddr, __be32 orig_saddr,
2514 int orig_oif, __u8 orig_rtos,
2515 struct net_device *dev_out, 1708 struct net_device *dev_out,
2516 unsigned int flags) 1709 unsigned int flags)
2517{ 1710{
2518 struct fib_info *fi = res->fi; 1711 struct fib_info *fi = res->fi;
1712 struct fib_nh_exception *fnhe;
2519 struct in_device *in_dev; 1713 struct in_device *in_dev;
2520 u16 type = res->type; 1714 u16 type = res->type;
2521 struct rtable *rth; 1715 struct rtable *rth;
2522 1716
2523 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) 1717 in_dev = __in_dev_get_rcu(dev_out);
1718 if (!in_dev)
2524 return ERR_PTR(-EINVAL); 1719 return ERR_PTR(-EINVAL);
2525 1720
1721 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1722 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1723 return ERR_PTR(-EINVAL);
1724
2526 if (ipv4_is_lbcast(fl4->daddr)) 1725 if (ipv4_is_lbcast(fl4->daddr))
2527 type = RTN_BROADCAST; 1726 type = RTN_BROADCAST;
2528 else if (ipv4_is_multicast(fl4->daddr)) 1727 else if (ipv4_is_multicast(fl4->daddr))
@@ -2533,10 +1732,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2533 if (dev_out->flags & IFF_LOOPBACK) 1732 if (dev_out->flags & IFF_LOOPBACK)
2534 flags |= RTCF_LOCAL; 1733 flags |= RTCF_LOCAL;
2535 1734
2536 in_dev = __in_dev_get_rcu(dev_out);
2537 if (!in_dev)
2538 return ERR_PTR(-EINVAL);
2539
2540 if (type == RTN_BROADCAST) { 1735 if (type == RTN_BROADCAST) {
2541 flags |= RTCF_BROADCAST | RTCF_LOCAL; 1736 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2542 fi = NULL; 1737 fi = NULL;
@@ -2553,40 +1748,39 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2553 fi = NULL; 1748 fi = NULL;
2554 } 1749 }
2555 1750
1751 fnhe = NULL;
1752 if (fi) {
1753 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1754 if (!fnhe) {
1755 rth = FIB_RES_NH(*res).nh_rth_output;
1756 if (rt_cache_valid(rth)) {
1757 dst_hold(&rth->dst);
1758 return rth;
1759 }
1760 }
1761 }
2556 rth = rt_dst_alloc(dev_out, 1762 rth = rt_dst_alloc(dev_out,
2557 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1763 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2558 IN_DEV_CONF_GET(in_dev, NOXFRM)); 1764 IN_DEV_CONF_GET(in_dev, NOXFRM),
1765 fi && !fnhe);
2559 if (!rth) 1766 if (!rth)
2560 return ERR_PTR(-ENOBUFS); 1767 return ERR_PTR(-ENOBUFS);
2561 1768
2562 rth->dst.output = ip_output; 1769 rth->dst.output = ip_output;
2563 1770
2564 rth->rt_key_dst = orig_daddr;
2565 rth->rt_key_src = orig_saddr;
2566 rth->rt_genid = rt_genid(dev_net(dev_out)); 1771 rth->rt_genid = rt_genid(dev_net(dev_out));
2567 rth->rt_flags = flags; 1772 rth->rt_flags = flags;
2568 rth->rt_type = type; 1773 rth->rt_type = type;
2569 rth->rt_key_tos = orig_rtos; 1774 rth->rt_is_input = 0;
2570 rth->rt_dst = fl4->daddr; 1775 rth->rt_iif = orig_oif ? : 0;
2571 rth->rt_src = fl4->saddr; 1776 rth->rt_pmtu = 0;
2572 rth->rt_route_iif = 0; 1777 rth->rt_gateway = 0;
2573 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2574 rth->rt_oif = orig_oif;
2575 rth->rt_mark = fl4->flowi4_mark;
2576 rth->rt_gateway = fl4->daddr;
2577 rth->rt_spec_dst= fl4->saddr;
2578 rth->rt_peer_genid = 0;
2579 rth->peer = NULL;
2580 rth->fi = NULL;
2581 1778
2582 RT_CACHE_STAT_INC(out_slow_tot); 1779 RT_CACHE_STAT_INC(out_slow_tot);
2583 1780
2584 if (flags & RTCF_LOCAL) { 1781 if (flags & RTCF_LOCAL)
2585 rth->dst.input = ip_local_deliver; 1782 rth->dst.input = ip_local_deliver;
2586 rth->rt_spec_dst = fl4->daddr;
2587 }
2588 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 1783 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2589 rth->rt_spec_dst = fl4->saddr;
2590 if (flags & RTCF_LOCAL && 1784 if (flags & RTCF_LOCAL &&
2591 !(dev_out->flags & IFF_LOOPBACK)) { 1785 !(dev_out->flags & IFF_LOOPBACK)) {
2592 rth->dst.output = ip_mc_output; 1786 rth->dst.output = ip_mc_output;
@@ -2603,34 +1797,28 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2603#endif 1797#endif
2604 } 1798 }
2605 1799
2606 rt_set_nexthop(rth, fl4, res, fi, type, 0); 1800 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2607 1801
2608 return rth; 1802 return rth;
2609} 1803}
2610 1804
2611/* 1805/*
2612 * Major route resolver routine. 1806 * Major route resolver routine.
2613 * called with rcu_read_lock();
2614 */ 1807 */
2615 1808
2616static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) 1809struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2617{ 1810{
2618 struct net_device *dev_out = NULL; 1811 struct net_device *dev_out = NULL;
2619 __u8 tos = RT_FL_TOS(fl4); 1812 __u8 tos = RT_FL_TOS(fl4);
2620 unsigned int flags = 0; 1813 unsigned int flags = 0;
2621 struct fib_result res; 1814 struct fib_result res;
2622 struct rtable *rth; 1815 struct rtable *rth;
2623 __be32 orig_daddr;
2624 __be32 orig_saddr;
2625 int orig_oif; 1816 int orig_oif;
2626 1817
1818 res.tclassid = 0;
2627 res.fi = NULL; 1819 res.fi = NULL;
2628#ifdef CONFIG_IP_MULTIPLE_TABLES 1820 res.table = NULL;
2629 res.r = NULL;
2630#endif
2631 1821
2632 orig_daddr = fl4->daddr;
2633 orig_saddr = fl4->saddr;
2634 orig_oif = fl4->flowi4_oif; 1822 orig_oif = fl4->flowi4_oif;
2635 1823
2636 fl4->flowi4_iif = net->loopback_dev->ifindex; 1824 fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -2730,6 +1918,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2730 1918
2731 if (fib_lookup(net, fl4, &res)) { 1919 if (fib_lookup(net, fl4, &res)) {
2732 res.fi = NULL; 1920 res.fi = NULL;
1921 res.table = NULL;
2733 if (fl4->flowi4_oif) { 1922 if (fl4->flowi4_oif) {
2734 /* Apparently, routing tables are wrong. Assume, 1923 /* Apparently, routing tables are wrong. Assume,
2735 that the destination is on link. 1924 that the destination is on link.
@@ -2791,60 +1980,12 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2791 1980
2792 1981
2793make_route: 1982make_route:
2794 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, 1983 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2795 tos, dev_out, flags);
2796 if (!IS_ERR(rth)) {
2797 unsigned int hash;
2798
2799 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2800 rt_genid(dev_net(dev_out)));
2801 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2802 }
2803 1984
2804out: 1985out:
2805 rcu_read_unlock(); 1986 rcu_read_unlock();
2806 return rth; 1987 return rth;
2807} 1988}
2808
2809struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2810{
2811 struct rtable *rth;
2812 unsigned int hash;
2813
2814 if (!rt_caching(net))
2815 goto slow_output;
2816
2817 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2818
2819 rcu_read_lock_bh();
2820 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2821 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2822 if (rth->rt_key_dst == flp4->daddr &&
2823 rth->rt_key_src == flp4->saddr &&
2824 rt_is_output_route(rth) &&
2825 rth->rt_oif == flp4->flowi4_oif &&
2826 rth->rt_mark == flp4->flowi4_mark &&
2827 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2828 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2829 net_eq(dev_net(rth->dst.dev), net) &&
2830 !rt_is_expired(rth)) {
2831 ipv4_validate_peer(rth);
2832 dst_use(&rth->dst, jiffies);
2833 RT_CACHE_STAT_INC(out_hit);
2834 rcu_read_unlock_bh();
2835 if (!flp4->saddr)
2836 flp4->saddr = rth->rt_src;
2837 if (!flp4->daddr)
2838 flp4->daddr = rth->rt_dst;
2839 return rth;
2840 }
2841 RT_CACHE_STAT_INC(out_hlist_search);
2842 }
2843 rcu_read_unlock_bh();
2844
2845slow_output:
2846 return ip_route_output_slow(net, flp4);
2847}
2848EXPORT_SYMBOL_GPL(__ip_route_output_key); 1989EXPORT_SYMBOL_GPL(__ip_route_output_key);
2849 1990
2850static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 1991static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -2859,7 +2000,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2859 return mtu ? : dst->dev->mtu; 2000 return mtu ? : dst->dev->mtu;
2860} 2001}
2861 2002
2862static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2003static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2004 struct sk_buff *skb, u32 mtu)
2005{
2006}
2007
2008static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2009 struct sk_buff *skb)
2863{ 2010{
2864} 2011}
2865 2012
@@ -2872,53 +2019,40 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2872static struct dst_ops ipv4_dst_blackhole_ops = { 2019static struct dst_ops ipv4_dst_blackhole_ops = {
2873 .family = AF_INET, 2020 .family = AF_INET,
2874 .protocol = cpu_to_be16(ETH_P_IP), 2021 .protocol = cpu_to_be16(ETH_P_IP),
2875 .destroy = ipv4_dst_destroy,
2876 .check = ipv4_blackhole_dst_check, 2022 .check = ipv4_blackhole_dst_check,
2877 .mtu = ipv4_blackhole_mtu, 2023 .mtu = ipv4_blackhole_mtu,
2878 .default_advmss = ipv4_default_advmss, 2024 .default_advmss = ipv4_default_advmss,
2879 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2025 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2026 .redirect = ipv4_rt_blackhole_redirect,
2880 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2027 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2881 .neigh_lookup = ipv4_neigh_lookup, 2028 .neigh_lookup = ipv4_neigh_lookup,
2882}; 2029};
2883 2030
2884struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2031struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2885{ 2032{
2886 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2887 struct rtable *ort = (struct rtable *) dst_orig; 2033 struct rtable *ort = (struct rtable *) dst_orig;
2034 struct rtable *rt;
2888 2035
2036 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2889 if (rt) { 2037 if (rt) {
2890 struct dst_entry *new = &rt->dst; 2038 struct dst_entry *new = &rt->dst;
2891 2039
2892 new->__use = 1; 2040 new->__use = 1;
2893 new->input = dst_discard; 2041 new->input = dst_discard;
2894 new->output = dst_discard; 2042 new->output = dst_discard;
2895 dst_copy_metrics(new, &ort->dst);
2896 2043
2897 new->dev = ort->dst.dev; 2044 new->dev = ort->dst.dev;
2898 if (new->dev) 2045 if (new->dev)
2899 dev_hold(new->dev); 2046 dev_hold(new->dev);
2900 2047
2901 rt->rt_key_dst = ort->rt_key_dst; 2048 rt->rt_is_input = ort->rt_is_input;
2902 rt->rt_key_src = ort->rt_key_src;
2903 rt->rt_key_tos = ort->rt_key_tos;
2904 rt->rt_route_iif = ort->rt_route_iif;
2905 rt->rt_iif = ort->rt_iif; 2049 rt->rt_iif = ort->rt_iif;
2906 rt->rt_oif = ort->rt_oif; 2050 rt->rt_pmtu = ort->rt_pmtu;
2907 rt->rt_mark = ort->rt_mark;
2908 2051
2909 rt->rt_genid = rt_genid(net); 2052 rt->rt_genid = rt_genid(net);
2910 rt->rt_flags = ort->rt_flags; 2053 rt->rt_flags = ort->rt_flags;
2911 rt->rt_type = ort->rt_type; 2054 rt->rt_type = ort->rt_type;
2912 rt->rt_dst = ort->rt_dst;
2913 rt->rt_src = ort->rt_src;
2914 rt->rt_gateway = ort->rt_gateway; 2055 rt->rt_gateway = ort->rt_gateway;
2915 rt->rt_spec_dst = ort->rt_spec_dst;
2916 rt->peer = ort->peer;
2917 if (rt->peer)
2918 atomic_inc(&rt->peer->refcnt);
2919 rt->fi = ort->fi;
2920 if (rt->fi)
2921 atomic_inc(&rt->fi->fib_clntref);
2922 2056
2923 dst_free(new); 2057 dst_free(new);
2924 } 2058 }
@@ -2945,16 +2079,16 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2945} 2079}
2946EXPORT_SYMBOL_GPL(ip_route_output_flow); 2080EXPORT_SYMBOL_GPL(ip_route_output_flow);
2947 2081
2948static int rt_fill_info(struct net *net, 2082static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2949 struct sk_buff *skb, u32 pid, u32 seq, int event, 2083 struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2950 int nowait, unsigned int flags) 2084 u32 seq, int event, int nowait, unsigned int flags)
2951{ 2085{
2952 struct rtable *rt = skb_rtable(skb); 2086 struct rtable *rt = skb_rtable(skb);
2953 struct rtmsg *r; 2087 struct rtmsg *r;
2954 struct nlmsghdr *nlh; 2088 struct nlmsghdr *nlh;
2955 unsigned long expires = 0; 2089 unsigned long expires = 0;
2956 const struct inet_peer *peer = rt->peer; 2090 u32 error;
2957 u32 id = 0, ts = 0, tsage = 0, error; 2091 u32 metrics[RTAX_MAX];
2958 2092
2959 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2093 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2960 if (nlh == NULL) 2094 if (nlh == NULL)
@@ -2964,7 +2098,7 @@ static int rt_fill_info(struct net *net,
2964 r->rtm_family = AF_INET; 2098 r->rtm_family = AF_INET;
2965 r->rtm_dst_len = 32; 2099 r->rtm_dst_len = 32;
2966 r->rtm_src_len = 0; 2100 r->rtm_src_len = 0;
2967 r->rtm_tos = rt->rt_key_tos; 2101 r->rtm_tos = fl4->flowi4_tos;
2968 r->rtm_table = RT_TABLE_MAIN; 2102 r->rtm_table = RT_TABLE_MAIN;
2969 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) 2103 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2970 goto nla_put_failure; 2104 goto nla_put_failure;
@@ -2975,11 +2109,11 @@ static int rt_fill_info(struct net *net,
2975 if (rt->rt_flags & RTCF_NOTIFY) 2109 if (rt->rt_flags & RTCF_NOTIFY)
2976 r->rtm_flags |= RTM_F_NOTIFY; 2110 r->rtm_flags |= RTM_F_NOTIFY;
2977 2111
2978 if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) 2112 if (nla_put_be32(skb, RTA_DST, dst))
2979 goto nla_put_failure; 2113 goto nla_put_failure;
2980 if (rt->rt_key_src) { 2114 if (src) {
2981 r->rtm_src_len = 32; 2115 r->rtm_src_len = 32;
2982 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) 2116 if (nla_put_be32(skb, RTA_SRC, src))
2983 goto nla_put_failure; 2117 goto nla_put_failure;
2984 } 2118 }
2985 if (rt->dst.dev && 2119 if (rt->dst.dev &&
@@ -2990,69 +2124,40 @@ static int rt_fill_info(struct net *net,
2990 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2124 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991 goto nla_put_failure; 2125 goto nla_put_failure;
2992#endif 2126#endif
2993 if (rt_is_input_route(rt)) { 2127 if (!rt_is_input_route(rt) &&
2994 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) 2128 fl4->saddr != src) {
2995 goto nla_put_failure; 2129 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2996 } else if (rt->rt_src != rt->rt_key_src) {
2997 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2998 goto nla_put_failure; 2130 goto nla_put_failure;
2999 } 2131 }
3000 if (rt->rt_dst != rt->rt_gateway && 2132 if (rt->rt_gateway &&
3001 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) 2133 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3002 goto nla_put_failure; 2134 goto nla_put_failure;
3003 2135
3004 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2136 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2137 if (rt->rt_pmtu)
2138 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2139 if (rtnetlink_put_metrics(skb, metrics) < 0)
3005 goto nla_put_failure; 2140 goto nla_put_failure;
3006 2141
3007 if (rt->rt_mark && 2142 if (fl4->flowi4_mark &&
3008 nla_put_be32(skb, RTA_MARK, rt->rt_mark)) 2143 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
3009 goto nla_put_failure; 2144 goto nla_put_failure;
3010 2145
3011 error = rt->dst.error; 2146 error = rt->dst.error;
3012 if (peer) { 2147 expires = rt->dst.expires;
3013 inet_peer_refcheck(rt->peer); 2148 if (expires) {
3014 id = atomic_read(&peer->ip_id_count) & 0xffff; 2149 if (time_before(jiffies, expires))
3015 if (peer->tcp_ts_stamp) { 2150 expires -= jiffies;
3016 ts = peer->tcp_ts; 2151 else
3017 tsage = get_seconds() - peer->tcp_ts_stamp; 2152 expires = 0;
3018 }
3019 expires = ACCESS_ONCE(peer->pmtu_expires);
3020 if (expires) {
3021 if (time_before(jiffies, expires))
3022 expires -= jiffies;
3023 else
3024 expires = 0;
3025 }
3026 } 2153 }
3027 2154
3028 if (rt_is_input_route(rt)) { 2155 if (rt_is_input_route(rt)) {
3029#ifdef CONFIG_IP_MROUTE 2156 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3030 __be32 dst = rt->rt_dst; 2157 goto nla_put_failure;
3031
3032 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3033 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3034 int err = ipmr_get_route(net, skb,
3035 rt->rt_src, rt->rt_dst,
3036 r, nowait);
3037 if (err <= 0) {
3038 if (!nowait) {
3039 if (err == 0)
3040 return 0;
3041 goto nla_put_failure;
3042 } else {
3043 if (err == -EMSGSIZE)
3044 goto nla_put_failure;
3045 error = err;
3046 }
3047 }
3048 } else
3049#endif
3050 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3051 goto nla_put_failure;
3052 } 2158 }
3053 2159
3054 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2160 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3055 expires, error) < 0)
3056 goto nla_put_failure; 2161 goto nla_put_failure;
3057 2162
3058 return nlmsg_end(skb, nlh); 2163 return nlmsg_end(skb, nlh);
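
rt_fill_info() above reports dst.expires to netlink as a remaining time, clamping values already in the past to zero, and relies on time_before() being safe across jiffies wraparound. A small sketch of the same idea for an unsigned 32-bit tick counter (helper names are invented):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <stdint.h>

static int ticks_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;   /* true if a is earlier than b */
}

static uint32_t remaining(uint32_t now, uint32_t expires)
{
	return ticks_before(now, expires) ? expires - now : 0;
}

int main(void)
{
	uint32_t now = 0xfffffff0u;    /* deliberately close to wraparound */

	printf("%u\n", remaining(now, now + 100));  /* 100, despite the wrap */
	printf("%u\n", remaining(now, now - 100));  /* 0, already expired    */
	return 0;
}
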
@@ -3068,6 +2173,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3068 struct rtmsg *rtm; 2173 struct rtmsg *rtm;
3069 struct nlattr *tb[RTA_MAX+1]; 2174 struct nlattr *tb[RTA_MAX+1];
3070 struct rtable *rt = NULL; 2175 struct rtable *rt = NULL;
2176 struct flowi4 fl4;
3071 __be32 dst = 0; 2177 __be32 dst = 0;
3072 __be32 src = 0; 2178 __be32 src = 0;
3073 u32 iif; 2179 u32 iif;
@@ -3102,6 +2208,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3102 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2208 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3103 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2209 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3104 2210
2211 memset(&fl4, 0, sizeof(fl4));
2212 fl4.daddr = dst;
2213 fl4.saddr = src;
2214 fl4.flowi4_tos = rtm->rtm_tos;
2215 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2216 fl4.flowi4_mark = mark;
2217
3105 if (iif) { 2218 if (iif) {
3106 struct net_device *dev; 2219 struct net_device *dev;
3107 2220
@@ -3122,13 +2235,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3122 if (err == 0 && rt->dst.error) 2235 if (err == 0 && rt->dst.error)
3123 err = -rt->dst.error; 2236 err = -rt->dst.error;
3124 } else { 2237 } else {
3125 struct flowi4 fl4 = {
3126 .daddr = dst,
3127 .saddr = src,
3128 .flowi4_tos = rtm->rtm_tos,
3129 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3130 .flowi4_mark = mark,
3131 };
3132 rt = ip_route_output_key(net, &fl4); 2238 rt = ip_route_output_key(net, &fl4);
3133 2239
3134 err = 0; 2240 err = 0;
@@ -3143,7 +2249,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3143 if (rtm->rtm_flags & RTM_F_NOTIFY) 2249 if (rtm->rtm_flags & RTM_F_NOTIFY)
3144 rt->rt_flags |= RTCF_NOTIFY; 2250 rt->rt_flags |= RTCF_NOTIFY;
3145 2251
3146 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2252 err = rt_fill_info(net, dst, src, &fl4, skb,
2253 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3147 RTM_NEWROUTE, 0, 0); 2254 RTM_NEWROUTE, 0, 0);
3148 if (err <= 0) 2255 if (err <= 0)
3149 goto errout_free; 2256 goto errout_free;
@@ -3159,43 +2266,6 @@ errout_free:
3159 2266
3160int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 2267int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3161{ 2268{
3162 struct rtable *rt;
3163 int h, s_h;
3164 int idx, s_idx;
3165 struct net *net;
3166
3167 net = sock_net(skb->sk);
3168
3169 s_h = cb->args[0];
3170 if (s_h < 0)
3171 s_h = 0;
3172 s_idx = idx = cb->args[1];
3173 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3174 if (!rt_hash_table[h].chain)
3175 continue;
3176 rcu_read_lock_bh();
3177 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3178 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3179 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3180 continue;
3181 if (rt_is_expired(rt))
3182 continue;
3183 skb_dst_set_noref(skb, &rt->dst);
3184 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3185 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3186 1, NLM_F_MULTI) <= 0) {
3187 skb_dst_drop(skb);
3188 rcu_read_unlock_bh();
3189 goto done;
3190 }
3191 skb_dst_drop(skb);
3192 }
3193 rcu_read_unlock_bh();
3194 }
3195
3196done:
3197 cb->args[0] = h;
3198 cb->args[1] = idx;
3199 return skb->len; 2269 return skb->len;
3200} 2270}
3201 2271
@@ -3400,26 +2470,34 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3400 .init = rt_genid_init, 2470 .init = rt_genid_init,
3401}; 2471};
3402 2472
2473static int __net_init ipv4_inetpeer_init(struct net *net)
2474{
2475 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3403 2476
3404#ifdef CONFIG_IP_ROUTE_CLASSID 2477 if (!bp)
3405struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 2478 return -ENOMEM;
3406#endif /* CONFIG_IP_ROUTE_CLASSID */ 2479 inet_peer_base_init(bp);
2480 net->ipv4.peers = bp;
2481 return 0;
2482}
3407 2483
3408static __initdata unsigned long rhash_entries; 2484static void __net_exit ipv4_inetpeer_exit(struct net *net)
3409static int __init set_rhash_entries(char *str)
3410{ 2485{
3411 ssize_t ret; 2486 struct inet_peer_base *bp = net->ipv4.peers;
3412 2487
3413 if (!str) 2488 net->ipv4.peers = NULL;
3414 return 0; 2489 inetpeer_invalidate_tree(bp);
2490 kfree(bp);
2491}
3415 2492
3416 ret = kstrtoul(str, 0, &rhash_entries); 2493static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3417 if (ret) 2494 .init = ipv4_inetpeer_init,
3418 return 0; 2495 .exit = ipv4_inetpeer_exit,
2496};
3419 2497
3420 return 1; 2498#ifdef CONFIG_IP_ROUTE_CLASSID
3421} 2499struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3422__setup("rhash_entries=", set_rhash_entries); 2500#endif /* CONFIG_IP_ROUTE_CLASSID */
3423 2501
3424int __init ip_rt_init(void) 2502int __init ip_rt_init(void)
3425{ 2503{
@@ -3443,31 +2521,12 @@ int __init ip_rt_init(void)
3443 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 2521 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3444 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 2522 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3445 2523
3446 rt_hash_table = (struct rt_hash_bucket *) 2524 ipv4_dst_ops.gc_thresh = ~0;
3447 alloc_large_system_hash("IP route cache", 2525 ip_rt_max_size = INT_MAX;
3448 sizeof(struct rt_hash_bucket),
3449 rhash_entries,
3450 (totalram_pages >= 128 * 1024) ?
3451 15 : 17,
3452 0,
3453 &rt_hash_log,
3454 &rt_hash_mask,
3455 0,
3456 rhash_entries ? 0 : 512 * 1024);
3457 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3458 rt_hash_lock_init();
3459
3460 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3461 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3462 2526
3463 devinet_init(); 2527 devinet_init();
3464 ip_fib_init(); 2528 ip_fib_init();
3465 2529
3466 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3467 expires_ljiffies = jiffies;
3468 schedule_delayed_work(&expires_work,
3469 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3470
3471 if (ip_rt_proc_init()) 2530 if (ip_rt_proc_init())
3472 pr_err("Unable to create route proc files\n"); 2531 pr_err("Unable to create route proc files\n");
3473#ifdef CONFIG_XFRM 2532#ifdef CONFIG_XFRM
@@ -3480,6 +2539,7 @@ int __init ip_rt_init(void)
3480 register_pernet_subsys(&sysctl_route_ops); 2539 register_pernet_subsys(&sysctl_route_ops);
3481#endif 2540#endif
3482 register_pernet_subsys(&rt_genid_ops); 2541 register_pernet_subsys(&rt_genid_ops);
2542 register_pernet_subsys(&ipv4_inetpeer_ops);
3483 return rc; 2543 return rc;
3484} 2544}
3485 2545
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7fb15d1..650e1528e1e6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
293 293
294 /* check for timestamp cookie support */ 294 /* check for timestamp cookie support */
295 memset(&tcp_opt, 0, sizeof(tcp_opt)); 295 memset(&tcp_opt, 0, sizeof(tcp_opt));
296 tcp_parse_options(skb, &tcp_opt, &hash_location, 0); 296 tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
297 297
298 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) 298 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
299 goto out; 299 goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ef32956ed655..5840c3255721 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -301,6 +301,13 @@ static struct ctl_table ipv4_table[] = {
301 .proc_handler = proc_dointvec 301 .proc_handler = proc_dointvec
302 }, 302 },
303 { 303 {
304 .procname = "ip_early_demux",
305 .data = &sysctl_ip_early_demux,
306 .maxlen = sizeof(int),
307 .mode = 0644,
308 .proc_handler = proc_dointvec
309 },
310 {
304 .procname = "ip_dynaddr", 311 .procname = "ip_dynaddr",
305 .data = &sysctl_ip_dynaddr, 312 .data = &sysctl_ip_dynaddr,
306 .maxlen = sizeof(int), 313 .maxlen = sizeof(int),
@@ -360,6 +367,13 @@ static struct ctl_table ipv4_table[] = {
360 }, 367 },
361#endif 368#endif
362 { 369 {
370 .procname = "tcp_fastopen",
371 .data = &sysctl_tcp_fastopen,
372 .maxlen = sizeof(int),
373 .mode = 0644,
374 .proc_handler = proc_dointvec,
375 },
376 {
363 .procname = "tcp_tw_recycle", 377 .procname = "tcp_tw_recycle",
364 .data = &tcp_death_row.sysctl_tw_recycle, 378 .data = &tcp_death_row.sysctl_tw_recycle,
365 .maxlen = sizeof(int), 379 .maxlen = sizeof(int),
@@ -591,6 +605,20 @@ static struct ctl_table ipv4_table[] = {
591 .mode = 0644, 605 .mode = 0644,
592 .proc_handler = proc_dointvec 606 .proc_handler = proc_dointvec
593 }, 607 },
608 {
609 .procname = "tcp_limit_output_bytes",
610 .data = &sysctl_tcp_limit_output_bytes,
611 .maxlen = sizeof(int),
612 .mode = 0644,
613 .proc_handler = proc_dointvec
614 },
615 {
616 .procname = "tcp_challenge_ack_limit",
617 .data = &sysctl_tcp_challenge_ack_limit,
618 .maxlen = sizeof(int),
619 .mode = 0644,
620 .proc_handler = proc_dointvec
621 },
594#ifdef CONFIG_NET_DMA 622#ifdef CONFIG_NET_DMA
595 { 623 {
596 .procname = "tcp_dma_copybreak", 624 .procname = "tcp_dma_copybreak",
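
The ipv4_table entries added above (ip_early_demux, tcp_fastopen, tcp_limit_output_bytes, tcp_challenge_ack_limit) appear as files under /proc/sys/net/ipv4/ once the patch is applied. A minimal reader, assuming a kernel that actually carries these knobs:

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

int main(void)
{
	const char *keys[] = { "ip_early_demux", "tcp_fastopen",
			       "tcp_limit_output_bytes",
			       "tcp_challenge_ack_limit" };
	char path[128], buf[64];
	unsigned int i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", keys[i]);
		f = fopen(path, "r");
		if (!f) {
			perror(path);   /* knob absent on unpatched kernels */
			continue;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("%s = %s", keys[i], buf);
		fclose(f);
	}
	return 0;
}
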
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f60e4e..581ecf02c6b5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
270#include <linux/slab.h> 270#include <linux/slab.h>
271 271
272#include <net/icmp.h> 272#include <net/icmp.h>
273#include <net/inet_common.h>
273#include <net/tcp.h> 274#include <net/tcp.h>
274#include <net/xfrm.h> 275#include <net/xfrm.h>
275#include <net/ip.h> 276#include <net/ip.h>
@@ -376,6 +377,7 @@ void tcp_init_sock(struct sock *sk)
376 skb_queue_head_init(&tp->out_of_order_queue); 377 skb_queue_head_init(&tp->out_of_order_queue);
377 tcp_init_xmit_timers(sk); 378 tcp_init_xmit_timers(sk);
378 tcp_prequeue_init(tp); 379 tcp_prequeue_init(tp);
380 INIT_LIST_HEAD(&tp->tsq_node);
379 381
380 icsk->icsk_rto = TCP_TIMEOUT_INIT; 382 icsk->icsk_rto = TCP_TIMEOUT_INIT;
381 tp->mdev = TCP_TIMEOUT_INIT; 383 tp->mdev = TCP_TIMEOUT_INIT;
@@ -796,6 +798,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
796 inet_csk(sk)->icsk_ext_hdr_len - 798 inet_csk(sk)->icsk_ext_hdr_len -
797 tp->tcp_header_len); 799 tp->tcp_header_len);
798 800
801 /* TSQ : try to have two TSO segments in flight */
802 xmit_size_goal = min_t(u32, xmit_size_goal,
803 sysctl_tcp_limit_output_bytes >> 1);
804
799 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); 805 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
800 806
801 /* We try hard to avoid divides here */ 807 /* We try hard to avoid divides here */
@@ -977,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg)
977 return tmp; 983 return tmp;
978} 984}
979 985
986void tcp_free_fastopen_req(struct tcp_sock *tp)
987{
988 if (tp->fastopen_req != NULL) {
989 kfree(tp->fastopen_req);
990 tp->fastopen_req = NULL;
991 }
992}
993
994static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
995{
996 struct tcp_sock *tp = tcp_sk(sk);
997 int err, flags;
998
999 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1000 return -EOPNOTSUPP;
1001 if (tp->fastopen_req != NULL)
1002 return -EALREADY; /* Another Fast Open is in progress */
1003
1004 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1005 sk->sk_allocation);
1006 if (unlikely(tp->fastopen_req == NULL))
1007 return -ENOBUFS;
1008 tp->fastopen_req->data = msg;
1009
1010 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1011 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1012 msg->msg_namelen, flags);
1013 *size = tp->fastopen_req->copied;
1014 tcp_free_fastopen_req(tp);
1015 return err;
1016}
1017
980int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 1018int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
981 size_t size) 1019 size_t size)
982{ 1020{
983 struct iovec *iov; 1021 struct iovec *iov;
984 struct tcp_sock *tp = tcp_sk(sk); 1022 struct tcp_sock *tp = tcp_sk(sk);
985 struct sk_buff *skb; 1023 struct sk_buff *skb;
986 int iovlen, flags, err, copied; 1024 int iovlen, flags, err, copied = 0;
987 int mss_now = 0, size_goal; 1025 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
988 bool sg; 1026 bool sg;
989 long timeo; 1027 long timeo;
990 1028
991 lock_sock(sk); 1029 lock_sock(sk);
992 1030
993 flags = msg->msg_flags; 1031 flags = msg->msg_flags;
1032 if (flags & MSG_FASTOPEN) {
1033 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1034 if (err == -EINPROGRESS && copied_syn > 0)
1035 goto out;
1036 else if (err)
1037 goto out_err;
1038 offset = copied_syn;
1039 }
1040
994 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1041 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
995 1042
996 /* Wait for a connection to finish. */ 1043 /* Wait for a connection to finish. */
997 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1044 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
998 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 1045 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
999 goto out_err; 1046 goto do_error;
1000 1047
1001 if (unlikely(tp->repair)) { 1048 if (unlikely(tp->repair)) {
1002 if (tp->repair_queue == TCP_RECV_QUEUE) { 1049 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1032,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1032 unsigned char __user *from = iov->iov_base; 1079 unsigned char __user *from = iov->iov_base;
1033 1080
1034 iov++; 1081 iov++;
1082 if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
1083 if (offset >= seglen) {
1084 offset -= seglen;
1085 continue;
1086 }
1087 seglen -= offset;
1088 from += offset;
1089 offset = 0;
1090 }
1035 1091
1036 while (seglen > 0) { 1092 while (seglen > 0) {
1037 int copy = 0; 1093 int copy = 0;
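
The MSG_FASTOPEN branch added to tcp_sendmsg() above lets a client hand data to an unconnected TCP socket and have it carried with the SYN, returning -EOPNOTSUPP unless TFO_CLIENT_ENABLE is set in sysctl_tcp_fastopen. A hypothetical client-side sketch, assuming a libc/kernel combination that exposes MSG_FASTOPEN; the server address is made up for the example:

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000   /* value from linux/socket.h */
#endif

int main(void)
{
	struct sockaddr_in srv = { .sin_family = AF_INET,
				   .sin_port = htons(80) };
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &srv.sin_addr);  /* example address */

	/* Data rides on the SYN when a Fast Open cookie is available;
	 * otherwise the kernel falls back to a normal handshake. */
	if (sendto(fd, req, strlen(req), MSG_FASTOPEN,
		   (struct sockaddr *)&srv, sizeof(srv)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}
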
@@ -1194,7 +1250,7 @@ out:
1194 if (copied && likely(!tp->repair)) 1250 if (copied && likely(!tp->repair))
1195 tcp_push(sk, flags, mss_now, tp->nonagle); 1251 tcp_push(sk, flags, mss_now, tp->nonagle);
1196 release_sock(sk); 1252 release_sock(sk);
1197 return copied; 1253 return copied + copied_syn;
1198 1254
1199do_fault: 1255do_fault:
1200 if (!skb->len) { 1256 if (!skb->len) {
@@ -1207,7 +1263,7 @@ do_fault:
1207 } 1263 }
1208 1264
1209do_error: 1265do_error:
1210 if (copied) 1266 if (copied + copied_syn)
1211 goto out; 1267 goto out;
1212out_err: 1268out_err:
1213 err = sk_stream_error(sk, flags, err); 1269 err = sk_stream_error(sk, flags, err);
@@ -3310,8 +3366,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
3310 3366
3311#endif 3367#endif
3312 3368
3313/** 3369/* Each Responder maintains up to two secret values concurrently for
3314 * Each Responder maintains up to two secret values concurrently for
3315 * efficient secret rollover. Each secret value has 4 states: 3370 * efficient secret rollover. Each secret value has 4 states:
3316 * 3371 *
3317 * Generating. (tcp_secret_generating != tcp_secret_primary) 3372 * Generating. (tcp_secret_generating != tcp_secret_primary)
@@ -3563,6 +3618,8 @@ void __init tcp_init(void)
3563 pr_info("Hash tables configured (established %u bind %u)\n", 3618 pr_info("Hash tables configured (established %u bind %u)\n",
3564 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3619 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3565 3620
3621 tcp_metrics_init();
3622
3566 tcp_register_congestion_control(&tcp_reno); 3623 tcp_register_congestion_control(&tcp_reno);
3567 3624
3568 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); 3625 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
@@ -3573,4 +3630,5 @@ void __init tcp_init(void)
3573 tcp_secret_primary = &tcp_secret_one; 3630 tcp_secret_primary = &tcp_secret_one;
3574 tcp_secret_retiring = &tcp_secret_two; 3631 tcp_secret_retiring = &tcp_secret_two;
3575 tcp_secret_secondary = &tcp_secret_two; 3632 tcp_secret_secondary = &tcp_secret_two;
3633 tcp_tasklet_init();
3576} 3634}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 04dbd7ae7c62..4d4db16e336e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
307void tcp_slow_start(struct tcp_sock *tp) 307void tcp_slow_start(struct tcp_sock *tp)
308{ 308{
309 int cnt; /* increase in packets */ 309 int cnt; /* increase in packets */
310 unsigned int delta = 0;
310 311
311 /* RFC3465: ABC Slow start 312 /* RFC3465: ABC Slow start
312 * Increase only after a full MSS of bytes is acked 313 * Increase only after a full MSS of bytes is acked
@@ -333,9 +334,9 @@ void tcp_slow_start(struct tcp_sock *tp)
333 tp->snd_cwnd_cnt += cnt; 334 tp->snd_cwnd_cnt += cnt;
334 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 335 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
335 tp->snd_cwnd_cnt -= tp->snd_cwnd; 336 tp->snd_cwnd_cnt -= tp->snd_cwnd;
336 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 337 delta++;
337 tp->snd_cwnd++;
338 } 338 }
339 tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
339} 340}
340EXPORT_SYMBOL_GPL(tcp_slow_start); 341EXPORT_SYMBOL_GPL(tcp_slow_start);
341 342
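
The tcp_slow_start() change above accumulates the would-be cwnd increments in delta and applies snd_cwnd_clamp once after the loop instead of bumping snd_cwnd inside it. A userspace rendering of the same arithmetic with invented starting values:

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

int main(void)
{
	unsigned int snd_cwnd = 10, snd_cwnd_clamp = 12, snd_cwnd_cnt = 45;
	unsigned int delta = 0;

	while (snd_cwnd_cnt >= snd_cwnd) {
		snd_cwnd_cnt -= snd_cwnd;
		delta++;               /* was: snd_cwnd++ if below the clamp */
	}
	snd_cwnd += delta;
	if (snd_cwnd > snd_cwnd_clamp)  /* min(snd_cwnd + delta, clamp) */
		snd_cwnd = snd_cwnd_clamp;

	printf("cwnd=%u cnt=%u\n", snd_cwnd, snd_cwnd_cnt);  /* cwnd=12 cnt=5 */
	return 0;
}
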
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 000000000000..a7f729c409d7
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,11 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3
4int sysctl_tcp_fastopen;
5
6static int __init tcp_fastopen_init(void)
7{
8 return 0;
9}
10
11late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8bce8b..3e07a64ca44e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -88,12 +88,14 @@ int sysctl_tcp_app_win __read_mostly = 31;
88int sysctl_tcp_adv_win_scale __read_mostly = 1; 88int sysctl_tcp_adv_win_scale __read_mostly = 1;
89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); 89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
90 90
91/* rfc5961 challenge ack rate limiting */
92int sysctl_tcp_challenge_ack_limit = 100;
93
91int sysctl_tcp_stdurg __read_mostly; 94int sysctl_tcp_stdurg __read_mostly;
92int sysctl_tcp_rfc1337 __read_mostly; 95int sysctl_tcp_rfc1337 __read_mostly;
93int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 96int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
94int sysctl_tcp_frto __read_mostly = 2; 97int sysctl_tcp_frto __read_mostly = 2;
95int sysctl_tcp_frto_response __read_mostly; 98int sysctl_tcp_frto_response __read_mostly;
96int sysctl_tcp_nometrics_save __read_mostly;
97 99
98int sysctl_tcp_thin_dupack __read_mostly; 100int sysctl_tcp_thin_dupack __read_mostly;
99 101
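
sysctl_tcp_challenge_ack_limit, declared above, caps how many RFC 5961 challenge ACKs are sent per second; the accounting itself is not part of this hunk. A speculative userspace sketch of a fixed one-second window limiter of that general shape (all names invented):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <time.h>

static int limit = 100;        /* counterpart of tcp_challenge_ack_limit */
static time_t window_start;
static int window_count;

static int allow_event(time_t now)
{
	if (now != window_start) {    /* new one-second window */
		window_start = now;
		window_count = 0;
	}
	return window_count++ < limit;
}

int main(void)
{
	int allowed = 0, i;
	time_t now = time(NULL);

	for (i = 0; i < 1000; i++)
		allowed += allow_event(now);
	printf("allowed %d of 1000 in one window\n", allowed);  /* prints 100 */
	return 0;
}
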
@@ -701,7 +703,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
701/* Calculate rto without backoff. This is the second half of Van Jacobson's 703/* Calculate rto without backoff. This is the second half of Van Jacobson's
702 * routine referred to above. 704 * routine referred to above.
703 */ 705 */
704static inline void tcp_set_rto(struct sock *sk) 706void tcp_set_rto(struct sock *sk)
705{ 707{
706 const struct tcp_sock *tp = tcp_sk(sk); 708 const struct tcp_sock *tp = tcp_sk(sk);
707 /* Old crap is replaced with new one. 8) 709 /* Old crap is replaced with new one. 8)
@@ -728,109 +730,6 @@ static inline void tcp_set_rto(struct sock *sk)
728 tcp_bound_rto(sk); 730 tcp_bound_rto(sk);
729} 731}
730 732
731/* Save metrics learned by this TCP session.
732 This function is called only, when TCP finishes successfully
733 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
734 */
735void tcp_update_metrics(struct sock *sk)
736{
737 struct tcp_sock *tp = tcp_sk(sk);
738 struct dst_entry *dst = __sk_dst_get(sk);
739
740 if (sysctl_tcp_nometrics_save)
741 return;
742
743 dst_confirm(dst);
744
745 if (dst && (dst->flags & DST_HOST)) {
746 const struct inet_connection_sock *icsk = inet_csk(sk);
747 int m;
748 unsigned long rtt;
749
750 if (icsk->icsk_backoff || !tp->srtt) {
751 /* This session failed to estimate rtt. Why?
752 * Probably, no packets returned in time.
753 * Reset our results.
754 */
755 if (!(dst_metric_locked(dst, RTAX_RTT)))
756 dst_metric_set(dst, RTAX_RTT, 0);
757 return;
758 }
759
760 rtt = dst_metric_rtt(dst, RTAX_RTT);
761 m = rtt - tp->srtt;
762
763 /* If newly calculated rtt larger than stored one,
764 * store new one. Otherwise, use EWMA. Remember,
765 * rtt overestimation is always better than underestimation.
766 */
767 if (!(dst_metric_locked(dst, RTAX_RTT))) {
768 if (m <= 0)
769 set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
770 else
771 set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
772 }
773
774 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
775 unsigned long var;
776 if (m < 0)
777 m = -m;
778
779 /* Scale deviation to rttvar fixed point */
780 m >>= 1;
781 if (m < tp->mdev)
782 m = tp->mdev;
783
784 var = dst_metric_rtt(dst, RTAX_RTTVAR);
785 if (m >= var)
786 var = m;
787 else
788 var -= (var - m) >> 2;
789
790 set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
791 }
792
793 if (tcp_in_initial_slowstart(tp)) {
794 /* Slow start still did not finish. */
795 if (dst_metric(dst, RTAX_SSTHRESH) &&
796 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
797 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
798 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
799 if (!dst_metric_locked(dst, RTAX_CWND) &&
800 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
801 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
802 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
803 icsk->icsk_ca_state == TCP_CA_Open) {
804 /* Cong. avoidance phase, cwnd is reliable. */
805 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
806 dst_metric_set(dst, RTAX_SSTHRESH,
807 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
808 if (!dst_metric_locked(dst, RTAX_CWND))
809 dst_metric_set(dst, RTAX_CWND,
810 (dst_metric(dst, RTAX_CWND) +
811 tp->snd_cwnd) >> 1);
812 } else {
813 /* Else slow start did not finish, cwnd is non-sense,
814 ssthresh may be also invalid.
815 */
816 if (!dst_metric_locked(dst, RTAX_CWND))
817 dst_metric_set(dst, RTAX_CWND,
818 (dst_metric(dst, RTAX_CWND) +
819 tp->snd_ssthresh) >> 1);
820 if (dst_metric(dst, RTAX_SSTHRESH) &&
821 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
822 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
823 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
824 }
825
826 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
827 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
828 tp->reordering != sysctl_tcp_reordering)
829 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
830 }
831 }
832}
833
834__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) 733__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
835{ 734{
836 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 735 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -867,7 +766,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
867 * Packet counting of FACK is based on in-order assumptions, therefore TCP 766 * Packet counting of FACK is based on in-order assumptions, therefore TCP
868 * disables it when reordering is detected 767 * disables it when reordering is detected
869 */ 768 */
870static void tcp_disable_fack(struct tcp_sock *tp) 769void tcp_disable_fack(struct tcp_sock *tp)
871{ 770{
872 /* RFC3517 uses different metric in lost marker => reset on change */ 771 /* RFC3517 uses different metric in lost marker => reset on change */
873 if (tcp_is_fack(tp)) 772 if (tcp_is_fack(tp))
@@ -881,86 +780,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
881 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 780 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
882} 781}
883 782
884/* Initialize metrics on socket. */
885
886static void tcp_init_metrics(struct sock *sk)
887{
888 struct tcp_sock *tp = tcp_sk(sk);
889 struct dst_entry *dst = __sk_dst_get(sk);
890
891 if (dst == NULL)
892 goto reset;
893
894 dst_confirm(dst);
895
896 if (dst_metric_locked(dst, RTAX_CWND))
897 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
898 if (dst_metric(dst, RTAX_SSTHRESH)) {
899 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
900 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
901 tp->snd_ssthresh = tp->snd_cwnd_clamp;
902 } else {
903 /* ssthresh may have been reduced unnecessarily during.
904 * 3WHS. Restore it back to its initial default.
905 */
906 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
907 }
908 if (dst_metric(dst, RTAX_REORDERING) &&
909 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
910 tcp_disable_fack(tp);
911 tcp_disable_early_retrans(tp);
912 tp->reordering = dst_metric(dst, RTAX_REORDERING);
913 }
914
915 if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
916 goto reset;
917
918 /* Initial rtt is determined from SYN,SYN-ACK.
919 * The segment is small and rtt may appear much
920 * less than real one. Use per-dst memory
921 * to make it more realistic.
922 *
923 * A bit of theory. RTT is time passed after "normal" sized packet
924 * is sent until it is ACKed. In normal circumstances sending small
925 * packets force peer to delay ACKs and calculation is correct too.
926 * The algorithm is adaptive and, provided we follow specs, it
927 * NEVER underestimate RTT. BUT! If peer tries to make some clever
928 * tricks sort of "quick acks" for time long enough to decrease RTT
929 * to low value, and then abruptly stops to do it and starts to delay
930 * ACKs, wait for troubles.
931 */
932 if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
933 tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
934 tp->rtt_seq = tp->snd_nxt;
935 }
936 if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
937 tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
938 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
939 }
940 tcp_set_rto(sk);
941reset:
942 if (tp->srtt == 0) {
943 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
944 * 3WHS. This is most likely due to retransmission,
945 * including spurious one. Reset the RTO back to 3secs
946 * from the more aggressive 1sec to avoid more spurious
947 * retransmission.
948 */
949 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
950 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
951 }
952 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
953 * retransmitted. In light of RFC6298 more aggressive 1sec
954 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
955 * retransmission has occurred.
956 */
957 if (tp->total_retrans > 1)
958 tp->snd_cwnd = 1;
959 else
960 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
961 tp->snd_cwnd_stamp = tcp_time_stamp;
962}
963
964static void tcp_update_reordering(struct sock *sk, const int metric, 783static void tcp_update_reordering(struct sock *sk, const int metric,
965 const int ts) 784 const int ts)
966{ 785{
@@ -2702,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag)
2702/* Nothing was retransmitted or returned timestamp is less 2521/* Nothing was retransmitted or returned timestamp is less
2703 * than timestamp of the first retransmission. 2522 * than timestamp of the first retransmission.
2704 */ 2523 */
2705static inline int tcp_packet_delayed(const struct tcp_sock *tp) 2524static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2706{ 2525{
2707 return !tp->retrans_stamp || 2526 return !tp->retrans_stamp ||
2708 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2527 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2763,7 +2582,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2763 tp->snd_cwnd_stamp = tcp_time_stamp; 2582 tp->snd_cwnd_stamp = tcp_time_stamp;
2764} 2583}
2765 2584
2766static inline int tcp_may_undo(const struct tcp_sock *tp) 2585static inline bool tcp_may_undo(const struct tcp_sock *tp)
2767{ 2586{
2768 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); 2587 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2769} 2588}
@@ -3552,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk)
3552 } 3371 }
3553} 3372}
3554 3373
3555static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3374static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3556{ 3375{
3557 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3376 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3558 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3377 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3559} 3378}
3560 3379
3561static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3380static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3562{ 3381{
3563 const struct tcp_sock *tp = tcp_sk(sk); 3382 const struct tcp_sock *tp = tcp_sk(sk);
3564 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3383 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
@@ -3568,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3568/* Check that window update is acceptable. 3387/* Check that window update is acceptable.
3569 * The function assumes that snd_una<=ack<=snd_next. 3388 * The function assumes that snd_una<=ack<=snd_next.
3570 */ 3389 */
3571static inline int tcp_may_update_window(const struct tcp_sock *tp, 3390static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3572 const u32 ack, const u32 ack_seq, 3391 const u32 ack, const u32 ack_seq,
3573 const u32 nwin) 3392 const u32 nwin)
3574{ 3393{
@@ -3869,9 +3688,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3869 tcp_cong_avoid(sk, ack, prior_in_flight); 3688 tcp_cong_avoid(sk, ack, prior_in_flight);
3870 } 3689 }
3871 3690
3872 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3691 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3873 dst_confirm(__sk_dst_get(sk)); 3692 struct dst_entry *dst = __sk_dst_get(sk);
3874 3693 if (dst)
3694 dst_confirm(dst);
3695 }
3875 return 1; 3696 return 1;
3876 3697
3877no_queue: 3698no_queue:
@@ -3911,7 +3732,8 @@ old_ack:
3911 * the fast version below fails. 3732 * the fast version below fails.
3912 */ 3733 */
3913void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, 3734void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
3914 const u8 **hvpp, int estab) 3735 const u8 **hvpp, int estab,
3736 struct tcp_fastopen_cookie *foc)
3915{ 3737{
3916 const unsigned char *ptr; 3738 const unsigned char *ptr;
3917 const struct tcphdr *th = tcp_hdr(skb); 3739 const struct tcphdr *th = tcp_hdr(skb);
@@ -4018,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
4018 break; 3840 break;
4019 } 3841 }
4020 break; 3842 break;
4021 }
4022 3843
3844 case TCPOPT_EXP:
3845 /* Fast Open option shares code 254 using a
                                 3846 * 16-bit magic number. It's valid only in
3847 * SYN or SYN-ACK with an even size.
3848 */
3849 if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
3850 get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
3851 foc == NULL || !th->syn || (opsize & 1))
3852 break;
3853 foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
3854 if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
3855 foc->len <= TCP_FASTOPEN_COOKIE_MAX)
3856 memcpy(foc->val, ptr + 2, foc->len);
3857 else if (foc->len != 0)
3858 foc->len = -1;
3859 break;
3860
3861 }
4023 ptr += opsize-2; 3862 ptr += opsize-2;
4024 length -= opsize; 3863 length -= opsize;
4025 } 3864 }
@@ -4061,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
4061 if (tcp_parse_aligned_timestamp(tp, th)) 3900 if (tcp_parse_aligned_timestamp(tp, th))
4062 return true; 3901 return true;
4063 } 3902 }
4064 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); 3903 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
4065 return true; 3904 return true;
4066} 3905}
4067 3906
@@ -4167,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
4167 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); 4006 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
4168} 4007}
4169 4008
4170static inline int tcp_paws_discard(const struct sock *sk, 4009static inline bool tcp_paws_discard(const struct sock *sk,
4171 const struct sk_buff *skb) 4010 const struct sk_buff *skb)
4172{ 4011{
4173 const struct tcp_sock *tp = tcp_sk(sk); 4012 const struct tcp_sock *tp = tcp_sk(sk);
@@ -4189,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk,
4189 * (borrowed from freebsd) 4028 * (borrowed from freebsd)
4190 */ 4029 */
4191 4030
4192static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) 4031static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4193{ 4032{
4194 return !before(end_seq, tp->rcv_wup) && 4033 return !before(end_seq, tp->rcv_wup) &&
4195 !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); 4034 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
@@ -4579,8 +4418,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4579 4418
4580 TCP_ECN_check_ce(tp, skb); 4419 TCP_ECN_check_ce(tp, skb);
4581 4420
4582 if (tcp_try_rmem_schedule(sk, skb->truesize)) { 4421 if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
4583 /* TODO: should increment a counter */ 4422 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4584 __kfree_skb(skb); 4423 __kfree_skb(skb);
4585 return; 4424 return;
4586 } 4425 }
@@ -4589,6 +4428,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4589 tp->pred_flags = 0; 4428 tp->pred_flags = 0;
4590 inet_csk_schedule_ack(sk); 4429 inet_csk_schedule_ack(sk);
4591 4430
4431 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4592 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", 4432 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4593 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 4433 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4594 4434
@@ -4642,6 +4482,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4642 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { 4482 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4643 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { 4483 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4644 /* All the bits are present. Drop. */ 4484 /* All the bits are present. Drop. */
4485 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4645 __kfree_skb(skb); 4486 __kfree_skb(skb);
4646 skb = NULL; 4487 skb = NULL;
4647 tcp_dsack_set(sk, seq, end_seq); 4488 tcp_dsack_set(sk, seq, end_seq);
@@ -4680,6 +4521,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4680 __skb_unlink(skb1, &tp->out_of_order_queue); 4521 __skb_unlink(skb1, &tp->out_of_order_queue);
4681 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, 4522 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4682 TCP_SKB_CB(skb1)->end_seq); 4523 TCP_SKB_CB(skb1)->end_seq);
4524 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4683 __kfree_skb(skb1); 4525 __kfree_skb(skb1);
4684 } 4526 }
4685 4527
@@ -5372,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,
5372 return result; 5214 return result;
5373} 5215}
5374 5216
5375static inline int tcp_checksum_complete_user(struct sock *sk, 5217static inline bool tcp_checksum_complete_user(struct sock *sk,
5376 struct sk_buff *skb) 5218 struct sk_buff *skb)
5377{ 5219{
5378 return !skb_csum_unnecessary(skb) && 5220 return !skb_csum_unnecessary(skb) &&
@@ -5426,11 +5268,28 @@ out:
5426} 5268}
5427#endif /* CONFIG_NET_DMA */ 5269#endif /* CONFIG_NET_DMA */
5428 5270
5271static void tcp_send_challenge_ack(struct sock *sk)
5272{
                               5273 /* unprotected vars, we don't care about overwrites */
5274 static u32 challenge_timestamp;
5275 static unsigned int challenge_count;
5276 u32 now = jiffies / HZ;
5277
5278 if (now != challenge_timestamp) {
5279 challenge_timestamp = now;
5280 challenge_count = 0;
5281 }
5282 if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
5283 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
5284 tcp_send_ack(sk);
5285 }
5286}
5287
5429/* Does PAWS and seqno based validation of an incoming segment, flags will 5288/* Does PAWS and seqno based validation of an incoming segment, flags will
5430 * play significant role here. 5289 * play significant role here.
5431 */ 5290 */
5432static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 5291static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5433 const struct tcphdr *th, int syn_inerr) 5292 const struct tcphdr *th, int syn_inerr)
5434{ 5293{
5435 const u8 *hash_location; 5294 const u8 *hash_location;
5436 struct tcp_sock *tp = tcp_sk(sk); 5295 struct tcp_sock *tp = tcp_sk(sk);
@@ -5455,14 +5314,26 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5455 * an acknowledgment should be sent in reply (unless the RST 5314 * an acknowledgment should be sent in reply (unless the RST
5456 * bit is set, if so drop the segment and return)". 5315 * bit is set, if so drop the segment and return)".
5457 */ 5316 */
5458 if (!th->rst) 5317 if (!th->rst) {
5318 if (th->syn)
5319 goto syn_challenge;
5459 tcp_send_dupack(sk, skb); 5320 tcp_send_dupack(sk, skb);
5321 }
5460 goto discard; 5322 goto discard;
5461 } 5323 }
5462 5324
5463 /* Step 2: check RST bit */ 5325 /* Step 2: check RST bit */
5464 if (th->rst) { 5326 if (th->rst) {
5465 tcp_reset(sk); 5327 /* RFC 5961 3.2 :
5328 * If sequence number exactly matches RCV.NXT, then
5329 * RESET the connection
5330 * else
5331 * Send a challenge ACK
5332 */
5333 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5334 tcp_reset(sk);
5335 else
5336 tcp_send_challenge_ack(sk);
5466 goto discard; 5337 goto discard;
5467 } 5338 }
5468 5339
@@ -5473,20 +5344,23 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5473 5344
5474 /* step 3: check security and precedence [ignored] */ 5345 /* step 3: check security and precedence [ignored] */
5475 5346
5476 /* step 4: Check for a SYN in window. */ 5347 /* step 4: Check for a SYN
 5477 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {                    5348 * RFC 5961 4.2 : Send a challenge ack
5349 */
5350 if (th->syn) {
5351syn_challenge:
5478 if (syn_inerr) 5352 if (syn_inerr)
5479 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5353 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); 5354 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5481 tcp_reset(sk); 5355 tcp_send_challenge_ack(sk);
5482 return -1; 5356 goto discard;
5483 } 5357 }
5484 5358
5485 return 1; 5359 return true;
5486 5360
5487discard: 5361discard:
5488 __kfree_skb(skb); 5362 __kfree_skb(skb);
5489 return 0; 5363 return false;
5490} 5364}
5491 5365
5492/* 5366/*
@@ -5516,7 +5390,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5516 const struct tcphdr *th, unsigned int len) 5390 const struct tcphdr *th, unsigned int len)
5517{ 5391{
5518 struct tcp_sock *tp = tcp_sk(sk); 5392 struct tcp_sock *tp = tcp_sk(sk);
5519 int res;
5520 5393
5521 /* 5394 /*
5522 * Header prediction. 5395 * Header prediction.
@@ -5693,9 +5566,8 @@ slow_path:
5693 * Standard slow path. 5566 * Standard slow path.
5694 */ 5567 */
5695 5568
5696 res = tcp_validate_incoming(sk, skb, th, 1); 5569 if (!tcp_validate_incoming(sk, skb, th, 1))
5697 if (res <= 0) 5570 return 0;
5698 return -res;
5699 5571
5700step5: 5572step5:
5701 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) 5573 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
@@ -5729,8 +5601,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5729 5601
5730 tcp_set_state(sk, TCP_ESTABLISHED); 5602 tcp_set_state(sk, TCP_ESTABLISHED);
5731 5603
5732 if (skb != NULL) 5604 if (skb != NULL) {
5605 sk->sk_rx_dst = dst_clone(skb_dst(skb));
5733 security_inet_conn_established(sk, skb); 5606 security_inet_conn_established(sk, skb);
5607 }
5734 5608
5735 /* Make sure socket is routed, for correct metrics. */ 5609 /* Make sure socket is routed, for correct metrics. */
5736 icsk->icsk_af_ops->rebuild_header(sk); 5610 icsk->icsk_af_ops->rebuild_header(sk);
@@ -5760,6 +5634,45 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5760 } 5634 }
5761} 5635}
5762 5636
5637static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5638 struct tcp_fastopen_cookie *cookie)
5639{
5640 struct tcp_sock *tp = tcp_sk(sk);
5641 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
5642 u16 mss = tp->rx_opt.mss_clamp;
5643 bool syn_drop;
5644
5645 if (mss == tp->rx_opt.user_mss) {
5646 struct tcp_options_received opt;
5647 const u8 *hash_location;
5648
5649 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
5650 tcp_clear_options(&opt);
5651 opt.user_mss = opt.mss_clamp = 0;
5652 tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
5653 mss = opt.mss_clamp;
5654 }
5655
5656 if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
5657 cookie->len = -1;
5658
5659 /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
5660 * the remote receives only the retransmitted (regular) SYNs: either
5661 * the original SYN-data or the corresponding SYN-ACK is lost.
5662 */
5663 syn_drop = (cookie->len <= 0 && data &&
5664 inet_csk(sk)->icsk_retransmits);
5665
5666 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
5667
5668 if (data) { /* Retransmit unacked data in SYN */
5669 tcp_retransmit_skb(sk, data);
5670 tcp_rearm_rto(sk);
5671 return true;
5672 }
5673 return false;
5674}
5675
5763static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5676static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5764 const struct tcphdr *th, unsigned int len) 5677 const struct tcphdr *th, unsigned int len)
5765{ 5678{
@@ -5767,9 +5680,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5767 struct inet_connection_sock *icsk = inet_csk(sk); 5680 struct inet_connection_sock *icsk = inet_csk(sk);
5768 struct tcp_sock *tp = tcp_sk(sk); 5681 struct tcp_sock *tp = tcp_sk(sk);
5769 struct tcp_cookie_values *cvp = tp->cookie_values; 5682 struct tcp_cookie_values *cvp = tp->cookie_values;
5683 struct tcp_fastopen_cookie foc = { .len = -1 };
5770 int saved_clamp = tp->rx_opt.mss_clamp; 5684 int saved_clamp = tp->rx_opt.mss_clamp;
5771 5685
5772 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); 5686 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
5773 5687
5774 if (th->ack) { 5688 if (th->ack) {
5775 /* rfc793: 5689 /* rfc793:
@@ -5779,11 +5693,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5779 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 5693 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
5780 * a reset (unless the RST bit is set, if so drop 5694 * a reset (unless the RST bit is set, if so drop
5781 * the segment and return)" 5695 * the segment and return)"
5782 *
5783 * We do not send data with SYN, so that RFC-correct
5784 * test reduces to:
5785 */ 5696 */
5786 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) 5697 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5698 after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
5787 goto reset_and_undo; 5699 goto reset_and_undo;
5788 5700
5789 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 5701 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5895,6 +5807,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5895 5807
5896 tcp_finish_connect(sk, skb); 5808 tcp_finish_connect(sk, skb);
5897 5809
5810 if ((tp->syn_fastopen || tp->syn_data) &&
5811 tcp_rcv_fastopen_synack(sk, skb, &foc))
5812 return -1;
5813
5898 if (sk->sk_write_pending || 5814 if (sk->sk_write_pending ||
5899 icsk->icsk_accept_queue.rskq_defer_accept || 5815 icsk->icsk_accept_queue.rskq_defer_accept ||
5900 icsk->icsk_ack.pingpong) { 5816 icsk->icsk_ack.pingpong) {
@@ -6013,7 +5929,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6013 struct tcp_sock *tp = tcp_sk(sk); 5929 struct tcp_sock *tp = tcp_sk(sk);
6014 struct inet_connection_sock *icsk = inet_csk(sk); 5930 struct inet_connection_sock *icsk = inet_csk(sk);
6015 int queued = 0; 5931 int queued = 0;
6016 int res;
6017 5932
6018 tp->rx_opt.saw_tstamp = 0; 5933 tp->rx_opt.saw_tstamp = 0;
6019 5934
@@ -6068,9 +5983,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6068 return 0; 5983 return 0;
6069 } 5984 }
6070 5985
6071 res = tcp_validate_incoming(sk, skb, th, 0); 5986 if (!tcp_validate_incoming(sk, skb, th, 0))
6072 if (res <= 0) 5987 return 0;
6073 return -res;
6074 5988
6075 /* step 5: check the ACK field */ 5989 /* step 5: check the ACK field */
6076 if (th->ack) { 5990 if (th->ack) {
@@ -6126,9 +6040,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6126 6040
6127 case TCP_FIN_WAIT1: 6041 case TCP_FIN_WAIT1:
6128 if (tp->snd_una == tp->write_seq) { 6042 if (tp->snd_una == tp->write_seq) {
6043 struct dst_entry *dst;
6044
6129 tcp_set_state(sk, TCP_FIN_WAIT2); 6045 tcp_set_state(sk, TCP_FIN_WAIT2);
6130 sk->sk_shutdown |= SEND_SHUTDOWN; 6046 sk->sk_shutdown |= SEND_SHUTDOWN;
6131 dst_confirm(__sk_dst_get(sk)); 6047
6048 dst = __sk_dst_get(sk);
6049 if (dst)
6050 dst_confirm(dst);
6132 6051
6133 if (!sock_flag(sk, SOCK_DEAD)) 6052 if (!sock_flag(sk, SOCK_DEAD))
6134 /* Wake up lingering close() */ 6053 /* Wake up lingering close() */
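Most of the tcp_input.c churn above is RFC 5961 hardening: an RST whose sequence number does not exactly match rcv_nxt, or a SYN seen on an already-synchronized connection, now draws a challenge ACK instead of tearing the connection down, and tcp_send_challenge_ack() caps how many such ACKs go out per second via sysctl_tcp_challenge_ack_limit (default 100). A minimal standalone sketch of that limiter, with time() standing in for jiffies/HZ:

    /* Per-second challenge ACK rate limiter, mirroring the hunk above.
     * Userspace sketch; nothing here is kernel API. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    #define CHALLENGE_ACK_LIMIT 100  /* mirrors sysctl_tcp_challenge_ack_limit */

    static bool challenge_ack_allowed(void)
    {
            static time_t window;
            static unsigned int count;
            time_t now = time(NULL);

            if (now != window) {     /* new one-second window: reset counter */
                    window = now;
                    count = 0;
            }
            return ++count <= CHALLENGE_ACK_LIMIT;
    }

    int main(void)
    {
            unsigned int sent = 0;

            for (int i = 0; i < 1000; i++)
                    if (challenge_ack_allowed())
                            sent++;  /* the kernel would call tcp_send_ack() here */
            printf("challenge ACKs allowed this second: %u\n", sent);
            return 0;
    }

The unprotected statics mirror the kernel's choice: an occasional racy update only makes the limit slightly imprecise, which is acceptable for a defensive cap.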
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c8d28c433b2b..3e30548ac32a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
209 } 209 }
210 210
211 if (tcp_death_row.sysctl_tw_recycle && 211 if (tcp_death_row.sysctl_tw_recycle &&
212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { 212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); 213 tcp_fetch_timewait_stamp(sk, &rt->dst);
214 /*
215 * VJ's idea. We save last timestamp seen from
216 * the destination in peer table, when entering state
217 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
218 * when trying new connection.
219 */
220 if (peer) {
221 inet_peer_refcheck(peer);
222 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
223 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
224 tp->rx_opt.ts_recent = peer->tcp_ts;
225 }
226 }
227 }
228 214
229 inet->inet_dport = usin->sin_port; 215 inet->inet_dport = usin->sin_port;
230 inet->inet_daddr = daddr; 216 inet->inet_daddr = daddr;
@@ -289,12 +275,15 @@ failure:
289EXPORT_SYMBOL(tcp_v4_connect); 275EXPORT_SYMBOL(tcp_v4_connect);
290 276
291/* 277/*
292 * This routine does path mtu discovery as defined in RFC1191. 278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 279 * It can be called through tcp_release_cb() if the socket was owned by the user
 280 * at the time tcp_v4_err() was called to handle the ICMP message.
293 */ 281 */
294static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) 282static void tcp_v4_mtu_reduced(struct sock *sk)
295{ 283{
296 struct dst_entry *dst; 284 struct dst_entry *dst;
297 struct inet_sock *inet = inet_sk(sk); 285 struct inet_sock *inet = inet_sk(sk);
286 u32 mtu = tcp_sk(sk)->mtu_info;
298 287
299 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 288 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300 * send out by Linux are always <576bytes so they should go through 289 * send out by Linux are always <576bytes so they should go through
@@ -303,17 +292,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
303 if (sk->sk_state == TCP_LISTEN) 292 if (sk->sk_state == TCP_LISTEN)
304 return; 293 return;
305 294
306 /* We don't check in the destentry if pmtu discovery is forbidden 295 dst = inet_csk_update_pmtu(sk, mtu);
307 * on this route. We just assume that no packet_to_big packets 296 if (!dst)
308 * are send back when pmtu discovery is not active.
309 * There is a small race when the user changes this flag in the
310 * route, but I think that's acceptable.
311 */
312 if ((dst = __sk_dst_check(sk, 0)) == NULL)
313 return; 297 return;
314 298
315 dst->ops->update_pmtu(dst, mtu);
316
317 /* Something is about to be wrong... Remember soft error 299 /* Something is about to be wrong... Remember soft error
318 * for the case, if this connection will not able to recover. 300 * for the case, if this connection will not able to recover.
319 */ 301 */
@@ -335,6 +317,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
335 } /* else let the usual retransmit timer handle it */ 317 } /* else let the usual retransmit timer handle it */
336} 318}
337 319
320static void do_redirect(struct sk_buff *skb, struct sock *sk)
321{
322 struct dst_entry *dst = __sk_dst_check(sk, 0);
323
324 if (dst)
325 dst->ops->redirect(dst, sk, skb);
326}
327
338/* 328/*
339 * This routine is called by the ICMP module when it gets some 329 * This routine is called by the ICMP module when it gets some
340 * sort of error condition. If err < 0 then the socket should 330 * sort of error condition. If err < 0 then the socket should
@@ -386,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
386 bh_lock_sock(sk); 376 bh_lock_sock(sk);
387 /* If too many ICMPs get dropped on busy 377 /* If too many ICMPs get dropped on busy
388 * servers this needs to be solved differently. 378 * servers this needs to be solved differently.
 379 * We do take care of the PMTU discovery (RFC1191) special case :
 380 * we can receive locally generated ICMP messages while the socket is held.
389 */ 381 */
390 if (sock_owned_by_user(sk)) 382 if (sock_owned_by_user(sk) &&
383 type != ICMP_DEST_UNREACH &&
384 code != ICMP_FRAG_NEEDED)
391 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); 385 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
392 386
393 if (sk->sk_state == TCP_CLOSE) 387 if (sk->sk_state == TCP_CLOSE)
@@ -408,6 +402,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
408 } 402 }
409 403
410 switch (type) { 404 switch (type) {
405 case ICMP_REDIRECT:
406 do_redirect(icmp_skb, sk);
407 goto out;
411 case ICMP_SOURCE_QUENCH: 408 case ICMP_SOURCE_QUENCH:
412 /* Just silently ignore these. */ 409 /* Just silently ignore these. */
413 goto out; 410 goto out;
@@ -419,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
419 goto out; 416 goto out;
420 417
421 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 418 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
419 tp->mtu_info = info;
422 if (!sock_owned_by_user(sk)) 420 if (!sock_owned_by_user(sk))
423 do_pmtu_discovery(sk, iph, info); 421 tcp_v4_mtu_reduced(sk);
422 else
423 set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
424 goto out; 424 goto out;
425 } 425 }
426 426
@@ -698,8 +698,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
698 698
699 net = dev_net(skb_dst(skb)->dev); 699 net = dev_net(skb_dst(skb)->dev);
700 arg.tos = ip_hdr(skb)->tos; 700 arg.tos = ip_hdr(skb)->tos;
701 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 701 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
702 &arg, arg.iov[0].iov_len); 702 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
703 703
704 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 704 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
705 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); 705 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -781,8 +781,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
781 if (oif) 781 if (oif)
782 arg.bound_dev_if = oif; 782 arg.bound_dev_if = oif;
783 arg.tos = tos; 783 arg.tos = tos;
784 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 784 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
785 &arg, arg.iov[0].iov_len); 785 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
786 786
787 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 787 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
788} 788}
@@ -825,7 +825,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
825static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 825static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
826 struct request_sock *req, 826 struct request_sock *req,
827 struct request_values *rvp, 827 struct request_values *rvp,
828 u16 queue_mapping) 828 u16 queue_mapping,
829 bool nocache)
829{ 830{
830 const struct inet_request_sock *ireq = inet_rsk(req); 831 const struct inet_request_sock *ireq = inet_rsk(req);
831 struct flowi4 fl4; 832 struct flowi4 fl4;
@@ -848,7 +849,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
848 err = net_xmit_eval(err); 849 err = net_xmit_eval(err);
849 } 850 }
850 851
851 dst_release(dst);
852 return err; 852 return err;
853} 853}
854 854
@@ -856,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
856 struct request_values *rvp) 856 struct request_values *rvp)
857{ 857{
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
859 return tcp_v4_send_synack(sk, NULL, req, rvp, 0); 859 return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
860} 860}
861 861
862/* 862/*
@@ -1317,7 +1317,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1317 tcp_clear_options(&tmp_opt); 1317 tcp_clear_options(&tmp_opt);
1318 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1318 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1319 tmp_opt.user_mss = tp->rx_opt.user_mss; 1319 tmp_opt.user_mss = tp->rx_opt.user_mss;
1320 tcp_parse_options(skb, &tmp_opt, &hash_location, 0); 1320 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
1321 1321
1322 if (tmp_opt.cookie_plus > 0 && 1322 if (tmp_opt.cookie_plus > 0 &&
1323 tmp_opt.saw_tstamp && 1323 tmp_opt.saw_tstamp &&
@@ -1375,7 +1375,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376 req->cookie_ts = tmp_opt.tstamp_ok; 1376 req->cookie_ts = tmp_opt.tstamp_ok;
1377 } else if (!isn) { 1377 } else if (!isn) {
1378 struct inet_peer *peer = NULL;
1379 struct flowi4 fl4; 1378 struct flowi4 fl4;
1380 1379
1381 /* VJ's idea. We save last timestamp seen 1380 /* VJ's idea. We save last timestamp seen
@@ -1390,12 +1389,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1390 if (tmp_opt.saw_tstamp && 1389 if (tmp_opt.saw_tstamp &&
1391 tcp_death_row.sysctl_tw_recycle && 1390 tcp_death_row.sysctl_tw_recycle &&
1392 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && 1391 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1393 fl4.daddr == saddr && 1392 fl4.daddr == saddr) {
1394 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { 1393 if (!tcp_peer_is_proven(req, dst, true)) {
1395 inet_peer_refcheck(peer);
1396 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1397 (s32)(peer->tcp_ts - req->ts_recent) >
1398 TCP_PAWS_WINDOW) {
1399 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1394 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1400 goto drop_and_release; 1395 goto drop_and_release;
1401 } 1396 }
@@ -1404,8 +1399,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1404 else if (!sysctl_tcp_syncookies && 1399 else if (!sysctl_tcp_syncookies &&
1405 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1400 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1406 (sysctl_max_syn_backlog >> 2)) && 1401 (sysctl_max_syn_backlog >> 2)) &&
1407 (!peer || !peer->tcp_ts_stamp) && 1402 !tcp_peer_is_proven(req, dst, false)) {
1408 (!dst || !dst_metric(dst, RTAX_RTT))) {
1409 /* Without syncookies last quarter of 1403 /* Without syncookies last quarter of
1410 * backlog is filled with destinations, 1404 * backlog is filled with destinations,
1411 * proven to be alive. 1405 * proven to be alive.
@@ -1425,7 +1419,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1425 1419
1426 if (tcp_v4_send_synack(sk, dst, req, 1420 if (tcp_v4_send_synack(sk, dst, req,
1427 (struct request_values *)&tmp_ext, 1421 (struct request_values *)&tmp_ext,
1428 skb_get_queue_mapping(skb)) || 1422 skb_get_queue_mapping(skb),
1423 want_cookie) ||
1429 want_cookie) 1424 want_cookie)
1430 goto drop_and_free; 1425 goto drop_and_free;
1431 1426
@@ -1623,6 +1618,20 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1623 1618
1624 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1619 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1625 sock_rps_save_rxhash(sk, skb); 1620 sock_rps_save_rxhash(sk, skb);
1621 if (sk->sk_rx_dst) {
1622 struct dst_entry *dst = sk->sk_rx_dst;
1623 if (dst->ops->check(dst, 0) == NULL) {
1624 dst_release(dst);
1625 sk->sk_rx_dst = NULL;
1626 }
1627 }
1628 if (unlikely(sk->sk_rx_dst == NULL)) {
1629 struct inet_sock *icsk = inet_sk(sk);
1630 struct rtable *rt = skb_rtable(skb);
1631
1632 sk->sk_rx_dst = dst_clone(&rt->dst);
1633 icsk->rx_dst_ifindex = inet_iif(skb);
1634 }
1626 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1635 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1627 rsk = sk; 1636 rsk = sk;
1628 goto reset; 1637 goto reset;
@@ -1672,6 +1681,49 @@ csum_err:
1672} 1681}
1673EXPORT_SYMBOL(tcp_v4_do_rcv); 1682EXPORT_SYMBOL(tcp_v4_do_rcv);
1674 1683
1684void tcp_v4_early_demux(struct sk_buff *skb)
1685{
1686 struct net *net = dev_net(skb->dev);
1687 const struct iphdr *iph;
1688 const struct tcphdr *th;
1689 struct net_device *dev;
1690 struct sock *sk;
1691
1692 if (skb->pkt_type != PACKET_HOST)
1693 return;
1694
1695 if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1696 return;
1697
1698 iph = ip_hdr(skb);
1699 th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1700
1701 if (th->doff < sizeof(struct tcphdr) / 4)
1702 return;
1703
1704 if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1705 return;
1706
1707 dev = skb->dev;
1708 sk = __inet_lookup_established(net, &tcp_hashinfo,
1709 iph->saddr, th->source,
1710 iph->daddr, ntohs(th->dest),
1711 dev->ifindex);
1712 if (sk) {
1713 skb->sk = sk;
1714 skb->destructor = sock_edemux;
1715 if (sk->sk_state != TCP_TIME_WAIT) {
1716 struct dst_entry *dst = sk->sk_rx_dst;
1717 struct inet_sock *icsk = inet_sk(sk);
1718 if (dst)
1719 dst = dst_check(dst, 0);
1720 if (dst &&
1721 icsk->rx_dst_ifindex == dev->ifindex)
1722 skb_dst_set_noref(skb, dst);
1723 }
1724 }
1725}
1726
1675/* 1727/*
1676 * From tcp_input.c 1728 * From tcp_input.c
1677 */ 1729 */
@@ -1821,40 +1873,10 @@ do_time_wait:
1821 goto discard_it; 1873 goto discard_it;
1822} 1874}
1823 1875
1824struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1825{
1826 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1827 struct inet_sock *inet = inet_sk(sk);
1828 struct inet_peer *peer;
1829
1830 if (!rt ||
1831 inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1832 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1833 *release_it = true;
1834 } else {
1835 if (!rt->peer)
1836 rt_bind_peer(rt, inet->inet_daddr, 1);
1837 peer = rt->peer;
1838 *release_it = false;
1839 }
1840
1841 return peer;
1842}
1843EXPORT_SYMBOL(tcp_v4_get_peer);
1844
1845void *tcp_v4_tw_get_peer(struct sock *sk)
1846{
1847 const struct inet_timewait_sock *tw = inet_twsk(sk);
1848
1849 return inet_getpeer_v4(tw->tw_daddr, 1);
1850}
1851EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1852
1853static struct timewait_sock_ops tcp_timewait_sock_ops = { 1876static struct timewait_sock_ops tcp_timewait_sock_ops = {
1854 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1877 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1855 .twsk_unique = tcp_twsk_unique, 1878 .twsk_unique = tcp_twsk_unique,
1856 .twsk_destructor= tcp_twsk_destructor, 1879 .twsk_destructor= tcp_twsk_destructor,
1857 .twsk_getpeer = tcp_v4_tw_get_peer,
1858}; 1880};
1859 1881
1860const struct inet_connection_sock_af_ops ipv4_specific = { 1882const struct inet_connection_sock_af_ops ipv4_specific = {
@@ -1863,7 +1885,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1863 .rebuild_header = inet_sk_rebuild_header, 1885 .rebuild_header = inet_sk_rebuild_header,
1864 .conn_request = tcp_v4_conn_request, 1886 .conn_request = tcp_v4_conn_request,
1865 .syn_recv_sock = tcp_v4_syn_recv_sock, 1887 .syn_recv_sock = tcp_v4_syn_recv_sock,
1866 .get_peer = tcp_v4_get_peer,
1867 .net_header_len = sizeof(struct iphdr), 1888 .net_header_len = sizeof(struct iphdr),
1868 .setsockopt = ip_setsockopt, 1889 .setsockopt = ip_setsockopt,
1869 .getsockopt = ip_getsockopt, 1890 .getsockopt = ip_getsockopt,
@@ -1953,6 +1974,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
1953 tp->cookie_values = NULL; 1974 tp->cookie_values = NULL;
1954 } 1975 }
1955 1976
1977 /* If socket is aborted during connect operation */
1978 tcp_free_fastopen_req(tp);
1979
1956 sk_sockets_allocated_dec(sk); 1980 sk_sockets_allocated_dec(sk);
1957 sock_release_memcg(sk); 1981 sock_release_memcg(sk);
1958} 1982}
@@ -2593,6 +2617,8 @@ struct proto tcp_prot = {
2593 .sendmsg = tcp_sendmsg, 2617 .sendmsg = tcp_sendmsg,
2594 .sendpage = tcp_sendpage, 2618 .sendpage = tcp_sendpage,
2595 .backlog_rcv = tcp_v4_do_rcv, 2619 .backlog_rcv = tcp_v4_do_rcv,
2620 .release_cb = tcp_release_cb,
2621 .mtu_reduced = tcp_v4_mtu_reduced,
2596 .hash = inet_hash, 2622 .hash = inet_hash,
2597 .unhash = inet_unhash, 2623 .unhash = inet_unhash,
2598 .get_port = inet_csk_get_port, 2624 .get_port = inet_csk_get_port,
@@ -2624,13 +2650,11 @@ EXPORT_SYMBOL(tcp_prot);
2624 2650
2625static int __net_init tcp_sk_init(struct net *net) 2651static int __net_init tcp_sk_init(struct net *net)
2626{ 2652{
2627 return inet_ctl_sock_create(&net->ipv4.tcp_sock, 2653 return 0;
2628 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2629} 2654}
2630 2655
2631static void __net_exit tcp_sk_exit(struct net *net) 2656static void __net_exit tcp_sk_exit(struct net *net)
2632{ 2657{
2633 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2634} 2658}
2635 2659
2636static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2660static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
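The tcp_ipv4.c hunks above move PMTU handling onto the new deferral path: tcp_v4_err() stashes the ICMP_FRAG_NEEDED value in tp->mtu_info and, if the socket is currently owned by user context, sets TCP_MTU_REDUCED_DEFERRED in tsq_flags so that the .mtu_reduced handler runs later from .release_cb. A hedged userspace sketch of that set-a-flag-now, handle-on-release pattern (plain fields stand in for the socket lock and atomic bitops):

    /* Illustrative sketch (not kernel code) of deferring work with a flag bit
     * while the "lock holder" is active, then running it on release. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MTU_REDUCED_DEFERRED (1u << 0)

    struct fake_sock {
            bool owned_by_user;
            unsigned int deferred_flags;
            unsigned int mtu_info;
    };

    static void mtu_reduced(struct fake_sock *sk)
    {
            printf("shrinking path MTU to %u\n", sk->mtu_info);
    }

    static void icmp_frag_needed(struct fake_sock *sk, unsigned int mtu)
    {
            sk->mtu_info = mtu;
            if (!sk->owned_by_user)
                    mtu_reduced(sk);                            /* handle now */
            else
                    sk->deferred_flags |= MTU_REDUCED_DEFERRED; /* defer */
    }

    static void release_sock(struct fake_sock *sk)
    {
            if (sk->deferred_flags & MTU_REDUCED_DEFERRED) {
                    sk->deferred_flags &= ~MTU_REDUCED_DEFERRED;
                    mtu_reduced(sk);
            }
            sk->owned_by_user = false;
    }

    int main(void)
    {
            struct fake_sock sk = { .owned_by_user = true };

            icmp_frag_needed(&sk, 1400); /* arrives while user holds the socket */
            release_sock(&sk);           /* deferred handler runs here */
            return 0;
    }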
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..2288a6399e1e
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,745 @@
1#include <linux/rcupdate.h>
2#include <linux/spinlock.h>
3#include <linux/jiffies.h>
4#include <linux/bootmem.h>
5#include <linux/module.h>
6#include <linux/cache.h>
7#include <linux/slab.h>
8#include <linux/init.h>
9#include <linux/tcp.h>
10#include <linux/hash.h>
11
12#include <net/inet_connection_sock.h>
13#include <net/net_namespace.h>
14#include <net/request_sock.h>
15#include <net/inetpeer.h>
16#include <net/sock.h>
17#include <net/ipv6.h>
18#include <net/dst.h>
19#include <net/tcp.h>
20
21int sysctl_tcp_nometrics_save __read_mostly;
22
23enum tcp_metric_index {
24 TCP_METRIC_RTT,
25 TCP_METRIC_RTTVAR,
26 TCP_METRIC_SSTHRESH,
27 TCP_METRIC_CWND,
28 TCP_METRIC_REORDERING,
29
30 /* Always last. */
31 TCP_METRIC_MAX,
32};
33
34struct tcp_fastopen_metrics {
35 u16 mss;
36 u16 syn_loss:10; /* Recurring Fast Open SYN losses */
37 unsigned long last_syn_loss; /* Last Fast Open SYN loss */
38 struct tcp_fastopen_cookie cookie;
39};
40
41struct tcp_metrics_block {
42 struct tcp_metrics_block __rcu *tcpm_next;
43 struct inetpeer_addr tcpm_addr;
44 unsigned long tcpm_stamp;
45 u32 tcpm_ts;
46 u32 tcpm_ts_stamp;
47 u32 tcpm_lock;
48 u32 tcpm_vals[TCP_METRIC_MAX];
49 struct tcp_fastopen_metrics tcpm_fastopen;
50};
51
52static bool tcp_metric_locked(struct tcp_metrics_block *tm,
53 enum tcp_metric_index idx)
54{
55 return tm->tcpm_lock & (1 << idx);
56}
57
58static u32 tcp_metric_get(struct tcp_metrics_block *tm,
59 enum tcp_metric_index idx)
60{
61 return tm->tcpm_vals[idx];
62}
63
64static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
65 enum tcp_metric_index idx)
66{
67 return msecs_to_jiffies(tm->tcpm_vals[idx]);
68}
69
70static void tcp_metric_set(struct tcp_metrics_block *tm,
71 enum tcp_metric_index idx,
72 u32 val)
73{
74 tm->tcpm_vals[idx] = val;
75}
76
77static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
78 enum tcp_metric_index idx,
79 u32 val)
80{
81 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
82}
83
84static bool addr_same(const struct inetpeer_addr *a,
85 const struct inetpeer_addr *b)
86{
87 const struct in6_addr *a6, *b6;
88
89 if (a->family != b->family)
90 return false;
91 if (a->family == AF_INET)
92 return a->addr.a4 == b->addr.a4;
93
94 a6 = (const struct in6_addr *) &a->addr.a6[0];
95 b6 = (const struct in6_addr *) &b->addr.a6[0];
96
97 return ipv6_addr_equal(a6, b6);
98}
99
100struct tcpm_hash_bucket {
101 struct tcp_metrics_block __rcu *chain;
102};
103
104static DEFINE_SPINLOCK(tcp_metrics_lock);
105
106static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
107{
108 u32 val;
109
110 tm->tcpm_stamp = jiffies;
111
112 val = 0;
113 if (dst_metric_locked(dst, RTAX_RTT))
114 val |= 1 << TCP_METRIC_RTT;
115 if (dst_metric_locked(dst, RTAX_RTTVAR))
116 val |= 1 << TCP_METRIC_RTTVAR;
117 if (dst_metric_locked(dst, RTAX_SSTHRESH))
118 val |= 1 << TCP_METRIC_SSTHRESH;
119 if (dst_metric_locked(dst, RTAX_CWND))
120 val |= 1 << TCP_METRIC_CWND;
121 if (dst_metric_locked(dst, RTAX_REORDERING))
122 val |= 1 << TCP_METRIC_REORDERING;
123 tm->tcpm_lock = val;
124
125 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
126 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
127 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
128 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
129 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
130 tm->tcpm_ts = 0;
131 tm->tcpm_ts_stamp = 0;
132 tm->tcpm_fastopen.mss = 0;
133 tm->tcpm_fastopen.syn_loss = 0;
134 tm->tcpm_fastopen.cookie.len = 0;
135}
136
137static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
138 struct inetpeer_addr *addr,
139 unsigned int hash,
140 bool reclaim)
141{
142 struct tcp_metrics_block *tm;
143 struct net *net;
144
145 spin_lock_bh(&tcp_metrics_lock);
146 net = dev_net(dst->dev);
147 if (unlikely(reclaim)) {
148 struct tcp_metrics_block *oldest;
149
150 oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
151 for (tm = rcu_dereference(oldest->tcpm_next); tm;
152 tm = rcu_dereference(tm->tcpm_next)) {
153 if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
154 oldest = tm;
155 }
156 tm = oldest;
157 } else {
158 tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
159 if (!tm)
160 goto out_unlock;
161 }
162 tm->tcpm_addr = *addr;
163
164 tcpm_suck_dst(tm, dst);
165
166 if (likely(!reclaim)) {
167 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
168 rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
169 }
170
171out_unlock:
172 spin_unlock_bh(&tcp_metrics_lock);
173 return tm;
174}
175
176#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
177
178static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
179{
180 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
181 tcpm_suck_dst(tm, dst);
182}
183
184#define TCP_METRICS_RECLAIM_DEPTH 5
185#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
186
187static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
188{
189 if (tm)
190 return tm;
191 if (depth > TCP_METRICS_RECLAIM_DEPTH)
192 return TCP_METRICS_RECLAIM_PTR;
193 return NULL;
194}
195
196static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
197 struct net *net, unsigned int hash)
198{
199 struct tcp_metrics_block *tm;
200 int depth = 0;
201
202 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
203 tm = rcu_dereference(tm->tcpm_next)) {
204 if (addr_same(&tm->tcpm_addr, addr))
205 break;
206 depth++;
207 }
208 return tcp_get_encode(tm, depth);
209}
210
211static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
212 struct dst_entry *dst)
213{
214 struct tcp_metrics_block *tm;
215 struct inetpeer_addr addr;
216 unsigned int hash;
217 struct net *net;
218
219 addr.family = req->rsk_ops->family;
220 switch (addr.family) {
221 case AF_INET:
222 addr.addr.a4 = inet_rsk(req)->rmt_addr;
223 hash = (__force unsigned int) addr.addr.a4;
224 break;
225 case AF_INET6:
226 *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
227 hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
228 break;
229 default:
230 return NULL;
231 }
232
233 net = dev_net(dst->dev);
234 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
235
236 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
237 tm = rcu_dereference(tm->tcpm_next)) {
238 if (addr_same(&tm->tcpm_addr, &addr))
239 break;
240 }
241 tcpm_check_stamp(tm, dst);
242 return tm;
243}
244
245static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
246{
247 struct inet6_timewait_sock *tw6;
248 struct tcp_metrics_block *tm;
249 struct inetpeer_addr addr;
250 unsigned int hash;
251 struct net *net;
252
253 addr.family = tw->tw_family;
254 switch (addr.family) {
255 case AF_INET:
256 addr.addr.a4 = tw->tw_daddr;
257 hash = (__force unsigned int) addr.addr.a4;
258 break;
259 case AF_INET6:
260 tw6 = inet6_twsk((struct sock *)tw);
261 *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
262 hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
263 break;
264 default:
265 return NULL;
266 }
267
268 net = twsk_net(tw);
269 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
270
271 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
272 tm = rcu_dereference(tm->tcpm_next)) {
273 if (addr_same(&tm->tcpm_addr, &addr))
274 break;
275 }
276 return tm;
277}
278
279static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
280 struct dst_entry *dst,
281 bool create)
282{
283 struct tcp_metrics_block *tm;
284 struct inetpeer_addr addr;
285 unsigned int hash;
286 struct net *net;
287 bool reclaim;
288
289 addr.family = sk->sk_family;
290 switch (addr.family) {
291 case AF_INET:
292 addr.addr.a4 = inet_sk(sk)->inet_daddr;
293 hash = (__force unsigned int) addr.addr.a4;
294 break;
295 case AF_INET6:
296 *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
297 hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
298 break;
299 default:
300 return NULL;
301 }
302
303 net = dev_net(dst->dev);
304 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
305
306 tm = __tcp_get_metrics(&addr, net, hash);
307 reclaim = false;
308 if (tm == TCP_METRICS_RECLAIM_PTR) {
309 reclaim = true;
310 tm = NULL;
311 }
312 if (!tm && create)
313 tm = tcpm_new(dst, &addr, hash, reclaim);
314 else
315 tcpm_check_stamp(tm, dst);
316
317 return tm;
318}
319
320/* Save metrics learned by this TCP session. This function is called
 321 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
322 * or goes from LAST-ACK to CLOSE.
323 */
324void tcp_update_metrics(struct sock *sk)
325{
326 const struct inet_connection_sock *icsk = inet_csk(sk);
327 struct dst_entry *dst = __sk_dst_get(sk);
328 struct tcp_sock *tp = tcp_sk(sk);
329 struct tcp_metrics_block *tm;
330 unsigned long rtt;
331 u32 val;
332 int m;
333
334 if (sysctl_tcp_nometrics_save || !dst)
335 return;
336
337 if (dst->flags & DST_HOST)
338 dst_confirm(dst);
339
340 rcu_read_lock();
341 if (icsk->icsk_backoff || !tp->srtt) {
342 /* This session failed to estimate rtt. Why?
343 * Probably, no packets returned in time. Reset our
344 * results.
345 */
346 tm = tcp_get_metrics(sk, dst, false);
347 if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
348 tcp_metric_set(tm, TCP_METRIC_RTT, 0);
349 goto out_unlock;
350 } else
351 tm = tcp_get_metrics(sk, dst, true);
352
353 if (!tm)
354 goto out_unlock;
355
356 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
357 m = rtt - tp->srtt;
358
359 /* If newly calculated rtt larger than stored one, store new
360 * one. Otherwise, use EWMA. Remember, rtt overestimation is
361 * always better than underestimation.
362 */
363 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
364 if (m <= 0)
365 rtt = tp->srtt;
366 else
367 rtt -= (m >> 3);
368 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
369 }
370
371 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
372 unsigned long var;
373
374 if (m < 0)
375 m = -m;
376
377 /* Scale deviation to rttvar fixed point */
378 m >>= 1;
379 if (m < tp->mdev)
380 m = tp->mdev;
381
382 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
383 if (m >= var)
384 var = m;
385 else
386 var -= (var - m) >> 2;
387
388 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
389 }
390
391 if (tcp_in_initial_slowstart(tp)) {
392 /* Slow start still did not finish. */
393 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
394 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
395 if (val && (tp->snd_cwnd >> 1) > val)
396 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
397 tp->snd_cwnd >> 1);
398 }
399 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
400 val = tcp_metric_get(tm, TCP_METRIC_CWND);
401 if (tp->snd_cwnd > val)
402 tcp_metric_set(tm, TCP_METRIC_CWND,
403 tp->snd_cwnd);
404 }
405 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
406 icsk->icsk_ca_state == TCP_CA_Open) {
407 /* Cong. avoidance phase, cwnd is reliable. */
408 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
409 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
410 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
411 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
412 val = tcp_metric_get(tm, TCP_METRIC_CWND);
413 tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
414 }
415 } else {
 416 /* Else slow start did not finish, cwnd is nonsense and
 417 * ssthresh may also be invalid.
418 */
419 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
420 val = tcp_metric_get(tm, TCP_METRIC_CWND);
421 tcp_metric_set(tm, TCP_METRIC_CWND,
422 (val + tp->snd_ssthresh) >> 1);
423 }
424 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
425 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
426 if (val && tp->snd_ssthresh > val)
427 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
428 tp->snd_ssthresh);
429 }
430 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
431 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
432 if (val < tp->reordering &&
433 tp->reordering != sysctl_tcp_reordering)
434 tcp_metric_set(tm, TCP_METRIC_REORDERING,
435 tp->reordering);
436 }
437 }
438 tm->tcpm_stamp = jiffies;
439out_unlock:
440 rcu_read_unlock();
441}
442
443/* Initialize metrics on socket. */
444
445void tcp_init_metrics(struct sock *sk)
446{
447 struct dst_entry *dst = __sk_dst_get(sk);
448 struct tcp_sock *tp = tcp_sk(sk);
449 struct tcp_metrics_block *tm;
450 u32 val;
451
452 if (dst == NULL)
453 goto reset;
454
455 dst_confirm(dst);
456
457 rcu_read_lock();
458 tm = tcp_get_metrics(sk, dst, true);
459 if (!tm) {
460 rcu_read_unlock();
461 goto reset;
462 }
463
464 if (tcp_metric_locked(tm, TCP_METRIC_CWND))
465 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
466
467 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
468 if (val) {
469 tp->snd_ssthresh = val;
470 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
471 tp->snd_ssthresh = tp->snd_cwnd_clamp;
472 } else {
 473 /* ssthresh may have been reduced unnecessarily during the
474 * 3WHS. Restore it back to its initial default.
475 */
476 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
477 }
478 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
479 if (val && tp->reordering != val) {
480 tcp_disable_fack(tp);
481 tcp_disable_early_retrans(tp);
482 tp->reordering = val;
483 }
484
485 val = tcp_metric_get(tm, TCP_METRIC_RTT);
486 if (val == 0 || tp->srtt == 0) {
487 rcu_read_unlock();
488 goto reset;
489 }
490 /* Initial rtt is determined from SYN,SYN-ACK.
491 * The segment is small and rtt may appear much
492 * less than real one. Use per-dst memory
493 * to make it more realistic.
494 *
 495 * A bit of theory. RTT is the time that passes from sending a "normal"
 496 * sized packet until it is ACKed. In normal circumstances sending small
 497 * packets also forces the peer to delay ACKs, so the calculation stays
 498 * correct. The algorithm is adaptive and, provided we follow the specs,
 499 * it NEVER underestimates RTT. BUT! If the peer plays clever tricks,
 500 * sending "quick acks" for long enough to drive RTT down to a low value
 501 * and then abruptly stopping and delaying ACKs instead, expect
 502 * trouble.
503 */
504 val = msecs_to_jiffies(val);
505 if (val > tp->srtt) {
506 tp->srtt = val;
507 tp->rtt_seq = tp->snd_nxt;
508 }
509 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
510 if (val > tp->mdev) {
511 tp->mdev = val;
512 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
513 }
514 rcu_read_unlock();
515
516 tcp_set_rto(sk);
517reset:
518 if (tp->srtt == 0) {
519 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
520 * 3WHS. This is most likely due to retransmission,
521 * including spurious one. Reset the RTO back to 3secs
522 * from the more aggressive 1sec to avoid more spurious
523 * retransmission.
524 */
525 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
526 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
527 }
528 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
529 * retransmitted. In light of RFC6298 more aggressive 1sec
530 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
531 * retransmission has occurred.
532 */
533 if (tp->total_retrans > 1)
534 tp->snd_cwnd = 1;
535 else
536 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
537 tp->snd_cwnd_stamp = tcp_time_stamp;
538}
539
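
The seeding rule above is easier to see outside the kernel's fixed-point arithmetic. A minimal userspace C sketch, with illustrative numbers and the plain RFC 6298 RTO formula standing in for tcp_set_rto():

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned int measured_srtt_ms = 8;   /* from the 3WHS, often too small */
	unsigned int cached_rtt_ms    = 40;  /* per-destination TCP_METRIC_RTT */
	unsigned int cached_rttvar_ms = 20;  /* per-destination TCP_METRIC_RTTVAR */
	unsigned int rto_min_ms       = 200; /* illustrative tcp_rto_min() value */

	/* the cached RTT can only raise the estimate, never lower it */
	unsigned int srtt   = max_u(measured_srtt_ms, cached_rtt_ms);
	unsigned int rttvar = max_u(cached_rttvar_ms, rto_min_ms);
	unsigned int rto    = srtt + 4 * rttvar;          /* RFC 6298 shape */

	printf("seeded srtt=%u ms rttvar=%u ms rto=%u ms\n", srtt, rttvar, rto);
	return 0;
}
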
540bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
541{
542 struct tcp_metrics_block *tm;
543 bool ret;
544
545 if (!dst)
546 return false;
547
548 rcu_read_lock();
549 tm = __tcp_get_metrics_req(req, dst);
550 if (paws_check) {
551 if (tm &&
552 (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
553 (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
554 ret = false;
555 else
556 ret = true;
557 } else {
558 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
559 ret = true;
560 else
561 ret = false;
562 }
563 rcu_read_unlock();
564
565 return ret;
566}
567EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
568
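
A minimal sketch of the serial-number arithmetic behind the paws_check branch of tcp_peer_is_proven() above; the constants match the kernel's TCP_PAWS_WINDOW/TCP_PAWS_MSL values, while the sample numbers are purely illustrative:

#include <stdio.h>
#include <stdint.h>

#define TCP_PAWS_WINDOW 1            /* replay window, in timestamp ticks */
#define TCP_PAWS_MSL    60           /* seconds a cached stamp stays fresh */

/* true if the request would be rejected by the paws_check branch */
static int paws_reject(uint32_t cached_ts, uint32_t cached_stamp,
		       uint32_t req_ts, uint32_t now)
{
	return now - cached_stamp < TCP_PAWS_MSL &&
	       (int32_t)(cached_ts - req_ts) > TCP_PAWS_WINDOW;
}

int main(void)
{
	uint32_t now = 1000;

	/* cached timestamp newer than the one in the SYN -> suspicious */
	printf("%d\n", paws_reject(500000, now - 10, 499000, now));  /* 1 */
	/* cached stamp too old to matter */
	printf("%d\n", paws_reject(500000, now - 120, 499000, now)); /* 0 */
	return 0;
}
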
569void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
570{
571 struct tcp_metrics_block *tm;
572
573 rcu_read_lock();
574 tm = tcp_get_metrics(sk, dst, true);
575 if (tm) {
576 struct tcp_sock *tp = tcp_sk(sk);
577
578 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
579 tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
580 tp->rx_opt.ts_recent = tm->tcpm_ts;
581 }
582 }
583 rcu_read_unlock();
584}
585EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
586
587/* VJ's idea. Save the last timestamp seen from this destination and hold
588 * it for at least the normal timewait interval, to use for duplicate
589 * segment detection in subsequent connections before they enter the
590 * synchronized state.
591 */
592bool tcp_remember_stamp(struct sock *sk)
593{
594 struct dst_entry *dst = __sk_dst_get(sk);
595 bool ret = false;
596
597 if (dst) {
598 struct tcp_metrics_block *tm;
599
600 rcu_read_lock();
601 tm = tcp_get_metrics(sk, dst, true);
602 if (tm) {
603 struct tcp_sock *tp = tcp_sk(sk);
604
605 if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
606 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
607 tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
608 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
609 tm->tcpm_ts = tp->rx_opt.ts_recent;
610 }
611 ret = true;
612 }
613 rcu_read_unlock();
614 }
615 return ret;
616}
617
618bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
619{
620 struct tcp_metrics_block *tm;
621 bool ret = false;
622
623 rcu_read_lock();
624 tm = __tcp_get_metrics_tw(tw);
625 if (tm) {
626 const struct tcp_timewait_sock *tcptw;
627 struct sock *sk = (struct sock *) tw;
628
629 tcptw = tcp_twsk(sk);
630 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
631 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
632 tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
633 tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
634 tm->tcpm_ts = tcptw->tw_ts_recent;
635 }
636 ret = true;
637 }
638 rcu_read_unlock();
639
640 return ret;
641}
642
643static DEFINE_SEQLOCK(fastopen_seqlock);
644
645void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
646 struct tcp_fastopen_cookie *cookie,
647 int *syn_loss, unsigned long *last_syn_loss)
648{
649 struct tcp_metrics_block *tm;
650
651 rcu_read_lock();
652 tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
653 if (tm) {
654 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
655 unsigned int seq;
656
657 do {
658 seq = read_seqbegin(&fastopen_seqlock);
659 if (tfom->mss)
660 *mss = tfom->mss;
661 *cookie = tfom->cookie;
662 *syn_loss = tfom->syn_loss;
663 *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
664 } while (read_seqretry(&fastopen_seqlock, seq));
665 }
666 rcu_read_unlock();
667}
668
669void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
670 struct tcp_fastopen_cookie *cookie, bool syn_lost)
671{
672 struct tcp_metrics_block *tm;
673
674 rcu_read_lock();
675 tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
676 if (tm) {
677 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
678
679 write_seqlock_bh(&fastopen_seqlock);
680 tfom->mss = mss;
681 if (cookie->len > 0)
682 tfom->cookie = *cookie;
683 if (syn_lost) {
684 ++tfom->syn_loss;
685 tfom->last_syn_loss = jiffies;
686 } else
687 tfom->syn_loss = 0;
688 write_sequnlock_bh(&fastopen_seqlock);
689 }
690 rcu_read_unlock();
691}
692
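
The fastopen_seqlock usage above follows the usual seqlock discipline: the writer bumps the sequence to odd, updates the fields, then bumps it back to even; readers retry until they observe an unchanged even value. A hedged userspace sketch of that pattern, using C11 atomics with simplified memory ordering and illustrative field sizes:

#include <stdatomic.h>
#include <stdio.h>

struct fastopen_cache {
	atomic_uint seq;             /* even = stable, odd = write in progress */
	unsigned short mss;
	unsigned char cookie[8];
};

static struct fastopen_cache cache;

static void cache_write(unsigned short mss, const unsigned char *cookie)
{
	atomic_fetch_add_explicit(&cache.seq, 1, memory_order_release); /* -> odd */
	cache.mss = mss;
	for (int i = 0; i < 8; i++)
		cache.cookie[i] = cookie[i];
	atomic_fetch_add_explicit(&cache.seq, 1, memory_order_release); /* -> even */
}

static unsigned short cache_read(unsigned char *cookie)
{
	unsigned int begin;
	unsigned short mss;

	do {
		do {	/* wait for an even (stable) sequence, then snapshot */
			begin = atomic_load_explicit(&cache.seq, memory_order_acquire);
		} while (begin & 1);
		mss = cache.mss;
		for (int i = 0; i < 8; i++)
			cookie[i] = cache.cookie[i];
	} while (atomic_load_explicit(&cache.seq, memory_order_acquire) != begin);

	return mss;
}

int main(void)
{
	unsigned char c[8] = { 0xde, 0xad, 0xbe, 0xef, 1, 2, 3, 4 }, out[8];

	cache_write(1460, c);
	printf("mss=%u cookie[0]=0x%02x\n", cache_read(out), out[0]);
	return 0;
}
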
693static unsigned int tcpmhash_entries;
694static int __init set_tcpmhash_entries(char *str)
695{
696 ssize_t ret;
697
698 if (!str)
699 return 0;
700
701 ret = kstrtouint(str, 0, &tcpmhash_entries);
702 if (ret)
703 return 0;
704
705 return 1;
706}
707__setup("tcpmhash_entries=", set_tcpmhash_entries);
708
709static int __net_init tcp_net_metrics_init(struct net *net)
710{
711 size_t size;
712 unsigned int slots;
713
714 slots = tcpmhash_entries;
715 if (!slots) {
716 if (totalram_pages >= 128 * 1024)
717 slots = 16 * 1024;
718 else
719 slots = 8 * 1024;
720 }
721
722 net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
723 size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
724
725 net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
726 if (!net->ipv4.tcp_metrics_hash)
727 return -ENOMEM;
728
729 return 0;
730}
731
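
A small userspace sketch of the sizing math in tcp_net_metrics_init(): round the slot count up to a power of two and shift the bucket size by that order. order_base_2() is reimplemented here for userspace, and the bucket size is a placeholder rather than the real sizeof(struct tcpm_hash_bucket):

#include <stdio.h>

static unsigned int order_base_2(unsigned int n)
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int slots = 16 * 1024;       /* >= 512 MiB of RAM, 4 KiB pages */
	unsigned int log = order_base_2(slots);
	size_t bucket_size = sizeof(void *);  /* placeholder bucket size */

	printf("hash_log=%u buckets=%u table=%zu bytes\n",
	       log, 1u << log, bucket_size << log);
	return 0;
}
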
732static void __net_exit tcp_net_metrics_exit(struct net *net)
733{
734 kfree(net->ipv4.tcp_metrics_hash);
735}
736
737static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
738 .init = tcp_net_metrics_init,
739 .exit = tcp_net_metrics_exit,
740};
741
742void __init tcp_metrics_init(void)
743{
744 register_pernet_subsys(&tcp_net_metrics_ops);
745}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b85d9fe7d663..5912ac3fd240 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,56 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static bool tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return true;
76 }
77
78 return false;
79}
80
81static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return true;
98 }
99 return false;
100}
101
102static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 52static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
103{ 53{
104 if (seq == s_win) 54 if (seq == s_win)
@@ -147,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
147 97
148 tmp_opt.saw_tstamp = 0; 98 tmp_opt.saw_tstamp = 0;
149 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 99 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
150 tcp_parse_options(skb, &tmp_opt, &hash_location, 0); 100 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
151 101
152 if (tmp_opt.saw_tstamp) { 102 if (tmp_opt.saw_tstamp) {
153 tmp_opt.ts_recent = tcptw->tw_ts_recent; 103 tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -327,8 +277,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
327 if (tw != NULL) { 277 if (tw != NULL) {
328 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 278 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
329 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 279 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
280 struct inet_sock *inet = inet_sk(sk);
330 281
331 tw->tw_transparent = inet_sk(sk)->transparent; 282 tw->tw_transparent = inet->transparent;
332 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 283 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
333 tcptw->tw_rcv_nxt = tp->rcv_nxt; 284 tcptw->tw_rcv_nxt = tp->rcv_nxt;
334 tcptw->tw_snd_nxt = tp->snd_nxt; 285 tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -403,6 +354,7 @@ void tcp_twsk_destructor(struct sock *sk)
403{ 354{
404#ifdef CONFIG_TCP_MD5SIG 355#ifdef CONFIG_TCP_MD5SIG
405 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 356 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
357
406 if (twsk->tw_md5_key) { 358 if (twsk->tw_md5_key) {
407 tcp_free_md5sig_pool(); 359 tcp_free_md5sig_pool();
408 kfree_rcu(twsk->tw_md5_key, rcu); 360 kfree_rcu(twsk->tw_md5_key, rcu);
@@ -435,6 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
435 struct tcp_sock *oldtp = tcp_sk(sk); 387 struct tcp_sock *oldtp = tcp_sk(sk);
436 struct tcp_cookie_values *oldcvp = oldtp->cookie_values; 388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
437 389
390 newsk->sk_rx_dst = dst_clone(skb_dst(skb));
391
438 /* TCP Cookie Transactions require space for the cookie pair, 392 /* TCP Cookie Transactions require space for the cookie pair,
439 * as it differs for each connection. There is no need to 393 * as it differs for each connection. There is no need to
440 * copy any s_data_payload stored at the original socket. 394 * copy any s_data_payload stored at the original socket.
@@ -470,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
470 treq->snt_isn + 1 + tcp_s_data_size(oldtp); 424 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
471 425
472 tcp_prequeue_init(newtp); 426 tcp_prequeue_init(newtp);
427 INIT_LIST_HEAD(&newtp->tsq_node);
473 428
474 tcp_init_wl(newtp, treq->rcv_isn); 429 tcp_init_wl(newtp, treq->rcv_isn);
475 430
@@ -579,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
579 534
580 tmp_opt.saw_tstamp = 0; 535 tmp_opt.saw_tstamp = 0;
581 if (th->doff > (sizeof(struct tcphdr)>>2)) { 536 if (th->doff > (sizeof(struct tcphdr)>>2)) {
582 tcp_parse_options(skb, &tmp_opt, &hash_location, 0); 537 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
583 538
584 if (tmp_opt.saw_tstamp) { 539 if (tmp_opt.saw_tstamp) {
585 tmp_opt.ts_recent = req->ts_recent; 540 tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 803cbfe82fbc..33cd065cfbd8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
50 */ 50 */
51int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 51int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52 52
53/* Default TSQ limit of two TSO segments */
54int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55
53/* This limits the percentage of the congestion window which we 56/* This limits the percentage of the congestion window which we
54 * will allow a single TSO frame to consume. Building TSO frames 57 * will allow a single TSO frame to consume. Building TSO frames
55 * which are too large can cause TCP streams to be bursty. 58 * which are too large can cause TCP streams to be bursty.
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
65int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ 68int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
66EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); 69EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
67 70
71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72 int push_one, gfp_t gfp);
68 73
69/* Account for new data that has been sent to the network. */ 74/* Account for new data that has been sent to the network. */
70static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 75static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -380,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
380#define OPTION_MD5 (1 << 2) 385#define OPTION_MD5 (1 << 2)
381#define OPTION_WSCALE (1 << 3) 386#define OPTION_WSCALE (1 << 3)
382#define OPTION_COOKIE_EXTENSION (1 << 4) 387#define OPTION_COOKIE_EXTENSION (1 << 4)
388#define OPTION_FAST_OPEN_COOKIE (1 << 8)
383 389
384struct tcp_out_options { 390struct tcp_out_options {
385 u8 options; /* bit field of OPTION_* */ 391 u16 options; /* bit field of OPTION_* */
392 u16 mss; /* 0 to disable */
386 u8 ws; /* window scale, 0 to disable */ 393 u8 ws; /* window scale, 0 to disable */
387 u8 num_sack_blocks; /* number of SACK blocks to include */ 394 u8 num_sack_blocks; /* number of SACK blocks to include */
388 u8 hash_size; /* bytes in hash_location */ 395 u8 hash_size; /* bytes in hash_location */
389 u16 mss; /* 0 to disable */
390 __u32 tsval, tsecr; /* need to include OPTION_TS */
391 __u8 *hash_location; /* temporary pointer, overloaded */ 396 __u8 *hash_location; /* temporary pointer, overloaded */
397 __u32 tsval, tsecr; /* need to include OPTION_TS */
398 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
392}; 399};
393 400
394/* The sysctl int routines are generic, so check consistency here. 401/* The sysctl int routines are generic, so check consistency here.
@@ -437,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
437static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 444static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
438 struct tcp_out_options *opts) 445 struct tcp_out_options *opts)
439{ 446{
440 u8 options = opts->options; /* mungable copy */ 447 u16 options = opts->options; /* mungable copy */
441 448
442 /* Having both authentication and cookies for security is redundant, 449 /* Having both authentication and cookies for security is redundant,
443 * and there's certainly not enough room. Instead, the cookie-less 450 * and there's certainly not enough room. Instead, the cookie-less
@@ -559,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
559 566
560 tp->rx_opt.dsack = 0; 567 tp->rx_opt.dsack = 0;
561 } 568 }
569
570 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
571 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
572
573 *ptr++ = htonl((TCPOPT_EXP << 24) |
574 ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
575 TCPOPT_FASTOPEN_MAGIC);
576
577 memcpy(ptr, foc->val, foc->len);
578 if ((foc->len & 3) == 2) {
579 u8 *align = ((u8 *)ptr) + foc->len;
580 align[0] = align[1] = TCPOPT_NOP;
581 }
582 ptr += (foc->len + 3) >> 2;
583 }
562} 584}
563 585
564/* Compute TCP options for SYN packets. This is not the final 586/* Compute TCP options for SYN packets. This is not the final
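
The Fast Open option emitted above can be checked by hand. A hedged sketch of the wire layout, assuming the usual experimental-option constants (kind 254, base length 4, magic 0xF989) and an illustrative 6-byte cookie, which is exactly the (len & 3) == 2 case that needs NOP padding:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP                1
#define TCPOPT_EXP                254     /* experimental option kind */
#define TCPOLEN_EXP_FASTOPEN_BASE 4
#define TCPOPT_FASTOPEN_MAGIC     0xF989

int main(void)
{
	unsigned char cookie[6] = { 1, 2, 3, 4, 5, 6 };
	unsigned int len = sizeof(cookie);
	unsigned char opt[16];
	uint32_t word = htonl(((uint32_t)TCPOPT_EXP << 24) |
			      ((TCPOLEN_EXP_FASTOPEN_BASE + len) << 16) |
			      TCPOPT_FASTOPEN_MAGIC);
	unsigned int total;

	memcpy(opt, &word, 4);          /* kind, length, 16-bit magic */
	memcpy(opt + 4, cookie, len);   /* the cookie itself */
	total = 4 + len;
	while (total & 3)               /* NOP-pad to a 32-bit boundary */
		opt[total++] = TCPOPT_NOP;

	for (unsigned int i = 0; i < total; i++)
		printf("%02x ", opt[i]);
	printf("\n");                   /* fe 0a f9 89 01..06 01 01 */
	return 0;
}
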
@@ -574,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
574 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 596 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
575 tcp_cookie_size_check(cvp->cookie_desired) : 597 tcp_cookie_size_check(cvp->cookie_desired) :
576 0; 598 0;
599 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
577 600
578#ifdef CONFIG_TCP_MD5SIG 601#ifdef CONFIG_TCP_MD5SIG
579 *md5 = tp->af_specific->md5_lookup(sk, sk); 602 *md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -614,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
614 remaining -= TCPOLEN_SACKPERM_ALIGNED; 637 remaining -= TCPOLEN_SACKPERM_ALIGNED;
615 } 638 }
616 639
640 if (fastopen && fastopen->cookie.len >= 0) {
641 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
642 need = (need + 3) & ~3U; /* Align to 32 bits */
643 if (remaining >= need) {
644 opts->options |= OPTION_FAST_OPEN_COOKIE;
645 opts->fastopen_cookie = &fastopen->cookie;
646 remaining -= need;
647 tp->syn_fastopen = 1;
648 }
649 }
617 /* Note that timestamps are required by the specification. 650 /* Note that timestamps are required by the specification.
618 * 651 *
619 * Odd numbers of bytes are prohibited by the specification, ensuring 652 * Odd numbers of bytes are prohibited by the specification, ensuring
@@ -783,6 +816,156 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
783 return size; 816 return size;
784} 817}
785 818
819
820/* TCP SMALL QUEUES (TSQ)
821 *
 822 * The TSQ goal is to keep a small number of skbs per TCP flow in the tx queues (qdisc+dev)
823 * to reduce RTT and bufferbloat.
824 * We do this using a special skb destructor (tcp_wfree).
825 *
 826 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event the skb
 827 * needs to be reallocated in a driver.
 828 * The invariant is that skb->truesize has been subtracted from sk->sk_wmem_alloc.
829 *
830 * Since transmit from skb destructor is forbidden, we use a tasklet
831 * to process all sockets that eventually need to send more skbs.
832 * We use one tasklet per cpu, with its own queue of sockets.
833 */
834struct tsq_tasklet {
835 struct tasklet_struct tasklet;
836 struct list_head head; /* queue of tcp sockets */
837};
838static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
839
840static void tcp_tsq_handler(struct sock *sk)
841{
842 if ((1 << sk->sk_state) &
843 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
844 TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
845 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
846}
847/*
 848 * One tasklet per cpu tries to send more skbs.
 849 * We run in tasklet context but need to disable irqs when
 850 * transferring tsq->head because tcp_wfree() might
 851 * interrupt us (non-NAPI drivers).
852 */
853static void tcp_tasklet_func(unsigned long data)
854{
855 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
856 LIST_HEAD(list);
857 unsigned long flags;
858 struct list_head *q, *n;
859 struct tcp_sock *tp;
860 struct sock *sk;
861
862 local_irq_save(flags);
863 list_splice_init(&tsq->head, &list);
864 local_irq_restore(flags);
865
866 list_for_each_safe(q, n, &list) {
867 tp = list_entry(q, struct tcp_sock, tsq_node);
868 list_del(&tp->tsq_node);
869
870 sk = (struct sock *)tp;
871 bh_lock_sock(sk);
872
873 if (!sock_owned_by_user(sk)) {
874 tcp_tsq_handler(sk);
875 } else {
876 /* defer the work to tcp_release_cb() */
877 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
878 }
879 bh_unlock_sock(sk);
880
881 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
882 sk_free(sk);
883 }
884}
885
886#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
887 (1UL << TCP_WRITE_TIMER_DEFERRED) | \
888 (1UL << TCP_DELACK_TIMER_DEFERRED) | \
889 (1UL << TCP_MTU_REDUCED_DEFERRED))
890/**
891 * tcp_release_cb - tcp release_sock() callback
892 * @sk: socket
893 *
894 * called from release_sock() to perform protocol dependent
895 * actions before socket release.
896 */
897void tcp_release_cb(struct sock *sk)
898{
899 struct tcp_sock *tp = tcp_sk(sk);
900 unsigned long flags, nflags;
901
902 /* perform an atomic operation only if at least one flag is set */
903 do {
904 flags = tp->tsq_flags;
905 if (!(flags & TCP_DEFERRED_ALL))
906 return;
907 nflags = flags & ~TCP_DEFERRED_ALL;
908 } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
909
910 if (flags & (1UL << TCP_TSQ_DEFERRED))
911 tcp_tsq_handler(sk);
912
913 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
914 tcp_write_timer_handler(sk);
915
916 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
917 tcp_delack_timer_handler(sk);
918
919 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
920 sk->sk_prot->mtu_reduced(sk);
921}
922EXPORT_SYMBOL(tcp_release_cb);
923
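
The cmpxchg() loop in tcp_release_cb() is the classic claim-and-clear pattern: snapshot the flag word and only act on the bits you actually managed to swap out. A hedged userspace sketch with C11 atomics and made-up work bits:

#include <stdatomic.h>
#include <stdio.h>

#define WORK_A   (1ul << 0)           /* made-up deferred-work bits */
#define WORK_B   (1ul << 1)
#define WORK_ALL (WORK_A | WORK_B)

static atomic_ulong deferred;

/* atomically take ownership of any pending work bits, clearing them */
static unsigned long claim_deferred(void)
{
	unsigned long flags = atomic_load(&deferred);

	while (flags & WORK_ALL) {
		if (atomic_compare_exchange_weak(&deferred, &flags,
						 flags & ~WORK_ALL))
			return flags & WORK_ALL;
		/* on failure, 'flags' was reloaded; retry */
	}
	return 0;
}

int main(void)
{
	atomic_fetch_or(&deferred, WORK_A);
	printf("claimed 0x%lx\n", claim_deferred());  /* 0x1 */
	printf("claimed 0x%lx\n", claim_deferred());  /* 0x0, nothing left */
	return 0;
}
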
924void __init tcp_tasklet_init(void)
925{
926 int i;
927
928 for_each_possible_cpu(i) {
929 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
930
931 INIT_LIST_HEAD(&tsq->head);
932 tasklet_init(&tsq->tasklet,
933 tcp_tasklet_func,
934 (unsigned long)tsq);
935 }
936}
937
938/*
939 * Write buffer destructor automatically called from kfree_skb.
 940 * We can't xmit new skbs from this context, as we might already
 941 * hold the qdisc lock.
942 */
943void tcp_wfree(struct sk_buff *skb)
944{
945 struct sock *sk = skb->sk;
946 struct tcp_sock *tp = tcp_sk(sk);
947
948 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
949 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
950 unsigned long flags;
951 struct tsq_tasklet *tsq;
952
953 /* Keep a ref on socket.
954 * This last ref will be released in tcp_tasklet_func()
955 */
956 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
957
958 /* queue this socket to tasklet queue */
959 local_irq_save(flags);
960 tsq = &__get_cpu_var(tsq_tasklet);
961 list_add(&tp->tsq_node, &tsq->head);
962 tasklet_schedule(&tsq->tasklet);
963 local_irq_restore(flags);
964 } else {
965 sock_wfree(skb);
966 }
967}
968
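
The per-socket limit used by the TSQ throttling check in tcp_write_xmit() is the new sysctl_tcp_limit_output_bytes. A hedged userspace snippet that reads it back, assuming the procfs path net/ipv4/tcp_limit_output_bytes exposed by the accompanying sysctl change:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_limit_output_bytes", "r");
	long limit;

	if (!f) {
		perror("tcp_limit_output_bytes");
		return 1;
	}
	if (fscanf(f, "%ld", &limit) == 1)
		printf("TSQ limit: %ld bytes queued per socket in qdisc/device\n",
		       limit);
	fclose(f);
	return 0;
}
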
786/* This routine actually transmits TCP packets queued in by 969/* This routine actually transmits TCP packets queued in by
787 * tcp_do_sendmsg(). This is used by both the initial 970 * tcp_do_sendmsg(). This is used by both the initial
788 * transmission and possible later retransmissions. 971 * transmission and possible later retransmissions.
@@ -844,7 +1027,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
844 1027
845 skb_push(skb, tcp_header_size); 1028 skb_push(skb, tcp_header_size);
846 skb_reset_transport_header(skb); 1029 skb_reset_transport_header(skb);
847 skb_set_owner_w(skb, sk); 1030
1031 skb_orphan(skb);
1032 skb->sk = sk;
1033 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
1034 tcp_wfree : sock_wfree;
1035 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
848 1036
849 /* Build TCP header and checksum it. */ 1037 /* Build TCP header and checksum it. */
850 th = tcp_hdr(skb); 1038 th = tcp_hdr(skb);
@@ -1780,6 +1968,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1780 while ((skb = tcp_send_head(sk))) { 1968 while ((skb = tcp_send_head(sk))) {
1781 unsigned int limit; 1969 unsigned int limit;
1782 1970
1971
1783 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1972 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1784 BUG_ON(!tso_segs); 1973 BUG_ON(!tso_segs);
1785 1974
@@ -1800,6 +1989,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1800 break; 1989 break;
1801 } 1990 }
1802 1991
 1992		/* TSQ : sk_wmem_alloc accounts for skb truesize,
 1993		 * including skb overhead. But that's OK.
1994 */
1995 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
1996 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1997 break;
1998 }
1803 limit = mss_now; 1999 limit = mss_now;
1804 if (tso_segs > 1 && !tcp_urg_mode(tp)) 2000 if (tso_segs > 1 && !tcp_urg_mode(tp))
1805 limit = tcp_mss_split_point(sk, skb, mss_now, 2001 limit = tcp_mss_split_point(sk, skb, mss_now,
@@ -2442,7 +2638,16 @@ int tcp_send_synack(struct sock *sk)
2442 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2638 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2443} 2639}
2444 2640
2445/* Prepare a SYN-ACK. */ 2641/**
2642 * tcp_make_synack - Prepare a SYN-ACK.
 2643 * @sk: listener socket
 2644 * @dst: dst entry attached to the SYNACK
 2645 * @req: request_sock pointer
 2646 * @rvp: request_values pointer
2647 *
2648 * Allocate one skb and build a SYNACK packet.
 2649 * @dst is consumed: the caller should not use it again.
2650 */
2446struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2651struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2447 struct request_sock *req, 2652 struct request_sock *req,
2448 struct request_values *rvp) 2653 struct request_values *rvp)
@@ -2461,14 +2666,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2461 2666
2462 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2667 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2463 s_data_desired = cvp->s_data_desired; 2668 s_data_desired = cvp->s_data_desired;
2464 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); 2669 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
2465 if (skb == NULL) 2670 if (unlikely(!skb)) {
2671 dst_release(dst);
2466 return NULL; 2672 return NULL;
2467 2673 }
2468 /* Reserve space for headers. */ 2674 /* Reserve space for headers. */
2469 skb_reserve(skb, MAX_TCP_HEADER); 2675 skb_reserve(skb, MAX_TCP_HEADER);
2470 2676
2471 skb_dst_set(skb, dst_clone(dst)); 2677 skb_dst_set(skb, dst);
2472 2678
2473 mss = dst_metric_advmss(dst); 2679 mss = dst_metric_advmss(dst);
2474 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2680 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2645,6 +2851,109 @@ void tcp_connect_init(struct sock *sk)
2645 tcp_clear_retrans(tp); 2851 tcp_clear_retrans(tp);
2646} 2852}
2647 2853
2854static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
2855{
2856 struct tcp_sock *tp = tcp_sk(sk);
2857 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2858
2859 tcb->end_seq += skb->len;
2860 skb_header_release(skb);
2861 __tcp_add_write_queue_tail(sk, skb);
2862 sk->sk_wmem_queued += skb->truesize;
2863 sk_mem_charge(sk, skb->truesize);
2864 tp->write_seq = tcb->end_seq;
2865 tp->packets_out += tcp_skb_pcount(skb);
2866}
2867
2868/* Build and send a SYN with data and (cached) Fast Open cookie. However,
2869 * queue a data-only packet after the regular SYN, such that regular SYNs
2870 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
2871 * only the SYN sequence, the data are retransmitted in the first ACK.
 2872 * If the cookie is not cached or another error occurs, fall back to sending a
 2873 * regular SYN with a Fast Open cookie request option.
2874 */
2875static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
2876{
2877 struct tcp_sock *tp = tcp_sk(sk);
2878 struct tcp_fastopen_request *fo = tp->fastopen_req;
2879 int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
2880 struct sk_buff *syn_data = NULL, *data;
2881 unsigned long last_syn_loss = 0;
2882
2883 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
2884 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
2885 &syn_loss, &last_syn_loss);
2886 /* Recurring FO SYN losses: revert to regular handshake temporarily */
2887 if (syn_loss > 1 &&
2888 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
2889 fo->cookie.len = -1;
2890 goto fallback;
2891 }
2892
2893 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
2894 fo->cookie.len = -1;
2895 else if (fo->cookie.len <= 0)
2896 goto fallback;
2897
2898 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
2899 * user-MSS. Reserve maximum option space for middleboxes that add
2900 * private TCP options. The cost is reduced data space in SYN :(
2901 */
2902 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
2903 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2904 space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
2905 MAX_TCP_OPTION_SPACE;
2906
2907 syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
2908 sk->sk_allocation);
2909 if (syn_data == NULL)
2910 goto fallback;
2911
2912 for (i = 0; i < iovlen && syn_data->len < space; ++i) {
2913 struct iovec *iov = &fo->data->msg_iov[i];
2914 unsigned char __user *from = iov->iov_base;
2915 int len = iov->iov_len;
2916
2917 if (syn_data->len + len > space)
2918 len = space - syn_data->len;
2919 else if (i + 1 == iovlen)
2920 /* No more data pending in inet_wait_for_connect() */
2921 fo->data = NULL;
2922
2923 if (skb_add_data(syn_data, from, len))
2924 goto fallback;
2925 }
2926
2927 /* Queue a data-only packet after the regular SYN for retransmission */
2928 data = pskb_copy(syn_data, sk->sk_allocation);
2929 if (data == NULL)
2930 goto fallback;
2931 TCP_SKB_CB(data)->seq++;
2932 TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
2933 TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
2934 tcp_connect_queue_skb(sk, data);
2935 fo->copied = data->len;
2936
2937 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
2938 tp->syn_data = (fo->copied > 0);
2939 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
2940 goto done;
2941 }
2942 syn_data = NULL;
2943
2944fallback:
2945 /* Send a regular SYN with Fast Open cookie request option */
2946 if (fo->cookie.len > 0)
2947 fo->cookie.len = 0;
2948 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
2949 if (err)
2950 tp->syn_fastopen = 0;
2951 kfree_skb(syn_data);
2952done:
2953 fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
2954 return err;
2955}
2956
2648/* Build a SYN and send it off. */ 2957/* Build a SYN and send it off. */
2649int tcp_connect(struct sock *sk) 2958int tcp_connect(struct sock *sk)
2650{ 2959{
@@ -2662,17 +2971,13 @@ int tcp_connect(struct sock *sk)
2662 skb_reserve(buff, MAX_TCP_HEADER); 2971 skb_reserve(buff, MAX_TCP_HEADER);
2663 2972
2664 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 2973 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2974 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
2975 tcp_connect_queue_skb(sk, buff);
2665 TCP_ECN_send_syn(sk, buff); 2976 TCP_ECN_send_syn(sk, buff);
2666 2977
2667 /* Send it off. */ 2978 /* Send off SYN; include data in Fast Open. */
2668 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2979 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
2669 tp->retrans_stamp = TCP_SKB_CB(buff)->when; 2980 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2670 skb_header_release(buff);
2671 __tcp_add_write_queue_tail(sk, buff);
2672 sk->sk_wmem_queued += buff->truesize;
2673 sk_mem_charge(sk, buff->truesize);
2674 tp->packets_out += tcp_skb_pcount(buff);
2675 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2676 if (err == -ECONNREFUSED) 2981 if (err == -ECONNREFUSED)
2677 return err; 2982 return err;
2678 2983
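
From userspace, the client-side Fast Open path added above is reached by passing the first chunk of data to sendto()/sendmsg() with MSG_FASTOPEN on an unconnected TCP socket. A hedged example; the address, port, and payload are illustrative, and the kernel must have client TFO enabled via net.ipv4.tcp_fastopen:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int main(void)
{
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),
	};
	int fd;

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);   /* illustrative peer */
	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	/* The SYN carries the request when a cookie is cached; otherwise the
	 * kernel falls back to a plain SYN with a cookie request option. */
	if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto(MSG_FASTOPEN)");

	close(fd);
	return 0;
}
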
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e911e6c523ec..6df36ad55a38 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
32int sysctl_tcp_orphan_retries __read_mostly; 32int sysctl_tcp_orphan_retries __read_mostly;
33int sysctl_tcp_thin_linear_timeouts __read_mostly; 33int sysctl_tcp_thin_linear_timeouts __read_mostly;
34 34
35static void tcp_write_timer(unsigned long);
36static void tcp_delack_timer(unsigned long);
37static void tcp_keepalive_timer (unsigned long data);
38
39void tcp_init_xmit_timers(struct sock *sk)
40{
41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
42 &tcp_keepalive_timer);
43}
44EXPORT_SYMBOL(tcp_init_xmit_timers);
45
46static void tcp_write_err(struct sock *sk) 35static void tcp_write_err(struct sock *sk)
47{ 36{
48 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; 37 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -205,21 +194,11 @@ static int tcp_write_timeout(struct sock *sk)
205 return 0; 194 return 0;
206} 195}
207 196
208static void tcp_delack_timer(unsigned long data) 197void tcp_delack_timer_handler(struct sock *sk)
209{ 198{
210 struct sock *sk = (struct sock *)data;
211 struct tcp_sock *tp = tcp_sk(sk); 199 struct tcp_sock *tp = tcp_sk(sk);
212 struct inet_connection_sock *icsk = inet_csk(sk); 200 struct inet_connection_sock *icsk = inet_csk(sk);
213 201
214 bh_lock_sock(sk);
215 if (sock_owned_by_user(sk)) {
216 /* Try again later. */
217 icsk->icsk_ack.blocked = 1;
218 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
219 sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
220 goto out_unlock;
221 }
222
223 sk_mem_reclaim_partial(sk); 202 sk_mem_reclaim_partial(sk);
224 203
225 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) 204 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -260,7 +239,21 @@ static void tcp_delack_timer(unsigned long data)
260out: 239out:
261 if (sk_under_memory_pressure(sk)) 240 if (sk_under_memory_pressure(sk))
262 sk_mem_reclaim(sk); 241 sk_mem_reclaim(sk);
263out_unlock: 242}
243
244static void tcp_delack_timer(unsigned long data)
245{
246 struct sock *sk = (struct sock *)data;
247
248 bh_lock_sock(sk);
249 if (!sock_owned_by_user(sk)) {
250 tcp_delack_timer_handler(sk);
251 } else {
252 inet_csk(sk)->icsk_ack.blocked = 1;
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 254 /* delegate our work to tcp_release_cb() */
255 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
256 }
264 bh_unlock_sock(sk); 257 bh_unlock_sock(sk);
265 sock_put(sk); 258 sock_put(sk);
266} 259}
@@ -450,19 +443,11 @@ out_reset_timer:
450out:; 443out:;
451} 444}
452 445
453static void tcp_write_timer(unsigned long data) 446void tcp_write_timer_handler(struct sock *sk)
454{ 447{
455 struct sock *sk = (struct sock *)data;
456 struct inet_connection_sock *icsk = inet_csk(sk); 448 struct inet_connection_sock *icsk = inet_csk(sk);
457 int event; 449 int event;
458 450
459 bh_lock_sock(sk);
460 if (sock_owned_by_user(sk)) {
461 /* Try again later */
462 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
463 goto out_unlock;
464 }
465
466 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) 451 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
467 goto out; 452 goto out;
468 453
@@ -485,7 +470,19 @@ static void tcp_write_timer(unsigned long data)
485 470
486out: 471out:
487 sk_mem_reclaim(sk); 472 sk_mem_reclaim(sk);
488out_unlock: 473}
474
475static void tcp_write_timer(unsigned long data)
476{
477 struct sock *sk = (struct sock *)data;
478
479 bh_lock_sock(sk);
480 if (!sock_owned_by_user(sk)) {
481 tcp_write_timer_handler(sk);
482 } else {
 483 /* delegate our work to tcp_release_cb() */
484 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
485 }
489 bh_unlock_sock(sk); 486 bh_unlock_sock(sk);
490 sock_put(sk); 487 sock_put(sk);
491} 488}
@@ -602,3 +599,10 @@ out:
602 bh_unlock_sock(sk); 599 bh_unlock_sock(sk);
603 sock_put(sk); 600 sock_put(sk);
604} 601}
602
603void tcp_init_xmit_timers(struct sock *sk)
604{
605 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
606 &tcp_keepalive_timer);
607}
608EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eaca73644e79..b4c3582a991f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -108,6 +108,7 @@
108#include <net/xfrm.h> 108#include <net/xfrm.h>
109#include <trace/events/udp.h> 109#include <trace/events/udp.h>
110#include <linux/static_key.h> 110#include <linux/static_key.h>
111#include <trace/events/skb.h>
111#include "udp_impl.h" 112#include "udp_impl.h"
112 113
113struct udp_table udp_table __read_mostly; 114struct udp_table udp_table __read_mostly;
@@ -615,6 +616,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
615 break; 616 break;
616 case ICMP_DEST_UNREACH: 617 case ICMP_DEST_UNREACH:
617 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 618 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
619 ipv4_sk_update_pmtu(skb, sk, info);
618 if (inet->pmtudisc != IP_PMTUDISC_DONT) { 620 if (inet->pmtudisc != IP_PMTUDISC_DONT) {
619 err = EMSGSIZE; 621 err = EMSGSIZE;
620 harderr = 1; 622 harderr = 1;
@@ -628,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
628 err = icmp_err_convert[code].errno; 630 err = icmp_err_convert[code].errno;
629 } 631 }
630 break; 632 break;
633 case ICMP_REDIRECT:
634 ipv4_sk_redirect(skb, sk);
635 break;
631 } 636 }
632 637
633 /* 638 /*
@@ -1219,8 +1224,10 @@ try_again:
1219 goto csum_copy_err; 1224 goto csum_copy_err;
1220 } 1225 }
1221 1226
1222 if (err) 1227 if (unlikely(err)) {
1228 trace_kfree_skb(skb, udp_recvmsg);
1223 goto out_free; 1229 goto out_free;
1230 }
1224 1231
1225 if (!peeked) 1232 if (!peeked)
1226 UDP_INC_STATS_USER(sock_net(sk), 1233 UDP_INC_STATS_USER(sock_net(sk),
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index a7f86a3cd502..16d0960062be 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
34 int err = -EINVAL; 34 int err = -EINVAL;
35 struct sock *sk; 35 struct sock *sk;
36 struct sk_buff *rep; 36 struct sk_buff *rep;
37 struct net *net = sock_net(in_skb->sk);
37 38
38 if (req->sdiag_family == AF_INET) 39 if (req->sdiag_family == AF_INET)
39 sk = __udp4_lib_lookup(&init_net, 40 sk = __udp4_lib_lookup(net,
40 req->id.idiag_src[0], req->id.idiag_sport, 41 req->id.idiag_src[0], req->id.idiag_sport,
41 req->id.idiag_dst[0], req->id.idiag_dport, 42 req->id.idiag_dst[0], req->id.idiag_dport,
42 req->id.idiag_if, tbl); 43 req->id.idiag_if, tbl);
43#if IS_ENABLED(CONFIG_IPV6) 44#if IS_ENABLED(CONFIG_IPV6)
44 else if (req->sdiag_family == AF_INET6) 45 else if (req->sdiag_family == AF_INET6)
45 sk = __udp6_lib_lookup(&init_net, 46 sk = __udp6_lib_lookup(net,
46 (struct in6_addr *)req->id.idiag_src, 47 (struct in6_addr *)req->id.idiag_src,
47 req->id.idiag_sport, 48 req->id.idiag_sport,
48 (struct in6_addr *)req->id.idiag_dst, 49 (struct in6_addr *)req->id.idiag_dst,
@@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
75 kfree_skb(rep); 76 kfree_skb(rep);
76 goto out; 77 goto out;
77 } 78 }
78 err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, 79 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
79 MSG_DONTWAIT); 80 MSG_DONTWAIT);
80 if (err > 0) 81 if (err > 0)
81 err = 0; 82 err = 0;
@@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
90 struct inet_diag_req_v2 *r, struct nlattr *bc) 91 struct inet_diag_req_v2 *r, struct nlattr *bc)
91{ 92{
92 int num, s_num, slot, s_slot; 93 int num, s_num, slot, s_slot;
94 struct net *net = sock_net(skb->sk);
93 95
94 s_slot = cb->args[0]; 96 s_slot = cb->args[0];
95 num = s_num = cb->args[1]; 97 num = s_num = cb->args[1];
@@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
106 sk_nulls_for_each(sk, node, &hslot->head) { 108 sk_nulls_for_each(sk, node, &hslot->head) {
107 struct inet_sock *inet = inet_sk(sk); 109 struct inet_sock *inet = inet_sk(sk);
108 110
111 if (!net_eq(sock_net(sk), net))
112 continue;
109 if (num < s_num) 113 if (num < s_num)
110 goto next; 114 goto next;
111 if (!(r->idiag_states & (1 << sk->sk_state))) 115 if (!(r->idiag_states & (1 << sk->sk_state)))
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216dc..58d23a572509 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
27 if (skb_dst(skb) == NULL) { 27 if (skb_dst(skb) == NULL) {
28 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
29 29
30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr, 30 if (ip_route_input(skb, iph->daddr, iph->saddr,
31 iph->tos, skb->dev)) 31 iph->tos, skb->dev))
32 goto drop; 32 goto drop;
33 } 33 }
34 return dst_input(skb); 34 return dst_input(skb);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ed4bf11ef9f4..ddee0a099a2c 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,6 +15,65 @@
15#include <net/ip.h> 15#include <net/ip.h>
16#include <net/xfrm.h> 16#include <net/xfrm.h>
17 17
18/* Informational hook. The decap is still done here. */
19static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
20static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
21
22int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
23{
24 struct xfrm_tunnel __rcu **pprev;
25 struct xfrm_tunnel *t;
26 int ret = -EEXIST;
27 int priority = handler->priority;
28
29 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
30
31 for (pprev = &rcv_notify_handlers;
32 (t = rcu_dereference_protected(*pprev,
33 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
34 pprev = &t->next) {
35 if (t->priority > priority)
36 break;
37 if (t->priority == priority)
38 goto err;
39
40 }
41
42 handler->next = *pprev;
43 rcu_assign_pointer(*pprev, handler);
44
45 ret = 0;
46
47err:
48 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
49 return ret;
50}
51EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
52
53int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
54{
55 struct xfrm_tunnel __rcu **pprev;
56 struct xfrm_tunnel *t;
57 int ret = -ENOENT;
58
59 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
60 for (pprev = &rcv_notify_handlers;
61 (t = rcu_dereference_protected(*pprev,
62 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
63 pprev = &t->next) {
64 if (t == handler) {
65 *pprev = handler->next;
66 ret = 0;
67 break;
68 }
69 }
70 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
71 synchronize_net();
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
76
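
xfrm4_mode_tunnel_input_register() above keeps rcv_notify_handlers sorted by priority and rejects duplicate priorities. A minimal userspace sketch of that insert-by-priority walk, with the RCU and mutex details stripped out:

#include <stdio.h>
#include <stddef.h>
#include <errno.h>

struct handler {
	struct handler *next;
	int priority;
};

static struct handler *handlers;

static int handler_register(struct handler *h)
{
	struct handler **pprev, *t;

	for (pprev = &handlers; (t = *pprev) != NULL; pprev = &t->next) {
		if (t->priority > h->priority)
			break;                 /* insert before higher priority */
		if (t->priority == h->priority)
			return -EEXIST;        /* duplicate priority rejected */
	}
	h->next = *pprev;
	*pprev = h;
	return 0;
}

int main(void)
{
	struct handler a = { .priority = 10 }, b = { .priority = 5 };

	handler_register(&a);
	handler_register(&b);
	for (struct handler *t = handlers; t; t = t->next)
		printf("priority %d\n", t->priority);   /* 5 then 10 */
	return 0;
}
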
18static inline void ipip_ecn_decapsulate(struct sk_buff *skb) 77static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
19{ 78{
20 struct iphdr *inner_iph = ipip_hdr(skb); 79 struct iphdr *inner_iph = ipip_hdr(skb);
@@ -64,8 +123,14 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
64 return 0; 123 return 0;
65} 124}
66 125
126#define for_each_input_rcu(head, handler) \
127 for (handler = rcu_dereference(head); \
128 handler != NULL; \
129 handler = rcu_dereference(handler->next))
130
67static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) 131static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
68{ 132{
133 struct xfrm_tunnel *handler;
69 int err = -EINVAL; 134 int err = -EINVAL;
70 135
71 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) 136 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -74,6 +139,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
74 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 139 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
75 goto out; 140 goto out;
76 141
142 for_each_input_rcu(rcv_notify_handlers, handler)
143 handler->handler(skb);
144
77 if (skb_cloned(skb) && 145 if (skb_cloned(skb) &&
78 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 146 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
79 goto out; 147 goto out;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 0d3426cb5c4f..c6281847f16a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,30 +79,19 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
79 struct rtable *rt = (struct rtable *)xdst->route; 79 struct rtable *rt = (struct rtable *)xdst->route;
80 const struct flowi4 *fl4 = &fl->u.ip4; 80 const struct flowi4 *fl4 = &fl->u.ip4;
81 81
82 xdst->u.rt.rt_key_dst = fl4->daddr;
83 xdst->u.rt.rt_key_src = fl4->saddr;
84 xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
85 xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
86 xdst->u.rt.rt_iif = fl4->flowi4_iif; 82 xdst->u.rt.rt_iif = fl4->flowi4_iif;
87 xdst->u.rt.rt_oif = fl4->flowi4_oif;
88 xdst->u.rt.rt_mark = fl4->flowi4_mark;
89 83
90 xdst->u.dst.dev = dev; 84 xdst->u.dst.dev = dev;
91 dev_hold(dev); 85 dev_hold(dev);
92 86
93 xdst->u.rt.peer = rt->peer;
94 if (rt->peer)
95 atomic_inc(&rt->peer->refcnt);
96
97 /* Sheit... I remember I did this right. Apparently, 87 /* Sheit... I remember I did this right. Apparently,
98 * it was magically lost, so this code needs audit */ 88 * it was magically lost, so this code needs audit */
89 xdst->u.rt.rt_is_input = rt->rt_is_input;
99 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 90 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
100 RTCF_LOCAL); 91 RTCF_LOCAL);
101 xdst->u.rt.rt_type = rt->rt_type; 92 xdst->u.rt.rt_type = rt->rt_type;
102 xdst->u.rt.rt_src = rt->rt_src;
103 xdst->u.rt.rt_dst = rt->rt_dst;
104 xdst->u.rt.rt_gateway = rt->rt_gateway; 93 xdst->u.rt.rt_gateway = rt->rt_gateway;
105 xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; 94 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
106 95
107 return 0; 96 return 0;
108} 97}
@@ -198,12 +187,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
198 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); 187 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
199} 188}
200 189
201static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 190static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
191 struct sk_buff *skb, u32 mtu)
192{
193 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
194 struct dst_entry *path = xdst->route;
195
196 path->ops->update_pmtu(path, sk, skb, mtu);
197}
198
199static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
200 struct sk_buff *skb)
202{ 201{
203 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 202 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
204 struct dst_entry *path = xdst->route; 203 struct dst_entry *path = xdst->route;
205 204
206 path->ops->update_pmtu(path, mtu); 205 path->ops->redirect(path, sk, skb);
207} 206}
208 207
209static void xfrm4_dst_destroy(struct dst_entry *dst) 208static void xfrm4_dst_destroy(struct dst_entry *dst)
@@ -212,9 +211,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
212 211
213 dst_destroy_metrics_generic(dst); 212 dst_destroy_metrics_generic(dst);
214 213
215 if (likely(xdst->u.rt.peer))
216 inet_putpeer(xdst->u.rt.peer);
217
218 xfrm_dst_destroy(xdst); 214 xfrm_dst_destroy(xdst);
219} 215}
220 216
@@ -232,6 +228,7 @@ static struct dst_ops xfrm4_dst_ops = {
232 .protocol = cpu_to_be16(ETH_P_IP), 228 .protocol = cpu_to_be16(ETH_P_IP),
233 .gc = xfrm4_garbage_collect, 229 .gc = xfrm4_garbage_collect,
234 .update_pmtu = xfrm4_update_pmtu, 230 .update_pmtu = xfrm4_update_pmtu,
231 .redirect = xfrm4_redirect,
235 .cow_metrics = dst_cow_metrics_generic, 232 .cow_metrics = dst_cow_metrics_generic,
236 .destroy = xfrm4_dst_destroy, 233 .destroy = xfrm4_dst_destroy,
237 .ifdown = xfrm4_dst_ifdown, 234 .ifdown = xfrm4_dst_ifdown,