Diffstat (limited to 'net/ipv4'): 59 files changed, 4048 insertions, 2570 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 20f1cb5c8aba..5a19aeb86094 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -310,6 +310,17 @@ config SYN_COOKIES
310 | 310 | ||
311 | If unsure, say N. | 311 | If unsure, say N. |
312 | 312 | ||
313 | config NET_IPVTI | ||
314 | tristate "Virtual (secure) IP: tunneling" | ||
315 | select INET_TUNNEL | ||
316 | depends on INET_XFRM_MODE_TUNNEL | ||
317 | ---help--- | ||
318 | Tunneling means encapsulating data of one protocol type within | ||
319 | another protocol and sending it over a channel that understands the | ||
320 | encapsulating protocol. This can be used with xfrm mode tunnel to give | ||
321 | the notion of a secure tunnel for IPSEC and then use routing protocol | ||
322 | on top. | ||
323 | |||
313 | config INET_AH | 324 | config INET_AH |
314 | tristate "IP: AH transformation" | 325 | tristate "IP: AH transformation" |
315 | select XFRM_ALGO | 326 | select XFRM_ALGO |
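
The NET_IPVTI help text above describes tunneling only in general terms. As a rough illustration of what "encapsulating data of one protocol type within another" means for an IP-in-IP style tunnel, here is a minimal, hypothetical userspace sketch; it is not the ip_vti.c implementation, it omits checksumming and fragmentation, and it assumes the output buffer is large enough for header plus payload.

#include <arpa/inet.h>
#include <netinet/ip.h>
#include <stdint.h>
#include <string.h>

/* Prepend an outer IPv4 header in front of an already-built inner IPv4
 * packet.  saddr/daddr are the tunnel endpoint addresses in network byte
 * order.  The far endpoint strips the outer header and processes the inner
 * packet as if it had arrived natively. */
static size_t encapsulate_ipip(void *outbuf, const void *inner, size_t inner_len,
			       uint32_t saddr, uint32_t daddr)
{
	struct iphdr *outer = outbuf;

	memset(outer, 0, sizeof(*outer));
	outer->version  = 4;
	outer->ihl      = 5;			/* 20-byte header, no options */
	outer->ttl      = 64;
	outer->protocol = IPPROTO_IPIP;		/* payload is itself an IPv4 packet */
	outer->saddr    = saddr;
	outer->daddr    = daddr;
	outer->tot_len  = htons((uint16_t)(sizeof(*outer) + inner_len));
	memcpy(outer + 1, inner, inner_len);	/* inner packet becomes the payload */
	return sizeof(*outer) + inner_len;
}

With xfrm mode tunnel, the same idea applies except that the encapsulated packet is also run through an IPsec transform before it leaves the box, which is what the new vti device builds on.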
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..ae2ccf2890e4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
7 | ip_output.o ip_sockglue.o inet_hashtables.o \ | 7 | ip_output.o ip_sockglue.o inet_hashtables.o \ |
8 | inet_timewait_sock.o inet_connection_sock.o \ | 8 | inet_timewait_sock.o inet_connection_sock.o \ |
9 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ | 9 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ |
10 | tcp_minisocks.o tcp_cong.o \ | 10 | tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ |
11 | datagram.o raw.o udp.o udplite.o \ | 11 | datagram.o raw.o udp.o udplite.o \ |
12 | arp.o icmp.o devinet.o af_inet.o igmp.o \ | 12 | arp.o icmp.o devinet.o af_inet.o igmp.o \ |
13 | fib_frontend.o fib_semantics.o fib_trie.o \ | 13 | fib_frontend.o fib_semantics.o fib_trie.o \ |
@@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o | |||
20 | obj-$(CONFIG_NET_IPIP) += ipip.o | 20 | obj-$(CONFIG_NET_IPIP) += ipip.o |
21 | obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o | 21 | obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o |
22 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o | 22 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o |
23 | obj-$(CONFIG_NET_IPVTI) += ip_vti.o | ||
23 | obj-$(CONFIG_SYN_COOKIES) += syncookies.o | 24 | obj-$(CONFIG_SYN_COOKIES) += syncookies.o |
24 | obj-$(CONFIG_INET_AH) += ah4.o | 25 | obj-$(CONFIG_INET_AH) += ah4.o |
25 | obj-$(CONFIG_INET_ESP) += esp4.o | 26 | obj-$(CONFIG_INET_ESP) += esp4.o |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c8f7aee587d1..fe4582ca969a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
157 | 157 | ||
158 | kfree(rcu_dereference_protected(inet->inet_opt, 1)); | 158 | kfree(rcu_dereference_protected(inet->inet_opt, 1)); |
159 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); | 159 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); |
160 | dst_release(sk->sk_rx_dst); | ||
160 | sk_refcnt_debug_dec(sk); | 161 | sk_refcnt_debug_dec(sk); |
161 | } | 162 | } |
162 | EXPORT_SYMBOL(inet_sock_destruct); | 163 | EXPORT_SYMBOL(inet_sock_destruct); |
@@ -242,20 +243,18 @@ void build_ehash_secret(void) | |||
242 | } | 243 | } |
243 | EXPORT_SYMBOL(build_ehash_secret); | 244 | EXPORT_SYMBOL(build_ehash_secret); |
244 | 245 | ||
245 | static inline int inet_netns_ok(struct net *net, int protocol) | 246 | static inline int inet_netns_ok(struct net *net, __u8 protocol) |
246 | { | 247 | { |
247 | int hash; | ||
248 | const struct net_protocol *ipprot; | 248 | const struct net_protocol *ipprot; |
249 | 249 | ||
250 | if (net_eq(net, &init_net)) | 250 | if (net_eq(net, &init_net)) |
251 | return 1; | 251 | return 1; |
252 | 252 | ||
253 | hash = protocol & (MAX_INET_PROTOS - 1); | 253 | ipprot = rcu_dereference(inet_protos[protocol]); |
254 | ipprot = rcu_dereference(inet_protos[hash]); | 254 | if (ipprot == NULL) { |
255 | |||
256 | if (ipprot == NULL) | ||
257 | /* raw IP is OK */ | 255 | /* raw IP is OK */ |
258 | return 1; | 256 | return 1; |
257 | } | ||
259 | return ipprot->netns_ok; | 258 | return ipprot->netns_ok; |
260 | } | 259 | } |
261 | 260 | ||
@@ -553,15 +552,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, | |||
553 | 552 | ||
554 | if (!inet_sk(sk)->inet_num && inet_autobind(sk)) | 553 | if (!inet_sk(sk)->inet_num && inet_autobind(sk)) |
555 | return -EAGAIN; | 554 | return -EAGAIN; |
556 | return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); | 555 | return sk->sk_prot->connect(sk, uaddr, addr_len); |
557 | } | 556 | } |
558 | EXPORT_SYMBOL(inet_dgram_connect); | 557 | EXPORT_SYMBOL(inet_dgram_connect); |
559 | 558 | ||
560 | static long inet_wait_for_connect(struct sock *sk, long timeo) | 559 | static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) |
561 | { | 560 | { |
562 | DEFINE_WAIT(wait); | 561 | DEFINE_WAIT(wait); |
563 | 562 | ||
564 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); | 563 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); |
564 | sk->sk_write_pending += writebias; | ||
565 | 565 | ||
566 | /* Basic assumption: if someone sets sk->sk_err, he _must_ | 566 | /* Basic assumption: if someone sets sk->sk_err, he _must_ |
567 | * change state of the socket from TCP_SYN_*. | 567 | * change state of the socket from TCP_SYN_*. |
@@ -577,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) | |||
577 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); | 577 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); |
578 | } | 578 | } |
579 | finish_wait(sk_sleep(sk), &wait); | 579 | finish_wait(sk_sleep(sk), &wait); |
580 | sk->sk_write_pending -= writebias; | ||
580 | return timeo; | 581 | return timeo; |
581 | } | 582 | } |
582 | 583 | ||
@@ -584,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) | |||
584 | * Connect to a remote host. There is regrettably still a little | 585 | * Connect to a remote host. There is regrettably still a little |
585 | * TCP 'magic' in here. | 586 | * TCP 'magic' in here. |
586 | */ | 587 | */ |
587 | int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | 588 | int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, |
588 | int addr_len, int flags) | 589 | int addr_len, int flags) |
589 | { | 590 | { |
590 | struct sock *sk = sock->sk; | 591 | struct sock *sk = sock->sk; |
591 | int err; | 592 | int err; |
@@ -594,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | |||
594 | if (addr_len < sizeof(uaddr->sa_family)) | 595 | if (addr_len < sizeof(uaddr->sa_family)) |
595 | return -EINVAL; | 596 | return -EINVAL; |
596 | 597 | ||
597 | lock_sock(sk); | ||
598 | |||
599 | if (uaddr->sa_family == AF_UNSPEC) { | 598 | if (uaddr->sa_family == AF_UNSPEC) { |
600 | err = sk->sk_prot->disconnect(sk, flags); | 599 | err = sk->sk_prot->disconnect(sk, flags); |
601 | sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; | 600 | sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; |
@@ -635,8 +634,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | |||
635 | timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); | 634 | timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); |
636 | 635 | ||
637 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 636 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
637 | int writebias = (sk->sk_protocol == IPPROTO_TCP) && | ||
638 | tcp_sk(sk)->fastopen_req && | ||
639 | tcp_sk(sk)->fastopen_req->data ? 1 : 0; | ||
640 | |||
638 | /* Error code is set above */ | 641 | /* Error code is set above */ |
639 | if (!timeo || !inet_wait_for_connect(sk, timeo)) | 642 | if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) |
640 | goto out; | 643 | goto out; |
641 | 644 | ||
642 | err = sock_intr_errno(timeo); | 645 | err = sock_intr_errno(timeo); |
@@ -658,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | |||
658 | sock->state = SS_CONNECTED; | 661 | sock->state = SS_CONNECTED; |
659 | err = 0; | 662 | err = 0; |
660 | out: | 663 | out: |
661 | release_sock(sk); | ||
662 | return err; | 664 | return err; |
663 | 665 | ||
664 | sock_error: | 666 | sock_error: |
@@ -668,6 +670,18 @@ sock_error: | |||
668 | sock->state = SS_DISCONNECTING; | 670 | sock->state = SS_DISCONNECTING; |
669 | goto out; | 671 | goto out; |
670 | } | 672 | } |
673 | EXPORT_SYMBOL(__inet_stream_connect); | ||
674 | |||
675 | int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | ||
676 | int addr_len, int flags) | ||
677 | { | ||
678 | int err; | ||
679 | |||
680 | lock_sock(sock->sk); | ||
681 | err = __inet_stream_connect(sock, uaddr, addr_len, flags); | ||
682 | release_sock(sock->sk); | ||
683 | return err; | ||
684 | } | ||
671 | EXPORT_SYMBOL(inet_stream_connect); | 685 | EXPORT_SYMBOL(inet_stream_connect); |
672 | 686 | ||
673 | /* | 687 | /* |
@@ -1216,8 +1230,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); | |||
1216 | 1230 | ||
1217 | static int inet_gso_send_check(struct sk_buff *skb) | 1231 | static int inet_gso_send_check(struct sk_buff *skb) |
1218 | { | 1232 | { |
1219 | const struct iphdr *iph; | ||
1220 | const struct net_protocol *ops; | 1233 | const struct net_protocol *ops; |
1234 | const struct iphdr *iph; | ||
1221 | int proto; | 1235 | int proto; |
1222 | int ihl; | 1236 | int ihl; |
1223 | int err = -EINVAL; | 1237 | int err = -EINVAL; |
@@ -1236,7 +1250,7 @@ static int inet_gso_send_check(struct sk_buff *skb) | |||
1236 | __skb_pull(skb, ihl); | 1250 | __skb_pull(skb, ihl); |
1237 | skb_reset_transport_header(skb); | 1251 | skb_reset_transport_header(skb); |
1238 | iph = ip_hdr(skb); | 1252 | iph = ip_hdr(skb); |
1239 | proto = iph->protocol & (MAX_INET_PROTOS - 1); | 1253 | proto = iph->protocol; |
1240 | err = -EPROTONOSUPPORT; | 1254 | err = -EPROTONOSUPPORT; |
1241 | 1255 | ||
1242 | rcu_read_lock(); | 1256 | rcu_read_lock(); |
@@ -1253,8 +1267,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, | |||
1253 | netdev_features_t features) | 1267 | netdev_features_t features) |
1254 | { | 1268 | { |
1255 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 1269 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
1256 | struct iphdr *iph; | ||
1257 | const struct net_protocol *ops; | 1270 | const struct net_protocol *ops; |
1271 | struct iphdr *iph; | ||
1258 | int proto; | 1272 | int proto; |
1259 | int ihl; | 1273 | int ihl; |
1260 | int id; | 1274 | int id; |
@@ -1286,7 +1300,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, | |||
1286 | skb_reset_transport_header(skb); | 1300 | skb_reset_transport_header(skb); |
1287 | iph = ip_hdr(skb); | 1301 | iph = ip_hdr(skb); |
1288 | id = ntohs(iph->id); | 1302 | id = ntohs(iph->id); |
1289 | proto = iph->protocol & (MAX_INET_PROTOS - 1); | 1303 | proto = iph->protocol; |
1290 | segs = ERR_PTR(-EPROTONOSUPPORT); | 1304 | segs = ERR_PTR(-EPROTONOSUPPORT); |
1291 | 1305 | ||
1292 | rcu_read_lock(); | 1306 | rcu_read_lock(); |
@@ -1340,7 +1354,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, | |||
1340 | goto out; | 1354 | goto out; |
1341 | } | 1355 | } |
1342 | 1356 | ||
1343 | proto = iph->protocol & (MAX_INET_PROTOS - 1); | 1357 | proto = iph->protocol; |
1344 | 1358 | ||
1345 | rcu_read_lock(); | 1359 | rcu_read_lock(); |
1346 | ops = rcu_dereference(inet_protos[proto]); | 1360 | ops = rcu_dereference(inet_protos[proto]); |
@@ -1398,11 +1412,11 @@ out: | |||
1398 | 1412 | ||
1399 | static int inet_gro_complete(struct sk_buff *skb) | 1413 | static int inet_gro_complete(struct sk_buff *skb) |
1400 | { | 1414 | { |
1401 | const struct net_protocol *ops; | 1415 | __be16 newlen = htons(skb->len - skb_network_offset(skb)); |
1402 | struct iphdr *iph = ip_hdr(skb); | 1416 | struct iphdr *iph = ip_hdr(skb); |
1403 | int proto = iph->protocol & (MAX_INET_PROTOS - 1); | 1417 | const struct net_protocol *ops; |
1418 | int proto = iph->protocol; | ||
1404 | int err = -ENOSYS; | 1419 | int err = -ENOSYS; |
1405 | __be16 newlen = htons(skb->len - skb_network_offset(skb)); | ||
1406 | 1420 | ||
1407 | csum_replace2(&iph->check, iph->tot_len, newlen); | 1421 | csum_replace2(&iph->check, iph->tot_len, newlen); |
1408 | iph->tot_len = newlen; | 1422 | iph->tot_len = newlen; |
@@ -1520,14 +1534,15 @@ static const struct net_protocol igmp_protocol = { | |||
1520 | #endif | 1534 | #endif |
1521 | 1535 | ||
1522 | static const struct net_protocol tcp_protocol = { | 1536 | static const struct net_protocol tcp_protocol = { |
1523 | .handler = tcp_v4_rcv, | 1537 | .early_demux = tcp_v4_early_demux, |
1524 | .err_handler = tcp_v4_err, | 1538 | .handler = tcp_v4_rcv, |
1525 | .gso_send_check = tcp_v4_gso_send_check, | 1539 | .err_handler = tcp_v4_err, |
1526 | .gso_segment = tcp_tso_segment, | 1540 | .gso_send_check = tcp_v4_gso_send_check, |
1527 | .gro_receive = tcp4_gro_receive, | 1541 | .gso_segment = tcp_tso_segment, |
1528 | .gro_complete = tcp4_gro_complete, | 1542 | .gro_receive = tcp4_gro_receive, |
1529 | .no_policy = 1, | 1543 | .gro_complete = tcp4_gro_complete, |
1530 | .netns_ok = 1, | 1544 | .no_policy = 1, |
1545 | .netns_ok = 1, | ||
1531 | }; | 1546 | }; |
1532 | 1547 | ||
1533 | static const struct net_protocol udp_protocol = { | 1548 | static const struct net_protocol udp_protocol = { |
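
Several hunks above drop the "& (MAX_INET_PROTOS - 1)" masking because inet_protos[] now covers the full 8-bit protocol space and the protocol number indexes it directly. The tcp_protocol entry also gains an early_demux hook, which pairs with the new dst_release(sk->sk_rx_dst) in inet_sock_destruct: an established socket can cache its input route and must drop it when destroyed. Below is a minimal sketch of a dispatcher built on the direct-indexed array (hypothetical function; the real local-delivery dispatch lives in ip_input.c).

static void toy_protocol_deliver(struct sk_buff *skb)
{
	const struct net_protocol *ipprot;
	int protocol = ip_hdr(skb)->protocol;	/* 0..255, no hash/mask step */

	rcu_read_lock();
	ipprot = rcu_dereference(inet_protos[protocol]);
	if (ipprot != NULL)
		ipprot->handler(skb);		/* e.g. tcp_v4_rcv or udp_rcv */
	else
		kfree_skb(skb);			/* no handler registered for this protocol */
	rcu_read_unlock();
}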
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e8f2617ecd47..a0d8392491c3 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -398,16 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
398 | struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); | 398 | struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); |
399 | struct xfrm_state *x; | 399 | struct xfrm_state *x; |
400 | 400 | ||
401 | if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || | 401 | switch (icmp_hdr(skb)->type) { |
402 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | 402 | case ICMP_DEST_UNREACH: |
403 | if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | ||
404 | return; | ||
405 | case ICMP_REDIRECT: | ||
406 | break; | ||
407 | default: | ||
403 | return; | 408 | return; |
409 | } | ||
404 | 410 | ||
405 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, | 411 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
406 | ah->spi, IPPROTO_AH, AF_INET); | 412 | ah->spi, IPPROTO_AH, AF_INET); |
407 | if (!x) | 413 | if (!x) |
408 | return; | 414 | return; |
409 | pr_debug("pmtu discovery on SA AH/%08x/%08x\n", | 415 | |
410 | ntohl(ah->spi), ntohl(iph->daddr)); | 416 | if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) |
417 | ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); | ||
418 | else | ||
419 | ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); | ||
411 | xfrm_state_put(x); | 420 | xfrm_state_put(x); |
412 | } | 421 | } |
413 | 422 | ||
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index cda37be02f8d..a0124eb7dbea 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
475 | return 1; | 475 | return 1; |
476 | } | 476 | } |
477 | 477 | ||
478 | paddr = skb_rtable(skb)->rt_gateway; | 478 | paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); |
479 | |||
480 | if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, | 479 | if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, |
481 | paddr, dev)) | 480 | paddr, dev)) |
482 | return 0; | 481 | return 0; |
@@ -790,7 +789,8 @@ static int arp_process(struct sk_buff *skb) | |||
790 | * Check for bad requests for 127.x.x.x and requests for multicast | 789 | * Check for bad requests for 127.x.x.x and requests for multicast |
791 | * addresses. If this is one such, delete it. | 790 | * addresses. If this is one such, delete it. |
792 | */ | 791 | */ |
793 | if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) | 792 | if (ipv4_is_multicast(tip) || |
793 | (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) | ||
794 | goto out; | 794 | goto out; |
795 | 795 | ||
796 | /* | 796 | /* |
@@ -827,7 +827,7 @@ static int arp_process(struct sk_buff *skb) | |||
827 | } | 827 | } |
828 | 828 | ||
829 | if (arp->ar_op == htons(ARPOP_REQUEST) && | 829 | if (arp->ar_op == htons(ARPOP_REQUEST) && |
830 | ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { | 830 | ip_route_input(skb, tip, sip, 0, dev) == 0) { |
831 | 831 | ||
832 | rt = skb_rtable(skb); | 832 | rt = skb_rtable(skb); |
833 | addr_type = rt->rt_type; | 833 | addr_type = rt->rt_type; |
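
arp_find() above stops reading rt_gateway directly and asks rt_nexthop() instead, since in this series rt_gateway can stay zero for on-link routes. The helper is presumably a small inline along these lines (an assumption; the real definition lives in include/net/route.h and is not part of this diff):

static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
	if (rt->rt_gateway)
		return rt->rt_gateway;	/* gatewayed route: ARP for the next hop */
	return daddr;			/* on-link route: ARP for the destination itself */
}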
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 10e15a144e95..44bf82e3aef7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1500,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
1500 | 1500 | ||
1501 | if (cnf == net->ipv4.devconf_dflt) | 1501 | if (cnf == net->ipv4.devconf_dflt) |
1502 | devinet_copy_dflt_conf(net, i); | 1502 | devinet_copy_dflt_conf(net, i); |
1503 | if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) | 1503 | if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || |
1504 | i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) | ||
1504 | if ((new_value == 0) && (old_value != 0)) | 1505 | if ((new_value == 0) && (old_value != 0)) |
1505 | rt_cache_flush(net, 0); | 1506 | rt_cache_flush(net, 0); |
1506 | } | 1507 | } |
@@ -1617,6 +1618,8 @@ static struct devinet_sysctl_table { | |||
1617 | "force_igmp_version"), | 1618 | "force_igmp_version"), |
1618 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, | 1619 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, |
1619 | "promote_secondaries"), | 1620 | "promote_secondaries"), |
1621 | DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, | ||
1622 | "route_localnet"), | ||
1620 | }, | 1623 | }, |
1621 | }; | 1624 | }; |
1622 | 1625 | ||
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index cb982a61536f..b61e9deb7c7e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -484,16 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
484 | struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); | 484 | struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); |
485 | struct xfrm_state *x; | 485 | struct xfrm_state *x; |
486 | 486 | ||
487 | if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || | 487 | switch (icmp_hdr(skb)->type) { |
488 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | 488 | case ICMP_DEST_UNREACH: |
489 | if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | ||
490 | return; | ||
491 | case ICMP_REDIRECT: | ||
492 | break; | ||
493 | default: | ||
489 | return; | 494 | return; |
495 | } | ||
490 | 496 | ||
491 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, | 497 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
492 | esph->spi, IPPROTO_ESP, AF_INET); | 498 | esph->spi, IPPROTO_ESP, AF_INET); |
493 | if (!x) | 499 | if (!x) |
494 | return; | 500 | return; |
495 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", | 501 | |
496 | ntohl(esph->spi), ntohl(iph->daddr)); | 502 | if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) |
503 | ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); | ||
504 | else | ||
505 | ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); | ||
497 | xfrm_state_put(x); | 506 | xfrm_state_put(x); |
498 | } | 507 | } |
499 | 508 | ||
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411fa37c..8732cc7920ed 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -31,6 +31,7 @@
31 | #include <linux/if_addr.h> | 31 | #include <linux/if_addr.h> |
32 | #include <linux/if_arp.h> | 32 | #include <linux/if_arp.h> |
33 | #include <linux/skbuff.h> | 33 | #include <linux/skbuff.h> |
34 | #include <linux/cache.h> | ||
34 | #include <linux/init.h> | 35 | #include <linux/init.h> |
35 | #include <linux/list.h> | 36 | #include <linux/list.h> |
36 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
@@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id) | |||
85 | tb = fib_trie_table(id); | 86 | tb = fib_trie_table(id); |
86 | if (!tb) | 87 | if (!tb) |
87 | return NULL; | 88 | return NULL; |
89 | |||
90 | switch (id) { | ||
91 | case RT_TABLE_LOCAL: | ||
92 | net->ipv4.fib_local = tb; | ||
93 | break; | ||
94 | |||
95 | case RT_TABLE_MAIN: | ||
96 | net->ipv4.fib_main = tb; | ||
97 | break; | ||
98 | |||
99 | case RT_TABLE_DEFAULT: | ||
100 | net->ipv4.fib_default = tb; | ||
101 | break; | ||
102 | |||
103 | default: | ||
104 | break; | ||
105 | } | ||
106 | |||
88 | h = id & (FIB_TABLE_HASHSZ - 1); | 107 | h = id & (FIB_TABLE_HASHSZ - 1); |
89 | hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); | 108 | hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); |
90 | return tb; | 109 | return tb; |
@@ -150,10 +169,6 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, | |||
150 | if (ipv4_is_multicast(addr)) | 169 | if (ipv4_is_multicast(addr)) |
151 | return RTN_MULTICAST; | 170 | return RTN_MULTICAST; |
152 | 171 | ||
153 | #ifdef CONFIG_IP_MULTIPLE_TABLES | ||
154 | res.r = NULL; | ||
155 | #endif | ||
156 | |||
157 | local_table = fib_get_table(net, RT_TABLE_LOCAL); | 172 | local_table = fib_get_table(net, RT_TABLE_LOCAL); |
158 | if (local_table) { | 173 | if (local_table) { |
159 | ret = RTN_UNICAST; | 174 | ret = RTN_UNICAST; |
@@ -180,6 +195,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, | |||
180 | } | 195 | } |
181 | EXPORT_SYMBOL(inet_dev_addr_type); | 196 | EXPORT_SYMBOL(inet_dev_addr_type); |
182 | 197 | ||
198 | __be32 fib_compute_spec_dst(struct sk_buff *skb) | ||
199 | { | ||
200 | struct net_device *dev = skb->dev; | ||
201 | struct in_device *in_dev; | ||
202 | struct fib_result res; | ||
203 | struct rtable *rt; | ||
204 | struct flowi4 fl4; | ||
205 | struct net *net; | ||
206 | int scope; | ||
207 | |||
208 | rt = skb_rtable(skb); | ||
209 | if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == | ||
210 | RTCF_LOCAL) | ||
211 | return ip_hdr(skb)->daddr; | ||
212 | |||
213 | in_dev = __in_dev_get_rcu(dev); | ||
214 | BUG_ON(!in_dev); | ||
215 | |||
216 | net = dev_net(dev); | ||
217 | |||
218 | scope = RT_SCOPE_UNIVERSE; | ||
219 | if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { | ||
220 | fl4.flowi4_oif = 0; | ||
221 | fl4.flowi4_iif = net->loopback_dev->ifindex; | ||
222 | fl4.daddr = ip_hdr(skb)->saddr; | ||
223 | fl4.saddr = 0; | ||
224 | fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); | ||
225 | fl4.flowi4_scope = scope; | ||
226 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; | ||
227 | if (!fib_lookup(net, &fl4, &res)) | ||
228 | return FIB_RES_PREFSRC(net, res); | ||
229 | } else { | ||
230 | scope = RT_SCOPE_LINK; | ||
231 | } | ||
232 | |||
233 | return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); | ||
234 | } | ||
235 | |||
183 | /* Given (packet source, input interface) and optional (dst, oif, tos): | 236 | /* Given (packet source, input interface) and optional (dst, oif, tos): |
184 | * - (main) check, that source is valid i.e. not broadcast or our local | 237 | * - (main) check, that source is valid i.e. not broadcast or our local |
185 | * address. | 238 | * address. |
@@ -188,17 +241,15 @@ EXPORT_SYMBOL(inet_dev_addr_type); | |||
188 | * - check, that packet arrived from expected physical interface. | 241 | * - check, that packet arrived from expected physical interface. |
189 | * called with rcu_read_lock() | 242 | * called with rcu_read_lock() |
190 | */ | 243 | */ |
191 | int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, | 244 | static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, |
192 | int oif, struct net_device *dev, __be32 *spec_dst, | 245 | u8 tos, int oif, struct net_device *dev, |
193 | u32 *itag) | 246 | int rpf, struct in_device *idev, u32 *itag) |
194 | { | 247 | { |
195 | struct in_device *in_dev; | 248 | int ret, no_addr, accept_local; |
196 | struct flowi4 fl4; | ||
197 | struct fib_result res; | 249 | struct fib_result res; |
198 | int no_addr, rpf, accept_local; | 250 | struct flowi4 fl4; |
199 | bool dev_match; | ||
200 | int ret; | ||
201 | struct net *net; | 251 | struct net *net; |
252 | bool dev_match; | ||
202 | 253 | ||
203 | fl4.flowi4_oif = 0; | 254 | fl4.flowi4_oif = 0; |
204 | fl4.flowi4_iif = oif; | 255 | fl4.flowi4_iif = oif; |
@@ -207,20 +258,10 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, | |||
207 | fl4.flowi4_tos = tos; | 258 | fl4.flowi4_tos = tos; |
208 | fl4.flowi4_scope = RT_SCOPE_UNIVERSE; | 259 | fl4.flowi4_scope = RT_SCOPE_UNIVERSE; |
209 | 260 | ||
210 | no_addr = rpf = accept_local = 0; | 261 | no_addr = idev->ifa_list == NULL; |
211 | in_dev = __in_dev_get_rcu(dev); | ||
212 | if (in_dev) { | ||
213 | no_addr = in_dev->ifa_list == NULL; | ||
214 | |||
215 | /* Ignore rp_filter for packets protected by IPsec. */ | ||
216 | rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); | ||
217 | |||
218 | accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); | ||
219 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; | ||
220 | } | ||
221 | 262 | ||
222 | if (in_dev == NULL) | 263 | accept_local = IN_DEV_ACCEPT_LOCAL(idev); |
223 | goto e_inval; | 264 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; |
224 | 265 | ||
225 | net = dev_net(dev); | 266 | net = dev_net(dev); |
226 | if (fib_lookup(net, &fl4, &res)) | 267 | if (fib_lookup(net, &fl4, &res)) |
@@ -229,7 +270,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, | |||
229 | if (res.type != RTN_LOCAL || !accept_local) | 270 | if (res.type != RTN_LOCAL || !accept_local) |
230 | goto e_inval; | 271 | goto e_inval; |
231 | } | 272 | } |
232 | *spec_dst = FIB_RES_PREFSRC(net, res); | ||
233 | fib_combine_itag(itag, &res); | 273 | fib_combine_itag(itag, &res); |
234 | dev_match = false; | 274 | dev_match = false; |
235 | 275 | ||
@@ -258,17 +298,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, | |||
258 | 298 | ||
259 | ret = 0; | 299 | ret = 0; |
260 | if (fib_lookup(net, &fl4, &res) == 0) { | 300 | if (fib_lookup(net, &fl4, &res) == 0) { |
261 | if (res.type == RTN_UNICAST) { | 301 | if (res.type == RTN_UNICAST) |
262 | *spec_dst = FIB_RES_PREFSRC(net, res); | ||
263 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; | 302 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; |
264 | } | ||
265 | } | 303 | } |
266 | return ret; | 304 | return ret; |
267 | 305 | ||
268 | last_resort: | 306 | last_resort: |
269 | if (rpf) | 307 | if (rpf) |
270 | goto e_rpf; | 308 | goto e_rpf; |
271 | *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
272 | *itag = 0; | 309 | *itag = 0; |
273 | return 0; | 310 | return 0; |
274 | 311 | ||
@@ -278,6 +315,20 @@ e_rpf: | |||
278 | return -EXDEV; | 315 | return -EXDEV; |
279 | } | 316 | } |
280 | 317 | ||
318 | /* Ignore rp_filter for packets protected by IPsec. */ | ||
319 | int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | ||
320 | u8 tos, int oif, struct net_device *dev, | ||
321 | struct in_device *idev, u32 *itag) | ||
322 | { | ||
323 | int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); | ||
324 | |||
325 | if (!r && !fib_num_tclassid_users(dev_net(dev))) { | ||
326 | *itag = 0; | ||
327 | return 0; | ||
328 | } | ||
329 | return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); | ||
330 | } | ||
331 | |||
281 | static inline __be32 sk_extract_addr(struct sockaddr *addr) | 332 | static inline __be32 sk_extract_addr(struct sockaddr *addr) |
282 | { | 333 | { |
283 | return ((struct sockaddr_in *) addr)->sin_addr.s_addr; | 334 | return ((struct sockaddr_in *) addr)->sin_addr.s_addr; |
@@ -879,10 +930,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) | |||
879 | .flowi4_scope = frn->fl_scope, | 930 | .flowi4_scope = frn->fl_scope, |
880 | }; | 931 | }; |
881 | 932 | ||
882 | #ifdef CONFIG_IP_MULTIPLE_TABLES | ||
883 | res.r = NULL; | ||
884 | #endif | ||
885 | |||
886 | frn->err = -ENOENT; | 933 | frn->err = -ENOENT; |
887 | if (tb) { | 934 | if (tb) { |
888 | local_bh_disable(); | 935 | local_bh_disable(); |
@@ -935,8 +982,11 @@ static void nl_fib_input(struct sk_buff *skb) | |||
935 | static int __net_init nl_fib_lookup_init(struct net *net) | 982 | static int __net_init nl_fib_lookup_init(struct net *net) |
936 | { | 983 | { |
937 | struct sock *sk; | 984 | struct sock *sk; |
938 | sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, | 985 | struct netlink_kernel_cfg cfg = { |
939 | nl_fib_input, NULL, THIS_MODULE); | 986 | .input = nl_fib_input, |
987 | }; | ||
988 | |||
989 | sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg); | ||
940 | if (sk == NULL) | 990 | if (sk == NULL) |
941 | return -EAFNOSUPPORT; | 991 | return -EAFNOSUPPORT; |
942 | net->ipv4.fibnl = sk; | 992 | net->ipv4.fibnl = sk; |
@@ -1021,11 +1071,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo | |||
1021 | rt_cache_flush(dev_net(dev), 0); | 1071 | rt_cache_flush(dev_net(dev), 0); |
1022 | break; | 1072 | break; |
1023 | case NETDEV_UNREGISTER_BATCH: | 1073 | case NETDEV_UNREGISTER_BATCH: |
1024 | /* The batch unregister is only called on the first | ||
1025 | * device in the list of devices being unregistered. | ||
1026 | * Therefore we should not pass dev_net(dev) in here. | ||
1027 | */ | ||
1028 | rt_cache_flush_batch(NULL); | ||
1029 | break; | 1074 | break; |
1030 | } | 1075 | } |
1031 | return NOTIFY_DONE; | 1076 | return NOTIFY_DONE; |
@@ -1090,6 +1135,9 @@ static int __net_init fib_net_init(struct net *net) | |||
1090 | { | 1135 | { |
1091 | int error; | 1136 | int error; |
1092 | 1137 | ||
1138 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
1139 | net->ipv4.fib_num_tclassid_users = 0; | ||
1140 | #endif | ||
1093 | error = ip_fib_net_init(net); | 1141 | error = ip_fib_net_init(net); |
1094 | if (error < 0) | 1142 | if (error < 0) |
1095 | goto out; | 1143 | goto out; |
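
fib_validate_source() above becomes a thin wrapper: when rp_filter is off for the device and nothing in the namespace uses tclassid, the reverse-path lookup is skipped entirely and only __fib_validate_source() pays the full cost. The new fib_compute_spec_dst() likewise replaces the old rt_spec_dst field by recomputing the preferred local address on demand (the icmp.c hunk below uses it for reply sources). A sketch of an input-path caller of the new fib_validate_source() signature, assuming a hypothetical call site running under rcu_read_lock(); the real callers are in route.c:

static int toy_check_source(struct sk_buff *skb, __be32 saddr, __be32 daddr,
			    u8 tos, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	if (!in_dev)
		return -EINVAL;

	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag);
	if (err < 0)
		return err;	/* martian source or reverse-path filter failure */

	/* itag now carries the route/rule tclassid, if any, for classification */
	return 0;
}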
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2d043f71ef70..a83d74e498d2 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,14 +47,7 @@ struct fib4_rule {
47 | #endif | 47 | #endif |
48 | }; | 48 | }; |
49 | 49 | ||
50 | #ifdef CONFIG_IP_ROUTE_CLASSID | 50 | int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) |
51 | u32 fib_rules_tclass(const struct fib_result *res) | ||
52 | { | ||
53 | return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; | ||
54 | } | ||
55 | #endif | ||
56 | |||
57 | int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) | ||
58 | { | 51 | { |
59 | struct fib_lookup_arg arg = { | 52 | struct fib_lookup_arg arg = { |
60 | .result = res, | 53 | .result = res, |
@@ -63,11 +56,15 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) | |||
63 | int err; | 56 | int err; |
64 | 57 | ||
65 | err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); | 58 | err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); |
66 | res->r = arg.rule; | 59 | #ifdef CONFIG_IP_ROUTE_CLASSID |
67 | 60 | if (arg.rule) | |
61 | res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; | ||
62 | else | ||
63 | res->tclassid = 0; | ||
64 | #endif | ||
68 | return err; | 65 | return err; |
69 | } | 66 | } |
70 | EXPORT_SYMBOL_GPL(fib_lookup); | 67 | EXPORT_SYMBOL_GPL(__fib_lookup); |
71 | 68 | ||
72 | static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, | 69 | static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, |
73 | int flags, struct fib_lookup_arg *arg) | 70 | int flags, struct fib_lookup_arg *arg) |
@@ -169,8 +166,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, | |||
169 | rule4->dst = nla_get_be32(tb[FRA_DST]); | 166 | rule4->dst = nla_get_be32(tb[FRA_DST]); |
170 | 167 | ||
171 | #ifdef CONFIG_IP_ROUTE_CLASSID | 168 | #ifdef CONFIG_IP_ROUTE_CLASSID |
172 | if (tb[FRA_FLOW]) | 169 | if (tb[FRA_FLOW]) { |
173 | rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); | 170 | rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); |
171 | if (rule4->tclassid) | ||
172 | net->ipv4.fib_num_tclassid_users++; | ||
173 | } | ||
174 | #endif | 174 | #endif |
175 | 175 | ||
176 | rule4->src_len = frh->src_len; | 176 | rule4->src_len = frh->src_len; |
@@ -179,11 +179,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, | |||
179 | rule4->dstmask = inet_make_mask(rule4->dst_len); | 179 | rule4->dstmask = inet_make_mask(rule4->dst_len); |
180 | rule4->tos = frh->tos; | 180 | rule4->tos = frh->tos; |
181 | 181 | ||
182 | net->ipv4.fib_has_custom_rules = true; | ||
182 | err = 0; | 183 | err = 0; |
183 | errout: | 184 | errout: |
184 | return err; | 185 | return err; |
185 | } | 186 | } |
186 | 187 | ||
188 | static void fib4_rule_delete(struct fib_rule *rule) | ||
189 | { | ||
190 | struct net *net = rule->fr_net; | ||
191 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
192 | struct fib4_rule *rule4 = (struct fib4_rule *) rule; | ||
193 | |||
194 | if (rule4->tclassid) | ||
195 | net->ipv4.fib_num_tclassid_users--; | ||
196 | #endif | ||
197 | net->ipv4.fib_has_custom_rules = true; | ||
198 | } | ||
199 | |||
187 | static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, | 200 | static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, |
188 | struct nlattr **tb) | 201 | struct nlattr **tb) |
189 | { | 202 | { |
@@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { | |||
256 | .action = fib4_rule_action, | 269 | .action = fib4_rule_action, |
257 | .match = fib4_rule_match, | 270 | .match = fib4_rule_match, |
258 | .configure = fib4_rule_configure, | 271 | .configure = fib4_rule_configure, |
272 | .delete = fib4_rule_delete, | ||
259 | .compare = fib4_rule_compare, | 273 | .compare = fib4_rule_compare, |
260 | .fill = fib4_rule_fill, | 274 | .fill = fib4_rule_fill, |
261 | .default_pref = fib_default_rule_pref, | 275 | .default_pref = fib_default_rule_pref, |
@@ -295,6 +309,7 @@ int __net_init fib4_rules_init(struct net *net) | |||
295 | if (err < 0) | 309 | if (err < 0) |
296 | goto fail; | 310 | goto fail; |
297 | net->ipv4.rules_ops = ops; | 311 | net->ipv4.rules_ops = ops; |
312 | net->ipv4.fib_has_custom_rules = false; | ||
298 | return 0; | 313 | return 0; |
299 | 314 | ||
300 | fail: | 315 | fail: |
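
With fib_rules_tclass() removed, __fib_lookup() now records the matching rule's tclassid straight into the fib_result, and fib_has_custom_rules (together with the fib_local/fib_main/fib_default pointers cached by fib_new_table() earlier in this diff) lets fib_lookup() bypass the rules layer when no policy rules exist. The fast-path wrapper itself is not in this diff; the sketch below is an assumption about what it plausibly looks like in include/net/ip_fib.h:

static inline int fib_lookup(struct net *net, struct flowi4 *flp,
			     struct fib_result *res)
{
	if (!net->ipv4.fib_has_custom_rules) {
		/* Only the standard tables exist: consult them directly and
		 * skip fib_rules_lookup() altogether. */
		res->tclassid = 0;
		if (net->ipv4.fib_local &&
		    !fib_table_lookup(net->ipv4.fib_local, flp, res, FIB_LOOKUP_NOREF))
			return 0;
		if (net->ipv4.fib_main &&
		    !fib_table_lookup(net->ipv4.fib_main, flp, res, FIB_LOOKUP_NOREF))
			return 0;
		if (net->ipv4.fib_default &&
		    !fib_table_lookup(net->ipv4.fib_default, flp, res, FIB_LOOKUP_NOREF))
			return 0;
		return -ENETUNREACH;
	}
	return __fib_lookup(net, flp, res);
}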
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e5b7182fa099..e55171f184f9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 | }, | 140 | }, |
141 | }; | 141 | }; |
142 | 142 | ||
143 | static void free_nh_exceptions(struct fib_nh *nh) | ||
144 | { | ||
145 | struct fnhe_hash_bucket *hash = nh->nh_exceptions; | ||
146 | int i; | ||
147 | |||
148 | for (i = 0; i < FNHE_HASH_SIZE; i++) { | ||
149 | struct fib_nh_exception *fnhe; | ||
150 | |||
151 | fnhe = rcu_dereference_protected(hash[i].chain, 1); | ||
152 | while (fnhe) { | ||
153 | struct fib_nh_exception *next; | ||
154 | |||
155 | next = rcu_dereference_protected(fnhe->fnhe_next, 1); | ||
156 | kfree(fnhe); | ||
157 | |||
158 | fnhe = next; | ||
159 | } | ||
160 | } | ||
161 | kfree(hash); | ||
162 | } | ||
163 | |||
143 | /* Release a nexthop info record */ | 164 | /* Release a nexthop info record */ |
144 | static void free_fib_info_rcu(struct rcu_head *head) | 165 | static void free_fib_info_rcu(struct rcu_head *head) |
145 | { | 166 | { |
@@ -148,6 +169,12 @@ static void free_fib_info_rcu(struct rcu_head *head) | |||
148 | change_nexthops(fi) { | 169 | change_nexthops(fi) { |
149 | if (nexthop_nh->nh_dev) | 170 | if (nexthop_nh->nh_dev) |
150 | dev_put(nexthop_nh->nh_dev); | 171 | dev_put(nexthop_nh->nh_dev); |
172 | if (nexthop_nh->nh_exceptions) | ||
173 | free_nh_exceptions(nexthop_nh); | ||
174 | if (nexthop_nh->nh_rth_output) | ||
175 | dst_release(&nexthop_nh->nh_rth_output->dst); | ||
176 | if (nexthop_nh->nh_rth_input) | ||
177 | dst_release(&nexthop_nh->nh_rth_input->dst); | ||
151 | } endfor_nexthops(fi); | 178 | } endfor_nexthops(fi); |
152 | 179 | ||
153 | release_net(fi->fib_net); | 180 | release_net(fi->fib_net); |
@@ -163,6 +190,12 @@ void free_fib_info(struct fib_info *fi) | |||
163 | return; | 190 | return; |
164 | } | 191 | } |
165 | fib_info_cnt--; | 192 | fib_info_cnt--; |
193 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
194 | change_nexthops(fi) { | ||
195 | if (nexthop_nh->nh_tclassid) | ||
196 | fi->fib_net->ipv4.fib_num_tclassid_users--; | ||
197 | } endfor_nexthops(fi); | ||
198 | #endif | ||
166 | call_rcu(&fi->rcu, free_fib_info_rcu); | 199 | call_rcu(&fi->rcu, free_fib_info_rcu); |
167 | } | 200 | } |
168 | 201 | ||
@@ -421,6 +454,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, | |||
421 | #ifdef CONFIG_IP_ROUTE_CLASSID | 454 | #ifdef CONFIG_IP_ROUTE_CLASSID |
422 | nla = nla_find(attrs, attrlen, RTA_FLOW); | 455 | nla = nla_find(attrs, attrlen, RTA_FLOW); |
423 | nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; | 456 | nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; |
457 | if (nexthop_nh->nh_tclassid) | ||
458 | fi->fib_net->ipv4.fib_num_tclassid_users++; | ||
424 | #endif | 459 | #endif |
425 | } | 460 | } |
426 | 461 | ||
@@ -779,9 +814,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
779 | int type = nla_type(nla); | 814 | int type = nla_type(nla); |
780 | 815 | ||
781 | if (type) { | 816 | if (type) { |
817 | u32 val; | ||
818 | |||
782 | if (type > RTAX_MAX) | 819 | if (type > RTAX_MAX) |
783 | goto err_inval; | 820 | goto err_inval; |
784 | fi->fib_metrics[type - 1] = nla_get_u32(nla); | 821 | val = nla_get_u32(nla); |
822 | if (type == RTAX_ADVMSS && val > 65535 - 40) | ||
823 | val = 65535 - 40; | ||
824 | if (type == RTAX_MTU && val > 65535 - 15) | ||
825 | val = 65535 - 15; | ||
826 | fi->fib_metrics[type - 1] = val; | ||
785 | } | 827 | } |
786 | } | 828 | } |
787 | } | 829 | } |
@@ -810,6 +852,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
810 | nh->nh_flags = cfg->fc_flags; | 852 | nh->nh_flags = cfg->fc_flags; |
811 | #ifdef CONFIG_IP_ROUTE_CLASSID | 853 | #ifdef CONFIG_IP_ROUTE_CLASSID |
812 | nh->nh_tclassid = cfg->fc_flow; | 854 | nh->nh_tclassid = cfg->fc_flow; |
855 | if (nh->nh_tclassid) | ||
856 | fi->fib_net->ipv4.fib_num_tclassid_users++; | ||
813 | #endif | 857 | #endif |
814 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 858 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
815 | nh->nh_weight = 1; | 859 | nh->nh_weight = 1; |
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 30b88d7b4bd6..18cbc15b20d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1007 | while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { | 1007 | while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { |
1008 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1008 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1009 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); | 1009 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); |
1010 | tn = (struct tnode *) resize(t, (struct tnode *)tn); | 1010 | tn = (struct tnode *)resize(t, tn); |
1011 | 1011 | ||
1012 | tnode_put_child_reorg((struct tnode *)tp, cindex, | 1012 | tnode_put_child_reorg(tp, cindex, |
1013 | (struct rt_trie_node *)tn, wasfull); | 1013 | (struct rt_trie_node *)tn, wasfull); |
1014 | 1014 | ||
1015 | tp = node_parent((struct rt_trie_node *) tn); | 1015 | tp = node_parent((struct rt_trie_node *) tn); |
@@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) | |||
1024 | 1024 | ||
1025 | /* Handle last (top) tnode */ | 1025 | /* Handle last (top) tnode */ |
1026 | if (IS_TNODE(tn)) | 1026 | if (IS_TNODE(tn)) |
1027 | tn = (struct tnode *)resize(t, (struct tnode *)tn); | 1027 | tn = (struct tnode *)resize(t, tn); |
1028 | 1028 | ||
1029 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); | 1029 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); |
1030 | tnode_free_flush(); | 1030 | tnode_free_flush(); |
@@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
1125 | node_set_parent((struct rt_trie_node *)l, tp); | 1125 | node_set_parent((struct rt_trie_node *)l, tp); |
1126 | 1126 | ||
1127 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1127 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1128 | put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); | 1128 | put_child(t, tp, cindex, (struct rt_trie_node *)l); |
1129 | } else { | 1129 | } else { |
1130 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ | 1130 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ |
1131 | /* | 1131 | /* |
@@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
1160 | 1160 | ||
1161 | if (tp) { | 1161 | if (tp) { |
1162 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1162 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
1163 | put_child(t, (struct tnode *)tp, cindex, | 1163 | put_child(t, tp, cindex, (struct rt_trie_node *)tn); |
1164 | (struct rt_trie_node *)tn); | ||
1165 | } else { | 1164 | } else { |
1166 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); | 1165 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); |
1167 | tp = tn; | 1166 | tp = tn; |
@@ -1620,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) | |||
1620 | 1619 | ||
1621 | if (tp) { | 1620 | if (tp) { |
1622 | t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); | 1621 | t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); |
1623 | put_child(t, (struct tnode *)tp, cindex, NULL); | 1622 | put_child(t, tp, cindex, NULL); |
1624 | trie_rebalance(t, tp); | 1623 | trie_rebalance(t, tp); |
1625 | } else | 1624 | } else |
1626 | RCU_INIT_POINTER(t->trie, NULL); | 1625 | RCU_INIT_POINTER(t->trie, NULL); |
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c75efbdc71cb..f2eccd531746 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
95 | #include <net/checksum.h> | 95 | #include <net/checksum.h> |
96 | #include <net/xfrm.h> | 96 | #include <net/xfrm.h> |
97 | #include <net/inet_common.h> | 97 | #include <net/inet_common.h> |
98 | #include <net/ip_fib.h> | ||
98 | 99 | ||
99 | /* | 100 | /* |
100 | * Build xmit assembly blocks | 101 | * Build xmit assembly blocks |
@@ -253,10 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, | |||
253 | 254 | ||
254 | /* Limit if icmp type is enabled in ratemask. */ | 255 | /* Limit if icmp type is enabled in ratemask. */ |
255 | if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { | 256 | if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { |
256 | if (!rt->peer) | 257 | struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); |
257 | rt_bind_peer(rt, fl4->daddr, 1); | 258 | rc = inet_peer_xrlim_allow(peer, |
258 | rc = inet_peer_xrlim_allow(rt->peer, | ||
259 | net->ipv4.sysctl_icmp_ratelimit); | 259 | net->ipv4.sysctl_icmp_ratelimit); |
260 | inet_putpeer(peer); | ||
260 | } | 261 | } |
261 | out: | 262 | out: |
262 | return rc; | 263 | return rc; |
@@ -334,7 +335,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
334 | struct flowi4 fl4; | 335 | struct flowi4 fl4; |
335 | struct sock *sk; | 336 | struct sock *sk; |
336 | struct inet_sock *inet; | 337 | struct inet_sock *inet; |
337 | __be32 daddr; | 338 | __be32 daddr, saddr; |
338 | 339 | ||
339 | if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) | 340 | if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) |
340 | return; | 341 | return; |
@@ -348,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
348 | 349 | ||
349 | inet->tos = ip_hdr(skb)->tos; | 350 | inet->tos = ip_hdr(skb)->tos; |
350 | daddr = ipc.addr = ip_hdr(skb)->saddr; | 351 | daddr = ipc.addr = ip_hdr(skb)->saddr; |
352 | saddr = fib_compute_spec_dst(skb); | ||
351 | ipc.opt = NULL; | 353 | ipc.opt = NULL; |
352 | ipc.tx_flags = 0; | 354 | ipc.tx_flags = 0; |
353 | if (icmp_param->replyopts.opt.opt.optlen) { | 355 | if (icmp_param->replyopts.opt.opt.optlen) { |
@@ -357,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
357 | } | 359 | } |
358 | memset(&fl4, 0, sizeof(fl4)); | 360 | memset(&fl4, 0, sizeof(fl4)); |
359 | fl4.daddr = daddr; | 361 | fl4.daddr = daddr; |
360 | fl4.saddr = rt->rt_spec_dst; | 362 | fl4.saddr = saddr; |
361 | fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); | 363 | fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); |
362 | fl4.flowi4_proto = IPPROTO_ICMP; | 364 | fl4.flowi4_proto = IPPROTO_ICMP; |
363 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); | 365 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
@@ -569,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) | |||
569 | rcu_read_lock(); | 571 | rcu_read_lock(); |
570 | if (rt_is_input_route(rt) && | 572 | if (rt_is_input_route(rt) && |
571 | net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) | 573 | net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) |
572 | dev = dev_get_by_index_rcu(net, rt->rt_iif); | 574 | dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); |
573 | 575 | ||
574 | if (dev) | 576 | if (dev) |
575 | saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); | 577 | saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); |
@@ -632,6 +634,27 @@ out:; | |||
632 | EXPORT_SYMBOL(icmp_send); | 634 | EXPORT_SYMBOL(icmp_send); |
633 | 635 | ||
634 | 636 | ||
637 | static void icmp_socket_deliver(struct sk_buff *skb, u32 info) | ||
638 | { | ||
639 | const struct iphdr *iph = (const struct iphdr *) skb->data; | ||
640 | const struct net_protocol *ipprot; | ||
641 | int protocol = iph->protocol; | ||
642 | |||
643 | /* Checkin full IP header plus 8 bytes of protocol to | ||
644 | * avoid additional coding at protocol handlers. | ||
645 | */ | ||
646 | if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) | ||
647 | return; | ||
648 | |||
649 | raw_icmp_error(skb, protocol, info); | ||
650 | |||
651 | rcu_read_lock(); | ||
652 | ipprot = rcu_dereference(inet_protos[protocol]); | ||
653 | if (ipprot && ipprot->err_handler) | ||
654 | ipprot->err_handler(skb, info); | ||
655 | rcu_read_unlock(); | ||
656 | } | ||
657 | |||
635 | /* | 658 | /* |
636 | * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. | 659 | * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. |
637 | */ | 660 | */ |
@@ -640,10 +663,8 @@ static void icmp_unreach(struct sk_buff *skb) | |||
640 | { | 663 | { |
641 | const struct iphdr *iph; | 664 | const struct iphdr *iph; |
642 | struct icmphdr *icmph; | 665 | struct icmphdr *icmph; |
643 | int hash, protocol; | ||
644 | const struct net_protocol *ipprot; | ||
645 | u32 info = 0; | ||
646 | struct net *net; | 666 | struct net *net; |
667 | u32 info = 0; | ||
647 | 668 | ||
648 | net = dev_net(skb_dst(skb)->dev); | 669 | net = dev_net(skb_dst(skb)->dev); |
649 | 670 | ||
@@ -674,9 +695,7 @@ static void icmp_unreach(struct sk_buff *skb) | |||
674 | LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), | 695 | LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), |
675 | &iph->daddr); | 696 | &iph->daddr); |
676 | } else { | 697 | } else { |
677 | info = ip_rt_frag_needed(net, iph, | 698 | info = ntohs(icmph->un.frag.mtu); |
678 | ntohs(icmph->un.frag.mtu), | ||
679 | skb->dev); | ||
680 | if (!info) | 699 | if (!info) |
681 | goto out; | 700 | goto out; |
682 | } | 701 | } |
@@ -720,26 +739,7 @@ static void icmp_unreach(struct sk_buff *skb) | |||
720 | goto out; | 739 | goto out; |
721 | } | 740 | } |
722 | 741 | ||
723 | /* Checkin full IP header plus 8 bytes of protocol to | 742 | icmp_socket_deliver(skb, info); |
724 | * avoid additional coding at protocol handlers. | ||
725 | */ | ||
726 | if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) | ||
727 | goto out; | ||
728 | |||
729 | iph = (const struct iphdr *)skb->data; | ||
730 | protocol = iph->protocol; | ||
731 | |||
732 | /* | ||
733 | * Deliver ICMP message to raw sockets. Pretty useless feature? | ||
734 | */ | ||
735 | raw_icmp_error(skb, protocol, info); | ||
736 | |||
737 | hash = protocol & (MAX_INET_PROTOS - 1); | ||
738 | rcu_read_lock(); | ||
739 | ipprot = rcu_dereference(inet_protos[hash]); | ||
740 | if (ipprot && ipprot->err_handler) | ||
741 | ipprot->err_handler(skb, info); | ||
742 | rcu_read_unlock(); | ||
743 | 743 | ||
744 | out: | 744 | out: |
745 | return; | 745 | return; |
@@ -755,46 +755,15 @@ out_err: | |||
755 | 755 | ||
756 | static void icmp_redirect(struct sk_buff *skb) | 756 | static void icmp_redirect(struct sk_buff *skb) |
757 | { | 757 | { |
758 | const struct iphdr *iph; | 758 | if (skb->len < sizeof(struct iphdr)) { |
759 | 759 | ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); | |
760 | if (skb->len < sizeof(struct iphdr)) | 760 | return; |
761 | goto out_err; | ||
762 | |||
763 | /* | ||
764 | * Get the copied header of the packet that caused the redirect | ||
765 | */ | ||
766 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | ||
767 | goto out; | ||
768 | |||
769 | iph = (const struct iphdr *)skb->data; | ||
770 | |||
771 | switch (icmp_hdr(skb)->code & 7) { | ||
772 | case ICMP_REDIR_NET: | ||
773 | case ICMP_REDIR_NETTOS: | ||
774 | /* | ||
775 | * As per RFC recommendations now handle it as a host redirect. | ||
776 | */ | ||
777 | case ICMP_REDIR_HOST: | ||
778 | case ICMP_REDIR_HOSTTOS: | ||
779 | ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, | ||
780 | icmp_hdr(skb)->un.gateway, | ||
781 | iph->saddr, skb->dev); | ||
782 | break; | ||
783 | } | 761 | } |
784 | 762 | ||
785 | /* Ping wants to see redirects. | 763 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) |
786 | * Let's pretend they are errors of sorts... */ | 764 | return; |
787 | if (iph->protocol == IPPROTO_ICMP && | ||
788 | iph->ihl >= 5 && | ||
789 | pskb_may_pull(skb, (iph->ihl<<2)+8)) { | ||
790 | ping_err(skb, icmp_hdr(skb)->un.gateway); | ||
791 | } | ||
792 | 765 | ||
793 | out: | 766 | icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); |
794 | return; | ||
795 | out_err: | ||
796 | ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); | ||
797 | goto out; | ||
798 | } | 767 | } |
799 | 768 | ||
800 | /* | 769 | /* |
@@ -868,86 +837,6 @@ out_err: | |||
868 | goto out; | 837 | goto out; |
869 | } | 838 | } |
870 | 839 | ||
871 | |||
872 | /* | ||
873 | * Handle ICMP_ADDRESS_MASK requests. (RFC950) | ||
874 | * | ||
875 | * RFC1122 (3.2.2.9). A host MUST only send replies to | ||
876 | * ADDRESS_MASK requests if it's been configured as an address mask | ||
877 | * agent. Receiving a request doesn't constitute implicit permission to | ||
878 | * act as one. Of course, implementing this correctly requires (SHOULD) | ||
879 | * a way to turn the functionality on and off. Another one for sysctl(), | ||
880 | * I guess. -- MS | ||
881 | * | ||
882 | * RFC1812 (4.3.3.9). A router MUST implement it. | ||
883 | * A router SHOULD have switch turning it on/off. | ||
884 | * This switch MUST be ON by default. | ||
885 | * | ||
886 | * Gratuitous replies, zero-source replies are not implemented, | ||
887 | * that complies with RFC. DO NOT implement them!!! All the idea | ||
888 | * of broadcast addrmask replies as specified in RFC950 is broken. | ||
889 | * The problem is that it is not uncommon to have several prefixes | ||
890 | * on one physical interface. Moreover, addrmask agent can even be | ||
891 | * not aware of existing another prefixes. | ||
892 | * If source is zero, addrmask agent cannot choose correct prefix. | ||
893 | * Gratuitous mask announcements suffer from the same problem. | ||
894 | * RFC1812 explains it, but still allows to use ADDRMASK, | ||
895 | * that is pretty silly. --ANK | ||
896 | * | ||
897 | * All these rules are so bizarre, that I removed kernel addrmask | ||
898 | * support at all. It is wrong, it is obsolete, nobody uses it in | ||
899 | * any case. --ANK | ||
900 | * | ||
901 | * Furthermore you can do it with a usermode address agent program | ||
902 | * anyway... | ||
903 | */ | ||
904 | |||
905 | static void icmp_address(struct sk_buff *skb) | ||
906 | { | ||
907 | #if 0 | ||
908 | net_dbg_ratelimited("a guy asks for address mask. Who is it?\n"); | ||
909 | #endif | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain | ||
914 | * loudly if an inconsistency is found. | ||
915 | * called with rcu_read_lock() | ||
916 | */ | ||
917 | |||
918 | static void icmp_address_reply(struct sk_buff *skb) | ||
919 | { | ||
920 | struct rtable *rt = skb_rtable(skb); | ||
921 | struct net_device *dev = skb->dev; | ||
922 | struct in_device *in_dev; | ||
923 | struct in_ifaddr *ifa; | ||
924 | |||
925 | if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) | ||
926 | return; | ||
927 | |||
928 | in_dev = __in_dev_get_rcu(dev); | ||
929 | if (!in_dev) | ||
930 | return; | ||
931 | |||
932 | if (in_dev->ifa_list && | ||
933 | IN_DEV_LOG_MARTIANS(in_dev) && | ||
934 | IN_DEV_FORWARD(in_dev)) { | ||
935 | __be32 _mask, *mp; | ||
936 | |||
937 | mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); | ||
938 | BUG_ON(mp == NULL); | ||
939 | for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { | ||
940 | if (*mp == ifa->ifa_mask && | ||
941 | inet_ifa_match(ip_hdr(skb)->saddr, ifa)) | ||
942 | break; | ||
943 | } | ||
944 | if (!ifa) | ||
945 | net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n", | ||
946 | mp, | ||
947 | dev->name, &ip_hdr(skb)->saddr); | ||
948 | } | ||
949 | } | ||
950 | |||
951 | static void icmp_discard(struct sk_buff *skb) | 840 | static void icmp_discard(struct sk_buff *skb) |
952 | { | 841 | { |
953 | } | 842 | } |
@@ -1111,10 +1000,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { | |||
1111 | .handler = icmp_discard, | 1000 | .handler = icmp_discard, |
1112 | }, | 1001 | }, |
1113 | [ICMP_ADDRESS] = { | 1002 | [ICMP_ADDRESS] = { |
1114 | .handler = icmp_address, | 1003 | .handler = icmp_discard, |
1115 | }, | 1004 | }, |
1116 | [ICMP_ADDRESSREPLY] = { | 1005 | [ICMP_ADDRESSREPLY] = { |
1117 | .handler = icmp_address_reply, | 1006 | .handler = icmp_discard, |
1118 | }, | 1007 | }, |
1119 | }; | 1008 | }; |
1120 | 1009 | ||
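
icmp_unreach() and icmp_redirect() now share icmp_socket_deliver(), which pulls the embedded header, notifies raw sockets, and calls the transport's err_handler; info carries the next-hop MTU for ICMP_FRAG_NEEDED and icmp_hdr->un.gateway for redirects, while the obsolete address-mask handlers are simply wired to icmp_discard. An err_handler that consumes both notifications then follows the same shape as the ah4_err()/esp4_err() hunks above; a hedged sketch for a hypothetical protocol:

static void toy_proto_err(struct sk_buff *skb, u32 info)
{
	struct net *net = dev_net(skb->dev);

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
			return;
		/* info is the next-hop MTU reported by the router */
		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_RAW, 0);
		break;
	case ICMP_REDIRECT:
		/* info is the new gateway address */
		ipv4_redirect(skb, net, 0, 0, IPPROTO_RAW, 0);
		break;
	}
}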
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f9ee7417f6a0..db0cf17c00f7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -374,18 +374,19 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
374 | const struct inet_request_sock *ireq = inet_rsk(req); | 374 | const struct inet_request_sock *ireq = inet_rsk(req); |
375 | struct ip_options_rcu *opt = inet_rsk(req)->opt; | 375 | struct ip_options_rcu *opt = inet_rsk(req)->opt; |
376 | struct net *net = sock_net(sk); | 376 | struct net *net = sock_net(sk); |
377 | int flags = inet_sk_flowi_flags(sk); | ||
377 | 378 | ||
378 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, | 379 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, |
379 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, | 380 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, |
380 | sk->sk_protocol, | 381 | sk->sk_protocol, |
381 | inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS, | 382 | flags, |
382 | (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, | 383 | (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, |
383 | ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); | 384 | ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); |
384 | security_req_classify_flow(req, flowi4_to_flowi(fl4)); | 385 | security_req_classify_flow(req, flowi4_to_flowi(fl4)); |
385 | rt = ip_route_output_flow(net, fl4, sk); | 386 | rt = ip_route_output_flow(net, fl4, sk); |
386 | if (IS_ERR(rt)) | 387 | if (IS_ERR(rt)) |
387 | goto no_route; | 388 | goto no_route; |
388 | if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) | 389 | if (opt && opt->opt.is_strictroute && rt->rt_gateway) |
389 | goto route_err; | 390 | goto route_err; |
390 | return &rt->dst; | 391 | return &rt->dst; |
391 | 392 | ||
@@ -418,7 +419,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, | |||
418 | rt = ip_route_output_flow(net, fl4, sk); | 419 | rt = ip_route_output_flow(net, fl4, sk); |
419 | if (IS_ERR(rt)) | 420 | if (IS_ERR(rt)) |
420 | goto no_route; | 421 | goto no_route; |
421 | if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) | 422 | if (opt && opt->opt.is_strictroute && rt->rt_gateway) |
422 | goto route_err; | 423 | goto route_err; |
423 | return &rt->dst; | 424 | return &rt->dst; |
424 | 425 | ||
@@ -799,3 +800,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, | |||
799 | } | 800 | } |
800 | EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); | 801 | EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); |
801 | #endif | 802 | #endif |
803 | |||
804 | static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) | ||
805 | { | ||
806 | const struct inet_sock *inet = inet_sk(sk); | ||
807 | const struct ip_options_rcu *inet_opt; | ||
808 | __be32 daddr = inet->inet_daddr; | ||
809 | struct flowi4 *fl4; | ||
810 | struct rtable *rt; | ||
811 | |||
812 | rcu_read_lock(); | ||
813 | inet_opt = rcu_dereference(inet->inet_opt); | ||
814 | if (inet_opt && inet_opt->opt.srr) | ||
815 | daddr = inet_opt->opt.faddr; | ||
816 | fl4 = &fl->u.ip4; | ||
817 | rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, | ||
818 | inet->inet_saddr, inet->inet_dport, | ||
819 | inet->inet_sport, sk->sk_protocol, | ||
820 | RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); | ||
821 | if (IS_ERR(rt)) | ||
822 | rt = NULL; | ||
823 | if (rt) | ||
824 | sk_setup_caps(sk, &rt->dst); | ||
825 | rcu_read_unlock(); | ||
826 | |||
827 | return &rt->dst; | ||
828 | } | ||
829 | |||
830 | struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) | ||
831 | { | ||
832 | struct dst_entry *dst = __sk_dst_check(sk, 0); | ||
833 | struct inet_sock *inet = inet_sk(sk); | ||
834 | |||
835 | if (!dst) { | ||
836 | dst = inet_csk_rebuild_route(sk, &inet->cork.fl); | ||
837 | if (!dst) | ||
838 | goto out; | ||
839 | } | ||
840 | dst->ops->update_pmtu(dst, sk, NULL, mtu); | ||
841 | |||
842 | dst = __sk_dst_check(sk, 0); | ||
843 | if (!dst) | ||
844 | dst = inet_csk_rebuild_route(sk, &inet->cork.fl); | ||
845 | out: | ||
846 | return dst; | ||
847 | } | ||
848 | EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); | ||
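inet_csk_update_pmtu() is new infrastructure for the ICMP "fragmentation needed" path: it revalidates the socket's cached route with __sk_dst_check(), rebuilds it from the cork flow if it has been invalidated, pushes the new MTU into the dst via update_pmtu(), and returns a dst the caller can re-read. A hedged sketch of how a transport error handler might consume it (the function name here is illustrative, not taken from this patch set):

    /* sketch: transport-side reaction to an ICMP_FRAG_NEEDED error */
    static void example_mtu_reduced(struct sock *sk, u32 mtu)
    {
            struct dst_entry *dst = inet_csk_update_pmtu(sk, mtu);

            if (!dst)
                    return;         /* no route could be (re)built */

            /* dst_mtu(dst) now reflects the clamped path MTU; the protocol
             * would shrink its segment size / MSS from this value. */
            pr_debug("path mtu reduced to %u\n", dst_mtu(dst));
    }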
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 46d1e7199a8c..570e61f9611f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -46,9 +46,6 @@ struct inet_diag_entry { | |||
46 | u16 userlocks; | 46 | u16 userlocks; |
47 | }; | 47 | }; |
48 | 48 | ||
49 | #define INET_DIAG_PUT(skb, attrtype, attrlen) \ | ||
50 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) | ||
51 | |||
52 | static DEFINE_MUTEX(inet_diag_table_mutex); | 49 | static DEFINE_MUTEX(inet_diag_table_mutex); |
53 | 50 | ||
54 | static const struct inet_diag_handler *inet_diag_lock_handler(int proto) | 51 | static const struct inet_diag_handler *inet_diag_lock_handler(int proto) |
@@ -78,24 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |||
78 | const struct inet_sock *inet = inet_sk(sk); | 75 | const struct inet_sock *inet = inet_sk(sk); |
79 | struct inet_diag_msg *r; | 76 | struct inet_diag_msg *r; |
80 | struct nlmsghdr *nlh; | 77 | struct nlmsghdr *nlh; |
78 | struct nlattr *attr; | ||
81 | void *info = NULL; | 79 | void *info = NULL; |
82 | struct inet_diag_meminfo *minfo = NULL; | ||
83 | unsigned char *b = skb_tail_pointer(skb); | ||
84 | const struct inet_diag_handler *handler; | 80 | const struct inet_diag_handler *handler; |
85 | int ext = req->idiag_ext; | 81 | int ext = req->idiag_ext; |
86 | 82 | ||
87 | handler = inet_diag_table[req->sdiag_protocol]; | 83 | handler = inet_diag_table[req->sdiag_protocol]; |
88 | BUG_ON(handler == NULL); | 84 | BUG_ON(handler == NULL); |
89 | 85 | ||
90 | nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); | 86 | nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), |
91 | nlh->nlmsg_flags = nlmsg_flags; | 87 | nlmsg_flags); |
88 | if (!nlh) | ||
89 | return -EMSGSIZE; | ||
92 | 90 | ||
93 | r = NLMSG_DATA(nlh); | 91 | r = nlmsg_data(nlh); |
94 | BUG_ON(sk->sk_state == TCP_TIME_WAIT); | 92 | BUG_ON(sk->sk_state == TCP_TIME_WAIT); |
95 | 93 | ||
96 | if (ext & (1 << (INET_DIAG_MEMINFO - 1))) | ||
97 | minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo)); | ||
98 | |||
99 | r->idiag_family = sk->sk_family; | 94 | r->idiag_family = sk->sk_family; |
100 | r->idiag_state = sk->sk_state; | 95 | r->idiag_state = sk->sk_state; |
101 | r->idiag_timer = 0; | 96 | r->idiag_timer = 0; |
@@ -113,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |||
113 | * hence this needs to be included regardless of socket family. | 108 | * hence this needs to be included regardless of socket family. |
114 | */ | 109 | */ |
115 | if (ext & (1 << (INET_DIAG_TOS - 1))) | 110 | if (ext & (1 << (INET_DIAG_TOS - 1))) |
116 | RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); | 111 | if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0) |
112 | goto errout; | ||
117 | 113 | ||
118 | #if IS_ENABLED(CONFIG_IPV6) | 114 | #if IS_ENABLED(CONFIG_IPV6) |
119 | if (r->idiag_family == AF_INET6) { | 115 | if (r->idiag_family == AF_INET6) { |
@@ -121,24 +117,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |||
121 | 117 | ||
122 | *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; | 118 | *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; |
123 | *(struct in6_addr *)r->id.idiag_dst = np->daddr; | 119 | *(struct in6_addr *)r->id.idiag_dst = np->daddr; |
120 | |||
124 | if (ext & (1 << (INET_DIAG_TCLASS - 1))) | 121 | if (ext & (1 << (INET_DIAG_TCLASS - 1))) |
125 | RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass); | 122 | if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0) |
123 | goto errout; | ||
126 | } | 124 | } |
127 | #endif | 125 | #endif |
128 | 126 | ||
129 | r->idiag_uid = sock_i_uid(sk); | 127 | r->idiag_uid = sock_i_uid(sk); |
130 | r->idiag_inode = sock_i_ino(sk); | 128 | r->idiag_inode = sock_i_ino(sk); |
131 | 129 | ||
132 | if (minfo) { | 130 | if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { |
133 | minfo->idiag_rmem = sk_rmem_alloc_get(sk); | 131 | struct inet_diag_meminfo minfo = { |
134 | minfo->idiag_wmem = sk->sk_wmem_queued; | 132 | .idiag_rmem = sk_rmem_alloc_get(sk), |
135 | minfo->idiag_fmem = sk->sk_forward_alloc; | 133 | .idiag_wmem = sk->sk_wmem_queued, |
136 | minfo->idiag_tmem = sk_wmem_alloc_get(sk); | 134 | .idiag_fmem = sk->sk_forward_alloc, |
135 | .idiag_tmem = sk_wmem_alloc_get(sk), | ||
136 | }; | ||
137 | |||
138 | if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) | ||
139 | goto errout; | ||
137 | } | 140 | } |
138 | 141 | ||
139 | if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) | 142 | if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) |
140 | if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) | 143 | if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) |
141 | goto rtattr_failure; | 144 | goto errout; |
142 | 145 | ||
143 | if (icsk == NULL) { | 146 | if (icsk == NULL) { |
144 | handler->idiag_get_info(sk, r, NULL); | 147 | handler->idiag_get_info(sk, r, NULL); |
@@ -165,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |||
165 | } | 168 | } |
166 | #undef EXPIRES_IN_MS | 169 | #undef EXPIRES_IN_MS |
167 | 170 | ||
168 | if (ext & (1 << (INET_DIAG_INFO - 1))) | 171 | if (ext & (1 << (INET_DIAG_INFO - 1))) { |
169 | info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info)); | 172 | attr = nla_reserve(skb, INET_DIAG_INFO, |
170 | 173 | sizeof(struct tcp_info)); | |
171 | if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { | 174 | if (!attr) |
172 | const size_t len = strlen(icsk->icsk_ca_ops->name); | 175 | goto errout; |
173 | 176 | ||
174 | strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), | 177 | info = nla_data(attr); |
175 | icsk->icsk_ca_ops->name); | ||
176 | } | 178 | } |
177 | 179 | ||
180 | if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) | ||
181 | if (nla_put_string(skb, INET_DIAG_CONG, | ||
182 | icsk->icsk_ca_ops->name) < 0) | ||
183 | goto errout; | ||
184 | |||
178 | handler->idiag_get_info(sk, r, info); | 185 | handler->idiag_get_info(sk, r, info); |
179 | 186 | ||
180 | if (sk->sk_state < TCP_TIME_WAIT && | 187 | if (sk->sk_state < TCP_TIME_WAIT && |
@@ -182,12 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |||
182 | icsk->icsk_ca_ops->get_info(sk, ext, skb); | 189 | icsk->icsk_ca_ops->get_info(sk, ext, skb); |
183 | 190 | ||
184 | out: | 191 | out: |
185 | nlh->nlmsg_len = skb_tail_pointer(skb) - b; | 192 | return nlmsg_end(skb, nlh); |
186 | return skb->len; | ||
187 | 193 | ||
188 | rtattr_failure: | 194 | errout: |
189 | nlmsg_failure: | 195 | nlmsg_cancel(skb, nlh); |
190 | nlmsg_trim(skb, b); | ||
191 | return -EMSGSIZE; | 196 | return -EMSGSIZE; |
192 | } | 197 | } |
193 | EXPORT_SYMBOL_GPL(inet_sk_diag_fill); | 198 | EXPORT_SYMBOL_GPL(inet_sk_diag_fill); |
@@ -208,14 +213,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, | |||
208 | { | 213 | { |
209 | long tmo; | 214 | long tmo; |
210 | struct inet_diag_msg *r; | 215 | struct inet_diag_msg *r; |
211 | const unsigned char *previous_tail = skb_tail_pointer(skb); | 216 | struct nlmsghdr *nlh; |
212 | struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, | ||
213 | unlh->nlmsg_type, sizeof(*r)); | ||
214 | 217 | ||
215 | r = NLMSG_DATA(nlh); | 218 | nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), |
216 | BUG_ON(tw->tw_state != TCP_TIME_WAIT); | 219 | nlmsg_flags); |
220 | if (!nlh) | ||
221 | return -EMSGSIZE; | ||
217 | 222 | ||
218 | nlh->nlmsg_flags = nlmsg_flags; | 223 | r = nlmsg_data(nlh); |
224 | BUG_ON(tw->tw_state != TCP_TIME_WAIT); | ||
219 | 225 | ||
220 | tmo = tw->tw_ttd - jiffies; | 226 | tmo = tw->tw_ttd - jiffies; |
221 | if (tmo < 0) | 227 | if (tmo < 0) |
@@ -245,11 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, | |||
245 | *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; | 251 | *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; |
246 | } | 252 | } |
247 | #endif | 253 | #endif |
248 | nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; | 254 | |
249 | return skb->len; | 255 | return nlmsg_end(skb, nlh); |
250 | nlmsg_failure: | ||
251 | nlmsg_trim(skb, previous_tail); | ||
252 | return -EMSGSIZE; | ||
253 | } | 256 | } |
254 | 257 | ||
255 | static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, | 258 | static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, |
@@ -269,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s | |||
269 | int err; | 272 | int err; |
270 | struct sock *sk; | 273 | struct sock *sk; |
271 | struct sk_buff *rep; | 274 | struct sk_buff *rep; |
275 | struct net *net = sock_net(in_skb->sk); | ||
272 | 276 | ||
273 | err = -EINVAL; | 277 | err = -EINVAL; |
274 | if (req->sdiag_family == AF_INET) { | 278 | if (req->sdiag_family == AF_INET) { |
275 | sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], | 279 | sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], |
276 | req->id.idiag_dport, req->id.idiag_src[0], | 280 | req->id.idiag_dport, req->id.idiag_src[0], |
277 | req->id.idiag_sport, req->id.idiag_if); | 281 | req->id.idiag_sport, req->id.idiag_if); |
278 | } | 282 | } |
279 | #if IS_ENABLED(CONFIG_IPV6) | 283 | #if IS_ENABLED(CONFIG_IPV6) |
280 | else if (req->sdiag_family == AF_INET6) { | 284 | else if (req->sdiag_family == AF_INET6) { |
281 | sk = inet6_lookup(&init_net, hashinfo, | 285 | sk = inet6_lookup(net, hashinfo, |
282 | (struct in6_addr *)req->id.idiag_dst, | 286 | (struct in6_addr *)req->id.idiag_dst, |
283 | req->id.idiag_dport, | 287 | req->id.idiag_dport, |
284 | (struct in6_addr *)req->id.idiag_src, | 288 | (struct in6_addr *)req->id.idiag_src, |
@@ -298,23 +302,23 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s | |||
298 | if (err) | 302 | if (err) |
299 | goto out; | 303 | goto out; |
300 | 304 | ||
301 | err = -ENOMEM; | 305 | rep = nlmsg_new(sizeof(struct inet_diag_msg) + |
302 | rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + | 306 | sizeof(struct inet_diag_meminfo) + |
303 | sizeof(struct inet_diag_meminfo) + | 307 | sizeof(struct tcp_info) + 64, GFP_KERNEL); |
304 | sizeof(struct tcp_info) + 64)), | 308 | if (!rep) { |
305 | GFP_KERNEL); | 309 | err = -ENOMEM; |
306 | if (!rep) | ||
307 | goto out; | 310 | goto out; |
311 | } | ||
308 | 312 | ||
309 | err = sk_diag_fill(sk, rep, req, | 313 | err = sk_diag_fill(sk, rep, req, |
310 | NETLINK_CB(in_skb).pid, | 314 | NETLINK_CB(in_skb).pid, |
311 | nlh->nlmsg_seq, 0, nlh); | 315 | nlh->nlmsg_seq, 0, nlh); |
312 | if (err < 0) { | 316 | if (err < 0) { |
313 | WARN_ON(err == -EMSGSIZE); | 317 | WARN_ON(err == -EMSGSIZE); |
314 | kfree_skb(rep); | 318 | nlmsg_free(rep); |
315 | goto out; | 319 | goto out; |
316 | } | 320 | } |
317 | err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, | 321 | err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, |
318 | MSG_DONTWAIT); | 322 | MSG_DONTWAIT); |
319 | if (err > 0) | 323 | if (err > 0) |
320 | err = 0; | 324 | err = 0; |
@@ -592,15 +596,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, | |||
592 | { | 596 | { |
593 | const struct inet_request_sock *ireq = inet_rsk(req); | 597 | const struct inet_request_sock *ireq = inet_rsk(req); |
594 | struct inet_sock *inet = inet_sk(sk); | 598 | struct inet_sock *inet = inet_sk(sk); |
595 | unsigned char *b = skb_tail_pointer(skb); | ||
596 | struct inet_diag_msg *r; | 599 | struct inet_diag_msg *r; |
597 | struct nlmsghdr *nlh; | 600 | struct nlmsghdr *nlh; |
598 | long tmo; | 601 | long tmo; |
599 | 602 | ||
600 | nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); | 603 | nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), |
601 | nlh->nlmsg_flags = NLM_F_MULTI; | 604 | NLM_F_MULTI); |
602 | r = NLMSG_DATA(nlh); | 605 | if (!nlh) |
606 | return -EMSGSIZE; | ||
603 | 607 | ||
608 | r = nlmsg_data(nlh); | ||
604 | r->idiag_family = sk->sk_family; | 609 | r->idiag_family = sk->sk_family; |
605 | r->idiag_state = TCP_SYN_RECV; | 610 | r->idiag_state = TCP_SYN_RECV; |
606 | r->idiag_timer = 1; | 611 | r->idiag_timer = 1; |
@@ -628,13 +633,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, | |||
628 | *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; | 633 | *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; |
629 | } | 634 | } |
630 | #endif | 635 | #endif |
631 | nlh->nlmsg_len = skb_tail_pointer(skb) - b; | ||
632 | |||
633 | return skb->len; | ||
634 | 636 | ||
635 | nlmsg_failure: | 637 | return nlmsg_end(skb, nlh); |
636 | nlmsg_trim(skb, b); | ||
637 | return -1; | ||
638 | } | 638 | } |
639 | 639 | ||
640 | static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | 640 | static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, |
@@ -725,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, | |||
725 | { | 725 | { |
726 | int i, num; | 726 | int i, num; |
727 | int s_i, s_num; | 727 | int s_i, s_num; |
728 | struct net *net = sock_net(skb->sk); | ||
728 | 729 | ||
729 | s_i = cb->args[1]; | 730 | s_i = cb->args[1]; |
730 | s_num = num = cb->args[2]; | 731 | s_num = num = cb->args[2]; |
@@ -744,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, | |||
744 | sk_nulls_for_each(sk, node, &ilb->head) { | 745 | sk_nulls_for_each(sk, node, &ilb->head) { |
745 | struct inet_sock *inet = inet_sk(sk); | 746 | struct inet_sock *inet = inet_sk(sk); |
746 | 747 | ||
748 | if (!net_eq(sock_net(sk), net)) | ||
749 | continue; | ||
750 | |||
747 | if (num < s_num) { | 751 | if (num < s_num) { |
748 | num++; | 752 | num++; |
749 | continue; | 753 | continue; |
@@ -814,6 +818,8 @@ skip_listen_ht: | |||
814 | sk_nulls_for_each(sk, node, &head->chain) { | 818 | sk_nulls_for_each(sk, node, &head->chain) { |
815 | struct inet_sock *inet = inet_sk(sk); | 819 | struct inet_sock *inet = inet_sk(sk); |
816 | 820 | ||
821 | if (!net_eq(sock_net(sk), net)) | ||
822 | continue; | ||
817 | if (num < s_num) | 823 | if (num < s_num) |
818 | goto next_normal; | 824 | goto next_normal; |
819 | if (!(r->idiag_states & (1 << sk->sk_state))) | 825 | if (!(r->idiag_states & (1 << sk->sk_state))) |
@@ -840,6 +846,8 @@ next_normal: | |||
840 | 846 | ||
841 | inet_twsk_for_each(tw, node, | 847 | inet_twsk_for_each(tw, node, |
842 | &head->twchain) { | 848 | &head->twchain) { |
849 | if (!net_eq(twsk_net(tw), net)) | ||
850 | continue; | ||
843 | 851 | ||
844 | if (num < s_num) | 852 | if (num < s_num) |
845 | goto next_dying; | 853 | goto next_dying; |
@@ -892,7 +900,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
892 | if (nlmsg_attrlen(cb->nlh, hdrlen)) | 900 | if (nlmsg_attrlen(cb->nlh, hdrlen)) |
893 | bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); | 901 | bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); |
894 | 902 | ||
895 | return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc); | 903 | return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); |
896 | } | 904 | } |
897 | 905 | ||
898 | static inline int inet_diag_type2proto(int type) | 906 | static inline int inet_diag_type2proto(int type) |
@@ -909,7 +917,7 @@ static inline int inet_diag_type2proto(int type) | |||
909 | 917 | ||
910 | static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) | 918 | static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) |
911 | { | 919 | { |
912 | struct inet_diag_req *rc = NLMSG_DATA(cb->nlh); | 920 | struct inet_diag_req *rc = nlmsg_data(cb->nlh); |
913 | struct inet_diag_req_v2 req; | 921 | struct inet_diag_req_v2 req; |
914 | struct nlattr *bc = NULL; | 922 | struct nlattr *bc = NULL; |
915 | int hdrlen = sizeof(struct inet_diag_req); | 923 | int hdrlen = sizeof(struct inet_diag_req); |
@@ -929,7 +937,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c | |||
929 | static int inet_diag_get_exact_compat(struct sk_buff *in_skb, | 937 | static int inet_diag_get_exact_compat(struct sk_buff *in_skb, |
930 | const struct nlmsghdr *nlh) | 938 | const struct nlmsghdr *nlh) |
931 | { | 939 | { |
932 | struct inet_diag_req *rc = NLMSG_DATA(nlh); | 940 | struct inet_diag_req *rc = nlmsg_data(nlh); |
933 | struct inet_diag_req_v2 req; | 941 | struct inet_diag_req_v2 req; |
934 | 942 | ||
935 | req.sdiag_family = rc->idiag_family; | 943 | req.sdiag_family = rc->idiag_family; |
@@ -944,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, | |||
944 | static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) | 952 | static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) |
945 | { | 953 | { |
946 | int hdrlen = sizeof(struct inet_diag_req); | 954 | int hdrlen = sizeof(struct inet_diag_req); |
955 | struct net *net = sock_net(skb->sk); | ||
947 | 956 | ||
948 | if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || | 957 | if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || |
949 | nlmsg_len(nlh) < hdrlen) | 958 | nlmsg_len(nlh) < hdrlen) |
@@ -964,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
964 | struct netlink_dump_control c = { | 973 | struct netlink_dump_control c = { |
965 | .dump = inet_diag_dump_compat, | 974 | .dump = inet_diag_dump_compat, |
966 | }; | 975 | }; |
967 | return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c); | 976 | return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); |
968 | } | 977 | } |
969 | } | 978 | } |
970 | 979 | ||
@@ -974,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
974 | static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) | 983 | static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) |
975 | { | 984 | { |
976 | int hdrlen = sizeof(struct inet_diag_req_v2); | 985 | int hdrlen = sizeof(struct inet_diag_req_v2); |
986 | struct net *net = sock_net(skb->sk); | ||
977 | 987 | ||
978 | if (nlmsg_len(h) < hdrlen) | 988 | if (nlmsg_len(h) < hdrlen) |
979 | return -EINVAL; | 989 | return -EINVAL; |
@@ -992,11 +1002,11 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) | |||
992 | struct netlink_dump_control c = { | 1002 | struct netlink_dump_control c = { |
993 | .dump = inet_diag_dump, | 1003 | .dump = inet_diag_dump, |
994 | }; | 1004 | }; |
995 | return netlink_dump_start(sock_diag_nlsk, skb, h, &c); | 1005 | return netlink_dump_start(net->diag_nlsk, skb, h, &c); |
996 | } | 1006 | } |
997 | } | 1007 | } |
998 | 1008 | ||
999 | return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h)); | 1009 | return inet_diag_get_exact(skb, h, nlmsg_data(h)); |
1000 | } | 1010 | } |
1001 | 1011 | ||
1002 | static const struct sock_diag_handler inet_diag_handler = { | 1012 | static const struct sock_diag_handler inet_diag_handler = { |
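Most of the inet_diag.c churn above is a mechanical conversion from the legacy NLMSG_PUT()/RTA_PUT*() macros, which rely on hidden nlmsg_failure/rtattr_failure labels and manual nlmsg_len fixups, to the explicit nlmsg_put()/nla_put*()/nlmsg_end() API, plus making socket lookups and netlink dumps per network namespace (sock_net(skb->sk), net->diag_nlsk). The fill pattern the converted functions share looks roughly like this generic sketch:

    /* sketch: the netlink fill skeleton used throughout the converted code */
    static int example_fill(struct sk_buff *skb, u32 pid, u32 seq, u16 type)
    {
            struct inet_diag_msg *r;
            struct nlmsghdr *nlh;

            nlh = nlmsg_put(skb, pid, seq, type, sizeof(*r), NLM_F_MULTI);
            if (!nlh)
                    return -EMSGSIZE;       /* no room left in this skb */

            r = nlmsg_data(nlh);
            /* ... fill *r and append attributes ... */
            if (nla_put_u8(skb, INET_DIAG_TOS, 0) < 0)
                    goto errout;

            return nlmsg_end(skb, nlh);     /* patches nlmsg_len, returns skb->len */

    errout:
            nlmsg_cancel(skb, nlh);         /* trims everything added since nlmsg_put() */
            return -EMSGSIZE;
    }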
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ff2a51b6d0c..85190e69297b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c | |||
@@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, | |||
243 | if (q == NULL) | 243 | if (q == NULL) |
244 | return NULL; | 244 | return NULL; |
245 | 245 | ||
246 | q->net = nf; | ||
246 | f->constructor(q, arg); | 247 | f->constructor(q, arg); |
247 | atomic_add(f->qsize, &nf->mem); | 248 | atomic_add(f->qsize, &nf->mem); |
248 | setup_timer(&q->timer, f->frag_expire, (unsigned long)q); | 249 | setup_timer(&q->timer, f->frag_expire, (unsigned long)q); |
249 | spin_lock_init(&q->lock); | 250 | spin_lock_init(&q->lock); |
250 | atomic_set(&q->refcnt, 1); | 251 | atomic_set(&q->refcnt, 1); |
251 | q->net = nf; | ||
252 | 252 | ||
253 | return q; | 253 | return q; |
254 | } | 254 | } |
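Setting q->net before calling the constructor is not a cosmetic reshuffle: later in this series ip4_frag_init() (see the ip_fragment.c hunk below) walks back from q->net with container_of() to find the owning struct net, so the field must already be valid when f->constructor(q, arg) runs, roughly:

    /* sketch: what the IPv4 constructor now does with the freshly set q->net */
    struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags);
    struct net *net = container_of(ipv4, struct net, ipv4);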
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index dfba343b2509..e1e0a4e8fd34 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -82,23 +82,39 @@ static const struct inet_peer peer_fake_node = { | |||
82 | .avl_height = 0 | 82 | .avl_height = 0 |
83 | }; | 83 | }; |
84 | 84 | ||
85 | struct inet_peer_base { | 85 | void inet_peer_base_init(struct inet_peer_base *bp) |
86 | struct inet_peer __rcu *root; | 86 | { |
87 | seqlock_t lock; | 87 | bp->root = peer_avl_empty_rcu; |
88 | int total; | 88 | seqlock_init(&bp->lock); |
89 | }; | 89 | bp->flush_seq = ~0U; |
90 | bp->total = 0; | ||
91 | } | ||
92 | EXPORT_SYMBOL_GPL(inet_peer_base_init); | ||
90 | 93 | ||
91 | static struct inet_peer_base v4_peers = { | 94 | static atomic_t v4_seq = ATOMIC_INIT(0); |
92 | .root = peer_avl_empty_rcu, | 95 | static atomic_t v6_seq = ATOMIC_INIT(0); |
93 | .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), | ||
94 | .total = 0, | ||
95 | }; | ||
96 | 96 | ||
97 | static struct inet_peer_base v6_peers = { | 97 | static atomic_t *inetpeer_seq_ptr(int family) |
98 | .root = peer_avl_empty_rcu, | 98 | { |
99 | .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), | 99 | return (family == AF_INET ? &v4_seq : &v6_seq); |
100 | .total = 0, | 100 | } |
101 | }; | 101 | |
102 | static inline void flush_check(struct inet_peer_base *base, int family) | ||
103 | { | ||
104 | atomic_t *fp = inetpeer_seq_ptr(family); | ||
105 | |||
106 | if (unlikely(base->flush_seq != atomic_read(fp))) { | ||
107 | inetpeer_invalidate_tree(base); | ||
108 | base->flush_seq = atomic_read(fp); | ||
109 | } | ||
110 | } | ||
111 | |||
112 | void inetpeer_invalidate_family(int family) | ||
113 | { | ||
114 | atomic_t *fp = inetpeer_seq_ptr(family); | ||
115 | |||
116 | atomic_inc(fp); | ||
117 | } | ||
102 | 118 | ||
103 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ | 119 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ |
104 | 120 | ||
@@ -110,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min | |||
110 | 126 | ||
111 | static void inetpeer_gc_worker(struct work_struct *work) | 127 | static void inetpeer_gc_worker(struct work_struct *work) |
112 | { | 128 | { |
113 | struct inet_peer *p, *n; | 129 | struct inet_peer *p, *n, *c; |
114 | LIST_HEAD(list); | 130 | LIST_HEAD(list); |
115 | 131 | ||
116 | spin_lock_bh(&gc_lock); | 132 | spin_lock_bh(&gc_lock); |
@@ -122,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work) | |||
122 | 138 | ||
123 | list_for_each_entry_safe(p, n, &list, gc_list) { | 139 | list_for_each_entry_safe(p, n, &list, gc_list) { |
124 | 140 | ||
125 | if(need_resched()) | 141 | if (need_resched()) |
126 | cond_resched(); | 142 | cond_resched(); |
127 | 143 | ||
128 | if (p->avl_left != peer_avl_empty) { | 144 | c = rcu_dereference_protected(p->avl_left, 1); |
129 | list_add_tail(&p->avl_left->gc_list, &list); | 145 | if (c != peer_avl_empty) { |
130 | p->avl_left = peer_avl_empty; | 146 | list_add_tail(&c->gc_list, &list); |
147 | p->avl_left = peer_avl_empty_rcu; | ||
131 | } | 148 | } |
132 | 149 | ||
133 | if (p->avl_right != peer_avl_empty) { | 150 | c = rcu_dereference_protected(p->avl_right, 1); |
134 | list_add_tail(&p->avl_right->gc_list, &list); | 151 | if (c != peer_avl_empty) { |
135 | p->avl_right = peer_avl_empty; | 152 | list_add_tail(&c->gc_list, &list); |
153 | p->avl_right = peer_avl_empty_rcu; | ||
136 | } | 154 | } |
137 | 155 | ||
138 | n = list_entry(p->gc_list.next, struct inet_peer, gc_list); | 156 | n = list_entry(p->gc_list.next, struct inet_peer, gc_list); |
@@ -401,11 +419,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, | |||
401 | call_rcu(&p->rcu, inetpeer_free_rcu); | 419 | call_rcu(&p->rcu, inetpeer_free_rcu); |
402 | } | 420 | } |
403 | 421 | ||
404 | static struct inet_peer_base *family_to_base(int family) | ||
405 | { | ||
406 | return family == AF_INET ? &v4_peers : &v6_peers; | ||
407 | } | ||
408 | |||
409 | /* perform garbage collect on all items stacked during a lookup */ | 422 | /* perform garbage collect on all items stacked during a lookup */ |
410 | static int inet_peer_gc(struct inet_peer_base *base, | 423 | static int inet_peer_gc(struct inet_peer_base *base, |
411 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], | 424 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], |
@@ -443,14 +456,17 @@ static int inet_peer_gc(struct inet_peer_base *base, | |||
443 | return cnt; | 456 | return cnt; |
444 | } | 457 | } |
445 | 458 | ||
446 | struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) | 459 | struct inet_peer *inet_getpeer(struct inet_peer_base *base, |
460 | const struct inetpeer_addr *daddr, | ||
461 | int create) | ||
447 | { | 462 | { |
448 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; | 463 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; |
449 | struct inet_peer_base *base = family_to_base(daddr->family); | ||
450 | struct inet_peer *p; | 464 | struct inet_peer *p; |
451 | unsigned int sequence; | 465 | unsigned int sequence; |
452 | int invalidated, gccnt = 0; | 466 | int invalidated, gccnt = 0; |
453 | 467 | ||
468 | flush_check(base, daddr->family); | ||
469 | |||
454 | /* Attempt a lockless lookup first. | 470 | /* Attempt a lockless lookup first. |
455 | * Because of a concurrent writer, we might not find an existing entry. | 471 | * Because of a concurrent writer, we might not find an existing entry. |
456 | */ | 472 | */ |
@@ -492,13 +508,9 @@ relookup: | |||
492 | (daddr->family == AF_INET) ? | 508 | (daddr->family == AF_INET) ? |
493 | secure_ip_id(daddr->addr.a4) : | 509 | secure_ip_id(daddr->addr.a4) : |
494 | secure_ipv6_id(daddr->addr.a6)); | 510 | secure_ipv6_id(daddr->addr.a6)); |
495 | p->tcp_ts_stamp = 0; | ||
496 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; | 511 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; |
497 | p->rate_tokens = 0; | 512 | p->rate_tokens = 0; |
498 | p->rate_last = 0; | 513 | p->rate_last = 0; |
499 | p->pmtu_expires = 0; | ||
500 | p->pmtu_orig = 0; | ||
501 | memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); | ||
502 | INIT_LIST_HEAD(&p->gc_list); | 514 | INIT_LIST_HEAD(&p->gc_list); |
503 | 515 | ||
504 | /* Link the node. */ | 516 | /* Link the node. */ |
@@ -571,26 +583,19 @@ static void inetpeer_inval_rcu(struct rcu_head *head) | |||
571 | schedule_delayed_work(&gc_work, gc_delay); | 583 | schedule_delayed_work(&gc_work, gc_delay); |
572 | } | 584 | } |
573 | 585 | ||
574 | void inetpeer_invalidate_tree(int family) | 586 | void inetpeer_invalidate_tree(struct inet_peer_base *base) |
575 | { | 587 | { |
576 | struct inet_peer *old, *new, *prev; | 588 | struct inet_peer *root; |
577 | struct inet_peer_base *base = family_to_base(family); | ||
578 | 589 | ||
579 | write_seqlock_bh(&base->lock); | 590 | write_seqlock_bh(&base->lock); |
580 | 591 | ||
581 | old = base->root; | 592 | root = rcu_deref_locked(base->root, base); |
582 | if (old == peer_avl_empty_rcu) | 593 | if (root != peer_avl_empty) { |
583 | goto out; | 594 | base->root = peer_avl_empty_rcu; |
584 | |||
585 | new = peer_avl_empty_rcu; | ||
586 | |||
587 | prev = cmpxchg(&base->root, old, new); | ||
588 | if (prev == old) { | ||
589 | base->total = 0; | 595 | base->total = 0; |
590 | call_rcu(&prev->gc_rcu, inetpeer_inval_rcu); | 596 | call_rcu(&root->gc_rcu, inetpeer_inval_rcu); |
591 | } | 597 | } |
592 | 598 | ||
593 | out: | ||
594 | write_sequnlock_bh(&base->lock); | 599 | write_sequnlock_bh(&base->lock); |
595 | } | 600 | } |
596 | EXPORT_SYMBOL(inetpeer_invalidate_tree); | 601 | EXPORT_SYMBOL(inetpeer_invalidate_tree); |
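The inetpeer rework turns the two file-static v4/v6 AVL trees into caller-owned struct inet_peer_base objects, so each network namespace can keep its own peer cache, and replaces the eager cmpxchg-based flush with a lazy one: inetpeer_invalidate_family() only bumps a per-family sequence counter, and the next inet_getpeer() on any base notices the mismatch in flush_check() and flushes just that tree. A rough sketch of the intended per-netns usage (the net->ipv4.peers field follows the call visible in the ip_fragment.c hunk below; where and how the base is allocated is an assumption, not shown in this section):

    /* sketch: one peer base per netns, flushed and freed on teardown */
    static int __net_init example_peers_init(struct net *net)
    {
            struct inet_peer_base *bp = kzalloc(sizeof(*bp), GFP_KERNEL);

            if (!bp)
                    return -ENOMEM;
            inet_peer_base_init(bp);
            net->ipv4.peers = bp;
            return 0;
    }

    /* lookup on a packet path (skb assumed in scope) */
    struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

    /* teardown */
    inetpeer_invalidate_tree(net->ipv4.peers);
    kfree(net->ipv4.peers);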
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9dbd3dd6022d..7ad88e5e7110 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
@@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) | |||
171 | static void ip4_frag_init(struct inet_frag_queue *q, void *a) | 171 | static void ip4_frag_init(struct inet_frag_queue *q, void *a) |
172 | { | 172 | { |
173 | struct ipq *qp = container_of(q, struct ipq, q); | 173 | struct ipq *qp = container_of(q, struct ipq, q); |
174 | struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, | ||
175 | frags); | ||
176 | struct net *net = container_of(ipv4, struct net, ipv4); | ||
177 | |||
174 | struct ip4_create_arg *arg = a; | 178 | struct ip4_create_arg *arg = a; |
175 | 179 | ||
176 | qp->protocol = arg->iph->protocol; | 180 | qp->protocol = arg->iph->protocol; |
@@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) | |||
180 | qp->daddr = arg->iph->daddr; | 184 | qp->daddr = arg->iph->daddr; |
181 | qp->user = arg->user; | 185 | qp->user = arg->user; |
182 | qp->peer = sysctl_ipfrag_max_dist ? | 186 | qp->peer = sysctl_ipfrag_max_dist ? |
183 | inet_getpeer_v4(arg->iph->saddr, 1) : NULL; | 187 | inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; |
184 | } | 188 | } |
185 | 189 | ||
186 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) | 190 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) |
@@ -254,8 +258,8 @@ static void ip_expire(unsigned long arg) | |||
254 | /* skb dst is stale, drop it, and perform route lookup again */ | 258 | /* skb dst is stale, drop it, and perform route lookup again */ |
255 | skb_dst_drop(head); | 259 | skb_dst_drop(head); |
256 | iph = ip_hdr(head); | 260 | iph = ip_hdr(head); |
257 | err = ip_route_input_noref(head, iph->daddr, iph->saddr, | 261 | err = ip_route_input(head, iph->daddr, iph->saddr, |
258 | iph->tos, head->dev); | 262 | iph->tos, head->dev); |
259 | if (err) | 263 | if (err) |
260 | goto out_rcu_unlock; | 264 | goto out_rcu_unlock; |
261 | 265 | ||
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f49047b79609..b062a98574f2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -516,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
516 | case ICMP_PORT_UNREACH: | 516 | case ICMP_PORT_UNREACH: |
517 | /* Impossible event. */ | 517 | /* Impossible event. */ |
518 | return; | 518 | return; |
519 | case ICMP_FRAG_NEEDED: | ||
520 | /* Soft state for pmtu is maintained by IP core. */ | ||
521 | return; | ||
522 | default: | 519 | default: |
523 | /* All others are translated to HOST_UNREACH. | 520 | /* All others are translated to HOST_UNREACH. |
524 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 521 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
@@ -531,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
531 | if (code != ICMP_EXC_TTL) | 528 | if (code != ICMP_EXC_TTL) |
532 | return; | 529 | return; |
533 | break; | 530 | break; |
531 | |||
532 | case ICMP_REDIRECT: | ||
533 | break; | ||
534 | } | 534 | } |
535 | 535 | ||
536 | rcu_read_lock(); | 536 | rcu_read_lock(); |
@@ -538,7 +538,20 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
538 | flags & GRE_KEY ? | 538 | flags & GRE_KEY ? |
539 | *(((__be32 *)p) + (grehlen / 4) - 1) : 0, | 539 | *(((__be32 *)p) + (grehlen / 4) - 1) : 0, |
540 | p[1]); | 540 | p[1]); |
541 | if (t == NULL || t->parms.iph.daddr == 0 || | 541 | if (t == NULL) |
542 | goto out; | ||
543 | |||
544 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | ||
545 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | ||
546 | t->parms.link, 0, IPPROTO_GRE, 0); | ||
547 | goto out; | ||
548 | } | ||
549 | if (type == ICMP_REDIRECT) { | ||
550 | ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, | ||
551 | IPPROTO_GRE, 0); | ||
552 | goto out; | ||
553 | } | ||
554 | if (t->parms.iph.daddr == 0 || | ||
542 | ipv4_is_multicast(t->parms.iph.daddr)) | 555 | ipv4_is_multicast(t->parms.iph.daddr)) |
543 | goto out; | 556 | goto out; |
544 | 557 | ||
@@ -753,7 +766,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
753 | 766 | ||
754 | if (skb->protocol == htons(ETH_P_IP)) { | 767 | if (skb->protocol == htons(ETH_P_IP)) { |
755 | rt = skb_rtable(skb); | 768 | rt = skb_rtable(skb); |
756 | dst = rt->rt_gateway; | 769 | dst = rt_nexthop(rt, old_iph->daddr); |
757 | } | 770 | } |
758 | #if IS_ENABLED(CONFIG_IPV6) | 771 | #if IS_ENABLED(CONFIG_IPV6) |
759 | else if (skb->protocol == htons(ETH_P_IPV6)) { | 772 | else if (skb->protocol == htons(ETH_P_IPV6)) { |
@@ -820,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
820 | mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; | 833 | mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; |
821 | 834 | ||
822 | if (skb_dst(skb)) | 835 | if (skb_dst(skb)) |
823 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); | 836 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); |
824 | 837 | ||
825 | if (skb->protocol == htons(ETH_P_IP)) { | 838 | if (skb->protocol == htons(ETH_P_IP)) { |
826 | df |= (old_iph->frag_off&htons(IP_DF)); | 839 | df |= (old_iph->frag_off&htons(IP_DF)); |
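The ip_gre.c changes track the removal of route-cache PMTU state: ipgre_err() no longer assumes "the IP core" maintains soft PMTU state and instead reacts to ICMP_FRAG_NEEDED itself via ipv4_update_pmtu(), and it gains ICMP_REDIRECT handling via ipv4_redirect(); on transmit, the next hop comes from rt_nexthop() and dst_ops->update_pmtu() now also takes the socket and the triggering skb. A condensed sketch of the error-handler shape any IPv4 encapsulation would now follow (argument values are placeholders):

    /* sketch: tunnel ICMP error handling with the new routing helpers */
    static void example_tunnel_err(struct sk_buff *skb, u32 info)
    {
            int type = icmp_hdr(skb)->type;
            int code = icmp_hdr(skb)->code;
            struct net *net = dev_net(skb->dev);

            if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
                    /* info carries the next-hop MTU from the ICMP payload */
                    ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_IPIP, 0);
                    return;
            }
            if (type == ICMP_REDIRECT) {
                    ipv4_redirect(skb, net, 0, 0, IPPROTO_IPIP, 0);
                    return;
            }
            /* other types: per-tunnel error accounting as before */
    }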
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 8590144ca330..4ebc6feee250 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb) | |||
198 | rcu_read_lock(); | 198 | rcu_read_lock(); |
199 | { | 199 | { |
200 | int protocol = ip_hdr(skb)->protocol; | 200 | int protocol = ip_hdr(skb)->protocol; |
201 | int hash, raw; | ||
202 | const struct net_protocol *ipprot; | 201 | const struct net_protocol *ipprot; |
202 | int raw; | ||
203 | 203 | ||
204 | resubmit: | 204 | resubmit: |
205 | raw = raw_local_deliver(skb, protocol); | 205 | raw = raw_local_deliver(skb, protocol); |
206 | 206 | ||
207 | hash = protocol & (MAX_INET_PROTOS - 1); | 207 | ipprot = rcu_dereference(inet_protos[protocol]); |
208 | ipprot = rcu_dereference(inet_protos[hash]); | ||
209 | if (ipprot != NULL) { | 208 | if (ipprot != NULL) { |
210 | int ret; | 209 | int ret; |
211 | 210 | ||
@@ -314,26 +313,33 @@ drop: | |||
314 | return true; | 313 | return true; |
315 | } | 314 | } |
316 | 315 | ||
316 | int sysctl_ip_early_demux __read_mostly = 1; | ||
317 | |||
317 | static int ip_rcv_finish(struct sk_buff *skb) | 318 | static int ip_rcv_finish(struct sk_buff *skb) |
318 | { | 319 | { |
319 | const struct iphdr *iph = ip_hdr(skb); | 320 | const struct iphdr *iph = ip_hdr(skb); |
320 | struct rtable *rt; | 321 | struct rtable *rt; |
321 | 322 | ||
323 | if (sysctl_ip_early_demux && !skb_dst(skb)) { | ||
324 | const struct net_protocol *ipprot; | ||
325 | int protocol = iph->protocol; | ||
326 | |||
327 | rcu_read_lock(); | ||
328 | ipprot = rcu_dereference(inet_protos[protocol]); | ||
329 | if (ipprot && ipprot->early_demux) | ||
330 | ipprot->early_demux(skb); | ||
331 | rcu_read_unlock(); | ||
332 | } | ||
333 | |||
322 | /* | 334 | /* |
323 | * Initialise the virtual path cache for the packet. It describes | 335 | * Initialise the virtual path cache for the packet. It describes |
324 | * how the packet travels inside Linux networking. | 336 | * how the packet travels inside Linux networking. |
325 | */ | 337 | */ |
326 | if (skb_dst(skb) == NULL) { | 338 | if (!skb_dst(skb)) { |
327 | int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, | 339 | int err = ip_route_input(skb, iph->daddr, iph->saddr, |
328 | iph->tos, skb->dev); | 340 | iph->tos, skb->dev); |
329 | if (unlikely(err)) { | 341 | if (unlikely(err)) { |
330 | if (err == -EHOSTUNREACH) | 342 | if (err == -EXDEV) |
331 | IP_INC_STATS_BH(dev_net(skb->dev), | ||
332 | IPSTATS_MIB_INADDRERRORS); | ||
333 | else if (err == -ENETUNREACH) | ||
334 | IP_INC_STATS_BH(dev_net(skb->dev), | ||
335 | IPSTATS_MIB_INNOROUTES); | ||
336 | else if (err == -EXDEV) | ||
337 | NET_INC_STATS_BH(dev_net(skb->dev), | 343 | NET_INC_STATS_BH(dev_net(skb->dev), |
338 | LINUX_MIB_IPRPFILTER); | 344 | LINUX_MIB_IPRPFILTER); |
339 | goto drop; | 345 | goto drop; |
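ip_rcv_finish() gains an "early demux" fast path, gated by the new sysctl_ip_early_demux knob: before any route lookup, the transport protocol gets a chance to match the packet to an established socket and attach that socket's cached input route (the sk_rx_dst released in inet_sock_destruct() earlier in this diff), skipping the per-packet FIB lookup. Registration happens through an early_demux member on struct net_protocol, implied by the ipprot->early_demux(skb) call above; a sketch of what the TCP entry in af_inet.c would look like with it wired up (that file is outside this section, so treat the exact fields as an assumption):

    /* sketch: inet protocol table entry carrying an early_demux hook */
    static const struct net_protocol tcp_protocol = {
            .early_demux    = tcp_v4_early_demux,   /* added by the TCP side of this series */
            .handler        = tcp_v4_rcv,
            .err_handler    = tcp_v4_err,
            .no_policy      = 1,
            .netns_ok       = 1,
    };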
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 708b99494e23..1dc01f9793d5 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <net/icmp.h> | 27 | #include <net/icmp.h> |
28 | #include <net/route.h> | 28 | #include <net/route.h> |
29 | #include <net/cipso_ipv4.h> | 29 | #include <net/cipso_ipv4.h> |
30 | #include <net/ip_fib.h> | ||
30 | 31 | ||
31 | /* | 32 | /* |
32 | * Write options to IP header, record destination address to | 33 | * Write options to IP header, record destination address to |
@@ -92,7 +93,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
92 | unsigned char *sptr, *dptr; | 93 | unsigned char *sptr, *dptr; |
93 | int soffset, doffset; | 94 | int soffset, doffset; |
94 | int optlen; | 95 | int optlen; |
95 | __be32 daddr; | ||
96 | 96 | ||
97 | memset(dopt, 0, sizeof(struct ip_options)); | 97 | memset(dopt, 0, sizeof(struct ip_options)); |
98 | 98 | ||
@@ -104,8 +104,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
104 | sptr = skb_network_header(skb); | 104 | sptr = skb_network_header(skb); |
105 | dptr = dopt->__data; | 105 | dptr = dopt->__data; |
106 | 106 | ||
107 | daddr = skb_rtable(skb)->rt_spec_dst; | ||
108 | |||
109 | if (sopt->rr) { | 107 | if (sopt->rr) { |
110 | optlen = sptr[sopt->rr+1]; | 108 | optlen = sptr[sopt->rr+1]; |
111 | soffset = sptr[sopt->rr+2]; | 109 | soffset = sptr[sopt->rr+2]; |
@@ -179,6 +177,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
179 | doffset -= 4; | 177 | doffset -= 4; |
180 | } | 178 | } |
181 | if (doffset > 3) { | 179 | if (doffset > 3) { |
180 | __be32 daddr = fib_compute_spec_dst(skb); | ||
181 | |||
182 | memcpy(&start[doffset-1], &daddr, 4); | 182 | memcpy(&start[doffset-1], &daddr, 4); |
183 | dopt->faddr = faddr; | 183 | dopt->faddr = faddr; |
184 | dptr[0] = start[0]; | 184 | dptr[0] = start[0]; |
@@ -241,6 +241,15 @@ void ip_options_fragment(struct sk_buff *skb) | |||
241 | opt->ts_needtime = 0; | 241 | opt->ts_needtime = 0; |
242 | } | 242 | } |
243 | 243 | ||
244 | /* helper used by ip_options_compile() to call fib_compute_spec_dst() | ||
245 | * at most one time. | ||
246 | */ | ||
247 | static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) | ||
248 | { | ||
249 | if (*spec_dst == htonl(INADDR_ANY)) | ||
250 | *spec_dst = fib_compute_spec_dst(skb); | ||
251 | } | ||
252 | |||
244 | /* | 253 | /* |
245 | * Verify options and fill pointers in struct options. | 254 | * Verify options and fill pointers in struct options. |
246 | * Caller should clear *opt, and set opt->data. | 255 | * Caller should clear *opt, and set opt->data. |
@@ -250,12 +259,12 @@ void ip_options_fragment(struct sk_buff *skb) | |||
250 | int ip_options_compile(struct net *net, | 259 | int ip_options_compile(struct net *net, |
251 | struct ip_options *opt, struct sk_buff *skb) | 260 | struct ip_options *opt, struct sk_buff *skb) |
252 | { | 261 | { |
253 | int l; | 262 | __be32 spec_dst = htonl(INADDR_ANY); |
254 | unsigned char *iph; | ||
255 | unsigned char *optptr; | ||
256 | int optlen; | ||
257 | unsigned char *pp_ptr = NULL; | 263 | unsigned char *pp_ptr = NULL; |
258 | struct rtable *rt = NULL; | 264 | struct rtable *rt = NULL; |
265 | unsigned char *optptr; | ||
266 | unsigned char *iph; | ||
267 | int optlen, l; | ||
259 | 268 | ||
260 | if (skb != NULL) { | 269 | if (skb != NULL) { |
261 | rt = skb_rtable(skb); | 270 | rt = skb_rtable(skb); |
@@ -331,7 +340,8 @@ int ip_options_compile(struct net *net, | |||
331 | goto error; | 340 | goto error; |
332 | } | 341 | } |
333 | if (rt) { | 342 | if (rt) { |
334 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); | 343 | spec_dst_fill(&spec_dst, skb); |
344 | memcpy(&optptr[optptr[2]-1], &spec_dst, 4); | ||
335 | opt->is_changed = 1; | 345 | opt->is_changed = 1; |
336 | } | 346 | } |
337 | optptr[2] += 4; | 347 | optptr[2] += 4; |
@@ -373,7 +383,8 @@ int ip_options_compile(struct net *net, | |||
373 | } | 383 | } |
374 | opt->ts = optptr - iph; | 384 | opt->ts = optptr - iph; |
375 | if (rt) { | 385 | if (rt) { |
376 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); | 386 | spec_dst_fill(&spec_dst, skb); |
387 | memcpy(&optptr[optptr[2]-1], &spec_dst, 4); | ||
377 | timeptr = &optptr[optptr[2]+3]; | 388 | timeptr = &optptr[optptr[2]+3]; |
378 | } | 389 | } |
379 | opt->ts_needaddr = 1; | 390 | opt->ts_needaddr = 1; |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 451f97c42eb4..ba39a52d18c1 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb) | |||
113 | } | 113 | } |
114 | EXPORT_SYMBOL_GPL(ip_local_out); | 114 | EXPORT_SYMBOL_GPL(ip_local_out); |
115 | 115 | ||
116 | /* dev_loopback_xmit for use with netfilter. */ | ||
117 | static int ip_dev_loopback_xmit(struct sk_buff *newskb) | ||
118 | { | ||
119 | skb_reset_mac_header(newskb); | ||
120 | __skb_pull(newskb, skb_network_offset(newskb)); | ||
121 | newskb->pkt_type = PACKET_LOOPBACK; | ||
122 | newskb->ip_summed = CHECKSUM_UNNECESSARY; | ||
123 | WARN_ON(!skb_dst(newskb)); | ||
124 | skb_dst_force(newskb); | ||
125 | netif_rx_ni(newskb); | ||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) | 116 | static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) |
130 | { | 117 | { |
131 | int ttl = inet->uc_ttl; | 118 | int ttl = inet->uc_ttl; |
@@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
183 | struct net_device *dev = dst->dev; | 170 | struct net_device *dev = dst->dev; |
184 | unsigned int hh_len = LL_RESERVED_SPACE(dev); | 171 | unsigned int hh_len = LL_RESERVED_SPACE(dev); |
185 | struct neighbour *neigh; | 172 | struct neighbour *neigh; |
173 | u32 nexthop; | ||
186 | 174 | ||
187 | if (rt->rt_type == RTN_MULTICAST) { | 175 | if (rt->rt_type == RTN_MULTICAST) { |
188 | IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); | 176 | IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); |
@@ -200,19 +188,22 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
200 | } | 188 | } |
201 | if (skb->sk) | 189 | if (skb->sk) |
202 | skb_set_owner_w(skb2, skb->sk); | 190 | skb_set_owner_w(skb2, skb->sk); |
203 | kfree_skb(skb); | 191 | consume_skb(skb); |
204 | skb = skb2; | 192 | skb = skb2; |
205 | } | 193 | } |
206 | 194 | ||
207 | rcu_read_lock(); | 195 | rcu_read_lock_bh(); |
208 | neigh = dst_get_neighbour_noref(dst); | 196 | nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; |
197 | neigh = __ipv4_neigh_lookup_noref(dev, nexthop); | ||
198 | if (unlikely(!neigh)) | ||
199 | neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); | ||
209 | if (neigh) { | 200 | if (neigh) { |
210 | int res = neigh_output(neigh, skb); | 201 | int res = dst_neigh_output(dst, neigh, skb); |
211 | 202 | ||
212 | rcu_read_unlock(); | 203 | rcu_read_unlock_bh(); |
213 | return res; | 204 | return res; |
214 | } | 205 | } |
215 | rcu_read_unlock(); | 206 | rcu_read_unlock_bh(); |
216 | 207 | ||
217 | net_dbg_ratelimited("%s: No header cache and no neighbour!\n", | 208 | net_dbg_ratelimited("%s: No header cache and no neighbour!\n", |
218 | __func__); | 209 | __func__); |
@@ -281,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb) | |||
281 | if (newskb) | 272 | if (newskb) |
282 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, | 273 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, |
283 | newskb, NULL, newskb->dev, | 274 | newskb, NULL, newskb->dev, |
284 | ip_dev_loopback_xmit); | 275 | dev_loopback_xmit); |
285 | } | 276 | } |
286 | 277 | ||
287 | /* Multicasts with ttl 0 must not go beyond the host */ | 278 | /* Multicasts with ttl 0 must not go beyond the host */ |
@@ -296,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb) | |||
296 | struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); | 287 | struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); |
297 | if (newskb) | 288 | if (newskb) |
298 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, | 289 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, |
299 | NULL, newskb->dev, ip_dev_loopback_xmit); | 290 | NULL, newskb->dev, dev_loopback_xmit); |
300 | } | 291 | } |
301 | 292 | ||
302 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, | 293 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, |
@@ -380,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) | |||
380 | skb_dst_set_noref(skb, &rt->dst); | 371 | skb_dst_set_noref(skb, &rt->dst); |
381 | 372 | ||
382 | packet_routed: | 373 | packet_routed: |
383 | if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) | 374 | if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway) |
384 | goto no_route; | 375 | goto no_route; |
385 | 376 | ||
386 | /* OK, we know where to send it, allocate and build IP header. */ | 377 | /* OK, we know where to send it, allocate and build IP header. */ |
@@ -709,7 +700,7 @@ slow_path: | |||
709 | 700 | ||
710 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); | 701 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); |
711 | } | 702 | } |
712 | kfree_skb(skb); | 703 | consume_skb(skb); |
713 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); | 704 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); |
714 | return err; | 705 | return err; |
715 | 706 | ||
@@ -1472,19 +1463,34 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, | |||
1472 | 1463 | ||
1473 | /* | 1464 | /* |
1474 | * Generic function to send a packet as reply to another packet. | 1465 | * Generic function to send a packet as reply to another packet. |
1475 | * Used to send TCP resets so far. ICMP should use this function too. | 1466 | * Used to send some TCP resets/acks so far. |
1476 | * | 1467 | * |
1477 | * Should run single threaded per socket because it uses the sock | 1468 | * Use a fake percpu inet socket to avoid false sharing and contention. |
1478 | * structure to pass arguments. | ||
1479 | */ | 1469 | */ |
1480 | void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, | 1470 | static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { |
1481 | const struct ip_reply_arg *arg, unsigned int len) | 1471 | .sk = { |
1472 | .__sk_common = { | ||
1473 | .skc_refcnt = ATOMIC_INIT(1), | ||
1474 | }, | ||
1475 | .sk_wmem_alloc = ATOMIC_INIT(1), | ||
1476 | .sk_allocation = GFP_ATOMIC, | ||
1477 | .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), | ||
1478 | }, | ||
1479 | .pmtudisc = IP_PMTUDISC_WANT, | ||
1480 | .uc_ttl = -1, | ||
1481 | }; | ||
1482 | |||
1483 | void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, | ||
1484 | __be32 saddr, const struct ip_reply_arg *arg, | ||
1485 | unsigned int len) | ||
1482 | { | 1486 | { |
1483 | struct inet_sock *inet = inet_sk(sk); | ||
1484 | struct ip_options_data replyopts; | 1487 | struct ip_options_data replyopts; |
1485 | struct ipcm_cookie ipc; | 1488 | struct ipcm_cookie ipc; |
1486 | struct flowi4 fl4; | 1489 | struct flowi4 fl4; |
1487 | struct rtable *rt = skb_rtable(skb); | 1490 | struct rtable *rt = skb_rtable(skb); |
1491 | struct sk_buff *nskb; | ||
1492 | struct sock *sk; | ||
1493 | struct inet_sock *inet; | ||
1488 | 1494 | ||
1489 | if (ip_options_echo(&replyopts.opt.opt, skb)) | 1495 | if (ip_options_echo(&replyopts.opt.opt, skb)) |
1490 | return; | 1496 | return; |
@@ -1502,38 +1508,39 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, | |||
1502 | 1508 | ||
1503 | flowi4_init_output(&fl4, arg->bound_dev_if, 0, | 1509 | flowi4_init_output(&fl4, arg->bound_dev_if, 0, |
1504 | RT_TOS(arg->tos), | 1510 | RT_TOS(arg->tos), |
1505 | RT_SCOPE_UNIVERSE, sk->sk_protocol, | 1511 | RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, |
1506 | ip_reply_arg_flowi_flags(arg), | 1512 | ip_reply_arg_flowi_flags(arg), |
1507 | daddr, rt->rt_spec_dst, | 1513 | daddr, saddr, |
1508 | tcp_hdr(skb)->source, tcp_hdr(skb)->dest); | 1514 | tcp_hdr(skb)->source, tcp_hdr(skb)->dest); |
1509 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); | 1515 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
1510 | rt = ip_route_output_key(sock_net(sk), &fl4); | 1516 | rt = ip_route_output_key(net, &fl4); |
1511 | if (IS_ERR(rt)) | 1517 | if (IS_ERR(rt)) |
1512 | return; | 1518 | return; |
1513 | 1519 | ||
1514 | /* And let IP do all the hard work. | 1520 | inet = &get_cpu_var(unicast_sock); |
1515 | 1521 | ||
1516 | This chunk is not reenterable, hence spinlock. | ||
1517 | Note that it uses the fact, that this function is called | ||
1518 | with locally disabled BH and that sk cannot be already spinlocked. | ||
1519 | */ | ||
1520 | bh_lock_sock(sk); | ||
1521 | inet->tos = arg->tos; | 1522 | inet->tos = arg->tos; |
1523 | sk = &inet->sk; | ||
1522 | sk->sk_priority = skb->priority; | 1524 | sk->sk_priority = skb->priority; |
1523 | sk->sk_protocol = ip_hdr(skb)->protocol; | 1525 | sk->sk_protocol = ip_hdr(skb)->protocol; |
1524 | sk->sk_bound_dev_if = arg->bound_dev_if; | 1526 | sk->sk_bound_dev_if = arg->bound_dev_if; |
1527 | sock_net_set(sk, net); | ||
1528 | __skb_queue_head_init(&sk->sk_write_queue); | ||
1529 | sk->sk_sndbuf = sysctl_wmem_default; | ||
1525 | ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, | 1530 | ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, |
1526 | &ipc, &rt, MSG_DONTWAIT); | 1531 | &ipc, &rt, MSG_DONTWAIT); |
1527 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { | 1532 | nskb = skb_peek(&sk->sk_write_queue); |
1533 | if (nskb) { | ||
1528 | if (arg->csumoffset >= 0) | 1534 | if (arg->csumoffset >= 0) |
1529 | *((__sum16 *)skb_transport_header(skb) + | 1535 | *((__sum16 *)skb_transport_header(nskb) + |
1530 | arg->csumoffset) = csum_fold(csum_add(skb->csum, | 1536 | arg->csumoffset) = csum_fold(csum_add(nskb->csum, |
1531 | arg->csum)); | 1537 | arg->csum)); |
1532 | skb->ip_summed = CHECKSUM_NONE; | 1538 | nskb->ip_summed = CHECKSUM_NONE; |
1539 | skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); | ||
1533 | ip_push_pending_frames(sk, &fl4); | 1540 | ip_push_pending_frames(sk, &fl4); |
1534 | } | 1541 | } |
1535 | 1542 | ||
1536 | bh_unlock_sock(sk); | 1543 | put_cpu_var(unicast_sock); |
1537 | 1544 | ||
1538 | ip_rt_put(rt); | 1545 | ip_rt_put(rt); |
1539 | } | 1546 | } |
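ip_send_reply() becomes ip_send_unicast_reply(): instead of serialising on a shared control socket with bh_lock_sock(), every CPU owns a preinitialised fake inet socket (unicast_sock), and because rt_spec_dst is gone from the route the caller must now pass the reply source address explicitly. A hedged sketch of a call site, loosely patterned on the TCP reset/ack path that uses this helper (that code is outside this section; rep stands in for an already-built reply header):

    /* sketch: sending a prebuilt TCP reply from softirq context */
    struct net *net = dev_net(skb_dst(skb)->dev);
    struct tcphdr rep;                      /* assume rep was filled in (flags, ports, seq) */
    struct ip_reply_arg arg = {
            .iov[0].iov_base = &rep,
            .iov[0].iov_len  = sizeof(rep),
            .csumoffset      = offsetof(struct tcphdr, check) / 2,
    };

    /* reply goes back to the sender, sourced from the address they targeted */
    ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                          &arg, arg.iov[0].iov_len);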
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 0d11f234d615..5eea4a811042 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #if IS_ENABLED(CONFIG_IPV6) | 40 | #if IS_ENABLED(CONFIG_IPV6) |
41 | #include <net/transp_v6.h> | 41 | #include <net/transp_v6.h> |
42 | #endif | 42 | #endif |
43 | #include <net/ip_fib.h> | ||
43 | 44 | ||
44 | #include <linux/errqueue.h> | 45 | #include <linux/errqueue.h> |
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
@@ -1019,18 +1020,17 @@ e_inval: | |||
1019 | * @sk: socket | 1020 | * @sk: socket |
1020 | * @skb: buffer | 1021 | * @skb: buffer |
1021 | * | 1022 | * |
1022 | * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst | 1023 | * To support IP_CMSG_PKTINFO option, we store rt_iif and specific |
1023 | * in skb->cb[] before dst drop. | 1024 | * destination in skb->cb[] before dst drop. |
1024 | * This way, receiver doesn't make cache line misses to read rtable. | 1025 | * This way, receiver doesn't make cache line misses to read rtable.
1025 | */ | 1026 | */ |
1026 | void ipv4_pktinfo_prepare(struct sk_buff *skb) | 1027 | void ipv4_pktinfo_prepare(struct sk_buff *skb) |
1027 | { | 1028 | { |
1028 | struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); | 1029 | struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); |
1029 | const struct rtable *rt = skb_rtable(skb); | ||
1030 | 1030 | ||
1031 | if (rt) { | 1031 | if (skb_rtable(skb)) { |
1032 | pktinfo->ipi_ifindex = rt->rt_iif; | 1032 | pktinfo->ipi_ifindex = inet_iif(skb); |
1033 | pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; | 1033 | pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); |
1034 | } else { | 1034 | } else { |
1035 | pktinfo->ipi_ifindex = 0; | 1035 | pktinfo->ipi_ifindex = 0; |
1036 | pktinfo->ipi_spec_dst.s_addr = 0; | 1036 | pktinfo->ipi_spec_dst.s_addr = 0; |
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c new file mode 100644 index 000000000000..3511ffba7bd4 --- /dev/null +++ b/net/ipv4/ip_vti.c | |||
@@ -0,0 +1,956 @@ | |||
1 | /* | ||
2 | * Linux NET3: IP/IP protocol decoder modified to support | ||
3 | * virtual tunnel interface | ||
4 | * | ||
5 | * Authors: | ||
6 | * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | This version of net/ipv4/ip_vti.c is a clone of net/ipv4/ipip.c | ||
17 | |||
18 | For comments look at net/ipv4/ip_gre.c --ANK | ||
19 | */ | ||
20 | |||
21 | |||
22 | #include <linux/capability.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <linux/skbuff.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/tcp.h> | ||
31 | #include <linux/udp.h> | ||
32 | #include <linux/if_arp.h> | ||
33 | #include <linux/mroute.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/netfilter_ipv4.h> | ||
36 | #include <linux/if_ether.h> | ||
37 | |||
38 | #include <net/sock.h> | ||
39 | #include <net/ip.h> | ||
40 | #include <net/icmp.h> | ||
41 | #include <net/ipip.h> | ||
42 | #include <net/inet_ecn.h> | ||
43 | #include <net/xfrm.h> | ||
44 | #include <net/net_namespace.h> | ||
45 | #include <net/netns/generic.h> | ||
46 | |||
47 | #define HASH_SIZE 16 | ||
48 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1)) | ||
49 | |||
50 | static struct rtnl_link_ops vti_link_ops __read_mostly; | ||
51 | |||
52 | static int vti_net_id __read_mostly; | ||
53 | struct vti_net { | ||
54 | struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; | ||
55 | struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; | ||
56 | struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; | ||
57 | struct ip_tunnel __rcu *tunnels_wc[1]; | ||
58 | struct ip_tunnel __rcu **tunnels[4]; | ||
59 | |||
60 | struct net_device *fb_tunnel_dev; | ||
61 | }; | ||
62 | |||
63 | static int vti_fb_tunnel_init(struct net_device *dev); | ||
64 | static int vti_tunnel_init(struct net_device *dev); | ||
65 | static void vti_tunnel_setup(struct net_device *dev); | ||
66 | static void vti_dev_free(struct net_device *dev); | ||
67 | static int vti_tunnel_bind_dev(struct net_device *dev); | ||
68 | |||
69 | /* Locking : hash tables are protected by RCU and RTNL */ | ||
70 | |||
71 | #define for_each_ip_tunnel_rcu(start) \ | ||
72 | for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) | ||
73 | |||
74 | /* often modified stats are per cpu, others are shared (netdev->stats) */ | ||
75 | struct pcpu_tstats { | ||
76 | u64 rx_packets; | ||
77 | u64 rx_bytes; | ||
78 | u64 tx_packets; | ||
79 | u64 tx_bytes; | ||
80 | struct u64_stats_sync syncp; | ||
81 | }; | ||
82 | |||
83 | #define VTI_XMIT(stats1, stats2) do { \ | ||
84 | int err; \ | ||
85 | int pkt_len = skb->len; \ | ||
86 | err = dst_output(skb); \ | ||
87 | if (net_xmit_eval(err) == 0) { \ | ||
88 | u64_stats_update_begin(&(stats1)->syncp); \ | ||
89 | (stats1)->tx_bytes += pkt_len; \ | ||
90 | (stats1)->tx_packets++; \ | ||
91 | u64_stats_update_end(&(stats1)->syncp); \ | ||
92 | } else { \ | ||
93 | (stats2)->tx_errors++; \ | ||
94 | (stats2)->tx_aborted_errors++; \ | ||
95 | } \ | ||
96 | } while (0) | ||
97 | |||
98 | |||
99 | static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev, | ||
100 | struct rtnl_link_stats64 *tot) | ||
101 | { | ||
102 | int i; | ||
103 | |||
104 | for_each_possible_cpu(i) { | ||
105 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | ||
106 | u64 rx_packets, rx_bytes, tx_packets, tx_bytes; | ||
107 | unsigned int start; | ||
108 | |||
109 | do { | ||
110 | start = u64_stats_fetch_begin_bh(&tstats->syncp); | ||
111 | rx_packets = tstats->rx_packets; | ||
112 | tx_packets = tstats->tx_packets; | ||
113 | rx_bytes = tstats->rx_bytes; | ||
114 | tx_bytes = tstats->tx_bytes; | ||
115 | } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); | ||
116 | |||
117 | tot->rx_packets += rx_packets; | ||
118 | tot->tx_packets += tx_packets; | ||
119 | tot->rx_bytes += rx_bytes; | ||
120 | tot->tx_bytes += tx_bytes; | ||
121 | } | ||
122 | |||
123 | tot->multicast = dev->stats.multicast; | ||
124 | tot->rx_crc_errors = dev->stats.rx_crc_errors; | ||
125 | tot->rx_fifo_errors = dev->stats.rx_fifo_errors; | ||
126 | tot->rx_length_errors = dev->stats.rx_length_errors; | ||
127 | tot->rx_errors = dev->stats.rx_errors; | ||
128 | tot->tx_fifo_errors = dev->stats.tx_fifo_errors; | ||
129 | tot->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
130 | tot->tx_dropped = dev->stats.tx_dropped; | ||
131 | tot->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
132 | tot->tx_errors = dev->stats.tx_errors; | ||
133 | |||
134 | return tot; | ||
135 | } | ||
136 | |||
137 | static struct ip_tunnel *vti_tunnel_lookup(struct net *net, | ||
138 | __be32 remote, __be32 local) | ||
139 | { | ||
140 | unsigned h0 = HASH(remote); | ||
141 | unsigned h1 = HASH(local); | ||
142 | struct ip_tunnel *t; | ||
143 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
144 | |||
145 | for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) | ||
146 | if (local == t->parms.iph.saddr && | ||
147 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
148 | return t; | ||
149 | for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) | ||
150 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
151 | return t; | ||
152 | |||
153 | for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) | ||
154 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | ||
155 | return t; | ||
156 | |||
157 | for_each_ip_tunnel_rcu(ipn->tunnels_wc[0]) | ||
158 | if (t && (t->dev->flags&IFF_UP)) | ||
159 | return t; | ||
160 | return NULL; | ||
161 | } | ||
162 | |||
163 | static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn, | ||
164 | struct ip_tunnel_parm *parms) | ||
165 | { | ||
166 | __be32 remote = parms->iph.daddr; | ||
167 | __be32 local = parms->iph.saddr; | ||
168 | unsigned h = 0; | ||
169 | int prio = 0; | ||
170 | |||
171 | if (remote) { | ||
172 | prio |= 2; | ||
173 | h ^= HASH(remote); | ||
174 | } | ||
175 | if (local) { | ||
176 | prio |= 1; | ||
177 | h ^= HASH(local); | ||
178 | } | ||
179 | return &ipn->tunnels[prio][h]; | ||
180 | } | ||
181 | |||
182 | static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn, | ||
183 | struct ip_tunnel *t) | ||
184 | { | ||
185 | return __vti_bucket(ipn, &t->parms); | ||
186 | } | ||
187 | |||
188 | static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t) | ||
189 | { | ||
190 | struct ip_tunnel __rcu **tp; | ||
191 | struct ip_tunnel *iter; | ||
192 | |||
193 | for (tp = vti_bucket(ipn, t); | ||
194 | (iter = rtnl_dereference(*tp)) != NULL; | ||
195 | tp = &iter->next) { | ||
196 | if (t == iter) { | ||
197 | rcu_assign_pointer(*tp, t->next); | ||
198 | break; | ||
199 | } | ||
200 | } | ||
201 | } | ||
202 | |||
203 | static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t) | ||
204 | { | ||
205 | struct ip_tunnel __rcu **tp = vti_bucket(ipn, t); | ||
206 | |||
207 | rcu_assign_pointer(t->next, rtnl_dereference(*tp)); | ||
208 | rcu_assign_pointer(*tp, t); | ||
209 | } | ||
210 | |||
211 | static struct ip_tunnel *vti_tunnel_locate(struct net *net, | ||
212 | struct ip_tunnel_parm *parms, | ||
213 | int create) | ||
214 | { | ||
215 | __be32 remote = parms->iph.daddr; | ||
216 | __be32 local = parms->iph.saddr; | ||
217 | struct ip_tunnel *t, *nt; | ||
218 | struct ip_tunnel __rcu **tp; | ||
219 | struct net_device *dev; | ||
220 | char name[IFNAMSIZ]; | ||
221 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
222 | |||
223 | for (tp = __vti_bucket(ipn, parms); | ||
224 | (t = rtnl_dereference(*tp)) != NULL; | ||
225 | tp = &t->next) { | ||
226 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | ||
227 | return t; | ||
228 | } | ||
229 | if (!create) | ||
230 | return NULL; | ||
231 | |||
232 | if (parms->name[0]) | ||
233 | strlcpy(name, parms->name, IFNAMSIZ); | ||
234 | else | ||
235 | strcpy(name, "vti%d"); | ||
236 | |||
237 | dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup); | ||
238 | if (dev == NULL) | ||
239 | return NULL; | ||
240 | |||
241 | dev_net_set(dev, net); | ||
242 | |||
243 | nt = netdev_priv(dev); | ||
244 | nt->parms = *parms; | ||
245 | dev->rtnl_link_ops = &vti_link_ops; | ||
246 | |||
247 | vti_tunnel_bind_dev(dev); | ||
248 | |||
249 | if (register_netdevice(dev) < 0) | ||
250 | goto failed_free; | ||
251 | |||
252 | dev_hold(dev); | ||
253 | vti_tunnel_link(ipn, nt); | ||
254 | return nt; | ||
255 | |||
256 | failed_free: | ||
257 | free_netdev(dev); | ||
258 | return NULL; | ||
259 | } | ||
260 | |||
261 | static void vti_tunnel_uninit(struct net_device *dev) | ||
262 | { | ||
263 | struct net *net = dev_net(dev); | ||
264 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
265 | |||
266 | vti_tunnel_unlink(ipn, netdev_priv(dev)); | ||
267 | dev_put(dev); | ||
268 | } | ||
269 | |||
270 | static int vti_err(struct sk_buff *skb, u32 info) | ||
271 | { | ||
272 | |||
273 | /* All the routers (except for Linux) return only | ||
274 | * 8 bytes of packet payload. This means that precise relaying of | ||
275 | * ICMP in the real Internet is absolutely infeasible. | ||
276 | */ | ||
277 | struct iphdr *iph = (struct iphdr *)skb->data; | ||
278 | const int type = icmp_hdr(skb)->type; | ||
279 | const int code = icmp_hdr(skb)->code; | ||
280 | struct ip_tunnel *t; | ||
281 | int err; | ||
282 | |||
283 | switch (type) { | ||
284 | default: | ||
285 | case ICMP_PARAMETERPROB: | ||
286 | return 0; | ||
287 | |||
288 | case ICMP_DEST_UNREACH: | ||
289 | switch (code) { | ||
290 | case ICMP_SR_FAILED: | ||
291 | case ICMP_PORT_UNREACH: | ||
292 | /* Impossible event. */ | ||
293 | return 0; | ||
294 | default: | ||
295 | /* All others are translated to HOST_UNREACH. */ | ||
296 | break; | ||
297 | } | ||
298 | break; | ||
299 | case ICMP_TIME_EXCEEDED: | ||
300 | if (code != ICMP_EXC_TTL) | ||
301 | return 0; | ||
302 | break; | ||
303 | } | ||
304 | |||
305 | err = -ENOENT; | ||
306 | |||
307 | rcu_read_lock(); | ||
308 | t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | ||
309 | if (t == NULL) | ||
310 | goto out; | ||
311 | |||
312 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | ||
313 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | ||
314 | t->parms.link, 0, IPPROTO_IPIP, 0); | ||
315 | err = 0; | ||
316 | goto out; | ||
317 | } | ||
318 | |||
319 | err = 0; | ||
320 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) | ||
321 | goto out; | ||
322 | |||
323 | if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) | ||
324 | t->err_count++; | ||
325 | else | ||
326 | t->err_count = 1; | ||
327 | t->err_time = jiffies; | ||
328 | out: | ||
329 | rcu_read_unlock(); | ||
330 | return err; | ||
331 | } | ||
332 | |||
333 | /* We don't digest the packet, therefore let the packet pass */ | ||
334 | static int vti_rcv(struct sk_buff *skb) | ||
335 | { | ||
336 | struct ip_tunnel *tunnel; | ||
337 | const struct iphdr *iph = ip_hdr(skb); | ||
338 | |||
339 | rcu_read_lock(); | ||
340 | tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); | ||
341 | if (tunnel != NULL) { | ||
342 | struct pcpu_tstats *tstats; | ||
343 | |||
344 | tstats = this_cpu_ptr(tunnel->dev->tstats); | ||
345 | u64_stats_update_begin(&tstats->syncp); | ||
346 | tstats->rx_packets++; | ||
347 | tstats->rx_bytes += skb->len; | ||
348 | u64_stats_update_end(&tstats->syncp); | ||
349 | |||
350 | skb->dev = tunnel->dev; | ||
351 | rcu_read_unlock(); | ||
352 | return 1; | ||
353 | } | ||
354 | rcu_read_unlock(); | ||
355 | |||
356 | return -1; | ||
357 | } | ||
358 | |||
359 | /* This function assumes it is being called from dev_queue_xmit() | ||
360 | * and that skb is filled properly by that function. | ||
361 | */ | ||
362 | |||
363 | static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | ||
364 | { | ||
365 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
366 | struct pcpu_tstats *tstats; | ||
367 | struct iphdr *tiph = &tunnel->parms.iph; | ||
368 | u8 tos; | ||
369 | struct rtable *rt; /* Route to the other host */ | ||
370 | struct net_device *tdev; /* Device to other host */ | ||
371 | struct iphdr *old_iph = ip_hdr(skb); | ||
372 | __be32 dst = tiph->daddr; | ||
373 | struct flowi4 fl4; | ||
374 | |||
375 | if (skb->protocol != htons(ETH_P_IP)) | ||
376 | goto tx_error; | ||
377 | |||
378 | tos = old_iph->tos; | ||
379 | |||
380 | memset(&fl4, 0, sizeof(fl4)); | ||
381 | flowi4_init_output(&fl4, tunnel->parms.link, | ||
382 | htonl(tunnel->parms.i_key), RT_TOS(tos), | ||
383 | RT_SCOPE_UNIVERSE, | ||
384 | IPPROTO_IPIP, 0, | ||
385 | dst, tiph->saddr, 0, 0); | ||
386 | rt = ip_route_output_key(dev_net(dev), &fl4); | ||
387 | if (IS_ERR(rt)) { | ||
388 | dev->stats.tx_carrier_errors++; | ||
389 | goto tx_error_icmp; | ||
390 | } | ||
391 | /* If there is no transform attached to the route, or the xfrm is | ||
392 | * not in tunnel mode, then this tunnel is not functional. | ||
393 | */ | ||
394 | if (!rt->dst.xfrm || | ||
395 | rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { | ||
396 | dev->stats.tx_carrier_errors++; | ||
397 | goto tx_error_icmp; | ||
398 | } | ||
399 | tdev = rt->dst.dev; | ||
400 | |||
401 | if (tdev == dev) { | ||
402 | ip_rt_put(rt); | ||
403 | dev->stats.collisions++; | ||
404 | goto tx_error; | ||
405 | } | ||
406 | |||
407 | if (tunnel->err_count > 0) { | ||
408 | if (time_before(jiffies, | ||
409 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { | ||
410 | tunnel->err_count--; | ||
411 | dst_link_failure(skb); | ||
412 | } else | ||
413 | tunnel->err_count = 0; | ||
414 | } | ||
415 | |||
416 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | ||
417 | IPSKB_REROUTED); | ||
418 | skb_dst_drop(skb); | ||
419 | skb_dst_set(skb, &rt->dst); | ||
420 | nf_reset(skb); | ||
421 | skb->dev = skb_dst(skb)->dev; | ||
422 | |||
423 | tstats = this_cpu_ptr(dev->tstats); | ||
424 | VTI_XMIT(tstats, &dev->stats); | ||
425 | return NETDEV_TX_OK; | ||
426 | |||
427 | tx_error_icmp: | ||
428 | dst_link_failure(skb); | ||
429 | tx_error: | ||
430 | dev->stats.tx_errors++; | ||
431 | dev_kfree_skb(skb); | ||
432 | return NETDEV_TX_OK; | ||
433 | } | ||
434 | |||
435 | static int vti_tunnel_bind_dev(struct net_device *dev) | ||
436 | { | ||
437 | struct net_device *tdev = NULL; | ||
438 | struct ip_tunnel *tunnel; | ||
439 | struct iphdr *iph; | ||
440 | |||
441 | tunnel = netdev_priv(dev); | ||
442 | iph = &tunnel->parms.iph; | ||
443 | |||
444 | if (iph->daddr) { | ||
445 | struct rtable *rt; | ||
446 | struct flowi4 fl4; | ||
447 | memset(&fl4, 0, sizeof(fl4)); | ||
448 | flowi4_init_output(&fl4, tunnel->parms.link, | ||
449 | htonl(tunnel->parms.i_key), | ||
450 | RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, | ||
451 | IPPROTO_IPIP, 0, | ||
452 | iph->daddr, iph->saddr, 0, 0); | ||
453 | rt = ip_route_output_key(dev_net(dev), &fl4); | ||
454 | if (!IS_ERR(rt)) { | ||
455 | tdev = rt->dst.dev; | ||
456 | ip_rt_put(rt); | ||
457 | } | ||
458 | dev->flags |= IFF_POINTOPOINT; | ||
459 | } | ||
460 | |||
461 | if (!tdev && tunnel->parms.link) | ||
462 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | ||
463 | |||
464 | if (tdev) { | ||
465 | dev->hard_header_len = tdev->hard_header_len + | ||
466 | sizeof(struct iphdr); | ||
467 | dev->mtu = tdev->mtu; | ||
468 | } | ||
469 | dev->iflink = tunnel->parms.link; | ||
470 | return dev->mtu; | ||
471 | } | ||
472 | |||
473 | static int | ||
474 | vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) | ||
475 | { | ||
476 | int err = 0; | ||
477 | struct ip_tunnel_parm p; | ||
478 | struct ip_tunnel *t; | ||
479 | struct net *net = dev_net(dev); | ||
480 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
481 | |||
482 | switch (cmd) { | ||
483 | case SIOCGETTUNNEL: | ||
484 | t = NULL; | ||
485 | if (dev == ipn->fb_tunnel_dev) { | ||
486 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
487 | sizeof(p))) { | ||
488 | err = -EFAULT; | ||
489 | break; | ||
490 | } | ||
491 | t = vti_tunnel_locate(net, &p, 0); | ||
492 | } | ||
493 | if (t == NULL) | ||
494 | t = netdev_priv(dev); | ||
495 | memcpy(&p, &t->parms, sizeof(p)); | ||
496 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
497 | p.o_flags |= GRE_KEY; | ||
498 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | ||
499 | err = -EFAULT; | ||
500 | break; | ||
501 | |||
502 | case SIOCADDTUNNEL: | ||
503 | case SIOCCHGTUNNEL: | ||
504 | err = -EPERM; | ||
505 | if (!capable(CAP_NET_ADMIN)) | ||
506 | goto done; | ||
507 | |||
508 | err = -EFAULT; | ||
509 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | ||
510 | goto done; | ||
511 | |||
512 | err = -EINVAL; | ||
513 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || | ||
514 | p.iph.ihl != 5) | ||
515 | goto done; | ||
516 | |||
517 | t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | ||
518 | |||
519 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | ||
520 | if (t != NULL) { | ||
521 | if (t->dev != dev) { | ||
522 | err = -EEXIST; | ||
523 | break; | ||
524 | } | ||
525 | } else { | ||
526 | if (((dev->flags&IFF_POINTOPOINT) && | ||
527 | !p.iph.daddr) || | ||
528 | (!(dev->flags&IFF_POINTOPOINT) && | ||
529 | p.iph.daddr)) { | ||
530 | err = -EINVAL; | ||
531 | break; | ||
532 | } | ||
533 | t = netdev_priv(dev); | ||
534 | vti_tunnel_unlink(ipn, t); | ||
535 | synchronize_net(); | ||
536 | t->parms.iph.saddr = p.iph.saddr; | ||
537 | t->parms.iph.daddr = p.iph.daddr; | ||
538 | t->parms.i_key = p.i_key; | ||
539 | t->parms.o_key = p.o_key; | ||
540 | t->parms.iph.protocol = IPPROTO_IPIP; | ||
541 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
542 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
543 | vti_tunnel_link(ipn, t); | ||
544 | netdev_state_change(dev); | ||
545 | } | ||
546 | } | ||
547 | |||
548 | if (t) { | ||
549 | err = 0; | ||
550 | if (cmd == SIOCCHGTUNNEL) { | ||
551 | t->parms.i_key = p.i_key; | ||
552 | t->parms.o_key = p.o_key; | ||
553 | if (t->parms.link != p.link) { | ||
554 | t->parms.link = p.link; | ||
555 | vti_tunnel_bind_dev(dev); | ||
556 | netdev_state_change(dev); | ||
557 | } | ||
558 | } | ||
559 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
560 | p.o_flags |= GRE_KEY; | ||
561 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, | ||
562 | sizeof(p))) | ||
563 | err = -EFAULT; | ||
564 | } else | ||
565 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | ||
566 | break; | ||
567 | |||
568 | case SIOCDELTUNNEL: | ||
569 | err = -EPERM; | ||
570 | if (!capable(CAP_NET_ADMIN)) | ||
571 | goto done; | ||
572 | |||
573 | if (dev == ipn->fb_tunnel_dev) { | ||
574 | err = -EFAULT; | ||
575 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
576 | sizeof(p))) | ||
577 | goto done; | ||
578 | err = -ENOENT; | ||
579 | |||
580 | t = vti_tunnel_locate(net, &p, 0); | ||
581 | if (t == NULL) | ||
582 | goto done; | ||
583 | err = -EPERM; | ||
584 | if (t->dev == ipn->fb_tunnel_dev) | ||
585 | goto done; | ||
586 | dev = t->dev; | ||
587 | } | ||
588 | unregister_netdevice(dev); | ||
589 | err = 0; | ||
590 | break; | ||
591 | |||
592 | default: | ||
593 | err = -EINVAL; | ||
594 | } | ||
595 | |||
596 | done: | ||
597 | return err; | ||
598 | } | ||
599 | |||
600 | static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu) | ||
601 | { | ||
602 | if (new_mtu < 68 || new_mtu > 0xFFF8) | ||
603 | return -EINVAL; | ||
604 | dev->mtu = new_mtu; | ||
605 | return 0; | ||
606 | } | ||
607 | |||
608 | static const struct net_device_ops vti_netdev_ops = { | ||
609 | .ndo_init = vti_tunnel_init, | ||
610 | .ndo_uninit = vti_tunnel_uninit, | ||
611 | .ndo_start_xmit = vti_tunnel_xmit, | ||
612 | .ndo_do_ioctl = vti_tunnel_ioctl, | ||
613 | .ndo_change_mtu = vti_tunnel_change_mtu, | ||
614 | .ndo_get_stats64 = vti_get_stats64, | ||
615 | }; | ||
616 | |||
617 | static void vti_dev_free(struct net_device *dev) | ||
618 | { | ||
619 | free_percpu(dev->tstats); | ||
620 | free_netdev(dev); | ||
621 | } | ||
622 | |||
623 | static void vti_tunnel_setup(struct net_device *dev) | ||
624 | { | ||
625 | dev->netdev_ops = &vti_netdev_ops; | ||
626 | dev->destructor = vti_dev_free; | ||
627 | |||
628 | dev->type = ARPHRD_TUNNEL; | ||
629 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | ||
630 | dev->mtu = ETH_DATA_LEN; | ||
631 | dev->flags = IFF_NOARP; | ||
632 | dev->iflink = 0; | ||
633 | dev->addr_len = 4; | ||
634 | dev->features |= NETIF_F_NETNS_LOCAL; | ||
635 | dev->features |= NETIF_F_LLTX; | ||
636 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | ||
637 | } | ||
638 | |||
639 | static int vti_tunnel_init(struct net_device *dev) | ||
640 | { | ||
641 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
642 | |||
643 | tunnel->dev = dev; | ||
644 | strcpy(tunnel->parms.name, dev->name); | ||
645 | |||
646 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | ||
647 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | ||
648 | |||
649 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
650 | if (!dev->tstats) | ||
651 | return -ENOMEM; | ||
652 | |||
653 | return 0; | ||
654 | } | ||
655 | |||
656 | static int __net_init vti_fb_tunnel_init(struct net_device *dev) | ||
657 | { | ||
658 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
659 | struct iphdr *iph = &tunnel->parms.iph; | ||
660 | struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id); | ||
661 | |||
662 | tunnel->dev = dev; | ||
663 | strcpy(tunnel->parms.name, dev->name); | ||
664 | |||
665 | iph->version = 4; | ||
666 | iph->protocol = IPPROTO_IPIP; | ||
667 | iph->ihl = 5; | ||
668 | |||
669 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
670 | if (!dev->tstats) | ||
671 | return -ENOMEM; | ||
672 | |||
673 | dev_hold(dev); | ||
674 | rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); | ||
675 | return 0; | ||
676 | } | ||
677 | |||
678 | static struct xfrm_tunnel vti_handler __read_mostly = { | ||
679 | .handler = vti_rcv, | ||
680 | .err_handler = vti_err, | ||
681 | .priority = 1, | ||
682 | }; | ||
683 | |||
684 | static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head) | ||
685 | { | ||
686 | int prio; | ||
687 | |||
688 | for (prio = 1; prio < 4; prio++) { | ||
689 | int h; | ||
690 | for (h = 0; h < HASH_SIZE; h++) { | ||
691 | struct ip_tunnel *t; | ||
692 | |||
693 | t = rtnl_dereference(ipn->tunnels[prio][h]); | ||
694 | while (t != NULL) { | ||
695 | unregister_netdevice_queue(t->dev, head); | ||
696 | t = rtnl_dereference(t->next); | ||
697 | } | ||
698 | } | ||
699 | } | ||
700 | } | ||
701 | |||
702 | static int __net_init vti_init_net(struct net *net) | ||
703 | { | ||
704 | int err; | ||
705 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
706 | |||
707 | ipn->tunnels[0] = ipn->tunnels_wc; | ||
708 | ipn->tunnels[1] = ipn->tunnels_l; | ||
709 | ipn->tunnels[2] = ipn->tunnels_r; | ||
710 | ipn->tunnels[3] = ipn->tunnels_r_l; | ||
711 | |||
712 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), | ||
713 | "ip_vti0", | ||
714 | vti_tunnel_setup); | ||
715 | if (!ipn->fb_tunnel_dev) { | ||
716 | err = -ENOMEM; | ||
717 | goto err_alloc_dev; | ||
718 | } | ||
719 | dev_net_set(ipn->fb_tunnel_dev, net); | ||
720 | |||
721 | err = vti_fb_tunnel_init(ipn->fb_tunnel_dev); | ||
722 | if (err) | ||
723 | goto err_reg_dev; | ||
724 | ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops; | ||
725 | |||
726 | err = register_netdev(ipn->fb_tunnel_dev); | ||
727 | if (err) | ||
728 | goto err_reg_dev; | ||
729 | return 0; | ||
730 | |||
731 | err_reg_dev: | ||
732 | vti_dev_free(ipn->fb_tunnel_dev); | ||
733 | err_alloc_dev: | ||
734 | /* nothing */ | ||
735 | return err; | ||
736 | } | ||
737 | |||
738 | static void __net_exit vti_exit_net(struct net *net) | ||
739 | { | ||
740 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
741 | LIST_HEAD(list); | ||
742 | |||
743 | rtnl_lock(); | ||
744 | vti_destroy_tunnels(ipn, &list); | ||
745 | unregister_netdevice_many(&list); | ||
746 | rtnl_unlock(); | ||
747 | } | ||
748 | |||
749 | static struct pernet_operations vti_net_ops = { | ||
750 | .init = vti_init_net, | ||
751 | .exit = vti_exit_net, | ||
752 | .id = &vti_net_id, | ||
753 | .size = sizeof(struct vti_net), | ||
754 | }; | ||
755 | |||
756 | static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) | ||
757 | { | ||
758 | return 0; | ||
759 | } | ||
760 | |||
761 | static void vti_netlink_parms(struct nlattr *data[], | ||
762 | struct ip_tunnel_parm *parms) | ||
763 | { | ||
764 | memset(parms, 0, sizeof(*parms)); | ||
765 | |||
766 | parms->iph.protocol = IPPROTO_IPIP; | ||
767 | |||
768 | if (!data) | ||
769 | return; | ||
770 | |||
771 | if (data[IFLA_VTI_LINK]) | ||
772 | parms->link = nla_get_u32(data[IFLA_VTI_LINK]); | ||
773 | |||
774 | if (data[IFLA_VTI_IKEY]) | ||
775 | parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); | ||
776 | |||
777 | if (data[IFLA_VTI_OKEY]) | ||
778 | parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); | ||
779 | |||
780 | if (data[IFLA_VTI_LOCAL]) | ||
781 | parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); | ||
782 | |||
783 | if (data[IFLA_VTI_REMOTE]) | ||
784 | parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); | ||
785 | |||
786 | } | ||
787 | |||
788 | static int vti_newlink(struct net *src_net, struct net_device *dev, | ||
789 | struct nlattr *tb[], struct nlattr *data[]) | ||
790 | { | ||
791 | struct ip_tunnel *nt; | ||
792 | struct net *net = dev_net(dev); | ||
793 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
794 | int mtu; | ||
795 | int err; | ||
796 | |||
797 | nt = netdev_priv(dev); | ||
798 | vti_netlink_parms(data, &nt->parms); | ||
799 | |||
800 | if (vti_tunnel_locate(net, &nt->parms, 0)) | ||
801 | return -EEXIST; | ||
802 | |||
803 | mtu = vti_tunnel_bind_dev(dev); | ||
804 | if (!tb[IFLA_MTU]) | ||
805 | dev->mtu = mtu; | ||
806 | |||
807 | err = register_netdevice(dev); | ||
808 | if (err) | ||
809 | goto out; | ||
810 | |||
811 | dev_hold(dev); | ||
812 | vti_tunnel_link(ipn, nt); | ||
813 | |||
814 | out: | ||
815 | return err; | ||
816 | } | ||
817 | |||
818 | static int vti_changelink(struct net_device *dev, struct nlattr *tb[], | ||
819 | struct nlattr *data[]) | ||
820 | { | ||
821 | struct ip_tunnel *t, *nt; | ||
822 | struct net *net = dev_net(dev); | ||
823 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
824 | struct ip_tunnel_parm p; | ||
825 | int mtu; | ||
826 | |||
827 | if (dev == ipn->fb_tunnel_dev) | ||
828 | return -EINVAL; | ||
829 | |||
830 | nt = netdev_priv(dev); | ||
831 | vti_netlink_parms(data, &p); | ||
832 | |||
833 | t = vti_tunnel_locate(net, &p, 0); | ||
834 | |||
835 | if (t) { | ||
836 | if (t->dev != dev) | ||
837 | return -EEXIST; | ||
838 | } else { | ||
839 | t = nt; | ||
840 | |||
841 | vti_tunnel_unlink(ipn, t); | ||
842 | t->parms.iph.saddr = p.iph.saddr; | ||
843 | t->parms.iph.daddr = p.iph.daddr; | ||
844 | t->parms.i_key = p.i_key; | ||
845 | t->parms.o_key = p.o_key; | ||
846 | if (dev->type != ARPHRD_ETHER) { | ||
847 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
848 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
849 | } | ||
850 | vti_tunnel_link(ipn, t); | ||
851 | netdev_state_change(dev); | ||
852 | } | ||
853 | |||
854 | if (t->parms.link != p.link) { | ||
855 | t->parms.link = p.link; | ||
856 | mtu = vti_tunnel_bind_dev(dev); | ||
857 | if (!tb[IFLA_MTU]) | ||
858 | dev->mtu = mtu; | ||
859 | netdev_state_change(dev); | ||
860 | } | ||
861 | |||
862 | return 0; | ||
863 | } | ||
864 | |||
865 | static size_t vti_get_size(const struct net_device *dev) | ||
866 | { | ||
867 | return | ||
868 | /* IFLA_VTI_LINK */ | ||
869 | nla_total_size(4) + | ||
870 | /* IFLA_VTI_IKEY */ | ||
871 | nla_total_size(4) + | ||
872 | /* IFLA_VTI_OKEY */ | ||
873 | nla_total_size(4) + | ||
874 | /* IFLA_VTI_LOCAL */ | ||
875 | nla_total_size(4) + | ||
876 | /* IFLA_VTI_REMOTE */ | ||
877 | nla_total_size(4) + | ||
878 | 0; | ||
879 | } | ||
880 | |||
881 | static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) | ||
882 | { | ||
883 | struct ip_tunnel *t = netdev_priv(dev); | ||
884 | struct ip_tunnel_parm *p = &t->parms; | ||
885 | |||
886 | nla_put_u32(skb, IFLA_VTI_LINK, p->link); | ||
887 | nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); | ||
888 | nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); | ||
889 | nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); | ||
890 | nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); | ||
891 | |||
892 | return 0; | ||
893 | } | ||
894 | |||
895 | static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { | ||
896 | [IFLA_VTI_LINK] = { .type = NLA_U32 }, | ||
897 | [IFLA_VTI_IKEY] = { .type = NLA_U32 }, | ||
898 | [IFLA_VTI_OKEY] = { .type = NLA_U32 }, | ||
899 | [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, | ||
900 | [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, | ||
901 | }; | ||
902 | |||
903 | static struct rtnl_link_ops vti_link_ops __read_mostly = { | ||
904 | .kind = "vti", | ||
905 | .maxtype = IFLA_VTI_MAX, | ||
906 | .policy = vti_policy, | ||
907 | .priv_size = sizeof(struct ip_tunnel), | ||
908 | .setup = vti_tunnel_setup, | ||
909 | .validate = vti_tunnel_validate, | ||
910 | .newlink = vti_newlink, | ||
911 | .changelink = vti_changelink, | ||
912 | .get_size = vti_get_size, | ||
913 | .fill_info = vti_fill_info, | ||
914 | }; | ||
915 | |||
916 | static int __init vti_init(void) | ||
917 | { | ||
918 | int err; | ||
919 | |||
920 | pr_info("IPv4 over IPSec tunneling driver\n"); | ||
921 | |||
922 | err = register_pernet_device(&vti_net_ops); | ||
923 | if (err < 0) | ||
924 | return err; | ||
925 | err = xfrm4_mode_tunnel_input_register(&vti_handler); | ||
926 | if (err < 0) { | ||
927 | unregister_pernet_device(&vti_net_ops); | ||
928 | pr_info("vti init: can't register tunnel\n"); | ||
929 | } | ||
930 | |||
931 | err = rtnl_link_register(&vti_link_ops); | ||
932 | if (err < 0) | ||
933 | goto rtnl_link_failed; | ||
934 | |||
935 | return err; | ||
936 | |||
937 | rtnl_link_failed: | ||
938 | xfrm4_mode_tunnel_input_deregister(&vti_handler); | ||
939 | unregister_pernet_device(&vti_net_ops); | ||
940 | return err; | ||
941 | } | ||
942 | |||
943 | static void __exit vti_fini(void) | ||
944 | { | ||
945 | rtnl_link_unregister(&vti_link_ops); | ||
946 | if (xfrm4_mode_tunnel_input_deregister(&vti_handler)) | ||
947 | pr_info("vti close: can't deregister tunnel\n"); | ||
948 | |||
949 | unregister_pernet_device(&vti_net_ops); | ||
950 | } | ||
951 | |||
952 | module_init(vti_init); | ||
953 | module_exit(vti_fini); | ||
954 | MODULE_LICENSE("GPL"); | ||
955 | MODULE_ALIAS_RTNL_LINK("vti"); | ||
956 | MODULE_ALIAS_NETDEV("ip_vti0"); | ||
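Once CONFIG_NET_IPVTI is enabled, a vti interface can be created through the same SIOCADDTUNNEL ioctl convention that vti_tunnel_ioctl() above implements (or via the "vti" rtnl_link_ops). A rough userspace sketch, run as root, with placeholder addresses and key values (not taken from this patch):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/ip.h>
#include <linux/if_tunnel.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct ip_tunnel_parm p;
	struct ifreq ifr;

	memset(&p, 0, sizeof(p));
	strncpy(p.name, "vti1", IFNAMSIZ);
	p.iph.version = 4;		/* fields checked by vti_tunnel_ioctl() */
	p.iph.ihl = 5;
	p.iph.protocol = IPPROTO_IPIP;
	p.i_key = htonl(10);		/* placeholder keys used for the xfrm lookup */
	p.o_key = htonl(10);
	inet_pton(AF_INET, "192.0.2.1", &p.iph.saddr);	/* placeholder endpoints */
	inet_pton(AF_INET, "198.51.100.1", &p.iph.daddr);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "ip_vti0", IFNAMSIZ);	/* fallback device registered above */
	ifr.ifr_data = (void *)&p;

	if (ioctl(fd, SIOCADDTUNNEL, &ifr) < 0) {
		perror("SIOCADDTUNNEL");
		return 1;
	}
	return 0;
}

Note that, per vti_tunnel_xmit(), traffic is only forwarded when the route to the tunnel endpoint carries an xfrm state in tunnel mode; without a matching IPsec policy the device only accumulates tx_carrier_errors.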
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 63b64c45a826..d3ab47e19a89 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -31,17 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
31 | struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); | 31 | struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); |
32 | struct xfrm_state *x; | 32 | struct xfrm_state *x; |
33 | 33 | ||
34 | if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || | 34 | switch (icmp_hdr(skb)->type) { |
35 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | 35 | case ICMP_DEST_UNREACH: |
36 | if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | ||
37 | return; | ||
38 | case ICMP_REDIRECT: | ||
39 | break; | ||
40 | default: | ||
36 | return; | 41 | return; |
42 | } | ||
37 | 43 | ||
38 | spi = htonl(ntohs(ipch->cpi)); | 44 | spi = htonl(ntohs(ipch->cpi)); |
39 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, | 45 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
40 | spi, IPPROTO_COMP, AF_INET); | 46 | spi, IPPROTO_COMP, AF_INET); |
41 | if (!x) | 47 | if (!x) |
42 | return; | 48 | return; |
43 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", | 49 | |
44 | spi, &iph->daddr); | 50 | if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) |
51 | ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); | ||
52 | else | ||
53 | ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); | ||
45 | xfrm_state_put(x); | 54 | xfrm_state_put(x); |
46 | } | 55 | } |
47 | 56 | ||
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2d0f99bf61b3..99af1f0cc658 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -348,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
348 | case ICMP_PORT_UNREACH: | 348 | case ICMP_PORT_UNREACH: |
349 | /* Impossible event. */ | 349 | /* Impossible event. */ |
350 | return 0; | 350 | return 0; |
351 | case ICMP_FRAG_NEEDED: | ||
352 | /* Soft state for pmtu is maintained by IP core. */ | ||
353 | return 0; | ||
354 | default: | 351 | default: |
355 | /* All others are translated to HOST_UNREACH. | 352 | /* All others are translated to HOST_UNREACH. |
356 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 353 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
@@ -363,13 +360,32 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
363 | if (code != ICMP_EXC_TTL) | 360 | if (code != ICMP_EXC_TTL) |
364 | return 0; | 361 | return 0; |
365 | break; | 362 | break; |
363 | case ICMP_REDIRECT: | ||
364 | break; | ||
366 | } | 365 | } |
367 | 366 | ||
368 | err = -ENOENT; | 367 | err = -ENOENT; |
369 | 368 | ||
370 | rcu_read_lock(); | 369 | rcu_read_lock(); |
371 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | 370 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); |
372 | if (t == NULL || t->parms.iph.daddr == 0) | 371 | if (t == NULL) |
372 | goto out; | ||
373 | |||
374 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | ||
375 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | ||
376 | t->dev->ifindex, 0, IPPROTO_IPIP, 0); | ||
377 | err = 0; | ||
378 | goto out; | ||
379 | } | ||
380 | |||
381 | if (type == ICMP_REDIRECT) { | ||
382 | ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, | ||
383 | IPPROTO_IPIP, 0); | ||
384 | err = 0; | ||
385 | goto out; | ||
386 | } | ||
387 | |||
388 | if (t->parms.iph.daddr == 0) | ||
373 | goto out; | 389 | goto out; |
374 | 390 | ||
375 | err = 0; | 391 | err = 0; |
@@ -471,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
471 | dev->stats.tx_fifo_errors++; | 487 | dev->stats.tx_fifo_errors++; |
472 | goto tx_error; | 488 | goto tx_error; |
473 | } | 489 | } |
474 | dst = rt->rt_gateway; | 490 | dst = rt_nexthop(rt, old_iph->daddr); |
475 | } | 491 | } |
476 | 492 | ||
477 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, | 493 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, |
@@ -503,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
503 | } | 519 | } |
504 | 520 | ||
505 | if (skb_dst(skb)) | 521 | if (skb_dst(skb)) |
506 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); | 522 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); |
507 | 523 | ||
508 | if ((old_iph->frag_off & htons(IP_DF)) && | 524 | if ((old_iph->frag_off & htons(IP_DF)) && |
509 | mtu < ntohs(old_iph->tot_len)) { | 525 | mtu < ntohs(old_iph->tot_len)) { |
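For reference, the extra NULL/skb arguments match what appears to be the updated dst_ops->update_pmtu() signature (destination, socket, skb, new MTU). A guarded call site under that assumption would be:

#include <net/dst.h>

/* Sketch only: assumes update_pmtu now takes (dst, sk, skb, mtu). */
static inline void tunnel_update_pmtu(struct sk_buff *skb, u32 mtu)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst->ops->update_pmtu)
		dst->ops->update_pmtu(dst, NULL, skb, mtu);
}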
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c94bbc6f2ba3..8eec8f4a0536 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -524,8 +524,8 @@ failure: | |||
524 | } | 524 | } |
525 | #endif | 525 | #endif |
526 | 526 | ||
527 | /* | 527 | /** |
528 | * Delete a VIF entry | 528 | * vif_delete - Delete a VIF entry |
529 | * @notify: Set to 1, if the caller is a notifier_call | 529 | * @notify: Set to 1, if the caller is a notifier_call |
530 | */ | 530 | */ |
531 | 531 | ||
@@ -1795,9 +1795,12 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) | |||
1795 | .daddr = iph->daddr, | 1795 | .daddr = iph->daddr, |
1796 | .saddr = iph->saddr, | 1796 | .saddr = iph->saddr, |
1797 | .flowi4_tos = RT_TOS(iph->tos), | 1797 | .flowi4_tos = RT_TOS(iph->tos), |
1798 | .flowi4_oif = rt->rt_oif, | 1798 | .flowi4_oif = (rt_is_output_route(rt) ? |
1799 | .flowi4_iif = rt->rt_iif, | 1799 | skb->dev->ifindex : 0), |
1800 | .flowi4_mark = rt->rt_mark, | 1800 | .flowi4_iif = (rt_is_output_route(rt) ? |
1801 | net->loopback_dev->ifindex : | ||
1802 | skb->dev->ifindex), | ||
1803 | .flowi4_mark = skb->mark, | ||
1801 | }; | 1804 | }; |
1802 | struct mr_table *mrt; | 1805 | struct mr_table *mrt; |
1803 | int err; | 1806 | int err; |
@@ -2006,37 +2009,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | |||
2006 | { | 2009 | { |
2007 | int ct; | 2010 | int ct; |
2008 | struct rtnexthop *nhp; | 2011 | struct rtnexthop *nhp; |
2009 | u8 *b = skb_tail_pointer(skb); | 2012 | struct nlattr *mp_attr; |
2010 | struct rtattr *mp_head; | ||
2011 | 2013 | ||
2012 | /* If cache is unresolved, don't try to parse IIF and OIF */ | 2014 | /* If cache is unresolved, don't try to parse IIF and OIF */ |
2013 | if (c->mfc_parent >= MAXVIFS) | 2015 | if (c->mfc_parent >= MAXVIFS) |
2014 | return -ENOENT; | 2016 | return -ENOENT; |
2015 | 2017 | ||
2016 | if (VIF_EXISTS(mrt, c->mfc_parent)) | 2018 | if (VIF_EXISTS(mrt, c->mfc_parent) && |
2017 | RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); | 2019 | nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) |
2020 | return -EMSGSIZE; | ||
2018 | 2021 | ||
2019 | mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); | 2022 | if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) |
2023 | return -EMSGSIZE; | ||
2020 | 2024 | ||
2021 | for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { | 2025 | for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { |
2022 | if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { | 2026 | if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { |
2023 | if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) | 2027 | if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { |
2024 | goto rtattr_failure; | 2028 | nla_nest_cancel(skb, mp_attr); |
2025 | nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); | 2029 | return -EMSGSIZE; |
2030 | } | ||
2031 | |||
2026 | nhp->rtnh_flags = 0; | 2032 | nhp->rtnh_flags = 0; |
2027 | nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; | 2033 | nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; |
2028 | nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; | 2034 | nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; |
2029 | nhp->rtnh_len = sizeof(*nhp); | 2035 | nhp->rtnh_len = sizeof(*nhp); |
2030 | } | 2036 | } |
2031 | } | 2037 | } |
2032 | mp_head->rta_type = RTA_MULTIPATH; | 2038 | |
2033 | mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; | 2039 | nla_nest_end(skb, mp_attr); |
2040 | |||
2034 | rtm->rtm_type = RTN_MULTICAST; | 2041 | rtm->rtm_type = RTN_MULTICAST; |
2035 | return 1; | 2042 | return 1; |
2036 | |||
2037 | rtattr_failure: | ||
2038 | nlmsg_trim(skb, b); | ||
2039 | return -EMSGSIZE; | ||
2040 | } | 2043 | } |
2041 | 2044 | ||
2042 | int ipmr_get_route(struct net *net, struct sk_buff *skb, | 2045 | int ipmr_get_route(struct net *net, struct sk_buff *skb, |
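The __ipmr_fill_mroute() conversion above swaps the old RTA_PUT()/rtattr_failure idiom for the nlattr nesting helpers. As a generic illustration of that pattern (hypothetical function and attribute parameters, not code from this patch):

#include <net/netlink.h>

static int fill_nested(struct sk_buff *skb, int nest_type, int attr_type, u32 value)
{
	struct nlattr *nest = nla_nest_start(skb, nest_type);

	if (!nest)
		return -EMSGSIZE;
	if (nla_put_u32(skb, attr_type, value) < 0) {
		nla_nest_cancel(skb, nest);	/* trims everything added since start */
		return -EMSGSIZE;
	}
	nla_nest_end(skb, nest);		/* patches in the final nest length */
	return 0;
}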
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 2f210c79dc87..cbb6a1a6f6f7 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
@@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
52 | struct nf_nat_ipv4_range newrange; | 52 | struct nf_nat_ipv4_range newrange; |
53 | const struct nf_nat_ipv4_multi_range_compat *mr; | 53 | const struct nf_nat_ipv4_multi_range_compat *mr; |
54 | const struct rtable *rt; | 54 | const struct rtable *rt; |
55 | __be32 newsrc; | 55 | __be32 newsrc, nh; |
56 | 56 | ||
57 | NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); | 57 | NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); |
58 | 58 | ||
@@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
70 | 70 | ||
71 | mr = par->targinfo; | 71 | mr = par->targinfo; |
72 | rt = skb_rtable(skb); | 72 | rt = skb_rtable(skb); |
73 | newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); | 73 | nh = rt_nexthop(rt, ip_hdr(skb)->daddr); |
74 | newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); | ||
74 | if (!newsrc) { | 75 | if (!newsrc) { |
75 | pr_info("%s ate my IP address\n", par->out->name); | 76 | pr_info("%s ate my IP address\n", par->out->name); |
76 | return NF_DROP; | 77 | return NF_DROP; |
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index ba5756d20165..1109f7f6c254 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c | |||
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
196 | 196 | ||
197 | pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); | 197 | pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); |
198 | 198 | ||
199 | /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ | 199 | nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, |
200 | nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, | 200 | sizeof(*pm)+copy_len, 0); |
201 | sizeof(*pm)+copy_len); | 201 | if (!nlh) { |
202 | pr_debug("error during nlmsg_put\n"); | ||
203 | goto out_unlock; | ||
204 | } | ||
202 | ub->qlen++; | 205 | ub->qlen++; |
203 | 206 | ||
204 | pm = NLMSG_DATA(nlh); | 207 | pm = nlmsg_data(nlh); |
205 | 208 | ||
206 | /* We might not have a timestamp, get one */ | 209 | /* We might not have a timestamp, get one */ |
207 | if (skb->tstamp.tv64 == 0) | 210 | if (skb->tstamp.tv64 == 0) |
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
261 | nlh->nlmsg_type = NLMSG_DONE; | 264 | nlh->nlmsg_type = NLMSG_DONE; |
262 | ulog_send(groupnum); | 265 | ulog_send(groupnum); |
263 | } | 266 | } |
264 | 267 | out_unlock: | |
265 | spin_unlock_bh(&ulog_lock); | 268 | spin_unlock_bh(&ulog_lock); |
266 | 269 | ||
267 | return; | 270 | return; |
268 | 271 | ||
269 | nlmsg_failure: | ||
270 | pr_debug("error during NLMSG_PUT\n"); | ||
271 | alloc_failure: | 272 | alloc_failure: |
272 | pr_debug("Error building netlink message\n"); | 273 | pr_debug("Error building netlink message\n"); |
273 | spin_unlock_bh(&ulog_lock); | 274 | spin_unlock_bh(&ulog_lock); |
@@ -380,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { | |||
380 | static int __init ulog_tg_init(void) | 381 | static int __init ulog_tg_init(void) |
381 | { | 382 | { |
382 | int ret, i; | 383 | int ret, i; |
384 | struct netlink_kernel_cfg cfg = { | ||
385 | .groups = ULOG_MAXNLGROUPS, | ||
386 | }; | ||
383 | 387 | ||
384 | pr_debug("init module\n"); | 388 | pr_debug("init module\n"); |
385 | 389 | ||
@@ -392,9 +396,8 @@ static int __init ulog_tg_init(void) | |||
392 | for (i = 0; i < ULOG_MAXNLGROUPS; i++) | 396 | for (i = 0; i < ULOG_MAXNLGROUPS; i++) |
393 | setup_timer(&ulog_buffers[i].timer, ulog_timer, i); | 397 | setup_timer(&ulog_buffers[i].timer, ulog_timer, i); |
394 | 398 | ||
395 | nflognl = netlink_kernel_create(&init_net, | 399 | nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, |
396 | NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, | 400 | THIS_MODULE, &cfg); |
397 | NULL, THIS_MODULE); | ||
398 | if (!nflognl) | 401 | if (!nflognl) |
399 | return -ENOMEM; | 402 | return -ENOMEM; |
400 | 403 | ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 91747d4ebc26..e7ff2dcab6ce 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -95,11 +95,11 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, | |||
95 | return NF_ACCEPT; | 95 | return NF_ACCEPT; |
96 | } | 96 | } |
97 | 97 | ||
98 | static unsigned int ipv4_confirm(unsigned int hooknum, | 98 | static unsigned int ipv4_helper(unsigned int hooknum, |
99 | struct sk_buff *skb, | 99 | struct sk_buff *skb, |
100 | const struct net_device *in, | 100 | const struct net_device *in, |
101 | const struct net_device *out, | 101 | const struct net_device *out, |
102 | int (*okfn)(struct sk_buff *)) | 102 | int (*okfn)(struct sk_buff *)) |
103 | { | 103 | { |
104 | struct nf_conn *ct; | 104 | struct nf_conn *ct; |
105 | enum ip_conntrack_info ctinfo; | 105 | enum ip_conntrack_info ctinfo; |
@@ -110,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum, | |||
110 | /* This is where we call the helper: as the packet goes out. */ | 110 | /* This is where we call the helper: as the packet goes out. */ |
111 | ct = nf_ct_get(skb, &ctinfo); | 111 | ct = nf_ct_get(skb, &ctinfo); |
112 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) | 112 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) |
113 | goto out; | 113 | return NF_ACCEPT; |
114 | 114 | ||
115 | help = nfct_help(ct); | 115 | help = nfct_help(ct); |
116 | if (!help) | 116 | if (!help) |
117 | goto out; | 117 | return NF_ACCEPT; |
118 | 118 | ||
119 | /* rcu_read_lock()ed by nf_hook_slow */ | 119 | /* rcu_read_lock()ed by nf_hook_slow */ |
120 | helper = rcu_dereference(help->helper); | 120 | helper = rcu_dereference(help->helper); |
121 | if (!helper) | 121 | if (!helper) |
122 | goto out; | 122 | return NF_ACCEPT; |
123 | 123 | ||
124 | ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), | 124 | ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), |
125 | ct, ctinfo); | 125 | ct, ctinfo); |
126 | if (ret != NF_ACCEPT) { | 126 | if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { |
127 | nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, | 127 | nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, |
128 | "nf_ct_%s: dropping packet", helper->name); | 128 | "nf_ct_%s: dropping packet", helper->name); |
129 | return ret; | ||
130 | } | 129 | } |
130 | return ret; | ||
131 | } | ||
132 | |||
133 | static unsigned int ipv4_confirm(unsigned int hooknum, | ||
134 | struct sk_buff *skb, | ||
135 | const struct net_device *in, | ||
136 | const struct net_device *out, | ||
137 | int (*okfn)(struct sk_buff *)) | ||
138 | { | ||
139 | struct nf_conn *ct; | ||
140 | enum ip_conntrack_info ctinfo; | ||
141 | |||
142 | ct = nf_ct_get(skb, &ctinfo); | ||
143 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) | ||
144 | goto out; | ||
131 | 145 | ||
132 | /* adjust seqs for loopback traffic only in outgoing direction */ | 146 | /* adjust seqs for loopback traffic only in outgoing direction */ |
133 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && | 147 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && |
@@ -185,6 +199,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | |||
185 | .priority = NF_IP_PRI_CONNTRACK, | 199 | .priority = NF_IP_PRI_CONNTRACK, |
186 | }, | 200 | }, |
187 | { | 201 | { |
202 | .hook = ipv4_helper, | ||
203 | .owner = THIS_MODULE, | ||
204 | .pf = NFPROTO_IPV4, | ||
205 | .hooknum = NF_INET_POST_ROUTING, | ||
206 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
207 | }, | ||
208 | { | ||
188 | .hook = ipv4_confirm, | 209 | .hook = ipv4_confirm, |
189 | .owner = THIS_MODULE, | 210 | .owner = THIS_MODULE, |
190 | .pf = NFPROTO_IPV4, | 211 | .pf = NFPROTO_IPV4, |
@@ -192,6 +213,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | |||
192 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, | 213 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, |
193 | }, | 214 | }, |
194 | { | 215 | { |
216 | .hook = ipv4_helper, | ||
217 | .owner = THIS_MODULE, | ||
218 | .pf = NFPROTO_IPV4, | ||
219 | .hooknum = NF_INET_LOCAL_IN, | ||
220 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
221 | }, | ||
222 | { | ||
195 | .hook = ipv4_confirm, | 223 | .hook = ipv4_confirm, |
196 | .owner = THIS_MODULE, | 224 | .owner = THIS_MODULE, |
197 | .pf = NFPROTO_IPV4, | 225 | .pf = NFPROTO_IPV4, |
@@ -207,35 +235,30 @@ static int log_invalid_proto_max = 255; | |||
207 | static ctl_table ip_ct_sysctl_table[] = { | 235 | static ctl_table ip_ct_sysctl_table[] = { |
208 | { | 236 | { |
209 | .procname = "ip_conntrack_max", | 237 | .procname = "ip_conntrack_max", |
210 | .data = &nf_conntrack_max, | ||
211 | .maxlen = sizeof(int), | 238 | .maxlen = sizeof(int), |
212 | .mode = 0644, | 239 | .mode = 0644, |
213 | .proc_handler = proc_dointvec, | 240 | .proc_handler = proc_dointvec, |
214 | }, | 241 | }, |
215 | { | 242 | { |
216 | .procname = "ip_conntrack_count", | 243 | .procname = "ip_conntrack_count", |
217 | .data = &init_net.ct.count, | ||
218 | .maxlen = sizeof(int), | 244 | .maxlen = sizeof(int), |
219 | .mode = 0444, | 245 | .mode = 0444, |
220 | .proc_handler = proc_dointvec, | 246 | .proc_handler = proc_dointvec, |
221 | }, | 247 | }, |
222 | { | 248 | { |
223 | .procname = "ip_conntrack_buckets", | 249 | .procname = "ip_conntrack_buckets", |
224 | .data = &init_net.ct.htable_size, | ||
225 | .maxlen = sizeof(unsigned int), | 250 | .maxlen = sizeof(unsigned int), |
226 | .mode = 0444, | 251 | .mode = 0444, |
227 | .proc_handler = proc_dointvec, | 252 | .proc_handler = proc_dointvec, |
228 | }, | 253 | }, |
229 | { | 254 | { |
230 | .procname = "ip_conntrack_checksum", | 255 | .procname = "ip_conntrack_checksum", |
231 | .data = &init_net.ct.sysctl_checksum, | ||
232 | .maxlen = sizeof(int), | 256 | .maxlen = sizeof(int), |
233 | .mode = 0644, | 257 | .mode = 0644, |
234 | .proc_handler = proc_dointvec, | 258 | .proc_handler = proc_dointvec, |
235 | }, | 259 | }, |
236 | { | 260 | { |
237 | .procname = "ip_conntrack_log_invalid", | 261 | .procname = "ip_conntrack_log_invalid", |
238 | .data = &init_net.ct.sysctl_log_invalid, | ||
239 | .maxlen = sizeof(unsigned int), | 262 | .maxlen = sizeof(unsigned int), |
240 | .mode = 0644, | 263 | .mode = 0644, |
241 | .proc_handler = proc_dointvec_minmax, | 264 | .proc_handler = proc_dointvec_minmax, |
@@ -351,6 +374,25 @@ static struct nf_sockopt_ops so_getorigdst = { | |||
351 | .owner = THIS_MODULE, | 374 | .owner = THIS_MODULE, |
352 | }; | 375 | }; |
353 | 376 | ||
377 | static int ipv4_init_net(struct net *net) | ||
378 | { | ||
379 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | ||
380 | struct nf_ip_net *in = &net->ct.nf_ct_proto; | ||
381 | in->ctl_table = kmemdup(ip_ct_sysctl_table, | ||
382 | sizeof(ip_ct_sysctl_table), | ||
383 | GFP_KERNEL); | ||
384 | if (!in->ctl_table) | ||
385 | return -ENOMEM; | ||
386 | |||
387 | in->ctl_table[0].data = &nf_conntrack_max; | ||
388 | in->ctl_table[1].data = &net->ct.count; | ||
389 | in->ctl_table[2].data = &net->ct.htable_size; | ||
390 | in->ctl_table[3].data = &net->ct.sysctl_checksum; | ||
391 | in->ctl_table[4].data = &net->ct.sysctl_log_invalid; | ||
392 | #endif | ||
393 | return 0; | ||
394 | } | ||
395 | |||
354 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { | 396 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { |
355 | .l3proto = PF_INET, | 397 | .l3proto = PF_INET, |
356 | .name = "ipv4", | 398 | .name = "ipv4", |
@@ -366,8 +408,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { | |||
366 | #endif | 408 | #endif |
367 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | 409 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) |
368 | .ctl_table_path = "net/ipv4/netfilter", | 410 | .ctl_table_path = "net/ipv4/netfilter", |
369 | .ctl_table = ip_ct_sysctl_table, | ||
370 | #endif | 411 | #endif |
412 | .init_net = ipv4_init_net, | ||
371 | .me = THIS_MODULE, | 413 | .me = THIS_MODULE, |
372 | }; | 414 | }; |
373 | 415 | ||
@@ -378,6 +420,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); | |||
378 | MODULE_ALIAS("ip_conntrack"); | 420 | MODULE_ALIAS("ip_conntrack"); |
379 | MODULE_LICENSE("GPL"); | 421 | MODULE_LICENSE("GPL"); |
380 | 422 | ||
423 | static int ipv4_net_init(struct net *net) | ||
424 | { | ||
425 | int ret = 0; | ||
426 | |||
427 | ret = nf_conntrack_l4proto_register(net, | ||
428 | &nf_conntrack_l4proto_tcp4); | ||
429 | if (ret < 0) { | ||
430 | pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n"); | ||
431 | goto out_tcp; | ||
432 | } | ||
433 | ret = nf_conntrack_l4proto_register(net, | ||
434 | &nf_conntrack_l4proto_udp4); | ||
435 | if (ret < 0) { | ||
436 | pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n"); | ||
437 | goto out_udp; | ||
438 | } | ||
439 | ret = nf_conntrack_l4proto_register(net, | ||
440 | &nf_conntrack_l4proto_icmp); | ||
441 | if (ret < 0) { | ||
442 | pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n"); | ||
443 | goto out_icmp; | ||
444 | } | ||
445 | ret = nf_conntrack_l3proto_register(net, | ||
446 | &nf_conntrack_l3proto_ipv4); | ||
447 | if (ret < 0) { | ||
448 | pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n"); | ||
449 | goto out_ipv4; | ||
450 | } | ||
451 | return 0; | ||
452 | out_ipv4: | ||
453 | nf_conntrack_l4proto_unregister(net, | ||
454 | &nf_conntrack_l4proto_icmp); | ||
455 | out_icmp: | ||
456 | nf_conntrack_l4proto_unregister(net, | ||
457 | &nf_conntrack_l4proto_udp4); | ||
458 | out_udp: | ||
459 | nf_conntrack_l4proto_unregister(net, | ||
460 | &nf_conntrack_l4proto_tcp4); | ||
461 | out_tcp: | ||
462 | return ret; | ||
463 | } | ||
464 | |||
465 | static void ipv4_net_exit(struct net *net) | ||
466 | { | ||
467 | nf_conntrack_l3proto_unregister(net, | ||
468 | &nf_conntrack_l3proto_ipv4); | ||
469 | nf_conntrack_l4proto_unregister(net, | ||
470 | &nf_conntrack_l4proto_icmp); | ||
471 | nf_conntrack_l4proto_unregister(net, | ||
472 | &nf_conntrack_l4proto_udp4); | ||
473 | nf_conntrack_l4proto_unregister(net, | ||
474 | &nf_conntrack_l4proto_tcp4); | ||
475 | } | ||
476 | |||
477 | static struct pernet_operations ipv4_net_ops = { | ||
478 | .init = ipv4_net_init, | ||
479 | .exit = ipv4_net_exit, | ||
480 | }; | ||
481 | |||
381 | static int __init nf_conntrack_l3proto_ipv4_init(void) | 482 | static int __init nf_conntrack_l3proto_ipv4_init(void) |
382 | { | 483 | { |
383 | int ret = 0; | 484 | int ret = 0; |
@@ -391,35 +492,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) | |||
391 | return ret; | 492 | return ret; |
392 | } | 493 | } |
393 | 494 | ||
394 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); | 495 | ret = register_pernet_subsys(&ipv4_net_ops); |
395 | if (ret < 0) { | 496 | if (ret < 0) { |
396 | pr_err("nf_conntrack_ipv4: can't register tcp.\n"); | 497 | pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); |
397 | goto cleanup_sockopt; | 498 | goto cleanup_sockopt; |
398 | } | 499 | } |
399 | 500 | ||
400 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); | ||
401 | if (ret < 0) { | ||
402 | pr_err("nf_conntrack_ipv4: can't register udp.\n"); | ||
403 | goto cleanup_tcp; | ||
404 | } | ||
405 | |||
406 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); | ||
407 | if (ret < 0) { | ||
408 | pr_err("nf_conntrack_ipv4: can't register icmp.\n"); | ||
409 | goto cleanup_udp; | ||
410 | } | ||
411 | |||
412 | ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); | ||
413 | if (ret < 0) { | ||
414 | pr_err("nf_conntrack_ipv4: can't register ipv4\n"); | ||
415 | goto cleanup_icmp; | ||
416 | } | ||
417 | |||
418 | ret = nf_register_hooks(ipv4_conntrack_ops, | 501 | ret = nf_register_hooks(ipv4_conntrack_ops, |
419 | ARRAY_SIZE(ipv4_conntrack_ops)); | 502 | ARRAY_SIZE(ipv4_conntrack_ops)); |
420 | if (ret < 0) { | 503 | if (ret < 0) { |
421 | pr_err("nf_conntrack_ipv4: can't register hooks.\n"); | 504 | pr_err("nf_conntrack_ipv4: can't register hooks.\n"); |
422 | goto cleanup_ipv4; | 505 | goto cleanup_pernet; |
423 | } | 506 | } |
424 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | 507 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) |
425 | ret = nf_conntrack_ipv4_compat_init(); | 508 | ret = nf_conntrack_ipv4_compat_init(); |
@@ -431,14 +514,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) | |||
431 | cleanup_hooks: | 514 | cleanup_hooks: |
432 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); | 515 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); |
433 | #endif | 516 | #endif |
434 | cleanup_ipv4: | 517 | cleanup_pernet: |
435 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); | 518 | unregister_pernet_subsys(&ipv4_net_ops); |
436 | cleanup_icmp: | ||
437 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); | ||
438 | cleanup_udp: | ||
439 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); | ||
440 | cleanup_tcp: | ||
441 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); | ||
442 | cleanup_sockopt: | 519 | cleanup_sockopt: |
443 | nf_unregister_sockopt(&so_getorigdst); | 520 | nf_unregister_sockopt(&so_getorigdst); |
444 | return ret; | 521 | return ret; |
@@ -451,10 +528,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) | |||
451 | nf_conntrack_ipv4_compat_fini(); | 528 | nf_conntrack_ipv4_compat_fini(); |
452 | #endif | 529 | #endif |
453 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); | 530 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); |
454 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); | 531 | unregister_pernet_subsys(&ipv4_net_ops); |
455 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); | ||
456 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); | ||
457 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); | ||
458 | nf_unregister_sockopt(&so_getorigdst); | 532 | nf_unregister_sockopt(&so_getorigdst); |
459 | } | 533 | } |
460 | 534 | ||
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 0847e373d33c..5241d997ab75 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
@@ -23,6 +23,11 @@ | |||
23 | 23 | ||
24 | static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; | 24 | static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; |
25 | 25 | ||
26 | static inline struct nf_icmp_net *icmp_pernet(struct net *net) | ||
27 | { | ||
28 | return &net->ct.nf_ct_proto.icmp; | ||
29 | } | ||
30 | |||
26 | static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, | 31 | static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, |
27 | struct nf_conntrack_tuple *tuple) | 32 | struct nf_conntrack_tuple *tuple) |
28 | { | 33 | { |
@@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s, | |||
77 | 82 | ||
78 | static unsigned int *icmp_get_timeouts(struct net *net) | 83 | static unsigned int *icmp_get_timeouts(struct net *net) |
79 | { | 84 | { |
80 | return &nf_ct_icmp_timeout; | 85 | return &icmp_pernet(net)->timeout; |
81 | } | 86 | } |
82 | 87 | ||
83 | /* Returns verdict for packet, or -1 for invalid. */ | 88 | /* Returns verdict for packet, or -1 for invalid. */ |
@@ -274,16 +279,18 @@ static int icmp_nlattr_tuple_size(void) | |||
274 | #include <linux/netfilter/nfnetlink.h> | 279 | #include <linux/netfilter/nfnetlink.h> |
275 | #include <linux/netfilter/nfnetlink_cttimeout.h> | 280 | #include <linux/netfilter/nfnetlink_cttimeout.h> |
276 | 281 | ||
277 | static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) | 282 | static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], |
283 | struct net *net, void *data) | ||
278 | { | 284 | { |
279 | unsigned int *timeout = data; | 285 | unsigned int *timeout = data; |
286 | struct nf_icmp_net *in = icmp_pernet(net); | ||
280 | 287 | ||
281 | if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { | 288 | if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { |
282 | *timeout = | 289 | *timeout = |
283 | ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; | 290 | ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; |
284 | } else { | 291 | } else { |
285 | /* Set default ICMP timeout. */ | 292 | /* Set default ICMP timeout. */ |
286 | *timeout = nf_ct_icmp_timeout; | 293 | *timeout = in->timeout; |
287 | } | 294 | } |
288 | return 0; | 295 | return 0; |
289 | } | 296 | } |
@@ -308,11 +315,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { | |||
308 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ | 315 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ |
309 | 316 | ||
310 | #ifdef CONFIG_SYSCTL | 317 | #ifdef CONFIG_SYSCTL |
311 | static struct ctl_table_header *icmp_sysctl_header; | ||
312 | static struct ctl_table icmp_sysctl_table[] = { | 318 | static struct ctl_table icmp_sysctl_table[] = { |
313 | { | 319 | { |
314 | .procname = "nf_conntrack_icmp_timeout", | 320 | .procname = "nf_conntrack_icmp_timeout", |
315 | .data = &nf_ct_icmp_timeout, | ||
316 | .maxlen = sizeof(unsigned int), | 321 | .maxlen = sizeof(unsigned int), |
317 | .mode = 0644, | 322 | .mode = 0644, |
318 | .proc_handler = proc_dointvec_jiffies, | 323 | .proc_handler = proc_dointvec_jiffies, |
@@ -323,7 +328,6 @@ static struct ctl_table icmp_sysctl_table[] = { | |||
323 | static struct ctl_table icmp_compat_sysctl_table[] = { | 328 | static struct ctl_table icmp_compat_sysctl_table[] = { |
324 | { | 329 | { |
325 | .procname = "ip_conntrack_icmp_timeout", | 330 | .procname = "ip_conntrack_icmp_timeout", |
326 | .data = &nf_ct_icmp_timeout, | ||
327 | .maxlen = sizeof(unsigned int), | 331 | .maxlen = sizeof(unsigned int), |
328 | .mode = 0644, | 332 | .mode = 0644, |
329 | .proc_handler = proc_dointvec_jiffies, | 333 | .proc_handler = proc_dointvec_jiffies, |
@@ -333,6 +337,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = { | |||
333 | #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ | 337 | #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ |
334 | #endif /* CONFIG_SYSCTL */ | 338 | #endif /* CONFIG_SYSCTL */ |
335 | 339 | ||
340 | static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, | ||
341 | struct nf_icmp_net *in) | ||
342 | { | ||
343 | #ifdef CONFIG_SYSCTL | ||
344 | pn->ctl_table = kmemdup(icmp_sysctl_table, | ||
345 | sizeof(icmp_sysctl_table), | ||
346 | GFP_KERNEL); | ||
347 | if (!pn->ctl_table) | ||
348 | return -ENOMEM; | ||
349 | |||
350 | pn->ctl_table[0].data = &in->timeout; | ||
351 | #endif | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, | ||
356 | struct nf_icmp_net *in) | ||
357 | { | ||
358 | #ifdef CONFIG_SYSCTL | ||
359 | #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT | ||
360 | pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, | ||
361 | sizeof(icmp_compat_sysctl_table), | ||
362 | GFP_KERNEL); | ||
363 | if (!pn->ctl_compat_table) | ||
364 | return -ENOMEM; | ||
365 | |||
366 | pn->ctl_compat_table[0].data = &in->timeout; | ||
367 | #endif | ||
368 | #endif | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | static int icmp_init_net(struct net *net, u_int16_t proto) | ||
373 | { | ||
374 | int ret; | ||
375 | struct nf_icmp_net *in = icmp_pernet(net); | ||
376 | struct nf_proto_net *pn = &in->pn; | ||
377 | |||
378 | in->timeout = nf_ct_icmp_timeout; | ||
379 | |||
380 | ret = icmp_kmemdup_compat_sysctl_table(pn, in); | ||
381 | if (ret < 0) | ||
382 | return ret; | ||
383 | |||
384 | ret = icmp_kmemdup_sysctl_table(pn, in); | ||
385 | if (ret < 0) | ||
386 | nf_ct_kfree_compat_sysctl_table(pn); | ||
387 | |||
388 | return ret; | ||
389 | } | ||
390 | |||
391 | static struct nf_proto_net *icmp_get_net_proto(struct net *net) | ||
392 | { | ||
393 | return &net->ct.nf_ct_proto.icmp.pn; | ||
394 | } | ||
395 | |||
336 | struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = | 396 | struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = |
337 | { | 397 | { |
338 | .l3proto = PF_INET, | 398 | .l3proto = PF_INET, |
@@ -362,11 +422,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = | |||
362 | .nla_policy = icmp_timeout_nla_policy, | 422 | .nla_policy = icmp_timeout_nla_policy, |
363 | }, | 423 | }, |
364 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ | 424 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ |
365 | #ifdef CONFIG_SYSCTL | 425 | .init_net = icmp_init_net, |
366 | .ctl_table_header = &icmp_sysctl_header, | 426 | .get_net_proto = icmp_get_net_proto, |
367 | .ctl_table = icmp_sysctl_table, | ||
368 | #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT | ||
369 | .ctl_compat_table = icmp_compat_sysctl_table, | ||
370 | #endif | ||
371 | #endif | ||
372 | }; | 427 | }; |
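icmp_kmemdup_sysctl_table() above duplicates the static ctl_table template for each namespace and repoints the first entry's .data at that namespace's timeout, which is why the .data initializers are dropped from the shared templates. A hedged user-space analog of that "copy a template table, then patch its data pointers" pattern, using malloc/memcpy in place of kmemdup and hypothetical struct names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Minimal stand-in for struct ctl_table: a name plus a pointer to the
 * variable the entry controls. */
struct tbl_entry {
	const char   *procname;
	unsigned int *data;
};

/* Template with no .data: every instance gets its own copy pointed at
 * per-instance storage, mirroring the per-net sysctl duplication above. */
static const struct tbl_entry icmp_template[] = {
	{ "nf_conntrack_icmp_timeout", NULL },
	{ NULL, NULL }                         /* sentinel */
};

struct icmp_instance {
	unsigned int      timeout;
	struct tbl_entry *table;
};

static int icmp_instance_init(struct icmp_instance *in, unsigned int def)
{
	in->timeout = def;
	in->table = malloc(sizeof(icmp_template));
	if (!in->table)
		return -1;
	memcpy(in->table, icmp_template, sizeof(icmp_template));
	in->table[0].data = &in->timeout;      /* patch the copy, not the template */
	return 0;
}

int main(void)
{
	struct icmp_instance a, b;

	if (icmp_instance_init(&a, 30) || icmp_instance_init(&b, 60))
		return 1;
	*a.table[0].data = 99;                 /* writing via a's table only touches a */
	printf("a=%u b=%u\n", a.timeout, b.timeout);
	free(a.table);
	free(b.table);
	return 0;
}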
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 9bb1b8a37a22..742815518b0f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c | |||
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { | |||
94 | { | 94 | { |
95 | .hook = ipv4_conntrack_defrag, | 95 | .hook = ipv4_conntrack_defrag, |
96 | .owner = THIS_MODULE, | 96 | .owner = THIS_MODULE, |
97 | .pf = PF_INET, | 97 | .pf = NFPROTO_IPV4, |
98 | .hooknum = NF_INET_PRE_ROUTING, | 98 | .hooknum = NF_INET_PRE_ROUTING, |
99 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | 99 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, |
100 | }, | 100 | }, |
101 | { | 101 | { |
102 | .hook = ipv4_conntrack_defrag, | 102 | .hook = ipv4_conntrack_defrag, |
103 | .owner = THIS_MODULE, | 103 | .owner = THIS_MODULE, |
104 | .pf = PF_INET, | 104 | .pf = NFPROTO_IPV4, |
105 | .hooknum = NF_INET_LOCAL_OUT, | 105 | .hooknum = NF_INET_LOCAL_OUT, |
106 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | 106 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, |
107 | }, | 107 | }, |
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 7b22382ff0e9..3c04d24e2976 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c | |||
@@ -13,10 +13,10 @@ | |||
13 | #include <linux/skbuff.h> | 13 | #include <linux/skbuff.h> |
14 | #include <linux/udp.h> | 14 | #include <linux/udp.h> |
15 | 15 | ||
16 | #include <net/netfilter/nf_nat_helper.h> | ||
17 | #include <net/netfilter/nf_nat_rule.h> | ||
18 | #include <net/netfilter/nf_conntrack_helper.h> | 16 | #include <net/netfilter/nf_conntrack_helper.h> |
19 | #include <net/netfilter/nf_conntrack_expect.h> | 17 | #include <net/netfilter/nf_conntrack_expect.h> |
18 | #include <net/netfilter/nf_nat_helper.h> | ||
19 | #include <net/netfilter/nf_nat_rule.h> | ||
20 | #include <linux/netfilter/nf_conntrack_amanda.h> | 20 | #include <linux/netfilter/nf_conntrack_amanda.h> |
21 | 21 | ||
22 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); | 22 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); |
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index abb52adf5acd..44b082fd48ab 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c | |||
@@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = { | |||
691 | .expectfn = nf_nat_follow_master, | 691 | .expectfn = nf_nat_follow_master, |
692 | }; | 692 | }; |
693 | 693 | ||
694 | static struct nfq_ct_nat_hook nfq_ct_nat = { | ||
695 | .seq_adjust = nf_nat_tcp_seq_adjust, | ||
696 | }; | ||
697 | |||
694 | static int __init nf_nat_init(void) | 698 | static int __init nf_nat_init(void) |
695 | { | 699 | { |
696 | size_t i; | 700 | size_t i; |
@@ -731,6 +735,7 @@ static int __init nf_nat_init(void) | |||
731 | nfnetlink_parse_nat_setup); | 735 | nfnetlink_parse_nat_setup); |
732 | BUG_ON(nf_ct_nat_offset != NULL); | 736 | BUG_ON(nf_ct_nat_offset != NULL); |
733 | RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); | 737 | RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); |
738 | RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat); | ||
734 | return 0; | 739 | return 0; |
735 | 740 | ||
736 | cleanup_extend: | 741 | cleanup_extend: |
@@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void) | |||
747 | RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); | 752 | RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); |
748 | RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); | 753 | RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); |
749 | RCU_INIT_POINTER(nf_ct_nat_offset, NULL); | 754 | RCU_INIT_POINTER(nf_ct_nat_offset, NULL); |
755 | RCU_INIT_POINTER(nfq_ct_nat_hook, NULL); | ||
750 | synchronize_net(); | 756 | synchronize_net(); |
751 | } | 757 | } |
752 | 758 | ||
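nf_nat_init() above publishes a struct of callbacks (nfq_ct_nat) through an RCU-protected pointer, and nf_nat_cleanup() clears it and lets synchronize_net() run before anything the hook used can disappear. A hedged user-space sketch of that publish/clear idiom, with C11 atomics standing in for RCU and the grace period only noted in a comment:

#include <stdio.h>
#include <stdatomic.h>
#include <stddef.h>

/* Hypothetical ops struct modelled on nfq_ct_nat_hook: consumers call
 * through it only if a provider has been published. */
struct nat_hook {
	void (*seq_adjust)(int off);
};

static void demo_seq_adjust(int off)
{
	printf("adjusting sequence numbers by %d\n", off);
}

static struct nat_hook demo_hook = { .seq_adjust = demo_seq_adjust };

/* Stands in for the RCU-protected global hook pointer. */
static _Atomic(struct nat_hook *) nat_hook_ptr;

static void consumer(int off)
{
	/* Readers load the pointer once and tolerate it being NULL,
	 * just like callers of an optional kernel hook. */
	struct nat_hook *h = atomic_load_explicit(&nat_hook_ptr,
						  memory_order_acquire);
	if (h)
		h->seq_adjust(off);
	else
		puts("no NAT hook registered");
}

int main(void)
{
	consumer(4);                                   /* before registration */

	/* module init: publish the ops struct */
	atomic_store_explicit(&nat_hook_ptr, &demo_hook, memory_order_release);
	consumer(4);

	/* module exit: unpublish; the kernel would additionally wait for a
	 * grace period (synchronize_net) before freeing provider state. */
	atomic_store_explicit(&nat_hook_ptr, NULL, memory_order_release);
	consumer(4);
	return 0;
}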
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index cad29c121318..c6784a18c1c4 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c | |||
@@ -95,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, | |||
95 | unsigned char **data, | 95 | unsigned char **data, |
96 | TransportAddress *taddr, int count) | 96 | TransportAddress *taddr, int count) |
97 | { | 97 | { |
98 | const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 98 | const struct nf_ct_h323_master *info = nfct_help_data(ct); |
99 | int dir = CTINFO2DIR(ctinfo); | 99 | int dir = CTINFO2DIR(ctinfo); |
100 | int i; | 100 | int i; |
101 | __be16 port; | 101 | __be16 port; |
@@ -178,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, | |||
178 | struct nf_conntrack_expect *rtp_exp, | 178 | struct nf_conntrack_expect *rtp_exp, |
179 | struct nf_conntrack_expect *rtcp_exp) | 179 | struct nf_conntrack_expect *rtcp_exp) |
180 | { | 180 | { |
181 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 181 | struct nf_ct_h323_master *info = nfct_help_data(ct); |
182 | int dir = CTINFO2DIR(ctinfo); | 182 | int dir = CTINFO2DIR(ctinfo); |
183 | int i; | 183 | int i; |
184 | u_int16_t nated_port; | 184 | u_int16_t nated_port; |
@@ -330,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, | |||
330 | TransportAddress *taddr, __be16 port, | 330 | TransportAddress *taddr, __be16 port, |
331 | struct nf_conntrack_expect *exp) | 331 | struct nf_conntrack_expect *exp) |
332 | { | 332 | { |
333 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 333 | struct nf_ct_h323_master *info = nfct_help_data(ct); |
334 | int dir = CTINFO2DIR(ctinfo); | 334 | int dir = CTINFO2DIR(ctinfo); |
335 | u_int16_t nated_port = ntohs(port); | 335 | u_int16_t nated_port = ntohs(port); |
336 | 336 | ||
@@ -419,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, | |||
419 | unsigned char **data, TransportAddress *taddr, int idx, | 419 | unsigned char **data, TransportAddress *taddr, int idx, |
420 | __be16 port, struct nf_conntrack_expect *exp) | 420 | __be16 port, struct nf_conntrack_expect *exp) |
421 | { | 421 | { |
422 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 422 | struct nf_ct_h323_master *info = nfct_help_data(ct); |
423 | int dir = CTINFO2DIR(ctinfo); | 423 | int dir = CTINFO2DIR(ctinfo); |
424 | u_int16_t nated_port = ntohs(port); | 424 | u_int16_t nated_port = ntohs(port); |
425 | union nf_inet_addr addr; | 425 | union nf_inet_addr addr; |
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index af65958f6308..2e59ad0b90ca 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c | |||
@@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, | |||
153 | } | 153 | } |
154 | EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); | 154 | EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); |
155 | 155 | ||
156 | void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, | ||
157 | u32 ctinfo, int off) | ||
158 | { | ||
159 | const struct tcphdr *th; | ||
160 | |||
161 | if (nf_ct_protonum(ct) != IPPROTO_TCP) | ||
162 | return; | ||
163 | |||
164 | th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb)); | ||
165 | nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); | ||
166 | } | ||
167 | EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust); | ||
168 | |||
156 | static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, | 169 | static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, |
157 | int datalen, __sum16 *check, int oldlen) | 170 | int datalen, __sum16 *check, int oldlen) |
158 | { | 171 | { |
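The new nf_nat_tcp_seq_adjust() above finds the TCP header as skb_network_header() + ip_hdrlen() before handing th->seq to nf_nat_set_seq_adjust(); ip_hdrlen() is simply the IHL field times four. A hedged user-space sketch of that header-offset arithmetic on a raw IPv4 buffer, with a hand-rolled minimal packet instead of kernel structs:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Given a buffer that starts at the IPv4 header, return a pointer to the
 * TCP header: the IP header length is IHL (low nibble of byte 0) * 4. */
static const uint8_t *tcp_header(const uint8_t *ip, size_t len)
{
	size_t ihl = (ip[0] & 0x0f) * 4;

	if (len < ihl + 20 || (ip[0] >> 4) != 4 || ip[9] != 6 /* IPPROTO_TCP */)
		return NULL;
	return ip + ihl;
}

int main(void)
{
	/* Minimal fabricated packet: 20-byte IPv4 header + 20-byte TCP header. */
	uint8_t pkt[40] = { 0x45 };            /* version 4, IHL 5 */
	uint32_t seq;

	pkt[9] = 6;                            /* protocol = TCP */
	seq = htonl(0x12345678);               /* sequence number at TCP offset 4 */
	memcpy(pkt + 20 + 4, &seq, 4);

	const uint8_t *th = tcp_header(pkt, sizeof(pkt));
	if (th) {
		memcpy(&seq, th + 4, 4);
		printf("seq = 0x%08x\n", ntohl(seq));
	}
	return 0;
}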
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index c273d58980ae..388140881ebe 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c | |||
@@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct, | |||
49 | const struct nf_nat_pptp *nat_pptp_info; | 49 | const struct nf_nat_pptp *nat_pptp_info; |
50 | struct nf_nat_ipv4_range range; | 50 | struct nf_nat_ipv4_range range; |
51 | 51 | ||
52 | ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; | 52 | ct_pptp_info = nfct_help_data(master); |
53 | nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; | 53 | nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; |
54 | 54 | ||
55 | /* And here goes the grand finale of corrosion... */ | 55 | /* And here goes the grand finale of corrosion... */ |
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb, | |||
123 | __be16 new_callid; | 123 | __be16 new_callid; |
124 | unsigned int cid_off; | 124 | unsigned int cid_off; |
125 | 125 | ||
126 | ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; | 126 | ct_pptp_info = nfct_help_data(ct); |
127 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; | 127 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; |
128 | 128 | ||
129 | new_callid = ct_pptp_info->pns_call_id; | 129 | new_callid = ct_pptp_info->pns_call_id; |
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig, | |||
192 | struct nf_ct_pptp_master *ct_pptp_info; | 192 | struct nf_ct_pptp_master *ct_pptp_info; |
193 | struct nf_nat_pptp *nat_pptp_info; | 193 | struct nf_nat_pptp *nat_pptp_info; |
194 | 194 | ||
195 | ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; | 195 | ct_pptp_info = nfct_help_data(ct); |
196 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; | 196 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; |
197 | 197 | ||
198 | /* save original PAC call ID in nat_info */ | 198 | /* save original PAC call ID in nat_info */ |
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 746edec8b86e..bac712293fd6 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c | |||
@@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, | |||
405 | 405 | ||
406 | ptr = *octets; | 406 | ptr = *octets; |
407 | while (ctx->pointer < eoc) { | 407 | while (ctx->pointer < eoc) { |
408 | if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { | 408 | if (!asn1_octet_decode(ctx, ptr++)) { |
409 | kfree(*octets); | 409 | kfree(*octets); |
410 | *octets = NULL; | 410 | *octets = NULL; |
411 | return 0; | 411 | return 0; |
@@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
759 | } | 759 | } |
760 | break; | 760 | break; |
761 | case SNMP_OBJECTID: | 761 | case SNMP_OBJECTID: |
762 | if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { | 762 | if (!asn1_oid_decode(ctx, end, &lp, &len)) { |
763 | kfree(id); | 763 | kfree(id); |
764 | return 0; | 764 | return 0; |
765 | } | 765 | } |
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c index a2901bf829c0..9dbb8d284f99 100644 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ b/net/ipv4/netfilter/nf_nat_tftp.c | |||
@@ -8,10 +8,10 @@ | |||
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/udp.h> | 9 | #include <linux/udp.h> |
10 | 10 | ||
11 | #include <net/netfilter/nf_nat_helper.h> | ||
12 | #include <net/netfilter/nf_nat_rule.h> | ||
13 | #include <net/netfilter/nf_conntrack_helper.h> | 11 | #include <net/netfilter/nf_conntrack_helper.h> |
14 | #include <net/netfilter/nf_conntrack_expect.h> | 12 | #include <net/netfilter/nf_conntrack_expect.h> |
13 | #include <net/netfilter/nf_nat_helper.h> | ||
14 | #include <net/netfilter/nf_nat_rule.h> | ||
15 | #include <linux/netfilter/nf_conntrack_tftp.h> | 15 | #include <linux/netfilter/nf_conntrack_tftp.h> |
16 | 16 | ||
17 | MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); | 17 | MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); |
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2c00e8bf684d..6232d476f37e 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c | |||
@@ -371,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info) | |||
371 | break; | 371 | break; |
372 | case ICMP_DEST_UNREACH: | 372 | case ICMP_DEST_UNREACH: |
373 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ | 373 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ |
374 | ipv4_sk_update_pmtu(skb, sk, info); | ||
374 | if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { | 375 | if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { |
375 | err = EMSGSIZE; | 376 | err = EMSGSIZE; |
376 | harderr = 1; | 377 | harderr = 1; |
@@ -386,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info) | |||
386 | break; | 387 | break; |
387 | case ICMP_REDIRECT: | 388 | case ICMP_REDIRECT: |
388 | /* See ICMP_SOURCE_QUENCH */ | 389 | /* See ICMP_SOURCE_QUENCH */ |
390 | ipv4_sk_redirect(skb, sk); | ||
389 | err = EREMOTEIO; | 391 | err = EREMOTEIO; |
390 | break; | 392 | break; |
391 | } | 393 | } |
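ping_err() above (and raw_err() further below) now feeds ICMP_FRAG_NEEDED information straight to the routing layer via ipv4_sk_update_pmtu() and handles ICMP_REDIRECT via ipv4_sk_redirect(), rather than depending on the inetpeer-backed cache removed elsewhere in this series. A hedged sketch of that error-type dispatch, with hypothetical update_pmtu()/follow_redirect() stand-ins for the real helpers:

#include <stdio.h>

/* ICMP constants as used above. */
#define ICMP_DEST_UNREACH 3
#define ICMP_REDIRECT     5
#define ICMP_FRAG_NEEDED  4

/* Hypothetical stand-ins for ipv4_sk_update_pmtu()/ipv4_sk_redirect(). */
static void update_pmtu(unsigned int mtu) { printf("learned path MTU %u\n", mtu); }
static void follow_redirect(void)         { puts("switching to redirected next hop"); }

/* Mirrors the shape of the ping_err()/raw_err() additions: the socket error
 * handler passes routing-relevant ICMP errors to the route layer first and
 * only then decides what error, if any, to report to the application. */
static void sock_icmp_err(int type, int code, unsigned int info)
{
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
		update_pmtu(info);
	else if (type == ICMP_REDIRECT)
		follow_redirect();
	/* ...per-socket error reporting would follow here... */
}

int main(void)
{
	sock_icmp_err(ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 1400);
	sock_icmp_err(ICMP_REDIRECT, 0, 0);
	return 0;
}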
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8af0d44e4e22..957acd12250b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
232 | SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), | 232 | SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), |
233 | SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), | 233 | SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), |
234 | SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), | 234 | SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), |
235 | SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), | ||
236 | SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), | 235 | SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), |
237 | SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), | 236 | SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), |
238 | SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), | 237 | SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), |
@@ -258,6 +257,12 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
258 | SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), | 257 | SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), |
259 | SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), | 258 | SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), |
260 | SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), | 259 | SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), |
260 | SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), | ||
261 | SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), | ||
262 | SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), | ||
263 | SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), | ||
264 | SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), | ||
265 | SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), | ||
261 | SNMP_MIB_SENTINEL | 266 | SNMP_MIB_SENTINEL |
262 | }; | 267 | }; |
263 | 268 | ||
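The proc.c hunk drops the unused TCPAbortOnSyn entry and exports the new TCPOFOQueue/TCPOFODrop/TCPOFOMerge, TCPChallengeACK/TCPSYNChallenge and TCPFastOpenActive counters on the TcpExt lines of /proc/net/netstat. A small user-space sketch that reads those lines back, so the new counters can be inspected on a running system:

#include <stdio.h>
#include <string.h>

/* Dump just the TcpExt name/value lines so the counters added above
 * (TCPOFOQueue, TCPChallengeACK, ...) can be read back after boot. */
int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f) {
		perror("/proc/net/netstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "TcpExt:", 7) == 0)
			fputs(line, stdout);
	fclose(f);
	return 0;
}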
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9ae5c01cd0b2..8918eff1426d 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; | |||
36 | 36 | ||
37 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) | 37 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) |
38 | { | 38 | { |
39 | int hash = protocol & (MAX_INET_PROTOS - 1); | 39 | return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], |
40 | |||
41 | return !cmpxchg((const struct net_protocol **)&inet_protos[hash], | ||
42 | NULL, prot) ? 0 : -1; | 40 | NULL, prot) ? 0 : -1; |
43 | } | 41 | } |
44 | EXPORT_SYMBOL(inet_add_protocol); | 42 | EXPORT_SYMBOL(inet_add_protocol); |
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol); | |||
49 | 47 | ||
50 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) | 48 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) |
51 | { | 49 | { |
52 | int ret, hash = protocol & (MAX_INET_PROTOS - 1); | 50 | int ret; |
53 | 51 | ||
54 | ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], | 52 | ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], |
55 | prot, NULL) == prot) ? 0 : -1; | 53 | prot, NULL) == prot) ? 0 : -1; |
56 | 54 | ||
57 | synchronize_net(); | 55 | synchronize_net(); |
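inet_add_protocol()/inet_del_protocol() above stop masking the protocol number with MAX_INET_PROTOS - 1 and index the table directly, while keeping the cmpxchg so that concurrent registrations of the same slot cannot race. A hedged user-space analog of that lock-free "claim the slot if it is still NULL" registration, using C11 compare-exchange and hypothetical handler names:

#include <stdio.h>
#include <stdatomic.h>
#include <stddef.h>

#define MAX_PROTOS 256

struct proto_handler {
	const char *name;
};

/* One slot per protocol number; indexed directly, no hashing/masking. */
static _Atomic(const struct proto_handler *) protos[MAX_PROTOS];

/* Returns 0 on success, -1 if the slot is already taken (same convention
 * as inet_add_protocol above). */
static int add_protocol(const struct proto_handler *p, unsigned char num)
{
	const struct proto_handler *expected = NULL;

	return atomic_compare_exchange_strong(&protos[num], &expected, p) ? 0 : -1;
}

/* Only the current owner of the slot may clear it. */
static int del_protocol(const struct proto_handler *p, unsigned char num)
{
	const struct proto_handler *expected = p;

	return atomic_compare_exchange_strong(&protos[num], &expected, NULL) ? 0 : -1;
}

int main(void)
{
	static const struct proto_handler tcp = { "tcp" }, bogus = { "bogus" };

	printf("add tcp   -> %d\n", add_protocol(&tcp, 6));
	printf("add again -> %d\n", add_protocol(&bogus, 6));   /* slot busy */
	printf("del wrong -> %d\n", del_protocol(&bogus, 6));   /* not owner */
	printf("del tcp   -> %d\n", del_protocol(&tcp, 6));
	return 0;
}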
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4032b818f3e4..ff0f071969ea 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -216,6 +216,11 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) | |||
216 | int err = 0; | 216 | int err = 0; |
217 | int harderr = 0; | 217 | int harderr = 0; |
218 | 218 | ||
219 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) | ||
220 | ipv4_sk_update_pmtu(skb, sk, info); | ||
221 | else if (type == ICMP_REDIRECT) | ||
222 | ipv4_sk_redirect(skb, sk); | ||
223 | |||
219 | /* Report error on raw socket, if: | 224 | /* Report error on raw socket, if: |
220 | 1. User requested ip_recverr. | 225 | 1. User requested ip_recverr. |
221 | 2. Socket is connected (otherwise the error indication | 226 | 2. Socket is connected (otherwise the error indication |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98b30d08efe9..6bcb8fc71cbc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8; | |||
133 | static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; | 133 | static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; |
134 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; | 134 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; |
135 | static int ip_rt_min_advmss __read_mostly = 256; | 135 | static int ip_rt_min_advmss __read_mostly = 256; |
136 | static int rt_chain_length_max __read_mostly = 20; | ||
137 | |||
138 | static struct delayed_work expires_work; | ||
139 | static unsigned long expires_ljiffies; | ||
140 | 136 | ||
141 | /* | 137 | /* |
142 | * Interface to generic destination cache. | 138 | * Interface to generic destination cache. |
@@ -145,11 +141,12 @@ static unsigned long expires_ljiffies; | |||
145 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); | 141 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); |
146 | static unsigned int ipv4_default_advmss(const struct dst_entry *dst); | 142 | static unsigned int ipv4_default_advmss(const struct dst_entry *dst); |
147 | static unsigned int ipv4_mtu(const struct dst_entry *dst); | 143 | static unsigned int ipv4_mtu(const struct dst_entry *dst); |
148 | static void ipv4_dst_destroy(struct dst_entry *dst); | ||
149 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); | 144 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); |
150 | static void ipv4_link_failure(struct sk_buff *skb); | 145 | static void ipv4_link_failure(struct sk_buff *skb); |
151 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); | 146 | static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, |
152 | static int rt_garbage_collect(struct dst_ops *ops); | 147 | struct sk_buff *skb, u32 mtu); |
148 | static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, | ||
149 | struct sk_buff *skb); | ||
153 | 150 | ||
154 | static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | 151 | static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, |
155 | int how) | 152 | int how) |
@@ -158,54 +155,26 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | |||
158 | 155 | ||
159 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) | 156 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) |
160 | { | 157 | { |
161 | struct rtable *rt = (struct rtable *) dst; | 158 | WARN_ON(1); |
162 | struct inet_peer *peer; | 159 | return NULL; |
163 | u32 *p = NULL; | ||
164 | |||
165 | if (!rt->peer) | ||
166 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
167 | |||
168 | peer = rt->peer; | ||
169 | if (peer) { | ||
170 | u32 *old_p = __DST_METRICS_PTR(old); | ||
171 | unsigned long prev, new; | ||
172 | |||
173 | p = peer->metrics; | ||
174 | if (inet_metrics_new(peer)) | ||
175 | memcpy(p, old_p, sizeof(u32) * RTAX_MAX); | ||
176 | |||
177 | new = (unsigned long) p; | ||
178 | prev = cmpxchg(&dst->_metrics, old, new); | ||
179 | |||
180 | if (prev != old) { | ||
181 | p = __DST_METRICS_PTR(prev); | ||
182 | if (prev & DST_METRICS_READ_ONLY) | ||
183 | p = NULL; | ||
184 | } else { | ||
185 | if (rt->fi) { | ||
186 | fib_info_put(rt->fi); | ||
187 | rt->fi = NULL; | ||
188 | } | ||
189 | } | ||
190 | } | ||
191 | return p; | ||
192 | } | 160 | } |
193 | 161 | ||
194 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); | 162 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, |
163 | struct sk_buff *skb, | ||
164 | const void *daddr); | ||
195 | 165 | ||
196 | static struct dst_ops ipv4_dst_ops = { | 166 | static struct dst_ops ipv4_dst_ops = { |
197 | .family = AF_INET, | 167 | .family = AF_INET, |
198 | .protocol = cpu_to_be16(ETH_P_IP), | 168 | .protocol = cpu_to_be16(ETH_P_IP), |
199 | .gc = rt_garbage_collect, | ||
200 | .check = ipv4_dst_check, | 169 | .check = ipv4_dst_check, |
201 | .default_advmss = ipv4_default_advmss, | 170 | .default_advmss = ipv4_default_advmss, |
202 | .mtu = ipv4_mtu, | 171 | .mtu = ipv4_mtu, |
203 | .cow_metrics = ipv4_cow_metrics, | 172 | .cow_metrics = ipv4_cow_metrics, |
204 | .destroy = ipv4_dst_destroy, | ||
205 | .ifdown = ipv4_dst_ifdown, | 173 | .ifdown = ipv4_dst_ifdown, |
206 | .negative_advice = ipv4_negative_advice, | 174 | .negative_advice = ipv4_negative_advice, |
207 | .link_failure = ipv4_link_failure, | 175 | .link_failure = ipv4_link_failure, |
208 | .update_pmtu = ip_rt_update_pmtu, | 176 | .update_pmtu = ip_rt_update_pmtu, |
177 | .redirect = ip_do_redirect, | ||
209 | .local_out = __ip_local_out, | 178 | .local_out = __ip_local_out, |
210 | .neigh_lookup = ipv4_neigh_lookup, | 179 | .neigh_lookup = ipv4_neigh_lookup, |
211 | }; | 180 | }; |
@@ -232,184 +201,30 @@ const __u8 ip_tos2prio[16] = { | |||
232 | }; | 201 | }; |
233 | EXPORT_SYMBOL(ip_tos2prio); | 202 | EXPORT_SYMBOL(ip_tos2prio); |
234 | 203 | ||
235 | /* | ||
236 | * Route cache. | ||
237 | */ | ||
238 | |||
239 | /* The locking scheme is rather straight forward: | ||
240 | * | ||
241 | * 1) Read-Copy Update protects the buckets of the central route hash. | ||
242 | * 2) Only writers remove entries, and they hold the lock | ||
243 | * as they look at rtable reference counts. | ||
244 | * 3) Only readers acquire references to rtable entries, | ||
245 | * they do so with atomic increments and with the | ||
246 | * lock held. | ||
247 | */ | ||
248 | |||
249 | struct rt_hash_bucket { | ||
250 | struct rtable __rcu *chain; | ||
251 | }; | ||
252 | |||
253 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ | ||
254 | defined(CONFIG_PROVE_LOCKING) | ||
255 | /* | ||
256 | * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks | ||
257 | * The size of this table is a power of two and depends on the number of CPUS. | ||
258 | * (on lockdep we have a quite big spinlock_t, so keep the size down there) | ||
259 | */ | ||
260 | #ifdef CONFIG_LOCKDEP | ||
261 | # define RT_HASH_LOCK_SZ 256 | ||
262 | #else | ||
263 | # if NR_CPUS >= 32 | ||
264 | # define RT_HASH_LOCK_SZ 4096 | ||
265 | # elif NR_CPUS >= 16 | ||
266 | # define RT_HASH_LOCK_SZ 2048 | ||
267 | # elif NR_CPUS >= 8 | ||
268 | # define RT_HASH_LOCK_SZ 1024 | ||
269 | # elif NR_CPUS >= 4 | ||
270 | # define RT_HASH_LOCK_SZ 512 | ||
271 | # else | ||
272 | # define RT_HASH_LOCK_SZ 256 | ||
273 | # endif | ||
274 | #endif | ||
275 | |||
276 | static spinlock_t *rt_hash_locks; | ||
277 | # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] | ||
278 | |||
279 | static __init void rt_hash_lock_init(void) | ||
280 | { | ||
281 | int i; | ||
282 | |||
283 | rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, | ||
284 | GFP_KERNEL); | ||
285 | if (!rt_hash_locks) | ||
286 | panic("IP: failed to allocate rt_hash_locks\n"); | ||
287 | |||
288 | for (i = 0; i < RT_HASH_LOCK_SZ; i++) | ||
289 | spin_lock_init(&rt_hash_locks[i]); | ||
290 | } | ||
291 | #else | ||
292 | # define rt_hash_lock_addr(slot) NULL | ||
293 | |||
294 | static inline void rt_hash_lock_init(void) | ||
295 | { | ||
296 | } | ||
297 | #endif | ||
298 | |||
299 | static struct rt_hash_bucket *rt_hash_table __read_mostly; | ||
300 | static unsigned int rt_hash_mask __read_mostly; | ||
301 | static unsigned int rt_hash_log __read_mostly; | ||
302 | |||
303 | static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); | 204 | static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); |
304 | #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) | 205 | #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) |
305 | 206 | ||
306 | static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, | ||
307 | int genid) | ||
308 | { | ||
309 | return jhash_3words((__force u32)daddr, (__force u32)saddr, | ||
310 | idx, genid) | ||
311 | & rt_hash_mask; | ||
312 | } | ||
313 | |||
314 | static inline int rt_genid(struct net *net) | 207 | static inline int rt_genid(struct net *net) |
315 | { | 208 | { |
316 | return atomic_read(&net->ipv4.rt_genid); | 209 | return atomic_read(&net->ipv4.rt_genid); |
317 | } | 210 | } |
318 | 211 | ||
319 | #ifdef CONFIG_PROC_FS | 212 | #ifdef CONFIG_PROC_FS |
320 | struct rt_cache_iter_state { | ||
321 | struct seq_net_private p; | ||
322 | int bucket; | ||
323 | int genid; | ||
324 | }; | ||
325 | |||
326 | static struct rtable *rt_cache_get_first(struct seq_file *seq) | ||
327 | { | ||
328 | struct rt_cache_iter_state *st = seq->private; | ||
329 | struct rtable *r = NULL; | ||
330 | |||
331 | for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { | ||
332 | if (!rcu_access_pointer(rt_hash_table[st->bucket].chain)) | ||
333 | continue; | ||
334 | rcu_read_lock_bh(); | ||
335 | r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); | ||
336 | while (r) { | ||
337 | if (dev_net(r->dst.dev) == seq_file_net(seq) && | ||
338 | r->rt_genid == st->genid) | ||
339 | return r; | ||
340 | r = rcu_dereference_bh(r->dst.rt_next); | ||
341 | } | ||
342 | rcu_read_unlock_bh(); | ||
343 | } | ||
344 | return r; | ||
345 | } | ||
346 | |||
347 | static struct rtable *__rt_cache_get_next(struct seq_file *seq, | ||
348 | struct rtable *r) | ||
349 | { | ||
350 | struct rt_cache_iter_state *st = seq->private; | ||
351 | |||
352 | r = rcu_dereference_bh(r->dst.rt_next); | ||
353 | while (!r) { | ||
354 | rcu_read_unlock_bh(); | ||
355 | do { | ||
356 | if (--st->bucket < 0) | ||
357 | return NULL; | ||
358 | } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain)); | ||
359 | rcu_read_lock_bh(); | ||
360 | r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); | ||
361 | } | ||
362 | return r; | ||
363 | } | ||
364 | |||
365 | static struct rtable *rt_cache_get_next(struct seq_file *seq, | ||
366 | struct rtable *r) | ||
367 | { | ||
368 | struct rt_cache_iter_state *st = seq->private; | ||
369 | while ((r = __rt_cache_get_next(seq, r)) != NULL) { | ||
370 | if (dev_net(r->dst.dev) != seq_file_net(seq)) | ||
371 | continue; | ||
372 | if (r->rt_genid == st->genid) | ||
373 | break; | ||
374 | } | ||
375 | return r; | ||
376 | } | ||
377 | |||
378 | static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) | ||
379 | { | ||
380 | struct rtable *r = rt_cache_get_first(seq); | ||
381 | |||
382 | if (r) | ||
383 | while (pos && (r = rt_cache_get_next(seq, r))) | ||
384 | --pos; | ||
385 | return pos ? NULL : r; | ||
386 | } | ||
387 | |||
388 | static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) | 213 | static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) |
389 | { | 214 | { |
390 | struct rt_cache_iter_state *st = seq->private; | ||
391 | if (*pos) | 215 | if (*pos) |
392 | return rt_cache_get_idx(seq, *pos - 1); | 216 | return NULL; |
393 | st->genid = rt_genid(seq_file_net(seq)); | ||
394 | return SEQ_START_TOKEN; | 217 | return SEQ_START_TOKEN; |
395 | } | 218 | } |
396 | 219 | ||
397 | static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 220 | static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
398 | { | 221 | { |
399 | struct rtable *r; | ||
400 | |||
401 | if (v == SEQ_START_TOKEN) | ||
402 | r = rt_cache_get_first(seq); | ||
403 | else | ||
404 | r = rt_cache_get_next(seq, v); | ||
405 | ++*pos; | 222 | ++*pos; |
406 | return r; | 223 | return NULL; |
407 | } | 224 | } |
408 | 225 | ||
409 | static void rt_cache_seq_stop(struct seq_file *seq, void *v) | 226 | static void rt_cache_seq_stop(struct seq_file *seq, void *v) |
410 | { | 227 | { |
411 | if (v && v != SEQ_START_TOKEN) | ||
412 | rcu_read_unlock_bh(); | ||
413 | } | 228 | } |
414 | 229 | ||
415 | static int rt_cache_seq_show(struct seq_file *seq, void *v) | 230 | static int rt_cache_seq_show(struct seq_file *seq, void *v) |
@@ -419,34 +234,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) | |||
419 | "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" | 234 | "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" |
420 | "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" | 235 | "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" |
421 | "HHUptod\tSpecDst"); | 236 | "HHUptod\tSpecDst"); |
422 | else { | ||
423 | struct rtable *r = v; | ||
424 | struct neighbour *n; | ||
425 | int len, HHUptod; | ||
426 | |||
427 | rcu_read_lock(); | ||
428 | n = dst_get_neighbour_noref(&r->dst); | ||
429 | HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; | ||
430 | rcu_read_unlock(); | ||
431 | |||
432 | seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" | ||
433 | "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", | ||
434 | r->dst.dev ? r->dst.dev->name : "*", | ||
435 | (__force u32)r->rt_dst, | ||
436 | (__force u32)r->rt_gateway, | ||
437 | r->rt_flags, atomic_read(&r->dst.__refcnt), | ||
438 | r->dst.__use, 0, (__force u32)r->rt_src, | ||
439 | dst_metric_advmss(&r->dst) + 40, | ||
440 | dst_metric(&r->dst, RTAX_WINDOW), | ||
441 | (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + | ||
442 | dst_metric(&r->dst, RTAX_RTTVAR)), | ||
443 | r->rt_key_tos, | ||
444 | -1, | ||
445 | HHUptod, | ||
446 | r->rt_spec_dst, &len); | ||
447 | |||
448 | seq_printf(seq, "%*s\n", 127 - len, ""); | ||
449 | } | ||
450 | return 0; | 237 | return 0; |
451 | } | 238 | } |
452 | 239 | ||
@@ -459,8 +246,7 @@ static const struct seq_operations rt_cache_seq_ops = { | |||
459 | 246 | ||
460 | static int rt_cache_seq_open(struct inode *inode, struct file *file) | 247 | static int rt_cache_seq_open(struct inode *inode, struct file *file) |
461 | { | 248 | { |
462 | return seq_open_net(inode, file, &rt_cache_seq_ops, | 249 | return seq_open(file, &rt_cache_seq_ops); |
463 | sizeof(struct rt_cache_iter_state)); | ||
464 | } | 250 | } |
465 | 251 | ||
466 | static const struct file_operations rt_cache_seq_fops = { | 252 | static const struct file_operations rt_cache_seq_fops = { |
@@ -468,7 +254,7 @@ static const struct file_operations rt_cache_seq_fops = { | |||
468 | .open = rt_cache_seq_open, | 254 | .open = rt_cache_seq_open, |
469 | .read = seq_read, | 255 | .read = seq_read, |
470 | .llseek = seq_lseek, | 256 | .llseek = seq_lseek, |
471 | .release = seq_release_net, | 257 | .release = seq_release, |
472 | }; | 258 | }; |
473 | 259 | ||
474 | 260 | ||
@@ -658,275 +444,12 @@ static inline int ip_rt_proc_init(void) | |||
658 | } | 444 | } |
659 | #endif /* CONFIG_PROC_FS */ | 445 | #endif /* CONFIG_PROC_FS */ |
660 | 446 | ||
661 | static inline void rt_free(struct rtable *rt) | ||
662 | { | ||
663 | call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); | ||
664 | } | ||
665 | |||
666 | static inline void rt_drop(struct rtable *rt) | ||
667 | { | ||
668 | ip_rt_put(rt); | ||
669 | call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); | ||
670 | } | ||
671 | |||
672 | static inline int rt_fast_clean(struct rtable *rth) | ||
673 | { | ||
674 | /* Kill broadcast/multicast entries very aggresively, if they | ||
675 | collide in hash table with more useful entries */ | ||
676 | return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && | ||
677 | rt_is_input_route(rth) && rth->dst.rt_next; | ||
678 | } | ||
679 | |||
680 | static inline int rt_valuable(struct rtable *rth) | ||
681 | { | ||
682 | return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || | ||
683 | (rth->peer && rth->peer->pmtu_expires); | ||
684 | } | ||
685 | |||
686 | static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) | ||
687 | { | ||
688 | unsigned long age; | ||
689 | int ret = 0; | ||
690 | |||
691 | if (atomic_read(&rth->dst.__refcnt)) | ||
692 | goto out; | ||
693 | |||
694 | age = jiffies - rth->dst.lastuse; | ||
695 | if ((age <= tmo1 && !rt_fast_clean(rth)) || | ||
696 | (age <= tmo2 && rt_valuable(rth))) | ||
697 | goto out; | ||
698 | ret = 1; | ||
699 | out: return ret; | ||
700 | } | ||
701 | |||
702 | /* Bits of score are: | ||
703 | * 31: very valuable | ||
704 | * 30: not quite useless | ||
705 | * 29..0: usage counter | ||
706 | */ | ||
707 | static inline u32 rt_score(struct rtable *rt) | ||
708 | { | ||
709 | u32 score = jiffies - rt->dst.lastuse; | ||
710 | |||
711 | score = ~score & ~(3<<30); | ||
712 | |||
713 | if (rt_valuable(rt)) | ||
714 | score |= (1<<31); | ||
715 | |||
716 | if (rt_is_output_route(rt) || | ||
717 | !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) | ||
718 | score |= (1<<30); | ||
719 | |||
720 | return score; | ||
721 | } | ||
722 | |||
723 | static inline bool rt_caching(const struct net *net) | ||
724 | { | ||
725 | return net->ipv4.current_rt_cache_rebuild_count <= | ||
726 | net->ipv4.sysctl_rt_cache_rebuild_count; | ||
727 | } | ||
728 | |||
729 | static inline bool compare_hash_inputs(const struct rtable *rt1, | ||
730 | const struct rtable *rt2) | ||
731 | { | ||
732 | return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | | ||
733 | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | | ||
734 | (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); | ||
735 | } | ||
736 | |||
737 | static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) | ||
738 | { | ||
739 | return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | | ||
740 | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | | ||
741 | (rt1->rt_mark ^ rt2->rt_mark) | | ||
742 | (rt1->rt_key_tos ^ rt2->rt_key_tos) | | ||
743 | (rt1->rt_route_iif ^ rt2->rt_route_iif) | | ||
744 | (rt1->rt_oif ^ rt2->rt_oif)) == 0; | ||
745 | } | ||
746 | |||
747 | static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) | ||
748 | { | ||
749 | return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); | ||
750 | } | ||
751 | |||
752 | static inline int rt_is_expired(struct rtable *rth) | 447 | static inline int rt_is_expired(struct rtable *rth) |
753 | { | 448 | { |
754 | return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); | 449 | return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); |
755 | } | 450 | } |
756 | 451 | ||
757 | /* | 452 | /* |
758 | * Perform a full scan of hash table and free all entries. | ||
759 | * Can be called by a softirq or a process. | ||
760 | * In the later case, we want to be reschedule if necessary | ||
761 | */ | ||
762 | static void rt_do_flush(struct net *net, int process_context) | ||
763 | { | ||
764 | unsigned int i; | ||
765 | struct rtable *rth, *next; | ||
766 | |||
767 | for (i = 0; i <= rt_hash_mask; i++) { | ||
768 | struct rtable __rcu **pprev; | ||
769 | struct rtable *list; | ||
770 | |||
771 | if (process_context && need_resched()) | ||
772 | cond_resched(); | ||
773 | rth = rcu_access_pointer(rt_hash_table[i].chain); | ||
774 | if (!rth) | ||
775 | continue; | ||
776 | |||
777 | spin_lock_bh(rt_hash_lock_addr(i)); | ||
778 | |||
779 | list = NULL; | ||
780 | pprev = &rt_hash_table[i].chain; | ||
781 | rth = rcu_dereference_protected(*pprev, | ||
782 | lockdep_is_held(rt_hash_lock_addr(i))); | ||
783 | |||
784 | while (rth) { | ||
785 | next = rcu_dereference_protected(rth->dst.rt_next, | ||
786 | lockdep_is_held(rt_hash_lock_addr(i))); | ||
787 | |||
788 | if (!net || | ||
789 | net_eq(dev_net(rth->dst.dev), net)) { | ||
790 | rcu_assign_pointer(*pprev, next); | ||
791 | rcu_assign_pointer(rth->dst.rt_next, list); | ||
792 | list = rth; | ||
793 | } else { | ||
794 | pprev = &rth->dst.rt_next; | ||
795 | } | ||
796 | rth = next; | ||
797 | } | ||
798 | |||
799 | spin_unlock_bh(rt_hash_lock_addr(i)); | ||
800 | |||
801 | for (; list; list = next) { | ||
802 | next = rcu_dereference_protected(list->dst.rt_next, 1); | ||
803 | rt_free(list); | ||
804 | } | ||
805 | } | ||
806 | } | ||
807 | |||
808 | /* | ||
809 | * While freeing expired entries, we compute average chain length | ||
810 | * and standard deviation, using fixed-point arithmetic. | ||
811 | * This to have an estimation of rt_chain_length_max | ||
812 | * rt_chain_length_max = max(elasticity, AVG + 4*SD) | ||
813 | * We use 3 bits for frational part, and 29 (or 61) for magnitude. | ||
814 | */ | ||
815 | |||
816 | #define FRACT_BITS 3 | ||
817 | #define ONE (1UL << FRACT_BITS) | ||
818 | |||
819 | /* | ||
820 | * Given a hash chain and an item in this hash chain, | ||
821 | * find if a previous entry has the same hash_inputs | ||
822 | * (but differs on tos, mark or oif) | ||
823 | * Returns 0 if an alias is found. | ||
824 | * Returns ONE if rth has no alias before itself. | ||
825 | */ | ||
826 | static int has_noalias(const struct rtable *head, const struct rtable *rth) | ||
827 | { | ||
828 | const struct rtable *aux = head; | ||
829 | |||
830 | while (aux != rth) { | ||
831 | if (compare_hash_inputs(aux, rth)) | ||
832 | return 0; | ||
833 | aux = rcu_dereference_protected(aux->dst.rt_next, 1); | ||
834 | } | ||
835 | return ONE; | ||
836 | } | ||
837 | |||
838 | static void rt_check_expire(void) | ||
839 | { | ||
840 | static unsigned int rover; | ||
841 | unsigned int i = rover, goal; | ||
842 | struct rtable *rth; | ||
843 | struct rtable __rcu **rthp; | ||
844 | unsigned long samples = 0; | ||
845 | unsigned long sum = 0, sum2 = 0; | ||
846 | unsigned long delta; | ||
847 | u64 mult; | ||
848 | |||
849 | delta = jiffies - expires_ljiffies; | ||
850 | expires_ljiffies = jiffies; | ||
851 | mult = ((u64)delta) << rt_hash_log; | ||
852 | if (ip_rt_gc_timeout > 1) | ||
853 | do_div(mult, ip_rt_gc_timeout); | ||
854 | goal = (unsigned int)mult; | ||
855 | if (goal > rt_hash_mask) | ||
856 | goal = rt_hash_mask + 1; | ||
857 | for (; goal > 0; goal--) { | ||
858 | unsigned long tmo = ip_rt_gc_timeout; | ||
859 | unsigned long length; | ||
860 | |||
861 | i = (i + 1) & rt_hash_mask; | ||
862 | rthp = &rt_hash_table[i].chain; | ||
863 | |||
864 | if (need_resched()) | ||
865 | cond_resched(); | ||
866 | |||
867 | samples++; | ||
868 | |||
869 | if (rcu_dereference_raw(*rthp) == NULL) | ||
870 | continue; | ||
871 | length = 0; | ||
872 | spin_lock_bh(rt_hash_lock_addr(i)); | ||
873 | while ((rth = rcu_dereference_protected(*rthp, | ||
874 | lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { | ||
875 | prefetch(rth->dst.rt_next); | ||
876 | if (rt_is_expired(rth)) { | ||
877 | *rthp = rth->dst.rt_next; | ||
878 | rt_free(rth); | ||
879 | continue; | ||
880 | } | ||
881 | if (rth->dst.expires) { | ||
882 | /* Entry is expired even if it is in use */ | ||
883 | if (time_before_eq(jiffies, rth->dst.expires)) { | ||
884 | nofree: | ||
885 | tmo >>= 1; | ||
886 | rthp = &rth->dst.rt_next; | ||
887 | /* | ||
888 | * We only count entries on | ||
889 | * a chain with equal hash inputs once | ||
890 | * so that entries for different QOS | ||
891 | * levels, and other non-hash input | ||
892 | * attributes don't unfairly skew | ||
893 | * the length computation | ||
894 | */ | ||
895 | length += has_noalias(rt_hash_table[i].chain, rth); | ||
896 | continue; | ||
897 | } | ||
898 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) | ||
899 | goto nofree; | ||
900 | |||
901 | /* Cleanup aged off entries. */ | ||
902 | *rthp = rth->dst.rt_next; | ||
903 | rt_free(rth); | ||
904 | } | ||
905 | spin_unlock_bh(rt_hash_lock_addr(i)); | ||
906 | sum += length; | ||
907 | sum2 += length*length; | ||
908 | } | ||
909 | if (samples) { | ||
910 | unsigned long avg = sum / samples; | ||
911 | unsigned long sd = int_sqrt(sum2 / samples - avg*avg); | ||
912 | rt_chain_length_max = max_t(unsigned long, | ||
913 | ip_rt_gc_elasticity, | ||
914 | (avg + 4*sd) >> FRACT_BITS); | ||
915 | } | ||
916 | rover = i; | ||
917 | } | ||
918 | |||
919 | /* | ||
920 | * rt_worker_func() is run in process context. | ||
921 | * we call rt_check_expire() to scan part of the hash table | ||
922 | */ | ||
923 | static void rt_worker_func(struct work_struct *work) | ||
924 | { | ||
925 | rt_check_expire(); | ||
926 | schedule_delayed_work(&expires_work, ip_rt_gc_interval); | ||
927 | } | ||
928 | |||
929 | /* | ||
930 | * Perturbation of rt_genid by a small quantity [1..256] | 453 | * Perturbation of rt_genid by a small quantity [1..256] |
931 | * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() | 454 | * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() |
932 | * many times (2^24) without giving recent rt_genid. | 455 | * many times (2^24) without giving recent rt_genid. |
@@ -938,7 +461,6 @@ static void rt_cache_invalidate(struct net *net) | |||
938 | 461 | ||
939 | get_random_bytes(&shuffle, sizeof(shuffle)); | 462 | get_random_bytes(&shuffle, sizeof(shuffle)); |
940 | atomic_add(shuffle + 1U, &net->ipv4.rt_genid); | 463 | atomic_add(shuffle + 1U, &net->ipv4.rt_genid); |
941 | inetpeer_invalidate_tree(AF_INET); | ||
942 | } | 464 | } |
943 | 465 | ||
944 | /* | 466 | /* |
@@ -948,183 +470,22 @@ static void rt_cache_invalidate(struct net *net) | |||
948 | void rt_cache_flush(struct net *net, int delay) | 470 | void rt_cache_flush(struct net *net, int delay) |
949 | { | 471 | { |
950 | rt_cache_invalidate(net); | 472 | rt_cache_invalidate(net); |
951 | if (delay >= 0) | ||
952 | rt_do_flush(net, !in_softirq()); | ||
953 | } | ||
954 | |||
955 | /* Flush previous cache invalidated entries from the cache */ | ||
956 | void rt_cache_flush_batch(struct net *net) | ||
957 | { | ||
958 | rt_do_flush(net, !in_softirq()); | ||
959 | } | ||
960 | |||
961 | static void rt_emergency_hash_rebuild(struct net *net) | ||
962 | { | ||
963 | net_warn_ratelimited("Route hash chain too long!\n"); | ||
964 | rt_cache_invalidate(net); | ||
965 | } | ||
966 | |||
967 | /* | ||
968 | Short description of GC goals. | ||
969 | |||
970 | We want to build algorithm, which will keep routing cache | ||
971 | at some equilibrium point, when number of aged off entries | ||
972 | is kept approximately equal to newly generated ones. | ||
973 | |||
974 | Current expiration strength is variable "expire". | ||
975 | We try to adjust it dynamically, so that if networking | ||
976 | is idle expires is large enough to keep enough of warm entries, | ||
977 | and when load increases it reduces to limit cache size. | ||
978 | */ | ||
979 | |||
980 | static int rt_garbage_collect(struct dst_ops *ops) | ||
981 | { | ||
982 | static unsigned long expire = RT_GC_TIMEOUT; | ||
983 | static unsigned long last_gc; | ||
984 | static int rover; | ||
985 | static int equilibrium; | ||
986 | struct rtable *rth; | ||
987 | struct rtable __rcu **rthp; | ||
988 | unsigned long now = jiffies; | ||
989 | int goal; | ||
990 | int entries = dst_entries_get_fast(&ipv4_dst_ops); | ||
991 | |||
992 | /* | ||
993 | * Garbage collection is pretty expensive, | ||
994 | * do not make it too frequently. | ||
995 | */ | ||
996 | |||
997 | RT_CACHE_STAT_INC(gc_total); | ||
998 | |||
999 | if (now - last_gc < ip_rt_gc_min_interval && | ||
1000 | entries < ip_rt_max_size) { | ||
1001 | RT_CACHE_STAT_INC(gc_ignored); | ||
1002 | goto out; | ||
1003 | } | ||
1004 | |||
1005 | entries = dst_entries_get_slow(&ipv4_dst_ops); | ||
1006 | /* Calculate number of entries, which we want to expire now. */ | ||
1007 | goal = entries - (ip_rt_gc_elasticity << rt_hash_log); | ||
1008 | if (goal <= 0) { | ||
1009 | if (equilibrium < ipv4_dst_ops.gc_thresh) | ||
1010 | equilibrium = ipv4_dst_ops.gc_thresh; | ||
1011 | goal = entries - equilibrium; | ||
1012 | if (goal > 0) { | ||
1013 | equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); | ||
1014 | goal = entries - equilibrium; | ||
1015 | } | ||
1016 | } else { | ||
1017 | /* We are in dangerous area. Try to reduce cache really | ||
1018 | * aggressively. | ||
1019 | */ | ||
1020 | goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); | ||
1021 | equilibrium = entries - goal; | ||
1022 | } | ||
1023 | |||
1024 | if (now - last_gc >= ip_rt_gc_min_interval) | ||
1025 | last_gc = now; | ||
1026 | |||
1027 | if (goal <= 0) { | ||
1028 | equilibrium += goal; | ||
1029 | goto work_done; | ||
1030 | } | ||
1031 | |||
1032 | do { | ||
1033 | int i, k; | ||
1034 | |||
1035 | for (i = rt_hash_mask, k = rover; i >= 0; i--) { | ||
1036 | unsigned long tmo = expire; | ||
1037 | |||
1038 | k = (k + 1) & rt_hash_mask; | ||
1039 | rthp = &rt_hash_table[k].chain; | ||
1040 | spin_lock_bh(rt_hash_lock_addr(k)); | ||
1041 | while ((rth = rcu_dereference_protected(*rthp, | ||
1042 | lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { | ||
1043 | if (!rt_is_expired(rth) && | ||
1044 | !rt_may_expire(rth, tmo, expire)) { | ||
1045 | tmo >>= 1; | ||
1046 | rthp = &rth->dst.rt_next; | ||
1047 | continue; | ||
1048 | } | ||
1049 | *rthp = rth->dst.rt_next; | ||
1050 | rt_free(rth); | ||
1051 | goal--; | ||
1052 | } | ||
1053 | spin_unlock_bh(rt_hash_lock_addr(k)); | ||
1054 | if (goal <= 0) | ||
1055 | break; | ||
1056 | } | ||
1057 | rover = k; | ||
1058 | |||
1059 | if (goal <= 0) | ||
1060 | goto work_done; | ||
1061 | |||
1062 | /* Goal is not achieved. We stop process if: | ||
1063 | |||
1064 | - if expire reduced to zero. Otherwise, expire is halfed. | ||
1065 | - if table is not full. | ||
1066 | - if we are called from interrupt. | ||
1067 | - jiffies check is just fallback/debug loop breaker. | ||
1068 | We will not spin here for long time in any case. | ||
1069 | */ | ||
1070 | |||
1071 | RT_CACHE_STAT_INC(gc_goal_miss); | ||
1072 | |||
1073 | if (expire == 0) | ||
1074 | break; | ||
1075 | |||
1076 | expire >>= 1; | ||
1077 | |||
1078 | if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) | ||
1079 | goto out; | ||
1080 | } while (!in_softirq() && time_before_eq(jiffies, now)); | ||
1081 | |||
1082 | if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) | ||
1083 | goto out; | ||
1084 | if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) | ||
1085 | goto out; | ||
1086 | net_warn_ratelimited("dst cache overflow\n"); | ||
1087 | RT_CACHE_STAT_INC(gc_dst_overflow); | ||
1088 | return 1; | ||
1089 | |||
1090 | work_done: | ||
1091 | expire += ip_rt_gc_min_interval; | ||
1092 | if (expire > ip_rt_gc_timeout || | ||
1093 | dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || | ||
1094 | dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) | ||
1095 | expire = ip_rt_gc_timeout; | ||
1096 | out: return 0; | ||
1097 | } | ||
1098 | |||
1099 | /* | ||
1100 | * Returns number of entries in a hash chain that have different hash_inputs | ||
1101 | */ | ||
1102 | static int slow_chain_length(const struct rtable *head) | ||
1103 | { | ||
1104 | int length = 0; | ||
1105 | const struct rtable *rth = head; | ||
1106 | |||
1107 | while (rth) { | ||
1108 | length += has_noalias(head, rth); | ||
1109 | rth = rcu_dereference_protected(rth->dst.rt_next, 1); | ||
1110 | } | ||
1111 | return length >> FRACT_BITS; | ||
1112 | } | 473 | } |
1113 | 474 | ||
1114 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) | 475 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, |
476 | struct sk_buff *skb, | ||
477 | const void *daddr) | ||
1115 | { | 478 | { |
1116 | static const __be32 inaddr_any = 0; | ||
1117 | struct net_device *dev = dst->dev; | 479 | struct net_device *dev = dst->dev; |
1118 | const __be32 *pkey = daddr; | 480 | const __be32 *pkey = daddr; |
1119 | const struct rtable *rt; | 481 | const struct rtable *rt; |
1120 | struct neighbour *n; | 482 | struct neighbour *n; |
1121 | 483 | ||
1122 | rt = (const struct rtable *) dst; | 484 | rt = (const struct rtable *) dst; |
1123 | 485 | if (rt->rt_gateway) | |
1124 | if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) | ||
1125 | pkey = &inaddr_any; | ||
1126 | else if (rt->rt_gateway) | ||
1127 | pkey = (const __be32 *) &rt->rt_gateway; | 486 | pkey = (const __be32 *) &rt->rt_gateway; |
487 | else if (skb) | ||
488 | pkey = &ip_hdr(skb)->daddr; | ||
1128 | 489 | ||
1129 | n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); | 490 | n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); |
1130 | if (n) | 491 | if (n) |
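With the routing-cache hash table, its garbage collector and the /proc iterator removed above, ipv4_neigh_lookup() no longer special-cases loopback/point-to-point devices: it resolves the route's gateway when one is set, falls back to the destination address taken from the packet when an skb is available, and otherwise uses the daddr supplied by the caller. A hedged user-space sketch of just that key-selection order (neigh_key() is a hypothetical helper, not a kernel function):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Model of the key selection in ipv4_neigh_lookup() above: gateway if the
 * route has one, else the packet's destination, else the caller's daddr. */
static uint32_t neigh_key(uint32_t rt_gateway, const uint32_t *pkt_daddr,
			  uint32_t daddr)
{
	if (rt_gateway)
		return rt_gateway;
	if (pkt_daddr)
		return *pkt_daddr;
	return daddr;
}

static void show(const char *what, uint32_t key)
{
	struct in_addr a = { .s_addr = key };

	printf("%-18s -> ARP for %s\n", what, inet_ntoa(a));
}

int main(void)
{
	uint32_t gw, dst;

	inet_pton(AF_INET, "192.0.2.1", &gw);      /* next-hop router */
	inet_pton(AF_INET, "198.51.100.7", &dst);  /* on-link destination */

	show("routed via gw", neigh_key(gw, NULL, dst));
	show("on-link, from skb", neigh_key(0, &dst, dst));
	show("on-link, no skb", neigh_key(0, NULL, dst));
	return 0;
}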
@@ -1132,212 +493,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo | |||
1132 | return neigh_create(&arp_tbl, pkey, dev); | 493 | return neigh_create(&arp_tbl, pkey, dev); |
1133 | } | 494 | } |
1134 | 495 | ||
1135 | static int rt_bind_neighbour(struct rtable *rt) | ||
1136 | { | ||
1137 | struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); | ||
1138 | if (IS_ERR(n)) | ||
1139 | return PTR_ERR(n); | ||
1140 | dst_set_neighbour(&rt->dst, n); | ||
1141 | |||
1142 | return 0; | ||
1143 | } | ||
1144 | |||
1145 | static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, | ||
1146 | struct sk_buff *skb, int ifindex) | ||
1147 | { | ||
1148 | struct rtable *rth, *cand; | ||
1149 | struct rtable __rcu **rthp, **candp; | ||
1150 | unsigned long now; | ||
1151 | u32 min_score; | ||
1152 | int chain_length; | ||
1153 | int attempts = !in_softirq(); | ||
1154 | |||
1155 | restart: | ||
1156 | chain_length = 0; | ||
1157 | min_score = ~(u32)0; | ||
1158 | cand = NULL; | ||
1159 | candp = NULL; | ||
1160 | now = jiffies; | ||
1161 | |||
1162 | if (!rt_caching(dev_net(rt->dst.dev))) { | ||
1163 | /* | ||
1164 | * If we're not caching, just tell the caller we | ||
1165 | * were successful and don't touch the route. The | ||
1166 | * caller hold the sole reference to the cache entry, and | ||
1167 | * it will be released when the caller is done with it. | ||
1168 | * If we drop it here, the callers have no way to resolve routes | ||
1169 | * when we're not caching. Instead, just point *rp at rt, so | ||
1170 | * the caller gets a single use out of the route | ||
1171 | * Note that we do rt_free on this new route entry, so that | ||
1172 | * once its refcount hits zero, we are still able to reap it | ||
1173 | * (Thanks Alexey) | ||
1174 | * Note: To avoid expensive rcu stuff for this uncached dst, | ||
1175 | * we set DST_NOCACHE so that dst_release() can free dst without | ||
1176 | * waiting a grace period. | ||
1177 | */ | ||
1178 | |||
1179 | rt->dst.flags |= DST_NOCACHE; | ||
1180 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { | ||
1181 | int err = rt_bind_neighbour(rt); | ||
1182 | if (err) { | ||
1183 | net_warn_ratelimited("Neighbour table failure & not caching routes\n"); | ||
1184 | ip_rt_put(rt); | ||
1185 | return ERR_PTR(err); | ||
1186 | } | ||
1187 | } | ||
1188 | |||
1189 | goto skip_hashing; | ||
1190 | } | ||
1191 | |||
1192 | rthp = &rt_hash_table[hash].chain; | ||
1193 | |||
1194 | spin_lock_bh(rt_hash_lock_addr(hash)); | ||
1195 | while ((rth = rcu_dereference_protected(*rthp, | ||
1196 | lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { | ||
1197 | if (rt_is_expired(rth)) { | ||
1198 | *rthp = rth->dst.rt_next; | ||
1199 | rt_free(rth); | ||
1200 | continue; | ||
1201 | } | ||
1202 | if (compare_keys(rth, rt) && compare_netns(rth, rt)) { | ||
1203 | /* Put it first */ | ||
1204 | *rthp = rth->dst.rt_next; | ||
1205 | /* | ||
1206 | * Since lookup is lockfree, the deletion | ||
1207 | * must be visible to another weakly ordered CPU before | ||
1208 | * the insertion at the start of the hash chain. | ||
1209 | */ | ||
1210 | rcu_assign_pointer(rth->dst.rt_next, | ||
1211 | rt_hash_table[hash].chain); | ||
1212 | /* | ||
1213 | * Since lookup is lockfree, the update writes | ||
1214 | * must be ordered for consistency on SMP. | ||
1215 | */ | ||
1216 | rcu_assign_pointer(rt_hash_table[hash].chain, rth); | ||
1217 | |||
1218 | dst_use(&rth->dst, now); | ||
1219 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
1220 | |||
1221 | rt_drop(rt); | ||
1222 | if (skb) | ||
1223 | skb_dst_set(skb, &rth->dst); | ||
1224 | return rth; | ||
1225 | } | ||
1226 | |||
1227 | if (!atomic_read(&rth->dst.__refcnt)) { | ||
1228 | u32 score = rt_score(rth); | ||
1229 | |||
1230 | if (score <= min_score) { | ||
1231 | cand = rth; | ||
1232 | candp = rthp; | ||
1233 | min_score = score; | ||
1234 | } | ||
1235 | } | ||
1236 | |||
1237 | chain_length++; | ||
1238 | |||
1239 | rthp = &rth->dst.rt_next; | ||
1240 | } | ||
1241 | |||
1242 | if (cand) { | ||
1243 | /* ip_rt_gc_elasticity used to be the average chain length; | ||
1244 | * when exceeded, gc becomes really aggressive. | ||
1245 | * | ||
1246 | * The second limit is less certain. At the moment it allows | ||
1247 | * only 2 entries per bucket. We will see. | ||
1248 | */ | ||
1249 | if (chain_length > ip_rt_gc_elasticity) { | ||
1250 | *candp = cand->dst.rt_next; | ||
1251 | rt_free(cand); | ||
1252 | } | ||
1253 | } else { | ||
1254 | if (chain_length > rt_chain_length_max && | ||
1255 | slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { | ||
1256 | struct net *net = dev_net(rt->dst.dev); | ||
1257 | int num = ++net->ipv4.current_rt_cache_rebuild_count; | ||
1258 | if (!rt_caching(net)) { | ||
1259 | pr_warn("%s: %d rebuilds is over limit, route caching disabled\n", | ||
1260 | rt->dst.dev->name, num); | ||
1261 | } | ||
1262 | rt_emergency_hash_rebuild(net); | ||
1263 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
1264 | |||
1265 | hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, | ||
1266 | ifindex, rt_genid(net)); | ||
1267 | goto restart; | ||
1268 | } | ||
1269 | } | ||
1270 | |||
1271 | /* Try to bind route to arp only if it is output | ||
1272 | route or unicast forwarding path. | ||
1273 | */ | ||
1274 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { | ||
1275 | int err = rt_bind_neighbour(rt); | ||
1276 | if (err) { | ||
1277 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
1278 | |||
1279 | if (err != -ENOBUFS) { | ||
1280 | rt_drop(rt); | ||
1281 | return ERR_PTR(err); | ||
1282 | } | ||
1283 | |||
1284 | /* Neighbour tables are full and nothing | ||
1285 | can be released. Try to shrink route cache, | ||
1286 | as it most likely holds some neighbour records. | ||
1287 | */ | ||
1288 | if (attempts-- > 0) { | ||
1289 | int saved_elasticity = ip_rt_gc_elasticity; | ||
1290 | int saved_int = ip_rt_gc_min_interval; | ||
1291 | ip_rt_gc_elasticity = 1; | ||
1292 | ip_rt_gc_min_interval = 0; | ||
1293 | rt_garbage_collect(&ipv4_dst_ops); | ||
1294 | ip_rt_gc_min_interval = saved_int; | ||
1295 | ip_rt_gc_elasticity = saved_elasticity; | ||
1296 | goto restart; | ||
1297 | } | ||
1298 | |||
1299 | net_warn_ratelimited("Neighbour table overflow\n"); | ||
1300 | rt_drop(rt); | ||
1301 | return ERR_PTR(-ENOBUFS); | ||
1302 | } | ||
1303 | } | ||
1304 | |||
1305 | rt->dst.rt_next = rt_hash_table[hash].chain; | ||
1306 | |||
1307 | /* | ||
1308 | * Since lookup is lockfree, we must make sure | ||
1309 | * previous writes to rt are committed to memory | ||
1310 | * before making rt visible to other CPUS. | ||
1311 | */ | ||
1312 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); | ||
1313 | |||
1314 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
1315 | |||
1316 | skip_hashing: | ||
1317 | if (skb) | ||
1318 | skb_dst_set(skb, &rt->dst); | ||
1319 | return rt; | ||
1320 | } | ||
1321 | |||
1322 | static atomic_t __rt_peer_genid = ATOMIC_INIT(0); | ||
1323 | |||
1324 | static u32 rt_peer_genid(void) | ||
1325 | { | ||
1326 | return atomic_read(&__rt_peer_genid); | ||
1327 | } | ||
1328 | |||
1329 | void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) | ||
1330 | { | ||
1331 | struct inet_peer *peer; | ||
1332 | |||
1333 | peer = inet_getpeer_v4(daddr, create); | ||
1334 | |||
1335 | if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) | ||
1336 | inet_putpeer(peer); | ||
1337 | else | ||
1338 | rt->rt_peer_genid = rt_peer_genid(); | ||
1339 | } | ||
1340 | |||
1341 | /* | 496 | /* |
1342 | * Peer allocation may fail only in serious out-of-memory conditions. However | 497 | * Peer allocation may fail only in serious out-of-memory conditions. However |
1343 | * we still can generate some output. | 498 | * we still can generate some output. |
@@ -1360,83 +515,188 @@ static void ip_select_fb_ident(struct iphdr *iph) | |||
1360 | 515 | ||
1361 | void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) | 516 | void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) |
1362 | { | 517 | { |
1363 | struct rtable *rt = (struct rtable *) dst; | 518 | struct net *net = dev_net(dst->dev); |
1364 | 519 | struct inet_peer *peer; | |
1365 | if (rt && !(rt->dst.flags & DST_NOPEER)) { | ||
1366 | if (rt->peer == NULL) | ||
1367 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1368 | 520 | ||
1369 | /* If peer is attached to destination, it is never detached, | 521 | peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); |
1370 | so we need not grab a lock to dereference it. | 522 | if (peer) {
1371 | */ | 523 | iph->id = htons(inet_getid(peer, more)); |
1372 | if (rt->peer) { | 524 | inet_putpeer(peer); |
1373 | iph->id = htons(inet_getid(rt->peer, more)); | 525 | return; |
1374 | return; | 526 | } |
1375 | } | ||
1376 | } else if (!rt) | ||
1377 | pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0)); | ||
1378 | 527 | ||
1379 | ip_select_fb_ident(iph); | 528 | ip_select_fb_ident(iph); |
1380 | } | 529 | } |
1381 | EXPORT_SYMBOL(__ip_select_ident); | 530 | EXPORT_SYMBOL(__ip_select_ident); |
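With the route-cache peer pointer gone, __ip_select_ident() above now looks up the inet_peer for the destination on each call and draws the IP ID from that peer's counter. A minimal userspace sketch of the per-destination counter idea; the inet_getid() semantics are assumed and simplified here, not taken from the kernel source:

	#include <stdint.h>
	#include <stdio.h>

	struct peer {
		uint32_t ip_id_count;	/* per-destination IP ID counter */
	};

	/* Assumed simplification of inet_getid(): advance the counter by
	 * 1 + 'more' and use the low 16 bits as the datagram ID. */
	static uint16_t peer_getid(struct peer *p, int more)
	{
		p->ip_id_count += 1 + more;
		return (uint16_t)p->ip_id_count;
	}

	int main(void)
	{
		struct peer p = { .ip_id_count = 0 };
		uint16_t a = peer_getid(&p, 0);
		uint16_t b = peer_getid(&p, 0);

		printf("id1=%u id2=%u\n", a, b);
		return 0;
	}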
1382 | 531 | ||
1383 | static void rt_del(unsigned int hash, struct rtable *rt) | 532 | static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, |
533 | const struct iphdr *iph, | ||
534 | int oif, u8 tos, | ||
535 | u8 prot, u32 mark, int flow_flags) | ||
1384 | { | 536 | { |
1385 | struct rtable __rcu **rthp; | 537 | if (sk) { |
1386 | struct rtable *aux; | 538 | const struct inet_sock *inet = inet_sk(sk); |
1387 | 539 | ||
1388 | rthp = &rt_hash_table[hash].chain; | 540 | oif = sk->sk_bound_dev_if; |
1389 | spin_lock_bh(rt_hash_lock_addr(hash)); | 541 | mark = sk->sk_mark; |
1390 | ip_rt_put(rt); | 542 | tos = RT_CONN_FLAGS(sk); |
1391 | while ((aux = rcu_dereference_protected(*rthp, | 543 | prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; |
1392 | lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { | ||
1393 | if (aux == rt || rt_is_expired(aux)) { | ||
1394 | *rthp = aux->dst.rt_next; | ||
1395 | rt_free(aux); | ||
1396 | continue; | ||
1397 | } | ||
1398 | rthp = &aux->dst.rt_next; | ||
1399 | } | 544 | } |
1400 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 545 | flowi4_init_output(fl4, oif, mark, tos, |
546 | RT_SCOPE_UNIVERSE, prot, | ||
547 | flow_flags, | ||
548 | iph->daddr, iph->saddr, 0, 0); | ||
1401 | } | 549 | } |
1402 | 550 | ||
1403 | static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) | 551 | static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, |
552 | const struct sock *sk) | ||
1404 | { | 553 | { |
1405 | struct rtable *rt = (struct rtable *) dst; | 554 | const struct iphdr *iph = ip_hdr(skb); |
1406 | __be32 orig_gw = rt->rt_gateway; | 555 | int oif = skb->dev->ifindex; |
1407 | struct neighbour *n, *old_n; | 556 | u8 tos = RT_TOS(iph->tos); |
557 | u8 prot = iph->protocol; | ||
558 | u32 mark = skb->mark; | ||
1408 | 559 | ||
1409 | dst_confirm(&rt->dst); | 560 | __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); |
561 | } | ||
1410 | 562 | ||
1411 | rt->rt_gateway = peer->redirect_learned.a4; | 563 | static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) |
564 | { | ||
565 | const struct inet_sock *inet = inet_sk(sk); | ||
566 | const struct ip_options_rcu *inet_opt; | ||
567 | __be32 daddr = inet->inet_daddr; | ||
1412 | 568 | ||
1413 | n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); | 569 | rcu_read_lock(); |
1414 | if (IS_ERR(n)) { | 570 | inet_opt = rcu_dereference(inet->inet_opt); |
1415 | rt->rt_gateway = orig_gw; | 571 | if (inet_opt && inet_opt->opt.srr) |
1416 | return; | 572 | daddr = inet_opt->opt.faddr; |
573 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, | ||
574 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, | ||
575 | inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, | ||
576 | inet_sk_flowi_flags(sk), | ||
577 | daddr, inet->inet_saddr, 0, 0); | ||
578 | rcu_read_unlock(); | ||
579 | } | ||
580 | |||
581 | static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, | ||
582 | const struct sk_buff *skb) | ||
583 | { | ||
584 | if (skb) | ||
585 | build_skb_flow_key(fl4, skb, sk); | ||
586 | else | ||
587 | build_sk_flow_key(fl4, sk); | ||
588 | } | ||
589 | |||
590 | static DEFINE_SEQLOCK(fnhe_seqlock); | ||
591 | |||
592 | static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) | ||
593 | { | ||
594 | struct fib_nh_exception *fnhe, *oldest; | ||
595 | |||
596 | oldest = rcu_dereference(hash->chain); | ||
597 | for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; | ||
598 | fnhe = rcu_dereference(fnhe->fnhe_next)) { | ||
599 | if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) | ||
600 | oldest = fnhe; | ||
1417 | } | 601 | } |
1418 | old_n = xchg(&rt->dst._neighbour, n); | 602 | return oldest; |
1419 | if (old_n) | 603 | } |
1420 | neigh_release(old_n); | 604 | |
1421 | if (!(n->nud_state & NUD_VALID)) { | 605 | static inline u32 fnhe_hashfun(__be32 daddr) |
1422 | neigh_event_send(n, NULL); | 606 | { |
607 | u32 hval; | ||
608 | |||
609 | hval = (__force u32) daddr; | ||
610 | hval ^= (hval >> 11) ^ (hval >> 22); | ||
611 | |||
612 | return hval & (FNHE_HASH_SIZE - 1); | ||
613 | } | ||
614 | |||
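For illustration, a self-contained userspace version of the bucket-selection step that fnhe_hashfun() above performs; FNHE_HASH_SIZE is assumed to be a power of two (2048 used here), which is what makes the final mask act as a cheap modulo:

	#include <stdint.h>
	#include <stdio.h>

	#define FNHE_HASH_SIZE 2048u	/* assumed size; must be a power of two */

	/* Fold shifted copies of the 32-bit destination into itself, then
	 * mask down to a bucket index. */
	static uint32_t fnhe_hashfun(uint32_t daddr)
	{
		uint32_t hval = daddr;

		hval ^= (hval >> 11) ^ (hval >> 22);
		return hval & (FNHE_HASH_SIZE - 1);
	}

	int main(void)
	{
		uint32_t daddr = 0xc0000201u;	/* 192.0.2.1, host byte order */

		printf("bucket = %u\n", fnhe_hashfun(daddr));
		return 0;
	}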
615 | static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, | ||
616 | u32 pmtu, unsigned long expires) | ||
617 | { | ||
618 | struct fnhe_hash_bucket *hash; | ||
619 | struct fib_nh_exception *fnhe; | ||
620 | int depth; | ||
621 | u32 hval = fnhe_hashfun(daddr); | ||
622 | |||
623 | write_seqlock_bh(&fnhe_seqlock); | ||
624 | |||
625 | hash = nh->nh_exceptions; | ||
626 | if (!hash) { | ||
627 | hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); | ||
628 | if (!hash) | ||
629 | goto out_unlock; | ||
630 | nh->nh_exceptions = hash; | ||
631 | } | ||
632 | |||
633 | hash += hval; | ||
634 | |||
635 | depth = 0; | ||
636 | for (fnhe = rcu_dereference(hash->chain); fnhe; | ||
637 | fnhe = rcu_dereference(fnhe->fnhe_next)) { | ||
638 | if (fnhe->fnhe_daddr == daddr) | ||
639 | break; | ||
640 | depth++; | ||
641 | } | ||
642 | |||
643 | if (fnhe) { | ||
644 | if (gw) | ||
645 | fnhe->fnhe_gw = gw; | ||
646 | if (pmtu) { | ||
647 | fnhe->fnhe_pmtu = pmtu; | ||
648 | fnhe->fnhe_expires = expires; | ||
649 | } | ||
1423 | } else { | 650 | } else { |
1424 | rt->rt_flags |= RTCF_REDIRECTED; | 651 | if (depth > FNHE_RECLAIM_DEPTH) |
1425 | call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); | 652 | fnhe = fnhe_oldest(hash); |
653 | else { | ||
654 | fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); | ||
655 | if (!fnhe) | ||
656 | goto out_unlock; | ||
657 | |||
658 | fnhe->fnhe_next = hash->chain; | ||
659 | rcu_assign_pointer(hash->chain, fnhe); | ||
660 | } | ||
661 | fnhe->fnhe_daddr = daddr; | ||
662 | fnhe->fnhe_gw = gw; | ||
663 | fnhe->fnhe_pmtu = pmtu; | ||
664 | fnhe->fnhe_expires = expires; | ||
1426 | } | 665 | } |
666 | |||
667 | fnhe->fnhe_stamp = jiffies; | ||
668 | |||
669 | out_unlock: | ||
670 | write_sequnlock_bh(&fnhe_seqlock); | ||
671 | return; | ||
1427 | } | 672 | } |
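update_or_create_fnhe() publishes the exception fields under fnhe_seqlock so readers can take a consistent snapshot without blocking the writer (rt_bind_exception(), further down in this diff, is such a reader). A heavily reduced, single-threaded userspace sketch of that sequence-counter pattern; the real primitives also provide the SMP memory barriers and BH disabling that are not modelled here:

	#include <stdio.h>

	/* Sequence counter: odd while a write is in progress, even when stable. */
	static unsigned int seq;

	struct exc {
		unsigned int gw;
		unsigned int pmtu;
	};

	static struct exc shared;

	static void write_update(unsigned int gw, unsigned int pmtu)
	{
		seq++;			/* odd: readers will retry */
		shared.gw = gw;
		shared.pmtu = pmtu;
		seq++;			/* even: snapshot is consistent again */
	}

	static struct exc read_snapshot(void)
	{
		struct exc copy;
		unsigned int start;

		do {
			start = seq;			/* read_seqbegin() */
			copy = shared;
		} while ((start & 1) || seq != start);	/* read_seqretry() */

		return copy;
	}

	int main(void)
	{
		struct exc e;

		write_update(0x0a000001, 1400);
		e = read_snapshot();
		printf("gw=%#x pmtu=%u\n", e.gw, e.pmtu);
		return 0;
	}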
1428 | 673 | ||
1429 | /* called in rcu_read_lock() section */ | 674 | static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, |
1430 | void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | 675 | bool kill_route) |
1431 | __be32 saddr, struct net_device *dev) | ||
1432 | { | 676 | { |
1433 | int s, i; | 677 | __be32 new_gw = icmp_hdr(skb)->un.gateway; |
1434 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 678 | __be32 old_gw = ip_hdr(skb)->saddr; |
1435 | __be32 skeys[2] = { saddr, 0 }; | 679 | struct net_device *dev = skb->dev; |
1436 | int ikeys[2] = { dev->ifindex, 0 }; | 680 | struct in_device *in_dev; |
1437 | struct inet_peer *peer; | 681 | struct fib_result res; |
682 | struct neighbour *n; | ||
1438 | struct net *net; | 683 | struct net *net; |
1439 | 684 | ||
685 | switch (icmp_hdr(skb)->code & 7) { | ||
686 | case ICMP_REDIR_NET: | ||
687 | case ICMP_REDIR_NETTOS: | ||
688 | case ICMP_REDIR_HOST: | ||
689 | case ICMP_REDIR_HOSTTOS: | ||
690 | break; | ||
691 | |||
692 | default: | ||
693 | return; | ||
694 | } | ||
695 | |||
696 | if (rt->rt_gateway != old_gw) | ||
697 | return; | ||
698 | |||
699 | in_dev = __in_dev_get_rcu(dev); | ||
1440 | if (!in_dev) | 700 | if (!in_dev) |
1441 | return; | 701 | return; |
1442 | 702 | ||
@@ -1456,72 +716,50 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1456 | goto reject_redirect; | 716 | goto reject_redirect; |
1457 | } | 717 | } |
1458 | 718 | ||
1459 | for (s = 0; s < 2; s++) { | 719 | n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); |
1460 | for (i = 0; i < 2; i++) { | 720 | if (n) { |
1461 | unsigned int hash; | 721 | if (!(n->nud_state & NUD_VALID)) { |
1462 | struct rtable __rcu **rthp; | 722 | neigh_event_send(n, NULL); |
1463 | struct rtable *rt; | 723 | } else { |
1464 | 724 | if (fib_lookup(net, fl4, &res) == 0) { | |
1465 | hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); | 725 | struct fib_nh *nh = &FIB_RES_NH(res); |
1466 | 726 | ||
1467 | rthp = &rt_hash_table[hash].chain; | 727 | update_or_create_fnhe(nh, fl4->daddr, new_gw, |
1468 | 728 | 0, 0); | |
1469 | while ((rt = rcu_dereference(*rthp)) != NULL) { | ||
1470 | rthp = &rt->dst.rt_next; | ||
1471 | |||
1472 | if (rt->rt_key_dst != daddr || | ||
1473 | rt->rt_key_src != skeys[s] || | ||
1474 | rt->rt_oif != ikeys[i] || | ||
1475 | rt_is_input_route(rt) || | ||
1476 | rt_is_expired(rt) || | ||
1477 | !net_eq(dev_net(rt->dst.dev), net) || | ||
1478 | rt->dst.error || | ||
1479 | rt->dst.dev != dev || | ||
1480 | rt->rt_gateway != old_gw) | ||
1481 | continue; | ||
1482 | |||
1483 | if (!rt->peer) | ||
1484 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1485 | |||
1486 | peer = rt->peer; | ||
1487 | if (peer) { | ||
1488 | if (peer->redirect_learned.a4 != new_gw) { | ||
1489 | peer->redirect_learned.a4 = new_gw; | ||
1490 | atomic_inc(&__rt_peer_genid); | ||
1491 | } | ||
1492 | check_peer_redir(&rt->dst, peer); | ||
1493 | } | ||
1494 | } | 729 | } |
730 | if (kill_route) | ||
731 | rt->dst.obsolete = DST_OBSOLETE_KILL; | ||
732 | call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); | ||
1495 | } | 733 | } |
734 | neigh_release(n); | ||
1496 | } | 735 | } |
1497 | return; | 736 | return; |
1498 | 737 | ||
1499 | reject_redirect: | 738 | reject_redirect: |
1500 | #ifdef CONFIG_IP_ROUTE_VERBOSE | 739 | #ifdef CONFIG_IP_ROUTE_VERBOSE |
1501 | if (IN_DEV_LOG_MARTIANS(in_dev)) | 740 | if (IN_DEV_LOG_MARTIANS(in_dev)) { |
741 | const struct iphdr *iph = (const struct iphdr *) skb->data; | ||
742 | __be32 daddr = iph->daddr; | ||
743 | __be32 saddr = iph->saddr; | ||
744 | |||
1502 | net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" | 745 | net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" |
1503 | " Advised path = %pI4 -> %pI4\n", | 746 | " Advised path = %pI4 -> %pI4\n", |
1504 | &old_gw, dev->name, &new_gw, | 747 | &old_gw, dev->name, &new_gw, |
1505 | &saddr, &daddr); | 748 | &saddr, &daddr); |
749 | } | ||
1506 | #endif | 750 | #endif |
1507 | ; | 751 | ; |
1508 | } | 752 | } |
1509 | 753 | ||
1510 | static bool peer_pmtu_expired(struct inet_peer *peer) | 754 | static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) |
1511 | { | 755 | { |
1512 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); | 756 | struct rtable *rt; |
757 | struct flowi4 fl4; | ||
1513 | 758 | ||
1514 | return orig && | 759 | rt = (struct rtable *) dst; |
1515 | time_after_eq(jiffies, orig) && | ||
1516 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; | ||
1517 | } | ||
1518 | 760 | ||
1519 | static bool peer_pmtu_cleaned(struct inet_peer *peer) | 761 | ip_rt_build_flow_key(&fl4, sk, skb); |
1520 | { | 762 | __ip_do_redirect(rt, skb, &fl4, true); |
1521 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); | ||
1522 | |||
1523 | return orig && | ||
1524 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; | ||
1525 | } | 763 | } |
1526 | 764 | ||
1527 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | 765 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) |
@@ -1533,14 +771,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | |||
1533 | if (dst->obsolete > 0) { | 771 | if (dst->obsolete > 0) { |
1534 | ip_rt_put(rt); | 772 | ip_rt_put(rt); |
1535 | ret = NULL; | 773 | ret = NULL; |
1536 | } else if (rt->rt_flags & RTCF_REDIRECTED) { | 774 | } else if ((rt->rt_flags & RTCF_REDIRECTED) || |
1537 | unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, | 775 | rt->dst.expires) { |
1538 | rt->rt_oif, | 776 | ip_rt_put(rt); |
1539 | rt_genid(dev_net(dst->dev))); | ||
1540 | rt_del(hash, rt); | ||
1541 | ret = NULL; | 777 | ret = NULL; |
1542 | } else if (rt->peer && peer_pmtu_expired(rt->peer)) { | ||
1543 | dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); | ||
1544 | } | 778 | } |
1545 | } | 779 | } |
1546 | return ret; | 780 | return ret; |
@@ -1567,6 +801,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1567 | struct rtable *rt = skb_rtable(skb); | 801 | struct rtable *rt = skb_rtable(skb); |
1568 | struct in_device *in_dev; | 802 | struct in_device *in_dev; |
1569 | struct inet_peer *peer; | 803 | struct inet_peer *peer; |
804 | struct net *net; | ||
1570 | int log_martians; | 805 | int log_martians; |
1571 | 806 | ||
1572 | rcu_read_lock(); | 807 | rcu_read_lock(); |
@@ -1578,9 +813,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1578 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); | 813 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); |
1579 | rcu_read_unlock(); | 814 | rcu_read_unlock(); |
1580 | 815 | ||
1581 | if (!rt->peer) | 816 | net = dev_net(rt->dst.dev); |
1582 | rt_bind_peer(rt, rt->rt_dst, 1); | 817 | peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); |
1583 | peer = rt->peer; | ||
1584 | if (!peer) { | 818 | if (!peer) { |
1585 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); | 819 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); |
1586 | return; | 820 | return; |
@@ -1597,7 +831,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1597 | */ | 831 | */ |
1598 | if (peer->rate_tokens >= ip_rt_redirect_number) { | 832 | if (peer->rate_tokens >= ip_rt_redirect_number) { |
1599 | peer->rate_last = jiffies; | 833 | peer->rate_last = jiffies; |
1600 | return; | 834 | goto out_put_peer; |
1601 | } | 835 | } |
1602 | 836 | ||
1603 | /* Check for load limit; set rate_last to the latest sent | 837 | /* Check for load limit; set rate_last to the latest sent |
@@ -1614,20 +848,38 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1614 | if (log_martians && | 848 | if (log_martians && |
1615 | peer->rate_tokens == ip_rt_redirect_number) | 849 | peer->rate_tokens == ip_rt_redirect_number) |
1616 | net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", | 850 | net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", |
1617 | &ip_hdr(skb)->saddr, rt->rt_iif, | 851 | &ip_hdr(skb)->saddr, inet_iif(skb), |
1618 | &rt->rt_dst, &rt->rt_gateway); | 852 | &ip_hdr(skb)->daddr, &rt->rt_gateway); |
1619 | #endif | 853 | #endif |
1620 | } | 854 | } |
855 | out_put_peer: | ||
856 | inet_putpeer(peer); | ||
1621 | } | 857 | } |
1622 | 858 | ||
1623 | static int ip_error(struct sk_buff *skb) | 859 | static int ip_error(struct sk_buff *skb) |
1624 | { | 860 | { |
861 | struct in_device *in_dev = __in_dev_get_rcu(skb->dev); | ||
1625 | struct rtable *rt = skb_rtable(skb); | 862 | struct rtable *rt = skb_rtable(skb); |
1626 | struct inet_peer *peer; | 863 | struct inet_peer *peer; |
1627 | unsigned long now; | 864 | unsigned long now; |
865 | struct net *net; | ||
1628 | bool send; | 866 | bool send; |
1629 | int code; | 867 | int code; |
1630 | 868 | ||
869 | net = dev_net(rt->dst.dev); | ||
870 | if (!IN_DEV_FORWARD(in_dev)) { | ||
871 | switch (rt->dst.error) { | ||
872 | case EHOSTUNREACH: | ||
873 | IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); | ||
874 | break; | ||
875 | |||
876 | case ENETUNREACH: | ||
877 | IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); | ||
878 | break; | ||
879 | } | ||
880 | goto out; | ||
881 | } | ||
882 | |||
1631 | switch (rt->dst.error) { | 883 | switch (rt->dst.error) { |
1632 | case EINVAL: | 884 | case EINVAL: |
1633 | default: | 885 | default: |
@@ -1637,17 +889,14 @@ static int ip_error(struct sk_buff *skb) | |||
1637 | break; | 889 | break; |
1638 | case ENETUNREACH: | 890 | case ENETUNREACH: |
1639 | code = ICMP_NET_UNREACH; | 891 | code = ICMP_NET_UNREACH; |
1640 | IP_INC_STATS_BH(dev_net(rt->dst.dev), | 892 | IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); |
1641 | IPSTATS_MIB_INNOROUTES); | ||
1642 | break; | 893 | break; |
1643 | case EACCES: | 894 | case EACCES: |
1644 | code = ICMP_PKT_FILTERED; | 895 | code = ICMP_PKT_FILTERED; |
1645 | break; | 896 | break; |
1646 | } | 897 | } |
1647 | 898 | ||
1648 | if (!rt->peer) | 899 | peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); |
1649 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1650 | peer = rt->peer; | ||
1651 | 900 | ||
1652 | send = true; | 901 | send = true; |
1653 | if (peer) { | 902 | if (peer) { |
@@ -1660,6 +909,7 @@ static int ip_error(struct sk_buff *skb) | |||
1660 | peer->rate_tokens -= ip_rt_error_cost; | 909 | peer->rate_tokens -= ip_rt_error_cost; |
1661 | else | 910 | else |
1662 | send = false; | 911 | send = false; |
912 | inet_putpeer(peer); | ||
1663 | } | 913 | } |
1664 | if (send) | 914 | if (send) |
1665 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); | 915 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); |
@@ -1668,163 +918,120 @@ out: kfree_skb(skb); | |||
1668 | return 0; | 918 | return 0; |
1669 | } | 919 | } |
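The inet_peer lookups in ip_rt_send_redirect() and ip_error() both feed a per-source token bucket (rate_tokens / rate_last), so ICMP replies cannot be generated faster than a configured budget. A minimal userspace sketch of that pattern; the burst and cost constants are illustrative, not the ip_rt_error_burst / ip_rt_error_cost defaults:

	#include <stdbool.h>
	#include <stdio.h>

	#define ERROR_BURST	500u	/* illustrative values only */
	#define ERROR_COST	100u

	struct peer {
		unsigned long rate_tokens;
		unsigned long rate_last;	/* time of last refill, in ticks */
	};

	/* Refill for the elapsed time, cap at the burst size, and allow the
	 * event only if a full cost's worth of tokens is available. */
	static bool rate_limit_allow(struct peer *p, unsigned long now)
	{
		p->rate_tokens += now - p->rate_last;
		if (p->rate_tokens > ERROR_BURST)
			p->rate_tokens = ERROR_BURST;
		p->rate_last = now;

		if (p->rate_tokens >= ERROR_COST) {
			p->rate_tokens -= ERROR_COST;
			return true;
		}
		return false;
	}

	int main(void)
	{
		struct peer p = { 0, 0 };
		unsigned long now;

		for (now = 0; now <= 400; now += 100) {
			bool ok = rate_limit_allow(&p, now);

			printf("t=%lu allow=%d tokens=%lu\n", now, ok, p.rate_tokens);
		}
		return 0;
	}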
1670 | 920 | ||
1671 | /* | 921 | static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) |
1672 | * The last two values are not from the RFC but | 922 | { |
1673 | * are needed for AMPRnet AX.25 paths. | 923 | struct fib_result res; |
1674 | */ | ||
1675 | 924 | ||
1676 | static const unsigned short mtu_plateau[] = | 925 | if (mtu < ip_rt_min_pmtu) |
1677 | {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; | 926 | mtu = ip_rt_min_pmtu; |
1678 | 927 | ||
1679 | static inline unsigned short guess_mtu(unsigned short old_mtu) | 928 | if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { |
1680 | { | 929 | struct fib_nh *nh = &FIB_RES_NH(res); |
1681 | int i; | ||
1682 | 930 | ||
1683 | for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) | 931 | update_or_create_fnhe(nh, fl4->daddr, 0, mtu, |
1684 | if (old_mtu > mtu_plateau[i]) | 932 | jiffies + ip_rt_mtu_expires); |
1685 | return mtu_plateau[i]; | 933 | } |
1686 | return 68; | 934 | return mtu; |
1687 | } | 935 | } |
1688 | 936 | ||
1689 | unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, | 937 | static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, |
1690 | unsigned short new_mtu, | 938 | struct sk_buff *skb, u32 mtu) |
1691 | struct net_device *dev) | ||
1692 | { | 939 | { |
1693 | unsigned short old_mtu = ntohs(iph->tot_len); | 940 | struct rtable *rt = (struct rtable *) dst; |
1694 | unsigned short est_mtu = 0; | 941 | struct flowi4 fl4; |
1695 | struct inet_peer *peer; | ||
1696 | |||
1697 | peer = inet_getpeer_v4(iph->daddr, 1); | ||
1698 | if (peer) { | ||
1699 | unsigned short mtu = new_mtu; | ||
1700 | |||
1701 | if (new_mtu < 68 || new_mtu >= old_mtu) { | ||
1702 | /* BSD 4.2 derived systems incorrectly adjust | ||
1703 | * tot_len by the IP header length, and report | ||
1704 | * a zero MTU in the ICMP message. | ||
1705 | */ | ||
1706 | if (mtu == 0 && | ||
1707 | old_mtu >= 68 + (iph->ihl << 2)) | ||
1708 | old_mtu -= iph->ihl << 2; | ||
1709 | mtu = guess_mtu(old_mtu); | ||
1710 | } | ||
1711 | |||
1712 | if (mtu < ip_rt_min_pmtu) | ||
1713 | mtu = ip_rt_min_pmtu; | ||
1714 | if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { | ||
1715 | unsigned long pmtu_expires; | ||
1716 | |||
1717 | pmtu_expires = jiffies + ip_rt_mtu_expires; | ||
1718 | if (!pmtu_expires) | ||
1719 | pmtu_expires = 1UL; | ||
1720 | 942 | ||
1721 | est_mtu = mtu; | 943 | ip_rt_build_flow_key(&fl4, sk, skb); |
1722 | peer->pmtu_learned = mtu; | 944 | mtu = __ip_rt_update_pmtu(rt, &fl4, mtu); |
1723 | peer->pmtu_expires = pmtu_expires; | ||
1724 | atomic_inc(&__rt_peer_genid); | ||
1725 | } | ||
1726 | 945 | ||
1727 | inet_putpeer(peer); | 946 | if (!rt->rt_pmtu) { |
947 | dst->obsolete = DST_OBSOLETE_KILL; | ||
948 | } else { | ||
949 | rt->rt_pmtu = mtu; | ||
950 | dst_set_expires(&rt->dst, ip_rt_mtu_expires); | ||
1728 | } | 951 | } |
1729 | return est_mtu ? : new_mtu; | ||
1730 | } | 952 | } |
1731 | 953 | ||
1732 | static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) | 954 | void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, |
955 | int oif, u32 mark, u8 protocol, int flow_flags) | ||
1733 | { | 956 | { |
1734 | unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); | 957 | const struct iphdr *iph = (const struct iphdr *) skb->data; |
958 | struct flowi4 fl4; | ||
959 | struct rtable *rt; | ||
1735 | 960 | ||
1736 | if (!expires) | 961 | __build_flow_key(&fl4, NULL, iph, oif, |
1737 | return; | 962 | RT_TOS(iph->tos), protocol, mark, flow_flags); |
1738 | if (time_before(jiffies, expires)) { | 963 | rt = __ip_route_output_key(net, &fl4); |
1739 | u32 orig_dst_mtu = dst_mtu(dst); | 964 | if (!IS_ERR(rt)) { |
1740 | if (peer->pmtu_learned < orig_dst_mtu) { | 965 | __ip_rt_update_pmtu(rt, &fl4, mtu); |
1741 | if (!peer->pmtu_orig) | 966 | ip_rt_put(rt); |
1742 | peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); | 967 | } |
1743 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); | ||
1744 | } | ||
1745 | } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) | ||
1746 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); | ||
1747 | } | 968 | } |
969 | EXPORT_SYMBOL_GPL(ipv4_update_pmtu); | ||
1748 | 970 | ||
1749 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) | 971 | void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) |
1750 | { | 972 | { |
1751 | struct rtable *rt = (struct rtable *) dst; | 973 | const struct iphdr *iph = (const struct iphdr *) skb->data; |
1752 | struct inet_peer *peer; | 974 | struct flowi4 fl4; |
1753 | 975 | struct rtable *rt; | |
1754 | dst_confirm(dst); | ||
1755 | |||
1756 | if (!rt->peer) | ||
1757 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1758 | peer = rt->peer; | ||
1759 | if (peer) { | ||
1760 | unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); | ||
1761 | |||
1762 | if (mtu < ip_rt_min_pmtu) | ||
1763 | mtu = ip_rt_min_pmtu; | ||
1764 | if (!pmtu_expires || mtu < peer->pmtu_learned) { | ||
1765 | |||
1766 | pmtu_expires = jiffies + ip_rt_mtu_expires; | ||
1767 | if (!pmtu_expires) | ||
1768 | pmtu_expires = 1UL; | ||
1769 | |||
1770 | peer->pmtu_learned = mtu; | ||
1771 | peer->pmtu_expires = pmtu_expires; | ||
1772 | 976 | ||
1773 | atomic_inc(&__rt_peer_genid); | 977 | __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); |
1774 | rt->rt_peer_genid = rt_peer_genid(); | 978 | rt = __ip_route_output_key(sock_net(sk), &fl4); |
1775 | } | 979 | if (!IS_ERR(rt)) { |
1776 | check_peer_pmtu(dst, peer); | 980 | __ip_rt_update_pmtu(rt, &fl4, mtu); |
981 | ip_rt_put(rt); | ||
1777 | } | 982 | } |
1778 | } | 983 | } |
984 | EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); | ||
1779 | 985 | ||
1780 | 986 | void ipv4_redirect(struct sk_buff *skb, struct net *net, | |
1781 | static void ipv4_validate_peer(struct rtable *rt) | 987 | int oif, u32 mark, u8 protocol, int flow_flags) |
1782 | { | 988 | { |
1783 | if (rt->rt_peer_genid != rt_peer_genid()) { | 989 | const struct iphdr *iph = (const struct iphdr *) skb->data; |
1784 | struct inet_peer *peer; | 990 | struct flowi4 fl4; |
1785 | 991 | struct rtable *rt; | |
1786 | if (!rt->peer) | ||
1787 | rt_bind_peer(rt, rt->rt_dst, 0); | ||
1788 | 992 | ||
1789 | peer = rt->peer; | 993 | __build_flow_key(&fl4, NULL, iph, oif, |
1790 | if (peer) { | 994 | RT_TOS(iph->tos), protocol, mark, flow_flags); |
1791 | check_peer_pmtu(&rt->dst, peer); | 995 | rt = __ip_route_output_key(net, &fl4); |
996 | if (!IS_ERR(rt)) { | ||
997 | __ip_do_redirect(rt, skb, &fl4, false); | ||
998 | ip_rt_put(rt); | ||
999 | } | ||
1000 | } | ||
1001 | EXPORT_SYMBOL_GPL(ipv4_redirect); | ||
1792 | 1002 | ||
1793 | if (peer->redirect_learned.a4 && | 1003 | void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) |
1794 | peer->redirect_learned.a4 != rt->rt_gateway) | 1004 | { |
1795 | check_peer_redir(&rt->dst, peer); | 1005 | const struct iphdr *iph = (const struct iphdr *) skb->data; |
1796 | } | 1006 | struct flowi4 fl4; |
1007 | struct rtable *rt; | ||
1797 | 1008 | ||
1798 | rt->rt_peer_genid = rt_peer_genid(); | 1009 | __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); |
1010 | rt = __ip_route_output_key(sock_net(sk), &fl4); | ||
1011 | if (!IS_ERR(rt)) { | ||
1012 | __ip_do_redirect(rt, skb, &fl4, false); | ||
1013 | ip_rt_put(rt); | ||
1799 | } | 1014 | } |
1800 | } | 1015 | } |
1016 | EXPORT_SYMBOL_GPL(ipv4_sk_redirect); | ||
1801 | 1017 | ||
1802 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) | 1018 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) |
1803 | { | 1019 | { |
1804 | struct rtable *rt = (struct rtable *) dst; | 1020 | struct rtable *rt = (struct rtable *) dst; |
1805 | 1021 | ||
1806 | if (rt_is_expired(rt)) | 1022 | /* All IPV4 dsts are created with ->obsolete set to the value |
1023 | * DST_OBSOLETE_FORCE_CHK which forces validation calls down | ||
1024 | * into this function always. | ||
1025 | * | ||
1026 | * When a PMTU/redirect information update invalidates a | ||
1027 | * route, this is indicated by setting obsolete to | ||
1028 | * DST_OBSOLETE_KILL. | ||
1029 | */ | ||
1030 | if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt)) | ||
1807 | return NULL; | 1031 | return NULL; |
1808 | ipv4_validate_peer(rt); | ||
1809 | return dst; | 1032 | return dst; |
1810 | } | 1033 | } |
1811 | 1034 | ||
1812 | static void ipv4_dst_destroy(struct dst_entry *dst) | ||
1813 | { | ||
1814 | struct rtable *rt = (struct rtable *) dst; | ||
1815 | struct inet_peer *peer = rt->peer; | ||
1816 | |||
1817 | if (rt->fi) { | ||
1818 | fib_info_put(rt->fi); | ||
1819 | rt->fi = NULL; | ||
1820 | } | ||
1821 | if (peer) { | ||
1822 | rt->peer = NULL; | ||
1823 | inet_putpeer(peer); | ||
1824 | } | ||
1825 | } | ||
1826 | |||
1827 | |||
1828 | static void ipv4_link_failure(struct sk_buff *skb) | 1035 | static void ipv4_link_failure(struct sk_buff *skb) |
1829 | { | 1036 | { |
1830 | struct rtable *rt; | 1037 | struct rtable *rt; |
@@ -1832,8 +1039,8 @@ static void ipv4_link_failure(struct sk_buff *skb) | |||
1832 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); | 1039 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); |
1833 | 1040 | ||
1834 | rt = skb_rtable(skb); | 1041 | rt = skb_rtable(skb); |
1835 | if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) | 1042 | if (rt) |
1836 | dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); | 1043 | dst_set_expires(&rt->dst, 0); |
1837 | } | 1044 | } |
1838 | 1045 | ||
1839 | static int ip_rt_bug(struct sk_buff *skb) | 1046 | static int ip_rt_bug(struct sk_buff *skb) |
@@ -1880,8 +1087,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) | |||
1880 | if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) | 1087 | if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) |
1881 | src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); | 1088 | src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); |
1882 | else | 1089 | else |
1883 | src = inet_select_addr(rt->dst.dev, rt->rt_gateway, | 1090 | src = inet_select_addr(rt->dst.dev, |
1884 | RT_SCOPE_UNIVERSE); | 1091 | rt_nexthop(rt, iph->daddr), |
1092 | RT_SCOPE_UNIVERSE); | ||
1885 | rcu_read_unlock(); | 1093 | rcu_read_unlock(); |
1886 | } | 1094 | } |
1887 | memcpy(addr, &src, 4); | 1095 | memcpy(addr, &src, 4); |
@@ -1913,7 +1121,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) | |||
1913 | static unsigned int ipv4_mtu(const struct dst_entry *dst) | 1121 | static unsigned int ipv4_mtu(const struct dst_entry *dst) |
1914 | { | 1122 | { |
1915 | const struct rtable *rt = (const struct rtable *) dst; | 1123 | const struct rtable *rt = (const struct rtable *) dst; |
1916 | unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); | 1124 | unsigned int mtu = rt->rt_pmtu; |
1125 | |||
1126 | if (mtu && time_after_eq(jiffies, rt->dst.expires)) | ||
1127 | mtu = 0; | ||
1128 | |||
1129 | if (!mtu) | ||
1130 | mtu = dst_metric_raw(dst, RTAX_MTU); | ||
1917 | 1131 | ||
1918 | if (mtu && rt_is_output_route(rt)) | 1132 | if (mtu && rt_is_output_route(rt)) |
1919 | return mtu; | 1133 | return mtu; |
@@ -1921,8 +1135,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) | |||
1921 | mtu = dst->dev->mtu; | 1135 | mtu = dst->dev->mtu; |
1922 | 1136 | ||
1923 | if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { | 1137 | if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { |
1924 | 1138 | if (rt->rt_gateway && mtu > 576) | |
1925 | if (rt->rt_gateway != rt->rt_dst && mtu > 576) | ||
1926 | mtu = 576; | 1139 | mtu = 576; |
1927 | } | 1140 | } |
1928 | 1141 | ||
@@ -1932,76 +1145,121 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) | |||
1932 | return mtu; | 1145 | return mtu; |
1933 | } | 1146 | } |
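For an output route, the rewritten ipv4_mtu() above boils down to: use the learned path MTU while it has not expired, otherwise the RTAX_MTU metric, otherwise the device MTU, clamped to the maximum. A condensed sketch of that decision; IP_MAX_MTU is assumed to be 0xFFF0 as in route.c, and the locked-metric / input-route special cases are left out:

	#include <stdbool.h>
	#include <stdio.h>

	#define IP_MAX_MTU	0xFFF0u		/* assumed clamp from route.c */

	struct rt_info {
		unsigned int pmtu;		/* learned path MTU, 0 if none */
		bool pmtu_expired;		/* has rt->dst.expires passed? */
		unsigned int metric_mtu;	/* RTAX_MTU metric, 0 if unset */
		unsigned int dev_mtu;		/* underlying device MTU */
	};

	static unsigned int route_mtu(const struct rt_info *rt)
	{
		unsigned int mtu = rt->pmtu;

		if (mtu && rt->pmtu_expired)	/* stale PMTU learning is ignored */
			mtu = 0;
		if (!mtu)
			mtu = rt->metric_mtu;
		if (!mtu)
			mtu = rt->dev_mtu;
		return mtu > IP_MAX_MTU ? IP_MAX_MTU : mtu;
	}

	int main(void)
	{
		struct rt_info rt = { 1400, true, 0, 1500 };

		/* Expired PMTU, no metric: falls back to the device MTU. */
		printf("mtu = %u\n", route_mtu(&rt));
		return 0;
	}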
1934 | 1147 | ||
1935 | static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, | 1148 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) |
1936 | struct fib_info *fi) | ||
1937 | { | 1149 | { |
1938 | struct inet_peer *peer; | 1150 | struct fnhe_hash_bucket *hash = nh->nh_exceptions; |
1939 | int create = 0; | 1151 | struct fib_nh_exception *fnhe; |
1152 | u32 hval; | ||
1940 | 1153 | ||
1941 | /* If a peer entry exists for this destination, we must hook | 1154 | if (!hash) |
1942 | * it up in order to get at cached metrics. | 1155 | return NULL; |
1943 | */ | ||
1944 | if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) | ||
1945 | create = 1; | ||
1946 | 1156 | ||
1947 | rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); | 1157 | hval = fnhe_hashfun(daddr); |
1948 | if (peer) { | 1158 | |
1949 | rt->rt_peer_genid = rt_peer_genid(); | 1159 | for (fnhe = rcu_dereference(hash[hval].chain); fnhe; |
1950 | if (inet_metrics_new(peer)) | 1160 | fnhe = rcu_dereference(fnhe->fnhe_next)) { |
1951 | memcpy(peer->metrics, fi->fib_metrics, | 1161 | if (fnhe->fnhe_daddr == daddr) |
1952 | sizeof(u32) * RTAX_MAX); | 1162 | return fnhe; |
1953 | dst_init_metrics(&rt->dst, peer->metrics, false); | 1163 | } |
1954 | 1164 | return NULL; | |
1955 | check_peer_pmtu(&rt->dst, peer); | 1165 | } |
1956 | 1166 | ||
1957 | if (peer->redirect_learned.a4 && | 1167 | static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, |
1958 | peer->redirect_learned.a4 != rt->rt_gateway) { | 1168 | __be32 daddr) |
1959 | rt->rt_gateway = peer->redirect_learned.a4; | 1169 | { |
1960 | rt->rt_flags |= RTCF_REDIRECTED; | 1170 | __be32 fnhe_daddr, gw; |
1961 | } | 1171 | unsigned long expires; |
1962 | } else { | 1172 | unsigned int seq; |
1963 | if (fi->fib_metrics != (u32 *) dst_default_metrics) { | 1173 | u32 pmtu; |
1964 | rt->fi = fi; | 1174 | |
1965 | atomic_inc(&fi->fib_clntref); | 1175 | restart: |
1176 | seq = read_seqbegin(&fnhe_seqlock); | ||
1177 | fnhe_daddr = fnhe->fnhe_daddr; | ||
1178 | gw = fnhe->fnhe_gw; | ||
1179 | pmtu = fnhe->fnhe_pmtu; | ||
1180 | expires = fnhe->fnhe_expires; | ||
1181 | if (read_seqretry(&fnhe_seqlock, seq)) | ||
1182 | goto restart; | ||
1183 | |||
1184 | if (daddr != fnhe_daddr) | ||
1185 | return; | ||
1186 | |||
1187 | if (pmtu) { | ||
1188 | unsigned long diff = expires - jiffies; | ||
1189 | |||
1190 | if (time_before(jiffies, expires)) { | ||
1191 | rt->rt_pmtu = pmtu; | ||
1192 | dst_set_expires(&rt->dst, diff); | ||
1966 | } | 1193 | } |
1967 | dst_init_metrics(&rt->dst, fi->fib_metrics, true); | ||
1968 | } | 1194 | } |
1195 | if (gw) { | ||
1196 | rt->rt_flags |= RTCF_REDIRECTED; | ||
1197 | rt->rt_gateway = gw; | ||
1198 | } | ||
1199 | fnhe->fnhe_stamp = jiffies; | ||
1200 | } | ||
1201 | |||
1202 | static inline void rt_release_rcu(struct rcu_head *head) | ||
1203 | { | ||
1204 | struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); | ||
1205 | dst_release(dst); | ||
1206 | } | ||
1207 | |||
1208 | static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) | ||
1209 | { | ||
1210 | struct rtable *orig, *prev, **p = &nh->nh_rth_output; | ||
1211 | |||
1212 | if (rt_is_input_route(rt)) | ||
1213 | p = &nh->nh_rth_input; | ||
1214 | |||
1215 | orig = *p; | ||
1216 | |||
1217 | prev = cmpxchg(p, orig, rt); | ||
1218 | if (prev == orig) { | ||
1219 | dst_clone(&rt->dst); | ||
1220 | if (orig) | ||
1221 | call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu); | ||
1222 | } | ||
1223 | } | ||
1224 | |||
1225 | static bool rt_cache_valid(struct rtable *rt) | ||
1226 | { | ||
1227 | return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK); | ||
1969 | } | 1228 | } |
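rt_cache_route() above installs the new route into a one-entry per-nexthop slot with cmpxchg(): concurrent updaters never block, the loser simply keeps its route uncached, and the displaced entry is released after an RCU grace period. A minimal userspace sketch of that single-slot pattern using C11 atomics; the refcount fields stand in for dst_clone()/dst_release(), and the RCU deferral is not modelled:

	#include <stdatomic.h>
	#include <stdio.h>

	struct route {
		int refcnt;
		int id;
	};

	static _Atomic(struct route *) cached_slot;

	static void cache_route(struct route *rt)
	{
		struct route *orig = atomic_load(&cached_slot);

		/* Only one updater wins the swap; losers leave their route uncached. */
		if (atomic_compare_exchange_strong(&cached_slot, &orig, rt)) {
			rt->refcnt++;		/* the slot now holds a reference */
			if (orig)
				orig->refcnt--;	/* stands in for the deferred release */
		}
	}

	int main(void)
	{
		struct route a = { 1, 1 };
		struct route b = { 1, 2 };

		cache_route(&a);
		cache_route(&b);
		printf("cached id=%d a.ref=%d b.ref=%d\n",
		       atomic_load(&cached_slot)->id, a.refcnt, b.refcnt);
		return 0;
	}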
1970 | 1229 | ||
1971 | static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, | 1230 | static void rt_set_nexthop(struct rtable *rt, __be32 daddr, |
1972 | const struct fib_result *res, | 1231 | const struct fib_result *res, |
1232 | struct fib_nh_exception *fnhe, | ||
1973 | struct fib_info *fi, u16 type, u32 itag) | 1233 | struct fib_info *fi, u16 type, u32 itag) |
1974 | { | 1234 | { |
1975 | struct dst_entry *dst = &rt->dst; | ||
1976 | |||
1977 | if (fi) { | 1235 | if (fi) { |
1978 | if (FIB_RES_GW(*res) && | 1236 | struct fib_nh *nh = &FIB_RES_NH(*res); |
1979 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) | 1237 | |
1980 | rt->rt_gateway = FIB_RES_GW(*res); | 1238 | if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) |
1981 | rt_init_metrics(rt, fl4, fi); | 1239 | rt->rt_gateway = nh->nh_gw; |
1240 | if (unlikely(fnhe)) | ||
1241 | rt_bind_exception(rt, fnhe, daddr); | ||
1242 | dst_init_metrics(&rt->dst, fi->fib_metrics, true); | ||
1982 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1243 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1983 | dst->tclassid = FIB_RES_NH(*res).nh_tclassid; | 1244 | rt->dst.tclassid = nh->nh_tclassid; |
1984 | #endif | 1245 | #endif |
1246 | if (!(rt->dst.flags & DST_HOST)) | ||
1247 | rt_cache_route(nh, rt); | ||
1985 | } | 1248 | } |
1986 | 1249 | ||
1987 | if (dst_mtu(dst) > IP_MAX_MTU) | ||
1988 | dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); | ||
1989 | if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) | ||
1990 | dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); | ||
1991 | |||
1992 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1250 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1993 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 1251 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
1994 | set_class_tag(rt, fib_rules_tclass(res)); | 1252 | set_class_tag(rt, res->tclassid); |
1995 | #endif | 1253 | #endif |
1996 | set_class_tag(rt, itag); | 1254 | set_class_tag(rt, itag); |
1997 | #endif | 1255 | #endif |
1998 | } | 1256 | } |
1999 | 1257 | ||
2000 | static struct rtable *rt_dst_alloc(struct net_device *dev, | 1258 | static struct rtable *rt_dst_alloc(struct net_device *dev, |
2001 | bool nopolicy, bool noxfrm) | 1259 | bool nopolicy, bool noxfrm, bool will_cache) |
2002 | { | 1260 | { |
2003 | return dst_alloc(&ipv4_dst_ops, dev, 1, -1, | 1261 | return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, |
2004 | DST_HOST | | 1262 | (will_cache ? 0 : DST_HOST) | DST_NOCACHE | |
2005 | (nopolicy ? DST_NOPOLICY : 0) | | 1263 | (nopolicy ? DST_NOPOLICY : 0) | |
2006 | (noxfrm ? DST_NOXFRM : 0)); | 1264 | (noxfrm ? DST_NOXFRM : 0)); |
2007 | } | 1265 | } |
@@ -2010,9 +1268,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, | |||
2010 | static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 1268 | static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
2011 | u8 tos, struct net_device *dev, int our) | 1269 | u8 tos, struct net_device *dev, int our) |
2012 | { | 1270 | { |
2013 | unsigned int hash; | ||
2014 | struct rtable *rth; | 1271 | struct rtable *rth; |
2015 | __be32 spec_dst; | ||
2016 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 1272 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
2017 | u32 itag = 0; | 1273 | u32 itag = 0; |
2018 | int err; | 1274 | int err; |
@@ -2023,21 +1279,24 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2023 | return -EINVAL; | 1279 | return -EINVAL; |
2024 | 1280 | ||
2025 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || | 1281 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || |
2026 | ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) | 1282 | skb->protocol != htons(ETH_P_IP)) |
2027 | goto e_inval; | 1283 | goto e_inval; |
2028 | 1284 | ||
1285 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) | ||
1286 | if (ipv4_is_loopback(saddr)) | ||
1287 | goto e_inval; | ||
1288 | |||
2029 | if (ipv4_is_zeronet(saddr)) { | 1289 | if (ipv4_is_zeronet(saddr)) { |
2030 | if (!ipv4_is_local_multicast(daddr)) | 1290 | if (!ipv4_is_local_multicast(daddr)) |
2031 | goto e_inval; | 1291 | goto e_inval; |
2032 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); | ||
2033 | } else { | 1292 | } else { |
2034 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, | 1293 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, |
2035 | &itag); | 1294 | in_dev, &itag); |
2036 | if (err < 0) | 1295 | if (err < 0) |
2037 | goto e_err; | 1296 | goto e_err; |
2038 | } | 1297 | } |
2039 | rth = rt_dst_alloc(dev_net(dev)->loopback_dev, | 1298 | rth = rt_dst_alloc(dev_net(dev)->loopback_dev, |
2040 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false); | 1299 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); |
2041 | if (!rth) | 1300 | if (!rth) |
2042 | goto e_nobufs; | 1301 | goto e_nobufs; |
2043 | 1302 | ||
@@ -2046,23 +1305,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2046 | #endif | 1305 | #endif |
2047 | rth->dst.output = ip_rt_bug; | 1306 | rth->dst.output = ip_rt_bug; |
2048 | 1307 | ||
2049 | rth->rt_key_dst = daddr; | ||
2050 | rth->rt_key_src = saddr; | ||
2051 | rth->rt_genid = rt_genid(dev_net(dev)); | 1308 | rth->rt_genid = rt_genid(dev_net(dev)); |
2052 | rth->rt_flags = RTCF_MULTICAST; | 1309 | rth->rt_flags = RTCF_MULTICAST; |
2053 | rth->rt_type = RTN_MULTICAST; | 1310 | rth->rt_type = RTN_MULTICAST; |
2054 | rth->rt_key_tos = tos; | 1311 | rth->rt_is_input= 1; |
2055 | rth->rt_dst = daddr; | 1312 | rth->rt_iif = 0; |
2056 | rth->rt_src = saddr; | 1313 | rth->rt_pmtu = 0; |
2057 | rth->rt_route_iif = dev->ifindex; | 1314 | rth->rt_gateway = 0; |
2058 | rth->rt_iif = dev->ifindex; | ||
2059 | rth->rt_oif = 0; | ||
2060 | rth->rt_mark = skb->mark; | ||
2061 | rth->rt_gateway = daddr; | ||
2062 | rth->rt_spec_dst= spec_dst; | ||
2063 | rth->rt_peer_genid = 0; | ||
2064 | rth->peer = NULL; | ||
2065 | rth->fi = NULL; | ||
2066 | if (our) { | 1315 | if (our) { |
2067 | rth->dst.input= ip_local_deliver; | 1316 | rth->dst.input= ip_local_deliver; |
2068 | rth->rt_flags |= RTCF_LOCAL; | 1317 | rth->rt_flags |= RTCF_LOCAL; |
@@ -2074,9 +1323,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2074 | #endif | 1323 | #endif |
2075 | RT_CACHE_STAT_INC(in_slow_mc); | 1324 | RT_CACHE_STAT_INC(in_slow_mc); |
2076 | 1325 | ||
2077 | hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); | 1326 | skb_dst_set(skb, &rth->dst); |
2078 | rth = rt_intern_hash(hash, rth, skb, dev->ifindex); | 1327 | return 0; |
2079 | return IS_ERR(rth) ? PTR_ERR(rth) : 0; | ||
2080 | 1328 | ||
2081 | e_nobufs: | 1329 | e_nobufs: |
2082 | return -ENOBUFS; | 1330 | return -ENOBUFS; |
@@ -2123,7 +1371,7 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2123 | int err; | 1371 | int err; |
2124 | struct in_device *out_dev; | 1372 | struct in_device *out_dev; |
2125 | unsigned int flags = 0; | 1373 | unsigned int flags = 0; |
2126 | __be32 spec_dst; | 1374 | bool do_cache; |
2127 | u32 itag; | 1375 | u32 itag; |
2128 | 1376 | ||
2129 | /* get a working reference to the output device */ | 1377 | /* get a working reference to the output device */ |
@@ -2135,7 +1383,7 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2135 | 1383 | ||
2136 | 1384 | ||
2137 | err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), | 1385 | err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), |
2138 | in_dev->dev, &spec_dst, &itag); | 1386 | in_dev->dev, in_dev, &itag); |
2139 | if (err < 0) { | 1387 | if (err < 0) { |
2140 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, | 1388 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, |
2141 | saddr); | 1389 | saddr); |
@@ -2143,9 +1391,6 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2143 | goto cleanup; | 1391 | goto cleanup; |
2144 | } | 1392 | } |
2145 | 1393 | ||
2146 | if (err) | ||
2147 | flags |= RTCF_DIRECTSRC; | ||
2148 | |||
2149 | if (out_dev == in_dev && err && | 1394 | if (out_dev == in_dev && err && |
2150 | (IN_DEV_SHARED_MEDIA(out_dev) || | 1395 | (IN_DEV_SHARED_MEDIA(out_dev) || |
2151 | inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) | 1396 | inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) |
@@ -2166,37 +1411,39 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2166 | } | 1411 | } |
2167 | } | 1412 | } |
2168 | 1413 | ||
1414 | do_cache = false; | ||
1415 | if (res->fi) { | ||
1416 | if (!itag) { | ||
1417 | rth = FIB_RES_NH(*res).nh_rth_input; | ||
1418 | if (rt_cache_valid(rth)) { | ||
1419 | dst_hold(&rth->dst); | ||
1420 | goto out; | ||
1421 | } | ||
1422 | do_cache = true; | ||
1423 | } | ||
1424 | } | ||
1425 | |||
2169 | rth = rt_dst_alloc(out_dev->dev, | 1426 | rth = rt_dst_alloc(out_dev->dev, |
2170 | IN_DEV_CONF_GET(in_dev, NOPOLICY), | 1427 | IN_DEV_CONF_GET(in_dev, NOPOLICY), |
2171 | IN_DEV_CONF_GET(out_dev, NOXFRM)); | 1428 | IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); |
2172 | if (!rth) { | 1429 | if (!rth) { |
2173 | err = -ENOBUFS; | 1430 | err = -ENOBUFS; |
2174 | goto cleanup; | 1431 | goto cleanup; |
2175 | } | 1432 | } |
2176 | 1433 | ||
2177 | rth->rt_key_dst = daddr; | ||
2178 | rth->rt_key_src = saddr; | ||
2179 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); | 1434 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); |
2180 | rth->rt_flags = flags; | 1435 | rth->rt_flags = flags; |
2181 | rth->rt_type = res->type; | 1436 | rth->rt_type = res->type; |
2182 | rth->rt_key_tos = tos; | 1437 | rth->rt_is_input = 1; |
2183 | rth->rt_dst = daddr; | 1438 | rth->rt_iif = 0; |
2184 | rth->rt_src = saddr; | 1439 | rth->rt_pmtu = 0; |
2185 | rth->rt_route_iif = in_dev->dev->ifindex; | 1440 | rth->rt_gateway = 0; |
2186 | rth->rt_iif = in_dev->dev->ifindex; | ||
2187 | rth->rt_oif = 0; | ||
2188 | rth->rt_mark = skb->mark; | ||
2189 | rth->rt_gateway = daddr; | ||
2190 | rth->rt_spec_dst= spec_dst; | ||
2191 | rth->rt_peer_genid = 0; | ||
2192 | rth->peer = NULL; | ||
2193 | rth->fi = NULL; | ||
2194 | 1441 | ||
2195 | rth->dst.input = ip_forward; | 1442 | rth->dst.input = ip_forward; |
2196 | rth->dst.output = ip_output; | 1443 | rth->dst.output = ip_output; |
2197 | 1444 | ||
2198 | rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); | 1445 | rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); |
2199 | 1446 | out: | |
2200 | *result = rth; | 1447 | *result = rth; |
2201 | err = 0; | 1448 | err = 0; |
2202 | cleanup: | 1449 | cleanup: |
@@ -2211,7 +1458,6 @@ static int ip_mkroute_input(struct sk_buff *skb, | |||
2211 | { | 1458 | { |
2212 | struct rtable *rth = NULL; | 1459 | struct rtable *rth = NULL; |
2213 | int err; | 1460 | int err; |
2214 | unsigned int hash; | ||
2215 | 1461 | ||
2216 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 1462 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
2217 | if (res->fi && res->fi->fib_nhs > 1) | 1463 | if (res->fi && res->fi->fib_nhs > 1) |
@@ -2223,12 +1469,7 @@ static int ip_mkroute_input(struct sk_buff *skb, | |||
2223 | if (err) | 1469 | if (err) |
2224 | return err; | 1470 | return err; |
2225 | 1471 | ||
2226 | /* put it into the cache */ | 1472 | skb_dst_set(skb, &rth->dst); |
2227 | hash = rt_hash(daddr, saddr, fl4->flowi4_iif, | ||
2228 | rt_genid(dev_net(rth->dst.dev))); | ||
2229 | rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); | ||
2230 | if (IS_ERR(rth)) | ||
2231 | return PTR_ERR(rth); | ||
2232 | return 0; | 1473 | return 0; |
2233 | } | 1474 | } |
2234 | 1475 | ||
@@ -2252,10 +1493,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2252 | unsigned int flags = 0; | 1493 | unsigned int flags = 0; |
2253 | u32 itag = 0; | 1494 | u32 itag = 0; |
2254 | struct rtable *rth; | 1495 | struct rtable *rth; |
2255 | unsigned int hash; | ||
2256 | __be32 spec_dst; | ||
2257 | int err = -EINVAL; | 1496 | int err = -EINVAL; |
2258 | struct net *net = dev_net(dev); | 1497 | struct net *net = dev_net(dev); |
1498 | bool do_cache; | ||
2259 | 1499 | ||
2260 | /* IP on this device is disabled. */ | 1500 | /* IP on this device is disabled. */ |
2261 | 1501 | ||
@@ -2266,10 +1506,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2266 | by fib_lookup. | 1506 | by fib_lookup. |
2267 | */ | 1507 | */ |
2268 | 1508 | ||
2269 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || | 1509 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) |
2270 | ipv4_is_loopback(saddr)) | ||
2271 | goto martian_source; | 1510 | goto martian_source; |
2272 | 1511 | ||
1512 | res.fi = NULL; | ||
2273 | if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) | 1513 | if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) |
2274 | goto brd_input; | 1514 | goto brd_input; |
2275 | 1515 | ||
@@ -2279,9 +1519,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2279 | if (ipv4_is_zeronet(saddr)) | 1519 | if (ipv4_is_zeronet(saddr)) |
2280 | goto martian_source; | 1520 | goto martian_source; |
2281 | 1521 | ||
2282 | if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) | 1522 | if (ipv4_is_zeronet(daddr)) |
2283 | goto martian_destination; | 1523 | goto martian_destination; |
2284 | 1524 | ||
1525 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) { | ||
1526 | if (ipv4_is_loopback(daddr)) | ||
1527 | goto martian_destination; | ||
1528 | |||
1529 | if (ipv4_is_loopback(saddr)) | ||
1530 | goto martian_source; | ||
1531 | } | ||
1532 | |||
2285 | /* | 1533 | /* |
2286 | * Now we are ready to route packet. | 1534 | * Now we are ready to route packet. |
2287 | */ | 1535 | */ |
@@ -2293,11 +1541,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2293 | fl4.daddr = daddr; | 1541 | fl4.daddr = daddr; |
2294 | fl4.saddr = saddr; | 1542 | fl4.saddr = saddr; |
2295 | err = fib_lookup(net, &fl4, &res); | 1543 | err = fib_lookup(net, &fl4, &res); |
2296 | if (err != 0) { | 1544 | if (err != 0) |
2297 | if (!IN_DEV_FORWARD(in_dev)) | ||
2298 | goto e_hostunreach; | ||
2299 | goto no_route; | 1545 | goto no_route; |
2300 | } | ||
2301 | 1546 | ||
2302 | RT_CACHE_STAT_INC(in_slow_tot); | 1547 | RT_CACHE_STAT_INC(in_slow_tot); |
2303 | 1548 | ||
@@ -2307,17 +1552,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2307 | if (res.type == RTN_LOCAL) { | 1552 | if (res.type == RTN_LOCAL) { |
2308 | err = fib_validate_source(skb, saddr, daddr, tos, | 1553 | err = fib_validate_source(skb, saddr, daddr, tos, |
2309 | net->loopback_dev->ifindex, | 1554 | net->loopback_dev->ifindex, |
2310 | dev, &spec_dst, &itag); | 1555 | dev, in_dev, &itag); |
2311 | if (err < 0) | 1556 | if (err < 0) |
2312 | goto martian_source_keep_err; | 1557 | goto martian_source_keep_err; |
2313 | if (err) | ||
2314 | flags |= RTCF_DIRECTSRC; | ||
2315 | spec_dst = daddr; | ||
2316 | goto local_input; | 1558 | goto local_input; |
2317 | } | 1559 | } |
2318 | 1560 | ||
2319 | if (!IN_DEV_FORWARD(in_dev)) | 1561 | if (!IN_DEV_FORWARD(in_dev)) |
2320 | goto e_hostunreach; | 1562 | goto no_route; |
2321 | if (res.type != RTN_UNICAST) | 1563 | if (res.type != RTN_UNICAST) |
2322 | goto martian_destination; | 1564 | goto martian_destination; |
2323 | 1565 | ||
@@ -2328,23 +1570,31 @@ brd_input: | |||
2328 | if (skb->protocol != htons(ETH_P_IP)) | 1570 | if (skb->protocol != htons(ETH_P_IP)) |
2329 | goto e_inval; | 1571 | goto e_inval; |
2330 | 1572 | ||
2331 | if (ipv4_is_zeronet(saddr)) | 1573 | if (!ipv4_is_zeronet(saddr)) { |
2332 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); | 1574 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, |
2333 | else { | 1575 | in_dev, &itag); |
2334 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, | ||
2335 | &itag); | ||
2336 | if (err < 0) | 1576 | if (err < 0) |
2337 | goto martian_source_keep_err; | 1577 | goto martian_source_keep_err; |
2338 | if (err) | ||
2339 | flags |= RTCF_DIRECTSRC; | ||
2340 | } | 1578 | } |
2341 | flags |= RTCF_BROADCAST; | 1579 | flags |= RTCF_BROADCAST; |
2342 | res.type = RTN_BROADCAST; | 1580 | res.type = RTN_BROADCAST; |
2343 | RT_CACHE_STAT_INC(in_brd); | 1581 | RT_CACHE_STAT_INC(in_brd); |
2344 | 1582 | ||
2345 | local_input: | 1583 | local_input: |
1584 | do_cache = false; | ||
1585 | if (res.fi) { | ||
1586 | if (!itag) { | ||
1587 | rth = FIB_RES_NH(res).nh_rth_input; | ||
1588 | if (rt_cache_valid(rth)) { | ||
1589 | dst_hold(&rth->dst); | ||
1590 | goto set_and_out; | ||
1591 | } | ||
1592 | do_cache = true; | ||
1593 | } | ||
1594 | } | ||
1595 | |||
2346 | rth = rt_dst_alloc(net->loopback_dev, | 1596 | rth = rt_dst_alloc(net->loopback_dev, |
2347 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false); | 1597 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); |
2348 | if (!rth) | 1598 | if (!rth) |
2349 | goto e_nobufs; | 1599 | goto e_nobufs; |
2350 | 1600 | ||
@@ -2354,41 +1604,27 @@ local_input: | |||
2354 | rth->dst.tclassid = itag; | 1604 | rth->dst.tclassid = itag; |
2355 | #endif | 1605 | #endif |
2356 | 1606 | ||
2357 | rth->rt_key_dst = daddr; | ||
2358 | rth->rt_key_src = saddr; | ||
2359 | rth->rt_genid = rt_genid(net); | 1607 | rth->rt_genid = rt_genid(net); |
2360 | rth->rt_flags = flags|RTCF_LOCAL; | 1608 | rth->rt_flags = flags|RTCF_LOCAL; |
2361 | rth->rt_type = res.type; | 1609 | rth->rt_type = res.type; |
2362 | rth->rt_key_tos = tos; | 1610 | rth->rt_is_input = 1; |
2363 | rth->rt_dst = daddr; | 1611 | rth->rt_iif = 0; |
2364 | rth->rt_src = saddr; | 1612 | rth->rt_pmtu = 0; |
2365 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1613 | rth->rt_gateway = 0; |
2366 | rth->dst.tclassid = itag; | ||
2367 | #endif | ||
2368 | rth->rt_route_iif = dev->ifindex; | ||
2369 | rth->rt_iif = dev->ifindex; | ||
2370 | rth->rt_oif = 0; | ||
2371 | rth->rt_mark = skb->mark; | ||
2372 | rth->rt_gateway = daddr; | ||
2373 | rth->rt_spec_dst= spec_dst; | ||
2374 | rth->rt_peer_genid = 0; | ||
2375 | rth->peer = NULL; | ||
2376 | rth->fi = NULL; | ||
2377 | if (res.type == RTN_UNREACHABLE) { | 1614 | if (res.type == RTN_UNREACHABLE) { |
2378 | rth->dst.input= ip_error; | 1615 | rth->dst.input= ip_error; |
2379 | rth->dst.error= -err; | 1616 | rth->dst.error= -err; |
2380 | rth->rt_flags &= ~RTCF_LOCAL; | 1617 | rth->rt_flags &= ~RTCF_LOCAL; |
2381 | } | 1618 | } |
2382 | hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); | 1619 | if (do_cache) |
2383 | rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); | 1620 | rt_cache_route(&FIB_RES_NH(res), rth); |
1621 | set_and_out: | ||
1622 | skb_dst_set(skb, &rth->dst); | ||
2384 | err = 0; | 1623 | err = 0; |
2385 | if (IS_ERR(rth)) | ||
2386 | err = PTR_ERR(rth); | ||
2387 | goto out; | 1624 | goto out; |
2388 | 1625 | ||
2389 | no_route: | 1626 | no_route: |
2390 | RT_CACHE_STAT_INC(in_no_route); | 1627 | RT_CACHE_STAT_INC(in_no_route); |
2391 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
2392 | res.type = RTN_UNREACHABLE; | 1628 | res.type = RTN_UNREACHABLE; |
2393 | if (err == -ESRCH) | 1629 | if (err == -ESRCH) |
2394 | err = -ENETUNREACH; | 1630 | err = -ENETUNREACH; |
@@ -2405,10 +1641,6 @@ martian_destination: | |||
2405 | &daddr, &saddr, dev->name); | 1641 | &daddr, &saddr, dev->name); |
2406 | #endif | 1642 | #endif |
2407 | 1643 | ||
2408 | e_hostunreach: | ||
2409 | err = -EHOSTUNREACH; | ||
2410 | goto out; | ||
2411 | |||
2412 | e_inval: | 1644 | e_inval: |
2413 | err = -EINVAL; | 1645 | err = -EINVAL; |
2414 | goto out; | 1646 | goto out; |
@@ -2424,50 +1656,13 @@ martian_source_keep_err: | |||
2424 | goto out; | 1656 | goto out; |
2425 | } | 1657 | } |
2426 | 1658 | ||
2427 | int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 1659 | int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
2428 | u8 tos, struct net_device *dev, bool noref) | 1660 | u8 tos, struct net_device *dev) |
2429 | { | 1661 | { |
2430 | struct rtable *rth; | ||
2431 | unsigned int hash; | ||
2432 | int iif = dev->ifindex; | ||
2433 | struct net *net; | ||
2434 | int res; | 1662 | int res; |
2435 | 1663 | ||
2436 | net = dev_net(dev); | ||
2437 | |||
2438 | rcu_read_lock(); | 1664 | rcu_read_lock(); |
2439 | 1665 | ||
2440 | if (!rt_caching(net)) | ||
2441 | goto skip_cache; | ||
2442 | |||
2443 | tos &= IPTOS_RT_MASK; | ||
2444 | hash = rt_hash(daddr, saddr, iif, rt_genid(net)); | ||
2445 | |||
2446 | for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; | ||
2447 | rth = rcu_dereference(rth->dst.rt_next)) { | ||
2448 | if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | | ||
2449 | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | | ||
2450 | (rth->rt_route_iif ^ iif) | | ||
2451 | (rth->rt_key_tos ^ tos)) == 0 && | ||
2452 | rth->rt_mark == skb->mark && | ||
2453 | net_eq(dev_net(rth->dst.dev), net) && | ||
2454 | !rt_is_expired(rth)) { | ||
2455 | ipv4_validate_peer(rth); | ||
2456 | if (noref) { | ||
2457 | dst_use_noref(&rth->dst, jiffies); | ||
2458 | skb_dst_set_noref(skb, &rth->dst); | ||
2459 | } else { | ||
2460 | dst_use(&rth->dst, jiffies); | ||
2461 | skb_dst_set(skb, &rth->dst); | ||
2462 | } | ||
2463 | RT_CACHE_STAT_INC(in_hit); | ||
2464 | rcu_read_unlock(); | ||
2465 | return 0; | ||
2466 | } | ||
2467 | RT_CACHE_STAT_INC(in_hlist_search); | ||
2468 | } | ||
2469 | |||
2470 | skip_cache: | ||
2471 | /* Multicast recognition logic is moved from route cache to here. | 1666 | /* Multicast recognition logic is moved from route cache to here. |
2472 | The problem was that too many Ethernet cards have broken/missing | 1667 | The problem was that too many Ethernet cards have broken/missing |
2473 | hardware multicast filters :-( As result the host on multicasting | 1668 | hardware multicast filters :-( As result the host on multicasting |
@@ -2505,24 +1700,28 @@ skip_cache: | |||
2505 | rcu_read_unlock(); | 1700 | rcu_read_unlock(); |
2506 | return res; | 1701 | return res; |
2507 | } | 1702 | } |
2508 | EXPORT_SYMBOL(ip_route_input_common); | 1703 | EXPORT_SYMBOL(ip_route_input); |
2509 | 1704 | ||
2510 | /* called with rcu_read_lock() */ | 1705 | /* called with rcu_read_lock() */ |
2511 | static struct rtable *__mkroute_output(const struct fib_result *res, | 1706 | static struct rtable *__mkroute_output(const struct fib_result *res, |
2512 | const struct flowi4 *fl4, | 1707 | const struct flowi4 *fl4, int orig_oif, |
2513 | __be32 orig_daddr, __be32 orig_saddr, | ||
2514 | int orig_oif, __u8 orig_rtos, | ||
2515 | struct net_device *dev_out, | 1708 | struct net_device *dev_out, |
2516 | unsigned int flags) | 1709 | unsigned int flags) |
2517 | { | 1710 | { |
2518 | struct fib_info *fi = res->fi; | 1711 | struct fib_info *fi = res->fi; |
1712 | struct fib_nh_exception *fnhe; | ||
2519 | struct in_device *in_dev; | 1713 | struct in_device *in_dev; |
2520 | u16 type = res->type; | 1714 | u16 type = res->type; |
2521 | struct rtable *rth; | 1715 | struct rtable *rth; |
2522 | 1716 | ||
2523 | if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) | 1717 | in_dev = __in_dev_get_rcu(dev_out); |
1718 | if (!in_dev) | ||
2524 | return ERR_PTR(-EINVAL); | 1719 | return ERR_PTR(-EINVAL); |
2525 | 1720 | ||
1721 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) | ||
1722 | if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) | ||
1723 | return ERR_PTR(-EINVAL); | ||
1724 | |||
2526 | if (ipv4_is_lbcast(fl4->daddr)) | 1725 | if (ipv4_is_lbcast(fl4->daddr)) |
2527 | type = RTN_BROADCAST; | 1726 | type = RTN_BROADCAST; |
2528 | else if (ipv4_is_multicast(fl4->daddr)) | 1727 | else if (ipv4_is_multicast(fl4->daddr)) |
@@ -2533,10 +1732,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
2533 | if (dev_out->flags & IFF_LOOPBACK) | 1732 | if (dev_out->flags & IFF_LOOPBACK) |
2534 | flags |= RTCF_LOCAL; | 1733 | flags |= RTCF_LOCAL; |
2535 | 1734 | ||
2536 | in_dev = __in_dev_get_rcu(dev_out); | ||
2537 | if (!in_dev) | ||
2538 | return ERR_PTR(-EINVAL); | ||
2539 | |||
2540 | if (type == RTN_BROADCAST) { | 1735 | if (type == RTN_BROADCAST) { |
2541 | flags |= RTCF_BROADCAST | RTCF_LOCAL; | 1736 | flags |= RTCF_BROADCAST | RTCF_LOCAL; |
2542 | fi = NULL; | 1737 | fi = NULL; |
@@ -2553,40 +1748,39 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
2553 | fi = NULL; | 1748 | fi = NULL; |
2554 | } | 1749 | } |
2555 | 1750 | ||
1751 | fnhe = NULL; | ||
1752 | if (fi) { | ||
1753 | fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); | ||
1754 | if (!fnhe) { | ||
1755 | rth = FIB_RES_NH(*res).nh_rth_output; | ||
1756 | if (rt_cache_valid(rth)) { | ||
1757 | dst_hold(&rth->dst); | ||
1758 | return rth; | ||
1759 | } | ||
1760 | } | ||
1761 | } | ||
2556 | rth = rt_dst_alloc(dev_out, | 1762 | rth = rt_dst_alloc(dev_out, |
2557 | IN_DEV_CONF_GET(in_dev, NOPOLICY), | 1763 | IN_DEV_CONF_GET(in_dev, NOPOLICY), |
2558 | IN_DEV_CONF_GET(in_dev, NOXFRM)); | 1764 | IN_DEV_CONF_GET(in_dev, NOXFRM), |
1765 | fi && !fnhe); | ||
2559 | if (!rth) | 1766 | if (!rth) |
2560 | return ERR_PTR(-ENOBUFS); | 1767 | return ERR_PTR(-ENOBUFS); |
2561 | 1768 | ||
2562 | rth->dst.output = ip_output; | 1769 | rth->dst.output = ip_output; |
2563 | 1770 | ||
2564 | rth->rt_key_dst = orig_daddr; | ||
2565 | rth->rt_key_src = orig_saddr; | ||
2566 | rth->rt_genid = rt_genid(dev_net(dev_out)); | 1771 | rth->rt_genid = rt_genid(dev_net(dev_out)); |
2567 | rth->rt_flags = flags; | 1772 | rth->rt_flags = flags; |
2568 | rth->rt_type = type; | 1773 | rth->rt_type = type; |
2569 | rth->rt_key_tos = orig_rtos; | 1774 | rth->rt_is_input = 0; |
2570 | rth->rt_dst = fl4->daddr; | 1775 | rth->rt_iif = orig_oif ? : 0; |
2571 | rth->rt_src = fl4->saddr; | 1776 | rth->rt_pmtu = 0; |
2572 | rth->rt_route_iif = 0; | 1777 | rth->rt_gateway = 0; |
2573 | rth->rt_iif = orig_oif ? : dev_out->ifindex; | ||
2574 | rth->rt_oif = orig_oif; | ||
2575 | rth->rt_mark = fl4->flowi4_mark; | ||
2576 | rth->rt_gateway = fl4->daddr; | ||
2577 | rth->rt_spec_dst= fl4->saddr; | ||
2578 | rth->rt_peer_genid = 0; | ||
2579 | rth->peer = NULL; | ||
2580 | rth->fi = NULL; | ||
2581 | 1778 | ||
2582 | RT_CACHE_STAT_INC(out_slow_tot); | 1779 | RT_CACHE_STAT_INC(out_slow_tot); |
2583 | 1780 | ||
2584 | if (flags & RTCF_LOCAL) { | 1781 | if (flags & RTCF_LOCAL) |
2585 | rth->dst.input = ip_local_deliver; | 1782 | rth->dst.input = ip_local_deliver; |
2586 | rth->rt_spec_dst = fl4->daddr; | ||
2587 | } | ||
2588 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { | 1783 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { |
2589 | rth->rt_spec_dst = fl4->saddr; | ||
2590 | if (flags & RTCF_LOCAL && | 1784 | if (flags & RTCF_LOCAL && |
2591 | !(dev_out->flags & IFF_LOOPBACK)) { | 1785 | !(dev_out->flags & IFF_LOOPBACK)) { |
2592 | rth->dst.output = ip_mc_output; | 1786 | rth->dst.output = ip_mc_output; |
@@ -2603,34 +1797,28 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
2603 | #endif | 1797 | #endif |
2604 | } | 1798 | } |
2605 | 1799 | ||
2606 | rt_set_nexthop(rth, fl4, res, fi, type, 0); | 1800 | rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); |
2607 | 1801 | ||
2608 | return rth; | 1802 | return rth; |
2609 | } | 1803 | } |
2610 | 1804 | ||
2611 | /* | 1805 | /* |
2612 | * Major route resolver routine. | 1806 | * Major route resolver routine. |
2613 | * called with rcu_read_lock(); | ||
2614 | */ | 1807 | */ |
2615 | 1808 | ||
2616 | static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) | 1809 | struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) |
2617 | { | 1810 | { |
2618 | struct net_device *dev_out = NULL; | 1811 | struct net_device *dev_out = NULL; |
2619 | __u8 tos = RT_FL_TOS(fl4); | 1812 | __u8 tos = RT_FL_TOS(fl4); |
2620 | unsigned int flags = 0; | 1813 | unsigned int flags = 0; |
2621 | struct fib_result res; | 1814 | struct fib_result res; |
2622 | struct rtable *rth; | 1815 | struct rtable *rth; |
2623 | __be32 orig_daddr; | ||
2624 | __be32 orig_saddr; | ||
2625 | int orig_oif; | 1816 | int orig_oif; |
2626 | 1817 | ||
1818 | res.tclassid = 0; | ||
2627 | res.fi = NULL; | 1819 | res.fi = NULL; |
2628 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 1820 | res.table = NULL; |
2629 | res.r = NULL; | ||
2630 | #endif | ||
2631 | 1821 | ||
2632 | orig_daddr = fl4->daddr; | ||
2633 | orig_saddr = fl4->saddr; | ||
2634 | orig_oif = fl4->flowi4_oif; | 1822 | orig_oif = fl4->flowi4_oif; |
2635 | 1823 | ||
2636 | fl4->flowi4_iif = net->loopback_dev->ifindex; | 1824 | fl4->flowi4_iif = net->loopback_dev->ifindex; |
@@ -2730,6 +1918,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) | |||
2730 | 1918 | ||
2731 | if (fib_lookup(net, fl4, &res)) { | 1919 | if (fib_lookup(net, fl4, &res)) { |
2732 | res.fi = NULL; | 1920 | res.fi = NULL; |
1921 | res.table = NULL; | ||
2733 | if (fl4->flowi4_oif) { | 1922 | if (fl4->flowi4_oif) { |
2734 | /* Apparently, routing tables are wrong. Assume, | 1923 | /* Apparently, routing tables are wrong. Assume, |
2735 | that the destination is on link. | 1924 | that the destination is on link. |
@@ -2791,60 +1980,12 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) | |||
2791 | 1980 | ||
2792 | 1981 | ||
2793 | make_route: | 1982 | make_route: |
2794 | rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, | 1983 | rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); |
2795 | tos, dev_out, flags); | ||
2796 | if (!IS_ERR(rth)) { | ||
2797 | unsigned int hash; | ||
2798 | |||
2799 | hash = rt_hash(orig_daddr, orig_saddr, orig_oif, | ||
2800 | rt_genid(dev_net(dev_out))); | ||
2801 | rth = rt_intern_hash(hash, rth, NULL, orig_oif); | ||
2802 | } | ||
2803 | 1984 | ||
2804 | out: | 1985 | out: |
2805 | rcu_read_unlock(); | 1986 | rcu_read_unlock(); |
2806 | return rth; | 1987 | return rth; |
2807 | } | 1988 | } |
2808 | |||
2809 | struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) | ||
2810 | { | ||
2811 | struct rtable *rth; | ||
2812 | unsigned int hash; | ||
2813 | |||
2814 | if (!rt_caching(net)) | ||
2815 | goto slow_output; | ||
2816 | |||
2817 | hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); | ||
2818 | |||
2819 | rcu_read_lock_bh(); | ||
2820 | for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; | ||
2821 | rth = rcu_dereference_bh(rth->dst.rt_next)) { | ||
2822 | if (rth->rt_key_dst == flp4->daddr && | ||
2823 | rth->rt_key_src == flp4->saddr && | ||
2824 | rt_is_output_route(rth) && | ||
2825 | rth->rt_oif == flp4->flowi4_oif && | ||
2826 | rth->rt_mark == flp4->flowi4_mark && | ||
2827 | !((rth->rt_key_tos ^ flp4->flowi4_tos) & | ||
2828 | (IPTOS_RT_MASK | RTO_ONLINK)) && | ||
2829 | net_eq(dev_net(rth->dst.dev), net) && | ||
2830 | !rt_is_expired(rth)) { | ||
2831 | ipv4_validate_peer(rth); | ||
2832 | dst_use(&rth->dst, jiffies); | ||
2833 | RT_CACHE_STAT_INC(out_hit); | ||
2834 | rcu_read_unlock_bh(); | ||
2835 | if (!flp4->saddr) | ||
2836 | flp4->saddr = rth->rt_src; | ||
2837 | if (!flp4->daddr) | ||
2838 | flp4->daddr = rth->rt_dst; | ||
2839 | return rth; | ||
2840 | } | ||
2841 | RT_CACHE_STAT_INC(out_hlist_search); | ||
2842 | } | ||
2843 | rcu_read_unlock_bh(); | ||
2844 | |||
2845 | slow_output: | ||
2846 | return ip_route_output_slow(net, flp4); | ||
2847 | } | ||
2848 | EXPORT_SYMBOL_GPL(__ip_route_output_key); | 1989 | EXPORT_SYMBOL_GPL(__ip_route_output_key); |
2849 | 1990 | ||
2850 | static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) | 1991 | static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) |
@@ -2859,7 +2000,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) | |||
2859 | return mtu ? : dst->dev->mtu; | 2000 | return mtu ? : dst->dev->mtu; |
2860 | } | 2001 | } |
2861 | 2002 | ||
2862 | static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) | 2003 | static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, |
2004 | struct sk_buff *skb, u32 mtu) | ||
2005 | { | ||
2006 | } | ||
2007 | |||
2008 | static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, | ||
2009 | struct sk_buff *skb) | ||
2863 | { | 2010 | { |
2864 | } | 2011 | } |
2865 | 2012 | ||
@@ -2872,53 +2019,40 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, | |||
2872 | static struct dst_ops ipv4_dst_blackhole_ops = { | 2019 | static struct dst_ops ipv4_dst_blackhole_ops = { |
2873 | .family = AF_INET, | 2020 | .family = AF_INET, |
2874 | .protocol = cpu_to_be16(ETH_P_IP), | 2021 | .protocol = cpu_to_be16(ETH_P_IP), |
2875 | .destroy = ipv4_dst_destroy, | ||
2876 | .check = ipv4_blackhole_dst_check, | 2022 | .check = ipv4_blackhole_dst_check, |
2877 | .mtu = ipv4_blackhole_mtu, | 2023 | .mtu = ipv4_blackhole_mtu, |
2878 | .default_advmss = ipv4_default_advmss, | 2024 | .default_advmss = ipv4_default_advmss, |
2879 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, | 2025 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, |
2026 | .redirect = ipv4_rt_blackhole_redirect, | ||
2880 | .cow_metrics = ipv4_rt_blackhole_cow_metrics, | 2027 | .cow_metrics = ipv4_rt_blackhole_cow_metrics, |
2881 | .neigh_lookup = ipv4_neigh_lookup, | 2028 | .neigh_lookup = ipv4_neigh_lookup, |
2882 | }; | 2029 | }; |
2883 | 2030 | ||
2884 | struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) | 2031 | struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) |
2885 | { | 2032 | { |
2886 | struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); | ||
2887 | struct rtable *ort = (struct rtable *) dst_orig; | 2033 | struct rtable *ort = (struct rtable *) dst_orig; |
2034 | struct rtable *rt; | ||
2888 | 2035 | ||
2036 | rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); | ||
2889 | if (rt) { | 2037 | if (rt) { |
2890 | struct dst_entry *new = &rt->dst; | 2038 | struct dst_entry *new = &rt->dst; |
2891 | 2039 | ||
2892 | new->__use = 1; | 2040 | new->__use = 1; |
2893 | new->input = dst_discard; | 2041 | new->input = dst_discard; |
2894 | new->output = dst_discard; | 2042 | new->output = dst_discard; |
2895 | dst_copy_metrics(new, &ort->dst); | ||
2896 | 2043 | ||
2897 | new->dev = ort->dst.dev; | 2044 | new->dev = ort->dst.dev; |
2898 | if (new->dev) | 2045 | if (new->dev) |
2899 | dev_hold(new->dev); | 2046 | dev_hold(new->dev); |
2900 | 2047 | ||
2901 | rt->rt_key_dst = ort->rt_key_dst; | 2048 | rt->rt_is_input = ort->rt_is_input; |
2902 | rt->rt_key_src = ort->rt_key_src; | ||
2903 | rt->rt_key_tos = ort->rt_key_tos; | ||
2904 | rt->rt_route_iif = ort->rt_route_iif; | ||
2905 | rt->rt_iif = ort->rt_iif; | 2049 | rt->rt_iif = ort->rt_iif; |
2906 | rt->rt_oif = ort->rt_oif; | 2050 | rt->rt_pmtu = ort->rt_pmtu; |
2907 | rt->rt_mark = ort->rt_mark; | ||
2908 | 2051 | ||
2909 | rt->rt_genid = rt_genid(net); | 2052 | rt->rt_genid = rt_genid(net); |
2910 | rt->rt_flags = ort->rt_flags; | 2053 | rt->rt_flags = ort->rt_flags; |
2911 | rt->rt_type = ort->rt_type; | 2054 | rt->rt_type = ort->rt_type; |
2912 | rt->rt_dst = ort->rt_dst; | ||
2913 | rt->rt_src = ort->rt_src; | ||
2914 | rt->rt_gateway = ort->rt_gateway; | 2055 | rt->rt_gateway = ort->rt_gateway; |
2915 | rt->rt_spec_dst = ort->rt_spec_dst; | ||
2916 | rt->peer = ort->peer; | ||
2917 | if (rt->peer) | ||
2918 | atomic_inc(&rt->peer->refcnt); | ||
2919 | rt->fi = ort->fi; | ||
2920 | if (rt->fi) | ||
2921 | atomic_inc(&rt->fi->fib_clntref); | ||
2922 | 2056 | ||
2923 | dst_free(new); | 2057 | dst_free(new); |
2924 | } | 2058 | } |
@@ -2945,16 +2079,16 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, | |||
2945 | } | 2079 | } |
2946 | EXPORT_SYMBOL_GPL(ip_route_output_flow); | 2080 | EXPORT_SYMBOL_GPL(ip_route_output_flow); |
2947 | 2081 | ||
2948 | static int rt_fill_info(struct net *net, | 2082 | static int rt_fill_info(struct net *net, __be32 dst, __be32 src, |
2949 | struct sk_buff *skb, u32 pid, u32 seq, int event, | 2083 | struct flowi4 *fl4, struct sk_buff *skb, u32 pid, |
2950 | int nowait, unsigned int flags) | 2084 | u32 seq, int event, int nowait, unsigned int flags) |
2951 | { | 2085 | { |
2952 | struct rtable *rt = skb_rtable(skb); | 2086 | struct rtable *rt = skb_rtable(skb); |
2953 | struct rtmsg *r; | 2087 | struct rtmsg *r; |
2954 | struct nlmsghdr *nlh; | 2088 | struct nlmsghdr *nlh; |
2955 | unsigned long expires = 0; | 2089 | unsigned long expires = 0; |
2956 | const struct inet_peer *peer = rt->peer; | 2090 | u32 error; |
2957 | u32 id = 0, ts = 0, tsage = 0, error; | 2091 | u32 metrics[RTAX_MAX]; |
2958 | 2092 | ||
2959 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); | 2093 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); |
2960 | if (nlh == NULL) | 2094 | if (nlh == NULL) |
@@ -2964,7 +2098,7 @@ static int rt_fill_info(struct net *net, | |||
2964 | r->rtm_family = AF_INET; | 2098 | r->rtm_family = AF_INET; |
2965 | r->rtm_dst_len = 32; | 2099 | r->rtm_dst_len = 32; |
2966 | r->rtm_src_len = 0; | 2100 | r->rtm_src_len = 0; |
2967 | r->rtm_tos = rt->rt_key_tos; | 2101 | r->rtm_tos = fl4->flowi4_tos; |
2968 | r->rtm_table = RT_TABLE_MAIN; | 2102 | r->rtm_table = RT_TABLE_MAIN; |
2969 | if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) | 2103 | if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) |
2970 | goto nla_put_failure; | 2104 | goto nla_put_failure; |
@@ -2975,11 +2109,11 @@ static int rt_fill_info(struct net *net, | |||
2975 | if (rt->rt_flags & RTCF_NOTIFY) | 2109 | if (rt->rt_flags & RTCF_NOTIFY) |
2976 | r->rtm_flags |= RTM_F_NOTIFY; | 2110 | r->rtm_flags |= RTM_F_NOTIFY; |
2977 | 2111 | ||
2978 | if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) | 2112 | if (nla_put_be32(skb, RTA_DST, dst)) |
2979 | goto nla_put_failure; | 2113 | goto nla_put_failure; |
2980 | if (rt->rt_key_src) { | 2114 | if (src) { |
2981 | r->rtm_src_len = 32; | 2115 | r->rtm_src_len = 32; |
2982 | if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) | 2116 | if (nla_put_be32(skb, RTA_SRC, src)) |
2983 | goto nla_put_failure; | 2117 | goto nla_put_failure; |
2984 | } | 2118 | } |
2985 | if (rt->dst.dev && | 2119 | if (rt->dst.dev && |
@@ -2990,69 +2124,40 @@ static int rt_fill_info(struct net *net, | |||
2990 | nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) | 2124 | nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) |
2991 | goto nla_put_failure; | 2125 | goto nla_put_failure; |
2992 | #endif | 2126 | #endif |
2993 | if (rt_is_input_route(rt)) { | 2127 | if (!rt_is_input_route(rt) && |
2994 | if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) | 2128 | fl4->saddr != src) { |
2995 | goto nla_put_failure; | 2129 | if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) |
2996 | } else if (rt->rt_src != rt->rt_key_src) { | ||
2997 | if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) | ||
2998 | goto nla_put_failure; | 2130 | goto nla_put_failure; |
2999 | } | 2131 | } |
3000 | if (rt->rt_dst != rt->rt_gateway && | 2132 | if (rt->rt_gateway && |
3001 | nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) | 2133 | nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) |
3002 | goto nla_put_failure; | 2134 | goto nla_put_failure; |
3003 | 2135 | ||
3004 | if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) | 2136 | memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); |
2137 | if (rt->rt_pmtu) | ||
2138 | metrics[RTAX_MTU - 1] = rt->rt_pmtu; | ||
2139 | if (rtnetlink_put_metrics(skb, metrics) < 0) | ||
3005 | goto nla_put_failure; | 2140 | goto nla_put_failure; |
3006 | 2141 | ||
3007 | if (rt->rt_mark && | 2142 | if (fl4->flowi4_mark && |
3008 | nla_put_be32(skb, RTA_MARK, rt->rt_mark)) | 2143 | nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark)) |
3009 | goto nla_put_failure; | 2144 | goto nla_put_failure; |
3010 | 2145 | ||
3011 | error = rt->dst.error; | 2146 | error = rt->dst.error; |
3012 | if (peer) { | 2147 | expires = rt->dst.expires; |
3013 | inet_peer_refcheck(rt->peer); | 2148 | if (expires) { |
3014 | id = atomic_read(&peer->ip_id_count) & 0xffff; | 2149 | if (time_before(jiffies, expires)) |
3015 | if (peer->tcp_ts_stamp) { | 2150 | expires -= jiffies; |
3016 | ts = peer->tcp_ts; | 2151 | else |
3017 | tsage = get_seconds() - peer->tcp_ts_stamp; | 2152 | expires = 0; |
3018 | } | ||
3019 | expires = ACCESS_ONCE(peer->pmtu_expires); | ||
3020 | if (expires) { | ||
3021 | if (time_before(jiffies, expires)) | ||
3022 | expires -= jiffies; | ||
3023 | else | ||
3024 | expires = 0; | ||
3025 | } | ||
3026 | } | 2153 | } |
3027 | 2154 | ||
3028 | if (rt_is_input_route(rt)) { | 2155 | if (rt_is_input_route(rt)) { |
3029 | #ifdef CONFIG_IP_MROUTE | 2156 | if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) |
3030 | __be32 dst = rt->rt_dst; | 2157 | goto nla_put_failure; |
3031 | |||
3032 | if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && | ||
3033 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { | ||
3034 | int err = ipmr_get_route(net, skb, | ||
3035 | rt->rt_src, rt->rt_dst, | ||
3036 | r, nowait); | ||
3037 | if (err <= 0) { | ||
3038 | if (!nowait) { | ||
3039 | if (err == 0) | ||
3040 | return 0; | ||
3041 | goto nla_put_failure; | ||
3042 | } else { | ||
3043 | if (err == -EMSGSIZE) | ||
3044 | goto nla_put_failure; | ||
3045 | error = err; | ||
3046 | } | ||
3047 | } | ||
3048 | } else | ||
3049 | #endif | ||
3050 | if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) | ||
3051 | goto nla_put_failure; | ||
3052 | } | 2158 | } |
3053 | 2159 | ||
3054 | if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, | 2160 | if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) |
3055 | expires, error) < 0) | ||
3056 | goto nla_put_failure; | 2161 | goto nla_put_failure; |
3057 | 2162 | ||
3058 | return nlmsg_end(skb, nlh); | 2163 | return nlmsg_end(skb, nlh); |
@@ -3068,6 +2173,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void | |||
3068 | struct rtmsg *rtm; | 2173 | struct rtmsg *rtm; |
3069 | struct nlattr *tb[RTA_MAX+1]; | 2174 | struct nlattr *tb[RTA_MAX+1]; |
3070 | struct rtable *rt = NULL; | 2175 | struct rtable *rt = NULL; |
2176 | struct flowi4 fl4; | ||
3071 | __be32 dst = 0; | 2177 | __be32 dst = 0; |
3072 | __be32 src = 0; | 2178 | __be32 src = 0; |
3073 | u32 iif; | 2179 | u32 iif; |
@@ -3102,6 +2208,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void | |||
3102 | iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; | 2208 | iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; |
3103 | mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; | 2209 | mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; |
3104 | 2210 | ||
2211 | memset(&fl4, 0, sizeof(fl4)); | ||
2212 | fl4.daddr = dst; | ||
2213 | fl4.saddr = src; | ||
2214 | fl4.flowi4_tos = rtm->rtm_tos; | ||
2215 | fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; | ||
2216 | fl4.flowi4_mark = mark; | ||
2217 | |||
3105 | if (iif) { | 2218 | if (iif) { |
3106 | struct net_device *dev; | 2219 | struct net_device *dev; |
3107 | 2220 | ||
@@ -3122,13 +2235,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void | |||
3122 | if (err == 0 && rt->dst.error) | 2235 | if (err == 0 && rt->dst.error) |
3123 | err = -rt->dst.error; | 2236 | err = -rt->dst.error; |
3124 | } else { | 2237 | } else { |
3125 | struct flowi4 fl4 = { | ||
3126 | .daddr = dst, | ||
3127 | .saddr = src, | ||
3128 | .flowi4_tos = rtm->rtm_tos, | ||
3129 | .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, | ||
3130 | .flowi4_mark = mark, | ||
3131 | }; | ||
3132 | rt = ip_route_output_key(net, &fl4); | 2238 | rt = ip_route_output_key(net, &fl4); |
3133 | 2239 | ||
3134 | err = 0; | 2240 | err = 0; |
@@ -3143,7 +2249,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void | |||
3143 | if (rtm->rtm_flags & RTM_F_NOTIFY) | 2249 | if (rtm->rtm_flags & RTM_F_NOTIFY) |
3144 | rt->rt_flags |= RTCF_NOTIFY; | 2250 | rt->rt_flags |= RTCF_NOTIFY; |
3145 | 2251 | ||
3146 | err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, | 2252 | err = rt_fill_info(net, dst, src, &fl4, skb, |
2253 | NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, | ||
3147 | RTM_NEWROUTE, 0, 0); | 2254 | RTM_NEWROUTE, 0, 0); |
3148 | if (err <= 0) | 2255 | if (err <= 0) |
3149 | goto errout_free; | 2256 | goto errout_free; |
@@ -3159,43 +2266,6 @@ errout_free: | |||
3159 | 2266 | ||
3160 | int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) | 2267 | int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) |
3161 | { | 2268 | { |
3162 | struct rtable *rt; | ||
3163 | int h, s_h; | ||
3164 | int idx, s_idx; | ||
3165 | struct net *net; | ||
3166 | |||
3167 | net = sock_net(skb->sk); | ||
3168 | |||
3169 | s_h = cb->args[0]; | ||
3170 | if (s_h < 0) | ||
3171 | s_h = 0; | ||
3172 | s_idx = idx = cb->args[1]; | ||
3173 | for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { | ||
3174 | if (!rt_hash_table[h].chain) | ||
3175 | continue; | ||
3176 | rcu_read_lock_bh(); | ||
3177 | for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; | ||
3178 | rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { | ||
3179 | if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) | ||
3180 | continue; | ||
3181 | if (rt_is_expired(rt)) | ||
3182 | continue; | ||
3183 | skb_dst_set_noref(skb, &rt->dst); | ||
3184 | if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, | ||
3185 | cb->nlh->nlmsg_seq, RTM_NEWROUTE, | ||
3186 | 1, NLM_F_MULTI) <= 0) { | ||
3187 | skb_dst_drop(skb); | ||
3188 | rcu_read_unlock_bh(); | ||
3189 | goto done; | ||
3190 | } | ||
3191 | skb_dst_drop(skb); | ||
3192 | } | ||
3193 | rcu_read_unlock_bh(); | ||
3194 | } | ||
3195 | |||
3196 | done: | ||
3197 | cb->args[0] = h; | ||
3198 | cb->args[1] = idx; | ||
3199 | return skb->len; | 2269 | return skb->len; |
3200 | } | 2270 | } |
3201 | 2271 | ||
@@ -3400,26 +2470,34 @@ static __net_initdata struct pernet_operations rt_genid_ops = { | |||
3400 | .init = rt_genid_init, | 2470 | .init = rt_genid_init, |
3401 | }; | 2471 | }; |
3402 | 2472 | ||
2473 | static int __net_init ipv4_inetpeer_init(struct net *net) | ||
2474 | { | ||
2475 | struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); | ||
3403 | 2476 | ||
3404 | #ifdef CONFIG_IP_ROUTE_CLASSID | 2477 | if (!bp) |
3405 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; | 2478 | return -ENOMEM; |
3406 | #endif /* CONFIG_IP_ROUTE_CLASSID */ | 2479 | inet_peer_base_init(bp); |
2480 | net->ipv4.peers = bp; | ||
2481 | return 0; | ||
2482 | } | ||
3407 | 2483 | ||
3408 | static __initdata unsigned long rhash_entries; | 2484 | static void __net_exit ipv4_inetpeer_exit(struct net *net) |
3409 | static int __init set_rhash_entries(char *str) | ||
3410 | { | 2485 | { |
3411 | ssize_t ret; | 2486 | struct inet_peer_base *bp = net->ipv4.peers; |
3412 | 2487 | ||
3413 | if (!str) | 2488 | net->ipv4.peers = NULL; |
3414 | return 0; | 2489 | inetpeer_invalidate_tree(bp); |
2490 | kfree(bp); | ||
2491 | } | ||
3415 | 2492 | ||
3416 | ret = kstrtoul(str, 0, &rhash_entries); | 2493 | static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { |
3417 | if (ret) | 2494 | .init = ipv4_inetpeer_init, |
3418 | return 0; | 2495 | .exit = ipv4_inetpeer_exit, |
2496 | }; | ||
3419 | 2497 | ||
3420 | return 1; | 2498 | #ifdef CONFIG_IP_ROUTE_CLASSID |
3421 | } | 2499 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; |
3422 | __setup("rhash_entries=", set_rhash_entries); | 2500 | #endif /* CONFIG_IP_ROUTE_CLASSID */ |
3423 | 2501 | ||
3424 | int __init ip_rt_init(void) | 2502 | int __init ip_rt_init(void) |
3425 | { | 2503 | { |
@@ -3443,31 +2521,12 @@ int __init ip_rt_init(void) | |||
3443 | if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) | 2521 | if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) |
3444 | panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); | 2522 | panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); |
3445 | 2523 | ||
3446 | rt_hash_table = (struct rt_hash_bucket *) | 2524 | ipv4_dst_ops.gc_thresh = ~0; |
3447 | alloc_large_system_hash("IP route cache", | 2525 | ip_rt_max_size = INT_MAX; |
3448 | sizeof(struct rt_hash_bucket), | ||
3449 | rhash_entries, | ||
3450 | (totalram_pages >= 128 * 1024) ? | ||
3451 | 15 : 17, | ||
3452 | 0, | ||
3453 | &rt_hash_log, | ||
3454 | &rt_hash_mask, | ||
3455 | 0, | ||
3456 | rhash_entries ? 0 : 512 * 1024); | ||
3457 | memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); | ||
3458 | rt_hash_lock_init(); | ||
3459 | |||
3460 | ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); | ||
3461 | ip_rt_max_size = (rt_hash_mask + 1) * 16; | ||
3462 | 2526 | ||
3463 | devinet_init(); | 2527 | devinet_init(); |
3464 | ip_fib_init(); | 2528 | ip_fib_init(); |
3465 | 2529 | ||
3466 | INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); | ||
3467 | expires_ljiffies = jiffies; | ||
3468 | schedule_delayed_work(&expires_work, | ||
3469 | net_random() % ip_rt_gc_interval + ip_rt_gc_interval); | ||
3470 | |||
3471 | if (ip_rt_proc_init()) | 2530 | if (ip_rt_proc_init()) |
3472 | pr_err("Unable to create route proc files\n"); | 2531 | pr_err("Unable to create route proc files\n"); |
3473 | #ifdef CONFIG_XFRM | 2532 | #ifdef CONFIG_XFRM |
@@ -3480,6 +2539,7 @@ int __init ip_rt_init(void) | |||
3480 | register_pernet_subsys(&sysctl_route_ops); | 2539 | register_pernet_subsys(&sysctl_route_ops); |
3481 | #endif | 2540 | #endif |
3482 | register_pernet_subsys(&rt_genid_ops); | 2541 | register_pernet_subsys(&rt_genid_ops); |
2542 | register_pernet_subsys(&ipv4_inetpeer_ops); | ||
3483 | return rc; | 2543 | return rc; |
3484 | } | 2544 | } |
3485 | 2545 | ||
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eab2a7fb15d1..650e1528e1e6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
293 | 293 | ||
294 | /* check for timestamp cookie support */ | 294 | /* check for timestamp cookie support */ |
295 | memset(&tcp_opt, 0, sizeof(tcp_opt)); | 295 | memset(&tcp_opt, 0, sizeof(tcp_opt)); |
296 | tcp_parse_options(skb, &tcp_opt, &hash_location, 0); | 296 | tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); |
297 | 297 | ||
298 | if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) | 298 | if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) |
299 | goto out; | 299 | goto out; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef32956ed655..5840c3255721 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -301,6 +301,13 @@ static struct ctl_table ipv4_table[] = { | |||
301 | .proc_handler = proc_dointvec | 301 | .proc_handler = proc_dointvec |
302 | }, | 302 | }, |
303 | { | 303 | { |
304 | .procname = "ip_early_demux", | ||
305 | .data = &sysctl_ip_early_demux, | ||
306 | .maxlen = sizeof(int), | ||
307 | .mode = 0644, | ||
308 | .proc_handler = proc_dointvec | ||
309 | }, | ||
310 | { | ||
304 | .procname = "ip_dynaddr", | 311 | .procname = "ip_dynaddr", |
305 | .data = &sysctl_ip_dynaddr, | 312 | .data = &sysctl_ip_dynaddr, |
306 | .maxlen = sizeof(int), | 313 | .maxlen = sizeof(int), |
@@ -360,6 +367,13 @@ static struct ctl_table ipv4_table[] = { | |||
360 | }, | 367 | }, |
361 | #endif | 368 | #endif |
362 | { | 369 | { |
370 | .procname = "tcp_fastopen", | ||
371 | .data = &sysctl_tcp_fastopen, | ||
372 | .maxlen = sizeof(int), | ||
373 | .mode = 0644, | ||
374 | .proc_handler = proc_dointvec, | ||
375 | }, | ||
376 | { | ||
363 | .procname = "tcp_tw_recycle", | 377 | .procname = "tcp_tw_recycle", |
364 | .data = &tcp_death_row.sysctl_tw_recycle, | 378 | .data = &tcp_death_row.sysctl_tw_recycle, |
365 | .maxlen = sizeof(int), | 379 | .maxlen = sizeof(int), |
@@ -591,6 +605,20 @@ static struct ctl_table ipv4_table[] = { | |||
591 | .mode = 0644, | 605 | .mode = 0644, |
592 | .proc_handler = proc_dointvec | 606 | .proc_handler = proc_dointvec |
593 | }, | 607 | }, |
608 | { | ||
609 | .procname = "tcp_limit_output_bytes", | ||
610 | .data = &sysctl_tcp_limit_output_bytes, | ||
611 | .maxlen = sizeof(int), | ||
612 | .mode = 0644, | ||
613 | .proc_handler = proc_dointvec | ||
614 | }, | ||
615 | { | ||
616 | .procname = "tcp_challenge_ack_limit", | ||
617 | .data = &sysctl_tcp_challenge_ack_limit, | ||
618 | .maxlen = sizeof(int), | ||
619 | .mode = 0644, | ||
620 | .proc_handler = proc_dointvec | ||
621 | }, | ||
594 | #ifdef CONFIG_NET_DMA | 622 | #ifdef CONFIG_NET_DMA |
595 | { | 623 | { |
596 | .procname = "tcp_dma_copybreak", | 624 | .procname = "tcp_dma_copybreak", |
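
The new ctl_table entries above (ip_early_demux, tcp_fastopen, tcp_limit_output_bytes, tcp_challenge_ack_limit) are all plain integers handled by proc_dointvec, so they are read and written like any other ipv4 sysctl. A small example, assuming the usual /proc/sys/net/ipv4/ location for this table:

/* Read one of the newly added integer sysctls via procfs.
 * Path assumed from the procname fields in the table above. */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/net/ipv4/tcp_fastopen", "r");
    int val;

    if (!f) {
        perror("open tcp_fastopen");
        return 1;
    }
    if (fscanf(f, "%d", &val) == 1)
        printf("tcp_fastopen = %d\n", val);
    fclose(f);
    return 0;
}
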
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3ba605f60e4e..581ecf02c6b5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -270,6 +270,7 @@ | |||
270 | #include <linux/slab.h> | 270 | #include <linux/slab.h> |
271 | 271 | ||
272 | #include <net/icmp.h> | 272 | #include <net/icmp.h> |
273 | #include <net/inet_common.h> | ||
273 | #include <net/tcp.h> | 274 | #include <net/tcp.h> |
274 | #include <net/xfrm.h> | 275 | #include <net/xfrm.h> |
275 | #include <net/ip.h> | 276 | #include <net/ip.h> |
@@ -376,6 +377,7 @@ void tcp_init_sock(struct sock *sk) | |||
376 | skb_queue_head_init(&tp->out_of_order_queue); | 377 | skb_queue_head_init(&tp->out_of_order_queue); |
377 | tcp_init_xmit_timers(sk); | 378 | tcp_init_xmit_timers(sk); |
378 | tcp_prequeue_init(tp); | 379 | tcp_prequeue_init(tp); |
380 | INIT_LIST_HEAD(&tp->tsq_node); | ||
379 | 381 | ||
380 | icsk->icsk_rto = TCP_TIMEOUT_INIT; | 382 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
381 | tp->mdev = TCP_TIMEOUT_INIT; | 383 | tp->mdev = TCP_TIMEOUT_INIT; |
@@ -796,6 +798,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
796 | inet_csk(sk)->icsk_ext_hdr_len - | 798 | inet_csk(sk)->icsk_ext_hdr_len - |
797 | tp->tcp_header_len); | 799 | tp->tcp_header_len); |
798 | 800 | ||
801 | /* TSQ : try to have two TSO segments in flight */ | ||
802 | xmit_size_goal = min_t(u32, xmit_size_goal, | ||
803 | sysctl_tcp_limit_output_bytes >> 1); | ||
804 | |||
799 | xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); | 805 | xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); |
800 | 806 | ||
801 | /* We try hard to avoid divides here */ | 807 | /* We try hard to avoid divides here */ |
@@ -977,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg) | |||
977 | return tmp; | 983 | return tmp; |
978 | } | 984 | } |
979 | 985 | ||
986 | void tcp_free_fastopen_req(struct tcp_sock *tp) | ||
987 | { | ||
988 | if (tp->fastopen_req != NULL) { | ||
989 | kfree(tp->fastopen_req); | ||
990 | tp->fastopen_req = NULL; | ||
991 | } | ||
992 | } | ||
993 | |||
994 | static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) | ||
995 | { | ||
996 | struct tcp_sock *tp = tcp_sk(sk); | ||
997 | int err, flags; | ||
998 | |||
999 | if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) | ||
1000 | return -EOPNOTSUPP; | ||
1001 | if (tp->fastopen_req != NULL) | ||
1002 | return -EALREADY; /* Another Fast Open is in progress */ | ||
1003 | |||
1004 | tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), | ||
1005 | sk->sk_allocation); | ||
1006 | if (unlikely(tp->fastopen_req == NULL)) | ||
1007 | return -ENOBUFS; | ||
1008 | tp->fastopen_req->data = msg; | ||
1009 | |||
1010 | flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; | ||
1011 | err = __inet_stream_connect(sk->sk_socket, msg->msg_name, | ||
1012 | msg->msg_namelen, flags); | ||
1013 | *size = tp->fastopen_req->copied; | ||
1014 | tcp_free_fastopen_req(tp); | ||
1015 | return err; | ||
1016 | } | ||
1017 | |||
980 | int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 1018 | int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
981 | size_t size) | 1019 | size_t size) |
982 | { | 1020 | { |
983 | struct iovec *iov; | 1021 | struct iovec *iov; |
984 | struct tcp_sock *tp = tcp_sk(sk); | 1022 | struct tcp_sock *tp = tcp_sk(sk); |
985 | struct sk_buff *skb; | 1023 | struct sk_buff *skb; |
986 | int iovlen, flags, err, copied; | 1024 | int iovlen, flags, err, copied = 0; |
987 | int mss_now = 0, size_goal; | 1025 | int mss_now = 0, size_goal, copied_syn = 0, offset = 0; |
988 | bool sg; | 1026 | bool sg; |
989 | long timeo; | 1027 | long timeo; |
990 | 1028 | ||
991 | lock_sock(sk); | 1029 | lock_sock(sk); |
992 | 1030 | ||
993 | flags = msg->msg_flags; | 1031 | flags = msg->msg_flags; |
1032 | if (flags & MSG_FASTOPEN) { | ||
1033 | err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); | ||
1034 | if (err == -EINPROGRESS && copied_syn > 0) | ||
1035 | goto out; | ||
1036 | else if (err) | ||
1037 | goto out_err; | ||
1038 | offset = copied_syn; | ||
1039 | } | ||
1040 | |||
994 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 1041 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
995 | 1042 | ||
996 | /* Wait for a connection to finish. */ | 1043 | /* Wait for a connection to finish. */ |
997 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 1044 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
998 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 1045 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
999 | goto out_err; | 1046 | goto do_error; |
1000 | 1047 | ||
1001 | if (unlikely(tp->repair)) { | 1048 | if (unlikely(tp->repair)) { |
1002 | if (tp->repair_queue == TCP_RECV_QUEUE) { | 1049 | if (tp->repair_queue == TCP_RECV_QUEUE) { |
@@ -1032,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1032 | unsigned char __user *from = iov->iov_base; | 1079 | unsigned char __user *from = iov->iov_base; |
1033 | 1080 | ||
1034 | iov++; | 1081 | iov++; |
1082 | if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ | ||
1083 | if (offset >= seglen) { | ||
1084 | offset -= seglen; | ||
1085 | continue; | ||
1086 | } | ||
1087 | seglen -= offset; | ||
1088 | from += offset; | ||
1089 | offset = 0; | ||
1090 | } | ||
1035 | 1091 | ||
1036 | while (seglen > 0) { | 1092 | while (seglen > 0) { |
1037 | int copy = 0; | 1093 | int copy = 0; |
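
The hunk above skips over the bytes that were already carried in the SYN (copied_syn becomes the initial offset) before the normal per-iovec copy loop resumes. A standalone restatement of the same skip-across-iovec pattern, not kernel code:

/* Skip `offset` bytes that were already consumed, then walk the rest
 * of the iovec array -- the same pattern as the hunk above. */
#include <stdio.h>
#include <sys/uio.h>

static void walk(const struct iovec *iov, int iovlen, size_t offset)
{
    for (int i = 0; i < iovlen; i++) {
        const char *from = iov[i].iov_base;
        size_t seglen = iov[i].iov_len;

        if (offset > 0) {                /* skip bytes sent earlier */
            if (offset >= seglen) {
                offset -= seglen;
                continue;
            }
            seglen -= offset;
            from += offset;
            offset = 0;
        }
        printf("send %zu bytes: %.*s\n", seglen, (int)seglen, from);
    }
}

int main(void)
{
    struct iovec iov[2] = {
        { .iov_base = "hello ", .iov_len = 6 },
        { .iov_base = "world",  .iov_len = 5 },
    };

    walk(iov, 2, 4);                     /* pretend 4 bytes already went out */
    return 0;
}
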
@@ -1194,7 +1250,7 @@ out: | |||
1194 | if (copied && likely(!tp->repair)) | 1250 | if (copied && likely(!tp->repair)) |
1195 | tcp_push(sk, flags, mss_now, tp->nonagle); | 1251 | tcp_push(sk, flags, mss_now, tp->nonagle); |
1196 | release_sock(sk); | 1252 | release_sock(sk); |
1197 | return copied; | 1253 | return copied + copied_syn; |
1198 | 1254 | ||
1199 | do_fault: | 1255 | do_fault: |
1200 | if (!skb->len) { | 1256 | if (!skb->len) { |
@@ -1207,7 +1263,7 @@ do_fault: | |||
1207 | } | 1263 | } |
1208 | 1264 | ||
1209 | do_error: | 1265 | do_error: |
1210 | if (copied) | 1266 | if (copied + copied_syn) |
1211 | goto out; | 1267 | goto out; |
1212 | out_err: | 1268 | out_err: |
1213 | err = sk_stream_error(sk, flags, err); | 1269 | err = sk_stream_error(sk, flags, err); |
@@ -3310,8 +3366,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key); | |||
3310 | 3366 | ||
3311 | #endif | 3367 | #endif |
3312 | 3368 | ||
3313 | /** | 3369 | /* Each Responder maintains up to two secret values concurrently for |
3314 | * Each Responder maintains up to two secret values concurrently for | ||
3315 | * efficient secret rollover. Each secret value has 4 states: | 3370 | * efficient secret rollover. Each secret value has 4 states: |
3316 | * | 3371 | * |
3317 | * Generating. (tcp_secret_generating != tcp_secret_primary) | 3372 | * Generating. (tcp_secret_generating != tcp_secret_primary) |
@@ -3563,6 +3618,8 @@ void __init tcp_init(void) | |||
3563 | pr_info("Hash tables configured (established %u bind %u)\n", | 3618 | pr_info("Hash tables configured (established %u bind %u)\n", |
3564 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); | 3619 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); |
3565 | 3620 | ||
3621 | tcp_metrics_init(); | ||
3622 | |||
3566 | tcp_register_congestion_control(&tcp_reno); | 3623 | tcp_register_congestion_control(&tcp_reno); |
3567 | 3624 | ||
3568 | memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); | 3625 | memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); |
@@ -3573,4 +3630,5 @@ void __init tcp_init(void) | |||
3573 | tcp_secret_primary = &tcp_secret_one; | 3630 | tcp_secret_primary = &tcp_secret_one; |
3574 | tcp_secret_retiring = &tcp_secret_two; | 3631 | tcp_secret_retiring = &tcp_secret_two; |
3575 | tcp_secret_secondary = &tcp_secret_two; | 3632 | tcp_secret_secondary = &tcp_secret_two; |
3633 | tcp_tasklet_init(); | ||
3576 | } | 3634 | } |
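
Taken together, the tcp.c hunks add the client entry point for TCP Fast Open: passing MSG_FASTOPEN to sendmsg() on a not-yet-connected socket routes through tcp_sendmsg_fastopen(), which stashes the message and drives __inet_stream_connect() itself, and the syscall's return value includes the bytes queued with the SYN (copied + copied_syn). From userspace this is typically exercised with sendto(); a hedged sketch follows — MSG_FASTOPEN must be provided by the installed kernel headers, and the fallback define below is an assumption for illustration:

/* Client-side TCP Fast Open sketch: send the first payload together
 * with the SYN by using MSG_FASTOPEN on an unconnected socket. */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000          /* assumed value if headers lack it */
#endif

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in dst = {
        .sin_family = AF_INET,
        .sin_port   = htons(80),
    };
    const char req[] = "GET / HTTP/1.0\r\n\r\n";
    ssize_t n;

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

    /* No connect(): the data rides on the SYN when a cookie is available,
     * otherwise the kernel falls back to a normal handshake. */
    n = sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
               (struct sockaddr *)&dst, sizeof(dst));
    if (n < 0)
        perror("sendto(MSG_FASTOPEN)");
    else
        printf("queued %zd bytes with the SYN\n", n);

    close(fd);
    return 0;
}
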
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 04dbd7ae7c62..4d4db16e336e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); | |||
307 | void tcp_slow_start(struct tcp_sock *tp) | 307 | void tcp_slow_start(struct tcp_sock *tp) |
308 | { | 308 | { |
309 | int cnt; /* increase in packets */ | 309 | int cnt; /* increase in packets */ |
310 | unsigned int delta = 0; | ||
310 | 311 | ||
311 | /* RFC3465: ABC Slow start | 312 | /* RFC3465: ABC Slow start |
312 | * Increase only after a full MSS of bytes is acked | 313 | * Increase only after a full MSS of bytes is acked |
@@ -333,9 +334,9 @@ void tcp_slow_start(struct tcp_sock *tp) | |||
333 | tp->snd_cwnd_cnt += cnt; | 334 | tp->snd_cwnd_cnt += cnt; |
334 | while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | 335 | while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { |
335 | tp->snd_cwnd_cnt -= tp->snd_cwnd; | 336 | tp->snd_cwnd_cnt -= tp->snd_cwnd; |
336 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 337 | delta++; |
337 | tp->snd_cwnd++; | ||
338 | } | 338 | } |
339 | tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp); | ||
339 | } | 340 | } |
340 | EXPORT_SYMBOL_GPL(tcp_slow_start); | 341 | EXPORT_SYMBOL_GPL(tcp_slow_start); |
341 | 342 | ||
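
In the tcp_slow_start() change above, snd_cwnd is no longer incremented inside the loop; the increments are counted in delta while snd_cwnd_cnt is drained against a constant snd_cwnd, and the increase is then applied once, bounded by snd_cwnd_clamp via min(). A tiny standalone check of that arithmetic:

/* Arithmetic of the tcp_slow_start() change: accumulate the
 * increments, then clamp once.  Values are made up for illustration. */
#include <stdio.h>

static unsigned int min_u32(unsigned int a, unsigned int b)
{
    return a < b ? a : b;
}

int main(void)
{
    unsigned int snd_cwnd = 10, snd_cwnd_clamp = 12;
    unsigned int snd_cwnd_cnt = 35;      /* e.g. ABC credited several MSS at once */
    unsigned int delta = 0;

    while (snd_cwnd_cnt >= snd_cwnd) {
        snd_cwnd_cnt -= snd_cwnd;
        delta++;
    }
    snd_cwnd = min_u32(snd_cwnd + delta, snd_cwnd_clamp);

    printf("cwnd=%u (delta=%u, clamp=%u)\n", snd_cwnd, delta, snd_cwnd_clamp);
    return 0;
}

With these inputs delta ends up 3, so the unclamped result would be 13 and the min() caps it at the clamp of 12.
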
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c new file mode 100644 index 000000000000..a7f729c409d7 --- /dev/null +++ b/net/ipv4/tcp_fastopen.c | |||
@@ -0,0 +1,11 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/kernel.h> | ||
3 | |||
4 | int sysctl_tcp_fastopen; | ||
5 | |||
6 | static int __init tcp_fastopen_init(void) | ||
7 | { | ||
8 | return 0; | ||
9 | } | ||
10 | |||
11 | late_initcall(tcp_fastopen_init); | ||
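
The new file only introduces the sysctl_tcp_fastopen integer for now; tcp_sendmsg_fastopen() above treats it as a bit mask and requires the client bit (TFO_CLIENT_ENABLE) before attempting Fast Open. A minimal sketch of that gate — the bit value used below is an assumption, only the name appears in this diff:

/* sysctl_tcp_fastopen is consulted as a bit mask; the client path
 * checks TFO_CLIENT_ENABLE.  Standalone illustration, not kernel code. */
#include <stdio.h>

#define TFO_CLIENT_ENABLE 1              /* assumed bit for the client side */

static int sysctl_tcp_fastopen;          /* mirrors the new module variable */

int main(void)
{
    sysctl_tcp_fastopen = 0;
    printf("client TFO allowed: %s\n",
           (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ? "yes" : "no");

    sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
    printf("client TFO allowed: %s\n",
           (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ? "yes" : "no");
    return 0;
}
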
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8b..3e07a64ca44e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -88,12 +88,14 @@ int sysctl_tcp_app_win __read_mostly = 31; | |||
88 | int sysctl_tcp_adv_win_scale __read_mostly = 1; | 88 | int sysctl_tcp_adv_win_scale __read_mostly = 1; |
89 | EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); | 89 | EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); |
90 | 90 | ||
91 | /* rfc5961 challenge ack rate limiting */ | ||
92 | int sysctl_tcp_challenge_ack_limit = 100; | ||
93 | |||
91 | int sysctl_tcp_stdurg __read_mostly; | 94 | int sysctl_tcp_stdurg __read_mostly; |
92 | int sysctl_tcp_rfc1337 __read_mostly; | 95 | int sysctl_tcp_rfc1337 __read_mostly; |
93 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; | 96 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
94 | int sysctl_tcp_frto __read_mostly = 2; | 97 | int sysctl_tcp_frto __read_mostly = 2; |
95 | int sysctl_tcp_frto_response __read_mostly; | 98 | int sysctl_tcp_frto_response __read_mostly; |
96 | int sysctl_tcp_nometrics_save __read_mostly; | ||
97 | 99 | ||
98 | int sysctl_tcp_thin_dupack __read_mostly; | 100 | int sysctl_tcp_thin_dupack __read_mostly; |
99 | 101 | ||
@@ -701,7 +703,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
701 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 703 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
702 | * routine referred to above. | 704 | * routine referred to above. |
703 | */ | 705 | */ |
704 | static inline void tcp_set_rto(struct sock *sk) | 706 | void tcp_set_rto(struct sock *sk) |
705 | { | 707 | { |
706 | const struct tcp_sock *tp = tcp_sk(sk); | 708 | const struct tcp_sock *tp = tcp_sk(sk); |
707 | /* Old crap is replaced with new one. 8) | 709 | /* Old crap is replaced with new one. 8) |
@@ -728,109 +730,6 @@ static inline void tcp_set_rto(struct sock *sk) | |||
728 | tcp_bound_rto(sk); | 730 | tcp_bound_rto(sk); |
729 | } | 731 | } |
730 | 732 | ||
731 | /* Save metrics learned by this TCP session. | ||
732 | This function is called only, when TCP finishes successfully | ||
733 | i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. | ||
734 | */ | ||
735 | void tcp_update_metrics(struct sock *sk) | ||
736 | { | ||
737 | struct tcp_sock *tp = tcp_sk(sk); | ||
738 | struct dst_entry *dst = __sk_dst_get(sk); | ||
739 | |||
740 | if (sysctl_tcp_nometrics_save) | ||
741 | return; | ||
742 | |||
743 | dst_confirm(dst); | ||
744 | |||
745 | if (dst && (dst->flags & DST_HOST)) { | ||
746 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
747 | int m; | ||
748 | unsigned long rtt; | ||
749 | |||
750 | if (icsk->icsk_backoff || !tp->srtt) { | ||
751 | /* This session failed to estimate rtt. Why? | ||
752 | * Probably, no packets returned in time. | ||
753 | * Reset our results. | ||
754 | */ | ||
755 | if (!(dst_metric_locked(dst, RTAX_RTT))) | ||
756 | dst_metric_set(dst, RTAX_RTT, 0); | ||
757 | return; | ||
758 | } | ||
759 | |||
760 | rtt = dst_metric_rtt(dst, RTAX_RTT); | ||
761 | m = rtt - tp->srtt; | ||
762 | |||
763 | /* If newly calculated rtt larger than stored one, | ||
764 | * store new one. Otherwise, use EWMA. Remember, | ||
765 | * rtt overestimation is always better than underestimation. | ||
766 | */ | ||
767 | if (!(dst_metric_locked(dst, RTAX_RTT))) { | ||
768 | if (m <= 0) | ||
769 | set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); | ||
770 | else | ||
771 | set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); | ||
772 | } | ||
773 | |||
774 | if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { | ||
775 | unsigned long var; | ||
776 | if (m < 0) | ||
777 | m = -m; | ||
778 | |||
779 | /* Scale deviation to rttvar fixed point */ | ||
780 | m >>= 1; | ||
781 | if (m < tp->mdev) | ||
782 | m = tp->mdev; | ||
783 | |||
784 | var = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
785 | if (m >= var) | ||
786 | var = m; | ||
787 | else | ||
788 | var -= (var - m) >> 2; | ||
789 | |||
790 | set_dst_metric_rtt(dst, RTAX_RTTVAR, var); | ||
791 | } | ||
792 | |||
793 | if (tcp_in_initial_slowstart(tp)) { | ||
794 | /* Slow start still did not finish. */ | ||
795 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
796 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
797 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) | ||
798 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); | ||
799 | if (!dst_metric_locked(dst, RTAX_CWND) && | ||
800 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | ||
801 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); | ||
802 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | ||
803 | icsk->icsk_ca_state == TCP_CA_Open) { | ||
804 | /* Cong. avoidance phase, cwnd is reliable. */ | ||
805 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | ||
806 | dst_metric_set(dst, RTAX_SSTHRESH, | ||
807 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | ||
808 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
809 | dst_metric_set(dst, RTAX_CWND, | ||
810 | (dst_metric(dst, RTAX_CWND) + | ||
811 | tp->snd_cwnd) >> 1); | ||
812 | } else { | ||
813 | /* Else slow start did not finish, cwnd is non-sense, | ||
814 | ssthresh may be also invalid. | ||
815 | */ | ||
816 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
817 | dst_metric_set(dst, RTAX_CWND, | ||
818 | (dst_metric(dst, RTAX_CWND) + | ||
819 | tp->snd_ssthresh) >> 1); | ||
820 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
821 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
822 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) | ||
823 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); | ||
824 | } | ||
825 | |||
826 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { | ||
827 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && | ||
828 | tp->reordering != sysctl_tcp_reordering) | ||
829 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); | ||
830 | } | ||
831 | } | ||
832 | } | ||
833 | |||
834 | __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) | 733 | __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) |
835 | { | 734 | { |
836 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); | 735 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); |
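
tcp_update_metrics() above (and tcp_init_metrics() further down) leave tcp_input.c along with sysctl_tcp_nometrics_save; since tcp_init() now calls tcp_metrics_init(), the same bookkeeping presumably moves into the new tcp_metrics code rather than living on dst metrics. The RTT smoothing the removed lines performed is a simple EWMA: store the session srtt outright when it exceeds the cached value, otherwise move an eighth of the way toward it, and move the deviation a quarter of the way while never letting it drop below mdev. A standalone restatement of that arithmetic, not kernel code:

/* Cached-RTT update as performed by the removed tcp_update_metrics(). */
#include <stdio.h>

static void update_cached_rtt(unsigned long *rtt, unsigned long *rttvar,
                              unsigned long srtt, unsigned long mdev)
{
    long m = (long)*rtt - (long)srtt;

    /* RTT: overestimating is safer than underestimating. */
    if (m <= 0)
        *rtt = srtt;
    else
        *rtt = *rtt - (m >> 3);

    /* Deviation: scale to rttvar fixed point, never below mdev. */
    if (m < 0)
        m = -m;
    m >>= 1;
    if ((unsigned long)m < mdev)
        m = mdev;
    if ((unsigned long)m >= *rttvar)
        *rttvar = m;
    else
        *rttvar -= (*rttvar - m) >> 2;
}

int main(void)
{
    unsigned long rtt = 200, rttvar = 50;

    update_cached_rtt(&rtt, &rttvar, 120, 10);   /* faster sample */
    printf("rtt=%lu rttvar=%lu\n", rtt, rttvar);
    update_cached_rtt(&rtt, &rttvar, 300, 40);   /* slower sample */
    printf("rtt=%lu rttvar=%lu\n", rtt, rttvar);
    return 0;
}
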
@@ -867,7 +766,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | |||
867 | * Packet counting of FACK is based on in-order assumptions, therefore TCP | 766 | * Packet counting of FACK is based on in-order assumptions, therefore TCP |
868 | * disables it when reordering is detected | 767 | * disables it when reordering is detected |
869 | */ | 768 | */ |
870 | static void tcp_disable_fack(struct tcp_sock *tp) | 769 | void tcp_disable_fack(struct tcp_sock *tp) |
871 | { | 770 | { |
872 | /* RFC3517 uses different metric in lost marker => reset on change */ | 771 | /* RFC3517 uses different metric in lost marker => reset on change */ |
873 | if (tcp_is_fack(tp)) | 772 | if (tcp_is_fack(tp)) |
@@ -881,86 +780,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp) | |||
881 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; | 780 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; |
882 | } | 781 | } |
883 | 782 | ||
884 | /* Initialize metrics on socket. */ | ||
885 | |||
886 | static void tcp_init_metrics(struct sock *sk) | ||
887 | { | ||
888 | struct tcp_sock *tp = tcp_sk(sk); | ||
889 | struct dst_entry *dst = __sk_dst_get(sk); | ||
890 | |||
891 | if (dst == NULL) | ||
892 | goto reset; | ||
893 | |||
894 | dst_confirm(dst); | ||
895 | |||
896 | if (dst_metric_locked(dst, RTAX_CWND)) | ||
897 | tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); | ||
898 | if (dst_metric(dst, RTAX_SSTHRESH)) { | ||
899 | tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); | ||
900 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | ||
901 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | ||
902 | } else { | ||
903 | /* ssthresh may have been reduced unnecessarily during. | ||
904 | * 3WHS. Restore it back to its initial default. | ||
905 | */ | ||
906 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
907 | } | ||
908 | if (dst_metric(dst, RTAX_REORDERING) && | ||
909 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { | ||
910 | tcp_disable_fack(tp); | ||
911 | tcp_disable_early_retrans(tp); | ||
912 | tp->reordering = dst_metric(dst, RTAX_REORDERING); | ||
913 | } | ||
914 | |||
915 | if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) | ||
916 | goto reset; | ||
917 | |||
918 | /* Initial rtt is determined from SYN,SYN-ACK. | ||
919 | * The segment is small and rtt may appear much | ||
920 | * less than real one. Use per-dst memory | ||
921 | * to make it more realistic. | ||
922 | * | ||
923 | * A bit of theory. RTT is time passed after "normal" sized packet | ||
924 | * is sent until it is ACKed. In normal circumstances sending small | ||
925 | * packets force peer to delay ACKs and calculation is correct too. | ||
926 | * The algorithm is adaptive and, provided we follow specs, it | ||
927 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | ||
928 | * tricks sort of "quick acks" for time long enough to decrease RTT | ||
929 | * to low value, and then abruptly stops to do it and starts to delay | ||
930 | * ACKs, wait for troubles. | ||
931 | */ | ||
932 | if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { | ||
933 | tp->srtt = dst_metric_rtt(dst, RTAX_RTT); | ||
934 | tp->rtt_seq = tp->snd_nxt; | ||
935 | } | ||
936 | if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { | ||
937 | tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
938 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
939 | } | ||
940 | tcp_set_rto(sk); | ||
941 | reset: | ||
942 | if (tp->srtt == 0) { | ||
943 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | ||
944 | * 3WHS. This is most likely due to retransmission, | ||
945 | * including spurious one. Reset the RTO back to 3secs | ||
946 | * from the more aggressive 1sec to avoid more spurious | ||
947 | * retransmission. | ||
948 | */ | ||
949 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | ||
950 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | ||
951 | } | ||
952 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | ||
953 | * retransmitted. In light of RFC6298 more aggressive 1sec | ||
954 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | ||
955 | * retransmission has occurred. | ||
956 | */ | ||
957 | if (tp->total_retrans > 1) | ||
958 | tp->snd_cwnd = 1; | ||
959 | else | ||
960 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
961 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
962 | } | ||
963 | |||
964 | static void tcp_update_reordering(struct sock *sk, const int metric, | 783 | static void tcp_update_reordering(struct sock *sk, const int metric, |
965 | const int ts) | 784 | const int ts) |
966 | { | 785 | { |
@@ -2702,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) | |||
2702 | /* Nothing was retransmitted or returned timestamp is less | 2521 | /* Nothing was retransmitted or returned timestamp is less |
2703 | * than timestamp of the first retransmission. | 2522 | * than timestamp of the first retransmission. |
2704 | */ | 2523 | */ |
2705 | static inline int tcp_packet_delayed(const struct tcp_sock *tp) | 2524 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) |
2706 | { | 2525 | { |
2707 | return !tp->retrans_stamp || | 2526 | return !tp->retrans_stamp || |
2708 | (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 2527 | (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
@@ -2763,7 +2582,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) | |||
2763 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2582 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2764 | } | 2583 | } |
2765 | 2584 | ||
2766 | static inline int tcp_may_undo(const struct tcp_sock *tp) | 2585 | static inline bool tcp_may_undo(const struct tcp_sock *tp) |
2767 | { | 2586 | { |
2768 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); | 2587 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
2769 | } | 2588 | } |
@@ -3552,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk) | |||
3552 | } | 3371 | } |
3553 | } | 3372 | } |
3554 | 3373 | ||
3555 | static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) | 3374 | static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) |
3556 | { | 3375 | { |
3557 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || | 3376 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || |
3558 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; | 3377 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; |
3559 | } | 3378 | } |
3560 | 3379 | ||
3561 | static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) | 3380 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
3562 | { | 3381 | { |
3563 | const struct tcp_sock *tp = tcp_sk(sk); | 3382 | const struct tcp_sock *tp = tcp_sk(sk); |
3564 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && | 3383 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && |
@@ -3568,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) | |||
3568 | /* Check that window update is acceptable. | 3387 | /* Check that window update is acceptable. |
3569 | * The function assumes that snd_una<=ack<=snd_next. | 3388 | * The function assumes that snd_una<=ack<=snd_next. |
3570 | */ | 3389 | */ |
3571 | static inline int tcp_may_update_window(const struct tcp_sock *tp, | 3390 | static inline bool tcp_may_update_window(const struct tcp_sock *tp, |
3572 | const u32 ack, const u32 ack_seq, | 3391 | const u32 ack, const u32 ack_seq, |
3573 | const u32 nwin) | 3392 | const u32 nwin) |
3574 | { | 3393 | { |
@@ -3869,9 +3688,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3869 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3688 | tcp_cong_avoid(sk, ack, prior_in_flight); |
3870 | } | 3689 | } |
3871 | 3690 | ||
3872 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) | 3691 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { |
3873 | dst_confirm(__sk_dst_get(sk)); | 3692 | struct dst_entry *dst = __sk_dst_get(sk); |
3874 | 3693 | if (dst) | |
3694 | dst_confirm(dst); | ||
3695 | } | ||
3875 | return 1; | 3696 | return 1; |
3876 | 3697 | ||
3877 | no_queue: | 3698 | no_queue: |
@@ -3911,7 +3732,8 @@ old_ack: | |||
3911 | * the fast version below fails. | 3732 | * the fast version below fails. |
3912 | */ | 3733 | */ |
3913 | void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, | 3734 | void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, |
3914 | const u8 **hvpp, int estab) | 3735 | const u8 **hvpp, int estab, |
3736 | struct tcp_fastopen_cookie *foc) | ||
3915 | { | 3737 | { |
3916 | const unsigned char *ptr; | 3738 | const unsigned char *ptr; |
3917 | const struct tcphdr *th = tcp_hdr(skb); | 3739 | const struct tcphdr *th = tcp_hdr(skb); |
@@ -4018,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
4018 | break; | 3840 | break; |
4019 | } | 3841 | } |
4020 | break; | 3842 | break; |
4021 | } | ||
4022 | 3843 | ||
3844 | case TCPOPT_EXP: | ||
3845 | /* Fast Open option shares code 254 using a | ||
3846 | * 16-bit magic number. It's valid only in | ||
3847 | * SYN or SYN-ACK with an even size. | ||
3848 | */ | ||
3849 | if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || | ||
3850 | get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || | ||
3851 | foc == NULL || !th->syn || (opsize & 1)) | ||
3852 | break; | ||
3853 | foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; | ||
3854 | if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && | ||
3855 | foc->len <= TCP_FASTOPEN_COOKIE_MAX) | ||
3856 | memcpy(foc->val, ptr + 2, foc->len); | ||
3857 | else if (foc->len != 0) | ||
3858 | foc->len = -1; | ||
3859 | break; | ||
3860 | |||
3861 | } | ||
4023 | ptr += opsize-2; | 3862 | ptr += opsize-2; |
4024 | length -= opsize; | 3863 | length -= opsize; |
4025 | } | 3864 | } |
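The TCPOPT_EXP case added above parses the experimental Fast Open option: kind 254, a length byte, a 16-bit magic value (TCPOPT_FASTOPEN_MAGIC, 0xF989), then an optional cookie of 4 to 16 bytes, accepted only on segments with SYN set and an even total option length. A minimal standalone sketch of that wire-format check follows; the struct and function names are illustrative rather than kernel API, and the constants mirror the values this series uses.

#include <stdint.h>
#include <string.h>

#define TCPOPT_EXP                 254
#define TCPOPT_FASTOPEN_MAGIC      0xF989
#define TCPOLEN_EXP_FASTOPEN_BASE  4      /* kind + length + 16-bit magic */
#define TCP_FASTOPEN_COOKIE_MIN    4
#define TCP_FASTOPEN_COOKIE_MAX    16

struct fastopen_cookie {
        int8_t  len;                      /* -1 means "invalid / not present" */
        uint8_t val[TCP_FASTOPEN_COOKIE_MAX];
};

/* Parse one option starting at opt[0] (the kind byte); opsize is the
 * option's total length taken from opt[1].  Returns 0 when a valid Fast
 * Open cookie (or a zero-length cookie request) was found, -1 otherwise.
 */
static int parse_exp_fastopen(const uint8_t *opt, int opsize, int is_syn,
                              struct fastopen_cookie *foc)
{
        uint16_t magic;

        foc->len = -1;
        if (opt[0] != TCPOPT_EXP || opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
            !is_syn || (opsize & 1))
                return -1;
        magic = (uint16_t)((opt[2] << 8) | opt[3]);
        if (magic != TCPOPT_FASTOPEN_MAGIC)
                return -1;

        foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
        if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
            foc->len <= TCP_FASTOPEN_COOKIE_MAX)
                memcpy(foc->val, opt + TCPOLEN_EXP_FASTOPEN_BASE, foc->len);
        else if (foc->len != 0)
                foc->len = -1;
        return foc->len >= 0 ? 0 : -1;
}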
@@ -4061,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, | |||
4061 | if (tcp_parse_aligned_timestamp(tp, th)) | 3900 | if (tcp_parse_aligned_timestamp(tp, th)) |
4062 | return true; | 3901 | return true; |
4063 | } | 3902 | } |
4064 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); | 3903 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); |
4065 | return true; | 3904 | return true; |
4066 | } | 3905 | } |
4067 | 3906 | ||
@@ -4167,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) | |||
4167 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); | 4006 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); |
4168 | } | 4007 | } |
4169 | 4008 | ||
4170 | static inline int tcp_paws_discard(const struct sock *sk, | 4009 | static inline bool tcp_paws_discard(const struct sock *sk, |
4171 | const struct sk_buff *skb) | 4010 | const struct sk_buff *skb) |
4172 | { | 4011 | { |
4173 | const struct tcp_sock *tp = tcp_sk(sk); | 4012 | const struct tcp_sock *tp = tcp_sk(sk); |
@@ -4189,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk, | |||
4189 | * (borrowed from freebsd) | 4028 | * (borrowed from freebsd) |
4190 | */ | 4029 | */ |
4191 | 4030 | ||
4192 | static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) | 4031 | static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) |
4193 | { | 4032 | { |
4194 | return !before(end_seq, tp->rcv_wup) && | 4033 | return !before(end_seq, tp->rcv_wup) && |
4195 | !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); | 4034 | !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); |
@@ -4579,8 +4418,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | |||
4579 | 4418 | ||
4580 | TCP_ECN_check_ce(tp, skb); | 4419 | TCP_ECN_check_ce(tp, skb); |
4581 | 4420 | ||
4582 | if (tcp_try_rmem_schedule(sk, skb->truesize)) { | 4421 | if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { |
4583 | /* TODO: should increment a counter */ | 4422 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); |
4584 | __kfree_skb(skb); | 4423 | __kfree_skb(skb); |
4585 | return; | 4424 | return; |
4586 | } | 4425 | } |
@@ -4589,6 +4428,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | |||
4589 | tp->pred_flags = 0; | 4428 | tp->pred_flags = 0; |
4590 | inet_csk_schedule_ack(sk); | 4429 | inet_csk_schedule_ack(sk); |
4591 | 4430 | ||
4431 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); | ||
4592 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", | 4432 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", |
4593 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | 4433 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
4594 | 4434 | ||
@@ -4642,6 +4482,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | |||
4642 | if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { | 4482 | if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { |
4643 | if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | 4483 | if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { |
4644 | /* All the bits are present. Drop. */ | 4484 | /* All the bits are present. Drop. */ |
4485 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); | ||
4645 | __kfree_skb(skb); | 4486 | __kfree_skb(skb); |
4646 | skb = NULL; | 4487 | skb = NULL; |
4647 | tcp_dsack_set(sk, seq, end_seq); | 4488 | tcp_dsack_set(sk, seq, end_seq); |
@@ -4680,6 +4521,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | |||
4680 | __skb_unlink(skb1, &tp->out_of_order_queue); | 4521 | __skb_unlink(skb1, &tp->out_of_order_queue); |
4681 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | 4522 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, |
4682 | TCP_SKB_CB(skb1)->end_seq); | 4523 | TCP_SKB_CB(skb1)->end_seq); |
4524 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); | ||
4683 | __kfree_skb(skb1); | 4525 | __kfree_skb(skb1); |
4684 | } | 4526 | } |
4685 | 4527 | ||
@@ -5372,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, | |||
5372 | return result; | 5214 | return result; |
5373 | } | 5215 | } |
5374 | 5216 | ||
5375 | static inline int tcp_checksum_complete_user(struct sock *sk, | 5217 | static inline bool tcp_checksum_complete_user(struct sock *sk, |
5376 | struct sk_buff *skb) | 5218 | struct sk_buff *skb) |
5377 | { | 5219 | { |
5378 | return !skb_csum_unnecessary(skb) && | 5220 | return !skb_csum_unnecessary(skb) && |
@@ -5426,11 +5268,28 @@ out: | |||
5426 | } | 5268 | } |
5427 | #endif /* CONFIG_NET_DMA */ | 5269 | #endif /* CONFIG_NET_DMA */ |
5428 | 5270 | ||
5271 | static void tcp_send_challenge_ack(struct sock *sk) | ||
5272 | { | ||
5273 | /* unprotected vars, we don't care about overwrites */ | ||
5274 | static u32 challenge_timestamp; | ||
5275 | static unsigned int challenge_count; | ||
5276 | u32 now = jiffies / HZ; | ||
5277 | |||
5278 | if (now != challenge_timestamp) { | ||
5279 | challenge_timestamp = now; | ||
5280 | challenge_count = 0; | ||
5281 | } | ||
5282 | if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { | ||
5283 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); | ||
5284 | tcp_send_ack(sk); | ||
5285 | } | ||
5286 | } | ||
5287 | |||
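tcp_send_challenge_ack() implements the RFC 5961 rate limit: challenge ACKs are counted in one-second buckets keyed by jiffies/HZ and capped at sysctl_tcp_challenge_ack_limit per bucket; the two static variables are intentionally unlocked, since a lost update only makes the limit slightly loose. A rough userspace sketch of the same bucketing, assuming an illustrative limit of 100 (the default this series introduces):

#include <stdbool.h>
#include <time.h>

static unsigned int challenge_ack_limit = 100;   /* illustrative default */

/* Return true if a challenge ACK may be sent in the current one-second
 * bucket.  The counter resets whenever the wall-clock second changes,
 * so bursts are capped per second, mirroring the jiffies/HZ bucketing.
 */
static bool challenge_ack_allowed(void)
{
        static time_t bucket;            /* current one-second bucket */
        static unsigned int count;       /* ACKs sent in this bucket  */
        time_t now = time(NULL);

        if (now != bucket) {
                bucket = now;
                count = 0;
        }
        return ++count <= challenge_ack_limit;
}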
5429 | /* Does PAWS and seqno based validation of an incoming segment, flags will | 5288 | /* Does PAWS and seqno based validation of an incoming segment, flags will |
5430 | * play significant role here. | 5289 | * play significant role here. |
5431 | */ | 5290 | */ |
5432 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | 5291 | static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, |
5433 | const struct tcphdr *th, int syn_inerr) | 5292 | const struct tcphdr *th, int syn_inerr) |
5434 | { | 5293 | { |
5435 | const u8 *hash_location; | 5294 | const u8 *hash_location; |
5436 | struct tcp_sock *tp = tcp_sk(sk); | 5295 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -5455,14 +5314,26 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
5455 | * an acknowledgment should be sent in reply (unless the RST | 5314 | * an acknowledgment should be sent in reply (unless the RST |
5456 | * bit is set, if so drop the segment and return)". | 5315 | * bit is set, if so drop the segment and return)". |
5457 | */ | 5316 | */ |
5458 | if (!th->rst) | 5317 | if (!th->rst) { |
5318 | if (th->syn) | ||
5319 | goto syn_challenge; | ||
5459 | tcp_send_dupack(sk, skb); | 5320 | tcp_send_dupack(sk, skb); |
5321 | } | ||
5460 | goto discard; | 5322 | goto discard; |
5461 | } | 5323 | } |
5462 | 5324 | ||
5463 | /* Step 2: check RST bit */ | 5325 | /* Step 2: check RST bit */ |
5464 | if (th->rst) { | 5326 | if (th->rst) { |
5465 | tcp_reset(sk); | 5327 | /* RFC 5961 3.2 : |
5328 | * If sequence number exactly matches RCV.NXT, then | ||
5329 | * RESET the connection | ||
5330 | * else | ||
5331 | * Send a challenge ACK | ||
5332 | */ | ||
5333 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) | ||
5334 | tcp_reset(sk); | ||
5335 | else | ||
5336 | tcp_send_challenge_ack(sk); | ||
5466 | goto discard; | 5337 | goto discard; |
5467 | } | 5338 | } |
5468 | 5339 | ||
@@ -5473,20 +5344,23 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
5473 | 5344 | ||
5474 | /* step 3: check security and precedence [ignored] */ | 5345 | /* step 3: check security and precedence [ignored] */ |
5475 | 5346 | ||
5476 | /* step 4: Check for a SYN in window. */ | 5347 | /* step 4: Check for a SYN |
5477 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | 5348 | * RFC 5961 4.2 : Send a challenge ack |
5349 | */ | ||
5350 | if (th->syn) { | ||
5351 | syn_challenge: | ||
5478 | if (syn_inerr) | 5352 | if (syn_inerr) |
5479 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | 5353 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); |
5480 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); | 5354 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); |
5481 | tcp_reset(sk); | 5355 | tcp_send_challenge_ack(sk); |
5482 | return -1; | 5356 | goto discard; |
5483 | } | 5357 | } |
5484 | 5358 | ||
5485 | return 1; | 5359 | return true; |
5486 | 5360 | ||
5487 | discard: | 5361 | discard: |
5488 | __kfree_skb(skb); | 5362 | __kfree_skb(skb); |
5489 | return 0; | 5363 | return false; |
5490 | } | 5364 | } |
5491 | 5365 | ||
5492 | /* | 5366 | /* |
@@ -5516,7 +5390,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5516 | const struct tcphdr *th, unsigned int len) | 5390 | const struct tcphdr *th, unsigned int len) |
5517 | { | 5391 | { |
5518 | struct tcp_sock *tp = tcp_sk(sk); | 5392 | struct tcp_sock *tp = tcp_sk(sk); |
5519 | int res; | ||
5520 | 5393 | ||
5521 | /* | 5394 | /* |
5522 | * Header prediction. | 5395 | * Header prediction. |
@@ -5693,9 +5566,8 @@ slow_path: | |||
5693 | * Standard slow path. | 5566 | * Standard slow path. |
5694 | */ | 5567 | */ |
5695 | 5568 | ||
5696 | res = tcp_validate_incoming(sk, skb, th, 1); | 5569 | if (!tcp_validate_incoming(sk, skb, th, 1)) |
5697 | if (res <= 0) | 5570 | return 0; |
5698 | return -res; | ||
5699 | 5571 | ||
5700 | step5: | 5572 | step5: |
5701 | if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) | 5573 | if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) |
@@ -5729,8 +5601,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) | |||
5729 | 5601 | ||
5730 | tcp_set_state(sk, TCP_ESTABLISHED); | 5602 | tcp_set_state(sk, TCP_ESTABLISHED); |
5731 | 5603 | ||
5732 | if (skb != NULL) | 5604 | if (skb != NULL) { |
5605 | sk->sk_rx_dst = dst_clone(skb_dst(skb)); | ||
5733 | security_inet_conn_established(sk, skb); | 5606 | security_inet_conn_established(sk, skb); |
5607 | } | ||
5734 | 5608 | ||
5735 | /* Make sure socket is routed, for correct metrics. */ | 5609 | /* Make sure socket is routed, for correct metrics. */ |
5736 | icsk->icsk_af_ops->rebuild_header(sk); | 5610 | icsk->icsk_af_ops->rebuild_header(sk); |
@@ -5760,6 +5634,45 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) | |||
5760 | } | 5634 | } |
5761 | } | 5635 | } |
5762 | 5636 | ||
5637 | static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | ||
5638 | struct tcp_fastopen_cookie *cookie) | ||
5639 | { | ||
5640 | struct tcp_sock *tp = tcp_sk(sk); | ||
5641 | struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; | ||
5642 | u16 mss = tp->rx_opt.mss_clamp; | ||
5643 | bool syn_drop; | ||
5644 | |||
5645 | if (mss == tp->rx_opt.user_mss) { | ||
5646 | struct tcp_options_received opt; | ||
5647 | const u8 *hash_location; | ||
5648 | |||
5649 | /* Get original SYNACK MSS value if user MSS sets mss_clamp */ | ||
5650 | tcp_clear_options(&opt); | ||
5651 | opt.user_mss = opt.mss_clamp = 0; | ||
5652 | tcp_parse_options(synack, &opt, &hash_location, 0, NULL); | ||
5653 | mss = opt.mss_clamp; | ||
5654 | } | ||
5655 | |||
5656 | if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ | ||
5657 | cookie->len = -1; | ||
5658 | |||
5659 | /* The SYN-ACK neither carries a cookie nor acknowledges the data. Presumably | ||
5660 | * the remote receives only the retransmitted (regular) SYNs: either | ||
5661 | * the original SYN-data or the corresponding SYN-ACK is lost. | ||
5662 | */ | ||
5663 | syn_drop = (cookie->len <= 0 && data && | ||
5664 | inet_csk(sk)->icsk_retransmits); | ||
5665 | |||
5666 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); | ||
5667 | |||
5668 | if (data) { /* Retransmit unacked data in SYN */ | ||
5669 | tcp_retransmit_skb(sk, data); | ||
5670 | tcp_rearm_rto(sk); | ||
5671 | return true; | ||
5672 | } | ||
5673 | return false; | ||
5674 | } | ||
5675 | |||
5763 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | 5676 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
5764 | const struct tcphdr *th, unsigned int len) | 5677 | const struct tcphdr *th, unsigned int len) |
5765 | { | 5678 | { |
@@ -5767,9 +5680,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5767 | struct inet_connection_sock *icsk = inet_csk(sk); | 5680 | struct inet_connection_sock *icsk = inet_csk(sk); |
5768 | struct tcp_sock *tp = tcp_sk(sk); | 5681 | struct tcp_sock *tp = tcp_sk(sk); |
5769 | struct tcp_cookie_values *cvp = tp->cookie_values; | 5682 | struct tcp_cookie_values *cvp = tp->cookie_values; |
5683 | struct tcp_fastopen_cookie foc = { .len = -1 }; | ||
5770 | int saved_clamp = tp->rx_opt.mss_clamp; | 5684 | int saved_clamp = tp->rx_opt.mss_clamp; |
5771 | 5685 | ||
5772 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); | 5686 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); |
5773 | 5687 | ||
5774 | if (th->ack) { | 5688 | if (th->ack) { |
5775 | /* rfc793: | 5689 | /* rfc793: |
@@ -5779,11 +5693,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5779 | * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send | 5693 | * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send |
5780 | * a reset (unless the RST bit is set, if so drop | 5694 | * a reset (unless the RST bit is set, if so drop |
5781 | * the segment and return)" | 5695 | * the segment and return)" |
5782 | * | ||
5783 | * We do not send data with SYN, so that RFC-correct | ||
5784 | * test reduces to: | ||
5785 | */ | 5696 | */ |
5786 | if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) | 5697 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || |
5698 | after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) | ||
5787 | goto reset_and_undo; | 5699 | goto reset_and_undo; |
5788 | 5700 | ||
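The deleted shortcut required SEG.ACK to equal SND.NXT exactly, which no longer holds once the SYN can carry data (Fast Open); the new test restores the RFC 793 form SND.UNA < SEG.ACK <= SND.NXT. A small sketch of that acceptability check with wraparound-safe 32-bit sequence comparison (function names here are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Wraparound-safe "a is after b" for 32-bit TCP sequence numbers. */
static bool seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;
}

/* SYN-SENT acceptability test used above: accept the ACK only when
 * SND.UNA < SEG.ACK <= SND.NXT.  With Fast Open the SYN may carry data,
 * so SEG.ACK need not equal SND.NXT.
 */
static bool synsent_ack_acceptable(uint32_t ack_seq, uint32_t snd_una,
                                   uint32_t snd_nxt)
{
        return seq_after(ack_seq, snd_una) && !seq_after(ack_seq, snd_nxt);
}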
5789 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 5701 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
@@ -5895,6 +5807,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5895 | 5807 | ||
5896 | tcp_finish_connect(sk, skb); | 5808 | tcp_finish_connect(sk, skb); |
5897 | 5809 | ||
5810 | if ((tp->syn_fastopen || tp->syn_data) && | ||
5811 | tcp_rcv_fastopen_synack(sk, skb, &foc)) | ||
5812 | return -1; | ||
5813 | |||
5898 | if (sk->sk_write_pending || | 5814 | if (sk->sk_write_pending || |
5899 | icsk->icsk_accept_queue.rskq_defer_accept || | 5815 | icsk->icsk_accept_queue.rskq_defer_accept || |
5900 | icsk->icsk_ack.pingpong) { | 5816 | icsk->icsk_ack.pingpong) { |
@@ -6013,7 +5929,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6013 | struct tcp_sock *tp = tcp_sk(sk); | 5929 | struct tcp_sock *tp = tcp_sk(sk); |
6014 | struct inet_connection_sock *icsk = inet_csk(sk); | 5930 | struct inet_connection_sock *icsk = inet_csk(sk); |
6015 | int queued = 0; | 5931 | int queued = 0; |
6016 | int res; | ||
6017 | 5932 | ||
6018 | tp->rx_opt.saw_tstamp = 0; | 5933 | tp->rx_opt.saw_tstamp = 0; |
6019 | 5934 | ||
@@ -6068,9 +5983,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6068 | return 0; | 5983 | return 0; |
6069 | } | 5984 | } |
6070 | 5985 | ||
6071 | res = tcp_validate_incoming(sk, skb, th, 0); | 5986 | if (!tcp_validate_incoming(sk, skb, th, 0)) |
6072 | if (res <= 0) | 5987 | return 0; |
6073 | return -res; | ||
6074 | 5988 | ||
6075 | /* step 5: check the ACK field */ | 5989 | /* step 5: check the ACK field */ |
6076 | if (th->ack) { | 5990 | if (th->ack) { |
@@ -6126,9 +6040,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6126 | 6040 | ||
6127 | case TCP_FIN_WAIT1: | 6041 | case TCP_FIN_WAIT1: |
6128 | if (tp->snd_una == tp->write_seq) { | 6042 | if (tp->snd_una == tp->write_seq) { |
6043 | struct dst_entry *dst; | ||
6044 | |||
6129 | tcp_set_state(sk, TCP_FIN_WAIT2); | 6045 | tcp_set_state(sk, TCP_FIN_WAIT2); |
6130 | sk->sk_shutdown |= SEND_SHUTDOWN; | 6046 | sk->sk_shutdown |= SEND_SHUTDOWN; |
6131 | dst_confirm(__sk_dst_get(sk)); | 6047 | |
6048 | dst = __sk_dst_get(sk); | ||
6049 | if (dst) | ||
6050 | dst_confirm(dst); | ||
6132 | 6051 | ||
6133 | if (!sock_flag(sk, SOCK_DEAD)) | 6052 | if (!sock_flag(sk, SOCK_DEAD)) |
6134 | /* Wake up lingering close() */ | 6053 | /* Wake up lingering close() */ |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c8d28c433b2b..3e30548ac32a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
209 | } | 209 | } |
210 | 210 | ||
211 | if (tcp_death_row.sysctl_tw_recycle && | 211 | if (tcp_death_row.sysctl_tw_recycle && |
212 | !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { | 212 | !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) |
213 | struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); | 213 | tcp_fetch_timewait_stamp(sk, &rt->dst); |
214 | /* | ||
215 | * VJ's idea. We save last timestamp seen from | ||
216 | * the destination in peer table, when entering state | ||
217 | * TIME-WAIT * and initialize rx_opt.ts_recent from it, | ||
218 | * when trying new connection. | ||
219 | */ | ||
220 | if (peer) { | ||
221 | inet_peer_refcheck(peer); | ||
222 | if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { | ||
223 | tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; | ||
224 | tp->rx_opt.ts_recent = peer->tcp_ts; | ||
225 | } | ||
226 | } | ||
227 | } | ||
228 | 214 | ||
229 | inet->inet_dport = usin->sin_port; | 215 | inet->inet_dport = usin->sin_port; |
230 | inet->inet_daddr = daddr; | 216 | inet->inet_daddr = daddr; |
@@ -289,12 +275,15 @@ failure: | |||
289 | EXPORT_SYMBOL(tcp_v4_connect); | 275 | EXPORT_SYMBOL(tcp_v4_connect); |
290 | 276 | ||
291 | /* | 277 | /* |
292 | * This routine does path mtu discovery as defined in RFC1191. | 278 | * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. |
279 | * It can be called through tcp_release_cb() if the socket was owned by the user | ||
280 | * at the time tcp_v4_err() was called to handle the ICMP message. | ||
293 | */ | 281 | */ |
294 | static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) | 282 | static void tcp_v4_mtu_reduced(struct sock *sk) |
295 | { | 283 | { |
296 | struct dst_entry *dst; | 284 | struct dst_entry *dst; |
297 | struct inet_sock *inet = inet_sk(sk); | 285 | struct inet_sock *inet = inet_sk(sk); |
286 | u32 mtu = tcp_sk(sk)->mtu_info; | ||
298 | 287 | ||
299 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs | 288 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs |
300 | * send out by Linux are always <576bytes so they should go through | 289 | * send out by Linux are always <576bytes so they should go through |
@@ -303,17 +292,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) | |||
303 | if (sk->sk_state == TCP_LISTEN) | 292 | if (sk->sk_state == TCP_LISTEN) |
304 | return; | 293 | return; |
305 | 294 | ||
306 | /* We don't check in the destentry if pmtu discovery is forbidden | 295 | dst = inet_csk_update_pmtu(sk, mtu); |
307 | * on this route. We just assume that no packet_to_big packets | 296 | if (!dst) |
308 | * are send back when pmtu discovery is not active. | ||
309 | * There is a small race when the user changes this flag in the | ||
310 | * route, but I think that's acceptable. | ||
311 | */ | ||
312 | if ((dst = __sk_dst_check(sk, 0)) == NULL) | ||
313 | return; | 297 | return; |
314 | 298 | ||
315 | dst->ops->update_pmtu(dst, mtu); | ||
316 | |||
317 | /* Something is about to be wrong... Remember soft error | 299 | /* Something is about to be wrong... Remember soft error |
318 | * for the case, if this connection will not able to recover. | 300 | * for the case, if this connection will not able to recover. |
319 | */ | 301 | */ |
@@ -335,6 +317,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) | |||
335 | } /* else let the usual retransmit timer handle it */ | 317 | } /* else let the usual retransmit timer handle it */ |
336 | } | 318 | } |
337 | 319 | ||
320 | static void do_redirect(struct sk_buff *skb, struct sock *sk) | ||
321 | { | ||
322 | struct dst_entry *dst = __sk_dst_check(sk, 0); | ||
323 | |||
324 | if (dst) | ||
325 | dst->ops->redirect(dst, sk, skb); | ||
326 | } | ||
327 | |||
338 | /* | 328 | /* |
339 | * This routine is called by the ICMP module when it gets some | 329 | * This routine is called by the ICMP module when it gets some |
340 | * sort of error condition. If err < 0 then the socket should | 330 | * sort of error condition. If err < 0 then the socket should |
@@ -386,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
386 | bh_lock_sock(sk); | 376 | bh_lock_sock(sk); |
387 | /* If too many ICMPs get dropped on busy | 377 | /* If too many ICMPs get dropped on busy |
388 | * servers this needs to be solved differently. | 378 | * servers this needs to be solved differently. |
379 | * We do take care of PMTU discovery (RFC1191) special case : | ||
380 | * we can receive locally generated ICMP messages while socket is held. | ||
389 | */ | 381 | */ |
390 | if (sock_owned_by_user(sk)) | 382 | if (sock_owned_by_user(sk) && |
383 | type != ICMP_DEST_UNREACH && | ||
384 | code != ICMP_FRAG_NEEDED) | ||
391 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); | 385 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); |
392 | 386 | ||
393 | if (sk->sk_state == TCP_CLOSE) | 387 | if (sk->sk_state == TCP_CLOSE) |
@@ -408,6 +402,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
408 | } | 402 | } |
409 | 403 | ||
410 | switch (type) { | 404 | switch (type) { |
405 | case ICMP_REDIRECT: | ||
406 | do_redirect(icmp_skb, sk); | ||
407 | goto out; | ||
411 | case ICMP_SOURCE_QUENCH: | 408 | case ICMP_SOURCE_QUENCH: |
412 | /* Just silently ignore these. */ | 409 | /* Just silently ignore these. */ |
413 | goto out; | 410 | goto out; |
@@ -419,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
419 | goto out; | 416 | goto out; |
420 | 417 | ||
421 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ | 418 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ |
419 | tp->mtu_info = info; | ||
422 | if (!sock_owned_by_user(sk)) | 420 | if (!sock_owned_by_user(sk)) |
423 | do_pmtu_discovery(sk, iph, info); | 421 | tcp_v4_mtu_reduced(sk); |
422 | else | ||
423 | set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); | ||
424 | goto out; | 424 | goto out; |
425 | } | 425 | } |
426 | 426 | ||
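Rather than dropping ICMP_FRAG_NEEDED while the socket is locked by the user, the handler now stashes the advertised MTU in tp->mtu_info and sets TCP_MTU_REDUCED_DEFERRED so that tcp_release_cb() (hooked up through the new .release_cb/.mtu_reduced proto operations later in this diff) applies it once the lock is dropped. A toy sketch of that defer-and-replay pattern, with made-up names and a single flag word, purely for illustration:

#include <stdint.h>

#define MTU_REDUCED_DEFERRED  (1UL << 0)

struct toy_sock {
        unsigned long deferred_flags;
        uint32_t      mtu_info;
        int           owned_by_user;
};

static void toy_mtu_reduced(struct toy_sock *sk)
{
        (void)sk;   /* a real implementation would shrink the path MTU here */
}

/* ICMP handler: apply the new MTU immediately if the lock is free,
 * otherwise record it and set a flag for later.
 */
static void toy_icmp_frag_needed(struct toy_sock *sk, uint32_t mtu)
{
        sk->mtu_info = mtu;
        if (!sk->owned_by_user)
                toy_mtu_reduced(sk);
        else
                sk->deferred_flags |= MTU_REDUCED_DEFERRED;
}

/* Called when the user releases the socket lock: replay deferred work. */
static void toy_release_cb(struct toy_sock *sk)
{
        if (sk->deferred_flags & MTU_REDUCED_DEFERRED) {
                sk->deferred_flags &= ~MTU_REDUCED_DEFERRED;
                toy_mtu_reduced(sk);
        }
}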
@@ -698,8 +698,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
698 | 698 | ||
699 | net = dev_net(skb_dst(skb)->dev); | 699 | net = dev_net(skb_dst(skb)->dev); |
700 | arg.tos = ip_hdr(skb)->tos; | 700 | arg.tos = ip_hdr(skb)->tos; |
701 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, | 701 | ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, |
702 | &arg, arg.iov[0].iov_len); | 702 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); |
703 | 703 | ||
704 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 704 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
705 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); | 705 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); |
@@ -781,8 +781,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
781 | if (oif) | 781 | if (oif) |
782 | arg.bound_dev_if = oif; | 782 | arg.bound_dev_if = oif; |
783 | arg.tos = tos; | 783 | arg.tos = tos; |
784 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, | 784 | ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, |
785 | &arg, arg.iov[0].iov_len); | 785 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); |
786 | 786 | ||
787 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 787 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
788 | } | 788 | } |
@@ -825,7 +825,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
825 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | 825 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, |
826 | struct request_sock *req, | 826 | struct request_sock *req, |
827 | struct request_values *rvp, | 827 | struct request_values *rvp, |
828 | u16 queue_mapping) | 828 | u16 queue_mapping, |
829 | bool nocache) | ||
829 | { | 830 | { |
830 | const struct inet_request_sock *ireq = inet_rsk(req); | 831 | const struct inet_request_sock *ireq = inet_rsk(req); |
831 | struct flowi4 fl4; | 832 | struct flowi4 fl4; |
@@ -848,7 +849,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
848 | err = net_xmit_eval(err); | 849 | err = net_xmit_eval(err); |
849 | } | 850 | } |
850 | 851 | ||
851 | dst_release(dst); | ||
852 | return err; | 852 | return err; |
853 | } | 853 | } |
854 | 854 | ||
@@ -856,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, | |||
856 | struct request_values *rvp) | 856 | struct request_values *rvp) |
857 | { | 857 | { |
858 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); | 858 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
859 | return tcp_v4_send_synack(sk, NULL, req, rvp, 0); | 859 | return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); |
860 | } | 860 | } |
861 | 861 | ||
862 | /* | 862 | /* |
@@ -1317,7 +1317,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1317 | tcp_clear_options(&tmp_opt); | 1317 | tcp_clear_options(&tmp_opt); |
1318 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; | 1318 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; |
1319 | tmp_opt.user_mss = tp->rx_opt.user_mss; | 1319 | tmp_opt.user_mss = tp->rx_opt.user_mss; |
1320 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0); | 1320 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); |
1321 | 1321 | ||
1322 | if (tmp_opt.cookie_plus > 0 && | 1322 | if (tmp_opt.cookie_plus > 0 && |
1323 | tmp_opt.saw_tstamp && | 1323 | tmp_opt.saw_tstamp && |
@@ -1375,7 +1375,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1375 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); | 1375 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); |
1376 | req->cookie_ts = tmp_opt.tstamp_ok; | 1376 | req->cookie_ts = tmp_opt.tstamp_ok; |
1377 | } else if (!isn) { | 1377 | } else if (!isn) { |
1378 | struct inet_peer *peer = NULL; | ||
1379 | struct flowi4 fl4; | 1378 | struct flowi4 fl4; |
1380 | 1379 | ||
1381 | /* VJ's idea. We save last timestamp seen | 1380 | /* VJ's idea. We save last timestamp seen |
@@ -1390,12 +1389,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1390 | if (tmp_opt.saw_tstamp && | 1389 | if (tmp_opt.saw_tstamp && |
1391 | tcp_death_row.sysctl_tw_recycle && | 1390 | tcp_death_row.sysctl_tw_recycle && |
1392 | (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && | 1391 | (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && |
1393 | fl4.daddr == saddr && | 1392 | fl4.daddr == saddr) { |
1394 | (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { | 1393 | if (!tcp_peer_is_proven(req, dst, true)) { |
1395 | inet_peer_refcheck(peer); | ||
1396 | if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && | ||
1397 | (s32)(peer->tcp_ts - req->ts_recent) > | ||
1398 | TCP_PAWS_WINDOW) { | ||
1399 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); | 1394 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); |
1400 | goto drop_and_release; | 1395 | goto drop_and_release; |
1401 | } | 1396 | } |
@@ -1404,8 +1399,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1404 | else if (!sysctl_tcp_syncookies && | 1399 | else if (!sysctl_tcp_syncookies && |
1405 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < | 1400 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
1406 | (sysctl_max_syn_backlog >> 2)) && | 1401 | (sysctl_max_syn_backlog >> 2)) && |
1407 | (!peer || !peer->tcp_ts_stamp) && | 1402 | !tcp_peer_is_proven(req, dst, false)) { |
1408 | (!dst || !dst_metric(dst, RTAX_RTT))) { | ||
1409 | /* Without syncookies last quarter of | 1403 | /* Without syncookies last quarter of |
1410 | * backlog is filled with destinations, | 1404 | * backlog is filled with destinations, |
1411 | * proven to be alive. | 1405 | * proven to be alive. |
@@ -1425,7 +1419,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1425 | 1419 | ||
1426 | if (tcp_v4_send_synack(sk, dst, req, | 1420 | if (tcp_v4_send_synack(sk, dst, req, |
1427 | (struct request_values *)&tmp_ext, | 1421 | (struct request_values *)&tmp_ext, |
1428 | skb_get_queue_mapping(skb)) || | 1422 | skb_get_queue_mapping(skb), |
1423 | want_cookie) || | ||
1429 | want_cookie) | 1424 | want_cookie) |
1430 | goto drop_and_free; | 1425 | goto drop_and_free; |
1431 | 1426 | ||
@@ -1623,6 +1618,20 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
1623 | 1618 | ||
1624 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ | 1619 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ |
1625 | sock_rps_save_rxhash(sk, skb); | 1620 | sock_rps_save_rxhash(sk, skb); |
1621 | if (sk->sk_rx_dst) { | ||
1622 | struct dst_entry *dst = sk->sk_rx_dst; | ||
1623 | if (dst->ops->check(dst, 0) == NULL) { | ||
1624 | dst_release(dst); | ||
1625 | sk->sk_rx_dst = NULL; | ||
1626 | } | ||
1627 | } | ||
1628 | if (unlikely(sk->sk_rx_dst == NULL)) { | ||
1629 | struct inet_sock *icsk = inet_sk(sk); | ||
1630 | struct rtable *rt = skb_rtable(skb); | ||
1631 | |||
1632 | sk->sk_rx_dst = dst_clone(&rt->dst); | ||
1633 | icsk->rx_dst_ifindex = inet_iif(skb); | ||
1634 | } | ||
1626 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { | 1635 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { |
1627 | rsk = sk; | 1636 | rsk = sk; |
1628 | goto reset; | 1637 | goto reset; |
@@ -1672,6 +1681,49 @@ csum_err: | |||
1672 | } | 1681 | } |
1673 | EXPORT_SYMBOL(tcp_v4_do_rcv); | 1682 | EXPORT_SYMBOL(tcp_v4_do_rcv); |
1674 | 1683 | ||
1684 | void tcp_v4_early_demux(struct sk_buff *skb) | ||
1685 | { | ||
1686 | struct net *net = dev_net(skb->dev); | ||
1687 | const struct iphdr *iph; | ||
1688 | const struct tcphdr *th; | ||
1689 | struct net_device *dev; | ||
1690 | struct sock *sk; | ||
1691 | |||
1692 | if (skb->pkt_type != PACKET_HOST) | ||
1693 | return; | ||
1694 | |||
1695 | if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) | ||
1696 | return; | ||
1697 | |||
1698 | iph = ip_hdr(skb); | ||
1699 | th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); | ||
1700 | |||
1701 | if (th->doff < sizeof(struct tcphdr) / 4) | ||
1702 | return; | ||
1703 | |||
1704 | if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) | ||
1705 | return; | ||
1706 | |||
1707 | dev = skb->dev; | ||
1708 | sk = __inet_lookup_established(net, &tcp_hashinfo, | ||
1709 | iph->saddr, th->source, | ||
1710 | iph->daddr, ntohs(th->dest), | ||
1711 | dev->ifindex); | ||
1712 | if (sk) { | ||
1713 | skb->sk = sk; | ||
1714 | skb->destructor = sock_edemux; | ||
1715 | if (sk->sk_state != TCP_TIME_WAIT) { | ||
1716 | struct dst_entry *dst = sk->sk_rx_dst; | ||
1717 | struct inet_sock *icsk = inet_sk(sk); | ||
1718 | if (dst) | ||
1719 | dst = dst_check(dst, 0); | ||
1720 | if (dst && | ||
1721 | icsk->rx_dst_ifindex == dev->ifindex) | ||
1722 | skb_dst_set_noref(skb, dst); | ||
1723 | } | ||
1724 | } | ||
1725 | } | ||
1726 | |||
1675 | /* | 1727 | /* |
1676 | * From tcp_input.c | 1728 | * From tcp_input.c |
1677 | */ | 1729 | */ |
@@ -1821,40 +1873,10 @@ do_time_wait: | |||
1821 | goto discard_it; | 1873 | goto discard_it; |
1822 | } | 1874 | } |
1823 | 1875 | ||
1824 | struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) | ||
1825 | { | ||
1826 | struct rtable *rt = (struct rtable *) __sk_dst_get(sk); | ||
1827 | struct inet_sock *inet = inet_sk(sk); | ||
1828 | struct inet_peer *peer; | ||
1829 | |||
1830 | if (!rt || | ||
1831 | inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { | ||
1832 | peer = inet_getpeer_v4(inet->inet_daddr, 1); | ||
1833 | *release_it = true; | ||
1834 | } else { | ||
1835 | if (!rt->peer) | ||
1836 | rt_bind_peer(rt, inet->inet_daddr, 1); | ||
1837 | peer = rt->peer; | ||
1838 | *release_it = false; | ||
1839 | } | ||
1840 | |||
1841 | return peer; | ||
1842 | } | ||
1843 | EXPORT_SYMBOL(tcp_v4_get_peer); | ||
1844 | |||
1845 | void *tcp_v4_tw_get_peer(struct sock *sk) | ||
1846 | { | ||
1847 | const struct inet_timewait_sock *tw = inet_twsk(sk); | ||
1848 | |||
1849 | return inet_getpeer_v4(tw->tw_daddr, 1); | ||
1850 | } | ||
1851 | EXPORT_SYMBOL(tcp_v4_tw_get_peer); | ||
1852 | |||
1853 | static struct timewait_sock_ops tcp_timewait_sock_ops = { | 1876 | static struct timewait_sock_ops tcp_timewait_sock_ops = { |
1854 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | 1877 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), |
1855 | .twsk_unique = tcp_twsk_unique, | 1878 | .twsk_unique = tcp_twsk_unique, |
1856 | .twsk_destructor= tcp_twsk_destructor, | 1879 | .twsk_destructor= tcp_twsk_destructor, |
1857 | .twsk_getpeer = tcp_v4_tw_get_peer, | ||
1858 | }; | 1880 | }; |
1859 | 1881 | ||
1860 | const struct inet_connection_sock_af_ops ipv4_specific = { | 1882 | const struct inet_connection_sock_af_ops ipv4_specific = { |
@@ -1863,7 +1885,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { | |||
1863 | .rebuild_header = inet_sk_rebuild_header, | 1885 | .rebuild_header = inet_sk_rebuild_header, |
1864 | .conn_request = tcp_v4_conn_request, | 1886 | .conn_request = tcp_v4_conn_request, |
1865 | .syn_recv_sock = tcp_v4_syn_recv_sock, | 1887 | .syn_recv_sock = tcp_v4_syn_recv_sock, |
1866 | .get_peer = tcp_v4_get_peer, | ||
1867 | .net_header_len = sizeof(struct iphdr), | 1888 | .net_header_len = sizeof(struct iphdr), |
1868 | .setsockopt = ip_setsockopt, | 1889 | .setsockopt = ip_setsockopt, |
1869 | .getsockopt = ip_getsockopt, | 1890 | .getsockopt = ip_getsockopt, |
@@ -1953,6 +1974,9 @@ void tcp_v4_destroy_sock(struct sock *sk) | |||
1953 | tp->cookie_values = NULL; | 1974 | tp->cookie_values = NULL; |
1954 | } | 1975 | } |
1955 | 1976 | ||
1977 | /* If socket is aborted during connect operation */ | ||
1978 | tcp_free_fastopen_req(tp); | ||
1979 | |||
1956 | sk_sockets_allocated_dec(sk); | 1980 | sk_sockets_allocated_dec(sk); |
1957 | sock_release_memcg(sk); | 1981 | sock_release_memcg(sk); |
1958 | } | 1982 | } |
@@ -2593,6 +2617,8 @@ struct proto tcp_prot = { | |||
2593 | .sendmsg = tcp_sendmsg, | 2617 | .sendmsg = tcp_sendmsg, |
2594 | .sendpage = tcp_sendpage, | 2618 | .sendpage = tcp_sendpage, |
2595 | .backlog_rcv = tcp_v4_do_rcv, | 2619 | .backlog_rcv = tcp_v4_do_rcv, |
2620 | .release_cb = tcp_release_cb, | ||
2621 | .mtu_reduced = tcp_v4_mtu_reduced, | ||
2596 | .hash = inet_hash, | 2622 | .hash = inet_hash, |
2597 | .unhash = inet_unhash, | 2623 | .unhash = inet_unhash, |
2598 | .get_port = inet_csk_get_port, | 2624 | .get_port = inet_csk_get_port, |
@@ -2624,13 +2650,11 @@ EXPORT_SYMBOL(tcp_prot); | |||
2624 | 2650 | ||
2625 | static int __net_init tcp_sk_init(struct net *net) | 2651 | static int __net_init tcp_sk_init(struct net *net) |
2626 | { | 2652 | { |
2627 | return inet_ctl_sock_create(&net->ipv4.tcp_sock, | 2653 | return 0; |
2628 | PF_INET, SOCK_RAW, IPPROTO_TCP, net); | ||
2629 | } | 2654 | } |
2630 | 2655 | ||
2631 | static void __net_exit tcp_sk_exit(struct net *net) | 2656 | static void __net_exit tcp_sk_exit(struct net *net) |
2632 | { | 2657 | { |
2633 | inet_ctl_sock_destroy(net->ipv4.tcp_sock); | ||
2634 | } | 2658 | } |
2635 | 2659 | ||
2636 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) | 2660 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) |
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c new file mode 100644 index 000000000000..2288a6399e1e --- /dev/null +++ b/net/ipv4/tcp_metrics.c | |||
@@ -0,0 +1,745 @@ | |||
1 | #include <linux/rcupdate.h> | ||
2 | #include <linux/spinlock.h> | ||
3 | #include <linux/jiffies.h> | ||
4 | #include <linux/bootmem.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/cache.h> | ||
7 | #include <linux/slab.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/tcp.h> | ||
10 | #include <linux/hash.h> | ||
11 | |||
12 | #include <net/inet_connection_sock.h> | ||
13 | #include <net/net_namespace.h> | ||
14 | #include <net/request_sock.h> | ||
15 | #include <net/inetpeer.h> | ||
16 | #include <net/sock.h> | ||
17 | #include <net/ipv6.h> | ||
18 | #include <net/dst.h> | ||
19 | #include <net/tcp.h> | ||
20 | |||
21 | int sysctl_tcp_nometrics_save __read_mostly; | ||
22 | |||
23 | enum tcp_metric_index { | ||
24 | TCP_METRIC_RTT, | ||
25 | TCP_METRIC_RTTVAR, | ||
26 | TCP_METRIC_SSTHRESH, | ||
27 | TCP_METRIC_CWND, | ||
28 | TCP_METRIC_REORDERING, | ||
29 | |||
30 | /* Always last. */ | ||
31 | TCP_METRIC_MAX, | ||
32 | }; | ||
33 | |||
34 | struct tcp_fastopen_metrics { | ||
35 | u16 mss; | ||
36 | u16 syn_loss:10; /* Recurring Fast Open SYN losses */ | ||
37 | unsigned long last_syn_loss; /* Last Fast Open SYN loss */ | ||
38 | struct tcp_fastopen_cookie cookie; | ||
39 | }; | ||
40 | |||
41 | struct tcp_metrics_block { | ||
42 | struct tcp_metrics_block __rcu *tcpm_next; | ||
43 | struct inetpeer_addr tcpm_addr; | ||
44 | unsigned long tcpm_stamp; | ||
45 | u32 tcpm_ts; | ||
46 | u32 tcpm_ts_stamp; | ||
47 | u32 tcpm_lock; | ||
48 | u32 tcpm_vals[TCP_METRIC_MAX]; | ||
49 | struct tcp_fastopen_metrics tcpm_fastopen; | ||
50 | }; | ||
51 | |||
52 | static bool tcp_metric_locked(struct tcp_metrics_block *tm, | ||
53 | enum tcp_metric_index idx) | ||
54 | { | ||
55 | return tm->tcpm_lock & (1 << idx); | ||
56 | } | ||
57 | |||
58 | static u32 tcp_metric_get(struct tcp_metrics_block *tm, | ||
59 | enum tcp_metric_index idx) | ||
60 | { | ||
61 | return tm->tcpm_vals[idx]; | ||
62 | } | ||
63 | |||
64 | static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, | ||
65 | enum tcp_metric_index idx) | ||
66 | { | ||
67 | return msecs_to_jiffies(tm->tcpm_vals[idx]); | ||
68 | } | ||
69 | |||
70 | static void tcp_metric_set(struct tcp_metrics_block *tm, | ||
71 | enum tcp_metric_index idx, | ||
72 | u32 val) | ||
73 | { | ||
74 | tm->tcpm_vals[idx] = val; | ||
75 | } | ||
76 | |||
77 | static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, | ||
78 | enum tcp_metric_index idx, | ||
79 | u32 val) | ||
80 | { | ||
81 | tm->tcpm_vals[idx] = jiffies_to_msecs(val); | ||
82 | } | ||
83 | |||
84 | static bool addr_same(const struct inetpeer_addr *a, | ||
85 | const struct inetpeer_addr *b) | ||
86 | { | ||
87 | const struct in6_addr *a6, *b6; | ||
88 | |||
89 | if (a->family != b->family) | ||
90 | return false; | ||
91 | if (a->family == AF_INET) | ||
92 | return a->addr.a4 == b->addr.a4; | ||
93 | |||
94 | a6 = (const struct in6_addr *) &a->addr.a6[0]; | ||
95 | b6 = (const struct in6_addr *) &b->addr.a6[0]; | ||
96 | |||
97 | return ipv6_addr_equal(a6, b6); | ||
98 | } | ||
99 | |||
100 | struct tcpm_hash_bucket { | ||
101 | struct tcp_metrics_block __rcu *chain; | ||
102 | }; | ||
103 | |||
104 | static DEFINE_SPINLOCK(tcp_metrics_lock); | ||
105 | |||
106 | static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) | ||
107 | { | ||
108 | u32 val; | ||
109 | |||
110 | tm->tcpm_stamp = jiffies; | ||
111 | |||
112 | val = 0; | ||
113 | if (dst_metric_locked(dst, RTAX_RTT)) | ||
114 | val |= 1 << TCP_METRIC_RTT; | ||
115 | if (dst_metric_locked(dst, RTAX_RTTVAR)) | ||
116 | val |= 1 << TCP_METRIC_RTTVAR; | ||
117 | if (dst_metric_locked(dst, RTAX_SSTHRESH)) | ||
118 | val |= 1 << TCP_METRIC_SSTHRESH; | ||
119 | if (dst_metric_locked(dst, RTAX_CWND)) | ||
120 | val |= 1 << TCP_METRIC_CWND; | ||
121 | if (dst_metric_locked(dst, RTAX_REORDERING)) | ||
122 | val |= 1 << TCP_METRIC_REORDERING; | ||
123 | tm->tcpm_lock = val; | ||
124 | |||
125 | tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); | ||
126 | tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); | ||
127 | tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); | ||
128 | tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); | ||
129 | tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); | ||
130 | tm->tcpm_ts = 0; | ||
131 | tm->tcpm_ts_stamp = 0; | ||
132 | tm->tcpm_fastopen.mss = 0; | ||
133 | tm->tcpm_fastopen.syn_loss = 0; | ||
134 | tm->tcpm_fastopen.cookie.len = 0; | ||
135 | } | ||
136 | |||
137 | static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, | ||
138 | struct inetpeer_addr *addr, | ||
139 | unsigned int hash, | ||
140 | bool reclaim) | ||
141 | { | ||
142 | struct tcp_metrics_block *tm; | ||
143 | struct net *net; | ||
144 | |||
145 | spin_lock_bh(&tcp_metrics_lock); | ||
146 | net = dev_net(dst->dev); | ||
147 | if (unlikely(reclaim)) { | ||
148 | struct tcp_metrics_block *oldest; | ||
149 | |||
150 | oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); | ||
151 | for (tm = rcu_dereference(oldest->tcpm_next); tm; | ||
152 | tm = rcu_dereference(tm->tcpm_next)) { | ||
153 | if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) | ||
154 | oldest = tm; | ||
155 | } | ||
156 | tm = oldest; | ||
157 | } else { | ||
158 | tm = kmalloc(sizeof(*tm), GFP_ATOMIC); | ||
159 | if (!tm) | ||
160 | goto out_unlock; | ||
161 | } | ||
162 | tm->tcpm_addr = *addr; | ||
163 | |||
164 | tcpm_suck_dst(tm, dst); | ||
165 | |||
166 | if (likely(!reclaim)) { | ||
167 | tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; | ||
168 | rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); | ||
169 | } | ||
170 | |||
171 | out_unlock: | ||
172 | spin_unlock_bh(&tcp_metrics_lock); | ||
173 | return tm; | ||
174 | } | ||
175 | |||
176 | #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) | ||
177 | |||
178 | static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) | ||
179 | { | ||
180 | if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) | ||
181 | tcpm_suck_dst(tm, dst); | ||
182 | } | ||
183 | |||
184 | #define TCP_METRICS_RECLAIM_DEPTH 5 | ||
185 | #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL | ||
186 | |||
187 | static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) | ||
188 | { | ||
189 | if (tm) | ||
190 | return tm; | ||
191 | if (depth > TCP_METRICS_RECLAIM_DEPTH) | ||
192 | return TCP_METRICS_RECLAIM_PTR; | ||
193 | return NULL; | ||
194 | } | ||
195 | |||
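tcp_get_encode() turns a deep miss into TCP_METRICS_RECLAIM_PTR: when a bucket's chain already holds more than TCP_METRICS_RECLAIM_DEPTH entries, tcpm_new() reuses the entry with the oldest tcpm_stamp instead of growing the chain. A compact sketch of the same bounded-bucket policy over a flat array (names are illustrative, not kernel API):

#include <stddef.h>

#define RECLAIM_DEPTH 5

struct toy_metric {
        unsigned int  key;
        unsigned long stamp;        /* last time this entry was refreshed */
};

/* Look up key in a bucket of n entries.  On a miss, return NULL while the
 * bucket is still shallow (the caller may allocate a new entry), or the
 * oldest entry once the bucket exceeds RECLAIM_DEPTH (the caller reuses it).
 */
static struct toy_metric *bucket_lookup(struct toy_metric *bucket, size_t n,
                                        unsigned int key)
{
        struct toy_metric *oldest = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
                if (bucket[i].key == key)
                        return &bucket[i];
                if (!oldest || bucket[i].stamp < oldest->stamp)
                        oldest = &bucket[i];
        }
        return n > RECLAIM_DEPTH ? oldest : NULL;
}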
196 | static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, | ||
197 | struct net *net, unsigned int hash) | ||
198 | { | ||
199 | struct tcp_metrics_block *tm; | ||
200 | int depth = 0; | ||
201 | |||
202 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
203 | tm = rcu_dereference(tm->tcpm_next)) { | ||
204 | if (addr_same(&tm->tcpm_addr, addr)) | ||
205 | break; | ||
206 | depth++; | ||
207 | } | ||
208 | return tcp_get_encode(tm, depth); | ||
209 | } | ||
210 | |||
211 | static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, | ||
212 | struct dst_entry *dst) | ||
213 | { | ||
214 | struct tcp_metrics_block *tm; | ||
215 | struct inetpeer_addr addr; | ||
216 | unsigned int hash; | ||
217 | struct net *net; | ||
218 | |||
219 | addr.family = req->rsk_ops->family; | ||
220 | switch (addr.family) { | ||
221 | case AF_INET: | ||
222 | addr.addr.a4 = inet_rsk(req)->rmt_addr; | ||
223 | hash = (__force unsigned int) addr.addr.a4; | ||
224 | break; | ||
225 | case AF_INET6: | ||
226 | *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; | ||
227 | hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr); | ||
228 | break; | ||
229 | default: | ||
230 | return NULL; | ||
231 | } | ||
232 | |||
233 | net = dev_net(dst->dev); | ||
234 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
235 | |||
236 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
237 | tm = rcu_dereference(tm->tcpm_next)) { | ||
238 | if (addr_same(&tm->tcpm_addr, &addr)) | ||
239 | break; | ||
240 | } | ||
241 | tcpm_check_stamp(tm, dst); | ||
242 | return tm; | ||
243 | } | ||
244 | |||
245 | static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) | ||
246 | { | ||
247 | struct inet6_timewait_sock *tw6; | ||
248 | struct tcp_metrics_block *tm; | ||
249 | struct inetpeer_addr addr; | ||
250 | unsigned int hash; | ||
251 | struct net *net; | ||
252 | |||
253 | addr.family = tw->tw_family; | ||
254 | switch (addr.family) { | ||
255 | case AF_INET: | ||
256 | addr.addr.a4 = tw->tw_daddr; | ||
257 | hash = (__force unsigned int) addr.addr.a4; | ||
258 | break; | ||
259 | case AF_INET6: | ||
260 | tw6 = inet6_twsk((struct sock *)tw); | ||
261 | *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; | ||
262 | hash = ipv6_addr_hash(&tw6->tw_v6_daddr); | ||
263 | break; | ||
264 | default: | ||
265 | return NULL; | ||
266 | } | ||
267 | |||
268 | net = twsk_net(tw); | ||
269 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
270 | |||
271 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
272 | tm = rcu_dereference(tm->tcpm_next)) { | ||
273 | if (addr_same(&tm->tcpm_addr, &addr)) | ||
274 | break; | ||
275 | } | ||
276 | return tm; | ||
277 | } | ||
278 | |||
279 | static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, | ||
280 | struct dst_entry *dst, | ||
281 | bool create) | ||
282 | { | ||
283 | struct tcp_metrics_block *tm; | ||
284 | struct inetpeer_addr addr; | ||
285 | unsigned int hash; | ||
286 | struct net *net; | ||
287 | bool reclaim; | ||
288 | |||
289 | addr.family = sk->sk_family; | ||
290 | switch (addr.family) { | ||
291 | case AF_INET: | ||
292 | addr.addr.a4 = inet_sk(sk)->inet_daddr; | ||
293 | hash = (__force unsigned int) addr.addr.a4; | ||
294 | break; | ||
295 | case AF_INET6: | ||
296 | *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; | ||
297 | hash = ipv6_addr_hash(&inet6_sk(sk)->daddr); | ||
298 | break; | ||
299 | default: | ||
300 | return NULL; | ||
301 | } | ||
302 | |||
303 | net = dev_net(dst->dev); | ||
304 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
305 | |||
306 | tm = __tcp_get_metrics(&addr, net, hash); | ||
307 | reclaim = false; | ||
308 | if (tm == TCP_METRICS_RECLAIM_PTR) { | ||
309 | reclaim = true; | ||
310 | tm = NULL; | ||
311 | } | ||
312 | if (!tm && create) | ||
313 | tm = tcpm_new(dst, &addr, hash, reclaim); | ||
314 | else | ||
315 | tcpm_check_stamp(tm, dst); | ||
316 | |||
317 | return tm; | ||
318 | } | ||
319 | |||
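All three lookup paths reduce the peer address to a bucket index with hash_32(), a golden-ratio multiplicative hash truncated to net->ipv4.tcp_metrics_hash_log bits (IPv6 addresses are first folded by ipv6_addr_hash()). A minimal sketch of the IPv4 case using the 32-bit golden-ratio constant the kernel used at the time; the function name is illustrative and hash_log is assumed to be between 1 and 31:

#include <stdint.h>

#define GOLDEN_RATIO_PRIME_32 0x9e370001u

/* Multiplicative hash of an IPv4 destination into one of 2^hash_log
 * buckets, mirroring hash_32(addr, tcp_metrics_hash_log).
 */
static unsigned int tcpm_bucket_ipv4(uint32_t daddr, unsigned int hash_log)
{
        uint32_t h = daddr * GOLDEN_RATIO_PRIME_32;

        return h >> (32 - hash_log);
}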
320 | /* Save metrics learned by this TCP session. This function is called | ||
321 | * only when TCP finishes successfully, i.e. when it enters TIME-WAIT | ||
322 | * or goes from LAST-ACK to CLOSE. | ||
323 | */ | ||
324 | void tcp_update_metrics(struct sock *sk) | ||
325 | { | ||
326 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
327 | struct dst_entry *dst = __sk_dst_get(sk); | ||
328 | struct tcp_sock *tp = tcp_sk(sk); | ||
329 | struct tcp_metrics_block *tm; | ||
330 | unsigned long rtt; | ||
331 | u32 val; | ||
332 | int m; | ||
333 | |||
334 | if (sysctl_tcp_nometrics_save || !dst) | ||
335 | return; | ||
336 | |||
337 | if (dst->flags & DST_HOST) | ||
338 | dst_confirm(dst); | ||
339 | |||
340 | rcu_read_lock(); | ||
341 | if (icsk->icsk_backoff || !tp->srtt) { | ||
342 | /* This session failed to estimate rtt. Why? | ||
343 | * Probably, no packets returned in time. Reset our | ||
344 | * results. | ||
345 | */ | ||
346 | tm = tcp_get_metrics(sk, dst, false); | ||
347 | if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) | ||
348 | tcp_metric_set(tm, TCP_METRIC_RTT, 0); | ||
349 | goto out_unlock; | ||
350 | } else | ||
351 | tm = tcp_get_metrics(sk, dst, true); | ||
352 | |||
353 | if (!tm) | ||
354 | goto out_unlock; | ||
355 | |||
356 | rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); | ||
357 | m = rtt - tp->srtt; | ||
358 | |||
359 | /* If the newly calculated rtt is larger than the stored one, store the | ||
360 | * new one. Otherwise, use EWMA. Remember, rtt overestimation is | ||
361 | * always better than underestimation. | ||
362 | */ | ||
363 | if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { | ||
364 | if (m <= 0) | ||
365 | rtt = tp->srtt; | ||
366 | else | ||
367 | rtt -= (m >> 3); | ||
368 | tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); | ||
369 | } | ||
370 | |||
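The block above keeps the cached RTT biased toward overestimation: a sample larger than the cache replaces it outright, while a smaller sample only pulls the cache down by one eighth of the difference. The same rule in isolation, as a tiny sketch (values in jiffies, name illustrative):

/* Update a cached RTT from a new smoothed sample.  Larger samples are
 * adopted immediately; smaller samples move the cache down by m/8.
 */
static unsigned long update_cached_rtt(unsigned long cached, unsigned long srtt)
{
        long m = (long)cached - (long)srtt;

        if (m <= 0)
                return srtt;              /* new sample is larger: adopt it */
        return cached - (m >> 3);         /* otherwise decay 1/8 toward it  */
}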
371 | if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { | ||
372 | unsigned long var; | ||
373 | |||
374 | if (m < 0) | ||
375 | m = -m; | ||
376 | |||
377 | /* Scale deviation to rttvar fixed point */ | ||
378 | m >>= 1; | ||
379 | if (m < tp->mdev) | ||
380 | m = tp->mdev; | ||
381 | |||
382 | var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
383 | if (m >= var) | ||
384 | var = m; | ||
385 | else | ||
386 | var -= (var - m) >> 2; | ||
387 | |||
388 | tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); | ||
389 | } | ||
390 | |||
391 | if (tcp_in_initial_slowstart(tp)) { | ||
392 | /* Slow start still did not finish. */ | ||
393 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { | ||
394 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
395 | if (val && (tp->snd_cwnd >> 1) > val) | ||
396 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
397 | tp->snd_cwnd >> 1); | ||
398 | } | ||
399 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
400 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
401 | if (tp->snd_cwnd > val) | ||
402 | tcp_metric_set(tm, TCP_METRIC_CWND, | ||
403 | tp->snd_cwnd); | ||
404 | } | ||
405 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | ||
406 | icsk->icsk_ca_state == TCP_CA_Open) { | ||
407 | /* Cong. avoidance phase, cwnd is reliable. */ | ||
408 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) | ||
409 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
410 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | ||
411 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
412 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
413 | tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); | ||
414 | } | ||
415 | } else { | ||
416 | /* Else slow start did not finish, cwnd is nonsense, | ||
417 | * ssthresh may also be invalid. | ||
418 | */ | ||
419 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
420 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
421 | tcp_metric_set(tm, TCP_METRIC_CWND, | ||
422 | (val + tp->snd_ssthresh) >> 1); | ||
423 | } | ||
424 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { | ||
425 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
426 | if (val && tp->snd_ssthresh > val) | ||
427 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
428 | tp->snd_ssthresh); | ||
429 | } | ||
430 | if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { | ||
431 | val = tcp_metric_get(tm, TCP_METRIC_REORDERING); | ||
432 | if (val < tp->reordering && | ||
433 | tp->reordering != sysctl_tcp_reordering) | ||
434 | tcp_metric_set(tm, TCP_METRIC_REORDERING, | ||
435 | tp->reordering); | ||
436 | } | ||
437 | } | ||
438 | tm->tcpm_stamp = jiffies; | ||
439 | out_unlock: | ||
440 | rcu_read_unlock(); | ||
441 | } | ||
442 | |||
443 | /* Initialize metrics on socket. */ | ||
444 | |||
445 | void tcp_init_metrics(struct sock *sk) | ||
446 | { | ||
447 | struct dst_entry *dst = __sk_dst_get(sk); | ||
448 | struct tcp_sock *tp = tcp_sk(sk); | ||
449 | struct tcp_metrics_block *tm; | ||
450 | u32 val; | ||
451 | |||
452 | if (dst == NULL) | ||
453 | goto reset; | ||
454 | |||
455 | dst_confirm(dst); | ||
456 | |||
457 | rcu_read_lock(); | ||
458 | tm = tcp_get_metrics(sk, dst, true); | ||
459 | if (!tm) { | ||
460 | rcu_read_unlock(); | ||
461 | goto reset; | ||
462 | } | ||
463 | |||
464 | if (tcp_metric_locked(tm, TCP_METRIC_CWND)) | ||
465 | tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
466 | |||
467 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
468 | if (val) { | ||
469 | tp->snd_ssthresh = val; | ||
470 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | ||
471 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | ||
472 | } else { | ||
473 | /* ssthresh may have been reduced unnecessarily during | ||
474 | * 3WHS. Restore it back to its initial default. | ||
475 | */ | ||
476 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
477 | } | ||
478 | val = tcp_metric_get(tm, TCP_METRIC_REORDERING); | ||
479 | if (val && tp->reordering != val) { | ||
480 | tcp_disable_fack(tp); | ||
481 | tcp_disable_early_retrans(tp); | ||
482 | tp->reordering = val; | ||
483 | } | ||
484 | |||
485 | val = tcp_metric_get(tm, TCP_METRIC_RTT); | ||
486 | if (val == 0 || tp->srtt == 0) { | ||
487 | rcu_read_unlock(); | ||
488 | goto reset; | ||
489 | } | ||
490 | /* Initial rtt is determined from SYN,SYN-ACK. | ||
491 | * The segment is small and rtt may appear much | ||
492 | * less than the real one. Use per-dst memory | ||
493 | * to make it more realistic. | ||
494 | * | ||
495 | * A bit of theory. RTT is the time that passes after a "normal" sized packet | ||
496 | * is sent until it is ACKed. In normal circumstances sending small | ||
497 | * packets forces the peer to delay ACKs, so the calculation stays correct. | ||
498 | * The algorithm is adaptive and, provided we follow specs, it | ||
499 | * NEVER underestimates RTT. BUT! If the peer plays clever | ||
500 | * tricks such as "quick acks" for long enough to drive RTT down | ||
501 | * to a low value, and then abruptly stops doing so and starts delaying | ||
502 | * ACKs, expect trouble. | ||
503 | */ | ||
504 | val = msecs_to_jiffies(val); | ||
505 | if (val > tp->srtt) { | ||
506 | tp->srtt = val; | ||
507 | tp->rtt_seq = tp->snd_nxt; | ||
508 | } | ||
509 | val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
510 | if (val > tp->mdev) { | ||
511 | tp->mdev = val; | ||
512 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
513 | } | ||
514 | rcu_read_unlock(); | ||
515 | |||
516 | tcp_set_rto(sk); | ||
517 | reset: | ||
518 | if (tp->srtt == 0) { | ||
519 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | ||
520 | * 3WHS. This is most likely due to retransmission, | ||
521 | * including a spurious one. Reset the RTO back to 3 secs | ||
522 | * from the more aggressive 1sec to avoid more spurious | ||
523 | * retransmission. | ||
524 | */ | ||
525 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | ||
526 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | ||
527 | } | ||
528 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | ||
529 | * retransmitted. In light of RFC6298's more aggressive 1 sec | ||
530 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | ||
531 | * retransmission has occurred. | ||
532 | */ | ||
533 | if (tp->total_retrans > 1) | ||
534 | tp->snd_cwnd = 1; | ||
535 | else | ||
536 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
537 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
538 | } | ||
539 | |||
540 | bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) | ||
541 | { | ||
542 | struct tcp_metrics_block *tm; | ||
543 | bool ret; | ||
544 | |||
545 | if (!dst) | ||
546 | return false; | ||
547 | |||
548 | rcu_read_lock(); | ||
549 | tm = __tcp_get_metrics_req(req, dst); | ||
550 | if (paws_check) { | ||
551 | if (tm && | ||
552 | (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && | ||
553 | (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) | ||
554 | ret = false; | ||
555 | else | ||
556 | ret = true; | ||
557 | } else { | ||
558 | if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) | ||
559 | ret = true; | ||
560 | else | ||
561 | ret = false; | ||
562 | } | ||
563 | rcu_read_unlock(); | ||
564 | |||
565 | return ret; | ||
566 | } | ||
567 | EXPORT_SYMBOL_GPL(tcp_peer_is_proven); | ||
568 | |||
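tcp_peer_is_proven() compares 32-bit timestamps that wrap: casting the unsigned difference to s32 yields a signed distance that stays correct across the wrap, which is why the PAWS test is written as (s32)(tm->tcpm_ts - req->ts_recent) rather than a direct comparison. A self-contained illustration (constants chosen arbitrarily for the demo):

#include <stdio.h>
#include <stdint.h>

/* Returns a signed distance between two wrapping u32 timestamps. */
static int32_t ts_delta(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b);
}

int main(void)
{
	uint32_t old_ts = 0xfffffff0u;		/* just before wrap */
	uint32_t new_ts = 0x00000010u;		/* just after wrap  */

	/* A direct comparison calls new_ts "smaller"; the signed
	 * delta correctly reports that it is 0x20 ticks newer. */
	printf("naive: %d, delta: %d\n", new_ts > old_ts, ts_delta(new_ts, old_ts));
	return 0;
}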
569 | void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) | ||
570 | { | ||
571 | struct tcp_metrics_block *tm; | ||
572 | |||
573 | rcu_read_lock(); | ||
574 | tm = tcp_get_metrics(sk, dst, true); | ||
575 | if (tm) { | ||
576 | struct tcp_sock *tp = tcp_sk(sk); | ||
577 | |||
578 | if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { | ||
579 | tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; | ||
580 | tp->rx_opt.ts_recent = tm->tcpm_ts; | ||
581 | } | ||
582 | } | ||
583 | rcu_read_unlock(); | ||
584 | } | ||
585 | EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); | ||
586 | |||
587 | /* VJ's idea. Save last timestamp seen from this destination and hold | ||
588 | * it at least for normal timewait interval to use for duplicate | ||
589 | * segment detection in subsequent connections, before they enter | ||
590 | * synchronized state. | ||
591 | */ | ||
592 | bool tcp_remember_stamp(struct sock *sk) | ||
593 | { | ||
594 | struct dst_entry *dst = __sk_dst_get(sk); | ||
595 | bool ret = false; | ||
596 | |||
597 | if (dst) { | ||
598 | struct tcp_metrics_block *tm; | ||
599 | |||
600 | rcu_read_lock(); | ||
601 | tm = tcp_get_metrics(sk, dst, true); | ||
602 | if (tm) { | ||
603 | struct tcp_sock *tp = tcp_sk(sk); | ||
604 | |||
605 | if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || | ||
606 | ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && | ||
607 | tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
608 | tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
609 | tm->tcpm_ts = tp->rx_opt.ts_recent; | ||
610 | } | ||
611 | ret = true; | ||
612 | } | ||
613 | rcu_read_unlock(); | ||
614 | } | ||
615 | return ret; | ||
616 | } | ||
617 | |||
618 | bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) | ||
619 | { | ||
620 | struct tcp_metrics_block *tm; | ||
621 | bool ret = false; | ||
622 | |||
623 | rcu_read_lock(); | ||
624 | tm = __tcp_get_metrics_tw(tw); | ||
625 | if (tm) { | ||
626 | const struct tcp_timewait_sock *tcptw; | ||
627 | struct sock *sk = (struct sock *) tw; | ||
628 | |||
629 | tcptw = tcp_twsk(sk); | ||
630 | if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || | ||
631 | ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && | ||
632 | tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
633 | tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
634 | tm->tcpm_ts = tcptw->tw_ts_recent; | ||
635 | } | ||
636 | ret = true; | ||
637 | } | ||
638 | rcu_read_unlock(); | ||
639 | |||
640 | return ret; | ||
641 | } | ||
642 | |||
643 | static DEFINE_SEQLOCK(fastopen_seqlock); | ||
644 | |||
645 | void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, | ||
646 | struct tcp_fastopen_cookie *cookie, | ||
647 | int *syn_loss, unsigned long *last_syn_loss) | ||
648 | { | ||
649 | struct tcp_metrics_block *tm; | ||
650 | |||
651 | rcu_read_lock(); | ||
652 | tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); | ||
653 | if (tm) { | ||
654 | struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; | ||
655 | unsigned int seq; | ||
656 | |||
657 | do { | ||
658 | seq = read_seqbegin(&fastopen_seqlock); | ||
659 | if (tfom->mss) | ||
660 | *mss = tfom->mss; | ||
661 | *cookie = tfom->cookie; | ||
662 | *syn_loss = tfom->syn_loss; | ||
663 | *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; | ||
664 | } while (read_seqretry(&fastopen_seqlock, seq)); | ||
665 | } | ||
666 | rcu_read_unlock(); | ||
667 | } | ||
668 | |||
669 | void tcp_fastopen_cache_set(struct sock *sk, u16 mss, | ||
670 | struct tcp_fastopen_cookie *cookie, bool syn_lost) | ||
671 | { | ||
672 | struct tcp_metrics_block *tm; | ||
673 | |||
674 | rcu_read_lock(); | ||
675 | tm = tcp_get_metrics(sk, __sk_dst_get(sk), true); | ||
676 | if (tm) { | ||
677 | struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; | ||
678 | |||
679 | write_seqlock_bh(&fastopen_seqlock); | ||
680 | tfom->mss = mss; | ||
681 | if (cookie->len > 0) | ||
682 | tfom->cookie = *cookie; | ||
683 | if (syn_lost) { | ||
684 | ++tfom->syn_loss; | ||
685 | tfom->last_syn_loss = jiffies; | ||
686 | } else | ||
687 | tfom->syn_loss = 0; | ||
688 | write_sequnlock_bh(&fastopen_seqlock); | ||
689 | } | ||
690 | rcu_read_unlock(); | ||
691 | } | ||
692 | |||
693 | static unsigned int tcpmhash_entries; | ||
694 | static int __init set_tcpmhash_entries(char *str) | ||
695 | { | ||
696 | ssize_t ret; | ||
697 | |||
698 | if (!str) | ||
699 | return 0; | ||
700 | |||
701 | ret = kstrtouint(str, 0, &tcpmhash_entries); | ||
702 | if (ret) | ||
703 | return 0; | ||
704 | |||
705 | return 1; | ||
706 | } | ||
707 | __setup("tcpmhash_entries=", set_tcpmhash_entries); | ||
708 | |||
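The __setup() hook above lets the hash size be overridden at boot, e.g. by adding tcpmhash_entries=16384 to the kernel command line; the handler returns 1 once it has consumed the value. A rough user-space analogue of the parsing step, using strtoul in place of kstrtouint (purely illustrative):

#include <stdio.h>
#include <stdlib.h>

static unsigned int tcpmhash_entries;

/* Mimics the __setup() handler: returns 1 when the option was consumed. */
static int set_tcpmhash_entries(const char *str)
{
	char *end;
	unsigned long val;

	if (!str)
		return 0;
	val = strtoul(str, &end, 0);
	if (end == str || *end != '\0')
		return 0;
	tcpmhash_entries = (unsigned int)val;
	return 1;
}

int main(void)
{
	set_tcpmhash_entries("16384");
	printf("%u\n", tcpmhash_entries);	/* 16384 */
	return 0;
}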
709 | static int __net_init tcp_net_metrics_init(struct net *net) | ||
710 | { | ||
711 | size_t size; | ||
712 | unsigned int slots; | ||
713 | |||
714 | slots = tcpmhash_entries; | ||
715 | if (!slots) { | ||
716 | if (totalram_pages >= 128 * 1024) | ||
717 | slots = 16 * 1024; | ||
718 | else | ||
719 | slots = 8 * 1024; | ||
720 | } | ||
721 | |||
722 | net->ipv4.tcp_metrics_hash_log = order_base_2(slots); | ||
723 | size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; | ||
724 | |||
725 | net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL); | ||
726 | if (!net->ipv4.tcp_metrics_hash) | ||
727 | return -ENOMEM; | ||
728 | |||
729 | return 0; | ||
730 | } | ||
731 | |||
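tcp_net_metrics_init() sizes the per-namespace table from available memory: 16K slots on machines with at least 128 * 1024 pages (512 MB with 4 KB pages), 8K otherwise, then stores the log2 of the slot count and allocates one bucket per slot. A small sketch of that sizing arithmetic, assuming 4 KB pages and using a portable helper in place of order_base_2():

#include <stdio.h>
#include <stddef.h>

struct bucket { void *chain; };

/* Smallest n such that (1 << n) >= x, for x >= 1 (stand-in for order_base_2). */
static unsigned int order_base_2(unsigned long x)
{
	unsigned int n = 0;

	while ((1UL << n) < x)
		n++;
	return n;
}

int main(void)
{
	unsigned long totalram_pages = 200 * 1024;	/* pretend ~800 MB of 4 KB pages */
	unsigned int slots, log;
	size_t size;

	slots = (totalram_pages >= 128 * 1024) ? 16 * 1024 : 8 * 1024;
	log = order_base_2(slots);			/* 14 */
	size = sizeof(struct bucket) << log;		/* one pointer-sized bucket per slot */

	printf("slots=%u log=%u bytes=%zu\n", slots, log, size);
	return 0;
}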
732 | static void __net_exit tcp_net_metrics_exit(struct net *net) | ||
733 | { | ||
734 | kfree(net->ipv4.tcp_metrics_hash); | ||
735 | } | ||
736 | |||
737 | static __net_initdata struct pernet_operations tcp_net_metrics_ops = { | ||
738 | .init = tcp_net_metrics_init, | ||
739 | .exit = tcp_net_metrics_exit, | ||
740 | }; | ||
741 | |||
742 | void __init tcp_metrics_init(void) | ||
743 | { | ||
744 | register_pernet_subsys(&tcp_net_metrics_ops); | ||
745 | } | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b85d9fe7d663..5912ac3fd240 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -49,56 +49,6 @@ struct inet_timewait_death_row tcp_death_row = { | |||
49 | }; | 49 | }; |
50 | EXPORT_SYMBOL_GPL(tcp_death_row); | 50 | EXPORT_SYMBOL_GPL(tcp_death_row); |
51 | 51 | ||
52 | /* VJ's idea. Save last timestamp seen from this destination | ||
53 | * and hold it at least for normal timewait interval to use for duplicate | ||
54 | * segment detection in subsequent connections, before they enter synchronized | ||
55 | * state. | ||
56 | */ | ||
57 | |||
58 | static bool tcp_remember_stamp(struct sock *sk) | ||
59 | { | ||
60 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
61 | struct tcp_sock *tp = tcp_sk(sk); | ||
62 | struct inet_peer *peer; | ||
63 | bool release_it; | ||
64 | |||
65 | peer = icsk->icsk_af_ops->get_peer(sk, &release_it); | ||
66 | if (peer) { | ||
67 | if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || | ||
68 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
69 | peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
70 | peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
71 | peer->tcp_ts = tp->rx_opt.ts_recent; | ||
72 | } | ||
73 | if (release_it) | ||
74 | inet_putpeer(peer); | ||
75 | return true; | ||
76 | } | ||
77 | |||
78 | return false; | ||
79 | } | ||
80 | |||
81 | static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) | ||
82 | { | ||
83 | struct sock *sk = (struct sock *) tw; | ||
84 | struct inet_peer *peer; | ||
85 | |||
86 | peer = twsk_getpeer(sk); | ||
87 | if (peer) { | ||
88 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); | ||
89 | |||
90 | if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || | ||
91 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
92 | peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
93 | peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
94 | peer->tcp_ts = tcptw->tw_ts_recent; | ||
95 | } | ||
96 | inet_putpeer(peer); | ||
97 | return true; | ||
98 | } | ||
99 | return false; | ||
100 | } | ||
101 | |||
102 | static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 52 | static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) |
103 | { | 53 | { |
104 | if (seq == s_win) | 54 | if (seq == s_win) |
@@ -147,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | |||
147 | 97 | ||
148 | tmp_opt.saw_tstamp = 0; | 98 | tmp_opt.saw_tstamp = 0; |
149 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { | 99 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { |
150 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0); | 100 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); |
151 | 101 | ||
152 | if (tmp_opt.saw_tstamp) { | 102 | if (tmp_opt.saw_tstamp) { |
153 | tmp_opt.ts_recent = tcptw->tw_ts_recent; | 103 | tmp_opt.ts_recent = tcptw->tw_ts_recent; |
@@ -327,8 +277,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
327 | if (tw != NULL) { | 277 | if (tw != NULL) { |
328 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 278 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
329 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); | 279 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
280 | struct inet_sock *inet = inet_sk(sk); | ||
330 | 281 | ||
331 | tw->tw_transparent = inet_sk(sk)->transparent; | 282 | tw->tw_transparent = inet->transparent; |
332 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | 283 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
333 | tcptw->tw_rcv_nxt = tp->rcv_nxt; | 284 | tcptw->tw_rcv_nxt = tp->rcv_nxt; |
334 | tcptw->tw_snd_nxt = tp->snd_nxt; | 285 | tcptw->tw_snd_nxt = tp->snd_nxt; |
@@ -403,6 +354,7 @@ void tcp_twsk_destructor(struct sock *sk) | |||
403 | { | 354 | { |
404 | #ifdef CONFIG_TCP_MD5SIG | 355 | #ifdef CONFIG_TCP_MD5SIG |
405 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); | 356 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); |
357 | |||
406 | if (twsk->tw_md5_key) { | 358 | if (twsk->tw_md5_key) { |
407 | tcp_free_md5sig_pool(); | 359 | tcp_free_md5sig_pool(); |
408 | kfree_rcu(twsk->tw_md5_key, rcu); | 360 | kfree_rcu(twsk->tw_md5_key, rcu); |
@@ -435,6 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
435 | struct tcp_sock *oldtp = tcp_sk(sk); | 387 | struct tcp_sock *oldtp = tcp_sk(sk); |
436 | struct tcp_cookie_values *oldcvp = oldtp->cookie_values; | 388 | struct tcp_cookie_values *oldcvp = oldtp->cookie_values; |
437 | 389 | ||
390 | newsk->sk_rx_dst = dst_clone(skb_dst(skb)); | ||
391 | |||
438 | /* TCP Cookie Transactions require space for the cookie pair, | 392 | /* TCP Cookie Transactions require space for the cookie pair, |
439 | * as it differs for each connection. There is no need to | 393 | * as it differs for each connection. There is no need to |
440 | * copy any s_data_payload stored at the original socket. | 394 | * copy any s_data_payload stored at the original socket. |
@@ -470,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
470 | treq->snt_isn + 1 + tcp_s_data_size(oldtp); | 424 | treq->snt_isn + 1 + tcp_s_data_size(oldtp); |
471 | 425 | ||
472 | tcp_prequeue_init(newtp); | 426 | tcp_prequeue_init(newtp); |
427 | INIT_LIST_HEAD(&newtp->tsq_node); | ||
473 | 428 | ||
474 | tcp_init_wl(newtp, treq->rcv_isn); | 429 | tcp_init_wl(newtp, treq->rcv_isn); |
475 | 430 | ||
@@ -579,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
579 | 534 | ||
580 | tmp_opt.saw_tstamp = 0; | 535 | tmp_opt.saw_tstamp = 0; |
581 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 536 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
582 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0); | 537 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); |
583 | 538 | ||
584 | if (tmp_opt.saw_tstamp) { | 539 | if (tmp_opt.saw_tstamp) { |
585 | tmp_opt.ts_recent = req->ts_recent; | 540 | tmp_opt.ts_recent = req->ts_recent; |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 803cbfe82fbc..33cd065cfbd8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; | |||
50 | */ | 50 | */ |
51 | int sysctl_tcp_workaround_signed_windows __read_mostly = 0; | 51 | int sysctl_tcp_workaround_signed_windows __read_mostly = 0; |
52 | 52 | ||
53 | /* Default TSQ limit of two TSO segments */ | ||
54 | int sysctl_tcp_limit_output_bytes __read_mostly = 131072; | ||
55 | |||
53 | /* This limits the percentage of the congestion window which we | 56 | /* This limits the percentage of the congestion window which we |
54 | * will allow a single TSO frame to consume. Building TSO frames | 57 | * will allow a single TSO frame to consume. Building TSO frames |
55 | * which are too large can cause TCP streams to be bursty. | 58 | * which are too large can cause TCP streams to be bursty. |
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | |||
65 | int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ | 68 | int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ |
66 | EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); | 69 | EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); |
67 | 70 | ||
71 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | ||
72 | int push_one, gfp_t gfp); | ||
68 | 73 | ||
69 | /* Account for new data that has been sent to the network. */ | 74 | /* Account for new data that has been sent to the network. */ |
70 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | 75 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) |
@@ -380,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) | |||
380 | #define OPTION_MD5 (1 << 2) | 385 | #define OPTION_MD5 (1 << 2) |
381 | #define OPTION_WSCALE (1 << 3) | 386 | #define OPTION_WSCALE (1 << 3) |
382 | #define OPTION_COOKIE_EXTENSION (1 << 4) | 387 | #define OPTION_COOKIE_EXTENSION (1 << 4) |
388 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) | ||
383 | 389 | ||
384 | struct tcp_out_options { | 390 | struct tcp_out_options { |
385 | u8 options; /* bit field of OPTION_* */ | 391 | u16 options; /* bit field of OPTION_* */ |
392 | u16 mss; /* 0 to disable */ | ||
386 | u8 ws; /* window scale, 0 to disable */ | 393 | u8 ws; /* window scale, 0 to disable */ |
387 | u8 num_sack_blocks; /* number of SACK blocks to include */ | 394 | u8 num_sack_blocks; /* number of SACK blocks to include */ |
388 | u8 hash_size; /* bytes in hash_location */ | 395 | u8 hash_size; /* bytes in hash_location */ |
389 | u16 mss; /* 0 to disable */ | ||
390 | __u32 tsval, tsecr; /* need to include OPTION_TS */ | ||
391 | __u8 *hash_location; /* temporary pointer, overloaded */ | 396 | __u8 *hash_location; /* temporary pointer, overloaded */ |
397 | __u32 tsval, tsecr; /* need to include OPTION_TS */ | ||
398 | struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ | ||
392 | }; | 399 | }; |
393 | 400 | ||
394 | /* The sysctl int routines are generic, so check consistency here. | 401 | /* The sysctl int routines are generic, so check consistency here. |
@@ -437,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired) | |||
437 | static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | 444 | static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
438 | struct tcp_out_options *opts) | 445 | struct tcp_out_options *opts) |
439 | { | 446 | { |
440 | u8 options = opts->options; /* mungable copy */ | 447 | u16 options = opts->options; /* mungable copy */ |
441 | 448 | ||
442 | /* Having both authentication and cookies for security is redundant, | 449 | /* Having both authentication and cookies for security is redundant, |
443 | * and there's certainly not enough room. Instead, the cookie-less | 450 | * and there's certainly not enough room. Instead, the cookie-less |
@@ -559,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | |||
559 | 566 | ||
560 | tp->rx_opt.dsack = 0; | 567 | tp->rx_opt.dsack = 0; |
561 | } | 568 | } |
569 | |||
570 | if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { | ||
571 | struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; | ||
572 | |||
573 | *ptr++ = htonl((TCPOPT_EXP << 24) | | ||
574 | ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | | ||
575 | TCPOPT_FASTOPEN_MAGIC); | ||
576 | |||
577 | memcpy(ptr, foc->val, foc->len); | ||
578 | if ((foc->len & 3) == 2) { | ||
579 | u8 *align = ((u8 *)ptr) + foc->len; | ||
580 | align[0] = align[1] = TCPOPT_NOP; | ||
581 | } | ||
582 | ptr += (foc->len + 3) >> 2; | ||
583 | } | ||
562 | } | 584 | } |
563 | 585 | ||
564 | /* Compute TCP options for SYN packets. This is not the final | 586 | /* Compute TCP options for SYN packets. This is not the final |
@@ -574,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
574 | u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? | 596 | u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? |
575 | tcp_cookie_size_check(cvp->cookie_desired) : | 597 | tcp_cookie_size_check(cvp->cookie_desired) : |
576 | 0; | 598 | 0; |
599 | struct tcp_fastopen_request *fastopen = tp->fastopen_req; | ||
577 | 600 | ||
578 | #ifdef CONFIG_TCP_MD5SIG | 601 | #ifdef CONFIG_TCP_MD5SIG |
579 | *md5 = tp->af_specific->md5_lookup(sk, sk); | 602 | *md5 = tp->af_specific->md5_lookup(sk, sk); |
@@ -614,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
614 | remaining -= TCPOLEN_SACKPERM_ALIGNED; | 637 | remaining -= TCPOLEN_SACKPERM_ALIGNED; |
615 | } | 638 | } |
616 | 639 | ||
640 | if (fastopen && fastopen->cookie.len >= 0) { | ||
641 | u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; | ||
642 | need = (need + 3) & ~3U; /* Align to 32 bits */ | ||
643 | if (remaining >= need) { | ||
644 | opts->options |= OPTION_FAST_OPEN_COOKIE; | ||
645 | opts->fastopen_cookie = &fastopen->cookie; | ||
646 | remaining -= need; | ||
647 | tp->syn_fastopen = 1; | ||
648 | } | ||
649 | } | ||
617 | /* Note that timestamps are required by the specification. | 650 | /* Note that timestamps are required by the specification. |
618 | * | 651 | * |
619 | * Odd numbers of bytes are prohibited by the specification, ensuring | 652 | * Odd numbers of bytes are prohibited by the specification, ensuring |
@@ -783,6 +816,156 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |||
783 | return size; | 816 | return size; |
784 | } | 817 | } |
785 | 818 | ||
819 | |||
820 | /* TCP SMALL QUEUES (TSQ) | ||
821 | * | ||
822 | * TSQ's goal is to keep a small number of skbs per tcp flow in tx queues (qdisc+dev) | ||
823 | * to reduce RTT and bufferbloat. | ||
824 | * We do this using a special skb destructor (tcp_wfree). | ||
825 | * | ||
826 | * It's important that tcp_wfree() can be replaced by sock_wfree() in the event the skb | ||
827 | * needs to be reallocated in a driver. | ||
828 | * The invariant is skb->truesize subtracted from sk->sk_wmem_alloc | ||
829 | * | ||
830 | * Since transmit from skb destructor is forbidden, we use a tasklet | ||
831 | * to process all sockets that eventually need to send more skbs. | ||
832 | * We use one tasklet per cpu, with its own queue of sockets. | ||
833 | */ | ||
834 | struct tsq_tasklet { | ||
835 | struct tasklet_struct tasklet; | ||
836 | struct list_head head; /* queue of tcp sockets */ | ||
837 | }; | ||
838 | static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); | ||
839 | |||
840 | static void tcp_tsq_handler(struct sock *sk) | ||
841 | { | ||
842 | if ((1 << sk->sk_state) & | ||
843 | (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | | ||
844 | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) | ||
845 | tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); | ||
846 | } | ||
847 | /* | ||
848 | * One tasklet per cpu tries to send more skbs. | ||
849 | * We run in tasklet context but need to disable irqs when | ||
850 | * transferring tsq->head because tcp_wfree() might | ||
851 | * interrupt us (non NAPI drivers) | ||
852 | */ | ||
853 | static void tcp_tasklet_func(unsigned long data) | ||
854 | { | ||
855 | struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; | ||
856 | LIST_HEAD(list); | ||
857 | unsigned long flags; | ||
858 | struct list_head *q, *n; | ||
859 | struct tcp_sock *tp; | ||
860 | struct sock *sk; | ||
861 | |||
862 | local_irq_save(flags); | ||
863 | list_splice_init(&tsq->head, &list); | ||
864 | local_irq_restore(flags); | ||
865 | |||
866 | list_for_each_safe(q, n, &list) { | ||
867 | tp = list_entry(q, struct tcp_sock, tsq_node); | ||
868 | list_del(&tp->tsq_node); | ||
869 | |||
870 | sk = (struct sock *)tp; | ||
871 | bh_lock_sock(sk); | ||
872 | |||
873 | if (!sock_owned_by_user(sk)) { | ||
874 | tcp_tsq_handler(sk); | ||
875 | } else { | ||
876 | /* defer the work to tcp_release_cb() */ | ||
877 | set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); | ||
878 | } | ||
879 | bh_unlock_sock(sk); | ||
880 | |||
881 | clear_bit(TSQ_QUEUED, &tp->tsq_flags); | ||
882 | sk_free(sk); | ||
883 | } | ||
884 | } | ||
885 | |||
886 | #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ | ||
887 | (1UL << TCP_WRITE_TIMER_DEFERRED) | \ | ||
888 | (1UL << TCP_DELACK_TIMER_DEFERRED) | \ | ||
889 | (1UL << TCP_MTU_REDUCED_DEFERRED)) | ||
890 | /** | ||
891 | * tcp_release_cb - tcp release_sock() callback | ||
892 | * @sk: socket | ||
893 | * | ||
894 | * called from release_sock() to perform protocol dependent | ||
895 | * actions before socket release. | ||
896 | */ | ||
897 | void tcp_release_cb(struct sock *sk) | ||
898 | { | ||
899 | struct tcp_sock *tp = tcp_sk(sk); | ||
900 | unsigned long flags, nflags; | ||
901 | |||
902 | /* perform an atomic operation only if at least one flag is set */ | ||
903 | do { | ||
904 | flags = tp->tsq_flags; | ||
905 | if (!(flags & TCP_DEFERRED_ALL)) | ||
906 | return; | ||
907 | nflags = flags & ~TCP_DEFERRED_ALL; | ||
908 | } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); | ||
909 | |||
910 | if (flags & (1UL << TCP_TSQ_DEFERRED)) | ||
911 | tcp_tsq_handler(sk); | ||
912 | |||
913 | if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) | ||
914 | tcp_write_timer_handler(sk); | ||
915 | |||
916 | if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) | ||
917 | tcp_delack_timer_handler(sk); | ||
918 | |||
919 | if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) | ||
920 | sk->sk_prot->mtu_reduced(sk); | ||
921 | } | ||
922 | EXPORT_SYMBOL(tcp_release_cb); | ||
923 | |||
924 | void __init tcp_tasklet_init(void) | ||
925 | { | ||
926 | int i; | ||
927 | |||
928 | for_each_possible_cpu(i) { | ||
929 | struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); | ||
930 | |||
931 | INIT_LIST_HEAD(&tsq->head); | ||
932 | tasklet_init(&tsq->tasklet, | ||
933 | tcp_tasklet_func, | ||
934 | (unsigned long)tsq); | ||
935 | } | ||
936 | } | ||
937 | |||
938 | /* | ||
939 | * Write buffer destructor automatically called from kfree_skb. | ||
940 | * We can't xmit new skbs from this context, as we might already | ||
941 | * hold qdisc lock. | ||
942 | */ | ||
943 | void tcp_wfree(struct sk_buff *skb) | ||
944 | { | ||
945 | struct sock *sk = skb->sk; | ||
946 | struct tcp_sock *tp = tcp_sk(sk); | ||
947 | |||
948 | if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && | ||
949 | !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { | ||
950 | unsigned long flags; | ||
951 | struct tsq_tasklet *tsq; | ||
952 | |||
953 | /* Keep a ref on socket. | ||
954 | * This last ref will be released in tcp_tasklet_func() | ||
955 | */ | ||
956 | atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); | ||
957 | |||
958 | /* queue this socket to tasklet queue */ | ||
959 | local_irq_save(flags); | ||
960 | tsq = &__get_cpu_var(tsq_tasklet); | ||
961 | list_add(&tp->tsq_node, &tsq->head); | ||
962 | tasklet_schedule(&tsq->tasklet); | ||
963 | local_irq_restore(flags); | ||
964 | } else { | ||
965 | sock_wfree(skb); | ||
966 | } | ||
967 | } | ||
968 | |||
786 | /* This routine actually transmits TCP packets queued in by | 969 | /* This routine actually transmits TCP packets queued in by |
787 | * tcp_do_sendmsg(). This is used by both the initial | 970 | * tcp_do_sendmsg(). This is used by both the initial |
788 | * transmission and possible later retransmissions. | 971 | * transmission and possible later retransmissions. |
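tcp_release_cb() above clears all deferred-work bits in one atomic step, so that work queued by the timer and tasklet paths while the socket was owned by the user is neither lost nor run twice. The cmpxchg loop can be mirrored in user space with C11 atomics; the flag names below are illustrative only:

#include <stdatomic.h>
#include <stdio.h>

#define WORK_TSQ	(1UL << 0)
#define WORK_WRITE	(1UL << 1)
#define WORK_DELACK	(1UL << 2)
#define WORK_ALL	(WORK_TSQ | WORK_WRITE | WORK_DELACK)

static _Atomic unsigned long deferred_flags;

/* Atomically grab and clear every pending flag, then run the work once. */
static void release_cb(void)
{
	unsigned long flags, nflags;

	do {
		flags = atomic_load(&deferred_flags);
		if (!(flags & WORK_ALL))
			return;			/* nothing pending */
		nflags = flags & ~WORK_ALL;
	} while (!atomic_compare_exchange_weak(&deferred_flags, &flags, nflags));

	if (flags & WORK_TSQ)
		puts("run deferred transmit");
	if (flags & WORK_WRITE)
		puts("run write timer handler");
	if (flags & WORK_DELACK)
		puts("run delayed-ack handler");
}

int main(void)
{
	atomic_fetch_or(&deferred_flags, WORK_TSQ | WORK_DELACK);
	release_cb();
	return 0;
}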
@@ -844,7 +1027,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
844 | 1027 | ||
845 | skb_push(skb, tcp_header_size); | 1028 | skb_push(skb, tcp_header_size); |
846 | skb_reset_transport_header(skb); | 1029 | skb_reset_transport_header(skb); |
847 | skb_set_owner_w(skb, sk); | 1030 | |
1031 | skb_orphan(skb); | ||
1032 | skb->sk = sk; | ||
1033 | skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? | ||
1034 | tcp_wfree : sock_wfree; | ||
1035 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); | ||
848 | 1036 | ||
849 | /* Build TCP header and checksum it. */ | 1037 | /* Build TCP header and checksum it. */ |
850 | th = tcp_hdr(skb); | 1038 | th = tcp_hdr(skb); |
@@ -1780,6 +1968,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1780 | while ((skb = tcp_send_head(sk))) { | 1968 | while ((skb = tcp_send_head(sk))) { |
1781 | unsigned int limit; | 1969 | unsigned int limit; |
1782 | 1970 | ||
1971 | |||
1783 | tso_segs = tcp_init_tso_segs(sk, skb, mss_now); | 1972 | tso_segs = tcp_init_tso_segs(sk, skb, mss_now); |
1784 | BUG_ON(!tso_segs); | 1973 | BUG_ON(!tso_segs); |
1785 | 1974 | ||
@@ -1800,6 +1989,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1800 | break; | 1989 | break; |
1801 | } | 1990 | } |
1802 | 1991 | ||
1992 | /* TSQ : sk_wmem_alloc accounts skb truesize, | ||
1993 | * including skb overhead. But that's OK. | ||
1994 | */ | ||
1995 | if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { | ||
1996 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); | ||
1997 | break; | ||
1998 | } | ||
1803 | limit = mss_now; | 1999 | limit = mss_now; |
1804 | if (tso_segs > 1 && !tcp_urg_mode(tp)) | 2000 | if (tso_segs > 1 && !tcp_urg_mode(tp)) |
1805 | limit = tcp_mss_split_point(sk, skb, mss_now, | 2001 | limit = tcp_mss_split_point(sk, skb, mss_now, |
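The new check in tcp_write_xmit() stops queueing once the bytes charged to the socket's write allocation reach sysctl_tcp_limit_output_bytes (128 KB by default), and marks the flow throttled so tcp_wfree() will reschedule it as soon as enough bytes leave the qdisc/device queues. A sketch of the throttle decision in isolation (field names are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define TCP_LIMIT_OUTPUT_BYTES	131072	/* default: roughly two 64 KB TSO segments */

struct flow {
	unsigned long wmem_alloc;	/* bytes (truesize) still owned by lower queues */
	bool throttled;
};

/* Returns true when another skb may be handed to the qdisc. */
static bool can_transmit(struct flow *f)
{
	if (f->wmem_alloc >= TCP_LIMIT_OUTPUT_BYTES) {
		f->throttled = true;	/* a tcp_wfree()-like completion resumes us later */
		return false;
	}
	return true;
}

int main(void)
{
	struct flow f = { .wmem_alloc = 200000, .throttled = false };

	printf("may send: %d, throttled: %d\n", can_transmit(&f), f.throttled);
	return 0;
}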
@@ -2442,7 +2638,16 @@ int tcp_send_synack(struct sock *sk) | |||
2442 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2638 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2443 | } | 2639 | } |
2444 | 2640 | ||
2445 | /* Prepare a SYN-ACK. */ | 2641 | /** |
2642 | * tcp_make_synack - Prepare a SYN-ACK. | ||
2643 | * sk: listener socket | ||
2644 | * dst: dst entry attached to the SYNACK | ||
2645 | * req: request_sock pointer | ||
2646 | * rvp: request_values pointer | ||
2647 | * | ||
2648 | * Allocate one skb and build a SYNACK packet. | ||
2649 | * @dst is consumed : Caller should not use it again. | ||
2650 | */ | ||
2446 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | 2651 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
2447 | struct request_sock *req, | 2652 | struct request_sock *req, |
2448 | struct request_values *rvp) | 2653 | struct request_values *rvp) |
@@ -2461,14 +2666,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2461 | 2666 | ||
2462 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) | 2667 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) |
2463 | s_data_desired = cvp->s_data_desired; | 2668 | s_data_desired = cvp->s_data_desired; |
2464 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); | 2669 | skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); |
2465 | if (skb == NULL) | 2670 | if (unlikely(!skb)) { |
2671 | dst_release(dst); | ||
2466 | return NULL; | 2672 | return NULL; |
2467 | 2673 | } | |
2468 | /* Reserve space for headers. */ | 2674 | /* Reserve space for headers. */ |
2469 | skb_reserve(skb, MAX_TCP_HEADER); | 2675 | skb_reserve(skb, MAX_TCP_HEADER); |
2470 | 2676 | ||
2471 | skb_dst_set(skb, dst_clone(dst)); | 2677 | skb_dst_set(skb, dst); |
2472 | 2678 | ||
2473 | mss = dst_metric_advmss(dst); | 2679 | mss = dst_metric_advmss(dst); |
2474 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) | 2680 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) |
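With this change tcp_make_synack() takes ownership of the dst it is handed ("@dst is consumed"): on success the reference is attached to the skb via skb_dst_set(), and on allocation failure the function itself drops it, so the caller never releases it in either case. A generic sketch of that consume-on-call ownership pattern (types and helpers are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

struct route { int refcnt; };
struct packet { struct route *rt; };

static void route_put(struct route *rt) { rt->refcnt--; }

/* Consumes the caller's reference on rt in every outcome. */
static struct packet *make_reply(struct route *rt)
{
	struct packet *p = malloc(sizeof(*p));

	if (!p) {
		route_put(rt);		/* error path: drop the ref ourselves */
		return NULL;
	}
	p->rt = rt;			/* success: ownership moves into the packet */
	return p;
}

int main(void)
{
	struct route rt = { .refcnt = 1 };
	struct packet *p = make_reply(&rt);

	/* The caller must NOT drop the ref itself; it belongs to p (or was already dropped). */
	if (p) {
		route_put(p->rt);	/* releasing the packet releases its ref */
		free(p);
	}
	printf("refcnt=%d\n", rt.refcnt);
	return 0;
}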
@@ -2645,6 +2851,109 @@ void tcp_connect_init(struct sock *sk) | |||
2645 | tcp_clear_retrans(tp); | 2851 | tcp_clear_retrans(tp); |
2646 | } | 2852 | } |
2647 | 2853 | ||
2854 | static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) | ||
2855 | { | ||
2856 | struct tcp_sock *tp = tcp_sk(sk); | ||
2857 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | ||
2858 | |||
2859 | tcb->end_seq += skb->len; | ||
2860 | skb_header_release(skb); | ||
2861 | __tcp_add_write_queue_tail(sk, skb); | ||
2862 | sk->sk_wmem_queued += skb->truesize; | ||
2863 | sk_mem_charge(sk, skb->truesize); | ||
2864 | tp->write_seq = tcb->end_seq; | ||
2865 | tp->packets_out += tcp_skb_pcount(skb); | ||
2866 | } | ||
2867 | |||
2868 | /* Build and send a SYN with data and (cached) Fast Open cookie. However, | ||
2869 | * queue a data-only packet after the regular SYN, such that regular SYNs | ||
2870 | * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges | ||
2871 | * only the SYN sequence, the data are retransmitted in the first ACK. | ||
2872 | * If cookie is not cached or other error occurs, falls back to send a | ||
2873 | * regular SYN with Fast Open cookie request option. | ||
2874 | */ | ||
2875 | static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | ||
2876 | { | ||
2877 | struct tcp_sock *tp = tcp_sk(sk); | ||
2878 | struct tcp_fastopen_request *fo = tp->fastopen_req; | ||
2879 | int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; | ||
2880 | struct sk_buff *syn_data = NULL, *data; | ||
2881 | unsigned long last_syn_loss = 0; | ||
2882 | |||
2883 | tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ | ||
2884 | tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, | ||
2885 | &syn_loss, &last_syn_loss); | ||
2886 | /* Recurring FO SYN losses: revert to regular handshake temporarily */ | ||
2887 | if (syn_loss > 1 && | ||
2888 | time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { | ||
2889 | fo->cookie.len = -1; | ||
2890 | goto fallback; | ||
2891 | } | ||
2892 | |||
2893 | if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) | ||
2894 | fo->cookie.len = -1; | ||
2895 | else if (fo->cookie.len <= 0) | ||
2896 | goto fallback; | ||
2897 | |||
2898 | /* MSS for SYN-data is based on cached MSS and bounded by PMTU and | ||
2899 | * user-MSS. Reserve maximum option space for middleboxes that add | ||
2900 | * private TCP options. The cost is reduced data space in SYN :( | ||
2901 | */ | ||
2902 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) | ||
2903 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; | ||
2904 | space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - | ||
2905 | MAX_TCP_OPTION_SPACE; | ||
2906 | |||
2907 | syn_data = skb_copy_expand(syn, skb_headroom(syn), space, | ||
2908 | sk->sk_allocation); | ||
2909 | if (syn_data == NULL) | ||
2910 | goto fallback; | ||
2911 | |||
2912 | for (i = 0; i < iovlen && syn_data->len < space; ++i) { | ||
2913 | struct iovec *iov = &fo->data->msg_iov[i]; | ||
2914 | unsigned char __user *from = iov->iov_base; | ||
2915 | int len = iov->iov_len; | ||
2916 | |||
2917 | if (syn_data->len + len > space) | ||
2918 | len = space - syn_data->len; | ||
2919 | else if (i + 1 == iovlen) | ||
2920 | /* No more data pending in inet_wait_for_connect() */ | ||
2921 | fo->data = NULL; | ||
2922 | |||
2923 | if (skb_add_data(syn_data, from, len)) | ||
2924 | goto fallback; | ||
2925 | } | ||
2926 | |||
2927 | /* Queue a data-only packet after the regular SYN for retransmission */ | ||
2928 | data = pskb_copy(syn_data, sk->sk_allocation); | ||
2929 | if (data == NULL) | ||
2930 | goto fallback; | ||
2931 | TCP_SKB_CB(data)->seq++; | ||
2932 | TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; | ||
2933 | TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); | ||
2934 | tcp_connect_queue_skb(sk, data); | ||
2935 | fo->copied = data->len; | ||
2936 | |||
2937 | if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { | ||
2938 | tp->syn_data = (fo->copied > 0); | ||
2939 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); | ||
2940 | goto done; | ||
2941 | } | ||
2942 | syn_data = NULL; | ||
2943 | |||
2944 | fallback: | ||
2945 | /* Send a regular SYN with Fast Open cookie request option */ | ||
2946 | if (fo->cookie.len > 0) | ||
2947 | fo->cookie.len = 0; | ||
2948 | err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); | ||
2949 | if (err) | ||
2950 | tp->syn_fastopen = 0; | ||
2951 | kfree_skb(syn_data); | ||
2952 | done: | ||
2953 | fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ | ||
2954 | return err; | ||
2955 | } | ||
2956 | |||
2648 | /* Build a SYN and send it off. */ | 2957 | /* Build a SYN and send it off. */ |
2649 | int tcp_connect(struct sock *sk) | 2958 | int tcp_connect(struct sock *sk) |
2650 | { | 2959 | { |
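tcp_send_syn_data() above temporarily gives up on Fast Open after repeated SYN-data losses: with syn_loss previous losses it waits 60 seconds shifted left by the loss count before sending data in the SYN again. A hedged sketch of that decision, using plain seconds instead of jiffies/HZ:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Fall back to a plain SYN while inside the exponentially growing
 * cool-down window that follows repeated SYN-data losses. */
static bool fastopen_cooling_down(int syn_loss, time_t last_loss, time_t now)
{
	if (syn_loss <= 1)
		return false;
	return now < last_loss + ((time_t)60 << syn_loss);
}

int main(void)
{
	time_t now = time(NULL);

	/* two losses: 240 s window; three losses: 480 s window; ... */
	printf("%d\n", fastopen_cooling_down(2, now - 100, now));	/* 1: still cooling down */
	printf("%d\n", fastopen_cooling_down(2, now - 300, now));	/* 0: window expired */
	return 0;
}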
@@ -2662,17 +2971,13 @@ int tcp_connect(struct sock *sk) | |||
2662 | skb_reserve(buff, MAX_TCP_HEADER); | 2971 | skb_reserve(buff, MAX_TCP_HEADER); |
2663 | 2972 | ||
2664 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); | 2973 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); |
2974 | tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; | ||
2975 | tcp_connect_queue_skb(sk, buff); | ||
2665 | TCP_ECN_send_syn(sk, buff); | 2976 | TCP_ECN_send_syn(sk, buff); |
2666 | 2977 | ||
2667 | /* Send it off. */ | 2978 | /* Send off SYN; include data in Fast Open. */ |
2668 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 2979 | err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : |
2669 | tp->retrans_stamp = TCP_SKB_CB(buff)->when; | 2980 | tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); |
2670 | skb_header_release(buff); | ||
2671 | __tcp_add_write_queue_tail(sk, buff); | ||
2672 | sk->sk_wmem_queued += buff->truesize; | ||
2673 | sk_mem_charge(sk, buff->truesize); | ||
2674 | tp->packets_out += tcp_skb_pcount(buff); | ||
2675 | err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); | ||
2676 | if (err == -ECONNREFUSED) | 2981 | if (err == -ECONNREFUSED) |
2677 | return err; | 2982 | return err; |
2678 | 2983 | ||
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e911e6c523ec..6df36ad55a38 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; | |||
32 | int sysctl_tcp_orphan_retries __read_mostly; | 32 | int sysctl_tcp_orphan_retries __read_mostly; |
33 | int sysctl_tcp_thin_linear_timeouts __read_mostly; | 33 | int sysctl_tcp_thin_linear_timeouts __read_mostly; |
34 | 34 | ||
35 | static void tcp_write_timer(unsigned long); | ||
36 | static void tcp_delack_timer(unsigned long); | ||
37 | static void tcp_keepalive_timer (unsigned long data); | ||
38 | |||
39 | void tcp_init_xmit_timers(struct sock *sk) | ||
40 | { | ||
41 | inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, | ||
42 | &tcp_keepalive_timer); | ||
43 | } | ||
44 | EXPORT_SYMBOL(tcp_init_xmit_timers); | ||
45 | |||
46 | static void tcp_write_err(struct sock *sk) | 35 | static void tcp_write_err(struct sock *sk) |
47 | { | 36 | { |
48 | sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; | 37 | sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; |
@@ -205,21 +194,11 @@ static int tcp_write_timeout(struct sock *sk) | |||
205 | return 0; | 194 | return 0; |
206 | } | 195 | } |
207 | 196 | ||
208 | static void tcp_delack_timer(unsigned long data) | 197 | void tcp_delack_timer_handler(struct sock *sk) |
209 | { | 198 | { |
210 | struct sock *sk = (struct sock *)data; | ||
211 | struct tcp_sock *tp = tcp_sk(sk); | 199 | struct tcp_sock *tp = tcp_sk(sk); |
212 | struct inet_connection_sock *icsk = inet_csk(sk); | 200 | struct inet_connection_sock *icsk = inet_csk(sk); |
213 | 201 | ||
214 | bh_lock_sock(sk); | ||
215 | if (sock_owned_by_user(sk)) { | ||
216 | /* Try again later. */ | ||
217 | icsk->icsk_ack.blocked = 1; | ||
218 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); | ||
219 | sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); | ||
220 | goto out_unlock; | ||
221 | } | ||
222 | |||
223 | sk_mem_reclaim_partial(sk); | 202 | sk_mem_reclaim_partial(sk); |
224 | 203 | ||
225 | if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) | 204 | if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) |
@@ -260,7 +239,21 @@ static void tcp_delack_timer(unsigned long data) | |||
260 | out: | 239 | out: |
261 | if (sk_under_memory_pressure(sk)) | 240 | if (sk_under_memory_pressure(sk)) |
262 | sk_mem_reclaim(sk); | 241 | sk_mem_reclaim(sk); |
263 | out_unlock: | 242 | } |
243 | |||
244 | static void tcp_delack_timer(unsigned long data) | ||
245 | { | ||
246 | struct sock *sk = (struct sock *)data; | ||
247 | |||
248 | bh_lock_sock(sk); | ||
249 | if (!sock_owned_by_user(sk)) { | ||
250 | tcp_delack_timer_handler(sk); | ||
251 | } else { | ||
252 | inet_csk(sk)->icsk_ack.blocked = 1; | ||
253 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); | ||
254 | /* delegate our work to tcp_release_cb() */ | ||
255 | set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); | ||
256 | } | ||
264 | bh_unlock_sock(sk); | 257 | bh_unlock_sock(sk); |
265 | sock_put(sk); | 258 | sock_put(sk); |
266 | } | 259 | } |
@@ -450,19 +443,11 @@ out_reset_timer: | |||
450 | out:; | 443 | out:; |
451 | } | 444 | } |
452 | 445 | ||
453 | static void tcp_write_timer(unsigned long data) | 446 | void tcp_write_timer_handler(struct sock *sk) |
454 | { | 447 | { |
455 | struct sock *sk = (struct sock *)data; | ||
456 | struct inet_connection_sock *icsk = inet_csk(sk); | 448 | struct inet_connection_sock *icsk = inet_csk(sk); |
457 | int event; | 449 | int event; |
458 | 450 | ||
459 | bh_lock_sock(sk); | ||
460 | if (sock_owned_by_user(sk)) { | ||
461 | /* Try again later */ | ||
462 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); | ||
463 | goto out_unlock; | ||
464 | } | ||
465 | |||
466 | if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) | 451 | if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) |
467 | goto out; | 452 | goto out; |
468 | 453 | ||
@@ -485,7 +470,19 @@ static void tcp_write_timer(unsigned long data) | |||
485 | 470 | ||
486 | out: | 471 | out: |
487 | sk_mem_reclaim(sk); | 472 | sk_mem_reclaim(sk); |
488 | out_unlock: | 473 | } |
474 | |||
475 | static void tcp_write_timer(unsigned long data) | ||
476 | { | ||
477 | struct sock *sk = (struct sock *)data; | ||
478 | |||
479 | bh_lock_sock(sk); | ||
480 | if (!sock_owned_by_user(sk)) { | ||
481 | tcp_write_timer_handler(sk); | ||
482 | } else { | ||
483 | /* delegate our work to tcp_release_cb() */ | ||
484 | set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); | ||
485 | } | ||
489 | bh_unlock_sock(sk); | 486 | bh_unlock_sock(sk); |
490 | sock_put(sk); | 487 | sock_put(sk); |
491 | } | 488 | } |
@@ -602,3 +599,10 @@ out: | |||
602 | bh_unlock_sock(sk); | 599 | bh_unlock_sock(sk); |
603 | sock_put(sk); | 600 | sock_put(sk); |
604 | } | 601 | } |
602 | |||
603 | void tcp_init_xmit_timers(struct sock *sk) | ||
604 | { | ||
605 | inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, | ||
606 | &tcp_keepalive_timer); | ||
607 | } | ||
608 | EXPORT_SYMBOL(tcp_init_xmit_timers); | ||
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eaca73644e79..b4c3582a991f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -108,6 +108,7 @@ | |||
108 | #include <net/xfrm.h> | 108 | #include <net/xfrm.h> |
109 | #include <trace/events/udp.h> | 109 | #include <trace/events/udp.h> |
110 | #include <linux/static_key.h> | 110 | #include <linux/static_key.h> |
111 | #include <trace/events/skb.h> | ||
111 | #include "udp_impl.h" | 112 | #include "udp_impl.h" |
112 | 113 | ||
113 | struct udp_table udp_table __read_mostly; | 114 | struct udp_table udp_table __read_mostly; |
@@ -615,6 +616,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) | |||
615 | break; | 616 | break; |
616 | case ICMP_DEST_UNREACH: | 617 | case ICMP_DEST_UNREACH: |
617 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ | 618 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ |
619 | ipv4_sk_update_pmtu(skb, sk, info); | ||
618 | if (inet->pmtudisc != IP_PMTUDISC_DONT) { | 620 | if (inet->pmtudisc != IP_PMTUDISC_DONT) { |
619 | err = EMSGSIZE; | 621 | err = EMSGSIZE; |
620 | harderr = 1; | 622 | harderr = 1; |
@@ -628,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) | |||
628 | err = icmp_err_convert[code].errno; | 630 | err = icmp_err_convert[code].errno; |
629 | } | 631 | } |
630 | break; | 632 | break; |
633 | case ICMP_REDIRECT: | ||
634 | ipv4_sk_redirect(skb, sk); | ||
635 | break; | ||
631 | } | 636 | } |
632 | 637 | ||
633 | /* | 638 | /* |
@@ -1219,8 +1224,10 @@ try_again: | |||
1219 | goto csum_copy_err; | 1224 | goto csum_copy_err; |
1220 | } | 1225 | } |
1221 | 1226 | ||
1222 | if (err) | 1227 | if (unlikely(err)) { |
1228 | trace_kfree_skb(skb, udp_recvmsg); | ||
1223 | goto out_free; | 1229 | goto out_free; |
1230 | } | ||
1224 | 1231 | ||
1225 | if (!peeked) | 1232 | if (!peeked) |
1226 | UDP_INC_STATS_USER(sock_net(sk), | 1233 | UDP_INC_STATS_USER(sock_net(sk), |
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index a7f86a3cd502..16d0960062be 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c | |||
@@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, | |||
34 | int err = -EINVAL; | 34 | int err = -EINVAL; |
35 | struct sock *sk; | 35 | struct sock *sk; |
36 | struct sk_buff *rep; | 36 | struct sk_buff *rep; |
37 | struct net *net = sock_net(in_skb->sk); | ||
37 | 38 | ||
38 | if (req->sdiag_family == AF_INET) | 39 | if (req->sdiag_family == AF_INET) |
39 | sk = __udp4_lib_lookup(&init_net, | 40 | sk = __udp4_lib_lookup(net, |
40 | req->id.idiag_src[0], req->id.idiag_sport, | 41 | req->id.idiag_src[0], req->id.idiag_sport, |
41 | req->id.idiag_dst[0], req->id.idiag_dport, | 42 | req->id.idiag_dst[0], req->id.idiag_dport, |
42 | req->id.idiag_if, tbl); | 43 | req->id.idiag_if, tbl); |
43 | #if IS_ENABLED(CONFIG_IPV6) | 44 | #if IS_ENABLED(CONFIG_IPV6) |
44 | else if (req->sdiag_family == AF_INET6) | 45 | else if (req->sdiag_family == AF_INET6) |
45 | sk = __udp6_lib_lookup(&init_net, | 46 | sk = __udp6_lib_lookup(net, |
46 | (struct in6_addr *)req->id.idiag_src, | 47 | (struct in6_addr *)req->id.idiag_src, |
47 | req->id.idiag_sport, | 48 | req->id.idiag_sport, |
48 | (struct in6_addr *)req->id.idiag_dst, | 49 | (struct in6_addr *)req->id.idiag_dst, |
@@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, | |||
75 | kfree_skb(rep); | 76 | kfree_skb(rep); |
76 | goto out; | 77 | goto out; |
77 | } | 78 | } |
78 | err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, | 79 | err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, |
79 | MSG_DONTWAIT); | 80 | MSG_DONTWAIT); |
80 | if (err > 0) | 81 | if (err > 0) |
81 | err = 0; | 82 | err = 0; |
@@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin | |||
90 | struct inet_diag_req_v2 *r, struct nlattr *bc) | 91 | struct inet_diag_req_v2 *r, struct nlattr *bc) |
91 | { | 92 | { |
92 | int num, s_num, slot, s_slot; | 93 | int num, s_num, slot, s_slot; |
94 | struct net *net = sock_net(skb->sk); | ||
93 | 95 | ||
94 | s_slot = cb->args[0]; | 96 | s_slot = cb->args[0]; |
95 | num = s_num = cb->args[1]; | 97 | num = s_num = cb->args[1]; |
@@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin | |||
106 | sk_nulls_for_each(sk, node, &hslot->head) { | 108 | sk_nulls_for_each(sk, node, &hslot->head) { |
107 | struct inet_sock *inet = inet_sk(sk); | 109 | struct inet_sock *inet = inet_sk(sk); |
108 | 110 | ||
111 | if (!net_eq(sock_net(sk), net)) | ||
112 | continue; | ||
109 | if (num < s_num) | 113 | if (num < s_num) |
110 | goto next; | 114 | goto next; |
111 | if (!(r->idiag_states & (1 << sk->sk_state))) | 115 | if (!(r->idiag_states & (1 << sk->sk_state))) |
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216dc..58d23a572509 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c | |||
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) | |||
27 | if (skb_dst(skb) == NULL) { | 27 | if (skb_dst(skb) == NULL) { |
28 | const struct iphdr *iph = ip_hdr(skb); | 28 | const struct iphdr *iph = ip_hdr(skb); |
29 | 29 | ||
30 | if (ip_route_input_noref(skb, iph->daddr, iph->saddr, | 30 | if (ip_route_input(skb, iph->daddr, iph->saddr, |
31 | iph->tos, skb->dev)) | 31 | iph->tos, skb->dev)) |
32 | goto drop; | 32 | goto drop; |
33 | } | 33 | } |
34 | return dst_input(skb); | 34 | return dst_input(skb); |
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index ed4bf11ef9f4..ddee0a099a2c 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c | |||
@@ -15,6 +15,65 @@ | |||
15 | #include <net/ip.h> | 15 | #include <net/ip.h> |
16 | #include <net/xfrm.h> | 16 | #include <net/xfrm.h> |
17 | 17 | ||
18 | /* Informational hook. The decap is still done here. */ | ||
19 | static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly; | ||
20 | static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); | ||
21 | |||
22 | int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler) | ||
23 | { | ||
24 | struct xfrm_tunnel __rcu **pprev; | ||
25 | struct xfrm_tunnel *t; | ||
26 | int ret = -EEXIST; | ||
27 | int priority = handler->priority; | ||
28 | |||
29 | mutex_lock(&xfrm4_mode_tunnel_input_mutex); | ||
30 | |||
31 | for (pprev = &rcv_notify_handlers; | ||
32 | (t = rcu_dereference_protected(*pprev, | ||
33 | lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; | ||
34 | pprev = &t->next) { | ||
35 | if (t->priority > priority) | ||
36 | break; | ||
37 | if (t->priority == priority) | ||
38 | goto err; | ||
39 | |||
40 | } | ||
41 | |||
42 | handler->next = *pprev; | ||
43 | rcu_assign_pointer(*pprev, handler); | ||
44 | |||
45 | ret = 0; | ||
46 | |||
47 | err: | ||
48 | mutex_unlock(&xfrm4_mode_tunnel_input_mutex); | ||
49 | return ret; | ||
50 | } | ||
51 | EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); | ||
52 | |||
53 | int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler) | ||
54 | { | ||
55 | struct xfrm_tunnel __rcu **pprev; | ||
56 | struct xfrm_tunnel *t; | ||
57 | int ret = -ENOENT; | ||
58 | |||
59 | mutex_lock(&xfrm4_mode_tunnel_input_mutex); | ||
60 | for (pprev = &rcv_notify_handlers; | ||
61 | (t = rcu_dereference_protected(*pprev, | ||
62 | lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; | ||
63 | pprev = &t->next) { | ||
64 | if (t == handler) { | ||
65 | *pprev = handler->next; | ||
66 | ret = 0; | ||
67 | break; | ||
68 | } | ||
69 | } | ||
70 | mutex_unlock(&xfrm4_mode_tunnel_input_mutex); | ||
71 | synchronize_net(); | ||
72 | |||
73 | return ret; | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); | ||
76 | |||
18 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) | 77 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) |
19 | { | 78 | { |
20 | struct iphdr *inner_iph = ipip_hdr(skb); | 79 | struct iphdr *inner_iph = ipip_hdr(skb); |
@@ -64,8 +123,14 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) | |||
64 | return 0; | 123 | return 0; |
65 | } | 124 | } |
66 | 125 | ||
126 | #define for_each_input_rcu(head, handler) \ | ||
127 | for (handler = rcu_dereference(head); \ | ||
128 | handler != NULL; \ | ||
129 | handler = rcu_dereference(handler->next)) | ||
130 | |||
67 | static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | 131 | static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) |
68 | { | 132 | { |
133 | struct xfrm_tunnel *handler; | ||
69 | int err = -EINVAL; | 134 | int err = -EINVAL; |
70 | 135 | ||
71 | if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) | 136 | if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) |
@@ -74,6 +139,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | |||
74 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | 139 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) |
75 | goto out; | 140 | goto out; |
76 | 141 | ||
142 | for_each_input_rcu(rcv_notify_handlers, handler) | ||
143 | handler->handler(skb); | ||
144 | |||
77 | if (skb_cloned(skb) && | 145 | if (skb_cloned(skb) && |
78 | (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) | 146 | (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) |
79 | goto out; | 147 | goto out; |
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 0d3426cb5c4f..c6281847f16a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
@@ -79,30 +79,19 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, | |||
79 | struct rtable *rt = (struct rtable *)xdst->route; | 79 | struct rtable *rt = (struct rtable *)xdst->route; |
80 | const struct flowi4 *fl4 = &fl->u.ip4; | 80 | const struct flowi4 *fl4 = &fl->u.ip4; |
81 | 81 | ||
82 | xdst->u.rt.rt_key_dst = fl4->daddr; | ||
83 | xdst->u.rt.rt_key_src = fl4->saddr; | ||
84 | xdst->u.rt.rt_key_tos = fl4->flowi4_tos; | ||
85 | xdst->u.rt.rt_route_iif = fl4->flowi4_iif; | ||
86 | xdst->u.rt.rt_iif = fl4->flowi4_iif; | 82 | xdst->u.rt.rt_iif = fl4->flowi4_iif; |
87 | xdst->u.rt.rt_oif = fl4->flowi4_oif; | ||
88 | xdst->u.rt.rt_mark = fl4->flowi4_mark; | ||
89 | 83 | ||
90 | xdst->u.dst.dev = dev; | 84 | xdst->u.dst.dev = dev; |
91 | dev_hold(dev); | 85 | dev_hold(dev); |
92 | 86 | ||
93 | xdst->u.rt.peer = rt->peer; | ||
94 | if (rt->peer) | ||
95 | atomic_inc(&rt->peer->refcnt); | ||
96 | |||
97 | /* Sheit... I remember I did this right. Apparently, | 87 | /* Sheit... I remember I did this right. Apparently, |
98 | * it was magically lost, so this code needs audit */ | 88 | * it was magically lost, so this code needs audit */ |
89 | xdst->u.rt.rt_is_input = rt->rt_is_input; | ||
99 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | | 90 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | |
100 | RTCF_LOCAL); | 91 | RTCF_LOCAL); |
101 | xdst->u.rt.rt_type = rt->rt_type; | 92 | xdst->u.rt.rt_type = rt->rt_type; |
102 | xdst->u.rt.rt_src = rt->rt_src; | ||
103 | xdst->u.rt.rt_dst = rt->rt_dst; | ||
104 | xdst->u.rt.rt_gateway = rt->rt_gateway; | 93 | xdst->u.rt.rt_gateway = rt->rt_gateway; |
105 | xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; | 94 | xdst->u.rt.rt_pmtu = rt->rt_pmtu; |
106 | 95 | ||
107 | return 0; | 96 | return 0; |
108 | } | 97 | } |
@@ -198,12 +187,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) | |||
198 | return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); | 187 | return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); |
199 | } | 188 | } |
200 | 189 | ||
201 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) | 190 | static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, |
191 | struct sk_buff *skb, u32 mtu) | ||
192 | { | ||
193 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | ||
194 | struct dst_entry *path = xdst->route; | ||
195 | |||
196 | path->ops->update_pmtu(path, sk, skb, mtu); | ||
197 | } | ||
198 | |||
199 | static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, | ||
200 | struct sk_buff *skb) | ||
202 | { | 201 | { |
203 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | 202 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
204 | struct dst_entry *path = xdst->route; | 203 | struct dst_entry *path = xdst->route; |
205 | 204 | ||
206 | path->ops->update_pmtu(path, mtu); | 205 | path->ops->redirect(path, sk, skb); |
207 | } | 206 | } |
208 | 207 | ||
209 | static void xfrm4_dst_destroy(struct dst_entry *dst) | 208 | static void xfrm4_dst_destroy(struct dst_entry *dst) |
@@ -212,9 +211,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) | |||
212 | 211 | ||
213 | dst_destroy_metrics_generic(dst); | 212 | dst_destroy_metrics_generic(dst); |
214 | 213 | ||
215 | if (likely(xdst->u.rt.peer)) | ||
216 | inet_putpeer(xdst->u.rt.peer); | ||
217 | |||
218 | xfrm_dst_destroy(xdst); | 214 | xfrm_dst_destroy(xdst); |
219 | } | 215 | } |
220 | 216 | ||
@@ -232,6 +228,7 @@ static struct dst_ops xfrm4_dst_ops = { | |||
232 | .protocol = cpu_to_be16(ETH_P_IP), | 228 | .protocol = cpu_to_be16(ETH_P_IP), |
233 | .gc = xfrm4_garbage_collect, | 229 | .gc = xfrm4_garbage_collect, |
234 | .update_pmtu = xfrm4_update_pmtu, | 230 | .update_pmtu = xfrm4_update_pmtu, |
231 | .redirect = xfrm4_redirect, | ||
235 | .cow_metrics = dst_cow_metrics_generic, | 232 | .cow_metrics = dst_cow_metrics_generic, |
236 | .destroy = xfrm4_dst_destroy, | 233 | .destroy = xfrm4_dst_destroy, |
237 | .ifdown = xfrm4_dst_ifdown, | 234 | .ifdown = xfrm4_dst_ifdown, |