Diffstat (limited to 'net/ipv4')
-rw-r--r--   net/ipv4/Kconfig                     |    7
-rw-r--r--   net/ipv4/Makefile                    |    1
-rw-r--r--   net/ipv4/af_inet.c                   |   10
-rw-r--r--   net/ipv4/arp.c                       |   27
-rw-r--r--   net/ipv4/devinet.c                   |   83
-rw-r--r--   net/ipv4/fib_frontend.c              |    4
-rw-r--r--   net/ipv4/gre.c                       |    5
-rw-r--r--   net/ipv4/inet_connection_sock.c      |    2
-rw-r--r--   net/ipv4/inet_diag.c                 |    4
-rw-r--r--   net/ipv4/inet_fragment.c             |   27
-rw-r--r--   net/ipv4/inet_lro.c                  |    5
-rw-r--r--   net/ipv4/ip_fragment.c               |   31
-rw-r--r--   net/ipv4/ip_gre.c                    | 1516
-rw-r--r--   net/ipv4/ip_tunnel.c                 | 1035
-rw-r--r--   net/ipv4/ip_vti.c                    |   42
-rw-r--r--   net/ipv4/ipip.c                      |  748
-rw-r--r--   net/ipv4/ipmr.c                      |    2
-rw-r--r--   net/ipv4/netfilter/arptable_filter.c |    4
-rw-r--r--   net/ipv4/proc.c                      |    2
-rw-r--r--   net/ipv4/route.c                     |    2
-rw-r--r--   net/ipv4/syncookies.c                |    3
-rw-r--r--   net/ipv4/sysctl_net_ipv4.c           |   18
-rw-r--r--   net/ipv4/tcp.c                       |  268
-rw-r--r--   net/ipv4/tcp_input.c                 |  606
-rw-r--r--   net/ipv4/tcp_ipv4.c                  |  108
-rw-r--r--   net/ipv4/tcp_minisocks.c             |   44
-rw-r--r--   net/ipv4/tcp_output.c                |  367
-rw-r--r--   net/ipv4/tcp_timer.c                 |   21
-rw-r--r--   net/ipv4/tcp_westwood.c              |    2
-rw-r--r--   net/ipv4/udp.c                       |  115
30 files changed, 2012 insertions(+), 3097 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7944df768454..8603ca827104 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -166,6 +166,7 @@ config IP_PNP_RARP
 config NET_IPIP
 	tristate "IP: tunneling"
 	select INET_TUNNEL
+	select NET_IP_TUNNEL
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -186,9 +187,14 @@ config NET_IPGRE_DEMUX
 	  This is helper module to demultiplex GRE packets on GRE version field criteria.
 	  Required by ip_gre and pptp modules.
 
+config NET_IP_TUNNEL
+	tristate
+	default n
+
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
 	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+	select NET_IP_TUNNEL
 	help
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -313,6 +319,7 @@ config SYN_COOKIES
 config NET_IPVTI
 	tristate "Virtual (secure) IP: tunneling"
 	select INET_TUNNEL
+	select NET_IP_TUNNEL
 	depends on INET_XFRM_MODE_TUNNEL
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 15ca63ec604e..089cb9f36387 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,6 +13,7 @@ obj-y := route.o inetpeer.o protocol.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
 	     inet_fragment.o ping.o
 
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c929d9c1c4b6..93824c57b108 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -111,7 +111,6 @@
 #include <net/sock.h>
 #include <net/raw.h>
 #include <net/icmp.h>
-#include <net/ipip.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
@@ -1283,9 +1282,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	int ihl;
 	int id;
 	unsigned int offset = 0;
-
-	if (!(features & NETIF_F_V4_CSUM))
-		features &= ~NETIF_F_SG;
+	bool tunnel;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 		     ~(SKB_GSO_TCPV4 |
@@ -1293,6 +1290,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
+		       SKB_GSO_UDP_TUNNEL |
 		       0)))
 		goto out;
 
@@ -1307,6 +1305,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
 
+	tunnel = !!skb->encapsulation;
+
 	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
@@ -1326,7 +1326,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	skb = segs;
 	do {
 		iph = ip_hdr(skb);
-		if (proto == IPPROTO_UDP) {
+		if (!tunnel && proto == IPPROTO_UDP) {
 			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
 			if (skb->next != NULL)
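
The hunks above change how inet_gso_segment() fixes up the IP header of each resulting segment: UDP "segments" produced by GSO are really IP fragments (same ID, advancing fragment offset), TCP segments are independent packets (ID increments), and the new tunnel flag keeps this fix-up off the inner header of encapsulated packets. A minimal userspace sketch of the two ID policies (a simulation only; the function name is hypothetical, not kernel API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simulate the per-segment IP header fix-up in inet_gso_segment():
 * UDP segments share one IP ID and advance frag_off; TCP segments
 * each get a fresh ID with frag_off left at zero. */
static void fixup_segments(bool tunnel, bool is_udp,
			   unsigned mss, unsigned nsegs)
{
	uint16_t id = 0x1234;
	unsigned offset = 0;

	for (unsigned i = 0; i < nsegs; i++) {
		if (!tunnel && is_udp)
			printf("seg %u: id=0x%04x frag_off=%u\n",
			       i, id, offset >> 3);
		else
			printf("seg %u: id=0x%04x frag_off=0\n", i, id++);
		offset += mss;
	}
}

int main(void)
{
	puts("UDP (fragments):");
	fixup_segments(false, true, 1480, 3);
	puts("TCP (segments):");
	fixup_segments(false, false, 1448, 3);
	return 0;
}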
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index fea4929f6200..247ec1951c35 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -654,11 +654,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	arp_ptr += dev->addr_len;
 	memcpy(arp_ptr, &src_ip, 4);
 	arp_ptr += 4;
-	if (target_hw != NULL)
-		memcpy(arp_ptr, target_hw, dev->addr_len);
-	else
-		memset(arp_ptr, 0, dev->addr_len);
-	arp_ptr += dev->addr_len;
+
+	switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+	case ARPHRD_IEEE1394:
+		break;
+#endif
+	default:
+		if (target_hw != NULL)
+			memcpy(arp_ptr, target_hw, dev->addr_len);
+		else
+			memset(arp_ptr, 0, dev->addr_len);
+		arp_ptr += dev->addr_len;
+	}
 	memcpy(arp_ptr, &dest_ip, 4);
 
 	return skb;
@@ -781,7 +789,14 @@ static int arp_process(struct sk_buff *skb)
 	arp_ptr += dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
 	arp_ptr += 4;
-	arp_ptr += dev->addr_len;
+	switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+	case ARPHRD_IEEE1394:
+		break;
+#endif
+	default:
+		arp_ptr += dev->addr_len;
+	}
 	memcpy(&tip, arp_ptr, 4);
 	/*
 	 *	Check for bad requests for 127.x.x.x and requests for multicast
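
Both arp.c hunks encode the same fact: an ARP payload is laid out as sender-hw, sender-IP, target-hw, target-IP, with the hardware fields sized by dev->addr_len, and IPv4-over-IEEE-1394 ARP (RFC 2734) carries no target hardware address at all, so the read/write cursor must not advance over one. A small standalone sketch of the resulting payload sizes (illustration only):

#include <stdbool.h>
#include <stdio.h>

/* ARP payload after the 8-byte fixed header:
 * sender hw, sender IP(4), [target hw], target IP(4).
 * IEEE 1394 ARP (RFC 2734) omits the target hardware address. */
static unsigned arp_payload_len(unsigned addr_len, bool has_target_hw)
{
	unsigned len = addr_len + 4 + 4;	/* sha + spa + tpa */

	if (has_target_hw)
		len += addr_len;		/* tha */
	return len;
}

int main(void)
{
	printf("Ethernet  (addr_len=6):  %u bytes\n",
	       arp_payload_len(6, true));	/* 6+4+6+4 = 20 */
	printf("IEEE 1394 (addr_len=16): %u bytes\n",
	       arp_payload_len(16, false));	/* 16+4+4 = 24 */
	return 0;
}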
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index f678507bc829..5d985e367535 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -536,7 +536,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 	return NULL;
 }
 
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tb[IFA_MAX+1];
@@ -775,7 +775,7 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
 	return NULL;
 }
 
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct in_ifaddr *ifa;
@@ -1499,6 +1499,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 		idx = 0;
 		head = &net->dev_index_head[h];
 		rcu_read_lock();
+		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+			  net->dev_base_seq;
 		hlist_for_each_entry_rcu(dev, head, index_hlist) {
 			if (idx < s_idx)
 				goto cont;
@@ -1519,6 +1521,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 					rcu_read_unlock();
 					goto done;
 				}
+				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 			}
 cont:
 			idx++;
@@ -1730,8 +1733,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
 };
 
 static int inet_netconf_get_devconf(struct sk_buff *in_skb,
-				    struct nlmsghdr *nlh,
-				    void *arg)
+				    struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(in_skb->sk);
 	struct nlattr *tb[NETCONFA_MAX+1];
@@ -1791,6 +1793,77 @@ errout:
 	return err;
 }
 
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int h, s_h;
+	int idx, s_idx;
+	struct net_device *dev;
+	struct in_device *in_dev;
+	struct hlist_head *head;
+
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		rcu_read_lock();
+		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+			  net->dev_base_seq;
+		hlist_for_each_entry_rcu(dev, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			in_dev = __in_dev_get_rcu(dev);
+			if (!in_dev)
+				goto cont;
+
+			if (inet_netconf_fill_devconf(skb, dev->ifindex,
+						      &in_dev->cnf,
+						      NETLINK_CB(cb->skb).portid,
+						      cb->nlh->nlmsg_seq,
+						      RTM_NEWNETCONF,
+						      NLM_F_MULTI,
+						      -1) <= 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+			idx++;
+		}
+		rcu_read_unlock();
+	}
+	if (h == NETDEV_HASHENTRIES) {
+		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+					      net->ipv4.devconf_all,
+					      NETLINK_CB(cb->skb).portid,
+					      cb->nlh->nlmsg_seq,
+					      RTM_NEWNETCONF, NLM_F_MULTI,
+					      -1) <= 0)
+			goto done;
+		else
+			h++;
+	}
+	if (h == NETDEV_HASHENTRIES + 1) {
+		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+					      net->ipv4.devconf_dflt,
+					      NETLINK_CB(cb->skb).portid,
+					      cb->nlh->nlmsg_seq,
+					      RTM_NEWNETCONF, NLM_F_MULTI,
+					      -1) <= 0)
+			goto done;
+		else
+			h++;
+	}
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+
+	return skb->len;
+}
+
 #ifdef CONFIG_SYSCTL
 
 static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -2195,6 +2268,6 @@ void __init devinet_init(void)
 	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
 	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
 	rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
-		      NULL, NULL);
+		      inet_netconf_dump_devconf, NULL);
 }
 
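
Both dump functions above set cb->seq from a generation counter (dev_addr_genid XOR dev_base_seq) at the start of each dump chunk and call nl_dump_check_consistent() after every message filled in, so userspace can detect that the address table changed under a multi-part dump and re-issue it. A minimal userspace analogue of that pattern (hypothetical names, a simulation rather than netlink code):

#include <stdbool.h>
#include <stdio.h>

/* A dump delivered in chunks: each chunk snapshots a generation
 * counter; if the table mutated since the snapshot, output produced
 * afterwards is flagged (the netlink equivalent is NLM_F_DUMP_INTR). */
static unsigned genid;		/* bumped on every table change */

struct callback {
	unsigned seq;		/* snapshot taken at chunk start */
	bool inconsistent;
};

static void dump_chunk(struct callback *cb, int nitems)
{
	cb->seq = genid;
	for (int i = 0; i < nitems; i++) {
		if (i == 1)
			genid++;	/* concurrent table change */
		if (cb->seq != genid)
			cb->inconsistent = true;
		printf("item %d%s\n", i,
		       cb->inconsistent ? " (dump interrupted)" : "");
	}
}

int main(void)
{
	struct callback cb = { 0 };

	dump_chunk(&cb, 3);
	return 0;
}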
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index eb4bb12b3eb4..0e74398bc8e6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -604,7 +604,7 @@ errout:
 	return err;
 }
 
-static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_config cfg;
@@ -626,7 +626,7 @@ errout:
 	return err;
 }
 
-static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_config cfg;
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 7a4c710c4cdd..d2d5a99fba09 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -27,11 +27,6 @@
 
 static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
 static DEFINE_SPINLOCK(gre_proto_lock);
-struct gre_base_hdr {
-	__be16 flags;
-	__be16 protocol;
-};
-#define GRE_HEADER_SECTION 4
 
 int gre_add_protocol(const struct gre_protocol *proto, u8 version)
 {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 786d97aee751..6acb541c9091 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -559,7 +559,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
 
 int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
 {
-	int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
+	int err = req->rsk_ops->rtx_syn_ack(parent, req);
 
 	if (!err)
 		req->num_retrans++;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3c788f..8620408af574 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -158,7 +158,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
 #define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
 		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index f4fd23de9b13..1206ca64b0ea 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -23,6 +23,28 @@
 
 #include <net/sock.h>
 #include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
+
+	/* invalid combinations : drop frame */
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
 
 static void inet_frag_secret_rebuild(unsigned long dummy)
 {
@@ -102,7 +124,6 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 {
 	write_lock(&f->lock);
 	hlist_del(&fq->list);
-	fq->net->nqueues--;
 	write_unlock(&f->lock);
 	inet_frag_lru_del(fq);
 }
@@ -182,6 +203,9 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 		q = list_first_entry(&nf->lru_list,
 				struct inet_frag_queue, lru_list);
 		atomic_inc(&q->refcnt);
+		/* Remove q from list to avoid several CPUs grabbing it */
+		list_del_init(&q->lru_list);
+
 		spin_unlock(&nf->lru_lock);
 
 		spin_lock(&q->lock);
@@ -235,7 +259,6 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 
 	atomic_inc(&qp->refcnt);
 	hlist_add_head(&qp->list, &f->hash[hash]);
-	nf->nqueues++;
 	write_unlock(&f->lock);
 	inet_frag_lru_add(nf, qp);
 	return qp;
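
The ECN table moves into inet_fragment.c so IPv4 and IPv6 reassembly can share it: each fragment contributes 1 << (tos & INET_ECN_MASK) to an OR-accumulator, and the final 4-bit value indexes the table to decide between keeping the TOS, setting CE, or dropping the frame. A standalone sketch of that fold (a userspace re-statement of the table above, with local constants standing in for the kernel names):

#include <stdint.h>
#include <stdio.h>

#define ECN_NOT_ECT 0x01	/* one frag had Not-ECT */
#define ECN_ECT_1   0x02	/* one frag had ECT(1) */
#define ECN_ECT_0   0x04	/* one frag had ECT(0) */
#define ECN_CE      0x08	/* one frag had CE */
#define CE_TOS      0x03	/* value of INET_ECN_CE */

static const uint8_t ecn_table[16] = {
	[ECN_CE | ECN_ECT_0]			= CE_TOS,
	[ECN_CE | ECN_ECT_1]			= CE_TOS,
	[ECN_CE | ECN_ECT_0 | ECN_ECT_1]	= CE_TOS,
	[ECN_NOT_ECT | ECN_CE]			= 0xff,
	[ECN_NOT_ECT | ECN_ECT_0]		= 0xff,
	[ECN_NOT_ECT | ECN_ECT_1]		= 0xff,
	[ECN_NOT_ECT | ECN_ECT_0 | ECN_ECT_1]	= 0xff,
	[ECN_NOT_ECT | ECN_CE | ECN_ECT_0]	= 0xff,
	[ECN_NOT_ECT | ECN_CE | ECN_ECT_1]	= 0xff,
	[ECN_NOT_ECT | ECN_CE | ECN_ECT_0 | ECN_ECT_1] = 0xff,
};

int main(void)
{
	/* ECN bits of three fragments: ECT(0), ECT(0), CE */
	uint8_t frags[] = { 0x02, 0x02, 0x03 };
	uint8_t acc = 0;

	for (unsigned i = 0; i < sizeof(frags); i++)
		acc |= 1 << (frags[i] & 0x03);	/* ip4_frag_ecn() */

	if (ecn_table[acc] == 0xff)
		puts("drop: invalid ECN combination");
	else
		printf("reassembled TOS |= 0x%02x\n", ecn_table[acc]);
	return 0;
}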
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cc280a3f4f96..1975f52933c5 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/if_vlan.h>
 #include <linux/inet_lro.h>
+#include <net/checksum.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
@@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
 		*(p+2) = lro_desc->tcp_rcv_tsecr;
 	}
 
+	csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
 	iph->tot_len = htons(lro_desc->ip_tot_len);
 
-	iph->check = 0;
-	iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
-
 	tcph->check = 0;
 	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
 	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
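
csum_replace2() applies the RFC 1624 incremental update: instead of zeroing iph->check and recomputing over the whole header, only the 16-bit tot_len delta is folded into the existing checksum. A runnable userspace sketch of the equivalence (standard C only; the header words are an arbitrary example):

#include <stdint.h>
#include <stdio.h>

/* One's-complement sum over 16-bit words, as in the IP checksum. */
static uint16_t csum(const uint16_t *p, int n)
{
	uint32_t s = 0;

	while (n--)
		s += *p++;
	while (s >> 16)
		s = (s & 0xffff) + (s >> 16);
	return ~s & 0xffff;
}

/* RFC 1624 eqn. 3: HC' = ~(~HC + ~m + m'), the csum_replace2() idea. */
static uint16_t replace2(uint16_t check, uint16_t old, uint16_t new)
{
	uint32_t s = (uint16_t)~check;

	s += (uint16_t)~old + new;
	while (s >> 16)
		s = (s & 0xffff) + (s >> 16);
	return ~s & 0xffff;
}

int main(void)
{
	uint16_t hdr[10] = { 0x4500, 0x0054, 0x1c46, 0x4000, 0x4006,
			     0, 0xc0a8, 0x0001, 0xc0a8, 0x00c7 };

	hdr[5] = csum(hdr, 10);			/* fill check field */
	uint16_t inc = replace2(hdr[5], hdr[1], 0x05dc);
	hdr[1] = 0x05dc;			/* new tot_len = 1500 */
	hdr[5] = 0;
	printf("incremental=0x%04x full=0x%04x\n", inc, csum(hdr, 10));
	return 0;				/* both print 0x96bd */
}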
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index a6445b843ef4..938520668b2f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -79,40 +79,11 @@ struct ipq {
 	struct inet_peer *peer;
 };
 
-/* RFC 3168 support :
- * We want to check ECN values of all fragments, do detect invalid combinations.
- * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
- */
-#define	IPFRAG_ECN_NOT_ECT	0x01 /* one frag had ECN_NOT_ECT */
-#define	IPFRAG_ECN_ECT_1	0x02 /* one frag had ECN_ECT_1 */
-#define	IPFRAG_ECN_ECT_0	0x04 /* one frag had ECN_ECT_0 */
-#define	IPFRAG_ECN_CE		0x08 /* one frag had ECN_CE */
-
 static inline u8 ip4_frag_ecn(u8 tos)
 {
 	return 1 << (tos & INET_ECN_MASK);
 }
 
-/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
- * Value : 0xff if frame should be dropped.
- *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
- */
-static const u8 ip4_frag_ecn_table[16] = {
-	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
-
-	/* invalid combinations : drop frame */
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-};
-
 static struct inet_frags ip4_frags;
 
 int ip_frag_nqueues(struct net *net)
@@ -551,7 +522,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 
 	ipq_kill(qp);
 
-	ecn = ip4_frag_ecn_table[qp->ecn];
+	ecn = ip_frag_ecn_table[qp->ecn];
 	if (unlikely(ecn == 0xff)) {
 		err = -EINVAL;
 		goto out_fail;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 91d66dbde9c0..ad662e906f7e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -37,7 +37,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/protocol.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
 #include <net/arp.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
@@ -108,15 +108,6 @@
    fatal route to network, even if it were you who configured
    fatal static route: you are innocent. :-)
 
-
-
-   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
-   practically identical code. It would be good to glue them
-   together, but it is not very evident, how to make them modular.
-   sit is integral part of IPv6, ipip and gre are naturally modular.
-   We could extract common parts (hash table, ioctl etc)
-   to a separate module (ip_tunnel.c).
-
    Alexey Kuznetsov.
  */
 
@@ -126,400 +117,135 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
-static void ipgre_tunnel_setup(struct net_device *dev);
-static int ipgre_tunnel_bind_dev(struct net_device *dev);
-
-/* Fallback tunnel: no source, no destination, no key, no options */
-
-#define HASH_SIZE  16
 
 static int ipgre_net_id __read_mostly;
-struct ipgre_net {
-	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
-
-	struct net_device *fb_tunnel_dev;
-};
-
-/* Tunnel hash table */
-
-/*
-   4 hash tables:
-
-   3: (remote,local)
-   2: (remote,*)
-   1: (*,local)
-   0: (*,*)
+static int gre_tap_net_id __read_mostly;
 
-   We require exact key match i.e. if a key is present in packet
-   it will match only tunnel with the same key; if it is not present,
-   it will match only keyless tunnel.
-
-   All keysless packets, if not matched configured keyless tunnels
-   will match fallback tunnel.
- */
+static __sum16 check_checksum(struct sk_buff *skb)
+{
+	__sum16 csum = 0;
 
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		csum = csum_fold(skb->csum);
 
-#define tunnels_r_l	tunnels[3]
-#define tunnels_r	tunnels[2]
-#define tunnels_l	tunnels[1]
-#define tunnels_wc	tunnels[0]
+		if (!csum)
+			break;
+		/* Fall through. */
 
-static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
-						   struct rtnl_link_stats64 *tot)
-{
-	int i;
-
-	for_each_possible_cpu(i) {
-		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
-		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
-		unsigned int start;
-
-		do {
-			start = u64_stats_fetch_begin_bh(&tstats->syncp);
-			rx_packets = tstats->rx_packets;
-			tx_packets = tstats->tx_packets;
-			rx_bytes = tstats->rx_bytes;
-			tx_bytes = tstats->tx_bytes;
-		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
-		tot->rx_packets += rx_packets;
-		tot->tx_packets += tx_packets;
-		tot->rx_bytes += rx_bytes;
-		tot->tx_bytes += tx_bytes;
+	case CHECKSUM_NONE:
+		skb->csum = 0;
+		csum = __skb_checksum_complete(skb);
+		skb->ip_summed = CHECKSUM_COMPLETE;
+		break;
 	}
 
-	tot->multicast = dev->stats.multicast;
-	tot->rx_crc_errors = dev->stats.rx_crc_errors;
-	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
-	tot->rx_length_errors = dev->stats.rx_length_errors;
-	tot->rx_frame_errors = dev->stats.rx_frame_errors;
-	tot->rx_errors = dev->stats.rx_errors;
-
-	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
-	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
-	tot->tx_dropped = dev->stats.tx_dropped;
-	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
-	tot->tx_errors = dev->stats.tx_errors;
-
-	return tot;
+	return csum;
 }
 
-/* Does key in tunnel parameters match packet */
-static bool ipgre_key_match(const struct ip_tunnel_parm *p,
-			    __be16 flags, __be32 key)
+static int ip_gre_calc_hlen(__be16 o_flags)
 {
-	if (p->i_flags & GRE_KEY) {
-		if (flags & GRE_KEY)
-			return key == p->i_key;
-		else
-			return false;	/* key expected, none present */
-	} else
-		return !(flags & GRE_KEY);
-}
+	int addend = 4;
 
-/* Given src, dst and key, find appropriate for input tunnel. */
+	if (o_flags&TUNNEL_CSUM)
+		addend += 4;
+	if (o_flags&TUNNEL_KEY)
+		addend += 4;
+	if (o_flags&TUNNEL_SEQ)
+		addend += 4;
+	return addend;
+}
 
-static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
-					     __be32 remote, __be32 local,
-					     __be16 flags, __be32 key,
-					     __be16 gre_proto)
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+			    bool *csum_err, int *hdr_len)
 {
-	struct net *net = dev_net(dev);
-	int link = dev->ifindex;
-	unsigned int h0 = HASH(remote);
-	unsigned int h1 = HASH(key);
-	struct ip_tunnel *t, *cand = NULL;
-	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
-		       ARPHRD_ETHER : ARPHRD_IPGRE;
-	int score, cand_score = 4;
-
-	for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
-		if (local != t->parms.iph.saddr ||
-		    remote != t->parms.iph.daddr ||
-		    !(t->dev->flags & IFF_UP))
-			continue;
-
-		if (!ipgre_key_match(&t->parms, flags, key))
-			continue;
-
-		if (t->dev->type != ARPHRD_IPGRE &&
-		    t->dev->type != dev_type)
-			continue;
-
-		score = 0;
-		if (t->parms.link != link)
-			score |= 1;
-		if (t->dev->type != dev_type)
-			score |= 2;
-		if (score == 0)
-			return t;
-
-		if (score < cand_score) {
-			cand = t;
-			cand_score = score;
-		}
-	}
-
-	for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
-		if (remote != t->parms.iph.daddr ||
-		    !(t->dev->flags & IFF_UP))
-			continue;
-
-		if (!ipgre_key_match(&t->parms, flags, key))
-			continue;
-
-		if (t->dev->type != ARPHRD_IPGRE &&
-		    t->dev->type != dev_type)
-			continue;
-
-		score = 0;
-		if (t->parms.link != link)
-			score |= 1;
-		if (t->dev->type != dev_type)
-			score |= 2;
-		if (score == 0)
-			return t;
-
-		if (score < cand_score) {
-			cand = t;
-			cand_score = score;
-		}
-	}
+	struct iphdr *iph = ip_hdr(skb);
+	struct gre_base_hdr *greh;
+	__be32 *options;
 
-	for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
-		if ((local != t->parms.iph.saddr &&
-		     (local != t->parms.iph.daddr ||
-		      !ipv4_is_multicast(local))) ||
-		    !(t->dev->flags & IFF_UP))
-			continue;
-
-		if (!ipgre_key_match(&t->parms, flags, key))
-			continue;
-
-		if (t->dev->type != ARPHRD_IPGRE &&
-		    t->dev->type != dev_type)
-			continue;
-
-		score = 0;
-		if (t->parms.link != link)
-			score |= 1;
-		if (t->dev->type != dev_type)
-			score |= 2;
-		if (score == 0)
-			return t;
-
-		if (score < cand_score) {
-			cand = t;
-			cand_score = score;
-		}
-	}
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+		return -EINVAL;
 
-	for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
-		if (t->parms.i_key != key ||
-		    !(t->dev->flags & IFF_UP))
-			continue;
-
-		if (t->dev->type != ARPHRD_IPGRE &&
-		    t->dev->type != dev_type)
-			continue;
-
-		score = 0;
-		if (t->parms.link != link)
-			score |= 1;
-		if (t->dev->type != dev_type)
-			score |= 2;
-		if (score == 0)
-			return t;
-
-		if (score < cand_score) {
-			cand = t;
-			cand_score = score;
-		}
-	}
+	greh = (struct gre_base_hdr *)((u8 *)iph + (iph->ihl << 2));
+	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+		return -EINVAL;
 
-	if (cand != NULL)
-		return cand;
+	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+	*hdr_len = ip_gre_calc_hlen(tpi->flags);
 
-	dev = ign->fb_tunnel_dev;
-	if (dev->flags & IFF_UP)
-		return netdev_priv(dev);
+	if (!pskb_may_pull(skb, *hdr_len))
+		return -EINVAL;
 
-	return NULL;
-}
+	tpi->proto = greh->protocol;
 
-static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
-					       struct ip_tunnel_parm *parms)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	__be32 key = parms->i_key;
-	unsigned int h = HASH(key);
-	int prio = 0;
-
-	if (local)
-		prio |= 1;
-	if (remote && !ipv4_is_multicast(remote)) {
-		prio |= 2;
-		h ^= HASH(remote);
+	options = (__be32 *)(greh + 1);
+	if (greh->flags & GRE_CSUM) {
+		if (check_checksum(skb)) {
+			*csum_err = true;
+			return -EINVAL;
+		}
+		options++;
 	}
 
-	return &ign->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
-						    struct ip_tunnel *t)
-{
-	return __ipgre_bucket(ign, &t->parms);
-}
-
-static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
+	if (greh->flags & GRE_KEY) {
+		tpi->key = *options;
+		options++;
+	} else
+		tpi->key = 0;
 
-	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
-	rcu_assign_pointer(*tp, t);
-}
+	if (unlikely(greh->flags & GRE_SEQ)) {
+		tpi->seq = *options;
+		options++;
+	} else
+		tpi->seq = 0;
 
-static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp;
-	struct ip_tunnel *iter;
-
-	for (tp = ipgre_bucket(ign, t);
-	     (iter = rtnl_dereference(*tp)) != NULL;
-	     tp = &iter->next) {
-		if (t == iter) {
-			rcu_assign_pointer(*tp, t->next);
-			break;
+	/* WCCP version 1 and 2 protocol decoding.
+	 * - Change protocol to IP
+	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+	 */
+	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+		tpi->proto = htons(ETH_P_IP);
+		if ((*(u8 *)options & 0xF0) != 0x40) {
+			*hdr_len += 4;
+			if (!pskb_may_pull(skb, *hdr_len))
+				return -EINVAL;
 		}
 	}
-}
-
-static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
-					   struct ip_tunnel_parm *parms,
-					   int type)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	__be32 key = parms->i_key;
-	int link = parms->link;
-	struct ip_tunnel *t;
-	struct ip_tunnel __rcu **tp;
-	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
-	for (tp = __ipgre_bucket(ign, parms);
-	     (t = rtnl_dereference(*tp)) != NULL;
-	     tp = &t->next)
-		if (local == t->parms.iph.saddr &&
-		    remote == t->parms.iph.daddr &&
-		    key == t->parms.i_key &&
-		    link == t->parms.link &&
-		    type == t->dev->type)
-			break;
-
-	return t;
-}
-
-static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
-					     struct ip_tunnel_parm *parms, int create)
-{
-	struct ip_tunnel *t, *nt;
-	struct net_device *dev;
-	char name[IFNAMSIZ];
-	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
-	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
-	if (t || !create)
-		return t;
-
-	if (parms->name[0])
-		strlcpy(name, parms->name, IFNAMSIZ);
-	else
-		strcpy(name, "gre%d");
-
-	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
-	if (!dev)
-		return NULL;
-
-	dev_net_set(dev, net);
-
-	nt = netdev_priv(dev);
-	nt->parms = *parms;
-	dev->rtnl_link_ops = &ipgre_link_ops;
 
-	dev->mtu = ipgre_tunnel_bind_dev(dev);
-
-	if (register_netdevice(dev) < 0)
-		goto failed_free;
-
-	/* Can use a lockless transmit, unless we generate output sequences */
-	if (!(nt->parms.o_flags & GRE_SEQ))
-		dev->features |= NETIF_F_LLTX;
-
-	dev_hold(dev);
-	ipgre_tunnel_link(ign, nt);
-	return nt;
-
-failed_free:
-	free_netdev(dev);
-	return NULL;
-}
-
-static void ipgre_tunnel_uninit(struct net_device *dev)
-{
-	struct net *net = dev_net(dev);
-	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
-	ipgre_tunnel_unlink(ign, netdev_priv(dev));
-	dev_put(dev);
+	return 0;
 }
 
-
 static void ipgre_err(struct sk_buff *skb, u32 info)
 {
 
 	/* All the routers (except for Linux) return only
 	   8 bytes of packet payload. It means, that precise relaying of
 	   ICMP in the real Internet is absolutely infeasible.
 
 	   Moreover, Cisco "wise men" put GRE key to the third word
-	   in GRE header. It makes impossible maintaining even soft state for keyed
-	   GRE tunnels with enabled checksum. Tell them "thank you".
-
-	   Well, I wonder, rfc1812 was written by Cisco employee,
-	   what the hell these idiots break standards established
-	   by themselves???
-	 */
+	   in GRE header. It makes impossible maintaining even soft
+	   state for keyed GRE tunnels with enabled checksum. Tell
+	   them "thank you".
 
+	   Well, I wonder, rfc1812 was written by Cisco employee,
+	   what the hell these idiots break standards established
+	   by themselves???
+	 */
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn;
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
-	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
-	int grehlen = (iph->ihl<<2) + 4;
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
 	struct ip_tunnel *t;
-	__be16 flags;
-	__be32 key = 0;
+	struct tnl_ptk_info tpi;
+	int hdr_len;
+	bool csum_err = false;
 
-	flags = p[0];
-	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
-		if (flags&(GRE_VERSION|GRE_ROUTING))
+	if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
+		if (!csum_err)		/* ignore csum errors. */
 			return;
-		if (flags&GRE_KEY) {
-			grehlen += 4;
-			if (flags&GRE_CSUM)
-				grehlen += 4;
-		}
 	}
 
-	/* If only 8 bytes returned, keyed message will be dropped here */
-	if (skb_headlen(skb) < grehlen)
-		return;
-
-	if (flags & GRE_KEY)
-		key = *(((__be32 *)p) + (grehlen / 4) - 1);
-
 	switch (type) {
 	default:
 	case ICMP_PARAMETERPROB:
@@ -548,8 +274,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 		break;
 	}
 
-	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
-				flags, key, p[1]);
+	if (tpi.proto == htons(ETH_P_TEB))
+		itn = net_generic(net, gre_tap_net_id);
+	else
+		itn = net_generic(net, ipgre_net_id);
+
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
+			     iph->daddr, iph->saddr, tpi.key);
 
 	if (t == NULL)
 		return;
@@ -578,158 +309,33 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 	t->err_time = jiffies;
 }
 
-static inline u8
-ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
-{
-	u8 inner = 0;
-	if (skb->protocol == htons(ETH_P_IP))
-		inner = old_iph->tos;
-	else if (skb->protocol == htons(ETH_P_IPV6))
-		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
-	return INET_ECN_encapsulate(tos, inner);
-}
-
 static int ipgre_rcv(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn;
 	const struct iphdr *iph;
-	u8     *h;
-	__be16    flags;
-	__sum16   csum = 0;
-	__be32 key = 0;
-	u32    seqno = 0;
 	struct ip_tunnel *tunnel;
-	int    offset = 4;
-	__be16 gre_proto;
-	int    err;
+	struct tnl_ptk_info tpi;
+	int hdr_len;
+	bool csum_err = false;
 
-	if (!pskb_may_pull(skb, 16))
+	if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
 		goto drop;
 
-	iph = ip_hdr(skb);
-	h = skb->data;
-	flags = *(__be16 *)h;
-
-	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
-		/* - Version must be 0.
-		   - We do not support routing headers.
-		 */
-		if (flags&(GRE_VERSION|GRE_ROUTING))
-			goto drop;
-
-		if (flags&GRE_CSUM) {
-			switch (skb->ip_summed) {
-			case CHECKSUM_COMPLETE:
-				csum = csum_fold(skb->csum);
-				if (!csum)
-					break;
-				/* fall through */
-			case CHECKSUM_NONE:
-				skb->csum = 0;
-				csum = __skb_checksum_complete(skb);
-				skb->ip_summed = CHECKSUM_COMPLETE;
-			}
-			offset += 4;
-		}
-		if (flags&GRE_KEY) {
-			key = *(__be32 *)(h + offset);
-			offset += 4;
-		}
-		if (flags&GRE_SEQ) {
-			seqno = ntohl(*(__be32 *)(h + offset));
-			offset += 4;
-		}
-	}
+	if (tpi.proto == htons(ETH_P_TEB))
+		itn = net_generic(net, gre_tap_net_id);
+	else
+		itn = net_generic(net, ipgre_net_id);
 
-	gre_proto = *(__be16 *)(h + 2);
+	iph = ip_hdr(skb);
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
+				  iph->saddr, iph->daddr, tpi.key);
 
-	tunnel = ipgre_tunnel_lookup(skb->dev,
-				     iph->saddr, iph->daddr, flags, key,
-				     gre_proto);
 	if (tunnel) {
-		struct pcpu_tstats *tstats;
-
-		secpath_reset(skb);
-
-		skb->protocol = gre_proto;
-		/* WCCP version 1 and 2 protocol decoding.
-		 * - Change protocol to IP
-		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
-		 */
-		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
-			skb->protocol = htons(ETH_P_IP);
-			if ((*(h + offset) & 0xF0) != 0x40)
-				offset += 4;
-		}
-
-		skb->mac_header = skb->network_header;
-		__pskb_pull(skb, offset);
-		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
-		skb->pkt_type = PACKET_HOST;
-#ifdef CONFIG_NET_IPGRE_BROADCAST
-		if (ipv4_is_multicast(iph->daddr)) {
-			/* Looped back packet, drop it! */
-			if (rt_is_output_route(skb_rtable(skb)))
-				goto drop;
-			tunnel->dev->stats.multicast++;
-			skb->pkt_type = PACKET_BROADCAST;
-		}
-#endif
-
-		if (((flags&GRE_CSUM) && csum) ||
-		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
-			tunnel->dev->stats.rx_crc_errors++;
-			tunnel->dev->stats.rx_errors++;
-			goto drop;
-		}
-		if (tunnel->parms.i_flags&GRE_SEQ) {
-			if (!(flags&GRE_SEQ) ||
-			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
-				tunnel->dev->stats.rx_fifo_errors++;
-				tunnel->dev->stats.rx_errors++;
-				goto drop;
-			}
-			tunnel->i_seqno = seqno + 1;
-		}
-
-		/* Warning: All skb pointers will be invalidated! */
-		if (tunnel->dev->type == ARPHRD_ETHER) {
-			if (!pskb_may_pull(skb, ETH_HLEN)) {
-				tunnel->dev->stats.rx_length_errors++;
-				tunnel->dev->stats.rx_errors++;
-				goto drop;
-			}
-
-			iph = ip_hdr(skb);
-			skb->protocol = eth_type_trans(skb, tunnel->dev);
-			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
-		}
-
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		skb_reset_network_header(skb);
-		err = IP_ECN_decapsulate(iph, skb);
-		if (unlikely(err)) {
-			if (log_ecn_error)
-				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
-						     &iph->saddr, iph->tos);
-			if (err > 1) {
-				++tunnel->dev->stats.rx_frame_errors;
-				++tunnel->dev->stats.rx_errors;
-				goto drop;
-			}
-		}
-
-		tstats = this_cpu_ptr(tunnel->dev->tstats);
-		u64_stats_update_begin(&tstats->syncp);
-		tstats->rx_packets++;
-		tstats->rx_bytes += skb->len;
-		u64_stats_update_end(&tstats->syncp);
-
-		gro_cells_receive(&tunnel->gro_cells, skb);
+		ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
 		return 0;
 	}
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-
 drop:
 	kfree_skb(skb);
 	return 0;
@@ -746,7 +352,7 @@ static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff
 		skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
 		return skb;
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL &&
-		   tunnel->parms.o_flags&GRE_CSUM) {
+		   tunnel->parms.o_flags&TUNNEL_CSUM) {
 		err = skb_checksum_help(skb);
 		if (unlikely(err))
 			goto error;
@@ -760,494 +366,157 @@ error: | |||
760 | return ERR_PTR(err); | 366 | return ERR_PTR(err); |
761 | } | 367 | } |
762 | 368 | ||
763 | static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 369 | static struct sk_buff *gre_build_header(struct sk_buff *skb, |
370 | const struct tnl_ptk_info *tpi, | ||
371 | int hdr_len) | ||
764 | { | 372 | { |
765 | struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats); | 373 | struct gre_base_hdr *greh; |
766 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
767 | const struct iphdr *old_iph; | ||
768 | const struct iphdr *tiph; | ||
769 | struct flowi4 fl4; | ||
770 | u8 tos; | ||
771 | __be16 df; | ||
772 | struct rtable *rt; /* Route to the other host */ | ||
773 | struct net_device *tdev; /* Device to other host */ | ||
774 | struct iphdr *iph; /* Our new IP header */ | ||
775 | unsigned int max_headroom; /* The extra header space needed */ | ||
776 | int gre_hlen; | ||
777 | __be32 dst; | ||
778 | int mtu; | ||
779 | u8 ttl; | ||
780 | int err; | ||
781 | int pkt_len; | ||
782 | |||
783 | skb = handle_offloads(tunnel, skb); | ||
784 | if (IS_ERR(skb)) { | ||
785 | dev->stats.tx_dropped++; | ||
786 | return NETDEV_TX_OK; | ||
787 | } | ||
788 | 374 | ||
789 | if (!skb->encapsulation) { | 375 | skb_push(skb, hdr_len); |
790 | skb_reset_inner_headers(skb); | ||
791 | skb->encapsulation = 1; | ||
792 | } | ||
793 | 376 | ||
794 | old_iph = ip_hdr(skb); | 377 | greh = (struct gre_base_hdr *)skb->data; |
378 | greh->flags = tnl_flags_to_gre_flags(tpi->flags); | ||
379 | greh->protocol = tpi->proto; | ||
795 | 380 | ||
796 | if (dev->type == ARPHRD_ETHER) | 381 | if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { |
797 | IPCB(skb)->flags = 0; | 382 | __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); |
798 | 383 | ||
799 | if (dev->header_ops && dev->type == ARPHRD_IPGRE) { | 384 | if (tpi->flags&TUNNEL_SEQ) { |
800 | gre_hlen = 0; | 385 | *ptr = tpi->seq; |
801 | tiph = (const struct iphdr *)skb->data; | 386 | ptr--; |
802 | } else { | ||
803 | gre_hlen = tunnel->hlen; | ||
804 | tiph = &tunnel->parms.iph; | ||
805 | } | ||
806 | |||
807 | if ((dst = tiph->daddr) == 0) { | ||
808 | /* NBMA tunnel */ | ||
809 | |||
810 | if (skb_dst(skb) == NULL) { | ||
811 | dev->stats.tx_fifo_errors++; | ||
812 | goto tx_error; | ||
813 | } | 387 | } |
814 | 388 | if (tpi->flags&TUNNEL_KEY) { | |
815 | if (skb->protocol == htons(ETH_P_IP)) { | 389 | *ptr = tpi->key; |
816 | rt = skb_rtable(skb); | 390 | ptr--; |
817 | dst = rt_nexthop(rt, old_iph->daddr); | ||
818 | } | 391 | } |
819 | #if IS_ENABLED(CONFIG_IPV6) | 392 | if (tpi->flags&TUNNEL_CSUM && |
820 | else if (skb->protocol == htons(ETH_P_IPV6)) { | 393 | !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { |
821 | const struct in6_addr *addr6; | 394 | *(__sum16 *)ptr = 0; |
822 | struct neighbour *neigh; | 395 | *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, |
823 | bool do_tx_error_icmp; | 396 | skb->len, 0)); |
824 | int addr_type; | ||
825 | |||
826 | neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); | ||
827 | if (neigh == NULL) | ||
828 | goto tx_error; | ||
829 | |||
830 | addr6 = (const struct in6_addr *)&neigh->primary_key; | ||
831 | addr_type = ipv6_addr_type(addr6); | ||
832 | |||
833 | if (addr_type == IPV6_ADDR_ANY) { | ||
834 | addr6 = &ipv6_hdr(skb)->daddr; | ||
835 | addr_type = ipv6_addr_type(addr6); | ||
836 | } | ||
837 | |||
838 | if ((addr_type & IPV6_ADDR_COMPATv4) == 0) | ||
839 | do_tx_error_icmp = true; | ||
840 | else { | ||
841 | do_tx_error_icmp = false; | ||
842 | dst = addr6->s6_addr32[3]; | ||
843 | } | ||
844 | neigh_release(neigh); | ||
845 | if (do_tx_error_icmp) | ||
846 | goto tx_error_icmp; | ||
847 | } | 397 | } |
848 | #endif | ||
849 | else | ||
850 | goto tx_error; | ||
851 | } | 398 | } |
852 | 399 | ||
853 | ttl = tiph->ttl; | 400 | return skb; |
854 | tos = tiph->tos; | 401 | } |
855 | if (tos & 0x1) { | ||
856 | tos &= ~0x1; | ||
857 | if (skb->protocol == htons(ETH_P_IP)) | ||
858 | tos = old_iph->tos; | ||
859 | else if (skb->protocol == htons(ETH_P_IPV6)) | ||
860 | tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); | ||
861 | } | ||
862 | 402 | ||
863 | rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, | 403 | static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, |
864 | tunnel->parms.o_key, RT_TOS(tos), | 404 | const struct iphdr *tnl_params, |
865 | tunnel->parms.link); | 405 | __be16 proto) |
866 | if (IS_ERR(rt)) { | 406 | { |
867 | dev->stats.tx_carrier_errors++; | 407 | struct ip_tunnel *tunnel = netdev_priv(dev); |
868 | goto tx_error; | 408 | struct tnl_ptk_info tpi; |
869 | } | ||
870 | tdev = rt->dst.dev; | ||
871 | 409 | ||
872 | if (tdev == dev) { | 410 | if (likely(!skb->encapsulation)) { |
873 | ip_rt_put(rt); | 411 | skb_reset_inner_headers(skb); |
874 | dev->stats.collisions++; | 412 | skb->encapsulation = 1; |
875 | goto tx_error; | ||
876 | } | 413 | } |
877 | 414 | ||
878 | df = tiph->frag_off; | 415 | tpi.flags = tunnel->parms.o_flags; |
879 | if (df) | 416 | tpi.proto = proto; |
880 | mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; | 417 | tpi.key = tunnel->parms.o_key; |
881 | else | 418 | if (tunnel->parms.o_flags & TUNNEL_SEQ) |
882 | mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; | 419 | tunnel->o_seqno++; |
883 | 420 | tpi.seq = htonl(tunnel->o_seqno); | |
884 | if (skb_dst(skb)) | ||
885 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); | ||
886 | |||
887 | if (skb->protocol == htons(ETH_P_IP)) { | ||
888 | df |= (old_iph->frag_off&htons(IP_DF)); | ||
889 | 421 | ||
890 | if (!skb_is_gso(skb) && | 422 | /* Push GRE header. */ |
891 | (old_iph->frag_off&htons(IP_DF)) && | 423 | skb = gre_build_header(skb, &tpi, tunnel->hlen); |
892 | mtu < ntohs(old_iph->tot_len)) { | 424 | if (unlikely(!skb)) { |
893 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | 425 | dev->stats.tx_dropped++; |
894 | ip_rt_put(rt); | 426 | return; |
895 | goto tx_error; | ||
896 | } | ||
897 | } | 427 | } |
898 | #if IS_ENABLED(CONFIG_IPV6) | ||
899 | else if (skb->protocol == htons(ETH_P_IPV6)) { | ||
900 | struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); | ||
901 | |||
902 | if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { | ||
903 | if ((tunnel->parms.iph.daddr && | ||
904 | !ipv4_is_multicast(tunnel->parms.iph.daddr)) || | ||
905 | rt6->rt6i_dst.plen == 128) { | ||
906 | rt6->rt6i_flags |= RTF_MODIFIED; | ||
907 | dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); | ||
908 | } | ||
909 | } | ||
910 | 428 | ||
911 | if (!skb_is_gso(skb) && | 429 | ip_tunnel_xmit(skb, dev, tnl_params); |
912 | mtu >= IPV6_MIN_MTU && | 430 | } |
913 | mtu < skb->len - tunnel->hlen + gre_hlen) { | ||
914 | icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); | ||
915 | ip_rt_put(rt); | ||
916 | goto tx_error; | ||
917 | } | ||
918 | } | ||
919 | #endif | ||
920 | 431 | ||
921 | if (tunnel->err_count > 0) { | 432 | static netdev_tx_t ipgre_xmit(struct sk_buff *skb, |
922 | if (time_before(jiffies, | 433 | struct net_device *dev) |
923 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { | 434 | { |
924 | tunnel->err_count--; | 435 | struct ip_tunnel *tunnel = netdev_priv(dev); |
436 | const struct iphdr *tnl_params; | ||
925 | 437 | ||
926 | dst_link_failure(skb); | 438 | skb = handle_offloads(tunnel, skb); |
927 | } else | 439 | if (IS_ERR(skb)) |
928 | tunnel->err_count = 0; | 440 | goto out; |
929 | } | ||
930 | 441 | ||
931 | max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; | 442 | if (dev->header_ops) { |
932 | 443 | /* Need space for new headers */ | |
933 | if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| | 444 | if (skb_cow_head(skb, dev->needed_headroom - |
934 | (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { | 445 | (tunnel->hlen + sizeof(struct iphdr)))) |
935 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); | 446 | goto free_skb; |
936 | if (max_headroom > dev->needed_headroom) | ||
937 | dev->needed_headroom = max_headroom; | ||
938 | if (!new_skb) { | ||
939 | ip_rt_put(rt); | ||
940 | dev->stats.tx_dropped++; | ||
941 | dev_kfree_skb(skb); | ||
942 | return NETDEV_TX_OK; | ||
943 | } | ||
944 | if (skb->sk) | ||
945 | skb_set_owner_w(new_skb, skb->sk); | ||
946 | dev_kfree_skb(skb); | ||
947 | skb = new_skb; | ||
948 | old_iph = ip_hdr(skb); | ||
949 | /* Warning: tiph value might point to freed memory */ | ||
950 | } | ||
951 | 447 | ||
952 | skb_push(skb, gre_hlen); | 448 | tnl_params = (const struct iphdr *)skb->data; |
953 | skb_reset_network_header(skb); | ||
954 | skb_set_transport_header(skb, sizeof(*iph)); | ||
955 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | ||
956 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | ||
957 | IPSKB_REROUTED); | ||
958 | skb_dst_drop(skb); | ||
959 | skb_dst_set(skb, &rt->dst); | ||
960 | |||
961 | /* | ||
962 | * Push down and install the IPIP header. | ||
963 | */ | ||
964 | 449 | ||
965 | iph = ip_hdr(skb); | 450 | /* Pull skb since ip_tunnel_xmit() needs skb->data pointing |
966 | iph->version = 4; | 451 | * to gre header. |
967 | iph->ihl = sizeof(struct iphdr) >> 2; | 452 | */ |
968 | iph->frag_off = df; | 453 | skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); |
969 | iph->protocol = IPPROTO_GRE; | 454 | } else { |
970 | iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); | 455 | if (skb_cow_head(skb, dev->needed_headroom)) |
971 | iph->daddr = fl4.daddr; | 456 | goto free_skb; |
972 | iph->saddr = fl4.saddr; | ||
973 | iph->ttl = ttl; | ||
974 | |||
975 | tunnel_ip_select_ident(skb, old_iph, &rt->dst); | ||
976 | |||
977 | if (ttl == 0) { | ||
978 | if (skb->protocol == htons(ETH_P_IP)) | ||
979 | iph->ttl = old_iph->ttl; | ||
980 | #if IS_ENABLED(CONFIG_IPV6) | ||
981 | else if (skb->protocol == htons(ETH_P_IPV6)) | ||
982 | iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; | ||
983 | #endif | ||
984 | else | ||
985 | iph->ttl = ip4_dst_hoplimit(&rt->dst); | ||
986 | } | ||
987 | |||
988 | ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; | ||
989 | ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? | ||
990 | htons(ETH_P_TEB) : skb->protocol; | ||
991 | |||
992 | if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { | ||
993 | __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4); | ||
994 | 457 | ||
995 | if (tunnel->parms.o_flags&GRE_SEQ) { | 458 | tnl_params = &tunnel->parms.iph; |
996 | ++tunnel->o_seqno; | ||
997 | *ptr = htonl(tunnel->o_seqno); | ||
998 | ptr--; | ||
999 | } | ||
1000 | if (tunnel->parms.o_flags&GRE_KEY) { | ||
1001 | *ptr = tunnel->parms.o_key; | ||
1002 | ptr--; | ||
1003 | } | ||
1004 | /* Skip GRE checksum if skb is getting offloaded. */ | ||
1005 | if (!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE) && | ||
1006 | (tunnel->parms.o_flags&GRE_CSUM)) { | ||
1007 | int offset = skb_transport_offset(skb); | ||
1008 | |||
1009 | if (skb_has_shared_frag(skb)) { | ||
1010 | err = __skb_linearize(skb); | ||
1011 | if (err) | ||
1012 | goto tx_error; | ||
1013 | } | ||
1014 | |||
1015 | *ptr = 0; | ||
1016 | *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset, | ||
1017 | skb->len - offset, | ||
1018 | 0)); | ||
1019 | } | ||
1020 | } | 459 | } |
1021 | 460 | ||
1022 | nf_reset(skb); | 461 | __gre_xmit(skb, dev, tnl_params, skb->protocol); |
1023 | 462 | ||
1024 | pkt_len = skb->len - skb_transport_offset(skb); | ||
1025 | err = ip_local_out(skb); | ||
1026 | if (likely(net_xmit_eval(err) == 0)) { | ||
1027 | u64_stats_update_begin(&tstats->syncp); | ||
1028 | tstats->tx_bytes += pkt_len; | ||
1029 | tstats->tx_packets++; | ||
1030 | u64_stats_update_end(&tstats->syncp); | ||
1031 | } else { | ||
1032 | dev->stats.tx_errors++; | ||
1033 | dev->stats.tx_aborted_errors++; | ||
1034 | } | ||
1035 | return NETDEV_TX_OK; | 463 | return NETDEV_TX_OK; |
1036 | 464 | ||
1037 | #if IS_ENABLED(CONFIG_IPV6) | 465 | free_skb: |
1038 | tx_error_icmp: | ||
1039 | dst_link_failure(skb); | ||
1040 | #endif | ||
1041 | tx_error: | ||
1042 | dev->stats.tx_errors++; | ||
1043 | dev_kfree_skb(skb); | 466 | dev_kfree_skb(skb); |
467 | out: | ||
468 | dev->stats.tx_dropped++; | ||
1044 | return NETDEV_TX_OK; | 469 | return NETDEV_TX_OK; |
1045 | } | 470 | } |
1046 | 471 | ||
1047 | static int ipgre_tunnel_bind_dev(struct net_device *dev) | 472 | static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, |
473 | struct net_device *dev) | ||
1048 | { | 474 | { |
1049 | struct net_device *tdev = NULL; | 475 | struct ip_tunnel *tunnel = netdev_priv(dev); |
1050 | struct ip_tunnel *tunnel; | ||
1051 | const struct iphdr *iph; | ||
1052 | int hlen = LL_MAX_HEADER; | ||
1053 | int mtu = ETH_DATA_LEN; | ||
1054 | int addend = sizeof(struct iphdr) + 4; | ||
1055 | |||
1056 | tunnel = netdev_priv(dev); | ||
1057 | iph = &tunnel->parms.iph; | ||
1058 | |||
1059 | /* Guess output device to choose reasonable mtu and needed_headroom */ | ||
1060 | |||
1061 | if (iph->daddr) { | ||
1062 | struct flowi4 fl4; | ||
1063 | struct rtable *rt; | ||
1064 | |||
1065 | rt = ip_route_output_gre(dev_net(dev), &fl4, | ||
1066 | iph->daddr, iph->saddr, | ||
1067 | tunnel->parms.o_key, | ||
1068 | RT_TOS(iph->tos), | ||
1069 | tunnel->parms.link); | ||
1070 | if (!IS_ERR(rt)) { | ||
1071 | tdev = rt->dst.dev; | ||
1072 | ip_rt_put(rt); | ||
1073 | } | ||
1074 | |||
1075 | if (dev->type != ARPHRD_ETHER) | ||
1076 | dev->flags |= IFF_POINTOPOINT; | ||
1077 | } | ||
1078 | 476 | ||
1079 | if (!tdev && tunnel->parms.link) | 477 | skb = handle_offloads(tunnel, skb); |
1080 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | 478 | if (IS_ERR(skb)) |
479 | goto out; | ||
1081 | 480 | ||
1082 | if (tdev) { | 481 | if (skb_cow_head(skb, dev->needed_headroom)) |
1083 | hlen = tdev->hard_header_len + tdev->needed_headroom; | 482 | goto free_skb; |
1084 | mtu = tdev->mtu; | ||
1085 | } | ||
1086 | dev->iflink = tunnel->parms.link; | ||
1087 | |||
1088 | /* Precalculate GRE options length */ | ||
1089 | if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { | ||
1090 | if (tunnel->parms.o_flags&GRE_CSUM) | ||
1091 | addend += 4; | ||
1092 | if (tunnel->parms.o_flags&GRE_KEY) | ||
1093 | addend += 4; | ||
1094 | if (tunnel->parms.o_flags&GRE_SEQ) | ||
1095 | addend += 4; | ||
1096 | } | ||
1097 | dev->needed_headroom = addend + hlen; | ||
1098 | mtu -= dev->hard_header_len + addend; | ||
1099 | 483 | ||
1100 | if (mtu < 68) | 484 | __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); |
1101 | mtu = 68; | ||
1102 | 485 | ||
1103 | tunnel->hlen = addend; | 486 | return NETDEV_TX_OK; |
1104 | /* TCP offload with GRE SEQ is not supported. */ | ||
1105 | if (!(tunnel->parms.o_flags & GRE_SEQ)) { | ||
1106 | dev->features |= NETIF_F_GSO_SOFTWARE; | ||
1107 | dev->hw_features |= NETIF_F_GSO_SOFTWARE; | ||
1108 | } | ||
1109 | 487 | ||
1110 | return mtu; | 488 | free_skb: |
489 | dev_kfree_skb(skb); | ||
490 | out: | ||
491 | dev->stats.tx_dropped++; | ||
492 | return NETDEV_TX_OK; | ||
1111 | } | 493 | } |
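The refactor replaces this open-coded precalculation with ip_gre_calc_hlen(), called from __gre_tunnel_init() further down; its body is outside this diff. Consistent with the addend arithmetic deleted here, a plausible sketch is below. Note one deliberate change: the new tunnel->hlen no longer includes the outer IP header, which ip_tunnel.c adds back as t_hlen = tunnel->hlen + sizeof(struct iphdr).

/* Sketch (assumption): 4 bytes of base GRE header plus 4 for each
 * enabled optional field; the outer IP header is counted separately. */
static int ip_gre_calc_hlen(__be16 o_flags)
{
	int addend = 4;

	if (o_flags & TUNNEL_CSUM)
		addend += 4;
	if (o_flags & TUNNEL_KEY)
		addend += 4;
	if (o_flags & TUNNEL_SEQ)
		addend += 4;
	return addend;
}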
1112 | 494 | ||
1113 | static int | 495 | static int ipgre_tunnel_ioctl(struct net_device *dev, |
1114 | ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | 496 | struct ifreq *ifr, int cmd) |
1115 | { | 497 | { |
1116 | int err = 0; | 498 | int err = 0; |
1117 | struct ip_tunnel_parm p; | 499 | struct ip_tunnel_parm p; |
1118 | struct ip_tunnel *t; | ||
1119 | struct net *net = dev_net(dev); | ||
1120 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); | ||
1121 | |||
1122 | switch (cmd) { | ||
1123 | case SIOCGETTUNNEL: | ||
1124 | t = NULL; | ||
1125 | if (dev == ign->fb_tunnel_dev) { | ||
1126 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { | ||
1127 | err = -EFAULT; | ||
1128 | break; | ||
1129 | } | ||
1130 | t = ipgre_tunnel_locate(net, &p, 0); | ||
1131 | } | ||
1132 | if (t == NULL) | ||
1133 | t = netdev_priv(dev); | ||
1134 | memcpy(&p, &t->parms, sizeof(p)); | ||
1135 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | ||
1136 | err = -EFAULT; | ||
1137 | break; | ||
1138 | |||
1139 | case SIOCADDTUNNEL: | ||
1140 | case SIOCCHGTUNNEL: | ||
1141 | err = -EPERM; | ||
1142 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
1143 | goto done; | ||
1144 | |||
1145 | err = -EFAULT; | ||
1146 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | ||
1147 | goto done; | ||
1148 | |||
1149 | err = -EINVAL; | ||
1150 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || | ||
1151 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || | ||
1152 | ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) | ||
1153 | goto done; | ||
1154 | if (p.iph.ttl) | ||
1155 | p.iph.frag_off |= htons(IP_DF); | ||
1156 | |||
1157 | if (!(p.i_flags&GRE_KEY)) | ||
1158 | p.i_key = 0; | ||
1159 | if (!(p.o_flags&GRE_KEY)) | ||
1160 | p.o_key = 0; | ||
1161 | |||
1162 | t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | ||
1163 | |||
1164 | if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | ||
1165 | if (t != NULL) { | ||
1166 | if (t->dev != dev) { | ||
1167 | err = -EEXIST; | ||
1168 | break; | ||
1169 | } | ||
1170 | } else { | ||
1171 | unsigned int nflags = 0; | ||
1172 | |||
1173 | t = netdev_priv(dev); | ||
1174 | |||
1175 | if (ipv4_is_multicast(p.iph.daddr)) | ||
1176 | nflags = IFF_BROADCAST; | ||
1177 | else if (p.iph.daddr) | ||
1178 | nflags = IFF_POINTOPOINT; | ||
1179 | |||
1180 | if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { | ||
1181 | err = -EINVAL; | ||
1182 | break; | ||
1183 | } | ||
1184 | ipgre_tunnel_unlink(ign, t); | ||
1185 | synchronize_net(); | ||
1186 | t->parms.iph.saddr = p.iph.saddr; | ||
1187 | t->parms.iph.daddr = p.iph.daddr; | ||
1188 | t->parms.i_key = p.i_key; | ||
1189 | t->parms.o_key = p.o_key; | ||
1190 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
1191 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
1192 | ipgre_tunnel_link(ign, t); | ||
1193 | netdev_state_change(dev); | ||
1194 | } | ||
1195 | } | ||
1196 | |||
1197 | if (t) { | ||
1198 | err = 0; | ||
1199 | if (cmd == SIOCCHGTUNNEL) { | ||
1200 | t->parms.iph.ttl = p.iph.ttl; | ||
1201 | t->parms.iph.tos = p.iph.tos; | ||
1202 | t->parms.iph.frag_off = p.iph.frag_off; | ||
1203 | if (t->parms.link != p.link) { | ||
1204 | t->parms.link = p.link; | ||
1205 | dev->mtu = ipgre_tunnel_bind_dev(dev); | ||
1206 | netdev_state_change(dev); | ||
1207 | } | ||
1208 | } | ||
1209 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) | ||
1210 | err = -EFAULT; | ||
1211 | } else | ||
1212 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | ||
1213 | break; | ||
1214 | |||
1215 | case SIOCDELTUNNEL: | ||
1216 | err = -EPERM; | ||
1217 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
1218 | goto done; | ||
1219 | |||
1220 | if (dev == ign->fb_tunnel_dev) { | ||
1221 | err = -EFAULT; | ||
1222 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | ||
1223 | goto done; | ||
1224 | err = -ENOENT; | ||
1225 | if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL) | ||
1226 | goto done; | ||
1227 | err = -EPERM; | ||
1228 | if (t == netdev_priv(ign->fb_tunnel_dev)) | ||
1229 | goto done; | ||
1230 | dev = t->dev; | ||
1231 | } | ||
1232 | unregister_netdevice(dev); | ||
1233 | err = 0; | ||
1234 | break; | ||
1235 | 500 | ||
1236 | default: | 501 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
1237 | err = -EINVAL; | 502 | return -EFAULT; |
503 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || | ||
504 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || | ||
505 | ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) { | ||
506 | return -EINVAL; | ||
1238 | } | 507 | } |
508 | p.i_flags = gre_flags_to_tnl_flags(p.i_flags); | ||
509 | p.o_flags = gre_flags_to_tnl_flags(p.o_flags); | ||
1239 | 510 | ||
1240 | done: | 511 | err = ip_tunnel_ioctl(dev, &p, cmd); |
1241 | return err; | 512 | if (err) |
1242 | } | 513 | return err; |
1243 | 514 | ||
1244 | static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) | 515 | p.i_flags = tnl_flags_to_gre_flags(p.i_flags); |
1245 | { | 516 | p.o_flags = tnl_flags_to_gre_flags(p.o_flags); |
1246 | struct ip_tunnel *tunnel = netdev_priv(dev); | 517 | |
1247 | if (new_mtu < 68 || | 518 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) |
1248 | new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) | 519 | return -EFAULT; |
1249 | return -EINVAL; | ||
1250 | dev->mtu = new_mtu; | ||
1251 | return 0; | 520 | return 0; |
1252 | } | 521 | } |
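The 0xFFF8 ceiling in the deleted ndo_change_mtu is not arbitrary: it is the maximum 16-bit IP total length rounded down to the 8-byte fragmentation unit. A one-line check of the arithmetic, usable in any function as a compile-time assertion:

	BUILD_BUG_ON((65535 & ~7) != 0xFFF8);	/* 65528 == 0xFFF8 */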
1253 | 522 | ||
@@ -1277,25 +546,23 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) | |||
1277 | ... | 546 | ... |
1278 | ftp fec0:6666:6666::193.233.7.65 | 547 | ftp fec0:6666:6666::193.233.7.65 |
1279 | ... | 548 | ... |
1280 | |||
1281 | */ | 549 | */ |
1282 | |||
1283 | static int ipgre_header(struct sk_buff *skb, struct net_device *dev, | 550 | static int ipgre_header(struct sk_buff *skb, struct net_device *dev, |
1284 | unsigned short type, | 551 | unsigned short type, |
1285 | const void *daddr, const void *saddr, unsigned int len) | 552 | const void *daddr, const void *saddr, unsigned int len) |
1286 | { | 553 | { |
1287 | struct ip_tunnel *t = netdev_priv(dev); | 554 | struct ip_tunnel *t = netdev_priv(dev); |
1288 | struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); | 555 | struct iphdr *iph; |
1289 | __be16 *p = (__be16 *)(iph+1); | 556 | struct gre_base_hdr *greh; |
1290 | 557 | ||
1291 | memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); | 558 | iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); |
1292 | p[0] = t->parms.o_flags; | 559 | greh = (struct gre_base_hdr *)(iph+1); |
1293 | p[1] = htons(type); | 560 | greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags); |
561 | greh->protocol = htons(type); | ||
1294 | 562 | ||
1295 | /* | 563 | memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); |
1296 | * Set the source hardware address. | ||
1297 | */ | ||
1298 | 564 | ||
565 | /* Set the source hardware address. */ | ||
1299 | if (saddr) | 566 | if (saddr) |
1300 | memcpy(&iph->saddr, saddr, 4); | 567 | memcpy(&iph->saddr, saddr, 4); |
1301 | if (daddr) | 568 | if (daddr) |
@@ -1303,7 +570,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, | |||
1303 | if (iph->daddr) | 570 | if (iph->daddr) |
1304 | return t->hlen; | 571 | return t->hlen; |
1305 | 572 | ||
1306 | return -t->hlen; | 573 | return -(t->hlen + sizeof(*iph)); |
1307 | } | 574 | } |
1308 | 575 | ||
1309 | static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) | 576 | static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) |
@@ -1357,31 +624,21 @@ static int ipgre_close(struct net_device *dev) | |||
1357 | } | 624 | } |
1358 | return 0; | 625 | return 0; |
1359 | } | 626 | } |
1360 | |||
1361 | #endif | 627 | #endif |
1362 | 628 | ||
1363 | static const struct net_device_ops ipgre_netdev_ops = { | 629 | static const struct net_device_ops ipgre_netdev_ops = { |
1364 | .ndo_init = ipgre_tunnel_init, | 630 | .ndo_init = ipgre_tunnel_init, |
1365 | .ndo_uninit = ipgre_tunnel_uninit, | 631 | .ndo_uninit = ip_tunnel_uninit, |
1366 | #ifdef CONFIG_NET_IPGRE_BROADCAST | 632 | #ifdef CONFIG_NET_IPGRE_BROADCAST |
1367 | .ndo_open = ipgre_open, | 633 | .ndo_open = ipgre_open, |
1368 | .ndo_stop = ipgre_close, | 634 | .ndo_stop = ipgre_close, |
1369 | #endif | 635 | #endif |
1370 | .ndo_start_xmit = ipgre_tunnel_xmit, | 636 | .ndo_start_xmit = ipgre_xmit, |
1371 | .ndo_do_ioctl = ipgre_tunnel_ioctl, | 637 | .ndo_do_ioctl = ipgre_tunnel_ioctl, |
1372 | .ndo_change_mtu = ipgre_tunnel_change_mtu, | 638 | .ndo_change_mtu = ip_tunnel_change_mtu, |
1373 | .ndo_get_stats64 = ipgre_get_stats64, | 639 | .ndo_get_stats64 = ip_tunnel_get_stats64, |
1374 | }; | 640 | }; |
1375 | 641 | ||
1376 | static void ipgre_dev_free(struct net_device *dev) | ||
1377 | { | ||
1378 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
1379 | |||
1380 | gro_cells_destroy(&tunnel->gro_cells); | ||
1381 | free_percpu(dev->tstats); | ||
1382 | free_netdev(dev); | ||
1383 | } | ||
1384 | |||
1385 | #define GRE_FEATURES (NETIF_F_SG | \ | 642 | #define GRE_FEATURES (NETIF_F_SG | \ |
1386 | NETIF_F_FRAGLIST | \ | 643 | NETIF_F_FRAGLIST | \ |
1387 | NETIF_F_HIGHDMA | \ | 644 | NETIF_F_HIGHDMA | \ |
@@ -1390,35 +647,49 @@ static void ipgre_dev_free(struct net_device *dev) | |||
1390 | static void ipgre_tunnel_setup(struct net_device *dev) | 647 | static void ipgre_tunnel_setup(struct net_device *dev) |
1391 | { | 648 | { |
1392 | dev->netdev_ops = &ipgre_netdev_ops; | 649 | dev->netdev_ops = &ipgre_netdev_ops; |
1393 | dev->destructor = ipgre_dev_free; | 650 | ip_tunnel_setup(dev, ipgre_net_id); |
651 | } | ||
1394 | 652 | ||
1395 | dev->type = ARPHRD_IPGRE; | 653 | static void __gre_tunnel_init(struct net_device *dev) |
1396 | dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; | 654 | { |
655 | struct ip_tunnel *tunnel; | ||
656 | |||
657 | tunnel = netdev_priv(dev); | ||
658 | tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); | ||
659 | tunnel->parms.iph.protocol = IPPROTO_GRE; | ||
660 | |||
661 | dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; | ||
1397 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; | 662 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; |
1398 | dev->flags = IFF_NOARP; | ||
1399 | dev->iflink = 0; | 663 | dev->iflink = 0; |
1400 | dev->addr_len = 4; | ||
1401 | dev->features |= NETIF_F_NETNS_LOCAL; | ||
1402 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | ||
1403 | 664 | ||
1404 | dev->features |= GRE_FEATURES; | 665 | dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES; |
1405 | dev->hw_features |= GRE_FEATURES; | 666 | dev->hw_features |= GRE_FEATURES; |
667 | |||
668 | if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { | ||
669 | /* TCP offload with GRE SEQ is not supported. */ | ||
670 | dev->features |= NETIF_F_GSO_SOFTWARE; | ||
671 | dev->hw_features |= NETIF_F_GSO_SOFTWARE; | ||
672 | /* Can use a lockless transmit, unless we generate | ||
673 | * output sequences | ||
674 | */ | ||
675 | dev->features |= NETIF_F_LLTX; | ||
676 | } | ||
1406 | } | 677 | } |
1407 | 678 | ||
1408 | static int ipgre_tunnel_init(struct net_device *dev) | 679 | static int ipgre_tunnel_init(struct net_device *dev) |
1409 | { | 680 | { |
1410 | struct ip_tunnel *tunnel; | 681 | struct ip_tunnel *tunnel = netdev_priv(dev); |
1411 | struct iphdr *iph; | 682 | struct iphdr *iph = &tunnel->parms.iph; |
1412 | int err; | ||
1413 | 683 | ||
1414 | tunnel = netdev_priv(dev); | 684 | __gre_tunnel_init(dev); |
1415 | iph = &tunnel->parms.iph; | ||
1416 | 685 | ||
1417 | tunnel->dev = dev; | 686 | memcpy(dev->dev_addr, &iph->saddr, 4); |
1418 | strcpy(tunnel->parms.name, dev->name); | 687 | memcpy(dev->broadcast, &iph->daddr, 4); |
1419 | 688 | ||
1420 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | 689 | dev->type = ARPHRD_IPGRE; |
1421 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | 690 | dev->flags = IFF_NOARP; |
691 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | ||
692 | dev->addr_len = 4; | ||
1422 | 693 | ||
1423 | if (iph->daddr) { | 694 | if (iph->daddr) { |
1424 | #ifdef CONFIG_NET_IPGRE_BROADCAST | 695 | #ifdef CONFIG_NET_IPGRE_BROADCAST |
@@ -1432,106 +703,30 @@ static int ipgre_tunnel_init(struct net_device *dev) | |||
1432 | } else | 703 | } else |
1433 | dev->header_ops = &ipgre_header_ops; | 704 | dev->header_ops = &ipgre_header_ops; |
1434 | 705 | ||
1435 | dev->tstats = alloc_percpu(struct pcpu_tstats); | 706 | return ip_tunnel_init(dev); |
1436 | if (!dev->tstats) | ||
1437 | return -ENOMEM; | ||
1438 | |||
1439 | err = gro_cells_init(&tunnel->gro_cells, dev); | ||
1440 | if (err) { | ||
1441 | free_percpu(dev->tstats); | ||
1442 | return err; | ||
1443 | } | ||
1444 | |||
1445 | return 0; | ||
1446 | } | 707 | } |
1447 | 708 | ||
1448 | static void ipgre_fb_tunnel_init(struct net_device *dev) | ||
1449 | { | ||
1450 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
1451 | struct iphdr *iph = &tunnel->parms.iph; | ||
1452 | |||
1453 | tunnel->dev = dev; | ||
1454 | strcpy(tunnel->parms.name, dev->name); | ||
1455 | |||
1456 | iph->version = 4; | ||
1457 | iph->protocol = IPPROTO_GRE; | ||
1458 | iph->ihl = 5; | ||
1459 | tunnel->hlen = sizeof(struct iphdr) + 4; | ||
1460 | |||
1461 | dev_hold(dev); | ||
1462 | } | ||
1463 | |||
1464 | |||
1465 | static const struct gre_protocol ipgre_protocol = { | 709 | static const struct gre_protocol ipgre_protocol = { |
1466 | .handler = ipgre_rcv, | 710 | .handler = ipgre_rcv, |
1467 | .err_handler = ipgre_err, | 711 | .err_handler = ipgre_err, |
1468 | }; | 712 | }; |
1469 | 713 | ||
1470 | static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) | ||
1471 | { | ||
1472 | int prio; | ||
1473 | |||
1474 | for (prio = 0; prio < 4; prio++) { | ||
1475 | int h; | ||
1476 | for (h = 0; h < HASH_SIZE; h++) { | ||
1477 | struct ip_tunnel *t; | ||
1478 | |||
1479 | t = rtnl_dereference(ign->tunnels[prio][h]); | ||
1480 | |||
1481 | while (t != NULL) { | ||
1482 | unregister_netdevice_queue(t->dev, head); | ||
1483 | t = rtnl_dereference(t->next); | ||
1484 | } | ||
1485 | } | ||
1486 | } | ||
1487 | } | ||
1488 | |||
1489 | static int __net_init ipgre_init_net(struct net *net) | 714 | static int __net_init ipgre_init_net(struct net *net) |
1490 | { | 715 | { |
1491 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); | 716 | return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); |
1492 | int err; | ||
1493 | |||
1494 | ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", | ||
1495 | ipgre_tunnel_setup); | ||
1496 | if (!ign->fb_tunnel_dev) { | ||
1497 | err = -ENOMEM; | ||
1498 | goto err_alloc_dev; | ||
1499 | } | ||
1500 | dev_net_set(ign->fb_tunnel_dev, net); | ||
1501 | |||
1502 | ipgre_fb_tunnel_init(ign->fb_tunnel_dev); | ||
1503 | ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; | ||
1504 | |||
1505 | if ((err = register_netdev(ign->fb_tunnel_dev))) | ||
1506 | goto err_reg_dev; | ||
1507 | |||
1508 | rcu_assign_pointer(ign->tunnels_wc[0], | ||
1509 | netdev_priv(ign->fb_tunnel_dev)); | ||
1510 | return 0; | ||
1511 | |||
1512 | err_reg_dev: | ||
1513 | ipgre_dev_free(ign->fb_tunnel_dev); | ||
1514 | err_alloc_dev: | ||
1515 | return err; | ||
1516 | } | 717 | } |
1517 | 718 | ||
1518 | static void __net_exit ipgre_exit_net(struct net *net) | 719 | static void __net_exit ipgre_exit_net(struct net *net) |
1519 | { | 720 | { |
1520 | struct ipgre_net *ign; | 721 | struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); |
1521 | LIST_HEAD(list); | 722 | ip_tunnel_delete_net(itn); |
1522 | |||
1523 | ign = net_generic(net, ipgre_net_id); | ||
1524 | rtnl_lock(); | ||
1525 | ipgre_destroy_tunnels(ign, &list); | ||
1526 | unregister_netdevice_many(&list); | ||
1527 | rtnl_unlock(); | ||
1528 | } | 723 | } |
1529 | 724 | ||
1530 | static struct pernet_operations ipgre_net_ops = { | 725 | static struct pernet_operations ipgre_net_ops = { |
1531 | .init = ipgre_init_net, | 726 | .init = ipgre_init_net, |
1532 | .exit = ipgre_exit_net, | 727 | .exit = ipgre_exit_net, |
1533 | .id = &ipgre_net_id, | 728 | .id = &ipgre_net_id, |
1534 | .size = sizeof(struct ipgre_net), | 729 | .size = sizeof(struct ip_tunnel_net), |
1535 | }; | 730 | }; |
1536 | 731 | ||
1537 | static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) | 732 | static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) |
@@ -1576,8 +771,8 @@ out: | |||
1576 | return ipgre_tunnel_validate(tb, data); | 771 | return ipgre_tunnel_validate(tb, data); |
1577 | } | 772 | } |
1578 | 773 | ||
1579 | static void ipgre_netlink_parms(struct nlattr *data[], | 774 | static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], |
1580 | struct ip_tunnel_parm *parms) | 775 | struct ip_tunnel_parm *parms) |
1581 | { | 776 | { |
1582 | memset(parms, 0, sizeof(*parms)); | 777 | memset(parms, 0, sizeof(*parms)); |
1583 | 778 | ||
@@ -1590,10 +785,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], | |||
1590 | parms->link = nla_get_u32(data[IFLA_GRE_LINK]); | 785 | parms->link = nla_get_u32(data[IFLA_GRE_LINK]); |
1591 | 786 | ||
1592 | if (data[IFLA_GRE_IFLAGS]) | 787 | if (data[IFLA_GRE_IFLAGS]) |
1593 | parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); | 788 | parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); |
1594 | 789 | ||
1595 | if (data[IFLA_GRE_OFLAGS]) | 790 | if (data[IFLA_GRE_OFLAGS]) |
1596 | parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); | 791 | parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); |
1597 | 792 | ||
1598 | if (data[IFLA_GRE_IKEY]) | 793 | if (data[IFLA_GRE_IKEY]) |
1599 | parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); | 794 | parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); |
@@ -1617,148 +812,46 @@ static void ipgre_netlink_parms(struct nlattr *data[], | |||
1617 | parms->iph.frag_off = htons(IP_DF); | 812 | parms->iph.frag_off = htons(IP_DF); |
1618 | } | 813 | } |
1619 | 814 | ||
1620 | static int ipgre_tap_init(struct net_device *dev) | 815 | static int gre_tap_init(struct net_device *dev) |
1621 | { | 816 | { |
1622 | struct ip_tunnel *tunnel; | 817 | __gre_tunnel_init(dev); |
1623 | |||
1624 | tunnel = netdev_priv(dev); | ||
1625 | |||
1626 | tunnel->dev = dev; | ||
1627 | strcpy(tunnel->parms.name, dev->name); | ||
1628 | |||
1629 | ipgre_tunnel_bind_dev(dev); | ||
1630 | 818 | ||
1631 | dev->tstats = alloc_percpu(struct pcpu_tstats); | 819 | return ip_tunnel_init(dev); |
1632 | if (!dev->tstats) | ||
1633 | return -ENOMEM; | ||
1634 | |||
1635 | return 0; | ||
1636 | } | 820 | } |
1637 | 821 | ||
1638 | static const struct net_device_ops ipgre_tap_netdev_ops = { | 822 | static const struct net_device_ops gre_tap_netdev_ops = { |
1639 | .ndo_init = ipgre_tap_init, | 823 | .ndo_init = gre_tap_init, |
1640 | .ndo_uninit = ipgre_tunnel_uninit, | 824 | .ndo_uninit = ip_tunnel_uninit, |
1641 | .ndo_start_xmit = ipgre_tunnel_xmit, | 825 | .ndo_start_xmit = gre_tap_xmit, |
1642 | .ndo_set_mac_address = eth_mac_addr, | 826 | .ndo_set_mac_address = eth_mac_addr, |
1643 | .ndo_validate_addr = eth_validate_addr, | 827 | .ndo_validate_addr = eth_validate_addr, |
1644 | .ndo_change_mtu = ipgre_tunnel_change_mtu, | 828 | .ndo_change_mtu = ip_tunnel_change_mtu, |
1645 | .ndo_get_stats64 = ipgre_get_stats64, | 829 | .ndo_get_stats64 = ip_tunnel_get_stats64, |
1646 | }; | 830 | }; |
1647 | 831 | ||
1648 | static void ipgre_tap_setup(struct net_device *dev) | 832 | static void ipgre_tap_setup(struct net_device *dev) |
1649 | { | 833 | { |
1650 | |||
1651 | ether_setup(dev); | 834 | ether_setup(dev); |
1652 | 835 | dev->netdev_ops = &gre_tap_netdev_ops; | |
1653 | dev->netdev_ops = &ipgre_tap_netdev_ops; | 836 | ip_tunnel_setup(dev, gre_tap_net_id); |
1654 | dev->destructor = ipgre_dev_free; | ||
1655 | |||
1656 | dev->iflink = 0; | ||
1657 | dev->features |= NETIF_F_NETNS_LOCAL; | ||
1658 | |||
1659 | dev->features |= GRE_FEATURES; | ||
1660 | dev->hw_features |= GRE_FEATURES; | ||
1661 | } | 837 | } |
1662 | 838 | ||
1663 | static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], | 839 | static int ipgre_newlink(struct net *src_net, struct net_device *dev, |
1664 | struct nlattr *data[]) | 840 | struct nlattr *tb[], struct nlattr *data[]) |
1665 | { | 841 | { |
1666 | struct ip_tunnel *nt; | 842 | struct ip_tunnel_parm p; |
1667 | struct net *net = dev_net(dev); | ||
1668 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); | ||
1669 | int mtu; | ||
1670 | int err; | ||
1671 | |||
1672 | nt = netdev_priv(dev); | ||
1673 | ipgre_netlink_parms(data, &nt->parms); | ||
1674 | |||
1675 | if (ipgre_tunnel_find(net, &nt->parms, dev->type)) | ||
1676 | return -EEXIST; | ||
1677 | |||
1678 | if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) | ||
1679 | eth_hw_addr_random(dev); | ||
1680 | |||
1681 | mtu = ipgre_tunnel_bind_dev(dev); | ||
1682 | if (!tb[IFLA_MTU]) | ||
1683 | dev->mtu = mtu; | ||
1684 | |||
1685 | /* Can use a lockless transmit, unless we generate output sequences */ | ||
1686 | if (!(nt->parms.o_flags & GRE_SEQ)) | ||
1687 | dev->features |= NETIF_F_LLTX; | ||
1688 | |||
1689 | err = register_netdevice(dev); | ||
1690 | if (err) | ||
1691 | goto out; | ||
1692 | |||
1693 | dev_hold(dev); | ||
1694 | ipgre_tunnel_link(ign, nt); | ||
1695 | 843 | ||
1696 | out: | 844 | ipgre_netlink_parms(data, tb, &p); |
1697 | return err; | 845 | return ip_tunnel_newlink(dev, tb, &p); |
1698 | } | 846 | } |
1699 | 847 | ||
1700 | static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], | 848 | static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], |
1701 | struct nlattr *data[]) | 849 | struct nlattr *data[]) |
1702 | { | 850 | { |
1703 | struct ip_tunnel *t, *nt; | ||
1704 | struct net *net = dev_net(dev); | ||
1705 | struct ipgre_net *ign = net_generic(net, ipgre_net_id); | ||
1706 | struct ip_tunnel_parm p; | 851 | struct ip_tunnel_parm p; |
1707 | int mtu; | ||
1708 | |||
1709 | if (dev == ign->fb_tunnel_dev) | ||
1710 | return -EINVAL; | ||
1711 | |||
1712 | nt = netdev_priv(dev); | ||
1713 | ipgre_netlink_parms(data, &p); | ||
1714 | |||
1715 | t = ipgre_tunnel_locate(net, &p, 0); | ||
1716 | |||
1717 | if (t) { | ||
1718 | if (t->dev != dev) | ||
1719 | return -EEXIST; | ||
1720 | } else { | ||
1721 | t = nt; | ||
1722 | |||
1723 | if (dev->type != ARPHRD_ETHER) { | ||
1724 | unsigned int nflags = 0; | ||
1725 | |||
1726 | if (ipv4_is_multicast(p.iph.daddr)) | ||
1727 | nflags = IFF_BROADCAST; | ||
1728 | else if (p.iph.daddr) | ||
1729 | nflags = IFF_POINTOPOINT; | ||
1730 | |||
1731 | if ((dev->flags ^ nflags) & | ||
1732 | (IFF_POINTOPOINT | IFF_BROADCAST)) | ||
1733 | return -EINVAL; | ||
1734 | } | ||
1735 | 852 | ||
1736 | ipgre_tunnel_unlink(ign, t); | 853 | ipgre_netlink_parms(data, tb, &p); |
1737 | t->parms.iph.saddr = p.iph.saddr; | 854 | return ip_tunnel_changelink(dev, tb, &p); |
1738 | t->parms.iph.daddr = p.iph.daddr; | ||
1739 | t->parms.i_key = p.i_key; | ||
1740 | if (dev->type != ARPHRD_ETHER) { | ||
1741 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
1742 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
1743 | } | ||
1744 | ipgre_tunnel_link(ign, t); | ||
1745 | netdev_state_change(dev); | ||
1746 | } | ||
1747 | |||
1748 | t->parms.o_key = p.o_key; | ||
1749 | t->parms.iph.ttl = p.iph.ttl; | ||
1750 | t->parms.iph.tos = p.iph.tos; | ||
1751 | t->parms.iph.frag_off = p.iph.frag_off; | ||
1752 | |||
1753 | if (t->parms.link != p.link) { | ||
1754 | t->parms.link = p.link; | ||
1755 | mtu = ipgre_tunnel_bind_dev(dev); | ||
1756 | if (!tb[IFLA_MTU]) | ||
1757 | dev->mtu = mtu; | ||
1758 | netdev_state_change(dev); | ||
1759 | } | ||
1760 | |||
1761 | return 0; | ||
1762 | } | 855 | } |
1763 | 856 | ||
1764 | static size_t ipgre_get_size(const struct net_device *dev) | 857 | static size_t ipgre_get_size(const struct net_device *dev) |
@@ -1793,8 +886,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) | |||
1793 | struct ip_tunnel_parm *p = &t->parms; | 886 | struct ip_tunnel_parm *p = &t->parms; |
1794 | 887 | ||
1795 | if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || | 888 | if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || |
1796 | nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || | 889 | nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || |
1797 | nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || | 890 | nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || |
1798 | nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || | 891 | nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || |
1799 | nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || | 892 | nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || |
1800 | nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || | 893 | nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || |
@@ -1832,6 +925,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = { | |||
1832 | .validate = ipgre_tunnel_validate, | 925 | .validate = ipgre_tunnel_validate, |
1833 | .newlink = ipgre_newlink, | 926 | .newlink = ipgre_newlink, |
1834 | .changelink = ipgre_changelink, | 927 | .changelink = ipgre_changelink, |
928 | .dellink = ip_tunnel_dellink, | ||
1835 | .get_size = ipgre_get_size, | 929 | .get_size = ipgre_get_size, |
1836 | .fill_info = ipgre_fill_info, | 930 | .fill_info = ipgre_fill_info, |
1837 | }; | 931 | }; |
@@ -1845,13 +939,28 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { | |||
1845 | .validate = ipgre_tap_validate, | 939 | .validate = ipgre_tap_validate, |
1846 | .newlink = ipgre_newlink, | 940 | .newlink = ipgre_newlink, |
1847 | .changelink = ipgre_changelink, | 941 | .changelink = ipgre_changelink, |
942 | .dellink = ip_tunnel_dellink, | ||
1848 | .get_size = ipgre_get_size, | 943 | .get_size = ipgre_get_size, |
1849 | .fill_info = ipgre_fill_info, | 944 | .fill_info = ipgre_fill_info, |
1850 | }; | 945 | }; |
1851 | 946 | ||
1852 | /* | 947 | static int __net_init ipgre_tap_init_net(struct net *net) |
1853 | * And now the modules code and kernel interface. | 948 | { |
1854 | */ | 949 | return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); |
950 | } | ||
951 | |||
952 | static void __net_exit ipgre_tap_exit_net(struct net *net) | ||
953 | { | ||
954 | struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); | ||
955 | ip_tunnel_delete_net(itn); | ||
956 | } | ||
957 | |||
958 | static struct pernet_operations ipgre_tap_net_ops = { | ||
959 | .init = ipgre_tap_init_net, | ||
960 | .exit = ipgre_tap_exit_net, | ||
961 | .id = &gre_tap_net_id, | ||
962 | .size = sizeof(struct ip_tunnel_net), | ||
963 | }; | ||
1855 | 964 | ||
1856 | static int __init ipgre_init(void) | 965 | static int __init ipgre_init(void) |
1857 | { | 966 | { |
@@ -1863,6 +972,10 @@ static int __init ipgre_init(void) | |||
1863 | if (err < 0) | 972 | if (err < 0) |
1864 | return err; | 973 | return err; |
1865 | 974 | ||
975 | err = register_pernet_device(&ipgre_tap_net_ops); | ||
976 | if (err < 0) | ||
976 | goto pnet_tap_failed; | ||
978 | |||
1866 | err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); | 979 | err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); |
1867 | if (err < 0) { | 980 | if (err < 0) { |
1868 | pr_info("%s: can't add protocol\n", __func__); | 981 | pr_info("%s: can't add protocol\n", __func__); |
@@ -1877,16 +990,17 @@ static int __init ipgre_init(void) | |||
1877 | if (err < 0) | 990 | if (err < 0) |
1878 | goto tap_ops_failed; | 991 | goto tap_ops_failed; |
1879 | 992 | ||
1880 | out: | 993 | return 0; |
1881 | return err; | ||
1882 | 994 | ||
1883 | tap_ops_failed: | 995 | tap_ops_failed: |
1884 | rtnl_link_unregister(&ipgre_link_ops); | 996 | rtnl_link_unregister(&ipgre_link_ops); |
1885 | rtnl_link_failed: | 997 | rtnl_link_failed: |
1886 | gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); | 998 | gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); |
1887 | add_proto_failed: | 999 | add_proto_failed: |
1000 | unregister_pernet_device(&ipgre_tap_net_ops); | ||
1001 | pnet_tap_failed: | ||
1888 | unregister_pernet_device(&ipgre_net_ops); | 1002 | unregister_pernet_device(&ipgre_net_ops); |
1889 | goto out; | 1003 | return err; |
1890 | } | 1004 | } |
1891 | 1005 | ||
1892 | static void __exit ipgre_fini(void) | 1006 | static void __exit ipgre_fini(void) |
@@ -1895,6 +1009,7 @@ static void __exit ipgre_fini(void) | |||
1895 | rtnl_link_unregister(&ipgre_link_ops); | 1009 | rtnl_link_unregister(&ipgre_link_ops); |
1896 | if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) | 1010 | if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) |
1897 | pr_info("%s: can't remove protocol\n", __func__); | 1011 | pr_info("%s: can't remove protocol\n", __func__); |
1012 | unregister_pernet_device(&ipgre_tap_net_ops); | ||
1898 | unregister_pernet_device(&ipgre_net_ops); | 1013 | unregister_pernet_device(&ipgre_net_ops); |
1899 | } | 1014 | } |
1900 | 1015 | ||
@@ -1904,3 +1019,4 @@ MODULE_LICENSE("GPL"); | |||
1904 | MODULE_ALIAS_RTNL_LINK("gre"); | 1019 | MODULE_ALIAS_RTNL_LINK("gre"); |
1905 | MODULE_ALIAS_RTNL_LINK("gretap"); | 1020 | MODULE_ALIAS_RTNL_LINK("gretap"); |
1906 | MODULE_ALIAS_NETDEV("gre0"); | 1021 | MODULE_ALIAS_NETDEV("gre0"); |
1022 | MODULE_ALIAS_NETDEV("gretap0"); | ||
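The rewritten ip_gre.c converts repeatedly between the on-wire GRE flag bits (GRE_CSUM, GRE_KEY, GRE_SEQ, ...) and the kernel-internal TUNNEL_* bits through gre_flags_to_tnl_flags() and tnl_flags_to_gre_flags(), neither of which appears in this diff. Assuming a straight bit-for-bit mapping, one direction plausibly looks like this (the inverse helper would do the same in reverse):

/* Sketch (assumption): wire GRE flags -> internal TUNNEL_* flags. */
static __be16 gre_flags_to_tnl_flags(__be16 flags)
{
	__be16 tflags = 0;

	if (flags & GRE_CSUM)
		tflags |= TUNNEL_CSUM;
	if (flags & GRE_ROUTING)
		tflags |= TUNNEL_ROUTING;
	if (flags & GRE_KEY)
		tflags |= TUNNEL_KEY;
	if (flags & GRE_SEQ)
		tflags |= TUNNEL_SEQ;
	return tflags;
}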
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c new file mode 100644 index 000000000000..9d96b6853f21 --- /dev/null +++ b/net/ipv4/ip_tunnel.c | |||
@@ -0,0 +1,1035 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2013 Nicira, Inc. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | ||
16 | * 02110-1301, USA | ||
17 | */ | ||
18 | |||
19 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
20 | |||
21 | #include <linux/capability.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/types.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <linux/skbuff.h> | ||
28 | #include <linux/netdevice.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/tcp.h> | ||
31 | #include <linux/udp.h> | ||
32 | #include <linux/if_arp.h> | ||
33 | #include <linux/mroute.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/in6.h> | ||
36 | #include <linux/inetdevice.h> | ||
37 | #include <linux/igmp.h> | ||
38 | #include <linux/netfilter_ipv4.h> | ||
39 | #include <linux/etherdevice.h> | ||
40 | #include <linux/if_ether.h> | ||
41 | #include <linux/if_vlan.h> | ||
42 | #include <linux/rculist.h> | ||
43 | |||
44 | #include <net/sock.h> | ||
45 | #include <net/ip.h> | ||
46 | #include <net/icmp.h> | ||
47 | #include <net/protocol.h> | ||
48 | #include <net/ip_tunnels.h> | ||
49 | #include <net/arp.h> | ||
50 | #include <net/checksum.h> | ||
51 | #include <net/dsfield.h> | ||
52 | #include <net/inet_ecn.h> | ||
53 | #include <net/xfrm.h> | ||
54 | #include <net/net_namespace.h> | ||
55 | #include <net/netns/generic.h> | ||
56 | #include <net/rtnetlink.h> | ||
57 | |||
58 | #if IS_ENABLED(CONFIG_IPV6) | ||
59 | #include <net/ipv6.h> | ||
60 | #include <net/ip6_fib.h> | ||
61 | #include <net/ip6_route.h> | ||
62 | #endif | ||
63 | |||
64 | static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, | ||
65 | __be32 key, __be32 remote) | ||
66 | { | ||
67 | return hash_32((__force u32)key ^ (__force u32)remote, | ||
68 | IP_TNL_HASH_BITS); | ||
69 | } | ||
70 | |||
71 | /* Often-modified stats are per-cpu; others are shared (netdev->stats) */ | ||
72 | struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, | ||
73 | struct rtnl_link_stats64 *tot) | ||
74 | { | ||
75 | int i; | ||
76 | |||
77 | for_each_possible_cpu(i) { | ||
78 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | ||
79 | u64 rx_packets, rx_bytes, tx_packets, tx_bytes; | ||
80 | unsigned int start; | ||
81 | |||
82 | do { | ||
83 | start = u64_stats_fetch_begin_bh(&tstats->syncp); | ||
84 | rx_packets = tstats->rx_packets; | ||
85 | tx_packets = tstats->tx_packets; | ||
86 | rx_bytes = tstats->rx_bytes; | ||
87 | tx_bytes = tstats->tx_bytes; | ||
88 | } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); | ||
89 | |||
90 | tot->rx_packets += rx_packets; | ||
91 | tot->tx_packets += tx_packets; | ||
92 | tot->rx_bytes += rx_bytes; | ||
93 | tot->tx_bytes += tx_bytes; | ||
94 | } | ||
95 | |||
96 | tot->multicast = dev->stats.multicast; | ||
97 | |||
98 | tot->rx_crc_errors = dev->stats.rx_crc_errors; | ||
99 | tot->rx_fifo_errors = dev->stats.rx_fifo_errors; | ||
100 | tot->rx_length_errors = dev->stats.rx_length_errors; | ||
101 | tot->rx_frame_errors = dev->stats.rx_frame_errors; | ||
102 | tot->rx_errors = dev->stats.rx_errors; | ||
103 | |||
104 | tot->tx_fifo_errors = dev->stats.tx_fifo_errors; | ||
105 | tot->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
106 | tot->tx_dropped = dev->stats.tx_dropped; | ||
107 | tot->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
108 | tot->tx_errors = dev->stats.tx_errors; | ||
109 | |||
110 | tot->collisions = dev->stats.collisions; | ||
111 | |||
112 | return tot; | ||
113 | } | ||
114 | EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); | ||
115 | |||
116 | static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, | ||
117 | __be16 flags, __be32 key) | ||
118 | { | ||
119 | if (p->i_flags & TUNNEL_KEY) { | ||
120 | if (flags & TUNNEL_KEY) | ||
121 | return key == p->i_key; | ||
122 | else | ||
123 | /* key expected, none present */ | ||
124 | return false; | ||
125 | } else | ||
126 | return !(flags & TUNNEL_KEY); | ||
127 | } | ||
128 | |||
129 | /* Fallback tunnel: no source, no destination, no key, no options | ||
130 | |||
131 | Tunnel hash table: | ||
132 | We require an exact key match, i.e. if a key is present in the packet | ||
133 | it will match only a tunnel with the same key; if no key is present, | ||
134 | it will match only a keyless tunnel. | ||
135 |||
136 | All keyless packets that do not match a configured keyless tunnel | ||
137 | will match the fallback tunnel. | ||
138 | Given src, dst and key, find the appropriate input tunnel. | ||
139 | */ | ||
140 | struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, | ||
141 | int link, __be16 flags, | ||
142 | __be32 remote, __be32 local, | ||
143 | __be32 key) | ||
144 | { | ||
145 | unsigned int hash; | ||
146 | struct ip_tunnel *t, *cand = NULL; | ||
147 | struct hlist_head *head; | ||
148 | |||
149 | hash = ip_tunnel_hash(itn, key, remote); | ||
150 | head = &itn->tunnels[hash]; | ||
151 | |||
152 | hlist_for_each_entry_rcu(t, head, hash_node) { | ||
153 | if (local != t->parms.iph.saddr || | ||
154 | remote != t->parms.iph.daddr || | ||
155 | !(t->dev->flags & IFF_UP)) | ||
156 | continue; | ||
157 | |||
158 | if (!ip_tunnel_key_match(&t->parms, flags, key)) | ||
159 | continue; | ||
160 | |||
161 | if (t->parms.link == link) | ||
162 | return t; | ||
163 | else | ||
164 | cand = t; | ||
165 | } | ||
166 | |||
167 | hlist_for_each_entry_rcu(t, head, hash_node) { | ||
168 | if (remote != t->parms.iph.daddr || | ||
169 | !(t->dev->flags & IFF_UP)) | ||
170 | continue; | ||
171 | |||
172 | if (!ip_tunnel_key_match(&t->parms, flags, key)) | ||
173 | continue; | ||
174 | |||
175 | if (t->parms.link == link) | ||
176 | return t; | ||
177 | else if (!cand) | ||
178 | cand = t; | ||
179 | } | ||
180 | |||
181 | hash = ip_tunnel_hash(itn, key, 0); | ||
182 | head = &itn->tunnels[hash]; | ||
183 | |||
184 | hlist_for_each_entry_rcu(t, head, hash_node) { | ||
185 | if ((local != t->parms.iph.saddr && | ||
186 | (local != t->parms.iph.daddr || | ||
187 | !ipv4_is_multicast(local))) || | ||
188 | !(t->dev->flags & IFF_UP)) | ||
189 | continue; | ||
190 | |||
191 | if (!ip_tunnel_key_match(&t->parms, flags, key)) | ||
192 | continue; | ||
193 | |||
194 | if (t->parms.link == link) | ||
195 | return t; | ||
196 | else if (!cand) | ||
197 | cand = t; | ||
198 | } | ||
199 | |||
200 | if (flags & TUNNEL_NO_KEY) | ||
201 | goto skip_key_lookup; | ||
202 | |||
203 | hlist_for_each_entry_rcu(t, head, hash_node) { | ||
204 | if (t->parms.i_key != key || | ||
205 | !(t->dev->flags & IFF_UP)) | ||
206 | continue; | ||
207 | |||
208 | if (t->parms.link == link) | ||
209 | return t; | ||
210 | else if (!cand) | ||
211 | cand = t; | ||
212 | } | ||
213 | |||
214 | skip_key_lookup: | ||
215 | if (cand) | ||
216 | return cand; | ||
217 | |||
218 | if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) | ||
219 | return netdev_priv(itn->fb_tunnel_dev); | ||
220 | |||
221 | |||
222 | return NULL; | ||
223 | } | ||
224 | EXPORT_SYMBOL_GPL(ip_tunnel_lookup); | ||
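The four passes above implement a best-match order: exact (local, remote, key), then remote plus key, then local/multicast plus key, then key only, always preferring a candidate on the same link and resorting to the per-namespace fallback device last. A hypothetical receive-side call (variable names invented for illustration; the signature matches ip_tunnel_rcv() below):

/* Hypothetical usage sketch: iph is the outer IP header, tpi the
 * parsed tunnel info.  A keyed packet matches only a tunnel with
 * the same key; a keyless packet never matches a keyed tunnel. */
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			  iph->saddr, iph->daddr, tpi->key);
if (tunnel)
	ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);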
225 | |||
226 | static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, | ||
227 | struct ip_tunnel_parm *parms) | ||
228 | { | ||
229 | unsigned int h; | ||
230 | __be32 remote; | ||
231 | |||
232 | if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) | ||
233 | remote = parms->iph.daddr; | ||
234 | else | ||
235 | remote = 0; | ||
236 | |||
237 | h = ip_tunnel_hash(itn, parms->i_key, remote); | ||
238 | return &itn->tunnels[h]; | ||
239 | } | ||
240 | |||
241 | static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) | ||
242 | { | ||
243 | struct hlist_head *head = ip_bucket(itn, &t->parms); | ||
244 | |||
245 | hlist_add_head_rcu(&t->hash_node, head); | ||
246 | } | ||
247 | |||
248 | static void ip_tunnel_del(struct ip_tunnel *t) | ||
249 | { | ||
250 | hlist_del_init_rcu(&t->hash_node); | ||
251 | } | ||
252 | |||
253 | static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, | ||
254 | struct ip_tunnel_parm *parms, | ||
255 | int type) | ||
256 | { | ||
257 | __be32 remote = parms->iph.daddr; | ||
258 | __be32 local = parms->iph.saddr; | ||
259 | __be32 key = parms->i_key; | ||
260 | int link = parms->link; | ||
261 | struct ip_tunnel *t = NULL; | ||
262 | struct hlist_head *head = ip_bucket(itn, parms); | ||
263 | |||
264 | hlist_for_each_entry_rcu(t, head, hash_node) { | ||
265 | if (local == t->parms.iph.saddr && | ||
266 | remote == t->parms.iph.daddr && | ||
267 | key == t->parms.i_key && | ||
268 | link == t->parms.link && | ||
269 | type == t->dev->type) | ||
270 | break; | ||
271 | } | ||
272 | return t; | ||
273 | } | ||
274 | |||
275 | static struct net_device *__ip_tunnel_create(struct net *net, | ||
276 | const struct rtnl_link_ops *ops, | ||
277 | struct ip_tunnel_parm *parms) | ||
278 | { | ||
279 | int err; | ||
280 | struct ip_tunnel *tunnel; | ||
281 | struct net_device *dev; | ||
282 | char name[IFNAMSIZ]; | ||
283 | |||
284 | if (parms->name[0]) | ||
285 | strlcpy(name, parms->name, IFNAMSIZ); | ||
286 | else { | ||
287 | if (strlen(ops->kind) + 3 >= IFNAMSIZ) { | ||
288 | err = -E2BIG; | ||
289 | goto failed; | ||
290 | } | ||
291 | strlcpy(name, ops->kind, IFNAMSIZ); | ||
292 | strncat(name, "%d", 2); | ||
293 | } | ||
294 | |||
295 | ASSERT_RTNL(); | ||
296 | dev = alloc_netdev(ops->priv_size, name, ops->setup); | ||
297 | if (!dev) { | ||
298 | err = -ENOMEM; | ||
299 | goto failed; | ||
300 | } | ||
301 | dev_net_set(dev, net); | ||
302 | |||
303 | dev->rtnl_link_ops = ops; | ||
304 | |||
305 | tunnel = netdev_priv(dev); | ||
306 | tunnel->parms = *parms; | ||
307 | |||
308 | err = register_netdevice(dev); | ||
309 | if (err) | ||
310 | goto failed_free; | ||
311 | |||
312 | return dev; | ||
313 | |||
314 | failed_free: | ||
315 | free_netdev(dev); | ||
316 | failed: | ||
317 | return ERR_PTR(err); | ||
318 | } | ||
319 | |||
320 | static inline struct rtable *ip_route_output_tunnel(struct net *net, | ||
321 | struct flowi4 *fl4, | ||
322 | int proto, | ||
323 | __be32 daddr, __be32 saddr, | ||
324 | __be32 key, __u8 tos, int oif) | ||
325 | { | ||
326 | memset(fl4, 0, sizeof(*fl4)); | ||
327 | fl4->flowi4_oif = oif; | ||
328 | fl4->daddr = daddr; | ||
329 | fl4->saddr = saddr; | ||
330 | fl4->flowi4_tos = tos; | ||
331 | fl4->flowi4_proto = proto; | ||
332 | fl4->fl4_gre_key = key; | ||
333 | return ip_route_output_key(net, fl4); | ||
334 | } | ||
335 | |||
336 | static int ip_tunnel_bind_dev(struct net_device *dev) | ||
337 | { | ||
338 | struct net_device *tdev = NULL; | ||
339 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
340 | const struct iphdr *iph; | ||
341 | int hlen = LL_MAX_HEADER; | ||
342 | int mtu = ETH_DATA_LEN; | ||
343 | int t_hlen = tunnel->hlen + sizeof(struct iphdr); | ||
344 | |||
345 | iph = &tunnel->parms.iph; | ||
346 | |||
347 | /* Guess output device to choose reasonable mtu and needed_headroom */ | ||
348 | if (iph->daddr) { | ||
349 | struct flowi4 fl4; | ||
350 | struct rtable *rt; | ||
351 | |||
352 | rt = ip_route_output_tunnel(dev_net(dev), &fl4, | ||
353 | tunnel->parms.iph.protocol, | ||
354 | iph->daddr, iph->saddr, | ||
355 | tunnel->parms.o_key, | ||
356 | RT_TOS(iph->tos), | ||
357 | tunnel->parms.link); | ||
358 | if (!IS_ERR(rt)) { | ||
359 | tdev = rt->dst.dev; | ||
360 | ip_rt_put(rt); | ||
361 | } | ||
362 | if (dev->type != ARPHRD_ETHER) | ||
363 | dev->flags |= IFF_POINTOPOINT; | ||
364 | } | ||
365 | |||
366 | if (!tdev && tunnel->parms.link) | ||
367 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | ||
368 | |||
369 | if (tdev) { | ||
370 | hlen = tdev->hard_header_len + tdev->needed_headroom; | ||
371 | mtu = tdev->mtu; | ||
372 | } | ||
373 | dev->iflink = tunnel->parms.link; | ||
374 | |||
375 | dev->needed_headroom = t_hlen + hlen; | ||
376 | mtu -= (dev->hard_header_len + t_hlen); | ||
377 | |||
378 | if (mtu < 68) | ||
379 | mtu = 68; | ||
380 | |||
381 | return mtu; | ||
382 | } | ||
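A worked example of the arithmetic above, assuming a keyed GRE tunnel whose route resolves to an Ethernet device:

/* Assumed setup: tdev->mtu = 1500, the tunnel netdev itself has
 * hard_header_len = 0, tunnel->hlen = 8 (4-byte base GRE header +
 * 4-byte key), so t_hlen = 8 + 20 = 28 and
 *	mtu = 1500 - (0 + 28) = 1472
 * the familiar MTU of a keyed GRE tunnel over Ethernet (1476 with
 * no key configured). */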
383 | |||
384 | static struct ip_tunnel *ip_tunnel_create(struct net *net, | ||
385 | struct ip_tunnel_net *itn, | ||
386 | struct ip_tunnel_parm *parms) | ||
387 | { | ||
388 | struct ip_tunnel *nt, *fbt; | ||
389 | struct net_device *dev; | ||
390 | |||
391 | BUG_ON(!itn->fb_tunnel_dev); | ||
392 | fbt = netdev_priv(itn->fb_tunnel_dev); | ||
393 | dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); | ||
394 | if (IS_ERR(dev)) | ||
395 | return NULL; | ||
396 | |||
397 | dev->mtu = ip_tunnel_bind_dev(dev); | ||
398 | |||
399 | nt = netdev_priv(dev); | ||
400 | ip_tunnel_add(itn, nt); | ||
401 | return nt; | ||
402 | } | ||
403 | |||
404 | int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, | ||
405 | const struct tnl_ptk_info *tpi, bool log_ecn_error) | ||
406 | { | ||
407 | struct pcpu_tstats *tstats; | ||
408 | const struct iphdr *iph = ip_hdr(skb); | ||
409 | int err; | ||
410 | |||
411 | secpath_reset(skb); | ||
412 | |||
413 | skb->protocol = tpi->proto; | ||
414 | |||
415 | skb->mac_header = skb->network_header; | ||
416 | __pskb_pull(skb, tunnel->hlen); | ||
417 | skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen); | ||
418 | #ifdef CONFIG_NET_IPGRE_BROADCAST | ||
419 | if (ipv4_is_multicast(iph->daddr)) { | ||
420 | /* Looped back packet, drop it! */ | ||
421 | if (rt_is_output_route(skb_rtable(skb))) | ||
422 | goto drop; | ||
423 | tunnel->dev->stats.multicast++; | ||
424 | skb->pkt_type = PACKET_BROADCAST; | ||
425 | } | ||
426 | #endif | ||
427 | |||
428 | if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || | ||
429 | ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { | ||
430 | tunnel->dev->stats.rx_crc_errors++; | ||
431 | tunnel->dev->stats.rx_errors++; | ||
432 | goto drop; | ||
433 | } | ||
434 | |||
435 | if (tunnel->parms.i_flags&TUNNEL_SEQ) { | ||
436 | if (!(tpi->flags&TUNNEL_SEQ) || | ||
437 | (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { | ||
438 | tunnel->dev->stats.rx_fifo_errors++; | ||
439 | tunnel->dev->stats.rx_errors++; | ||
440 | goto drop; | ||
441 | } | ||
442 | tunnel->i_seqno = ntohl(tpi->seq) + 1; | ||
443 | } | ||
444 | |||
445 | /* Warning: All skb pointers will be invalidated! */ | ||
446 | if (tunnel->dev->type == ARPHRD_ETHER) { | ||
447 | if (!pskb_may_pull(skb, ETH_HLEN)) { | ||
448 | tunnel->dev->stats.rx_length_errors++; | ||
449 | tunnel->dev->stats.rx_errors++; | ||
450 | goto drop; | ||
451 | } | ||
452 | |||
453 | iph = ip_hdr(skb); | ||
454 | skb->protocol = eth_type_trans(skb, tunnel->dev); | ||
455 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); | ||
456 | } | ||
457 | |||
458 | skb->pkt_type = PACKET_HOST; | ||
459 | __skb_tunnel_rx(skb, tunnel->dev); | ||
460 | |||
461 | skb_reset_network_header(skb); | ||
462 | err = IP_ECN_decapsulate(iph, skb); | ||
463 | if (unlikely(err)) { | ||
464 | if (log_ecn_error) | ||
465 | net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", | ||
466 | &iph->saddr, iph->tos); | ||
467 | if (err > 1) { | ||
468 | ++tunnel->dev->stats.rx_frame_errors; | ||
469 | ++tunnel->dev->stats.rx_errors; | ||
470 | goto drop; | ||
471 | } | ||
472 | } | ||
473 | |||
474 | tstats = this_cpu_ptr(tunnel->dev->tstats); | ||
475 | u64_stats_update_begin(&tstats->syncp); | ||
476 | tstats->rx_packets++; | ||
477 | tstats->rx_bytes += skb->len; | ||
478 | u64_stats_update_end(&tstats->syncp); | ||
479 | |||
480 | gro_cells_receive(&tunnel->gro_cells, skb); | ||
481 | return 0; | ||
482 | |||
483 | drop: | ||
484 | kfree_skb(skb); | ||
485 | return 0; | ||
486 | } | ||
487 | EXPORT_SYMBOL_GPL(ip_tunnel_rcv); | ||
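The sequence check in ip_tunnel_rcv() above, (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0, is serial-number arithmetic: subtracting in u32 and reading the result as s32 keeps the "older than" comparison correct across counter wraparound. For example:

/* With i_seqno = 0xfffffffe, an arriving seq = 0x00000001 yields
 * 0x00000001 - 0xfffffffe = 0x00000003 (+3 as s32), so the packet
 * is accepted as newer despite the wrap; a stale seq = 0xfffffff0
 * yields 0xfffffff2 (-14 as s32) and is dropped. */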
488 | |||
489 | void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | ||
490 | const struct iphdr *tnl_params) | ||
491 | { | ||
492 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
493 | const struct iphdr *inner_iph; | ||
494 | struct iphdr *iph; | ||
495 | struct flowi4 fl4; | ||
496 | u8 tos, ttl; | ||
497 | __be16 df; | ||
498 | struct rtable *rt; /* Route to the other host */ | ||
499 | struct net_device *tdev; /* Device to other host */ | ||
500 | unsigned int max_headroom; /* The extra header space needed */ | ||
501 | __be32 dst; | ||
502 | int mtu; | ||
503 | |||
504 | inner_iph = (const struct iphdr *)skb_inner_network_header(skb); | ||
505 | |||
506 | dst = tnl_params->daddr; | ||
507 | if (dst == 0) { | ||
508 | /* NBMA tunnel */ | ||
509 | |||
510 | if (skb_dst(skb) == NULL) { | ||
511 | dev->stats.tx_fifo_errors++; | ||
512 | goto tx_error; | ||
513 | } | ||
514 | |||
515 | if (skb->protocol == htons(ETH_P_IP)) { | ||
516 | rt = skb_rtable(skb); | ||
517 | dst = rt_nexthop(rt, inner_iph->daddr); | ||
518 | } | ||
519 | #if IS_ENABLED(CONFIG_IPV6) | ||
520 | else if (skb->protocol == htons(ETH_P_IPV6)) { | ||
521 | const struct in6_addr *addr6; | ||
522 | struct neighbour *neigh; | ||
523 | bool do_tx_error_icmp; | ||
524 | int addr_type; | ||
525 | |||
526 | neigh = dst_neigh_lookup(skb_dst(skb), | ||
527 | &ipv6_hdr(skb)->daddr); | ||
528 | if (neigh == NULL) | ||
529 | goto tx_error; | ||
530 | |||
531 | addr6 = (const struct in6_addr *)&neigh->primary_key; | ||
532 | addr_type = ipv6_addr_type(addr6); | ||
533 | |||
534 | if (addr_type == IPV6_ADDR_ANY) { | ||
535 | addr6 = &ipv6_hdr(skb)->daddr; | ||
536 | addr_type = ipv6_addr_type(addr6); | ||
537 | } | ||
538 | |||
539 | if ((addr_type & IPV6_ADDR_COMPATv4) == 0) | ||
540 | do_tx_error_icmp = true; | ||
541 | else { | ||
542 | do_tx_error_icmp = false; | ||
543 | dst = addr6->s6_addr32[3]; | ||
544 | } | ||
545 | neigh_release(neigh); | ||
546 | if (do_tx_error_icmp) | ||
547 | goto tx_error_icmp; | ||
548 | } | ||
549 | #endif | ||
550 | else | ||
551 | goto tx_error; | ||
552 | } | ||
553 | |||
554 | tos = tnl_params->tos; | ||
555 | if (tos & 0x1) { | ||
556 | tos &= ~0x1; | ||
557 | if (skb->protocol == htons(ETH_P_IP)) | ||
558 | tos = inner_iph->tos; | ||
559 | else if (skb->protocol == htons(ETH_P_IPV6)) | ||
560 | tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); | ||
561 | } | ||
562 | |||
563 | rt = ip_route_output_tunnel(dev_net(dev), &fl4, | ||
564 | tunnel->parms.iph.protocol, | ||
565 | dst, tnl_params->saddr, | ||
566 | tunnel->parms.o_key, | ||
567 | RT_TOS(tos), | ||
568 | tunnel->parms.link); | ||
569 | if (IS_ERR(rt)) { | ||
570 | dev->stats.tx_carrier_errors++; | ||
571 | goto tx_error; | ||
572 | } | ||
573 | tdev = rt->dst.dev; | ||
574 | |||
575 | if (tdev == dev) { | ||
576 | ip_rt_put(rt); | ||
577 | dev->stats.collisions++; | ||
578 | goto tx_error; | ||
579 | } | ||
580 | |||
581 | df = tnl_params->frag_off; | ||
582 | |||
583 | if (df) | ||
584 | mtu = dst_mtu(&rt->dst) - dev->hard_header_len | ||
585 | - sizeof(struct iphdr); | ||
586 | else | ||
587 | mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; | ||
588 | |||
589 | if (skb_dst(skb)) | ||
590 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); | ||
591 | |||
592 | if (skb->protocol == htons(ETH_P_IP)) { | ||
593 | df |= (inner_iph->frag_off&htons(IP_DF)); | ||
594 | |||
595 | if (!skb_is_gso(skb) && | ||
596 | (inner_iph->frag_off&htons(IP_DF)) && | ||
597 | mtu < ntohs(inner_iph->tot_len)) { | ||
598 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | ||
599 | ip_rt_put(rt); | ||
600 | goto tx_error; | ||
601 | } | ||
602 | } | ||
603 | #if IS_ENABLED(CONFIG_IPV6) | ||
604 | else if (skb->protocol == htons(ETH_P_IPV6)) { | ||
605 | struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); | ||
606 | |||
607 | if (rt6 && mtu < dst_mtu(skb_dst(skb)) && | ||
608 | mtu >= IPV6_MIN_MTU) { | ||
609 | if ((tunnel->parms.iph.daddr && | ||
610 | !ipv4_is_multicast(tunnel->parms.iph.daddr)) || | ||
611 | rt6->rt6i_dst.plen == 128) { | ||
612 | rt6->rt6i_flags |= RTF_MODIFIED; | ||
613 | dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); | ||
614 | } | ||
615 | } | ||
616 | |||
617 | if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && | ||
618 | mtu < skb->len) { | ||
619 | icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); | ||
620 | ip_rt_put(rt); | ||
621 | goto tx_error; | ||
622 | } | ||
623 | } | ||
624 | #endif | ||
625 | |||
626 | if (tunnel->err_count > 0) { | ||
627 | if (time_before(jiffies, | ||
628 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { | ||
629 | tunnel->err_count--; | ||
630 | |||
631 | dst_link_failure(skb); | ||
632 | } else | ||
633 | tunnel->err_count = 0; | ||
634 | } | ||
635 | |||
636 | ttl = tnl_params->ttl; | ||
637 | if (ttl == 0) { | ||
638 | if (skb->protocol == htons(ETH_P_IP)) | ||
639 | ttl = inner_iph->ttl; | ||
640 | #if IS_ENABLED(CONFIG_IPV6) | ||
641 | else if (skb->protocol == htons(ETH_P_IPV6)) | ||
642 | ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; | ||
643 | #endif | ||
644 | else | ||
645 | ttl = ip4_dst_hoplimit(&rt->dst); | ||
646 | } | ||
647 | |||
648 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr) | ||
649 | + rt->dst.header_len; | ||
650 | if (max_headroom > dev->needed_headroom) { | ||
651 | dev->needed_headroom = max_headroom; | ||
652 | if (skb_cow_head(skb, dev->needed_headroom)) { | ||
653 | dev->stats.tx_dropped++; | ||
654 | dev_kfree_skb(skb); | ||
655 | return; | ||
656 | } | ||
657 | } | ||
658 | |||
659 | skb_dst_drop(skb); | ||
660 | skb_dst_set(skb, &rt->dst); | ||
661 | memset(IPCB(skb), 0, sizeof(*IPCB(skb))); | ||
662 | |||
663 | /* Push down and install the IP header. */ | ||
664 | skb_push(skb, sizeof(struct iphdr)); | ||
665 | skb_reset_network_header(skb); | ||
666 | |||
667 | iph = ip_hdr(skb); | ||
668 | inner_iph = (const struct iphdr *)skb_inner_network_header(skb); | ||
669 | |||
670 | iph->version = 4; | ||
671 | iph->ihl = sizeof(struct iphdr) >> 2; | ||
672 | iph->frag_off = df; | ||
673 | iph->protocol = tnl_params->protocol; | ||
674 | iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); | ||
675 | iph->daddr = fl4.daddr; | ||
676 | iph->saddr = fl4.saddr; | ||
677 | iph->ttl = ttl; | ||
678 | tunnel_ip_select_ident(skb, inner_iph, &rt->dst); | ||
679 | |||
680 | iptunnel_xmit(skb, dev); | ||
681 | return; | ||
682 | |||
683 | #if IS_ENABLED(CONFIG_IPV6) | ||
684 | tx_error_icmp: | ||
685 | dst_link_failure(skb); | ||
686 | #endif | ||
687 | tx_error: | ||
688 | dev->stats.tx_errors++; | ||
689 | dev_kfree_skb(skb); | ||
690 | } | ||
691 | EXPORT_SYMBOL_GPL(ip_tunnel_xmit); | ||
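In ip_tunnel_xmit(), a configured TOS whose low bit is set (how iproute2 encodes "tos inherit") means "copy the inner packet's TOS field"; any other value is used verbatim. A simplified userspace sketch of that selection, covering only the IPv4 inner-packet case:

	#include <stdint.h>
	#include <stdio.h>

	/* Mirrors the tos selection at the top of ip_tunnel_xmit(): bit 0
	 * of the configured value means "inherit the inner header's TOS". */
	static uint8_t effective_tos(uint8_t configured, uint8_t inner_tos)
	{
		if (configured & 0x1)
			return inner_tos;	/* inherit from inner packet */
		return configured;		/* fixed, administrator-set value */
	}

	int main(void)
	{
		printf("%#x\n", effective_tos(0x00, 0xb8)); /* fixed: 0 */
		printf("%#x\n", effective_tos(0x01, 0xb8)); /* inherits 0xb8 */
		return 0;
	}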
692 | |||
693 | static void ip_tunnel_update(struct ip_tunnel_net *itn, | ||
694 | struct ip_tunnel *t, | ||
695 | struct net_device *dev, | ||
696 | struct ip_tunnel_parm *p, | ||
697 | bool set_mtu) | ||
698 | { | ||
699 | ip_tunnel_del(t); | ||
700 | t->parms.iph.saddr = p->iph.saddr; | ||
701 | t->parms.iph.daddr = p->iph.daddr; | ||
702 | t->parms.i_key = p->i_key; | ||
703 | t->parms.o_key = p->o_key; | ||
704 | if (dev->type != ARPHRD_ETHER) { | ||
705 | memcpy(dev->dev_addr, &p->iph.saddr, 4); | ||
706 | memcpy(dev->broadcast, &p->iph.daddr, 4); | ||
707 | } | ||
708 | ip_tunnel_add(itn, t); | ||
709 | |||
710 | t->parms.iph.ttl = p->iph.ttl; | ||
711 | t->parms.iph.tos = p->iph.tos; | ||
712 | t->parms.iph.frag_off = p->iph.frag_off; | ||
713 | |||
714 | if (t->parms.link != p->link) { | ||
715 | int mtu; | ||
716 | |||
717 | t->parms.link = p->link; | ||
718 | mtu = ip_tunnel_bind_dev(dev); | ||
719 | if (set_mtu) | ||
720 | dev->mtu = mtu; | ||
721 | } | ||
722 | netdev_state_change(dev); | ||
723 | } | ||
724 | |||
725 | int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) | ||
726 | { | ||
727 | int err = 0; | ||
728 | struct ip_tunnel *t; | ||
729 | struct net *net = dev_net(dev); | ||
730 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
731 | struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); | ||
732 | |||
733 | BUG_ON(!itn->fb_tunnel_dev); | ||
734 | switch (cmd) { | ||
735 | case SIOCGETTUNNEL: | ||
736 | t = NULL; | ||
737 | if (dev == itn->fb_tunnel_dev) | ||
738 | t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); | ||
739 | if (t == NULL) | ||
740 | t = netdev_priv(dev); | ||
741 | memcpy(p, &t->parms, sizeof(*p)); | ||
742 | break; | ||
743 | |||
744 | case SIOCADDTUNNEL: | ||
745 | case SIOCCHGTUNNEL: | ||
746 | err = -EPERM; | ||
747 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
748 | goto done; | ||
749 | if (p->iph.ttl) | ||
750 | p->iph.frag_off |= htons(IP_DF); | ||
751 | if (!(p->i_flags&TUNNEL_KEY)) | ||
752 | p->i_key = 0; | ||
753 | if (!(p->o_flags&TUNNEL_KEY)) | ||
754 | p->o_key = 0; | ||
755 | |||
756 | t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); | ||
757 | |||
758 | if (!t && (cmd == SIOCADDTUNNEL)) | ||
759 | t = ip_tunnel_create(net, itn, p); | ||
760 | |||
761 | if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | ||
762 | if (t != NULL) { | ||
763 | if (t->dev != dev) { | ||
764 | err = -EEXIST; | ||
765 | break; | ||
766 | } | ||
767 | } else { | ||
768 | unsigned int nflags = 0; | ||
769 | |||
770 | if (ipv4_is_multicast(p->iph.daddr)) | ||
771 | nflags = IFF_BROADCAST; | ||
772 | else if (p->iph.daddr) | ||
773 | nflags = IFF_POINTOPOINT; | ||
774 | |||
775 | if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { | ||
776 | err = -EINVAL; | ||
777 | break; | ||
778 | } | ||
779 | |||
780 | t = netdev_priv(dev); | ||
781 | } | ||
782 | } | ||
783 | |||
784 | if (t) { | ||
785 | err = 0; | ||
786 | ip_tunnel_update(itn, t, dev, p, true); | ||
787 | } else | ||
788 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | ||
789 | break; | ||
790 | |||
791 | case SIOCDELTUNNEL: | ||
792 | err = -EPERM; | ||
793 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
794 | goto done; | ||
795 | |||
796 | if (dev == itn->fb_tunnel_dev) { | ||
797 | err = -ENOENT; | ||
798 | t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); | ||
799 | if (t == NULL) | ||
800 | goto done; | ||
801 | err = -EPERM; | ||
802 | if (t == netdev_priv(itn->fb_tunnel_dev)) | ||
803 | goto done; | ||
804 | dev = t->dev; | ||
805 | } | ||
806 | unregister_netdevice(dev); | ||
807 | err = 0; | ||
808 | break; | ||
809 | |||
810 | default: | ||
811 | err = -EINVAL; | ||
812 | } | ||
813 | |||
814 | done: | ||
815 | return err; | ||
816 | } | ||
817 | EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); | ||
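ip_tunnel_ioctl() is the generic back end for the classic SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/SIOCDELTUNNEL interface: the per-driver ndo_do_ioctl handler copies a struct ip_tunnel_parm from ifreq.ifr_ifru.ifru_data and delegates here, as the reworked ipip_tunnel_ioctl() later in this commit shows. A hedged userspace sketch of the GET side (the device name is just an example):

	/* Query tunnel parameters with SIOCGETTUNNEL, the path that ends
	 * in ip_tunnel_ioctl() above.  Sketch only; assumes a "tunl0"
	 * device exists. */
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/if_tunnel.h>

	int main(void)
	{
		struct ip_tunnel_parm p;
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		memset(&p, 0, sizeof(p));
		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ - 1);
		ifr.ifr_ifru.ifru_data = (void *)&p;

		if (fd < 0 || ioctl(fd, SIOCGETTUNNEL, &ifr) < 0) {
			perror("SIOCGETTUNNEL");
			return 1;
		}
		printf("%s: local %s", p.name,
		       inet_ntoa(*(struct in_addr *)&p.iph.saddr));
		printf(" remote %s ttl %u\n",
		       inet_ntoa(*(struct in_addr *)&p.iph.daddr), p.iph.ttl);
		return 0;
	}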
818 | |||
819 | int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) | ||
820 | { | ||
821 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
822 | int t_hlen = tunnel->hlen + sizeof(struct iphdr); | ||
823 | |||
824 | if (new_mtu < 68 || | ||
825 | new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) | ||
826 | return -EINVAL; | ||
827 | dev->mtu = new_mtu; | ||
828 | return 0; | ||
829 | } | ||
830 | EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); | ||
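The bounds in ip_tunnel_change_mtu() come straight from IPv4: 68 is the minimum MTU RFC 791 obliges every link to support, and 0xFFF8 is the largest 8-byte-aligned IP total length, reduced by the link header and the tunnel's own encapsulation overhead (t_hlen). A tiny sketch of the same window (the header sizes are illustrative):

	#include <stdio.h>

	/* Same validity window as ip_tunnel_change_mtu() above. */
	static int mtu_valid(int new_mtu, int hard_header_len, int t_hlen)
	{
		return new_mtu >= 68 &&
		       new_mtu <= 0xFFF8 - hard_header_len - t_hlen;
	}

	int main(void)
	{
		/* Plain ipip: no link-layer header, 20-byte outer IP header. */
		printf("%d\n", mtu_valid(1480, 0, 20));  /* 1 */
		printf("%d\n", mtu_valid(67, 0, 20));    /* 0: below IPv4 minimum */
		printf("%d\n", mtu_valid(65508, 0, 20)); /* 1: 0xFFF8 - 20 */
		printf("%d\n", mtu_valid(65509, 0, 20)); /* 0: over the ceiling */
		return 0;
	}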
831 | |||
832 | static void ip_tunnel_dev_free(struct net_device *dev) | ||
833 | { | ||
834 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
835 | |||
836 | gro_cells_destroy(&tunnel->gro_cells); | ||
837 | free_percpu(dev->tstats); | ||
838 | free_netdev(dev); | ||
839 | } | ||
840 | |||
841 | void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) | ||
842 | { | ||
843 | struct net *net = dev_net(dev); | ||
844 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
845 | struct ip_tunnel_net *itn; | ||
846 | |||
847 | itn = net_generic(net, tunnel->ip_tnl_net_id); | ||
848 | |||
849 | if (itn->fb_tunnel_dev != dev) { | ||
850 | ip_tunnel_del(netdev_priv(dev)); | ||
851 | unregister_netdevice_queue(dev, head); | ||
852 | } | ||
853 | } | ||
854 | EXPORT_SYMBOL_GPL(ip_tunnel_dellink); | ||
855 | |||
856 | int __net_init ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, | ||
857 | struct rtnl_link_ops *ops, char *devname) | ||
858 | { | ||
859 | struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); | ||
860 | struct ip_tunnel_parm parms; | ||
861 | |||
862 | itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL); | ||
863 | if (!itn->tunnels) | ||
864 | return -ENOMEM; | ||
865 | |||
866 | if (!ops) { | ||
867 | itn->fb_tunnel_dev = NULL; | ||
868 | return 0; | ||
869 | } | ||
870 | memset(&parms, 0, sizeof(parms)); | ||
871 | if (devname) | ||
872 | strlcpy(parms.name, devname, IFNAMSIZ); | ||
873 | |||
874 | rtnl_lock(); | ||
875 | itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); | ||
876 | rtnl_unlock(); | ||
877 | if (IS_ERR(itn->fb_tunnel_dev)) { | ||
878 | kfree(itn->tunnels); | ||
879 | return PTR_ERR(itn->fb_tunnel_dev); | ||
880 | } | ||
881 | |||
882 | return 0; | ||
883 | } | ||
884 | EXPORT_SYMBOL_GPL(ip_tunnel_init_net); | ||
885 | |||
886 | static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head) | ||
887 | { | ||
888 | int h; | ||
889 | |||
890 | for (h = 0; h < IP_TNL_HASH_SIZE; h++) { | ||
891 | struct ip_tunnel *t; | ||
892 | struct hlist_node *n; | ||
893 | struct hlist_head *thead = &itn->tunnels[h]; | ||
894 | |||
895 | hlist_for_each_entry_safe(t, n, thead, hash_node) | ||
896 | unregister_netdevice_queue(t->dev, head); | ||
897 | } | ||
898 | if (itn->fb_tunnel_dev) | ||
899 | unregister_netdevice_queue(itn->fb_tunnel_dev, head); | ||
900 | } | ||
901 | |||
902 | void __net_exit ip_tunnel_delete_net(struct ip_tunnel_net *itn) | ||
903 | { | ||
904 | LIST_HEAD(list); | ||
905 | |||
906 | rtnl_lock(); | ||
907 | ip_tunnel_destroy(itn, &list); | ||
908 | unregister_netdevice_many(&list); | ||
909 | rtnl_unlock(); | ||
910 | kfree(itn->tunnels); | ||
911 | } | ||
912 | EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); | ||
913 | |||
914 | int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], | ||
915 | struct ip_tunnel_parm *p) | ||
916 | { | ||
917 | struct ip_tunnel *nt; | ||
918 | struct net *net = dev_net(dev); | ||
919 | struct ip_tunnel_net *itn; | ||
920 | int mtu; | ||
921 | int err; | ||
922 | |||
923 | nt = netdev_priv(dev); | ||
924 | itn = net_generic(net, nt->ip_tnl_net_id); | ||
925 | |||
926 | if (ip_tunnel_find(itn, p, dev->type)) | ||
927 | return -EEXIST; | ||
928 | |||
929 | nt->parms = *p; | ||
930 | err = register_netdevice(dev); | ||
931 | if (err) | ||
932 | goto out; | ||
933 | |||
934 | if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) | ||
935 | eth_hw_addr_random(dev); | ||
936 | |||
937 | mtu = ip_tunnel_bind_dev(dev); | ||
938 | if (!tb[IFLA_MTU]) | ||
939 | dev->mtu = mtu; | ||
940 | |||
941 | ip_tunnel_add(itn, nt); | ||
942 | |||
943 | out: | ||
944 | return err; | ||
945 | } | ||
946 | EXPORT_SYMBOL_GPL(ip_tunnel_newlink); | ||
947 | |||
948 | int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], | ||
949 | struct ip_tunnel_parm *p) | ||
950 | { | ||
951 | struct ip_tunnel *t, *nt; | ||
952 | struct net *net = dev_net(dev); | ||
953 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
954 | struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); | ||
955 | |||
956 | if (dev == itn->fb_tunnel_dev) | ||
957 | return -EINVAL; | ||
958 | |||
959 | nt = netdev_priv(dev); | ||
960 | |||
961 | t = ip_tunnel_find(itn, p, dev->type); | ||
962 | |||
963 | if (t) { | ||
964 | if (t->dev != dev) | ||
965 | return -EEXIST; | ||
966 | } else { | ||
967 | t = nt; | ||
968 | |||
969 | if (dev->type != ARPHRD_ETHER) { | ||
970 | unsigned int nflags = 0; | ||
971 | |||
972 | if (ipv4_is_multicast(p->iph.daddr)) | ||
973 | nflags = IFF_BROADCAST; | ||
974 | else if (p->iph.daddr) | ||
975 | nflags = IFF_POINTOPOINT; | ||
976 | |||
977 | if ((dev->flags ^ nflags) & | ||
978 | (IFF_POINTOPOINT | IFF_BROADCAST)) | ||
979 | return -EINVAL; | ||
980 | } | ||
981 | } | ||
982 | |||
983 | ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]); | ||
984 | return 0; | ||
985 | } | ||
986 | EXPORT_SYMBOL_GPL(ip_tunnel_changelink); | ||
987 | |||
988 | int ip_tunnel_init(struct net_device *dev) | ||
989 | { | ||
990 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
991 | struct iphdr *iph = &tunnel->parms.iph; | ||
992 | int err; | ||
993 | |||
994 | dev->destructor = ip_tunnel_dev_free; | ||
995 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
996 | if (!dev->tstats) | ||
997 | return -ENOMEM; | ||
998 | |||
999 | err = gro_cells_init(&tunnel->gro_cells, dev); | ||
1000 | if (err) { | ||
1001 | free_percpu(dev->tstats); | ||
1002 | return err; | ||
1003 | } | ||
1004 | |||
1005 | tunnel->dev = dev; | ||
1006 | strcpy(tunnel->parms.name, dev->name); | ||
1007 | iph->version = 4; | ||
1008 | iph->ihl = 5; | ||
1009 | |||
1010 | return 0; | ||
1011 | } | ||
1012 | EXPORT_SYMBOL_GPL(ip_tunnel_init); | ||
1013 | |||
1014 | void ip_tunnel_uninit(struct net_device *dev) | ||
1015 | { | ||
1016 | struct net *net = dev_net(dev); | ||
1017 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
1018 | struct ip_tunnel_net *itn; | ||
1019 | |||
1020 | itn = net_generic(net, tunnel->ip_tnl_net_id); | ||
1021 | /* fb_tunnel_dev will be unregistered in the net exit call. */ | ||
1022 | if (itn->fb_tunnel_dev != dev) | ||
1023 | ip_tunnel_del(netdev_priv(dev)); | ||
1024 | } | ||
1025 | EXPORT_SYMBOL_GPL(ip_tunnel_uninit); | ||
1026 | |||
1027 | /* Do the minimum required initialization; the rest is done in the tunnel_init call. */ | ||
1028 | void ip_tunnel_setup(struct net_device *dev, int net_id) | ||
1029 | { | ||
1030 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
1031 | tunnel->ip_tnl_net_id = net_id; | ||
1032 | } | ||
1033 | EXPORT_SYMBOL_GPL(ip_tunnel_setup); | ||
1034 | |||
1035 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index c3a4233c0ac2..9d2bdb2c1d3f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c | |||
@@ -38,7 +38,7 @@ | |||
38 | #include <net/sock.h> | 38 | #include <net/sock.h> |
39 | #include <net/ip.h> | 39 | #include <net/ip.h> |
40 | #include <net/icmp.h> | 40 | #include <net/icmp.h> |
41 | #include <net/ipip.h> | 41 | #include <net/ip_tunnels.h> |
42 | #include <net/inet_ecn.h> | 42 | #include <net/inet_ecn.h> |
43 | #include <net/xfrm.h> | 43 | #include <net/xfrm.h> |
44 | #include <net/net_namespace.h> | 44 | #include <net/net_namespace.h> |
@@ -82,44 +82,6 @@ static int vti_tunnel_bind_dev(struct net_device *dev); | |||
82 | } while (0) | 82 | } while (0) |
83 | 83 | ||
84 | 84 | ||
85 | static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev, | ||
86 | struct rtnl_link_stats64 *tot) | ||
87 | { | ||
88 | int i; | ||
89 | |||
90 | for_each_possible_cpu(i) { | ||
91 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | ||
92 | u64 rx_packets, rx_bytes, tx_packets, tx_bytes; | ||
93 | unsigned int start; | ||
94 | |||
95 | do { | ||
96 | start = u64_stats_fetch_begin_bh(&tstats->syncp); | ||
97 | rx_packets = tstats->rx_packets; | ||
98 | tx_packets = tstats->tx_packets; | ||
99 | rx_bytes = tstats->rx_bytes; | ||
100 | tx_bytes = tstats->tx_bytes; | ||
101 | } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); | ||
102 | |||
103 | tot->rx_packets += rx_packets; | ||
104 | tot->tx_packets += tx_packets; | ||
105 | tot->rx_bytes += rx_bytes; | ||
106 | tot->tx_bytes += tx_bytes; | ||
107 | } | ||
108 | |||
109 | tot->multicast = dev->stats.multicast; | ||
110 | tot->rx_crc_errors = dev->stats.rx_crc_errors; | ||
111 | tot->rx_fifo_errors = dev->stats.rx_fifo_errors; | ||
112 | tot->rx_length_errors = dev->stats.rx_length_errors; | ||
113 | tot->rx_errors = dev->stats.rx_errors; | ||
114 | tot->tx_fifo_errors = dev->stats.tx_fifo_errors; | ||
115 | tot->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
116 | tot->tx_dropped = dev->stats.tx_dropped; | ||
117 | tot->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
118 | tot->tx_errors = dev->stats.tx_errors; | ||
119 | |||
120 | return tot; | ||
121 | } | ||
122 | |||
123 | static struct ip_tunnel *vti_tunnel_lookup(struct net *net, | 85 | static struct ip_tunnel *vti_tunnel_lookup(struct net *net, |
124 | __be32 remote, __be32 local) | 86 | __be32 remote, __be32 local) |
125 | { | 87 | { |
@@ -597,7 +559,7 @@ static const struct net_device_ops vti_netdev_ops = { | |||
597 | .ndo_start_xmit = vti_tunnel_xmit, | 559 | .ndo_start_xmit = vti_tunnel_xmit, |
598 | .ndo_do_ioctl = vti_tunnel_ioctl, | 560 | .ndo_do_ioctl = vti_tunnel_ioctl, |
599 | .ndo_change_mtu = vti_tunnel_change_mtu, | 561 | .ndo_change_mtu = vti_tunnel_change_mtu, |
600 | .ndo_get_stats64 = vti_get_stats64, | 562 | .ndo_get_stats64 = ip_tunnel_get_stats64, |
601 | }; | 563 | }; |
602 | 564 | ||
603 | static void vti_dev_free(struct net_device *dev) | 565 | static void vti_dev_free(struct net_device *dev) |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 8f024d41eefa..77bfcce64fe5 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -111,227 +111,21 @@ | |||
111 | #include <net/sock.h> | 111 | #include <net/sock.h> |
112 | #include <net/ip.h> | 112 | #include <net/ip.h> |
113 | #include <net/icmp.h> | 113 | #include <net/icmp.h> |
114 | #include <net/ipip.h> | 114 | #include <net/ip_tunnels.h> |
115 | #include <net/inet_ecn.h> | 115 | #include <net/inet_ecn.h> |
116 | #include <net/xfrm.h> | 116 | #include <net/xfrm.h> |
117 | #include <net/net_namespace.h> | 117 | #include <net/net_namespace.h> |
118 | #include <net/netns/generic.h> | 118 | #include <net/netns/generic.h> |
119 | 119 | ||
120 | #define HASH_SIZE 16 | ||
121 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) | ||
122 | |||
123 | static bool log_ecn_error = true; | 120 | static bool log_ecn_error = true; |
124 | module_param(log_ecn_error, bool, 0644); | 121 | module_param(log_ecn_error, bool, 0644); |
125 | MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); | 122 | MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); |
126 | 123 | ||
127 | static int ipip_net_id __read_mostly; | 124 | static int ipip_net_id __read_mostly; |
128 | struct ipip_net { | ||
129 | struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; | ||
130 | struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; | ||
131 | struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; | ||
132 | struct ip_tunnel __rcu *tunnels_wc[1]; | ||
133 | struct ip_tunnel __rcu **tunnels[4]; | ||
134 | |||
135 | struct net_device *fb_tunnel_dev; | ||
136 | }; | ||
137 | 125 | ||
138 | static int ipip_tunnel_init(struct net_device *dev); | 126 | static int ipip_tunnel_init(struct net_device *dev); |
139 | static void ipip_tunnel_setup(struct net_device *dev); | ||
140 | static void ipip_dev_free(struct net_device *dev); | ||
141 | static struct rtnl_link_ops ipip_link_ops __read_mostly; | 127 | static struct rtnl_link_ops ipip_link_ops __read_mostly; |
142 | 128 | ||
143 | static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev, | ||
144 | struct rtnl_link_stats64 *tot) | ||
145 | { | ||
146 | int i; | ||
147 | |||
148 | for_each_possible_cpu(i) { | ||
149 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | ||
150 | u64 rx_packets, rx_bytes, tx_packets, tx_bytes; | ||
151 | unsigned int start; | ||
152 | |||
153 | do { | ||
154 | start = u64_stats_fetch_begin_bh(&tstats->syncp); | ||
155 | rx_packets = tstats->rx_packets; | ||
156 | tx_packets = tstats->tx_packets; | ||
157 | rx_bytes = tstats->rx_bytes; | ||
158 | tx_bytes = tstats->tx_bytes; | ||
159 | } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); | ||
160 | |||
161 | tot->rx_packets += rx_packets; | ||
162 | tot->tx_packets += tx_packets; | ||
163 | tot->rx_bytes += rx_bytes; | ||
164 | tot->tx_bytes += tx_bytes; | ||
165 | } | ||
166 | |||
167 | tot->tx_fifo_errors = dev->stats.tx_fifo_errors; | ||
168 | tot->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
169 | tot->tx_dropped = dev->stats.tx_dropped; | ||
170 | tot->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
171 | tot->tx_errors = dev->stats.tx_errors; | ||
172 | tot->collisions = dev->stats.collisions; | ||
173 | |||
174 | return tot; | ||
175 | } | ||
176 | |||
177 | static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, | ||
178 | __be32 remote, __be32 local) | ||
179 | { | ||
180 | unsigned int h0 = HASH(remote); | ||
181 | unsigned int h1 = HASH(local); | ||
182 | struct ip_tunnel *t; | ||
183 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
184 | |||
185 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) | ||
186 | if (local == t->parms.iph.saddr && | ||
187 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
188 | return t; | ||
189 | |||
190 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) | ||
191 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
192 | return t; | ||
193 | |||
194 | for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) | ||
195 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | ||
196 | return t; | ||
197 | |||
198 | t = rcu_dereference(ipn->tunnels_wc[0]); | ||
199 | if (t && (t->dev->flags&IFF_UP)) | ||
200 | return t; | ||
201 | return NULL; | ||
202 | } | ||
203 | |||
204 | static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, | ||
205 | struct ip_tunnel_parm *parms) | ||
206 | { | ||
207 | __be32 remote = parms->iph.daddr; | ||
208 | __be32 local = parms->iph.saddr; | ||
209 | unsigned int h = 0; | ||
210 | int prio = 0; | ||
211 | |||
212 | if (remote) { | ||
213 | prio |= 2; | ||
214 | h ^= HASH(remote); | ||
215 | } | ||
216 | if (local) { | ||
217 | prio |= 1; | ||
218 | h ^= HASH(local); | ||
219 | } | ||
220 | return &ipn->tunnels[prio][h]; | ||
221 | } | ||
222 | |||
223 | static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, | ||
224 | struct ip_tunnel *t) | ||
225 | { | ||
226 | return __ipip_bucket(ipn, &t->parms); | ||
227 | } | ||
228 | |||
229 | static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) | ||
230 | { | ||
231 | struct ip_tunnel __rcu **tp; | ||
232 | struct ip_tunnel *iter; | ||
233 | |||
234 | for (tp = ipip_bucket(ipn, t); | ||
235 | (iter = rtnl_dereference(*tp)) != NULL; | ||
236 | tp = &iter->next) { | ||
237 | if (t == iter) { | ||
238 | rcu_assign_pointer(*tp, t->next); | ||
239 | break; | ||
240 | } | ||
241 | } | ||
242 | } | ||
243 | |||
244 | static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) | ||
245 | { | ||
246 | struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); | ||
247 | |||
248 | rcu_assign_pointer(t->next, rtnl_dereference(*tp)); | ||
249 | rcu_assign_pointer(*tp, t); | ||
250 | } | ||
251 | |||
252 | static int ipip_tunnel_create(struct net_device *dev) | ||
253 | { | ||
254 | struct ip_tunnel *t = netdev_priv(dev); | ||
255 | struct net *net = dev_net(dev); | ||
256 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
257 | int err; | ||
258 | |||
259 | err = ipip_tunnel_init(dev); | ||
260 | if (err < 0) | ||
261 | goto out; | ||
262 | |||
263 | err = register_netdevice(dev); | ||
264 | if (err < 0) | ||
265 | goto out; | ||
266 | |||
267 | strcpy(t->parms.name, dev->name); | ||
268 | dev->rtnl_link_ops = &ipip_link_ops; | ||
269 | |||
270 | dev_hold(dev); | ||
271 | ipip_tunnel_link(ipn, t); | ||
272 | return 0; | ||
273 | |||
274 | out: | ||
275 | return err; | ||
276 | } | ||
277 | |||
278 | static struct ip_tunnel *ipip_tunnel_locate(struct net *net, | ||
279 | struct ip_tunnel_parm *parms, int create) | ||
280 | { | ||
281 | __be32 remote = parms->iph.daddr; | ||
282 | __be32 local = parms->iph.saddr; | ||
283 | struct ip_tunnel *t, *nt; | ||
284 | struct ip_tunnel __rcu **tp; | ||
285 | struct net_device *dev; | ||
286 | char name[IFNAMSIZ]; | ||
287 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
288 | |||
289 | for (tp = __ipip_bucket(ipn, parms); | ||
290 | (t = rtnl_dereference(*tp)) != NULL; | ||
291 | tp = &t->next) { | ||
292 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | ||
293 | return t; | ||
294 | } | ||
295 | if (!create) | ||
296 | return NULL; | ||
297 | |||
298 | if (parms->name[0]) | ||
299 | strlcpy(name, parms->name, IFNAMSIZ); | ||
300 | else | ||
301 | strcpy(name, "tunl%d"); | ||
302 | |||
303 | dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); | ||
304 | if (dev == NULL) | ||
305 | return NULL; | ||
306 | |||
307 | dev_net_set(dev, net); | ||
308 | |||
309 | nt = netdev_priv(dev); | ||
310 | nt->parms = *parms; | ||
311 | |||
312 | if (ipip_tunnel_create(dev) < 0) | ||
313 | goto failed_free; | ||
314 | |||
315 | return nt; | ||
316 | |||
317 | failed_free: | ||
318 | ipip_dev_free(dev); | ||
319 | return NULL; | ||
320 | } | ||
321 | |||
322 | /* called with RTNL */ | ||
323 | static void ipip_tunnel_uninit(struct net_device *dev) | ||
324 | { | ||
325 | struct net *net = dev_net(dev); | ||
326 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
327 | |||
328 | if (dev == ipn->fb_tunnel_dev) | ||
329 | RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL); | ||
330 | else | ||
331 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); | ||
332 | dev_put(dev); | ||
333 | } | ||
334 | |||
335 | static int ipip_err(struct sk_buff *skb, u32 info) | 129 | static int ipip_err(struct sk_buff *skb, u32 info) |
336 | { | 130 | { |
337 | 131 | ||
@@ -339,41 +133,17 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
339 | 8 bytes of packet payload. This means that precise relaying of | 133 | 8 bytes of packet payload. This means that precise relaying of |
340 | ICMP in the real Internet is absolutely infeasible. | 134 | ICMP in the real Internet is absolutely infeasible. |
341 | */ | 135 | */ |
136 | struct net *net = dev_net(skb->dev); | ||
137 | struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); | ||
342 | const struct iphdr *iph = (const struct iphdr *)skb->data; | 138 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
343 | const int type = icmp_hdr(skb)->type; | ||
344 | const int code = icmp_hdr(skb)->code; | ||
345 | struct ip_tunnel *t; | 139 | struct ip_tunnel *t; |
346 | int err; | 140 | int err; |
347 | 141 | const int type = icmp_hdr(skb)->type; | |
348 | switch (type) { | 142 | const int code = icmp_hdr(skb)->code; |
349 | default: | ||
350 | case ICMP_PARAMETERPROB: | ||
351 | return 0; | ||
352 | |||
353 | case ICMP_DEST_UNREACH: | ||
354 | switch (code) { | ||
355 | case ICMP_SR_FAILED: | ||
356 | case ICMP_PORT_UNREACH: | ||
357 | /* Impossible event. */ | ||
358 | return 0; | ||
359 | default: | ||
360 | /* All others are translated to HOST_UNREACH. | ||
361 | rfc2003 contains "deep thoughts" about NET_UNREACH, | ||
362 | I believe they are just ether pollution. --ANK | ||
363 | */ | ||
364 | break; | ||
365 | } | ||
366 | break; | ||
367 | case ICMP_TIME_EXCEEDED: | ||
368 | if (code != ICMP_EXC_TTL) | ||
369 | return 0; | ||
370 | break; | ||
371 | case ICMP_REDIRECT: | ||
372 | break; | ||
373 | } | ||
374 | 143 | ||
375 | err = -ENOENT; | 144 | err = -ENOENT; |
376 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | 145 | t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, |
146 | iph->daddr, iph->saddr, 0); | ||
377 | if (t == NULL) | 147 | if (t == NULL) |
378 | goto out; | 148 | goto out; |
379 | 149 | ||
@@ -403,53 +173,29 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
403 | else | 173 | else |
404 | t->err_count = 1; | 174 | t->err_count = 1; |
405 | t->err_time = jiffies; | 175 | t->err_time = jiffies; |
406 | out: | ||
407 | 176 | ||
177 | out: | ||
408 | return err; | 178 | return err; |
409 | } | 179 | } |
410 | 180 | ||
181 | static const struct tnl_ptk_info tpi = { | ||
182 | /* no tunnel info required for ipip. */ | ||
183 | .proto = htons(ETH_P_IP), | ||
184 | }; | ||
185 | |||
411 | static int ipip_rcv(struct sk_buff *skb) | 186 | static int ipip_rcv(struct sk_buff *skb) |
412 | { | 187 | { |
188 | struct net *net = dev_net(skb->dev); | ||
189 | struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); | ||
413 | struct ip_tunnel *tunnel; | 190 | struct ip_tunnel *tunnel; |
414 | const struct iphdr *iph = ip_hdr(skb); | 191 | const struct iphdr *iph = ip_hdr(skb); |
415 | int err; | ||
416 | |||
417 | tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); | ||
418 | if (tunnel != NULL) { | ||
419 | struct pcpu_tstats *tstats; | ||
420 | 192 | ||
193 | tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, | ||
194 | iph->saddr, iph->daddr, 0); | ||
195 | if (tunnel) { | ||
421 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) | 196 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) |
422 | goto drop; | 197 | goto drop; |
423 | 198 | return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); | |
424 | secpath_reset(skb); | ||
425 | |||
426 | skb->mac_header = skb->network_header; | ||
427 | skb_reset_network_header(skb); | ||
428 | skb->protocol = htons(ETH_P_IP); | ||
429 | skb->pkt_type = PACKET_HOST; | ||
430 | |||
431 | __skb_tunnel_rx(skb, tunnel->dev); | ||
432 | |||
433 | err = IP_ECN_decapsulate(iph, skb); | ||
434 | if (unlikely(err)) { | ||
435 | if (log_ecn_error) | ||
436 | net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", | ||
437 | &iph->saddr, iph->tos); | ||
438 | if (err > 1) { | ||
439 | ++tunnel->dev->stats.rx_frame_errors; | ||
440 | ++tunnel->dev->stats.rx_errors; | ||
441 | goto drop; | ||
442 | } | ||
443 | } | ||
444 | |||
445 | tstats = this_cpu_ptr(tunnel->dev->tstats); | ||
446 | u64_stats_update_begin(&tstats->syncp); | ||
447 | tstats->rx_packets++; | ||
448 | tstats->rx_bytes += skb->len; | ||
449 | u64_stats_update_end(&tstats->syncp); | ||
450 | |||
451 | netif_rx(skb); | ||
452 | return 0; | ||
453 | } | 199 | } |
454 | 200 | ||
455 | return -1; | 201 | return -1; |
@@ -463,329 +209,64 @@ drop: | |||
463 | * This function assumes it is being called from dev_queue_xmit() | 209 | * This function assumes it is being called from dev_queue_xmit() |
464 | * and that skb is filled properly by that function. | 210 | * and that skb is filled properly by that function. |
465 | */ | 211 | */ |
466 | |||
467 | static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 212 | static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
468 | { | 213 | { |
469 | struct ip_tunnel *tunnel = netdev_priv(dev); | 214 | struct ip_tunnel *tunnel = netdev_priv(dev); |
470 | const struct iphdr *tiph = &tunnel->parms.iph; | 215 | const struct iphdr *tiph = &tunnel->parms.iph; |
471 | u8 tos = tunnel->parms.iph.tos; | ||
472 | __be16 df = tiph->frag_off; | ||
473 | struct rtable *rt; /* Route to the other host */ | ||
474 | struct net_device *tdev; /* Device to other host */ | ||
475 | const struct iphdr *old_iph; | ||
476 | struct iphdr *iph; /* Our new IP header */ | ||
477 | unsigned int max_headroom; /* The extra header space needed */ | ||
478 | __be32 dst = tiph->daddr; | ||
479 | struct flowi4 fl4; | ||
480 | int mtu; | ||
481 | |||
482 | if (skb->protocol != htons(ETH_P_IP)) | ||
483 | goto tx_error; | ||
484 | 216 | ||
485 | if (skb->ip_summed == CHECKSUM_PARTIAL && | 217 | if (unlikely(skb->protocol != htons(ETH_P_IP))) |
486 | skb_checksum_help(skb)) | ||
487 | goto tx_error; | 218 | goto tx_error; |
488 | 219 | ||
489 | old_iph = ip_hdr(skb); | 220 | if (likely(!skb->encapsulation)) { |
490 | 221 | skb_reset_inner_headers(skb); | |
491 | if (tos & 1) | 222 | skb->encapsulation = 1; |
492 | tos = old_iph->tos; | ||
493 | |||
494 | if (!dst) { | ||
495 | /* NBMA tunnel */ | ||
496 | if ((rt = skb_rtable(skb)) == NULL) { | ||
497 | dev->stats.tx_fifo_errors++; | ||
498 | goto tx_error; | ||
499 | } | ||
500 | dst = rt_nexthop(rt, old_iph->daddr); | ||
501 | } | 223 | } |
502 | 224 | ||
503 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, | 225 | ip_tunnel_xmit(skb, dev, tiph); |
504 | dst, tiph->saddr, | ||
505 | 0, 0, | ||
506 | IPPROTO_IPIP, RT_TOS(tos), | ||
507 | tunnel->parms.link); | ||
508 | if (IS_ERR(rt)) { | ||
509 | dev->stats.tx_carrier_errors++; | ||
510 | goto tx_error_icmp; | ||
511 | } | ||
512 | tdev = rt->dst.dev; | ||
513 | |||
514 | if (tdev == dev) { | ||
515 | ip_rt_put(rt); | ||
516 | dev->stats.collisions++; | ||
517 | goto tx_error; | ||
518 | } | ||
519 | |||
520 | df |= old_iph->frag_off & htons(IP_DF); | ||
521 | |||
522 | if (df) { | ||
523 | mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); | ||
524 | |||
525 | if (mtu < 68) { | ||
526 | dev->stats.collisions++; | ||
527 | ip_rt_put(rt); | ||
528 | goto tx_error; | ||
529 | } | ||
530 | |||
531 | if (skb_dst(skb)) | ||
532 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); | ||
533 | |||
534 | if ((old_iph->frag_off & htons(IP_DF)) && | ||
535 | mtu < ntohs(old_iph->tot_len)) { | ||
536 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, | ||
537 | htonl(mtu)); | ||
538 | ip_rt_put(rt); | ||
539 | goto tx_error; | ||
540 | } | ||
541 | } | ||
542 | |||
543 | if (tunnel->err_count > 0) { | ||
544 | if (time_before(jiffies, | ||
545 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { | ||
546 | tunnel->err_count--; | ||
547 | dst_link_failure(skb); | ||
548 | } else | ||
549 | tunnel->err_count = 0; | ||
550 | } | ||
551 | |||
552 | /* | ||
553 | * Okay, now see if we can stuff it in the buffer as-is. | ||
554 | */ | ||
555 | max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); | ||
556 | |||
557 | if (skb_headroom(skb) < max_headroom || skb_shared(skb) || | ||
558 | (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { | ||
559 | struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); | ||
560 | if (!new_skb) { | ||
561 | ip_rt_put(rt); | ||
562 | dev->stats.tx_dropped++; | ||
563 | dev_kfree_skb(skb); | ||
564 | return NETDEV_TX_OK; | ||
565 | } | ||
566 | if (skb->sk) | ||
567 | skb_set_owner_w(new_skb, skb->sk); | ||
568 | dev_kfree_skb(skb); | ||
569 | skb = new_skb; | ||
570 | old_iph = ip_hdr(skb); | ||
571 | } | ||
572 | |||
573 | skb->transport_header = skb->network_header; | ||
574 | skb_push(skb, sizeof(struct iphdr)); | ||
575 | skb_reset_network_header(skb); | ||
576 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | ||
577 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | ||
578 | IPSKB_REROUTED); | ||
579 | skb_dst_drop(skb); | ||
580 | skb_dst_set(skb, &rt->dst); | ||
581 | |||
582 | /* | ||
583 | * Push down and install the IPIP header. | ||
584 | */ | ||
585 | |||
586 | iph = ip_hdr(skb); | ||
587 | iph->version = 4; | ||
588 | iph->ihl = sizeof(struct iphdr)>>2; | ||
589 | iph->frag_off = df; | ||
590 | iph->protocol = IPPROTO_IPIP; | ||
591 | iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); | ||
592 | iph->daddr = fl4.daddr; | ||
593 | iph->saddr = fl4.saddr; | ||
594 | |||
595 | if ((iph->ttl = tiph->ttl) == 0) | ||
596 | iph->ttl = old_iph->ttl; | ||
597 | |||
598 | iptunnel_xmit(skb, dev); | ||
599 | return NETDEV_TX_OK; | 226 | return NETDEV_TX_OK; |
600 | 227 | ||
601 | tx_error_icmp: | ||
602 | dst_link_failure(skb); | ||
603 | tx_error: | 228 | tx_error: |
604 | dev->stats.tx_errors++; | 229 | dev->stats.tx_errors++; |
605 | dev_kfree_skb(skb); | 230 | dev_kfree_skb(skb); |
606 | return NETDEV_TX_OK; | 231 | return NETDEV_TX_OK; |
607 | } | 232 | } |
608 | 233 | ||
609 | static void ipip_tunnel_bind_dev(struct net_device *dev) | ||
610 | { | ||
611 | struct net_device *tdev = NULL; | ||
612 | struct ip_tunnel *tunnel; | ||
613 | const struct iphdr *iph; | ||
614 | |||
615 | tunnel = netdev_priv(dev); | ||
616 | iph = &tunnel->parms.iph; | ||
617 | |||
618 | if (iph->daddr) { | ||
619 | struct rtable *rt; | ||
620 | struct flowi4 fl4; | ||
621 | |||
622 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, | ||
623 | iph->daddr, iph->saddr, | ||
624 | 0, 0, | ||
625 | IPPROTO_IPIP, | ||
626 | RT_TOS(iph->tos), | ||
627 | tunnel->parms.link); | ||
628 | if (!IS_ERR(rt)) { | ||
629 | tdev = rt->dst.dev; | ||
630 | ip_rt_put(rt); | ||
631 | } | ||
632 | dev->flags |= IFF_POINTOPOINT; | ||
633 | } | ||
634 | |||
635 | if (!tdev && tunnel->parms.link) | ||
636 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | ||
637 | |||
638 | if (tdev) { | ||
639 | dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); | ||
640 | dev->mtu = tdev->mtu - sizeof(struct iphdr); | ||
641 | } | ||
642 | dev->iflink = tunnel->parms.link; | ||
643 | } | ||
644 | |||
645 | static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) | ||
646 | { | ||
647 | struct net *net = dev_net(t->dev); | ||
648 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
649 | |||
650 | ipip_tunnel_unlink(ipn, t); | ||
651 | synchronize_net(); | ||
652 | t->parms.iph.saddr = p->iph.saddr; | ||
653 | t->parms.iph.daddr = p->iph.daddr; | ||
654 | memcpy(t->dev->dev_addr, &p->iph.saddr, 4); | ||
655 | memcpy(t->dev->broadcast, &p->iph.daddr, 4); | ||
656 | ipip_tunnel_link(ipn, t); | ||
657 | t->parms.iph.ttl = p->iph.ttl; | ||
658 | t->parms.iph.tos = p->iph.tos; | ||
659 | t->parms.iph.frag_off = p->iph.frag_off; | ||
660 | if (t->parms.link != p->link) { | ||
661 | t->parms.link = p->link; | ||
662 | ipip_tunnel_bind_dev(t->dev); | ||
663 | } | ||
664 | netdev_state_change(t->dev); | ||
665 | } | ||
666 | |||
667 | static int | 234 | static int |
668 | ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | 235 | ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) |
669 | { | 236 | { |
670 | int err = 0; | 237 | int err = 0; |
671 | struct ip_tunnel_parm p; | 238 | struct ip_tunnel_parm p; |
672 | struct ip_tunnel *t; | ||
673 | struct net *net = dev_net(dev); | ||
674 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
675 | |||
676 | switch (cmd) { | ||
677 | case SIOCGETTUNNEL: | ||
678 | t = NULL; | ||
679 | if (dev == ipn->fb_tunnel_dev) { | ||
680 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { | ||
681 | err = -EFAULT; | ||
682 | break; | ||
683 | } | ||
684 | t = ipip_tunnel_locate(net, &p, 0); | ||
685 | } | ||
686 | if (t == NULL) | ||
687 | t = netdev_priv(dev); | ||
688 | memcpy(&p, &t->parms, sizeof(p)); | ||
689 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | ||
690 | err = -EFAULT; | ||
691 | break; | ||
692 | |||
693 | case SIOCADDTUNNEL: | ||
694 | case SIOCCHGTUNNEL: | ||
695 | err = -EPERM; | ||
696 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
697 | goto done; | ||
698 | |||
699 | err = -EFAULT; | ||
700 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | ||
701 | goto done; | ||
702 | |||
703 | err = -EINVAL; | ||
704 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || | ||
705 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) | ||
706 | goto done; | ||
707 | if (p.iph.ttl) | ||
708 | p.iph.frag_off |= htons(IP_DF); | ||
709 | |||
710 | t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | ||
711 | |||
712 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | ||
713 | if (t != NULL) { | ||
714 | if (t->dev != dev) { | ||
715 | err = -EEXIST; | ||
716 | break; | ||
717 | } | ||
718 | } else { | ||
719 | if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || | ||
720 | (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { | ||
721 | err = -EINVAL; | ||
722 | break; | ||
723 | } | ||
724 | t = netdev_priv(dev); | ||
725 | } | ||
726 | |||
727 | ipip_tunnel_update(t, &p); | ||
728 | } | ||
729 | |||
730 | if (t) { | ||
731 | err = 0; | ||
732 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) | ||
733 | err = -EFAULT; | ||
734 | } else | ||
735 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | ||
736 | break; | ||
737 | |||
738 | case SIOCDELTUNNEL: | ||
739 | err = -EPERM; | ||
740 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
741 | goto done; | ||
742 | |||
743 | if (dev == ipn->fb_tunnel_dev) { | ||
744 | err = -EFAULT; | ||
745 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | ||
746 | goto done; | ||
747 | err = -ENOENT; | ||
748 | if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) | ||
749 | goto done; | ||
750 | err = -EPERM; | ||
751 | if (t->dev == ipn->fb_tunnel_dev) | ||
752 | goto done; | ||
753 | dev = t->dev; | ||
754 | } | ||
755 | unregister_netdevice(dev); | ||
756 | err = 0; | ||
757 | break; | ||
758 | 239 | ||
759 | default: | 240 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
760 | err = -EINVAL; | 241 | return -EFAULT; |
761 | } | ||
762 | |||
763 | done: | ||
764 | return err; | ||
765 | } | ||
766 | 242 | ||
767 | static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) | 243 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || |
768 | { | 244 | p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) |
769 | if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) | 245 | return -EINVAL; |
246 | if (p.i_key || p.o_key || p.i_flags || p.o_flags) | ||
770 | return -EINVAL; | 247 | return -EINVAL; |
771 | dev->mtu = new_mtu; | 248 | if (p.iph.ttl) |
249 | p.iph.frag_off |= htons(IP_DF); | ||
250 | |||
251 | err = ip_tunnel_ioctl(dev, &p, cmd); | ||
252 | if (err) | ||
253 | return err; | ||
254 | |||
255 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | ||
256 | return -EFAULT; | ||
257 | |||
772 | return 0; | 258 | return 0; |
773 | } | 259 | } |
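After the rewrite, ipip_tunnel_ioctl() keeps only the ipip-specific validation and then defers to the shared ip_tunnel_ioctl(): plain IPv4-in-IPv4 has no GRE-style keys or flags, so any non-zero i_key/o_key/i_flags/o_flags earns -EINVAL, and a non-zero TTL forces DF as before. A hedged userspace sketch of a conforming SIOCADDTUNNEL request (names and addresses are examples; CAP_NET_ADMIN is required):

	/* Create an ipip tunnel through the ioctl path shown above.
	 * Sketch only; addresses and device names are illustrative. */
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/if_tunnel.h>

	int main(void)
	{
		struct ip_tunnel_parm p;
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		memset(&p, 0, sizeof(p));
		strncpy(p.name, "ipip1", IFNAMSIZ - 1);
		p.iph.version  = 4;
		p.iph.ihl      = 5;
		p.iph.protocol = IPPROTO_IPIP;
		p.iph.ttl      = 64;	/* non-zero ttl forces DF, per above */
		p.iph.saddr    = inet_addr("192.0.2.1");
		p.iph.daddr    = inet_addr("192.0.2.2");
		/* i_key/o_key/i_flags/o_flags stay zero, or ipip says -EINVAL */

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ - 1); /* fallback device */
		ifr.ifr_ifru.ifru_data = (void *)&p;

		if (fd < 0 || ioctl(fd, SIOCADDTUNNEL, &ifr) < 0) {
			perror("SIOCADDTUNNEL");
			return 1;
		}
		return 0;
	}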
774 | 260 | ||
775 | static const struct net_device_ops ipip_netdev_ops = { | 261 | static const struct net_device_ops ipip_netdev_ops = { |
776 | .ndo_uninit = ipip_tunnel_uninit, | 262 | .ndo_init = ipip_tunnel_init, |
263 | .ndo_uninit = ip_tunnel_uninit, | ||
777 | .ndo_start_xmit = ipip_tunnel_xmit, | 264 | .ndo_start_xmit = ipip_tunnel_xmit, |
778 | .ndo_do_ioctl = ipip_tunnel_ioctl, | 265 | .ndo_do_ioctl = ipip_tunnel_ioctl, |
779 | .ndo_change_mtu = ipip_tunnel_change_mtu, | 266 | .ndo_change_mtu = ip_tunnel_change_mtu, |
780 | .ndo_get_stats64 = ipip_get_stats64, | 267 | .ndo_get_stats64 = ip_tunnel_get_stats64, |
781 | }; | 268 | }; |
782 | 269 | ||
783 | static void ipip_dev_free(struct net_device *dev) | ||
784 | { | ||
785 | free_percpu(dev->tstats); | ||
786 | free_netdev(dev); | ||
787 | } | ||
788 | |||
789 | #define IPIP_FEATURES (NETIF_F_SG | \ | 270 | #define IPIP_FEATURES (NETIF_F_SG | \ |
790 | NETIF_F_FRAGLIST | \ | 271 | NETIF_F_FRAGLIST | \ |
791 | NETIF_F_HIGHDMA | \ | 272 | NETIF_F_HIGHDMA | \ |
@@ -794,11 +275,8 @@ static void ipip_dev_free(struct net_device *dev) | |||
794 | static void ipip_tunnel_setup(struct net_device *dev) | 275 | static void ipip_tunnel_setup(struct net_device *dev) |
795 | { | 276 | { |
796 | dev->netdev_ops = &ipip_netdev_ops; | 277 | dev->netdev_ops = &ipip_netdev_ops; |
797 | dev->destructor = ipip_dev_free; | ||
798 | 278 | ||
799 | dev->type = ARPHRD_TUNNEL; | 279 | dev->type = ARPHRD_TUNNEL; |
800 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | ||
801 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); | ||
802 | dev->flags = IFF_NOARP; | 280 | dev->flags = IFF_NOARP; |
803 | dev->iflink = 0; | 281 | dev->iflink = 0; |
804 | dev->addr_len = 4; | 282 | dev->addr_len = 4; |
@@ -808,46 +286,19 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
808 | 286 | ||
809 | dev->features |= IPIP_FEATURES; | 287 | dev->features |= IPIP_FEATURES; |
810 | dev->hw_features |= IPIP_FEATURES; | 288 | dev->hw_features |= IPIP_FEATURES; |
289 | ip_tunnel_setup(dev, ipip_net_id); | ||
811 | } | 290 | } |
812 | 291 | ||
813 | static int ipip_tunnel_init(struct net_device *dev) | 292 | static int ipip_tunnel_init(struct net_device *dev) |
814 | { | 293 | { |
815 | struct ip_tunnel *tunnel = netdev_priv(dev); | 294 | struct ip_tunnel *tunnel = netdev_priv(dev); |
816 | 295 | ||
817 | tunnel->dev = dev; | ||
818 | |||
819 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | 296 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); |
820 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | 297 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); |
821 | 298 | ||
822 | ipip_tunnel_bind_dev(dev); | 299 | tunnel->hlen = 0; |
823 | 300 | tunnel->parms.iph.protocol = IPPROTO_IPIP; | |
824 | dev->tstats = alloc_percpu(struct pcpu_tstats); | 301 | return ip_tunnel_init(dev); |
825 | if (!dev->tstats) | ||
826 | return -ENOMEM; | ||
827 | |||
828 | return 0; | ||
829 | } | ||
830 | |||
831 | static int __net_init ipip_fb_tunnel_init(struct net_device *dev) | ||
832 | { | ||
833 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
834 | struct iphdr *iph = &tunnel->parms.iph; | ||
835 | struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); | ||
836 | |||
837 | tunnel->dev = dev; | ||
838 | strcpy(tunnel->parms.name, dev->name); | ||
839 | |||
840 | iph->version = 4; | ||
841 | iph->protocol = IPPROTO_IPIP; | ||
842 | iph->ihl = 5; | ||
843 | |||
844 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
845 | if (!dev->tstats) | ||
846 | return -ENOMEM; | ||
847 | |||
848 | dev_hold(dev); | ||
849 | rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); | ||
850 | return 0; | ||
851 | } | 302 | } |
852 | 303 | ||
853 | static void ipip_netlink_parms(struct nlattr *data[], | 304 | static void ipip_netlink_parms(struct nlattr *data[], |
@@ -887,28 +338,16 @@ static void ipip_netlink_parms(struct nlattr *data[], | |||
887 | static int ipip_newlink(struct net *src_net, struct net_device *dev, | 338 | static int ipip_newlink(struct net *src_net, struct net_device *dev, |
888 | struct nlattr *tb[], struct nlattr *data[]) | 339 | struct nlattr *tb[], struct nlattr *data[]) |
889 | { | 340 | { |
890 | struct net *net = dev_net(dev); | 341 | struct ip_tunnel_parm p; |
891 | struct ip_tunnel *nt; | ||
892 | |||
893 | nt = netdev_priv(dev); | ||
894 | ipip_netlink_parms(data, &nt->parms); | ||
895 | |||
896 | if (ipip_tunnel_locate(net, &nt->parms, 0)) | ||
897 | return -EEXIST; | ||
898 | 342 | ||
899 | return ipip_tunnel_create(dev); | 343 | ipip_netlink_parms(data, &p); |
344 | return ip_tunnel_newlink(dev, tb, &p); | ||
900 | } | 345 | } |
901 | 346 | ||
902 | static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], | 347 | static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], |
903 | struct nlattr *data[]) | 348 | struct nlattr *data[]) |
904 | { | 349 | { |
905 | struct ip_tunnel *t; | ||
906 | struct ip_tunnel_parm p; | 350 | struct ip_tunnel_parm p; |
907 | struct net *net = dev_net(dev); | ||
908 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
909 | |||
910 | if (dev == ipn->fb_tunnel_dev) | ||
911 | return -EINVAL; | ||
912 | 351 | ||
913 | ipip_netlink_parms(data, &p); | 352 | ipip_netlink_parms(data, &p); |
914 | 353 | ||
@@ -916,16 +355,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], | |||
916 | (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) | 355 | (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) |
917 | return -EINVAL; | 356 | return -EINVAL; |
918 | 357 | ||
919 | t = ipip_tunnel_locate(net, &p, 0); | 358 | return ip_tunnel_changelink(dev, tb, &p); |
920 | |||
921 | if (t) { | ||
922 | if (t->dev != dev) | ||
923 | return -EEXIST; | ||
924 | } else | ||
925 | t = netdev_priv(dev); | ||
926 | |||
927 | ipip_tunnel_update(t, &p); | ||
928 | return 0; | ||
929 | } | 359 | } |
930 | 360 | ||
931 | static size_t ipip_get_size(const struct net_device *dev) | 361 | static size_t ipip_get_size(const struct net_device *dev) |
@@ -982,6 +412,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = { | |||
982 | .setup = ipip_tunnel_setup, | 412 | .setup = ipip_tunnel_setup, |
983 | .newlink = ipip_newlink, | 413 | .newlink = ipip_newlink, |
984 | .changelink = ipip_changelink, | 414 | .changelink = ipip_changelink, |
415 | .dellink = ip_tunnel_dellink, | ||
985 | .get_size = ipip_get_size, | 416 | .get_size = ipip_get_size, |
986 | .fill_info = ipip_fill_info, | 417 | .fill_info = ipip_fill_info, |
987 | }; | 418 | }; |
@@ -992,90 +423,29 @@ static struct xfrm_tunnel ipip_handler __read_mostly = { | |||
992 | .priority = 1, | 423 | .priority = 1, |
993 | }; | 424 | }; |
994 | 425 | ||
995 | static const char banner[] __initconst = | ||
996 | KERN_INFO "IPv4 over IPv4 tunneling driver\n"; | ||
997 | |||
998 | static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) | ||
999 | { | ||
1000 | int prio; | ||
1001 | |||
1002 | for (prio = 1; prio < 4; prio++) { | ||
1003 | int h; | ||
1004 | for (h = 0; h < HASH_SIZE; h++) { | ||
1005 | struct ip_tunnel *t; | ||
1006 | |||
1007 | t = rtnl_dereference(ipn->tunnels[prio][h]); | ||
1008 | while (t != NULL) { | ||
1009 | unregister_netdevice_queue(t->dev, head); | ||
1010 | t = rtnl_dereference(t->next); | ||
1011 | } | ||
1012 | } | ||
1013 | } | ||
1014 | } | ||
1015 | |||
1016 | static int __net_init ipip_init_net(struct net *net) | 426 | static int __net_init ipip_init_net(struct net *net) |
1017 | { | 427 | { |
1018 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 428 | return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); |
1019 | struct ip_tunnel *t; | ||
1020 | int err; | ||
1021 | |||
1022 | ipn->tunnels[0] = ipn->tunnels_wc; | ||
1023 | ipn->tunnels[1] = ipn->tunnels_l; | ||
1024 | ipn->tunnels[2] = ipn->tunnels_r; | ||
1025 | ipn->tunnels[3] = ipn->tunnels_r_l; | ||
1026 | |||
1027 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), | ||
1028 | "tunl0", | ||
1029 | ipip_tunnel_setup); | ||
1030 | if (!ipn->fb_tunnel_dev) { | ||
1031 | err = -ENOMEM; | ||
1032 | goto err_alloc_dev; | ||
1033 | } | ||
1034 | dev_net_set(ipn->fb_tunnel_dev, net); | ||
1035 | |||
1036 | err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); | ||
1037 | if (err) | ||
1038 | goto err_reg_dev; | ||
1039 | |||
1040 | if ((err = register_netdev(ipn->fb_tunnel_dev))) | ||
1041 | goto err_reg_dev; | ||
1042 | |||
1043 | t = netdev_priv(ipn->fb_tunnel_dev); | ||
1044 | |||
1045 | strcpy(t->parms.name, ipn->fb_tunnel_dev->name); | ||
1046 | return 0; | ||
1047 | |||
1048 | err_reg_dev: | ||
1049 | ipip_dev_free(ipn->fb_tunnel_dev); | ||
1050 | err_alloc_dev: | ||
1051 | /* nothing */ | ||
1052 | return err; | ||
1053 | } | 429 | } |
1054 | 430 | ||
1055 | static void __net_exit ipip_exit_net(struct net *net) | 431 | static void __net_exit ipip_exit_net(struct net *net) |
1056 | { | 432 | { |
1057 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 433 | struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); |
1058 | LIST_HEAD(list); | 434 | ip_tunnel_delete_net(itn); |
1059 | |||
1060 | rtnl_lock(); | ||
1061 | ipip_destroy_tunnels(ipn, &list); | ||
1062 | unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); | ||
1063 | unregister_netdevice_many(&list); | ||
1064 | rtnl_unlock(); | ||
1065 | } | 435 | } |
1066 | 436 | ||
1067 | static struct pernet_operations ipip_net_ops = { | 437 | static struct pernet_operations ipip_net_ops = { |
1068 | .init = ipip_init_net, | 438 | .init = ipip_init_net, |
1069 | .exit = ipip_exit_net, | 439 | .exit = ipip_exit_net, |
1070 | .id = &ipip_net_id, | 440 | .id = &ipip_net_id, |
1071 | .size = sizeof(struct ipip_net), | 441 | .size = sizeof(struct ip_tunnel_net), |
1072 | }; | 442 | }; |
1073 | 443 | ||
1074 | static int __init ipip_init(void) | 444 | static int __init ipip_init(void) |
1075 | { | 445 | { |
1076 | int err; | 446 | int err; |
1077 | 447 | ||
1078 | printk(banner); | 448 | pr_info("ipip: IPv4 over IPv4 tunneling driver\n"); |
1079 | 449 | ||
1080 | err = register_pernet_device(&ipip_net_ops); | 450 | err = register_pernet_device(&ipip_net_ops); |
1081 | if (err < 0) | 451 | if (err < 0) |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5f95b3aa579e..fd61fe16679f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -61,7 +61,7 @@ | |||
61 | #include <linux/netfilter_ipv4.h> | 61 | #include <linux/netfilter_ipv4.h> |
62 | #include <linux/compat.h> | 62 | #include <linux/compat.h> |
63 | #include <linux/export.h> | 63 | #include <linux/export.h> |
64 | #include <net/ipip.h> | 64 | #include <net/ip_tunnels.h> |
65 | #include <net/checksum.h> | 65 | #include <net/checksum.h> |
66 | #include <net/netlink.h> | 66 | #include <net/netlink.h> |
67 | #include <net/fib_rules.h> | 67 | #include <net/fib_rules.h> |
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 79ca5e70d497..eadab1ed6500 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c | |||
@@ -48,9 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net) | |||
48 | net->ipv4.arptable_filter = | 48 | net->ipv4.arptable_filter = |
49 | arpt_register_table(net, &packet_filter, repl); | 49 | arpt_register_table(net, &packet_filter, repl); |
50 | kfree(repl); | 50 | kfree(repl); |
51 | if (IS_ERR(net->ipv4.arptable_filter)) | 51 | return PTR_RET(net->ipv4.arptable_filter); |
52 | return PTR_ERR(net->ipv4.arptable_filter); | ||
53 | return 0; | ||
54 | } | 52 | } |
55 | 53 | ||
56 | static void __net_exit arptable_filter_net_exit(struct net *net) | 54 | static void __net_exit arptable_filter_net_exit(struct net *net) |
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 32030a24e776..b6f2ea174898 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -224,6 +224,8 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
224 | SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), | 224 | SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), |
225 | SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), | 225 | SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), |
226 | SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), | 226 | SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), |
227 | SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), | ||
228 | SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), | ||
227 | SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), | 229 | SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), |
228 | SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), | 230 | SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), |
229 | SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), | 231 | SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6e2851464f8f..550781a17b34 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -2311,7 +2311,7 @@ nla_put_failure: | |||
2311 | return -EMSGSIZE; | 2311 | return -EMSGSIZE; |
2312 | } | 2312 | } |
2313 | 2313 | ||
2314 | static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) | 2314 | static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) |
2315 | { | 2315 | { |
2316 | struct net *net = sock_net(in_skb->sk); | 2316 | struct net *net = sock_net(in_skb->sk); |
2317 | struct rtmsg *rtm; | 2317 | struct rtmsg *rtm; |
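The dropped void *arg follows the companion rtnetlink change in this series that removes the unused attribute argument from all netlink doit handlers; assuming that change, the callback type in include/net/rtnetlink.h becomes:

    typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *);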
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index ef54377fb11c..7f4a5cb8f8d0 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -267,7 +267,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
267 | struct ip_options *opt) | 267 | struct ip_options *opt) |
268 | { | 268 | { |
269 | struct tcp_options_received tcp_opt; | 269 | struct tcp_options_received tcp_opt; |
270 | const u8 *hash_location; | ||
271 | struct inet_request_sock *ireq; | 270 | struct inet_request_sock *ireq; |
272 | struct tcp_request_sock *treq; | 271 | struct tcp_request_sock *treq; |
273 | struct tcp_sock *tp = tcp_sk(sk); | 272 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -294,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
294 | 293 | ||
295 | /* check for timestamp cookie support */ | 294 | /* check for timestamp cookie support */ |
296 | memset(&tcp_opt, 0, sizeof(tcp_opt)); | 295 | memset(&tcp_opt, 0, sizeof(tcp_opt)); |
297 | tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); | 296 | tcp_parse_options(skb, &tcp_opt, 0, NULL); |
298 | 297 | ||
299 | if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) | 298 | if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) |
300 | goto out; | 299 | goto out; |
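With TCP cookie transactions removed, every tcp_parse_options() caller drops the hash_location out-parameter. The new prototype (also visible in the tcp_input.c hunk further down):

    void tcp_parse_options(const struct sk_buff *skb,
                           struct tcp_options_received *opt_rx,
                           int estab, struct tcp_fastopen_cookie *foc);

Callers that only need timestamps, as here, pass estab = 0 and foc = NULL.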
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29d9b8e..fa2f63fc453b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,7 +28,7 @@ | |||
28 | 28 | ||
29 | static int zero; | 29 | static int zero; |
30 | static int one = 1; | 30 | static int one = 1; |
31 | static int two = 2; | 31 | static int four = 4; |
32 | static int tcp_retr1_max = 255; | 32 | static int tcp_retr1_max = 255; |
33 | static int ip_local_port_range_min[] = { 1, 1 }; | 33 | static int ip_local_port_range_min[] = { 1, 1 }; |
34 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 34 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
@@ -592,13 +592,6 @@ static struct ctl_table ipv4_table[] = { | |||
592 | .proc_handler = proc_dointvec | 592 | .proc_handler = proc_dointvec |
593 | }, | 593 | }, |
594 | { | 594 | { |
595 | .procname = "tcp_frto_response", | ||
596 | .data = &sysctl_tcp_frto_response, | ||
597 | .maxlen = sizeof(int), | ||
598 | .mode = 0644, | ||
599 | .proc_handler = proc_dointvec | ||
600 | }, | ||
601 | { | ||
602 | .procname = "tcp_low_latency", | 595 | .procname = "tcp_low_latency", |
603 | .data = &sysctl_tcp_low_latency, | 596 | .data = &sysctl_tcp_low_latency, |
604 | .maxlen = sizeof(int), | 597 | .maxlen = sizeof(int), |
@@ -733,13 +726,6 @@ static struct ctl_table ipv4_table[] = { | |||
733 | .proc_handler = proc_dointvec, | 726 | .proc_handler = proc_dointvec, |
734 | }, | 727 | }, |
735 | { | 728 | { |
736 | .procname = "tcp_cookie_size", | ||
737 | .data = &sysctl_tcp_cookie_size, | ||
738 | .maxlen = sizeof(int), | ||
739 | .mode = 0644, | ||
740 | .proc_handler = proc_dointvec | ||
741 | }, | ||
742 | { | ||
743 | .procname = "tcp_thin_linear_timeouts", | 729 | .procname = "tcp_thin_linear_timeouts", |
744 | .data = &sysctl_tcp_thin_linear_timeouts, | 730 | .data = &sysctl_tcp_thin_linear_timeouts, |
745 | .maxlen = sizeof(int), | 731 | .maxlen = sizeof(int), |
@@ -760,7 +746,7 @@ static struct ctl_table ipv4_table[] = { | |||
760 | .mode = 0644, | 746 | .mode = 0644, |
761 | .proc_handler = proc_dointvec_minmax, | 747 | .proc_handler = proc_dointvec_minmax, |
762 | .extra1 = &zero, | 748 | .extra1 = &zero, |
763 | .extra2 = &two, | 749 | .extra2 = &four, |
764 | }, | 750 | }, |
765 | { | 751 | { |
766 | .procname = "udp_mem", | 752 | .procname = "udp_mem", |
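The zero/four bounds above belong to the net.ipv4.tcp_early_retrans entry (its .procname line falls just outside the hunk); raising extra2 admits the new tail-loss-probe modes. Reconstructed entry, assuming this is indeed that row, with the documented value meanings as comments:

    {
            .procname       = "tcp_early_retrans",
            .data           = &sysctl_tcp_early_retrans,
            .maxlen         = sizeof(int),
            .mode           = 0644,
            .proc_handler   = proc_dointvec_minmax,  /* clamps writes to [extra1, extra2] */
            .extra1         = &zero,    /* 0: disabled */
            .extra2         = &four,    /* 1: ER, 2: delayed ER, 3: ER + TLP, 4: TLP only */
    },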
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e22020790709..a96f7b586277 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -409,15 +409,6 @@ void tcp_init_sock(struct sock *sk) | |||
409 | 409 | ||
410 | icsk->icsk_sync_mss = tcp_sync_mss; | 410 | icsk->icsk_sync_mss = tcp_sync_mss; |
411 | 411 | ||
412 | /* TCP Cookie Transactions */ | ||
413 | if (sysctl_tcp_cookie_size > 0) { | ||
414 | /* Default, cookies without s_data_payload. */ | ||
415 | tp->cookie_values = | ||
416 | kzalloc(sizeof(*tp->cookie_values), | ||
417 | sk->sk_allocation); | ||
418 | if (tp->cookie_values != NULL) | ||
419 | kref_init(&tp->cookie_values->kref); | ||
420 | } | ||
421 | /* Presumed zeroed, in order of appearance: | 412 | /* Presumed zeroed, in order of appearance: |
422 | * cookie_in_always, cookie_out_never, | 413 | * cookie_in_always, cookie_out_never, |
423 | * s_data_constant, s_data_in, s_data_out | 414 | * s_data_constant, s_data_in, s_data_out |
@@ -2397,92 +2388,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2397 | release_sock(sk); | 2388 | release_sock(sk); |
2398 | return err; | 2389 | return err; |
2399 | } | 2390 | } |
2400 | case TCP_COOKIE_TRANSACTIONS: { | ||
2401 | struct tcp_cookie_transactions ctd; | ||
2402 | struct tcp_cookie_values *cvp = NULL; | ||
2403 | |||
2404 | if (sizeof(ctd) > optlen) | ||
2405 | return -EINVAL; | ||
2406 | if (copy_from_user(&ctd, optval, sizeof(ctd))) | ||
2407 | return -EFAULT; | ||
2408 | |||
2409 | if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || | ||
2410 | ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) | ||
2411 | return -EINVAL; | ||
2412 | |||
2413 | if (ctd.tcpct_cookie_desired == 0) { | ||
2414 | /* default to global value */ | ||
2415 | } else if ((0x1 & ctd.tcpct_cookie_desired) || | ||
2416 | ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || | ||
2417 | ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { | ||
2418 | return -EINVAL; | ||
2419 | } | ||
2420 | |||
2421 | if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { | ||
2422 | /* Supersedes all other values */ | ||
2423 | lock_sock(sk); | ||
2424 | if (tp->cookie_values != NULL) { | ||
2425 | kref_put(&tp->cookie_values->kref, | ||
2426 | tcp_cookie_values_release); | ||
2427 | tp->cookie_values = NULL; | ||
2428 | } | ||
2429 | tp->rx_opt.cookie_in_always = 0; /* false */ | ||
2430 | tp->rx_opt.cookie_out_never = 1; /* true */ | ||
2431 | release_sock(sk); | ||
2432 | return err; | ||
2433 | } | ||
2434 | |||
2435 | /* Allocate ancillary memory before locking. | ||
2436 | */ | ||
2437 | if (ctd.tcpct_used > 0 || | ||
2438 | (tp->cookie_values == NULL && | ||
2439 | (sysctl_tcp_cookie_size > 0 || | ||
2440 | ctd.tcpct_cookie_desired > 0 || | ||
2441 | ctd.tcpct_s_data_desired > 0))) { | ||
2442 | cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, | ||
2443 | GFP_KERNEL); | ||
2444 | if (cvp == NULL) | ||
2445 | return -ENOMEM; | ||
2446 | |||
2447 | kref_init(&cvp->kref); | ||
2448 | } | ||
2449 | lock_sock(sk); | ||
2450 | tp->rx_opt.cookie_in_always = | ||
2451 | (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); | ||
2452 | tp->rx_opt.cookie_out_never = 0; /* false */ | ||
2453 | |||
2454 | if (tp->cookie_values != NULL) { | ||
2455 | if (cvp != NULL) { | ||
2456 | /* Changed values are recorded by a changed | ||
2457 | * pointer, ensuring the cookie will differ, | ||
2458 | * without separately hashing each value later. | ||
2459 | */ | ||
2460 | kref_put(&tp->cookie_values->kref, | ||
2461 | tcp_cookie_values_release); | ||
2462 | } else { | ||
2463 | cvp = tp->cookie_values; | ||
2464 | } | ||
2465 | } | ||
2466 | |||
2467 | if (cvp != NULL) { | ||
2468 | cvp->cookie_desired = ctd.tcpct_cookie_desired; | ||
2469 | |||
2470 | if (ctd.tcpct_used > 0) { | ||
2471 | memcpy(cvp->s_data_payload, ctd.tcpct_value, | ||
2472 | ctd.tcpct_used); | ||
2473 | cvp->s_data_desired = ctd.tcpct_used; | ||
2474 | cvp->s_data_constant = 1; /* true */ | ||
2475 | } else { | ||
2476 | /* No constant payload data. */ | ||
2477 | cvp->s_data_desired = ctd.tcpct_s_data_desired; | ||
2478 | cvp->s_data_constant = 0; /* false */ | ||
2479 | } | ||
2480 | |||
2481 | tp->cookie_values = cvp; | ||
2482 | } | ||
2483 | release_sock(sk); | ||
2484 | return err; | ||
2485 | } | ||
2486 | default: | 2391 | default: |
2487 | /* fallthru */ | 2392 | /* fallthru */ |
2488 | break; | 2393 | break; |
@@ -2902,41 +2807,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
2902 | return -EFAULT; | 2807 | return -EFAULT; |
2903 | return 0; | 2808 | return 0; |
2904 | 2809 | ||
2905 | case TCP_COOKIE_TRANSACTIONS: { | ||
2906 | struct tcp_cookie_transactions ctd; | ||
2907 | struct tcp_cookie_values *cvp = tp->cookie_values; | ||
2908 | |||
2909 | if (get_user(len, optlen)) | ||
2910 | return -EFAULT; | ||
2911 | if (len < sizeof(ctd)) | ||
2912 | return -EINVAL; | ||
2913 | |||
2914 | memset(&ctd, 0, sizeof(ctd)); | ||
2915 | ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? | ||
2916 | TCP_COOKIE_IN_ALWAYS : 0) | ||
2917 | | (tp->rx_opt.cookie_out_never ? | ||
2918 | TCP_COOKIE_OUT_NEVER : 0); | ||
2919 | |||
2920 | if (cvp != NULL) { | ||
2921 | ctd.tcpct_flags |= (cvp->s_data_in ? | ||
2922 | TCP_S_DATA_IN : 0) | ||
2923 | | (cvp->s_data_out ? | ||
2924 | TCP_S_DATA_OUT : 0); | ||
2925 | |||
2926 | ctd.tcpct_cookie_desired = cvp->cookie_desired; | ||
2927 | ctd.tcpct_s_data_desired = cvp->s_data_desired; | ||
2928 | |||
2929 | memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], | ||
2930 | cvp->cookie_pair_size); | ||
2931 | ctd.tcpct_used = cvp->cookie_pair_size; | ||
2932 | } | ||
2933 | |||
2934 | if (put_user(sizeof(ctd), optlen)) | ||
2935 | return -EFAULT; | ||
2936 | if (copy_to_user(optval, &ctd, sizeof(ctd))) | ||
2937 | return -EFAULT; | ||
2938 | return 0; | ||
2939 | } | ||
2940 | case TCP_THIN_LINEAR_TIMEOUTS: | 2810 | case TCP_THIN_LINEAR_TIMEOUTS: |
2941 | val = tp->thin_lto; | 2811 | val = tp->thin_lto; |
2942 | break; | 2812 | break; |
@@ -3044,6 +2914,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, | |||
3044 | SKB_GSO_TCP_ECN | | 2914 | SKB_GSO_TCP_ECN | |
3045 | SKB_GSO_TCPV6 | | 2915 | SKB_GSO_TCPV6 | |
3046 | SKB_GSO_GRE | | 2916 | SKB_GSO_GRE | |
2917 | SKB_GSO_UDP_TUNNEL | | ||
3047 | 0) || | 2918 | 0) || |
3048 | !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) | 2919 | !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) |
3049 | goto out; | 2920 | goto out; |
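Adding SKB_GSO_UDP_TUNNEL here lets TCP's software GSO fallback accept segments built by UDP-encapsulating tunnel drivers instead of rejecting them at this feature check. Illustratively (not from this hunk), such a driver tags packets in its transmit path:

    /* e.g. in a VXLAN-style xmit routine, after pushing the outer
     * UDP header (illustrative fragment):
     */
    skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;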
@@ -3408,134 +3279,6 @@ EXPORT_SYMBOL(tcp_md5_hash_key); | |||
3408 | 3279 | ||
3409 | #endif | 3280 | #endif |
3410 | 3281 | ||
3411 | /* Each Responder maintains up to two secret values concurrently for | ||
3412 | * efficient secret rollover. Each secret value has 4 states: | ||
3413 | * | ||
3414 | * Generating. (tcp_secret_generating != tcp_secret_primary) | ||
3415 | * Generates new Responder-Cookies, but not yet used for primary | ||
3416 | * verification. This is a short-term state, typically lasting only | ||
3417 | * one round trip time (RTT). | ||
3418 | * | ||
3419 | * Primary. (tcp_secret_generating == tcp_secret_primary) | ||
3420 | * Used both for generation and primary verification. | ||
3421 | * | ||
3422 | * Retiring. (tcp_secret_retiring != tcp_secret_secondary) | ||
3423 | * Used for verification, until the first failure that can be | ||
3424 | * verified by the newer Generating secret. At that time, this | ||
3425 | * cookie's state is changed to Secondary, and the Generating | ||
3426 | * cookie's state is changed to Primary. This is a short-term state, | ||
3427 | * typically lasting only one round trip time (RTT). | ||
3428 | * | ||
3429 | * Secondary. (tcp_secret_retiring == tcp_secret_secondary) | ||
3430 | * Used for secondary verification, after primary verification | ||
3431 | * failures. This state lasts no more than twice the Maximum Segment | ||
3432 | * Lifetime (2MSL). Then, the secret is discarded. | ||
3433 | */ | ||
3434 | struct tcp_cookie_secret { | ||
3435 | /* The secret is divided into two parts. The digest part is the | ||
3436 | * equivalent of previously hashing a secret and saving the state, | ||
3437 | * and serves as an initialization vector (IV). The message part | ||
3438 | * serves as the trailing secret. | ||
3439 | */ | ||
3440 | u32 secrets[COOKIE_WORKSPACE_WORDS]; | ||
3441 | unsigned long expires; | ||
3442 | }; | ||
3443 | |||
3444 | #define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) | ||
3445 | #define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) | ||
3446 | #define TCP_SECRET_LIFE (HZ * 600) | ||
3447 | |||
3448 | static struct tcp_cookie_secret tcp_secret_one; | ||
3449 | static struct tcp_cookie_secret tcp_secret_two; | ||
3450 | |||
3451 | /* Essentially a circular list, without dynamic allocation. */ | ||
3452 | static struct tcp_cookie_secret *tcp_secret_generating; | ||
3453 | static struct tcp_cookie_secret *tcp_secret_primary; | ||
3454 | static struct tcp_cookie_secret *tcp_secret_retiring; | ||
3455 | static struct tcp_cookie_secret *tcp_secret_secondary; | ||
3456 | |||
3457 | static DEFINE_SPINLOCK(tcp_secret_locker); | ||
3458 | |||
3459 | /* Select a pseudo-random word in the cookie workspace. | ||
3460 | */ | ||
3461 | static inline u32 tcp_cookie_work(const u32 *ws, const int n) | ||
3462 | { | ||
3463 | return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; | ||
3464 | } | ||
3465 | |||
3466 | /* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. | ||
3467 | * Called in softirq context. | ||
3468 | * Returns: 0 for success. | ||
3469 | */ | ||
3470 | int tcp_cookie_generator(u32 *bakery) | ||
3471 | { | ||
3472 | unsigned long jiffy = jiffies; | ||
3473 | |||
3474 | if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { | ||
3475 | spin_lock_bh(&tcp_secret_locker); | ||
3476 | if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { | ||
3477 | /* refreshed by another */ | ||
3478 | memcpy(bakery, | ||
3479 | &tcp_secret_generating->secrets[0], | ||
3480 | COOKIE_WORKSPACE_WORDS); | ||
3481 | } else { | ||
3482 | /* still needs refreshing */ | ||
3483 | get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); | ||
3484 | |||
3485 | /* The first time, paranoia assumes that the | ||
3486 | * randomization function isn't as strong. But, | ||
3487 | * this secret initialization is delayed until | ||
3488 | * the last possible moment (packet arrival). | ||
3489 | * Although that time is observable, it is | ||
3490 | * unpredictably variable. Mash in the most | ||
3491 | * volatile clock bits available, and expire the | ||
3492 | * secret extra quickly. | ||
3493 | */ | ||
3494 | if (unlikely(tcp_secret_primary->expires == | ||
3495 | tcp_secret_secondary->expires)) { | ||
3496 | struct timespec tv; | ||
3497 | |||
3498 | getnstimeofday(&tv); | ||
3499 | bakery[COOKIE_DIGEST_WORDS+0] ^= | ||
3500 | (u32)tv.tv_nsec; | ||
3501 | |||
3502 | tcp_secret_secondary->expires = jiffy | ||
3503 | + TCP_SECRET_1MSL | ||
3504 | + (0x0f & tcp_cookie_work(bakery, 0)); | ||
3505 | } else { | ||
3506 | tcp_secret_secondary->expires = jiffy | ||
3507 | + TCP_SECRET_LIFE | ||
3508 | + (0xff & tcp_cookie_work(bakery, 1)); | ||
3509 | tcp_secret_primary->expires = jiffy | ||
3510 | + TCP_SECRET_2MSL | ||
3511 | + (0x1f & tcp_cookie_work(bakery, 2)); | ||
3512 | } | ||
3513 | memcpy(&tcp_secret_secondary->secrets[0], | ||
3514 | bakery, COOKIE_WORKSPACE_WORDS); | ||
3515 | |||
3516 | rcu_assign_pointer(tcp_secret_generating, | ||
3517 | tcp_secret_secondary); | ||
3518 | rcu_assign_pointer(tcp_secret_retiring, | ||
3519 | tcp_secret_primary); | ||
3520 | /* | ||
3521 | * Neither call_rcu() nor synchronize_rcu() needed. | ||
3522 | * Retiring data is not freed. It is replaced after | ||
3523 | * further (locked) pointer updates, and a quiet time | ||
3524 | * (minimum 1MSL, maximum LIFE - 2MSL). | ||
3525 | */ | ||
3526 | } | ||
3527 | spin_unlock_bh(&tcp_secret_locker); | ||
3528 | } else { | ||
3529 | rcu_read_lock_bh(); | ||
3530 | memcpy(bakery, | ||
3531 | &rcu_dereference(tcp_secret_generating)->secrets[0], | ||
3532 | COOKIE_WORKSPACE_WORDS); | ||
3533 | rcu_read_unlock_bh(); | ||
3534 | } | ||
3535 | return 0; | ||
3536 | } | ||
3537 | EXPORT_SYMBOL(tcp_cookie_generator); | ||
3538 | |||
3539 | void tcp_done(struct sock *sk) | 3282 | void tcp_done(struct sock *sk) |
3540 | { | 3283 | { |
3541 | struct request_sock *req = tcp_sk(sk)->fastopen_rsk; | 3284 | struct request_sock *req = tcp_sk(sk)->fastopen_rsk; |
@@ -3590,7 +3333,6 @@ void __init tcp_init(void) | |||
3590 | unsigned long limit; | 3333 | unsigned long limit; |
3591 | int max_rshare, max_wshare, cnt; | 3334 | int max_rshare, max_wshare, cnt; |
3592 | unsigned int i; | 3335 | unsigned int i; |
3593 | unsigned long jiffy = jiffies; | ||
3594 | 3336 | ||
3595 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); | 3337 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); |
3596 | 3338 | ||
@@ -3666,13 +3408,5 @@ void __init tcp_init(void) | |||
3666 | 3408 | ||
3667 | tcp_register_congestion_control(&tcp_reno); | 3409 | tcp_register_congestion_control(&tcp_reno); |
3668 | 3410 | ||
3669 | memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); | ||
3670 | memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); | ||
3671 | tcp_secret_one.expires = jiffy; /* past due */ | ||
3672 | tcp_secret_two.expires = jiffy; /* past due */ | ||
3673 | tcp_secret_generating = &tcp_secret_one; | ||
3674 | tcp_secret_primary = &tcp_secret_one; | ||
3675 | tcp_secret_retiring = &tcp_secret_two; | ||
3676 | tcp_secret_secondary = &tcp_secret_two; | ||
3677 | tcp_tasklet_init(); | 3411 | tcp_tasklet_init(); |
3678 | } | 3412 | } |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3bd55bad230a..6d9ca35f0c35 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,12 +93,11 @@ int sysctl_tcp_stdurg __read_mostly; | |||
93 | int sysctl_tcp_rfc1337 __read_mostly; | 93 | int sysctl_tcp_rfc1337 __read_mostly; |
94 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; | 94 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
95 | int sysctl_tcp_frto __read_mostly = 2; | 95 | int sysctl_tcp_frto __read_mostly = 2; |
96 | int sysctl_tcp_frto_response __read_mostly; | ||
97 | 96 | ||
98 | int sysctl_tcp_thin_dupack __read_mostly; | 97 | int sysctl_tcp_thin_dupack __read_mostly; |
99 | 98 | ||
100 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 99 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
101 | int sysctl_tcp_early_retrans __read_mostly = 2; | 100 | int sysctl_tcp_early_retrans __read_mostly = 3; |
102 | 101 | ||
103 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 102 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
104 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 103 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
@@ -108,17 +107,15 @@ int sysctl_tcp_early_retrans __read_mostly = 2; | |||
108 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ | 107 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ |
109 | #define FLAG_ECE 0x40 /* ECE in this ACK */ | 108 | #define FLAG_ECE 0x40 /* ECE in this ACK */ |
110 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ | 109 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ |
111 | #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ | 110 | #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ |
112 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ | 111 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ |
113 | #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ | 112 | #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ |
114 | #define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ | ||
115 | #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ | 113 | #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ |
116 | 114 | ||
117 | #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) | 115 | #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) |
118 | #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) | 116 | #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) |
119 | #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) | 117 | #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) |
120 | #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) | 118 | #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) |
121 | #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) | ||
122 | 119 | ||
123 | #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) | 120 | #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) |
124 | #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) | 121 | #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) |
@@ -1159,10 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1159 | tcp_highest_sack_seq(tp))) | 1156 | tcp_highest_sack_seq(tp))) |
1160 | state->reord = min(fack_count, | 1157 | state->reord = min(fack_count, |
1161 | state->reord); | 1158 | state->reord); |
1162 | 1159 | if (!after(end_seq, tp->high_seq)) | |
1163 | /* SACK enhanced F-RTO (RFC4138; Appendix B) */ | 1160 | state->flag |= FLAG_ORIG_SACK_ACKED; |
1164 | if (!after(end_seq, tp->frto_highmark)) | ||
1165 | state->flag |= FLAG_ONLY_ORIG_SACKED; | ||
1166 | } | 1161 | } |
1167 | 1162 | ||
1168 | if (sacked & TCPCB_LOST) { | 1163 | if (sacked & TCPCB_LOST) { |
@@ -1555,7 +1550,6 @@ static int | |||
1555 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | 1550 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, |
1556 | u32 prior_snd_una) | 1551 | u32 prior_snd_una) |
1557 | { | 1552 | { |
1558 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1559 | struct tcp_sock *tp = tcp_sk(sk); | 1553 | struct tcp_sock *tp = tcp_sk(sk); |
1560 | const unsigned char *ptr = (skb_transport_header(ack_skb) + | 1554 | const unsigned char *ptr = (skb_transport_header(ack_skb) + |
1561 | TCP_SKB_CB(ack_skb)->sacked); | 1555 | TCP_SKB_CB(ack_skb)->sacked); |
@@ -1728,12 +1722,6 @@ walk: | |||
1728 | start_seq, end_seq, dup_sack); | 1722 | start_seq, end_seq, dup_sack); |
1729 | 1723 | ||
1730 | advance_sp: | 1724 | advance_sp: |
1731 | /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct | ||
1732 | * due to in-order walk | ||
1733 | */ | ||
1734 | if (after(end_seq, tp->frto_highmark)) | ||
1735 | state.flag &= ~FLAG_ONLY_ORIG_SACKED; | ||
1736 | |||
1737 | i++; | 1725 | i++; |
1738 | } | 1726 | } |
1739 | 1727 | ||
@@ -1750,8 +1738,7 @@ advance_sp: | |||
1750 | tcp_verify_left_out(tp); | 1738 | tcp_verify_left_out(tp); |
1751 | 1739 | ||
1752 | if ((state.reord < tp->fackets_out) && | 1740 | if ((state.reord < tp->fackets_out) && |
1753 | ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && | 1741 | ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) |
1754 | (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) | ||
1755 | tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); | 1742 | tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); |
1756 | 1743 | ||
1757 | out: | 1744 | out: |
@@ -1825,197 +1812,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) | |||
1825 | tp->sacked_out = 0; | 1812 | tp->sacked_out = 0; |
1826 | } | 1813 | } |
1827 | 1814 | ||
1828 | static int tcp_is_sackfrto(const struct tcp_sock *tp) | ||
1829 | { | ||
1830 | return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); | ||
1831 | } | ||
1832 | |||
1833 | /* F-RTO can only be used if TCP has never retransmitted anything other than | ||
1834 | * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) | ||
1835 | */ | ||
1836 | bool tcp_use_frto(struct sock *sk) | ||
1837 | { | ||
1838 | const struct tcp_sock *tp = tcp_sk(sk); | ||
1839 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1840 | struct sk_buff *skb; | ||
1841 | |||
1842 | if (!sysctl_tcp_frto) | ||
1843 | return false; | ||
1844 | |||
1845 | /* MTU probe and F-RTO won't really play nicely along currently */ | ||
1846 | if (icsk->icsk_mtup.probe_size) | ||
1847 | return false; | ||
1848 | |||
1849 | if (tcp_is_sackfrto(tp)) | ||
1850 | return true; | ||
1851 | |||
1852 | /* Avoid expensive walking of rexmit queue if possible */ | ||
1853 | if (tp->retrans_out > 1) | ||
1854 | return false; | ||
1855 | |||
1856 | skb = tcp_write_queue_head(sk); | ||
1857 | if (tcp_skb_is_last(sk, skb)) | ||
1858 | return true; | ||
1859 | skb = tcp_write_queue_next(sk, skb); /* Skips head */ | ||
1860 | tcp_for_write_queue_from(skb, sk) { | ||
1861 | if (skb == tcp_send_head(sk)) | ||
1862 | break; | ||
1863 | if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) | ||
1864 | return false; | ||
1865 | /* Short-circuit when first non-SACKed skb has been checked */ | ||
1866 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | ||
1867 | break; | ||
1868 | } | ||
1869 | return true; | ||
1870 | } | ||
1871 | |||
1872 | /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO | ||
1873 | * recovery a bit and use heuristics in tcp_process_frto() to detect if | ||
1874 | * the RTO was spurious. Only clear SACKED_RETRANS of the head here to | ||
1875 | * keep retrans_out counting accurate (with SACK F-RTO, other than head | ||
1876 | * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS | ||
1877 | * bits are handled if the Loss state is really to be entered (in | ||
1878 | * tcp_enter_frto_loss). | ||
1879 | * | ||
1880 | * Do like tcp_enter_loss() would; when RTO expires the second time it | ||
1881 | * does: | ||
1882 | * "Reduce ssthresh if it has not yet been made inside this window." | ||
1883 | */ | ||
1884 | void tcp_enter_frto(struct sock *sk) | ||
1885 | { | ||
1886 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1887 | struct tcp_sock *tp = tcp_sk(sk); | ||
1888 | struct sk_buff *skb; | ||
1889 | |||
1890 | if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || | ||
1891 | tp->snd_una == tp->high_seq || | ||
1892 | ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && | ||
1893 | !icsk->icsk_retransmits)) { | ||
1894 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | ||
1895 | /* Our state is too optimistic in ssthresh() call because cwnd | ||
1896 | * is not reduced until tcp_enter_frto_loss() when previous F-RTO | ||
1897 | * recovery has not yet completed. Pattern would be this: RTO, | ||
1898 | * Cumulative ACK, RTO (2xRTO for the same segment does not end | ||
1899 | * up here twice). | ||
1900 | * RFC4138 should be more specific on what to do, even though | ||
1901 | * RTO is quite unlikely to occur after the first Cumulative ACK | ||
1902 | * due to back-off and complexity of triggering events ... | ||
1903 | */ | ||
1904 | if (tp->frto_counter) { | ||
1905 | u32 stored_cwnd; | ||
1906 | stored_cwnd = tp->snd_cwnd; | ||
1907 | tp->snd_cwnd = 2; | ||
1908 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | ||
1909 | tp->snd_cwnd = stored_cwnd; | ||
1910 | } else { | ||
1911 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | ||
1912 | } | ||
1913 | /* ... in theory, cong.control module could do "any tricks" in | ||
1914 | * ssthresh(), which means that ca_state, lost bits and lost_out | ||
1915 | * counter would have to be faked before the call occurs. We | ||
1916 | * consider that too expensive, unlikely and hacky, so modules | ||
1917 | * using these in ssthresh() must deal with these incompatibility | ||
1918 | * issues if they receive CA_EVENT_FRTO and frto_counter != 0 | ||
1919 | */ | ||
1920 | tcp_ca_event(sk, CA_EVENT_FRTO); | ||
1921 | } | ||
1922 | |||
1923 | tp->undo_marker = tp->snd_una; | ||
1924 | tp->undo_retrans = 0; | ||
1925 | |||
1926 | skb = tcp_write_queue_head(sk); | ||
1927 | if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) | ||
1928 | tp->undo_marker = 0; | ||
1929 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { | ||
1930 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | ||
1931 | tp->retrans_out -= tcp_skb_pcount(skb); | ||
1932 | } | ||
1933 | tcp_verify_left_out(tp); | ||
1934 | |||
1935 | /* Too bad if TCP was application limited */ | ||
1936 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); | ||
1937 | |||
1938 | /* Earlier loss recovery underway (see RFC4138; Appendix B). | ||
1939 | * The last condition is necessary at least in tp->frto_counter case. | ||
1940 | */ | ||
1941 | if (tcp_is_sackfrto(tp) && (tp->frto_counter || | ||
1942 | ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && | ||
1943 | after(tp->high_seq, tp->snd_una)) { | ||
1944 | tp->frto_highmark = tp->high_seq; | ||
1945 | } else { | ||
1946 | tp->frto_highmark = tp->snd_nxt; | ||
1947 | } | ||
1948 | tcp_set_ca_state(sk, TCP_CA_Disorder); | ||
1949 | tp->high_seq = tp->snd_nxt; | ||
1950 | tp->frto_counter = 1; | ||
1951 | } | ||
1952 | |||
1953 | /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, | ||
1954 | * which indicates that we should follow the traditional RTO recovery, | ||
1955 | * i.e. mark everything lost and do go-back-N retransmission. | ||
1956 | */ | ||
1957 | static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) | ||
1958 | { | ||
1959 | struct tcp_sock *tp = tcp_sk(sk); | ||
1960 | struct sk_buff *skb; | ||
1961 | |||
1962 | tp->lost_out = 0; | ||
1963 | tp->retrans_out = 0; | ||
1964 | if (tcp_is_reno(tp)) | ||
1965 | tcp_reset_reno_sack(tp); | ||
1966 | |||
1967 | tcp_for_write_queue(skb, sk) { | ||
1968 | if (skb == tcp_send_head(sk)) | ||
1969 | break; | ||
1970 | |||
1971 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; | ||
1972 | /* | ||
1973 | * Count the retransmission made on RTO correctly (only when | ||
1974 | * waiting for the first ACK and did not get it)... | ||
1975 | */ | ||
1976 | if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { | ||
1977 | /* For some reason this R-bit might get cleared? */ | ||
1978 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) | ||
1979 | tp->retrans_out += tcp_skb_pcount(skb); | ||
1980 | /* ...enter this if branch just for the first segment */ | ||
1981 | flag |= FLAG_DATA_ACKED; | ||
1982 | } else { | ||
1983 | if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) | ||
1984 | tp->undo_marker = 0; | ||
1985 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | ||
1986 | } | ||
1987 | |||
1988 | /* Marking forward transmissions that were made after RTO lost | ||
1989 | * can cause unnecessary retransmissions in some scenarios, | ||
1990 | * SACK blocks will mitigate that in some but not in all cases. | ||
1991 | * We used to not mark them but it was causing break-ups with | ||
1992 | * receivers that only do in-order receipt. | ||
1993 | * | ||
1994 | * TODO: we could detect presence of such receiver and select | ||
1995 | * different behavior per flow. | ||
1996 | */ | ||
1997 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { | ||
1998 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
1999 | tp->lost_out += tcp_skb_pcount(skb); | ||
2000 | tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; | ||
2001 | } | ||
2002 | } | ||
2003 | tcp_verify_left_out(tp); | ||
2004 | |||
2005 | tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; | ||
2006 | tp->snd_cwnd_cnt = 0; | ||
2007 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
2008 | tp->frto_counter = 0; | ||
2009 | |||
2010 | tp->reordering = min_t(unsigned int, tp->reordering, | ||
2011 | sysctl_tcp_reordering); | ||
2012 | tcp_set_ca_state(sk, TCP_CA_Loss); | ||
2013 | tp->high_seq = tp->snd_nxt; | ||
2014 | TCP_ECN_queue_cwr(tp); | ||
2015 | |||
2016 | tcp_clear_all_retrans_hints(tp); | ||
2017 | } | ||
2018 | |||
2019 | static void tcp_clear_retrans_partial(struct tcp_sock *tp) | 1815 | static void tcp_clear_retrans_partial(struct tcp_sock *tp) |
2020 | { | 1816 | { |
2021 | tp->retrans_out = 0; | 1817 | tp->retrans_out = 0; |
@@ -2042,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
2042 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1838 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2043 | struct tcp_sock *tp = tcp_sk(sk); | 1839 | struct tcp_sock *tp = tcp_sk(sk); |
2044 | struct sk_buff *skb; | 1840 | struct sk_buff *skb; |
1841 | bool new_recovery = false; | ||
2045 | 1842 | ||
2046 | /* Reduce ssthresh if it has not yet been made inside this window. */ | 1843 | /* Reduce ssthresh if it has not yet been made inside this window. */ |
2047 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1844 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || |
1845 | !after(tp->high_seq, tp->snd_una) || | ||
2048 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { | 1846 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { |
1847 | new_recovery = true; | ||
2049 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | 1848 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
2050 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | 1849 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
2051 | tcp_ca_event(sk, CA_EVENT_LOSS); | 1850 | tcp_ca_event(sk, CA_EVENT_LOSS); |
@@ -2087,8 +1886,14 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
2087 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1886 | tcp_set_ca_state(sk, TCP_CA_Loss); |
2088 | tp->high_seq = tp->snd_nxt; | 1887 | tp->high_seq = tp->snd_nxt; |
2089 | TCP_ECN_queue_cwr(tp); | 1888 | TCP_ECN_queue_cwr(tp); |
2090 | /* Abort F-RTO algorithm if one is in progress */ | 1889 | |
2091 | tp->frto_counter = 0; | 1890 | /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous |
1891 | * loss recovery is underway except recurring timeout(s) on | ||
1892 | * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing | ||
1893 | */ | ||
1894 | tp->frto = sysctl_tcp_frto && | ||
1895 | (new_recovery || icsk->icsk_retransmits) && | ||
1896 | !inet_csk(sk)->icsk_mtup.probe_size; | ||
2092 | } | 1897 | } |
2093 | 1898 | ||
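The whole frto_counter state machine removed above collapses to one flag armed at RTO time; restated from the hunk for emphasis:

    /* Arm F-RTO only when this RTO opens a new recovery episode (or is
     * a recurring timeout on the same SND.UNA), and never while an MTU
     * probe is outstanding:
     */
    tp->frto = sysctl_tcp_frto &&
               (new_recovery || icsk->icsk_retransmits) &&
               !inet_csk(sk)->icsk_mtup.probe_size;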
2094 | /* If ACK arrived pointing to a remembered SACK, it means that our | 1899 | /* If ACK arrived pointing to a remembered SACK, it means that our |
@@ -2147,15 +1952,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) | |||
2147 | * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples | 1952 | * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples |
2148 | * available, or RTO is scheduled to fire first. | 1953 | * available, or RTO is scheduled to fire first. |
2149 | */ | 1954 | */ |
2150 | if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) | 1955 | if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || |
1956 | (flag & FLAG_ECE) || !tp->srtt) | ||
2151 | return false; | 1957 | return false; |
2152 | 1958 | ||
2153 | delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); | 1959 | delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); |
2154 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) | 1960 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) |
2155 | return false; | 1961 | return false; |
2156 | 1962 | ||
2157 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); | 1963 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, |
2158 | tp->early_retrans_delayed = 1; | 1964 | TCP_RTO_MAX); |
2159 | return true; | 1965 | return true; |
2160 | } | 1966 | } |
2161 | 1967 | ||
@@ -2271,10 +2077,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
2271 | struct tcp_sock *tp = tcp_sk(sk); | 2077 | struct tcp_sock *tp = tcp_sk(sk); |
2272 | __u32 packets_out; | 2078 | __u32 packets_out; |
2273 | 2079 | ||
2274 | /* Do not perform any recovery during F-RTO algorithm */ | ||
2275 | if (tp->frto_counter) | ||
2276 | return false; | ||
2277 | |||
2278 | /* Trick#1: The loss is proven. */ | 2080 | /* Trick#1: The loss is proven. */ |
2279 | if (tp->lost_out) | 2081 | if (tp->lost_out) |
2280 | return true; | 2082 | return true; |
@@ -2318,7 +2120,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
2318 | * interval if appropriate. | 2120 | * interval if appropriate. |
2319 | */ | 2121 | */ |
2320 | if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && | 2122 | if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && |
2321 | (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && | 2123 | (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && |
2322 | !tcp_may_send_now(sk)) | 2124 | !tcp_may_send_now(sk)) |
2323 | return !tcp_pause_early_retransmit(sk, flag); | 2125 | return !tcp_pause_early_retransmit(sk, flag); |
2324 | 2126 | ||
@@ -2635,12 +2437,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
2635 | return failed; | 2437 | return failed; |
2636 | } | 2438 | } |
2637 | 2439 | ||
2638 | /* Undo during loss recovery after partial ACK. */ | 2440 | /* Undo during loss recovery after partial ACK or using F-RTO. */ |
2639 | static bool tcp_try_undo_loss(struct sock *sk) | 2441 | static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) |
2640 | { | 2442 | { |
2641 | struct tcp_sock *tp = tcp_sk(sk); | 2443 | struct tcp_sock *tp = tcp_sk(sk); |
2642 | 2444 | ||
2643 | if (tcp_may_undo(tp)) { | 2445 | if (frto_undo || tcp_may_undo(tp)) { |
2644 | struct sk_buff *skb; | 2446 | struct sk_buff *skb; |
2645 | tcp_for_write_queue(skb, sk) { | 2447 | tcp_for_write_queue(skb, sk) { |
2646 | if (skb == tcp_send_head(sk)) | 2448 | if (skb == tcp_send_head(sk)) |
@@ -2654,9 +2456,12 @@ static bool tcp_try_undo_loss(struct sock *sk) | |||
2654 | tp->lost_out = 0; | 2456 | tp->lost_out = 0; |
2655 | tcp_undo_cwr(sk, true); | 2457 | tcp_undo_cwr(sk, true); |
2656 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); | 2458 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); |
2459 | if (frto_undo) | ||
2460 | NET_INC_STATS_BH(sock_net(sk), | ||
2461 | LINUX_MIB_TCPSPURIOUSRTOS); | ||
2657 | inet_csk(sk)->icsk_retransmits = 0; | 2462 | inet_csk(sk)->icsk_retransmits = 0; |
2658 | tp->undo_marker = 0; | 2463 | tp->undo_marker = 0; |
2659 | if (tcp_is_sack(tp)) | 2464 | if (frto_undo || tcp_is_sack(tp)) |
2660 | tcp_set_ca_state(sk, TCP_CA_Open); | 2465 | tcp_set_ca_state(sk, TCP_CA_Open); |
2661 | return true; | 2466 | return true; |
2662 | } | 2467 | } |
@@ -2678,6 +2483,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) | |||
2678 | struct tcp_sock *tp = tcp_sk(sk); | 2483 | struct tcp_sock *tp = tcp_sk(sk); |
2679 | 2484 | ||
2680 | tp->high_seq = tp->snd_nxt; | 2485 | tp->high_seq = tp->snd_nxt; |
2486 | tp->tlp_high_seq = 0; | ||
2681 | tp->snd_cwnd_cnt = 0; | 2487 | tp->snd_cwnd_cnt = 0; |
2682 | tp->prior_cwnd = tp->snd_cwnd; | 2488 | tp->prior_cwnd = tp->snd_cwnd; |
2683 | tp->prr_delivered = 0; | 2489 | tp->prr_delivered = 0; |
@@ -2755,7 +2561,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) | |||
2755 | 2561 | ||
2756 | tcp_verify_left_out(tp); | 2562 | tcp_verify_left_out(tp); |
2757 | 2563 | ||
2758 | if (!tp->frto_counter && !tcp_any_retrans_done(sk)) | 2564 | if (!tcp_any_retrans_done(sk)) |
2759 | tp->retrans_stamp = 0; | 2565 | tp->retrans_stamp = 0; |
2760 | 2566 | ||
2761 | if (flag & FLAG_ECE) | 2567 | if (flag & FLAG_ECE) |
@@ -2872,6 +2678,58 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | |||
2872 | tcp_set_ca_state(sk, TCP_CA_Recovery); | 2678 | tcp_set_ca_state(sk, TCP_CA_Recovery); |
2873 | } | 2679 | } |
2874 | 2680 | ||
2681 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are | ||
2682 | * recovered or spurious. Otherwise retransmits more on partial ACKs. | ||
2683 | */ | ||
2684 | static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) | ||
2685 | { | ||
2686 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2687 | struct tcp_sock *tp = tcp_sk(sk); | ||
2688 | bool recovered = !before(tp->snd_una, tp->high_seq); | ||
2689 | |||
2690 | if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ | ||
2691 | if (flag & FLAG_ORIG_SACK_ACKED) { | ||
2692 | /* Step 3.b. A timeout is spurious if not all data are | ||
2693 | * lost, i.e., never-retransmitted data are (s)acked. | ||
2694 | */ | ||
2695 | tcp_try_undo_loss(sk, true); | ||
2696 | return; | ||
2697 | } | ||
2698 | if (after(tp->snd_nxt, tp->high_seq) && | ||
2699 | (flag & FLAG_DATA_SACKED || is_dupack)) { | ||
2700 | tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ | ||
2701 | } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { | ||
2702 | tp->high_seq = tp->snd_nxt; | ||
2703 | __tcp_push_pending_frames(sk, tcp_current_mss(sk), | ||
2704 | TCP_NAGLE_OFF); | ||
2705 | if (after(tp->snd_nxt, tp->high_seq)) | ||
2706 | return; /* Step 2.b */ | ||
2707 | tp->frto = 0; | ||
2708 | } | ||
2709 | } | ||
2710 | |||
2711 | if (recovered) { | ||
2712 | /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ | ||
2713 | icsk->icsk_retransmits = 0; | ||
2714 | tcp_try_undo_recovery(sk); | ||
2715 | return; | ||
2716 | } | ||
2717 | if (flag & FLAG_DATA_ACKED) | ||
2718 | icsk->icsk_retransmits = 0; | ||
2719 | if (tcp_is_reno(tp)) { | ||
2720 | /* A Reno DUPACK means new data in F-RTO step 2.b above are | ||
2721 | * delivered. Lower inflight to clock out (re)tranmissions. | ||
2722 | */ | ||
2723 | if (after(tp->snd_nxt, tp->high_seq) && is_dupack) | ||
2724 | tcp_add_reno_sack(sk); | ||
2725 | else if (flag & FLAG_SND_UNA_ADVANCED) | ||
2726 | tcp_reset_reno_sack(tp); | ||
2727 | } | ||
2728 | if (tcp_try_undo_loss(sk, false)) | ||
2729 | return; | ||
2730 | tcp_xmit_retransmit_queue(sk); | ||
2731 | } | ||
2732 | |||
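A toy userspace model of how tcp_process_loss() resolves the armed flag — control flow only, all names illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    struct flow {
            bool frto;              /* armed when the RTO fired */
            bool orig_sack_acked;   /* never-retransmitted data got (s)acked */
    };

    /* Mirrors RFC 5682 step 3.b above: if data sent before the timeout
     * is (s)acked without ever having been retransmitted, the RTO was
     * spurious and the cwnd/ssthresh reduction is undone.
     */
    static const char *resolve(const struct flow *f)
    {
            if (f->frto && f->orig_sack_acked)
                    return "spurious RTO: undo (tcp_try_undo_loss(sk, true))";
            return "genuine loss: stay in CA_Loss, retransmit";
    }

    int main(void)
    {
            struct flow spurious = { .frto = true, .orig_sack_acked = true  };
            struct flow genuine  = { .frto = true, .orig_sack_acked = false };

            printf("%s\n", resolve(&spurious));
            printf("%s\n", resolve(&genuine));
            return 0;
    }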
2875 | /* Process an event, which can update packets-in-flight not trivially. | 2733 | /* Process an event, which can update packets-in-flight not trivially. |
2876 | * Main goal of this function is to calculate new estimate for left_out, | 2734 | * Main goal of this function is to calculate new estimate for left_out, |
2877 | * taking into account both packets sitting in receiver's buffer and | 2735 | * taking into account both packets sitting in receiver's buffer and |
@@ -2918,12 +2776,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2918 | tp->retrans_stamp = 0; | 2776 | tp->retrans_stamp = 0; |
2919 | } else if (!before(tp->snd_una, tp->high_seq)) { | 2777 | } else if (!before(tp->snd_una, tp->high_seq)) { |
2920 | switch (icsk->icsk_ca_state) { | 2778 | switch (icsk->icsk_ca_state) { |
2921 | case TCP_CA_Loss: | ||
2922 | icsk->icsk_retransmits = 0; | ||
2923 | if (tcp_try_undo_recovery(sk)) | ||
2924 | return; | ||
2925 | break; | ||
2926 | |||
2927 | case TCP_CA_CWR: | 2779 | case TCP_CA_CWR: |
2928 | /* CWR is to be held something *above* high_seq | 2780 | /* CWR is to be held something *above* high_seq |
2929 | * is ACKed for CWR bit to reach receiver. */ | 2781 | * is ACKed for CWR bit to reach receiver. */ |
@@ -2954,18 +2806,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2954 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | 2806 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; |
2955 | break; | 2807 | break; |
2956 | case TCP_CA_Loss: | 2808 | case TCP_CA_Loss: |
2957 | if (flag & FLAG_DATA_ACKED) | 2809 | tcp_process_loss(sk, flag, is_dupack); |
2958 | icsk->icsk_retransmits = 0; | ||
2959 | if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) | ||
2960 | tcp_reset_reno_sack(tp); | ||
2961 | if (!tcp_try_undo_loss(sk)) { | ||
2962 | tcp_moderate_cwnd(tp); | ||
2963 | tcp_xmit_retransmit_queue(sk); | ||
2964 | return; | ||
2965 | } | ||
2966 | if (icsk->icsk_ca_state != TCP_CA_Open) | 2810 | if (icsk->icsk_ca_state != TCP_CA_Open) |
2967 | return; | 2811 | return; |
2968 | /* Loss is undone; fall through to processing in Open state. */ | 2812 | /* Fall through to processing in Open state. */ |
2969 | default: | 2813 | default: |
2970 | if (tcp_is_reno(tp)) { | 2814 | if (tcp_is_reno(tp)) { |
2971 | if (flag & FLAG_SND_UNA_ADVANCED) | 2815 | if (flag & FLAG_SND_UNA_ADVANCED) |
@@ -3078,6 +2922,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | |||
3078 | */ | 2922 | */ |
3079 | void tcp_rearm_rto(struct sock *sk) | 2923 | void tcp_rearm_rto(struct sock *sk) |
3080 | { | 2924 | { |
2925 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
3081 | struct tcp_sock *tp = tcp_sk(sk); | 2926 | struct tcp_sock *tp = tcp_sk(sk); |
3082 | 2927 | ||
3083 | /* If the retrans timer is currently being used by Fast Open | 2928 | /* If the retrans timer is currently being used by Fast Open |
@@ -3091,12 +2936,13 @@ void tcp_rearm_rto(struct sock *sk) | |||
3091 | } else { | 2936 | } else { |
3092 | u32 rto = inet_csk(sk)->icsk_rto; | 2937 | u32 rto = inet_csk(sk)->icsk_rto; |
3093 | /* Offset the time elapsed after installing regular RTO */ | 2938 | /* Offset the time elapsed after installing regular RTO */ |
3094 | if (tp->early_retrans_delayed) { | 2939 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || |
2940 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | ||
3095 | struct sk_buff *skb = tcp_write_queue_head(sk); | 2941 | struct sk_buff *skb = tcp_write_queue_head(sk); |
3096 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; | 2942 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; |
3097 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); | 2943 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); |
3098 | /* delta may not be positive if the socket is locked | 2944 | /* delta may not be positive if the socket is locked |
3099 | * when the delayed ER timer fires and is rescheduled. | 2945 | * when the retrans timer fires and is rescheduled. |
3100 | */ | 2946 | */ |
3101 | if (delta > 0) | 2947 | if (delta > 0) |
3102 | rto = delta; | 2948 | rto = delta; |
@@ -3104,7 +2950,6 @@ void tcp_rearm_rto(struct sock *sk) | |||
3104 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, | 2950 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
3105 | TCP_RTO_MAX); | 2951 | TCP_RTO_MAX); |
3106 | } | 2952 | } |
3107 | tp->early_retrans_delayed = 0; | ||
3108 | } | 2953 | } |
3109 | 2954 | ||
3110 | /* This function is called when the delayed ER timer fires. TCP enters | 2955 | /* This function is called when the delayed ER timer fires. TCP enters |
@@ -3192,8 +3037,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3192 | flag |= FLAG_RETRANS_DATA_ACKED; | 3037 | flag |= FLAG_RETRANS_DATA_ACKED; |
3193 | ca_seq_rtt = -1; | 3038 | ca_seq_rtt = -1; |
3194 | seq_rtt = -1; | 3039 | seq_rtt = -1; |
3195 | if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) | ||
3196 | flag |= FLAG_NONHEAD_RETRANS_ACKED; | ||
3197 | } else { | 3040 | } else { |
3198 | ca_seq_rtt = now - scb->when; | 3041 | ca_seq_rtt = now - scb->when; |
3199 | last_ackt = skb->tstamp; | 3042 | last_ackt = skb->tstamp; |
@@ -3202,6 +3045,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3202 | } | 3045 | } |
3203 | if (!(sacked & TCPCB_SACKED_ACKED)) | 3046 | if (!(sacked & TCPCB_SACKED_ACKED)) |
3204 | reord = min(pkts_acked, reord); | 3047 | reord = min(pkts_acked, reord); |
3048 | if (!after(scb->end_seq, tp->high_seq)) | ||
3049 | flag |= FLAG_ORIG_SACK_ACKED; | ||
3205 | } | 3050 | } |
3206 | 3051 | ||
3207 | if (sacked & TCPCB_SACKED_ACKED) | 3052 | if (sacked & TCPCB_SACKED_ACKED) |
@@ -3402,150 +3247,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 | |||
3402 | return flag; | 3247 | return flag; |
3403 | } | 3248 | } |
3404 | 3249 | ||
3405 | /* A very conservative spurious RTO response algorithm: reduce cwnd and | ||
3406 | * continue in congestion avoidance. | ||
3407 | */ | ||
3408 | static void tcp_conservative_spur_to_response(struct tcp_sock *tp) | ||
3409 | { | ||
3410 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
3411 | tp->snd_cwnd_cnt = 0; | ||
3412 | TCP_ECN_queue_cwr(tp); | ||
3413 | tcp_moderate_cwnd(tp); | ||
3414 | } | ||
3415 | |||
3416 | /* A conservative spurious RTO response algorithm: reduce cwnd using | ||
3417 | * PRR and continue in congestion avoidance. | ||
3418 | */ | ||
3419 | static void tcp_cwr_spur_to_response(struct sock *sk) | ||
3420 | { | ||
3421 | tcp_enter_cwr(sk, 0); | ||
3422 | } | ||
3423 | |||
3424 | static void tcp_undo_spur_to_response(struct sock *sk, int flag) | ||
3425 | { | ||
3426 | if (flag & FLAG_ECE) | ||
3427 | tcp_cwr_spur_to_response(sk); | ||
3428 | else | ||
3429 | tcp_undo_cwr(sk, true); | ||
3430 | } | ||
3431 | |||
3432 | /* F-RTO spurious RTO detection algorithm (RFC4138) | ||
3433 | * | ||
3434 | * F-RTO operates during the two new ACKs following RTO (well, almost, see inline | ||
3435 | * comments). State (ACK number) is kept in frto_counter. When ACK advances | ||
3436 | * window (but not to or beyond highest sequence sent before RTO): | ||
3437 | * On First ACK, send two new segments out. | ||
3438 | * On Second ACK, RTO was likely spurious. Do spurious response (response | ||
3439 | * algorithm is not part of the F-RTO detection algorithm | ||
3440 | * given in RFC4138 but can be selected separately). | ||
3441 | * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss | ||
3442 | * and TCP falls back to conventional RTO recovery. F-RTO allows overriding | ||
3443 | * of Nagle, this is done using frto_counter states 2 and 3, when a new data | ||
3444 | * segment of any size sent during F-RTO, state 2 is upgraded to 3. | ||
3445 | * | ||
3446 | * Rationale: if the RTO was spurious, new ACKs should arrive from the | ||
3447 | * original window even after we transmit two new data segments. | ||
3448 | * | ||
3449 | * SACK version: | ||
3450 | * on first step, wait until first cumulative ACK arrives, then move to | ||
3451 | * the second step. In second step, the next ACK decides. | ||
3452 | * | ||
3453 | * F-RTO is implemented (mainly) in four functions: | ||
3454 | * - tcp_use_frto() is used to determine if TCP can use F-RTO | ||
3455 | * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is | ||
3456 | * called when tcp_use_frto() showed green light | ||
3457 | * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm | ||
3458 | * - tcp_enter_frto_loss() is called if there is not enough evidence | ||
3459 | * to prove that the RTO is indeed spurious. It transfers the control | ||
3460 | * from F-RTO to the conventional RTO recovery | ||
3461 | */ | ||
3462 | static bool tcp_process_frto(struct sock *sk, int flag) | ||
3463 | { | ||
3464 | struct tcp_sock *tp = tcp_sk(sk); | ||
3465 | |||
3466 | tcp_verify_left_out(tp); | ||
3467 | |||
3468 | /* Duplicate the behavior from Loss state (fastretrans_alert) */ | ||
3469 | if (flag & FLAG_DATA_ACKED) | ||
3470 | inet_csk(sk)->icsk_retransmits = 0; | ||
3471 | |||
3472 | if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || | ||
3473 | ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) | ||
3474 | tp->undo_marker = 0; | ||
3475 | |||
3476 | if (!before(tp->snd_una, tp->frto_highmark)) { | ||
3477 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); | ||
3478 | return true; | ||
3479 | } | ||
3480 | |||
3481 | if (!tcp_is_sackfrto(tp)) { | ||
3482 | /* RFC4138 shortcoming in step 2; should also have case c): | ||
3483 | * ACK isn't duplicate nor advances window, e.g., opposite dir | ||
3484 | * data, winupdate | ||
3485 | */ | ||
3486 | if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) | ||
3487 | return true; | ||
3488 | |||
3489 | if (!(flag & FLAG_DATA_ACKED)) { | ||
3490 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), | ||
3491 | flag); | ||
3492 | return true; | ||
3493 | } | ||
3494 | } else { | ||
3495 | if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { | ||
3496 | if (!tcp_packets_in_flight(tp)) { | ||
3497 | tcp_enter_frto_loss(sk, 2, flag); | ||
3498 | return true; | ||
3499 | } | ||
3500 | |||
3501 | /* Prevent sending of new data. */ | ||
3502 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
3503 | tcp_packets_in_flight(tp)); | ||
3504 | return true; | ||
3505 | } | ||
3506 | |||
3507 | if ((tp->frto_counter >= 2) && | ||
3508 | (!(flag & FLAG_FORWARD_PROGRESS) || | ||
3509 | ((flag & FLAG_DATA_SACKED) && | ||
3510 | !(flag & FLAG_ONLY_ORIG_SACKED)))) { | ||
3511 | /* RFC4138 shortcoming (see comment above) */ | ||
3512 | if (!(flag & FLAG_FORWARD_PROGRESS) && | ||
3513 | (flag & FLAG_NOT_DUP)) | ||
3514 | return true; | ||
3515 | |||
3516 | tcp_enter_frto_loss(sk, 3, flag); | ||
3517 | return true; | ||
3518 | } | ||
3519 | } | ||
3520 | |||
3521 | if (tp->frto_counter == 1) { | ||
3522 | /* tcp_may_send_now needs to see updated state */ | ||
3523 | tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; | ||
3524 | tp->frto_counter = 2; | ||
3525 | |||
3526 | if (!tcp_may_send_now(sk)) | ||
3527 | tcp_enter_frto_loss(sk, 2, flag); | ||
3528 | |||
3529 | return true; | ||
3530 | } else { | ||
3531 | switch (sysctl_tcp_frto_response) { | ||
3532 | case 2: | ||
3533 | tcp_undo_spur_to_response(sk, flag); | ||
3534 | break; | ||
3535 | case 1: | ||
3536 | tcp_conservative_spur_to_response(tp); | ||
3537 | break; | ||
3538 | default: | ||
3539 | tcp_cwr_spur_to_response(sk); | ||
3540 | break; | ||
3541 | } | ||
3542 | tp->frto_counter = 0; | ||
3543 | tp->undo_marker = 0; | ||
3544 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); | ||
3545 | } | ||
3546 | return false; | ||
3547 | } | ||
3548 | |||
3549 | /* RFC 5961 7 [ACK Throttling] */ | 3250 | /* RFC 5961 7 [ACK Throttling] */ |
3550 | static void tcp_send_challenge_ack(struct sock *sk) | 3251 | static void tcp_send_challenge_ack(struct sock *sk) |
3551 | { | 3252 | { |
@@ -3564,6 +3265,38 @@ static void tcp_send_challenge_ack(struct sock *sk) | |||
3564 | } | 3265 | } |
3565 | } | 3266 | } |
3566 | 3267 | ||
3268 | /* This routine deals with acks during a TLP episode. | ||
3269 | * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. | ||
3270 | */ | ||
3271 | static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) | ||
3272 | { | ||
3273 | struct tcp_sock *tp = tcp_sk(sk); | ||
3274 | bool is_tlp_dupack = (ack == tp->tlp_high_seq) && | ||
3275 | !(flag & (FLAG_SND_UNA_ADVANCED | | ||
3276 | FLAG_NOT_DUP | FLAG_DATA_SACKED)); | ||
3277 | |||
3278 | /* Mark the end of the TLP episode on receiving a TLP dupack or when | ||
3279 | * the ack is after tlp_high_seq. | ||
3280 | */ | ||
3281 | if (is_tlp_dupack) { | ||
3282 | tp->tlp_high_seq = 0; | ||
3283 | return; | ||
3284 | } | ||
3285 | |||
3286 | if (after(ack, tp->tlp_high_seq)) { | ||
3287 | tp->tlp_high_seq = 0; | ||
3288 | /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ | ||
3289 | if (!(flag & FLAG_DSACKING_ACK)) { | ||
3290 | tcp_init_cwnd_reduction(sk, true); | ||
3291 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
3292 | tcp_end_cwnd_reduction(sk); | ||
3293 | tcp_set_ca_state(sk, TCP_CA_Open); | ||
3294 | NET_INC_STATS_BH(sock_net(sk), | ||
3295 | LINUX_MIB_TCPLOSSPROBERECOVERY); | ||
3296 | } | ||
3297 | } | ||
3298 | } | ||
3299 | |||
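For reference, the probe that opens such an episode is scheduled by tcp_schedule_loss_probe() elsewhere in this series; the probe timeout roughly follows the draft's PTO formula. A simplified sketch (clamping against the pending RTO omitted; details may differ from the final code):

    u32 rtt = tp->srtt >> 3;                /* srtt is stored <<3 */
    u32 timeout = 2 * rtt;                  /* PTO = 2*SRTT ... */

    if (tp->packets_out == 1)               /* ...padded for a delayed ACK */
            timeout = max_t(u32, timeout,
                            rtt + (rtt >> 1) + TCP_DELACK_MAX);
    timeout = max_t(u32, timeout, msecs_to_jiffies(10));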
3567 | /* This routine deals with incoming acks, but not outgoing ones. */ | 3300 | /* This routine deals with incoming acks, but not outgoing ones. */ |
3568 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | 3301 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) |
3569 | { | 3302 | { |
@@ -3578,7 +3311,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3578 | int prior_packets; | 3311 | int prior_packets; |
3579 | int prior_sacked = tp->sacked_out; | 3312 | int prior_sacked = tp->sacked_out; |
3580 | int pkts_acked = 0; | 3313 | int pkts_acked = 0; |
3581 | bool frto_cwnd = false; | ||
3582 | 3314 | ||
3583 | /* If the ack is older than previous acks | 3315 | /* If the ack is older than previous acks |
3584 | * then we can probably ignore it. | 3316 | * then we can probably ignore it. |
@@ -3598,7 +3330,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3598 | if (after(ack, tp->snd_nxt)) | 3330 | if (after(ack, tp->snd_nxt)) |
3599 | goto invalid_ack; | 3331 | goto invalid_ack; |
3600 | 3332 | ||
3601 | if (tp->early_retrans_delayed) | 3333 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || |
3334 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) | ||
3602 | tcp_rearm_rto(sk); | 3335 | tcp_rearm_rto(sk); |
3603 | 3336 | ||
3604 | if (after(ack, prior_snd_una)) | 3337 | if (after(ack, prior_snd_una)) |
@@ -3651,30 +3384,29 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3651 | 3384 | ||
3652 | pkts_acked = prior_packets - tp->packets_out; | 3385 | pkts_acked = prior_packets - tp->packets_out; |
3653 | 3386 | ||
3654 | if (tp->frto_counter) | ||
3655 | frto_cwnd = tcp_process_frto(sk, flag); | ||
3656 | /* Guarantee sacktag reordering detection against wrap-arounds */ | ||
3657 | if (before(tp->frto_highmark, tp->snd_una)) | ||
3658 | tp->frto_highmark = 0; | ||
3659 | |||
3660 | if (tcp_ack_is_dubious(sk, flag)) { | 3387 | if (tcp_ack_is_dubious(sk, flag)) { |
3661 | /* Advance CWND, if state allows this. */ | 3388 | /* Advance CWND, if state allows this. */ |
3662 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && | 3389 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) |
3663 | tcp_may_raise_cwnd(sk, flag)) | ||
3664 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3390 | tcp_cong_avoid(sk, ack, prior_in_flight); |
3665 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | 3391 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
3666 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, | 3392 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, |
3667 | is_dupack, flag); | 3393 | is_dupack, flag); |
3668 | } else { | 3394 | } else { |
3669 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) | 3395 | if (flag & FLAG_DATA_ACKED) |
3670 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3396 | tcp_cong_avoid(sk, ack, prior_in_flight); |
3671 | } | 3397 | } |
3672 | 3398 | ||
3399 | if (tp->tlp_high_seq) | ||
3400 | tcp_process_tlp_ack(sk, ack, flag); | ||
3401 | |||
3673 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { | 3402 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { |
3674 | struct dst_entry *dst = __sk_dst_get(sk); | 3403 | struct dst_entry *dst = __sk_dst_get(sk); |
3675 | if (dst) | 3404 | if (dst) |
3676 | dst_confirm(dst); | 3405 | dst_confirm(dst); |
3677 | } | 3406 | } |
3407 | |||
3408 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) | ||
3409 | tcp_schedule_loss_probe(sk); | ||
3678 | return 1; | 3410 | return 1; |
3679 | 3411 | ||
3680 | no_queue: | 3412 | no_queue: |
@@ -3688,6 +3420,9 @@ no_queue: | |||
3688 | */ | 3420 | */ |
3689 | if (tcp_send_head(sk)) | 3421 | if (tcp_send_head(sk)) |
3690 | tcp_ack_probe(sk); | 3422 | tcp_ack_probe(sk); |
3423 | |||
3424 | if (tp->tlp_high_seq) | ||
3425 | tcp_process_tlp_ack(sk, ack, flag); | ||
3691 | return 1; | 3426 | return 1; |
3692 | 3427 | ||
3693 | invalid_ack: | 3428 | invalid_ack: |
@@ -3712,8 +3447,8 @@ old_ack: | |||
3712 | * But, this can also be called on packets in the established flow when | 3447 | * But, this can also be called on packets in the established flow when |
3713 | * the fast version below fails. | 3448 | * the fast version below fails. |
3714 | */ | 3449 | */ |
3715 | void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, | 3450 | void tcp_parse_options(const struct sk_buff *skb, |
3716 | const u8 **hvpp, int estab, | 3451 | struct tcp_options_received *opt_rx, int estab, |
3717 | struct tcp_fastopen_cookie *foc) | 3452 | struct tcp_fastopen_cookie *foc) |
3718 | { | 3453 | { |
3719 | const unsigned char *ptr; | 3454 | const unsigned char *ptr; |
@@ -3797,31 +3532,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
3797 | */ | 3532 | */ |
3798 | break; | 3533 | break; |
3799 | #endif | 3534 | #endif |
3800 | case TCPOPT_COOKIE: | ||
3801 | /* This option is variable length. | ||
3802 | */ | ||
3803 | switch (opsize) { | ||
3804 | case TCPOLEN_COOKIE_BASE: | ||
3805 | /* not yet implemented */ | ||
3806 | break; | ||
3807 | case TCPOLEN_COOKIE_PAIR: | ||
3808 | /* not yet implemented */ | ||
3809 | break; | ||
3810 | case TCPOLEN_COOKIE_MIN+0: | ||
3811 | case TCPOLEN_COOKIE_MIN+2: | ||
3812 | case TCPOLEN_COOKIE_MIN+4: | ||
3813 | case TCPOLEN_COOKIE_MIN+6: | ||
3814 | case TCPOLEN_COOKIE_MAX: | ||
3815 | /* 16-bit multiple */ | ||
3816 | opt_rx->cookie_plus = opsize; | ||
3817 | *hvpp = ptr; | ||
3818 | break; | ||
3819 | default: | ||
3820 | /* ignore option */ | ||
3821 | break; | ||
3822 | } | ||
3823 | break; | ||
3824 | |||
3825 | case TCPOPT_EXP: | 3535 | case TCPOPT_EXP: |
3826 | /* Fast Open option shares code 254 using a | 3536 | /* Fast Open option shares code 254 using a |
3827 | * 16 bits magic number. It's valid only in | 3537 | * 16 bits magic number. It's valid only in |
@@ -3867,8 +3577,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr | |||
3867 | * If it is wrong it falls back on tcp_parse_options(). | 3577 | * If it is wrong it falls back on tcp_parse_options(). |
3868 | */ | 3578 | */ |
3869 | static bool tcp_fast_parse_options(const struct sk_buff *skb, | 3579 | static bool tcp_fast_parse_options(const struct sk_buff *skb, |
3870 | const struct tcphdr *th, | 3580 | const struct tcphdr *th, struct tcp_sock *tp) |
3871 | struct tcp_sock *tp, const u8 **hvpp) | ||
3872 | { | 3581 | { |
3873 | /* In the spirit of fast parsing, compare doff directly to constant | 3582 | /* In the spirit of fast parsing, compare doff directly to constant |
3874 | * values. Because equality is used, short doff can be ignored here. | 3583 | * values. Because equality is used, short doff can be ignored here. |
@@ -3882,7 +3591,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, | |||
3882 | return true; | 3591 | return true; |
3883 | } | 3592 | } |
3884 | 3593 | ||
3885 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); | 3594 | tcp_parse_options(skb, &tp->rx_opt, 1, NULL); |
3886 | if (tp->rx_opt.saw_tstamp) | 3595 | if (tp->rx_opt.saw_tstamp) |
3887 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; | 3596 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; |
3888 | 3597 | ||
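The fast path works purely off th->doff: 5 words means a bare header with no options, 8 words is tried as the common NOP/NOP/TIMESTAMP layout, and anything else falls back to the full parser. A standalone sketch of that classification (word sizes follow from the 20-byte TCP header and the 12-byte aligned timestamp option):

    #include <stdio.h>

    #define TCP_WORDS        5   /* bare TCP header: 20 bytes / 4 */
    #define TS_ALIGNED_WORDS 3   /* NOP NOP TIMESTAMP: 12 bytes / 4 */

    /* 0: no options; 1: try the aligned-timestamp fast path;
     * 2: fall back to the slow, full option parse. */
    static int fast_parse_class(unsigned int doff)
    {
        if (doff == TCP_WORDS)
            return 0;
        if (doff == TCP_WORDS + TS_ALIGNED_WORDS)
            return 1;
        return 2;
    }

    int main(void)
    {
        printf("%d %d %d\n", fast_parse_class(5), fast_parse_class(8),
               fast_parse_class(10));   /* 0 1 2 */
        return 0;
    }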
@@ -5263,12 +4972,10 @@ out: | |||
5263 | static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | 4972 | static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, |
5264 | const struct tcphdr *th, int syn_inerr) | 4973 | const struct tcphdr *th, int syn_inerr) |
5265 | { | 4974 | { |
5266 | const u8 *hash_location; | ||
5267 | struct tcp_sock *tp = tcp_sk(sk); | 4975 | struct tcp_sock *tp = tcp_sk(sk); |
5268 | 4976 | ||
5269 | /* RFC1323: H1. Apply PAWS check first. */ | 4977 | /* RFC1323: H1. Apply PAWS check first. */ |
5270 | if (tcp_fast_parse_options(skb, th, tp, &hash_location) && | 4978 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && |
5271 | tp->rx_opt.saw_tstamp && | ||
5272 | tcp_paws_discard(sk, skb)) { | 4979 | tcp_paws_discard(sk, skb)) { |
5273 | if (!th->rst) { | 4980 | if (!th->rst) { |
5274 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | 4981 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); |
@@ -5622,12 +5329,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | |||
5622 | 5329 | ||
5623 | if (mss == tp->rx_opt.user_mss) { | 5330 | if (mss == tp->rx_opt.user_mss) { |
5624 | struct tcp_options_received opt; | 5331 | struct tcp_options_received opt; |
5625 | const u8 *hash_location; | ||
5626 | 5332 | ||
5627 | /* Get original SYNACK MSS value if user MSS sets mss_clamp */ | 5333 | /* Get original SYNACK MSS value if user MSS sets mss_clamp */ |
5628 | tcp_clear_options(&opt); | 5334 | tcp_clear_options(&opt); |
5629 | opt.user_mss = opt.mss_clamp = 0; | 5335 | opt.user_mss = opt.mss_clamp = 0; |
5630 | tcp_parse_options(synack, &opt, &hash_location, 0, NULL); | 5336 | tcp_parse_options(synack, &opt, 0, NULL); |
5631 | mss = opt.mss_clamp; | 5337 | mss = opt.mss_clamp; |
5632 | } | 5338 | } |
5633 | 5339 | ||
@@ -5658,14 +5364,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | |||
5658 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | 5364 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
5659 | const struct tcphdr *th, unsigned int len) | 5365 | const struct tcphdr *th, unsigned int len) |
5660 | { | 5366 | { |
5661 | const u8 *hash_location; | ||
5662 | struct inet_connection_sock *icsk = inet_csk(sk); | 5367 | struct inet_connection_sock *icsk = inet_csk(sk); |
5663 | struct tcp_sock *tp = tcp_sk(sk); | 5368 | struct tcp_sock *tp = tcp_sk(sk); |
5664 | struct tcp_cookie_values *cvp = tp->cookie_values; | ||
5665 | struct tcp_fastopen_cookie foc = { .len = -1 }; | 5369 | struct tcp_fastopen_cookie foc = { .len = -1 }; |
5666 | int saved_clamp = tp->rx_opt.mss_clamp; | 5370 | int saved_clamp = tp->rx_opt.mss_clamp; |
5667 | 5371 | ||
5668 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); | 5372 | tcp_parse_options(skb, &tp->rx_opt, 0, &foc); |
5669 | if (tp->rx_opt.saw_tstamp) | 5373 | if (tp->rx_opt.saw_tstamp) |
5670 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; | 5374 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; |
5671 | 5375 | ||
@@ -5762,30 +5466,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5762 | * is initialized. */ | 5466 | * is initialized. */ |
5763 | tp->copied_seq = tp->rcv_nxt; | 5467 | tp->copied_seq = tp->rcv_nxt; |
5764 | 5468 | ||
5765 | if (cvp != NULL && | ||
5766 | cvp->cookie_pair_size > 0 && | ||
5767 | tp->rx_opt.cookie_plus > 0) { | ||
5768 | int cookie_size = tp->rx_opt.cookie_plus | ||
5769 | - TCPOLEN_COOKIE_BASE; | ||
5770 | int cookie_pair_size = cookie_size | ||
5771 | + cvp->cookie_desired; | ||
5772 | |||
5773 | /* A cookie extension option was sent and returned. | ||
5774 | * Note that each incoming SYNACK replaces the | ||
5775 | * Responder cookie. The initial exchange is most | ||
5776 | * fragile, as protection against spoofing relies | ||
5777 | * entirely upon the sequence and timestamp (above). | ||
5778 | * This replacement strategy allows the correct pair to | ||
5779 | * pass through, while any others will be filtered via | ||
5780 | * Responder verification later. | ||
5781 | */ | ||
5782 | if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { | ||
5783 | memcpy(&cvp->cookie_pair[cvp->cookie_desired], | ||
5784 | hash_location, cookie_size); | ||
5785 | cvp->cookie_pair_size = cookie_pair_size; | ||
5786 | } | ||
5787 | } | ||
5788 | |||
5789 | smp_mb(); | 5469 | smp_mb(); |
5790 | 5470 | ||
5791 | tcp_finish_connect(sk, skb); | 5471 | tcp_finish_connect(sk, skb); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d09203c63264..2278669b1d85 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -838,7 +838,6 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
838 | */ | 838 | */ |
839 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | 839 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, |
840 | struct request_sock *req, | 840 | struct request_sock *req, |
841 | struct request_values *rvp, | ||
842 | u16 queue_mapping, | 841 | u16 queue_mapping, |
843 | bool nocache) | 842 | bool nocache) |
844 | { | 843 | { |
@@ -851,7 +850,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
851 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) | 850 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) |
852 | return -1; | 851 | return -1; |
853 | 852 | ||
854 | skb = tcp_make_synack(sk, dst, req, rvp, NULL); | 853 | skb = tcp_make_synack(sk, dst, req, NULL); |
855 | 854 | ||
856 | if (skb) { | 855 | if (skb) { |
857 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); | 856 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); |
@@ -868,10 +867,9 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
868 | return err; | 867 | return err; |
869 | } | 868 | } |
870 | 869 | ||
871 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, | 870 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) |
872 | struct request_values *rvp) | ||
873 | { | 871 | { |
874 | int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); | 872 | int res = tcp_v4_send_synack(sk, NULL, req, 0, false); |
875 | 873 | ||
876 | if (!res) | 874 | if (!res) |
877 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); | 875 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
@@ -1371,8 +1369,7 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, | |||
1371 | static int tcp_v4_conn_req_fastopen(struct sock *sk, | 1369 | static int tcp_v4_conn_req_fastopen(struct sock *sk, |
1372 | struct sk_buff *skb, | 1370 | struct sk_buff *skb, |
1373 | struct sk_buff *skb_synack, | 1371 | struct sk_buff *skb_synack, |
1374 | struct request_sock *req, | 1372 | struct request_sock *req) |
1375 | struct request_values *rvp) | ||
1376 | { | 1373 | { |
1377 | struct tcp_sock *tp = tcp_sk(sk); | 1374 | struct tcp_sock *tp = tcp_sk(sk); |
1378 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; | 1375 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; |
@@ -1467,9 +1464,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, | |||
1467 | 1464 | ||
1468 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | 1465 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) |
1469 | { | 1466 | { |
1470 | struct tcp_extend_values tmp_ext; | ||
1471 | struct tcp_options_received tmp_opt; | 1467 | struct tcp_options_received tmp_opt; |
1472 | const u8 *hash_location; | ||
1473 | struct request_sock *req; | 1468 | struct request_sock *req; |
1474 | struct inet_request_sock *ireq; | 1469 | struct inet_request_sock *ireq; |
1475 | struct tcp_sock *tp = tcp_sk(sk); | 1470 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -1519,42 +1514,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1519 | tcp_clear_options(&tmp_opt); | 1514 | tcp_clear_options(&tmp_opt); |
1520 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; | 1515 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; |
1521 | tmp_opt.user_mss = tp->rx_opt.user_mss; | 1516 | tmp_opt.user_mss = tp->rx_opt.user_mss; |
1522 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, | 1517 | tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); |
1523 | want_cookie ? NULL : &foc); | ||
1524 | |||
1525 | if (tmp_opt.cookie_plus > 0 && | ||
1526 | tmp_opt.saw_tstamp && | ||
1527 | !tp->rx_opt.cookie_out_never && | ||
1528 | (sysctl_tcp_cookie_size > 0 || | ||
1529 | (tp->cookie_values != NULL && | ||
1530 | tp->cookie_values->cookie_desired > 0))) { | ||
1531 | u8 *c; | ||
1532 | u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; | ||
1533 | int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; | ||
1534 | |||
1535 | if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) | ||
1536 | goto drop_and_release; | ||
1537 | |||
1538 | /* Secret recipe starts with IP addresses */ | ||
1539 | *mess++ ^= (__force u32)daddr; | ||
1540 | *mess++ ^= (__force u32)saddr; | ||
1541 | |||
1542 | /* plus variable length Initiator Cookie */ | ||
1543 | c = (u8 *)mess; | ||
1544 | while (l-- > 0) | ||
1545 | *c++ ^= *hash_location++; | ||
1546 | |||
1547 | want_cookie = false; /* not our kind of cookie */ | ||
1548 | tmp_ext.cookie_out_never = 0; /* false */ | ||
1549 | tmp_ext.cookie_plus = tmp_opt.cookie_plus; | ||
1550 | } else if (!tp->rx_opt.cookie_in_always) { | ||
1551 | /* redundant indications, but ensure initialization. */ | ||
1552 | tmp_ext.cookie_out_never = 1; /* true */ | ||
1553 | tmp_ext.cookie_plus = 0; | ||
1554 | } else { | ||
1555 | goto drop_and_release; | ||
1556 | } | ||
1557 | tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; | ||
1558 | 1518 | ||
1559 | if (want_cookie && !tmp_opt.saw_tstamp) | 1519 | if (want_cookie && !tmp_opt.saw_tstamp) |
1560 | tcp_clear_options(&tmp_opt); | 1520 | tcp_clear_options(&tmp_opt); |
@@ -1636,7 +1596,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1636 | * of tcp_v4_send_synack()->tcp_select_initial_window(). | 1596 | * of tcp_v4_send_synack()->tcp_select_initial_window(). |
1637 | */ | 1597 | */ |
1638 | skb_synack = tcp_make_synack(sk, dst, req, | 1598 | skb_synack = tcp_make_synack(sk, dst, req, |
1639 | (struct request_values *)&tmp_ext, | ||
1640 | fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); | 1599 | fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); |
1641 | 1600 | ||
1642 | if (skb_synack) { | 1601 | if (skb_synack) { |
@@ -1660,8 +1619,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1660 | if (fastopen_cookie_present(&foc) && foc.len != 0) | 1619 | if (fastopen_cookie_present(&foc) && foc.len != 0) |
1661 | NET_INC_STATS_BH(sock_net(sk), | 1620 | NET_INC_STATS_BH(sock_net(sk), |
1662 | LINUX_MIB_TCPFASTOPENPASSIVEFAIL); | 1621 | LINUX_MIB_TCPFASTOPENPASSIVEFAIL); |
1663 | } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req, | 1622 | } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) |
1664 | (struct request_values *)&tmp_ext)) | ||
1665 | goto drop_and_free; | 1623 | goto drop_and_free; |
1666 | 1624 | ||
1667 | return 0; | 1625 | return 0; |
@@ -1950,6 +1908,50 @@ void tcp_v4_early_demux(struct sk_buff *skb) | |||
1950 | } | 1908 | } |
1951 | } | 1909 | } |
1952 | 1910 | ||
1911 | /* Packet is added to VJ-style prequeue for processing in process | ||
1912 | * context, if a reader task is waiting. Apparently, this exciting | ||
1913 | * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) | ||
1914 | * failed somewhere. Latency? Burstiness? Well, at least now we will | ||
1915 | * see why it failed. 8)8) --ANK | ||
1916 | * | ||
1917 | */ | ||
1918 | bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) | ||
1919 | { | ||
1920 | struct tcp_sock *tp = tcp_sk(sk); | ||
1921 | |||
1922 | if (sysctl_tcp_low_latency || !tp->ucopy.task) | ||
1923 | return false; | ||
1924 | |||
1925 | if (skb->len <= tcp_hdrlen(skb) && | ||
1926 | skb_queue_len(&tp->ucopy.prequeue) == 0) | ||
1927 | return false; | ||
1928 | |||
1929 | __skb_queue_tail(&tp->ucopy.prequeue, skb); | ||
1930 | tp->ucopy.memory += skb->truesize; | ||
1931 | if (tp->ucopy.memory > sk->sk_rcvbuf) { | ||
1932 | struct sk_buff *skb1; | ||
1933 | |||
1934 | BUG_ON(sock_owned_by_user(sk)); | ||
1935 | |||
1936 | while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { | ||
1937 | sk_backlog_rcv(sk, skb1); | ||
1938 | NET_INC_STATS_BH(sock_net(sk), | ||
1939 | LINUX_MIB_TCPPREQUEUEDROPPED); | ||
1940 | } | ||
1941 | |||
1942 | tp->ucopy.memory = 0; | ||
1943 | } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { | ||
1944 | wake_up_interruptible_sync_poll(sk_sleep(sk), | ||
1945 | POLLIN | POLLRDNORM | POLLRDBAND); | ||
1946 | if (!inet_csk_ack_scheduled(sk)) | ||
1947 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, | ||
1948 | (3 * tcp_rto_min(sk)) / 4, | ||
1949 | TCP_RTO_MAX); | ||
1950 | } | ||
1951 | return true; | ||
1952 | } | ||
1953 | EXPORT_SYMBOL(tcp_prequeue); | ||
1954 | |||
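tcp_prequeue() admission has three outcomes: refuse the skb (the normal receive path handles it), queue it but flush everything to the socket backlog once ucopy.memory exceeds sk_rcvbuf, or queue it and wake the sleeping reader when it is the first packet. A userspace sketch of that control flow, with the socket reduced to a few counters (illustrative types, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    enum outcome { REFUSED, QUEUED, FLUSHED_TO_BACKLOG, QUEUED_AND_WOKE };

    struct pq {
        bool low_latency;     /* sysctl_tcp_low_latency analogue */
        bool reader_waiting;  /* tp->ucopy.task != NULL */
        int  qlen;            /* skb_queue_len(&prequeue) */
        long memory;          /* tp->ucopy.memory */
        long rcvbuf;          /* sk->sk_rcvbuf */
    };

    static enum outcome prequeue(struct pq *q, long truesize, bool hdr_only)
    {
        if (q->low_latency || !q->reader_waiting)
            return REFUSED;
        if (hdr_only && q->qlen == 0)   /* pure ACK, nothing queued yet */
            return REFUSED;

        q->qlen++;
        q->memory += truesize;
        if (q->memory > q->rcvbuf) {    /* overflow: drain to backlog */
            q->qlen = 0;
            q->memory = 0;
            return FLUSHED_TO_BACKLOG;
        }
        if (q->qlen == 1)               /* first skb: wake the reader */
            return QUEUED_AND_WOKE;
        return QUEUED;
    }

    int main(void)
    {
        struct pq q = { false, true, 0, 0, 4096 };
        printf("%d\n", prequeue(&q, 1500, false));  /* QUEUED_AND_WOKE */
        printf("%d\n", prequeue(&q, 4000, false));  /* FLUSHED_TO_BACKLOG */
        return 0;
    }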
1953 | /* | 1955 | /* |
1954 | * From tcp_input.c | 1956 | * From tcp_input.c |
1955 | */ | 1957 | */ |
@@ -2197,12 +2199,6 @@ void tcp_v4_destroy_sock(struct sock *sk) | |||
2197 | if (inet_csk(sk)->icsk_bind_hash) | 2199 | if (inet_csk(sk)->icsk_bind_hash) |
2198 | inet_put_port(sk); | 2200 | inet_put_port(sk); |
2199 | 2201 | ||
2200 | /* TCP Cookie Transactions */ | ||
2201 | if (tp->cookie_values != NULL) { | ||
2202 | kref_put(&tp->cookie_values->kref, | ||
2203 | tcp_cookie_values_release); | ||
2204 | tp->cookie_values = NULL; | ||
2205 | } | ||
2206 | BUG_ON(tp->fastopen_rsk != NULL); | 2202 | BUG_ON(tp->fastopen_rsk != NULL); |
2207 | 2203 | ||
2208 | /* If socket is aborted during connect operation */ | 2204 | /* If socket is aborted during connect operation */ |
@@ -2659,7 +2655,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
2659 | __u16 srcp = ntohs(inet->inet_sport); | 2655 | __u16 srcp = ntohs(inet->inet_sport); |
2660 | int rx_queue; | 2656 | int rx_queue; |
2661 | 2657 | ||
2662 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { | 2658 | if (icsk->icsk_pending == ICSK_TIME_RETRANS || |
2659 | icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || | ||
2660 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | ||
2663 | timer_active = 1; | 2661 | timer_active = 1; |
2664 | timer_expires = icsk->icsk_timeout; | 2662 | timer_expires = icsk->icsk_timeout; |
2665 | } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { | 2663 | } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b83a49cc3816..05eaf8904613 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -93,13 +93,12 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | |||
93 | const struct tcphdr *th) | 93 | const struct tcphdr *th) |
94 | { | 94 | { |
95 | struct tcp_options_received tmp_opt; | 95 | struct tcp_options_received tmp_opt; |
96 | const u8 *hash_location; | ||
97 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 96 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
98 | bool paws_reject = false; | 97 | bool paws_reject = false; |
99 | 98 | ||
100 | tmp_opt.saw_tstamp = 0; | 99 | tmp_opt.saw_tstamp = 0; |
101 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { | 100 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { |
102 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 101 | tcp_parse_options(skb, &tmp_opt, 0, NULL); |
103 | 102 | ||
104 | if (tmp_opt.saw_tstamp) { | 103 | if (tmp_opt.saw_tstamp) { |
105 | tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; | 104 | tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; |
@@ -388,32 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
388 | struct tcp_request_sock *treq = tcp_rsk(req); | 387 | struct tcp_request_sock *treq = tcp_rsk(req); |
389 | struct inet_connection_sock *newicsk = inet_csk(newsk); | 388 | struct inet_connection_sock *newicsk = inet_csk(newsk); |
390 | struct tcp_sock *newtp = tcp_sk(newsk); | 389 | struct tcp_sock *newtp = tcp_sk(newsk); |
391 | struct tcp_sock *oldtp = tcp_sk(sk); | ||
392 | struct tcp_cookie_values *oldcvp = oldtp->cookie_values; | ||
393 | |||
394 | /* TCP Cookie Transactions require space for the cookie pair, | ||
395 | * as it differs for each connection. There is no need to | ||
396 | * copy any s_data_payload stored at the original socket. | ||
397 | * Failure will prevent resuming the connection. | ||
398 | * | ||
399 | * Presumed copied, in order of appearance: | ||
400 | * cookie_in_always, cookie_out_never | ||
401 | */ | ||
402 | if (oldcvp != NULL) { | ||
403 | struct tcp_cookie_values *newcvp = | ||
404 | kzalloc(sizeof(*newtp->cookie_values), | ||
405 | GFP_ATOMIC); | ||
406 | |||
407 | if (newcvp != NULL) { | ||
408 | kref_init(&newcvp->kref); | ||
409 | newcvp->cookie_desired = | ||
410 | oldcvp->cookie_desired; | ||
411 | newtp->cookie_values = newcvp; | ||
412 | } else { | ||
413 | /* Not Yet Implemented */ | ||
414 | newtp->cookie_values = NULL; | ||
415 | } | ||
416 | } | ||
417 | 390 | ||
418 | /* Now setup tcp_sock */ | 391 | /* Now setup tcp_sock */ |
419 | newtp->pred_flags = 0; | 392 | newtp->pred_flags = 0; |
@@ -422,8 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
422 | newtp->rcv_nxt = treq->rcv_isn + 1; | 395 | newtp->rcv_nxt = treq->rcv_isn + 1; |
423 | 396 | ||
424 | newtp->snd_sml = newtp->snd_una = | 397 | newtp->snd_sml = newtp->snd_una = |
425 | newtp->snd_nxt = newtp->snd_up = | 398 | newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; |
426 | treq->snt_isn + 1 + tcp_s_data_size(oldtp); | ||
427 | 399 | ||
428 | tcp_prequeue_init(newtp); | 400 | tcp_prequeue_init(newtp); |
429 | INIT_LIST_HEAD(&newtp->tsq_node); | 401 | INIT_LIST_HEAD(&newtp->tsq_node); |
@@ -440,6 +412,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
440 | newtp->fackets_out = 0; | 412 | newtp->fackets_out = 0; |
441 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | 413 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
442 | tcp_enable_early_retrans(newtp); | 414 | tcp_enable_early_retrans(newtp); |
415 | newtp->tlp_high_seq = 0; | ||
443 | 416 | ||
444 | /* So many TCP implementations out there (incorrectly) count the | 417 | /* So many TCP implementations out there (incorrectly) count the |
445 | * initial SYN frame in their delayed-ACK and congestion control | 418 | * initial SYN frame in their delayed-ACK and congestion control |
@@ -449,9 +422,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
449 | newtp->snd_cwnd = TCP_INIT_CWND; | 422 | newtp->snd_cwnd = TCP_INIT_CWND; |
450 | newtp->snd_cwnd_cnt = 0; | 423 | newtp->snd_cwnd_cnt = 0; |
451 | 424 | ||
452 | newtp->frto_counter = 0; | ||
453 | newtp->frto_highmark = 0; | ||
454 | |||
455 | if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && | 425 | if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && |
456 | !try_module_get(newicsk->icsk_ca_ops->owner)) | 426 | !try_module_get(newicsk->icsk_ca_ops->owner)) |
457 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; | 427 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; |
@@ -459,8 +429,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
459 | tcp_set_ca_state(newsk, TCP_CA_Open); | 429 | tcp_set_ca_state(newsk, TCP_CA_Open); |
460 | tcp_init_xmit_timers(newsk); | 430 | tcp_init_xmit_timers(newsk); |
461 | skb_queue_head_init(&newtp->out_of_order_queue); | 431 | skb_queue_head_init(&newtp->out_of_order_queue); |
462 | newtp->write_seq = newtp->pushed_seq = | 432 | newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; |
463 | treq->snt_isn + 1 + tcp_s_data_size(oldtp); | ||
464 | 433 | ||
465 | newtp->rx_opt.saw_tstamp = 0; | 434 | newtp->rx_opt.saw_tstamp = 0; |
466 | 435 | ||
@@ -537,7 +506,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
537 | bool fastopen) | 506 | bool fastopen) |
538 | { | 507 | { |
539 | struct tcp_options_received tmp_opt; | 508 | struct tcp_options_received tmp_opt; |
540 | const u8 *hash_location; | ||
541 | struct sock *child; | 509 | struct sock *child; |
542 | const struct tcphdr *th = tcp_hdr(skb); | 510 | const struct tcphdr *th = tcp_hdr(skb); |
543 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 511 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
@@ -547,7 +515,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
547 | 515 | ||
548 | tmp_opt.saw_tstamp = 0; | 516 | tmp_opt.saw_tstamp = 0; |
549 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 517 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
550 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 518 | tcp_parse_options(skb, &tmp_opt, 0, NULL); |
551 | 519 | ||
552 | if (tmp_opt.saw_tstamp) { | 520 | if (tmp_opt.saw_tstamp) { |
553 | tmp_opt.ts_recent = req->ts_recent; | 521 | tmp_opt.ts_recent = req->ts_recent; |
@@ -647,7 +615,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
647 | */ | 615 | */ |
648 | if ((flg & TCP_FLAG_ACK) && !fastopen && | 616 | if ((flg & TCP_FLAG_ACK) && !fastopen && |
649 | (TCP_SKB_CB(skb)->ack_seq != | 617 | (TCP_SKB_CB(skb)->ack_seq != |
650 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) | 618 | tcp_rsk(req)->snt_isn + 1)) |
651 | return sk; | 619 | return sk; |
652 | 620 | ||
653 | /* Also, it would be not so bad idea to check rcv_tsecr, which | 621 | /* Also, it would be not so bad idea to check rcv_tsecr, which |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5d0b4387cba6..af354c98fdb5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -65,27 +65,22 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; | |||
65 | /* By default, RFC2861 behavior. */ | 65 | /* By default, RFC2861 behavior. */ |
66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | 66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
67 | 67 | ||
68 | int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ | ||
69 | EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); | ||
70 | |||
71 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | 68 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
72 | int push_one, gfp_t gfp); | 69 | int push_one, gfp_t gfp); |
73 | 70 | ||
74 | /* Account for new data that has been sent to the network. */ | 71 | /* Account for new data that has been sent to the network. */ |
75 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | 72 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) |
76 | { | 73 | { |
74 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
77 | struct tcp_sock *tp = tcp_sk(sk); | 75 | struct tcp_sock *tp = tcp_sk(sk); |
78 | unsigned int prior_packets = tp->packets_out; | 76 | unsigned int prior_packets = tp->packets_out; |
79 | 77 | ||
80 | tcp_advance_send_head(sk, skb); | 78 | tcp_advance_send_head(sk, skb); |
81 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; | 79 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; |
82 | 80 | ||
83 | /* Don't override Nagle indefinitely with F-RTO */ | ||
84 | if (tp->frto_counter == 2) | ||
85 | tp->frto_counter = 3; | ||
86 | |||
87 | tp->packets_out += tcp_skb_pcount(skb); | 81 | tp->packets_out += tcp_skb_pcount(skb); |
88 | if (!prior_packets || tp->early_retrans_delayed) | 82 | if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || |
83 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) | ||
89 | tcp_rearm_rto(sk); | 84 | tcp_rearm_rto(sk); |
90 | } | 85 | } |
91 | 86 | ||
@@ -384,7 +379,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) | |||
384 | #define OPTION_TS (1 << 1) | 379 | #define OPTION_TS (1 << 1) |
385 | #define OPTION_MD5 (1 << 2) | 380 | #define OPTION_MD5 (1 << 2) |
386 | #define OPTION_WSCALE (1 << 3) | 381 | #define OPTION_WSCALE (1 << 3) |
387 | #define OPTION_COOKIE_EXTENSION (1 << 4) | ||
388 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) | 382 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) |
389 | 383 | ||
390 | struct tcp_out_options { | 384 | struct tcp_out_options { |
@@ -398,36 +392,6 @@ struct tcp_out_options { | |||
398 | struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ | 392 | struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ |
399 | }; | 393 | }; |
400 | 394 | ||
401 | /* The sysctl int routines are generic, so check consistency here. | ||
402 | */ | ||
403 | static u8 tcp_cookie_size_check(u8 desired) | ||
404 | { | ||
405 | int cookie_size; | ||
406 | |||
407 | if (desired > 0) | ||
408 | /* previously specified */ | ||
409 | return desired; | ||
410 | |||
411 | cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); | ||
412 | if (cookie_size <= 0) | ||
413 | /* no default specified */ | ||
414 | return 0; | ||
415 | |||
416 | if (cookie_size <= TCP_COOKIE_MIN) | ||
417 | /* value too small, specify minimum */ | ||
418 | return TCP_COOKIE_MIN; | ||
419 | |||
420 | if (cookie_size >= TCP_COOKIE_MAX) | ||
421 | /* value too large, specify maximum */ | ||
422 | return TCP_COOKIE_MAX; | ||
423 | |||
424 | if (cookie_size & 1) | ||
425 | /* 8-bit multiple, illegal, fix it */ | ||
426 | cookie_size++; | ||
427 | |||
428 | return (u8)cookie_size; | ||
429 | } | ||
430 | |||
431 | /* Write previously computed TCP options to the packet. | 395 | /* Write previously computed TCP options to the packet. |
432 | * | 396 | * |
433 | * Beware: Something in the Internet is very sensitive to the ordering of | 397 | * Beware: Something in the Internet is very sensitive to the ordering of |
@@ -446,27 +410,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | |||
446 | { | 410 | { |
447 | u16 options = opts->options; /* mungable copy */ | 411 | u16 options = opts->options; /* mungable copy */ |
448 | 412 | ||
449 | /* Having both authentication and cookies for security is redundant, | ||
450 | * and there's certainly not enough room. Instead, the cookie-less | ||
451 | * extension variant is proposed. | ||
452 | * | ||
453 | * Consider the pessimal case with authentication. The options | ||
454 | * could look like: | ||
455 | * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 | ||
456 | */ | ||
457 | if (unlikely(OPTION_MD5 & options)) { | 413 | if (unlikely(OPTION_MD5 & options)) { |
458 | if (unlikely(OPTION_COOKIE_EXTENSION & options)) { | 414 | *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | |
459 | *ptr++ = htonl((TCPOPT_COOKIE << 24) | | 415 | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); |
460 | (TCPOLEN_COOKIE_BASE << 16) | | ||
461 | (TCPOPT_MD5SIG << 8) | | ||
462 | TCPOLEN_MD5SIG); | ||
463 | } else { | ||
464 | *ptr++ = htonl((TCPOPT_NOP << 24) | | ||
465 | (TCPOPT_NOP << 16) | | ||
466 | (TCPOPT_MD5SIG << 8) | | ||
467 | TCPOLEN_MD5SIG); | ||
468 | } | ||
469 | options &= ~OPTION_COOKIE_EXTENSION; | ||
470 | /* overload cookie hash location */ | 416 | /* overload cookie hash location */ |
471 | opts->hash_location = (__u8 *)ptr; | 417 | opts->hash_location = (__u8 *)ptr; |
472 | ptr += 4; | 418 | ptr += 4; |
@@ -495,44 +441,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | |||
495 | *ptr++ = htonl(opts->tsecr); | 441 | *ptr++ = htonl(opts->tsecr); |
496 | } | 442 | } |
497 | 443 | ||
498 | /* Specification requires after timestamp, so do it now. | ||
499 | * | ||
500 | * Consider the pessimal case without authentication. The options | ||
501 | * could look like: | ||
502 | * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 | ||
503 | */ | ||
504 | if (unlikely(OPTION_COOKIE_EXTENSION & options)) { | ||
505 | __u8 *cookie_copy = opts->hash_location; | ||
506 | u8 cookie_size = opts->hash_size; | ||
507 | |||
508 | /* 8-bit multiple handled in tcp_cookie_size_check() above, | ||
509 | * and elsewhere. | ||
510 | */ | ||
511 | if (0x2 & cookie_size) { | ||
512 | __u8 *p = (__u8 *)ptr; | ||
513 | |||
514 | /* 16-bit multiple */ | ||
515 | *p++ = TCPOPT_COOKIE; | ||
516 | *p++ = TCPOLEN_COOKIE_BASE + cookie_size; | ||
517 | *p++ = *cookie_copy++; | ||
518 | *p++ = *cookie_copy++; | ||
519 | ptr++; | ||
520 | cookie_size -= 2; | ||
521 | } else { | ||
522 | /* 32-bit multiple */ | ||
523 | *ptr++ = htonl(((TCPOPT_NOP << 24) | | ||
524 | (TCPOPT_NOP << 16) | | ||
525 | (TCPOPT_COOKIE << 8) | | ||
526 | TCPOLEN_COOKIE_BASE) + | ||
527 | cookie_size); | ||
528 | } | ||
529 | |||
530 | if (cookie_size > 0) { | ||
531 | memcpy(ptr, cookie_copy, cookie_size); | ||
532 | ptr += (cookie_size / 4); | ||
533 | } | ||
534 | } | ||
535 | |||
536 | if (unlikely(OPTION_SACK_ADVERTISE & options)) { | 444 | if (unlikely(OPTION_SACK_ADVERTISE & options)) { |
537 | *ptr++ = htonl((TCPOPT_NOP << 24) | | 445 | *ptr++ = htonl((TCPOPT_NOP << 24) | |
538 | (TCPOPT_NOP << 16) | | 446 | (TCPOPT_NOP << 16) | |
@@ -591,11 +499,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
591 | struct tcp_md5sig_key **md5) | 499 | struct tcp_md5sig_key **md5) |
592 | { | 500 | { |
593 | struct tcp_sock *tp = tcp_sk(sk); | 501 | struct tcp_sock *tp = tcp_sk(sk); |
594 | struct tcp_cookie_values *cvp = tp->cookie_values; | ||
595 | unsigned int remaining = MAX_TCP_OPTION_SPACE; | 502 | unsigned int remaining = MAX_TCP_OPTION_SPACE; |
596 | u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? | ||
597 | tcp_cookie_size_check(cvp->cookie_desired) : | ||
598 | 0; | ||
599 | struct tcp_fastopen_request *fastopen = tp->fastopen_req; | 503 | struct tcp_fastopen_request *fastopen = tp->fastopen_req; |
600 | 504 | ||
601 | #ifdef CONFIG_TCP_MD5SIG | 505 | #ifdef CONFIG_TCP_MD5SIG |
@@ -647,52 +551,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
647 | tp->syn_fastopen = 1; | 551 | tp->syn_fastopen = 1; |
648 | } | 552 | } |
649 | } | 553 | } |
650 | /* Note that timestamps are required by the specification. | ||
651 | * | ||
652 | * Odd numbers of bytes are prohibited by the specification, ensuring | ||
653 | * that the cookie is 16-bit aligned, and the resulting cookie pair is | ||
654 | * 32-bit aligned. | ||
655 | */ | ||
656 | if (*md5 == NULL && | ||
657 | (OPTION_TS & opts->options) && | ||
658 | cookie_size > 0) { | ||
659 | int need = TCPOLEN_COOKIE_BASE + cookie_size; | ||
660 | |||
661 | if (0x2 & need) { | ||
662 | /* 32-bit multiple */ | ||
663 | need += 2; /* NOPs */ | ||
664 | |||
665 | if (need > remaining) { | ||
666 | /* try shrinking cookie to fit */ | ||
667 | cookie_size -= 2; | ||
668 | need -= 4; | ||
669 | } | ||
670 | } | ||
671 | while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { | ||
672 | cookie_size -= 4; | ||
673 | need -= 4; | ||
674 | } | ||
675 | if (TCP_COOKIE_MIN <= cookie_size) { | ||
676 | opts->options |= OPTION_COOKIE_EXTENSION; | ||
677 | opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; | ||
678 | opts->hash_size = cookie_size; | ||
679 | |||
680 | /* Remember for future incarnations. */ | ||
681 | cvp->cookie_desired = cookie_size; | ||
682 | |||
683 | if (cvp->cookie_desired != cvp->cookie_pair_size) { | ||
684 | /* Currently use random bytes as a nonce, | ||
685 | * assuming these are completely unpredictable | ||
686 | * by hostile users of the same system. | ||
687 | */ | ||
688 | get_random_bytes(&cvp->cookie_pair[0], | ||
689 | cookie_size); | ||
690 | cvp->cookie_pair_size = cookie_size; | ||
691 | } | ||
692 | 554 | ||
693 | remaining -= need; | ||
694 | } | ||
695 | } | ||
696 | return MAX_TCP_OPTION_SPACE - remaining; | 555 | return MAX_TCP_OPTION_SPACE - remaining; |
697 | } | 556 | } |
698 | 557 | ||
@@ -702,14 +561,10 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
702 | unsigned int mss, struct sk_buff *skb, | 561 | unsigned int mss, struct sk_buff *skb, |
703 | struct tcp_out_options *opts, | 562 | struct tcp_out_options *opts, |
704 | struct tcp_md5sig_key **md5, | 563 | struct tcp_md5sig_key **md5, |
705 | struct tcp_extend_values *xvp, | ||
706 | struct tcp_fastopen_cookie *foc) | 564 | struct tcp_fastopen_cookie *foc) |
707 | { | 565 | { |
708 | struct inet_request_sock *ireq = inet_rsk(req); | 566 | struct inet_request_sock *ireq = inet_rsk(req); |
709 | unsigned int remaining = MAX_TCP_OPTION_SPACE; | 567 | unsigned int remaining = MAX_TCP_OPTION_SPACE; |
710 | u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? | ||
711 | xvp->cookie_plus : | ||
712 | 0; | ||
713 | 568 | ||
714 | #ifdef CONFIG_TCP_MD5SIG | 569 | #ifdef CONFIG_TCP_MD5SIG |
715 | *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); | 570 | *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); |
@@ -757,28 +612,7 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
757 | remaining -= need; | 612 | remaining -= need; |
758 | } | 613 | } |
759 | } | 614 | } |
760 | /* Similar rationale to tcp_syn_options() applies here, too. | 615 | |
761 | * If the <SYN> options fit, the same options should fit now! | ||
762 | */ | ||
763 | if (*md5 == NULL && | ||
764 | ireq->tstamp_ok && | ||
765 | cookie_plus > TCPOLEN_COOKIE_BASE) { | ||
766 | int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ | ||
767 | |||
768 | if (0x2 & need) { | ||
769 | /* 32-bit multiple */ | ||
770 | need += 2; /* NOPs */ | ||
771 | } | ||
772 | if (need <= remaining) { | ||
773 | opts->options |= OPTION_COOKIE_EXTENSION; | ||
774 | opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; | ||
775 | remaining -= need; | ||
776 | } else { | ||
777 | /* There's no error return, so flag it. */ | ||
778 | xvp->cookie_out_never = 1; /* true */ | ||
779 | opts->hash_size = 0; | ||
780 | } | ||
781 | } | ||
782 | return MAX_TCP_OPTION_SPACE - remaining; | 616 | return MAX_TCP_OPTION_SPACE - remaining; |
783 | } | 617 | } |
784 | 618 | ||
@@ -1632,11 +1466,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf | |||
1632 | if (nonagle & TCP_NAGLE_PUSH) | 1466 | if (nonagle & TCP_NAGLE_PUSH) |
1633 | return true; | 1467 | return true; |
1634 | 1468 | ||
1635 | /* Don't use the nagle rule for urgent data (or for the final FIN). | 1469 | /* Don't use the nagle rule for urgent data (or for the final FIN). */ |
1636 | * Nagle can be ignored during F-RTO too (see RFC4138). | 1470 | if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) |
1637 | */ | ||
1638 | if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || | ||
1639 | (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) | ||
1640 | return true; | 1471 | return true; |
1641 | 1472 | ||
1642 | if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) | 1473 | if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) |
@@ -1961,6 +1792,9 @@ static int tcp_mtu_probe(struct sock *sk) | |||
1961 | * snd_up-64k-mss .. snd_up cannot be large. However, taking into | 1792 | * snd_up-64k-mss .. snd_up cannot be large. However, taking into |
1962 | * account rare use of URG, this is not a big flaw. | 1793 | * account rare use of URG, this is not a big flaw. |
1963 | * | 1794 | * |
1795 | * Send at most one packet when push_one > 0. Temporarily ignore | ||
1796 | * cwnd limit to force at most one packet out when push_one == 2. | ||
1797 | * | ||
1964 | * Returns true, if no segments are in flight and we have queued segments, | 1798 | * Returns true, if no segments are in flight and we have queued segments, |
1965 | * but cannot send anything now because of SWS or another problem. | 1799 | * but cannot send anything now because of SWS or another problem. |
1966 | */ | 1800 | */ |
@@ -1996,8 +1830,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1996 | goto repair; /* Skip network transmission */ | 1830 | goto repair; /* Skip network transmission */ |
1997 | 1831 | ||
1998 | cwnd_quota = tcp_cwnd_test(tp, skb); | 1832 | cwnd_quota = tcp_cwnd_test(tp, skb); |
1999 | if (!cwnd_quota) | 1833 | if (!cwnd_quota) { |
2000 | break; | 1834 | if (push_one == 2) |
1835 | /* Force out a loss probe pkt. */ | ||
1836 | cwnd_quota = 1; | ||
1837 | else | ||
1838 | break; | ||
1839 | } | ||
2001 | 1840 | ||
2002 | if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) | 1841 | if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) |
2003 | break; | 1842 | break; |
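The new push_one == 2 case exists solely for the loss-probe path added below in this file: with the congestion window exhausted, one probe segment is still forced out. Reduced to an isolated, hypothetical helper:

    #include <stdio.h>

    /* Segments sendable now: the normal cwnd test, except that a loss
     * probe (push_one == 2) may always force out exactly one segment. */
    static unsigned int probe_quota(unsigned int cwnd_quota, int push_one)
    {
        if (!cwnd_quota && push_one == 2)
            return 1;
        return cwnd_quota;
    }

    int main(void)
    {
        printf("%u %u\n", probe_quota(0, 0), probe_quota(0, 2));  /* 0 1 */
        return 0;
    }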
@@ -2051,10 +1890,129 @@ repair: | |||
2051 | if (likely(sent_pkts)) { | 1890 | if (likely(sent_pkts)) { |
2052 | if (tcp_in_cwnd_reduction(sk)) | 1891 | if (tcp_in_cwnd_reduction(sk)) |
2053 | tp->prr_out += sent_pkts; | 1892 | tp->prr_out += sent_pkts; |
1893 | |||
1894 | /* Send one loss probe per tail loss episode. */ | ||
1895 | if (push_one != 2) | ||
1896 | tcp_schedule_loss_probe(sk); | ||
2054 | tcp_cwnd_validate(sk); | 1897 | tcp_cwnd_validate(sk); |
2055 | return false; | 1898 | return false; |
2056 | } | 1899 | } |
2057 | return !tp->packets_out && tcp_send_head(sk); | 1900 | return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); |
1901 | } | ||
1902 | |||
1903 | bool tcp_schedule_loss_probe(struct sock *sk) | ||
1904 | { | ||
1905 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1906 | struct tcp_sock *tp = tcp_sk(sk); | ||
1907 | u32 timeout, tlp_time_stamp, rto_time_stamp; | ||
1908 | u32 rtt = tp->srtt >> 3; | ||
1909 | |||
1910 | if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) | ||
1911 | return false; | ||
1912 | /* No consecutive loss probes. */ | ||
1913 | if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { | ||
1914 | tcp_rearm_rto(sk); | ||
1915 | return false; | ||
1916 | } | ||
1917 | /* Don't do any loss probe on a Fast Open connection before the | ||
1918 | * 3WHS finishes. | ||
1919 | */ | ||
1920 | if (sk->sk_state == TCP_SYN_RECV) | ||
1921 | return false; | ||
1922 | |||
1923 | /* TLP is only scheduled when next timer event is RTO. */ | ||
1924 | if (icsk->icsk_pending != ICSK_TIME_RETRANS) | ||
1925 | return false; | ||
1926 | |||
1927 | /* Schedule a loss probe in 2*RTT for SACK-capable connections | ||
1928 | * in the Open state that are limited by either cwnd or the application. | ||
1929 | */ | ||
1930 | if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || | ||
1931 | !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) | ||
1932 | return false; | ||
1933 | |||
1934 | if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && | ||
1935 | tcp_send_head(sk)) | ||
1936 | return false; | ||
1937 | |||
1938 | /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account | ||
1939 | * for delayed ack when there's one outstanding packet. | ||
1940 | */ | ||
1941 | timeout = rtt << 1; | ||
1942 | if (tp->packets_out == 1) | ||
1943 | timeout = max_t(u32, timeout, | ||
1944 | (rtt + (rtt >> 1) + TCP_DELACK_MAX)); | ||
1945 | timeout = max_t(u32, timeout, msecs_to_jiffies(10)); | ||
1946 | |||
1947 | /* If RTO is shorter, just schedule TLP in its place. */ | ||
1948 | tlp_time_stamp = tcp_time_stamp + timeout; | ||
1949 | rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; | ||
1950 | if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { | ||
1951 | s32 delta = rto_time_stamp - tcp_time_stamp; | ||
1952 | if (delta > 0) | ||
1953 | timeout = delta; | ||
1954 | } | ||
1955 | |||
1956 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, | ||
1957 | TCP_RTO_MAX); | ||
1958 | return true; | ||
1959 | } | ||
1960 | |||
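The scheduled probe timeout (PTO) is therefore max(2*SRTT, 1.5*SRTT + a delayed-ACK allowance when exactly one packet is outstanding, 10 ms), clamped so it never fires after the pending RTO. A worked sketch in milliseconds (the kernel computes this in jiffies with srtt stored left-shifted by 3; constants here are illustrative):

    #include <stdio.h>

    #define DELACK_MAX_MS 200   /* TCP_DELACK_MAX analogue */

    static unsigned int pto_ms(unsigned int rtt_ms, unsigned int packets_out,
                               unsigned int rto_remaining_ms)
    {
        unsigned int timeout = 2 * rtt_ms;

        if (packets_out == 1 &&
            timeout < rtt_ms + rtt_ms / 2 + DELACK_MAX_MS)
            timeout = rtt_ms + rtt_ms / 2 + DELACK_MAX_MS;
        if (timeout < 10)
            timeout = 10;
        /* Never schedule the probe past the retransmission timer. */
        if (timeout > rto_remaining_ms)
            timeout = rto_remaining_ms;
        return timeout;
    }

    int main(void)
    {
        printf("%u\n", pto_ms(50, 4, 300));  /* 100: plain 2*RTT   */
        printf("%u\n", pto_ms(50, 1, 300));  /* 275: 1.5*RTT + 200 */
        printf("%u\n", pto_ms(1, 4, 300));   /* 10:  floor         */
        return 0;
    }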
1961 | /* When probe timeout (PTO) fires, send a new segment if one exists, else | ||
1962 | * retransmit the last segment. | ||
1963 | */ | ||
1964 | void tcp_send_loss_probe(struct sock *sk) | ||
1965 | { | ||
1966 | struct tcp_sock *tp = tcp_sk(sk); | ||
1967 | struct sk_buff *skb; | ||
1968 | int pcount; | ||
1969 | int mss = tcp_current_mss(sk); | ||
1970 | int err = -1; | ||
1971 | |||
1972 | if (tcp_send_head(sk) != NULL) { | ||
1973 | err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); | ||
1974 | goto rearm_timer; | ||
1975 | } | ||
1976 | |||
1977 | /* At most one outstanding TLP retransmission. */ | ||
1978 | if (tp->tlp_high_seq) | ||
1979 | goto rearm_timer; | ||
1980 | |||
1981 | /* Retransmit last segment. */ | ||
1982 | skb = tcp_write_queue_tail(sk); | ||
1983 | if (WARN_ON(!skb)) | ||
1984 | goto rearm_timer; | ||
1985 | |||
1986 | pcount = tcp_skb_pcount(skb); | ||
1987 | if (WARN_ON(!pcount)) | ||
1988 | goto rearm_timer; | ||
1989 | |||
1990 | if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { | ||
1991 | if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) | ||
1992 | goto rearm_timer; | ||
1993 | skb = tcp_write_queue_tail(sk); | ||
1994 | } | ||
1995 | |||
1996 | if (WARN_ON(!skb || !tcp_skb_pcount(skb))) | ||
1997 | goto rearm_timer; | ||
1998 | |||
1999 | /* Probe with zero data doesn't trigger fast recovery. */ | ||
2000 | if (skb->len > 0) | ||
2001 | err = __tcp_retransmit_skb(sk, skb); | ||
2002 | |||
2003 | /* Record snd_nxt for loss detection. */ | ||
2004 | if (likely(!err)) | ||
2005 | tp->tlp_high_seq = tp->snd_nxt; | ||
2006 | |||
2007 | rearm_timer: | ||
2008 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
2009 | inet_csk(sk)->icsk_rto, | ||
2010 | TCP_RTO_MAX); | ||
2011 | |||
2012 | if (likely(!err)) | ||
2013 | NET_INC_STATS_BH(sock_net(sk), | ||
2014 | LINUX_MIB_TCPLOSSPROBES); | ||
2015 | return; | ||
2058 | } | 2016 | } |
2059 | 2017 | ||
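When no new segment is available, the probe retransmits only the last MSS of the tail skb, splitting a multi-segment skb first so that a single segment goes out. The split arithmetic in isolation (mirrors the tcp_fragment() call above; the helper name is illustrative):

    #include <stdio.h>

    /* Byte offset at which to split a pcount-segment tail skb so the
     * probe resends only its final segment (at most mss bytes). */
    static unsigned int probe_split_offset(unsigned int skb_len,
                                           unsigned int pcount,
                                           unsigned int mss)
    {
        if (pcount > 1 && skb_len > (pcount - 1) * mss)
            return (pcount - 1) * mss;
        return 0;   /* already a single segment: resend as-is */
    }

    int main(void)
    {
        printf("%u\n", probe_split_offset(4000, 3, 1448));  /* 2896 */
        printf("%u\n", probe_split_offset(1000, 1, 1448));  /* 0 */
        return 0;
    }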
2060 | /* Push out any pending frames which were held back due to | 2018 | /* Push out any pending frames which were held back due to |
@@ -2675,32 +2633,24 @@ int tcp_send_synack(struct sock *sk) | |||
2675 | * sk: listener socket | 2633 | * sk: listener socket |
2676 | * dst: dst entry attached to the SYNACK | 2634 | * dst: dst entry attached to the SYNACK |
2677 | * req: request_sock pointer | 2635 | * req: request_sock pointer |
2678 | * rvp: request_values pointer | ||
2679 | * | 2636 | * |
2680 | * Allocate one skb and build a SYNACK packet. | 2637 | * Allocate one skb and build a SYNACK packet. |
2681 | * @dst is consumed : Caller should not use it again. | 2638 | * @dst is consumed : Caller should not use it again. |
2682 | */ | 2639 | */ |
2683 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | 2640 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
2684 | struct request_sock *req, | 2641 | struct request_sock *req, |
2685 | struct request_values *rvp, | ||
2686 | struct tcp_fastopen_cookie *foc) | 2642 | struct tcp_fastopen_cookie *foc) |
2687 | { | 2643 | { |
2688 | struct tcp_out_options opts; | 2644 | struct tcp_out_options opts; |
2689 | struct tcp_extend_values *xvp = tcp_xv(rvp); | ||
2690 | struct inet_request_sock *ireq = inet_rsk(req); | 2645 | struct inet_request_sock *ireq = inet_rsk(req); |
2691 | struct tcp_sock *tp = tcp_sk(sk); | 2646 | struct tcp_sock *tp = tcp_sk(sk); |
2692 | const struct tcp_cookie_values *cvp = tp->cookie_values; | ||
2693 | struct tcphdr *th; | 2647 | struct tcphdr *th; |
2694 | struct sk_buff *skb; | 2648 | struct sk_buff *skb; |
2695 | struct tcp_md5sig_key *md5; | 2649 | struct tcp_md5sig_key *md5; |
2696 | int tcp_header_size; | 2650 | int tcp_header_size; |
2697 | int mss; | 2651 | int mss; |
2698 | int s_data_desired = 0; | ||
2699 | 2652 | ||
2700 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) | 2653 | skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); |
2701 | s_data_desired = cvp->s_data_desired; | ||
2702 | skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, | ||
2703 | sk_gfp_atomic(sk, GFP_ATOMIC)); | ||
2704 | if (unlikely(!skb)) { | 2654 | if (unlikely(!skb)) { |
2705 | dst_release(dst); | 2655 | dst_release(dst); |
2706 | return NULL; | 2656 | return NULL; |
@@ -2742,9 +2692,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2742 | else | 2692 | else |
2743 | #endif | 2693 | #endif |
2744 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2694 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
2745 | tcp_header_size = tcp_synack_options(sk, req, mss, | 2695 | tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, |
2746 | skb, &opts, &md5, xvp, foc) | 2696 | foc) + sizeof(*th); |
2747 | + sizeof(*th); | ||
2748 | 2697 | ||
2749 | skb_push(skb, tcp_header_size); | 2698 | skb_push(skb, tcp_header_size); |
2750 | skb_reset_transport_header(skb); | 2699 | skb_reset_transport_header(skb); |
@@ -2762,40 +2711,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2762 | tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, | 2711 | tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, |
2763 | TCPHDR_SYN | TCPHDR_ACK); | 2712 | TCPHDR_SYN | TCPHDR_ACK); |
2764 | 2713 | ||
2765 | if (OPTION_COOKIE_EXTENSION & opts.options) { | ||
2766 | if (s_data_desired) { | ||
2767 | u8 *buf = skb_put(skb, s_data_desired); | ||
2768 | |||
2769 | /* copy data directly from the listening socket. */ | ||
2770 | memcpy(buf, cvp->s_data_payload, s_data_desired); | ||
2771 | TCP_SKB_CB(skb)->end_seq += s_data_desired; | ||
2772 | } | ||
2773 | |||
2774 | if (opts.hash_size > 0) { | ||
2775 | __u32 workspace[SHA_WORKSPACE_WORDS]; | ||
2776 | u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; | ||
2777 | u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; | ||
2778 | |||
2779 | /* Secret recipe depends on the Timestamp, (future) | ||
2780 | * Sequence and Acknowledgment Numbers, Initiator | ||
2781 | * Cookie, and others handled by IP variant caller. | ||
2782 | */ | ||
2783 | *tail-- ^= opts.tsval; | ||
2784 | *tail-- ^= tcp_rsk(req)->rcv_isn + 1; | ||
2785 | *tail-- ^= TCP_SKB_CB(skb)->seq + 1; | ||
2786 | |||
2787 | /* recommended */ | ||
2788 | *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); | ||
2789 | *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ | ||
2790 | |||
2791 | sha_transform((__u32 *)&xvp->cookie_bakery[0], | ||
2792 | (char *)mess, | ||
2793 | &workspace[0]); | ||
2794 | opts.hash_location = | ||
2795 | (__u8 *)&xvp->cookie_bakery[0]; | ||
2796 | } | ||
2797 | } | ||
2798 | |||
2799 | th->seq = htonl(TCP_SKB_CB(skb)->seq); | 2714 | th->seq = htonl(TCP_SKB_CB(skb)->seq); |
2800 | /* XXX data is queued and acked as is. No buffer/window check */ | 2715 | /* XXX data is queued and acked as is. No buffer/window check */ |
2801 | th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); | 2716 | th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b78aac30c498..4b85e6f636c9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -342,10 +342,6 @@ void tcp_retransmit_timer(struct sock *sk) | |||
342 | struct tcp_sock *tp = tcp_sk(sk); | 342 | struct tcp_sock *tp = tcp_sk(sk); |
343 | struct inet_connection_sock *icsk = inet_csk(sk); | 343 | struct inet_connection_sock *icsk = inet_csk(sk); |
344 | 344 | ||
345 | if (tp->early_retrans_delayed) { | ||
346 | tcp_resume_early_retransmit(sk); | ||
347 | return; | ||
348 | } | ||
349 | if (tp->fastopen_rsk) { | 345 | if (tp->fastopen_rsk) { |
350 | WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && | 346 | WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && |
351 | sk->sk_state != TCP_FIN_WAIT1); | 347 | sk->sk_state != TCP_FIN_WAIT1); |
@@ -360,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk) | |||
360 | 356 | ||
361 | WARN_ON(tcp_write_queue_empty(sk)); | 357 | WARN_ON(tcp_write_queue_empty(sk)); |
362 | 358 | ||
359 | tp->tlp_high_seq = 0; | ||
360 | |||
363 | if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && | 361 | if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && |
364 | !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { | 362 | !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { |
365 | /* Receiver dastardly shrinks window. Our retransmits | 363 | /* Receiver dastardly shrinks window. Our retransmits |
@@ -418,11 +416,7 @@ void tcp_retransmit_timer(struct sock *sk) | |||
418 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | 416 | NET_INC_STATS_BH(sock_net(sk), mib_idx); |
419 | } | 417 | } |
420 | 418 | ||
421 | if (tcp_use_frto(sk)) { | 419 | tcp_enter_loss(sk, 0); |
422 | tcp_enter_frto(sk); | ||
423 | } else { | ||
424 | tcp_enter_loss(sk, 0); | ||
425 | } | ||
426 | 420 | ||
427 | if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { | 421 | if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { |
428 | /* Retransmission failed because of local congestion, | 422 | /* Retransmission failed because of local congestion, |
@@ -495,13 +489,20 @@ void tcp_write_timer_handler(struct sock *sk) | |||
495 | } | 489 | } |
496 | 490 | ||
497 | event = icsk->icsk_pending; | 491 | event = icsk->icsk_pending; |
498 | icsk->icsk_pending = 0; | ||
499 | 492 | ||
500 | switch (event) { | 493 | switch (event) { |
494 | case ICSK_TIME_EARLY_RETRANS: | ||
495 | tcp_resume_early_retransmit(sk); | ||
496 | break; | ||
497 | case ICSK_TIME_LOSS_PROBE: | ||
498 | tcp_send_loss_probe(sk); | ||
499 | break; | ||
501 | case ICSK_TIME_RETRANS: | 500 | case ICSK_TIME_RETRANS: |
501 | icsk->icsk_pending = 0; | ||
502 | tcp_retransmit_timer(sk); | 502 | tcp_retransmit_timer(sk); |
503 | break; | 503 | break; |
504 | case ICSK_TIME_PROBE0: | 504 | case ICSK_TIME_PROBE0: |
505 | icsk->icsk_pending = 0; | ||
505 | tcp_probe_timer(sk); | 506 | tcp_probe_timer(sk); |
506 | break; | 507 | break; |
507 | } | 508 | } |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 1b91bf48e277..76a1e23259e1 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
@@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | |||
236 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 236 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
237 | break; | 237 | break; |
238 | 238 | ||
239 | case CA_EVENT_FRTO: | 239 | case CA_EVENT_LOSS: |
240 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 240 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
241 | /* Update RTT_min when next ack arrives */ | 241 | /* Update RTT_min when next ack arrives */ |
242 | w->reset_rtt_min = 1; | 242 | w->reset_rtt_min = 1; |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0a073a263720..7117d1467b02 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -2279,31 +2279,88 @@ void __init udp_init(void) | |||
2279 | 2279 | ||
2280 | int udp4_ufo_send_check(struct sk_buff *skb) | 2280 | int udp4_ufo_send_check(struct sk_buff *skb) |
2281 | { | 2281 | { |
2282 | const struct iphdr *iph; | 2282 | if (!pskb_may_pull(skb, sizeof(struct udphdr))) |
2283 | struct udphdr *uh; | ||
2284 | |||
2285 | if (!pskb_may_pull(skb, sizeof(*uh))) | ||
2286 | return -EINVAL; | 2283 | return -EINVAL; |
2287 | 2284 | ||
2288 | iph = ip_hdr(skb); | 2285 | if (likely(!skb->encapsulation)) { |
2289 | uh = udp_hdr(skb); | 2286 | const struct iphdr *iph; |
2287 | struct udphdr *uh; | ||
2290 | 2288 | ||
2291 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, | 2289 | iph = ip_hdr(skb); |
2292 | IPPROTO_UDP, 0); | 2290 | uh = udp_hdr(skb); |
2293 | skb->csum_start = skb_transport_header(skb) - skb->head; | 2291 | |
2294 | skb->csum_offset = offsetof(struct udphdr, check); | 2292 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, |
2295 | skb->ip_summed = CHECKSUM_PARTIAL; | 2293 | IPPROTO_UDP, 0); |
2294 | skb->csum_start = skb_transport_header(skb) - skb->head; | ||
2295 | skb->csum_offset = offsetof(struct udphdr, check); | ||
2296 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
2297 | } | ||
2296 | return 0; | 2298 | return 0; |
2297 | } | 2299 | } |
2298 | 2300 | ||
2301 | static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | ||
2302 | netdev_features_t features) | ||
2303 | { | ||
2304 | struct sk_buff *segs = ERR_PTR(-EINVAL); | ||
2305 | int mac_len = skb->mac_len; | ||
2306 | int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); | ||
2307 | int outer_hlen; | ||
2308 | netdev_features_t enc_features; | ||
2309 | |||
2310 | if (unlikely(!pskb_may_pull(skb, tnl_hlen))) | ||
2311 | goto out; | ||
2312 | |||
2313 | skb->encapsulation = 0; | ||
2314 | __skb_pull(skb, tnl_hlen); | ||
2315 | skb_reset_mac_header(skb); | ||
2316 | skb_set_network_header(skb, skb_inner_network_offset(skb)); | ||
2317 | skb->mac_len = skb_inner_network_offset(skb); | ||
2318 | |||
2319 | /* segment inner packet. */ | ||
2320 | enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); | ||
2321 | segs = skb_mac_gso_segment(skb, enc_features); | ||
2322 | if (!segs || IS_ERR(segs)) | ||
2323 | goto out; | ||
2324 | |||
2325 | outer_hlen = skb_tnl_header_len(skb); | ||
2326 | skb = segs; | ||
2327 | do { | ||
2328 | struct udphdr *uh; | ||
2329 | int udp_offset = outer_hlen - tnl_hlen; | ||
2330 | |||
2331 | skb->mac_len = mac_len; | ||
2332 | |||
2333 | skb_push(skb, outer_hlen); | ||
2334 | skb_reset_mac_header(skb); | ||
2335 | skb_set_network_header(skb, mac_len); | ||
2336 | skb_set_transport_header(skb, udp_offset); | ||
2337 | uh = udp_hdr(skb); | ||
2338 | uh->len = htons(skb->len - udp_offset); | ||
2339 | |||
2340 | /* csum segment if tunnel sets skb with csum. */ | ||
2341 | if (unlikely(uh->check)) { | ||
2342 | struct iphdr *iph = ip_hdr(skb); | ||
2343 | |||
2344 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, | ||
2345 | skb->len - udp_offset, | ||
2346 | IPPROTO_UDP, 0); | ||
2347 | uh->check = csum_fold(skb_checksum(skb, udp_offset, | ||
2348 | skb->len - udp_offset, 0)); | ||
2349 | if (uh->check == 0) | ||
2350 | uh->check = CSUM_MANGLED_0; | ||
2351 | |||
2352 | } | ||
2353 | skb->ip_summed = CHECKSUM_NONE; | ||
2354 | } while ((skb = skb->next)); | ||
2355 | out: | ||
2356 | return segs; | ||
2357 | } | ||
2358 | |||
2299 | struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, | 2359 | struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, |
2300 | netdev_features_t features) | 2360 | netdev_features_t features) |
2301 | { | 2361 | { |
2302 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 2362 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
2303 | unsigned int mss; | 2363 | unsigned int mss; |
2304 | int offset; | ||
2305 | __wsum csum; | ||
2306 | |||
2307 | mss = skb_shinfo(skb)->gso_size; | 2364 | mss = skb_shinfo(skb)->gso_size; |
2308 | if (unlikely(skb->len <= mss)) | 2365 | if (unlikely(skb->len <= mss)) |
2309 | goto out; | 2366 | goto out; |
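
skb_udp_tunnel_segment() above is the core of the new UDP-tunnel GSO path: it strips the outer (tunnel) headers, lets skb_mac_gso_segment() split the inner packet, then pushes the outer headers back onto every resulting segment, fixing up the outer UDP length and, when the tunnel asked for it, the outer UDP checksum. That checksum is the standard Internet checksum over an IPv4 pseudo header plus the UDP segment, folded to 16 bits, with a result of 0 remapped to 0xFFFF (CSUM_MANGLED_0), since a zero UDP checksum means "no checksum". A plain-C sketch of that computation, independent of the kernel's csum helpers (csum_add16, fold16 and udp4_checksum below are illustrative names):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Ones-complement sum over big-endian 16-bit words. */
static uint32_t csum_add16(uint32_t sum, const uint8_t *p, size_t len)
{
	while (len > 1) { sum += (p[0] << 8) | p[1]; p += 2; len -= 2; }
	if (len)
		sum += p[0] << 8;   /* pad a trailing odd byte with zero */
	return sum;
}

static uint16_t fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

/* saddr/daddr in network byte order; the UDP check field must be zero. */
static uint16_t udp4_checksum(uint32_t saddr, uint32_t daddr,
			      const uint8_t *udp, uint16_t udp_len)
{
	uint8_t pseudo[12];
	uint32_t sum = 0;
	uint16_t check;

	/* IPv4 pseudo header: src, dst, zero, proto (17), UDP length */
	memcpy(pseudo, &saddr, 4);
	memcpy(pseudo + 4, &daddr, 4);
	pseudo[8] = 0; pseudo[9] = 17;
	pseudo[10] = udp_len >> 8; pseudo[11] = udp_len & 0xFF;

	sum = csum_add16(sum, pseudo, sizeof(pseudo));
	sum = csum_add16(sum, udp, udp_len);
	check = fold16(sum);
	return check ? check : 0xFFFF;   /* CSUM_MANGLED_0 equivalent */
}

int main(void)
{
	/* 8-byte UDP header (check zeroed) plus a 4-byte payload */
	uint8_t udp[12] = { 0x30, 0x39, 0x00, 0x35, 0x00, 0x0c, 0, 0,
			    'p', 'i', 'n', 'g' };
	uint8_t s[4] = { 192, 0, 2, 1 }, d[4] = { 192, 0, 2, 2 };
	uint32_t src, dst;

	memcpy(&src, s, 4);
	memcpy(&dst, d, 4);
	printf("check = 0x%04x\n", udp4_checksum(src, dst, udp, 12));
	return 0;
}

The loop in the hunk computes the same quantity incrementally: csum_tcpudp_magic() seeds the pseudo-header sum into uh->check, and csum_fold(skb_checksum(...)) then folds it together with the segment data.
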
@@ -2313,6 +2370,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, | |||
2313 | int type = skb_shinfo(skb)->gso_type; | 2370 | int type = skb_shinfo(skb)->gso_type; |
2314 | 2371 | ||
2315 | if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | | 2372 | if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | |
2373 | SKB_GSO_UDP_TUNNEL | | ||
2316 | SKB_GSO_GRE) || | 2374 | SKB_GSO_GRE) || |
2317 | !(type & (SKB_GSO_UDP)))) | 2375 | !(type & (SKB_GSO_UDP)))) |
2318 | goto out; | 2376 | goto out; |
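
The only change in this hunk is widening the whitelist: SKB_GSO_UDP_TUNNEL is now an acceptable companion bit. The test rejects a gso_type carrying any bit outside the allowed set and also requires the UDP bit itself. A small sketch of that bitmask idiom (bit values are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define GSO_UDP        (1 << 0)
#define GSO_DODGY      (1 << 1)
#define GSO_UDP_TUNNEL (1 << 2)
#define GSO_GRE        (1 << 3)

static bool udp_gso_type_ok(unsigned int type)
{
	unsigned int allowed = GSO_UDP | GSO_DODGY | GSO_UDP_TUNNEL | GSO_GRE;

	if (type & ~allowed)        /* some unknown bit is set */
		return false;
	return type & GSO_UDP;      /* the UDP bit is mandatory */
}

int main(void)
{
	printf("%d %d %d\n",
	       udp_gso_type_ok(GSO_UDP),                    /* 1 */
	       udp_gso_type_ok(GSO_UDP | GSO_UDP_TUNNEL),   /* 1 */
	       udp_gso_type_ok(GSO_UDP_TUNNEL));            /* 0: no UDP bit */
	return 0;
}
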
@@ -2323,20 +2381,27 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, | |||
2323 | goto out; | 2381 | goto out; |
2324 | } | 2382 | } |
2325 | 2383 | ||
2326 | /* Do software UFO. Complete and fill in the UDP checksum as HW cannot | ||
2327 | * do checksum of UDP packets sent as multiple IP fragments. | ||
2328 | */ | ||
2329 | offset = skb_checksum_start_offset(skb); | ||
2330 | csum = skb_checksum(skb, offset, skb->len - offset, 0); | ||
2331 | offset += skb->csum_offset; | ||
2332 | *(__sum16 *)(skb->data + offset) = csum_fold(csum); | ||
2333 | skb->ip_summed = CHECKSUM_NONE; | ||
2334 | |||
2335 | /* Fragment the skb. IP headers of the fragments are updated in | 2384 | /* Fragment the skb. IP headers of the fragments are updated in |
2336 | * inet_gso_segment() | 2385 | * inet_gso_segment() |
2337 | */ | 2386 | */ |
2338 | segs = skb_segment(skb, features); | 2387 | if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) |
2388 | segs = skb_udp_tunnel_segment(skb, features); | ||
2389 | else { | ||
2390 | int offset; | ||
2391 | __wsum csum; | ||
2392 | |||
2393 | /* Do software UFO. Complete and fill in the UDP checksum as | ||
2394 | * HW cannot do checksum of UDP packets sent as multiple | ||
2395 | * IP fragments. | ||
2396 | */ | ||
2397 | offset = skb_checksum_start_offset(skb); | ||
2398 | csum = skb_checksum(skb, offset, skb->len - offset, 0); | ||
2399 | offset += skb->csum_offset; | ||
2400 | *(__sum16 *)(skb->data + offset) = csum_fold(csum); | ||
2401 | skb->ip_summed = CHECKSUM_NONE; | ||
2402 | |||
2403 | segs = skb_segment(skb, features); | ||
2404 | } | ||
2339 | out: | 2405 | out: |
2340 | return segs; | 2406 | return segs; |
2341 | } | 2407 | } |
2342 | |||
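
udp4_ufo_fragment() now forks: encapsulated packets marked SKB_GSO_UDP_TUNNEL take the tunnel path above, while everything else keeps the old software-UFO behaviour of completing the checksum before skb_segment(), because hardware cannot checksum a UDP datagram that leaves as multiple IP fragments. Completing a CHECKSUM_PARTIAL skb means summing from csum_start, where the checksum field was pre-seeded with the pseudo-header sum, and folding the result back in at csum_start + csum_offset. A stand-alone sketch of that fold over a flat byte buffer (the layout and helper names are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Ones-complement sum over big-endian 16-bit words. */
static uint32_t sum16(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;
	while (len > 1) { sum += (p[0] << 8) | p[1]; p += 2; len -= 2; }
	if (len)
		sum += p[0] << 8;
	return sum;
}

/* Finish a partial checksum: the 16-bit field at csum_start + csum_offset
 * already holds the seeded pseudo-header sum and is included in the
 * summation, exactly as skb_checksum() includes uh->check in the kernel. */
static void finish_partial_csum(uint8_t *pkt, size_t len,
				size_t csum_start, size_t csum_offset)
{
	uint32_t sum = sum16(pkt + csum_start, len - csum_start);
	uint16_t check;

	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	check = (uint16_t)~sum;
	pkt[csum_start + csum_offset] = check >> 8;
	pkt[csum_start + csum_offset + 1] = check & 0xFF;
}

int main(void)
{
	/* 20-byte dummy packet: coverage starts at byte 0 and the checksum
	 * field sits at offset 6 (both positions are hypothetical). */
	uint8_t pkt[20] = { 0 };

	finish_partial_csum(pkt, sizeof(pkt), 0, 6);
	printf("check = 0x%02x%02x\n", pkt[6], pkt[7]);
	return 0;
}

After the fold the skb is downgraded to CHECKSUM_NONE, exactly as the else branch in the hunk does, so later layers will not try to offload it again.
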