Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                     |    7
-rw-r--r--  net/ipv4/Makefile                    |    1
-rw-r--r--  net/ipv4/af_inet.c                   |   10
-rw-r--r--  net/ipv4/arp.c                       |   27
-rw-r--r--  net/ipv4/devinet.c                   |   83
-rw-r--r--  net/ipv4/fib_frontend.c              |    4
-rw-r--r--  net/ipv4/gre.c                       |    5
-rw-r--r--  net/ipv4/inet_connection_sock.c      |    2
-rw-r--r--  net/ipv4/inet_diag.c                 |    4
-rw-r--r--  net/ipv4/inet_fragment.c             |   27
-rw-r--r--  net/ipv4/inet_lro.c                  |    5
-rw-r--r--  net/ipv4/ip_fragment.c               |   31
-rw-r--r--  net/ipv4/ip_gre.c                    | 1516
-rw-r--r--  net/ipv4/ip_tunnel.c                 | 1035
-rw-r--r--  net/ipv4/ip_vti.c                    |   42
-rw-r--r--  net/ipv4/ipip.c                      |  748
-rw-r--r--  net/ipv4/ipmr.c                      |    2
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c |    4
-rw-r--r--  net/ipv4/proc.c                      |    2
-rw-r--r--  net/ipv4/route.c                     |    2
-rw-r--r--  net/ipv4/syncookies.c                |    3
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c           |   18
-rw-r--r--  net/ipv4/tcp.c                       |  268
-rw-r--r--  net/ipv4/tcp_input.c                 |  606
-rw-r--r--  net/ipv4/tcp_ipv4.c                  |  108
-rw-r--r--  net/ipv4/tcp_minisocks.c             |   44
-rw-r--r--  net/ipv4/tcp_output.c                |  367
-rw-r--r--  net/ipv4/tcp_timer.c                 |   21
-rw-r--r--  net/ipv4/tcp_westwood.c              |    2
-rw-r--r--  net/ipv4/udp.c                       |  115
30 files changed, 2012 insertions(+), 3097 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7944df768454..8603ca827104 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -166,6 +166,7 @@ config IP_PNP_RARP
 config NET_IPIP
 	tristate "IP: tunneling"
 	select INET_TUNNEL
+	select NET_IP_TUNNEL
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -186,9 +187,14 @@ config NET_IPGRE_DEMUX
 	  This is helper module to demultiplex GRE packets on GRE version field criteria.
 	  Required by ip_gre and pptp modules.
 
+config NET_IP_TUNNEL
+	tristate
+	default n
+
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
 	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+	select NET_IP_TUNNEL
 	help
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -313,6 +319,7 @@ config SYN_COOKIES
 config NET_IPVTI
 	tristate "Virtual (secure) IP: tunneling"
 	select INET_TUNNEL
+	select NET_IP_TUNNEL
 	depends on INET_XFRM_MODE_TUNNEL
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 15ca63ec604e..089cb9f36387 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,6 +13,7 @@ obj-y := route.o inetpeer.o protocol.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
 	     inet_fragment.o ping.o
 
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c929d9c1c4b6..93824c57b108 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -111,7 +111,6 @@
 #include <net/sock.h>
 #include <net/raw.h>
 #include <net/icmp.h>
-#include <net/ipip.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
@@ -1283,9 +1282,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	int ihl;
 	int id;
 	unsigned int offset = 0;
-
-	if (!(features & NETIF_F_V4_CSUM))
-		features &= ~NETIF_F_SG;
+	bool tunnel;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 		     ~(SKB_GSO_TCPV4 |
@@ -1293,6 +1290,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
+		       SKB_GSO_UDP_TUNNEL |
 		       0)))
 		goto out;
 
@@ -1307,6 +1305,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
 
+	tunnel = !!skb->encapsulation;
+
 	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
@@ -1326,7 +1326,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	skb = segs;
 	do {
 		iph = ip_hdr(skb);
-		if (proto == IPPROTO_UDP) {
+		if (!tunnel && proto == IPPROTO_UDP) {
 			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
 			if (skb->next != NULL)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index fea4929f6200..247ec1951c35 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -654,11 +654,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	arp_ptr += dev->addr_len;
 	memcpy(arp_ptr, &src_ip, 4);
 	arp_ptr += 4;
-	if (target_hw != NULL)
-		memcpy(arp_ptr, target_hw, dev->addr_len);
-	else
-		memset(arp_ptr, 0, dev->addr_len);
-	arp_ptr += dev->addr_len;
+
+	switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+	case ARPHRD_IEEE1394:
+		break;
+#endif
+	default:
+		if (target_hw != NULL)
+			memcpy(arp_ptr, target_hw, dev->addr_len);
+		else
+			memset(arp_ptr, 0, dev->addr_len);
+		arp_ptr += dev->addr_len;
+	}
 	memcpy(arp_ptr, &dest_ip, 4);
 
 	return skb;
@@ -781,7 +789,14 @@ static int arp_process(struct sk_buff *skb)
 	arp_ptr += dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
 	arp_ptr += 4;
-	arp_ptr += dev->addr_len;
+	switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+	case ARPHRD_IEEE1394:
+		break;
+#endif
+	default:
+		arp_ptr += dev->addr_len;
+	}
 	memcpy(&tip, arp_ptr, 4);
 /*
  *	Check for bad requests for 127.x.x.x and requests for multicast
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index f678507bc829..5d985e367535 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -536,7 +536,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 	return NULL;
 }
 
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tb[IFA_MAX+1];
@@ -775,7 +775,7 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
 	return NULL;
 }
 
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct in_ifaddr *ifa;
@@ -1499,6 +1499,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 		idx = 0;
 		head = &net->dev_index_head[h];
 		rcu_read_lock();
+		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+			  net->dev_base_seq;
 		hlist_for_each_entry_rcu(dev, head, index_hlist) {
 			if (idx < s_idx)
 				goto cont;
@@ -1519,6 +1521,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				rcu_read_unlock();
 				goto done;
 			}
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 		}
 cont:
 			idx++;
@@ -1730,8 +1733,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
 };
 
 static int inet_netconf_get_devconf(struct sk_buff *in_skb,
-				    struct nlmsghdr *nlh,
-				    void *arg)
+				    struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(in_skb->sk);
 	struct nlattr *tb[NETCONFA_MAX+1];
@@ -1791,6 +1793,77 @@ errout:
 	return err;
 }
 
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int h, s_h;
+	int idx, s_idx;
+	struct net_device *dev;
+	struct in_device *in_dev;
+	struct hlist_head *head;
+
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		rcu_read_lock();
+		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+			  net->dev_base_seq;
+		hlist_for_each_entry_rcu(dev, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			in_dev = __in_dev_get_rcu(dev);
+			if (!in_dev)
+				goto cont;
+
+			if (inet_netconf_fill_devconf(skb, dev->ifindex,
+						      &in_dev->cnf,
+						      NETLINK_CB(cb->skb).portid,
+						      cb->nlh->nlmsg_seq,
+						      RTM_NEWNETCONF,
+						      NLM_F_MULTI,
+						      -1) <= 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+			idx++;
+		}
+		rcu_read_unlock();
+	}
+	if (h == NETDEV_HASHENTRIES) {
+		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+					      net->ipv4.devconf_all,
+					      NETLINK_CB(cb->skb).portid,
+					      cb->nlh->nlmsg_seq,
+					      RTM_NEWNETCONF, NLM_F_MULTI,
+					      -1) <= 0)
+			goto done;
+		else
+			h++;
+	}
+	if (h == NETDEV_HASHENTRIES + 1) {
+		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+					      net->ipv4.devconf_dflt,
+					      NETLINK_CB(cb->skb).portid,
+					      cb->nlh->nlmsg_seq,
+					      RTM_NEWNETCONF, NLM_F_MULTI,
+					      -1) <= 0)
+			goto done;
+		else
+			h++;
+	}
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+
+	return skb->len;
+}
+
 #ifdef CONFIG_SYSCTL
 
 static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -2195,6 +2268,6 @@ void __init devinet_init(void)
 	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
 	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
 	rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
-		      NULL, NULL);
+		      inet_netconf_dump_devconf, NULL);
 }
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index eb4bb12b3eb4..0e74398bc8e6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -604,7 +604,7 @@ errout:
 	return err;
 }
 
-static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_config cfg;
@@ -626,7 +626,7 @@ errout:
 	return err;
 }
 
-static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_config cfg;
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 7a4c710c4cdd..d2d5a99fba09 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -27,11 +27,6 @@
 
 static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
 static DEFINE_SPINLOCK(gre_proto_lock);
-struct gre_base_hdr {
-	__be16 flags;
-	__be16 protocol;
-};
-#define GRE_HEADER_SECTION 4
 
 int gre_add_protocol(const struct gre_protocol *proto, u8 version)
 {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 786d97aee751..6acb541c9091 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -559,7 +559,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
 
 int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
 {
-	int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
+	int err = req->rsk_ops->rtx_syn_ack(parent, req);
 
 	if (!err)
 		req->num_retrans++;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3c788f..8620408af574 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -158,7 +158,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
 #define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
 		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index f4fd23de9b13..1206ca64b0ea 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -23,6 +23,28 @@
 
 #include <net/sock.h>
 #include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+	/* invalid combinations : drop frame */
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
 
 static void inet_frag_secret_rebuild(unsigned long dummy)
 {
@@ -102,7 +124,6 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 {
 	write_lock(&f->lock);
 	hlist_del(&fq->list);
-	fq->net->nqueues--;
 	write_unlock(&f->lock);
 	inet_frag_lru_del(fq);
 }
@@ -182,6 +203,9 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 		q = list_first_entry(&nf->lru_list,
 				struct inet_frag_queue, lru_list);
 		atomic_inc(&q->refcnt);
+		/* Remove q from list to avoid several CPUs grabbing it */
+		list_del_init(&q->lru_list);
+
 		spin_unlock(&nf->lru_lock);
 
 		spin_lock(&q->lock);
@@ -235,7 +259,6 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 
 	atomic_inc(&qp->refcnt);
 	hlist_add_head(&qp->list, &f->hash[hash]);
-	nf->nqueues++;
 	write_unlock(&f->lock);
 	inet_frag_lru_add(nf, qp);
 	return qp;
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cc280a3f4f96..1975f52933c5 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/if_vlan.h>
 #include <linux/inet_lro.h>
+#include <net/checksum.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
@@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
 		*(p+2) = lro_desc->tcp_rcv_tsecr;
 	}
 
+	csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
 	iph->tot_len = htons(lro_desc->ip_tot_len);
 
-	iph->check = 0;
-	iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
-
 	tcph->check = 0;
 	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
 	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index a6445b843ef4..938520668b2f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -79,40 +79,11 @@ struct ipq {
 	struct inet_peer *peer;
 };
 
-/* RFC 3168 support :
- * We want to check ECN values of all fragments, do detect invalid combinations.
- * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
- */
-#define	IPFRAG_ECN_NOT_ECT	0x01 /* one frag had ECN_NOT_ECT */
-#define	IPFRAG_ECN_ECT_1	0x02 /* one frag had ECN_ECT_1 */
-#define	IPFRAG_ECN_ECT_0	0x04 /* one frag had ECN_ECT_0 */
-#define	IPFRAG_ECN_CE		0x08 /* one frag had ECN_CE */
-
 static inline u8 ip4_frag_ecn(u8 tos)
 {
 	return 1 << (tos & INET_ECN_MASK);
 }
 
-/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
- * Value : 0xff if frame should be dropped.
- *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
- */
-static const u8 ip4_frag_ecn_table[16] = {
-	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
-
-	/* invalid combinations : drop frame */
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-};
-
 static struct inet_frags ip4_frags;
 
 int ip_frag_nqueues(struct net *net)
@@ -551,7 +522,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 
 	ipq_kill(qp);
 
-	ecn = ip4_frag_ecn_table[qp->ecn];
+	ecn = ip_frag_ecn_table[qp->ecn];
 	if (unlikely(ecn == 0xff)) {
 		err = -EINVAL;
 		goto out_fail;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 91d66dbde9c0..ad662e906f7e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -37,7 +37,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/protocol.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
 #include <net/arp.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
@@ -108,15 +108,6 @@
    fatal route to network, even if it were you who configured
    fatal static route: you are innocent. :-)
 
-
-
-   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
-   practically identical code. It would be good to glue them
-   together, but it is not very evident, how to make them modular.
-   sit is integral part of IPv6, ipip and gre are naturally modular.
-   We could extract common parts (hash table, ioctl etc)
-   to a separate module (ip_tunnel.c).
-
    Alexey Kuznetsov.
  */
 
@@ -126,400 +117,135 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126 117
127static struct rtnl_link_ops ipgre_link_ops __read_mostly; 118static struct rtnl_link_ops ipgre_link_ops __read_mostly;
128static int ipgre_tunnel_init(struct net_device *dev); 119static int ipgre_tunnel_init(struct net_device *dev);
129static void ipgre_tunnel_setup(struct net_device *dev);
130static int ipgre_tunnel_bind_dev(struct net_device *dev);
131
132/* Fallback tunnel: no source, no destination, no key, no options */
133
134#define HASH_SIZE 16
135 120
136static int ipgre_net_id __read_mostly; 121static int ipgre_net_id __read_mostly;
137struct ipgre_net { 122static int gre_tap_net_id __read_mostly;
138 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
139
140 struct net_device *fb_tunnel_dev;
141};
142
143/* Tunnel hash table */
144
145/*
146 4 hash tables:
147
148 3: (remote,local)
149 2: (remote,*)
150 1: (*,local)
151 0: (*,*)
152 123
153 We require exact key match i.e. if a key is present in packet 124static __sum16 check_checksum(struct sk_buff *skb)
154 it will match only tunnel with the same key; if it is not present, 125{
155 it will match only keyless tunnel. 126 __sum16 csum = 0;
156
157 All keysless packets, if not matched configured keyless tunnels
158 will match fallback tunnel.
159 */
160 127
161#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 128 switch (skb->ip_summed) {
129 case CHECKSUM_COMPLETE:
130 csum = csum_fold(skb->csum);
162 131
163#define tunnels_r_l tunnels[3] 132 if (!csum)
164#define tunnels_r tunnels[2] 133 break;
165#define tunnels_l tunnels[1] 134 /* Fall through. */
166#define tunnels_wc tunnels[0]
167 135
168static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev, 136 case CHECKSUM_NONE:
169 struct rtnl_link_stats64 *tot) 137 skb->csum = 0;
170{ 138 csum = __skb_checksum_complete(skb);
171 int i; 139 skb->ip_summed = CHECKSUM_COMPLETE;
172 140 break;
173 for_each_possible_cpu(i) {
174 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
175 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
176 unsigned int start;
177
178 do {
179 start = u64_stats_fetch_begin_bh(&tstats->syncp);
180 rx_packets = tstats->rx_packets;
181 tx_packets = tstats->tx_packets;
182 rx_bytes = tstats->rx_bytes;
183 tx_bytes = tstats->tx_bytes;
184 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
185
186 tot->rx_packets += rx_packets;
187 tot->tx_packets += tx_packets;
188 tot->rx_bytes += rx_bytes;
189 tot->tx_bytes += tx_bytes;
190 } 141 }
191 142
192 tot->multicast = dev->stats.multicast; 143 return csum;
193 tot->rx_crc_errors = dev->stats.rx_crc_errors;
194 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
195 tot->rx_length_errors = dev->stats.rx_length_errors;
196 tot->rx_frame_errors = dev->stats.rx_frame_errors;
197 tot->rx_errors = dev->stats.rx_errors;
198
199 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
200 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
201 tot->tx_dropped = dev->stats.tx_dropped;
202 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
203 tot->tx_errors = dev->stats.tx_errors;
204
205 return tot;
206} 144}
207 145
208/* Does key in tunnel parameters match packet */ 146static int ip_gre_calc_hlen(__be16 o_flags)
209static bool ipgre_key_match(const struct ip_tunnel_parm *p,
210 __be16 flags, __be32 key)
211{ 147{
212 if (p->i_flags & GRE_KEY) { 148 int addend = 4;
213 if (flags & GRE_KEY)
214 return key == p->i_key;
215 else
216 return false; /* key expected, none present */
217 } else
218 return !(flags & GRE_KEY);
219}
220 149
221/* Given src, dst and key, find appropriate for input tunnel. */ 150 if (o_flags&TUNNEL_CSUM)
151 addend += 4;
152 if (o_flags&TUNNEL_KEY)
153 addend += 4;
154 if (o_flags&TUNNEL_SEQ)
155 addend += 4;
156 return addend;
157}
222 158
223static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, 159static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
224 __be32 remote, __be32 local, 160 bool *csum_err, int *hdr_len)
225 __be16 flags, __be32 key,
226 __be16 gre_proto)
227{ 161{
228 struct net *net = dev_net(dev); 162 struct iphdr *iph = ip_hdr(skb);
229 int link = dev->ifindex; 163 struct gre_base_hdr *greh;
230 unsigned int h0 = HASH(remote); 164 __be32 *options;
231 unsigned int h1 = HASH(key);
232 struct ip_tunnel *t, *cand = NULL;
233 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
234 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
235 ARPHRD_ETHER : ARPHRD_IPGRE;
236 int score, cand_score = 4;
237
238 for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
239 if (local != t->parms.iph.saddr ||
240 remote != t->parms.iph.daddr ||
241 !(t->dev->flags & IFF_UP))
242 continue;
243
244 if (!ipgre_key_match(&t->parms, flags, key))
245 continue;
246
247 if (t->dev->type != ARPHRD_IPGRE &&
248 t->dev->type != dev_type)
249 continue;
250
251 score = 0;
252 if (t->parms.link != link)
253 score |= 1;
254 if (t->dev->type != dev_type)
255 score |= 2;
256 if (score == 0)
257 return t;
258
259 if (score < cand_score) {
260 cand = t;
261 cand_score = score;
262 }
263 }
264
265 for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
266 if (remote != t->parms.iph.daddr ||
267 !(t->dev->flags & IFF_UP))
268 continue;
269
270 if (!ipgre_key_match(&t->parms, flags, key))
271 continue;
272
273 if (t->dev->type != ARPHRD_IPGRE &&
274 t->dev->type != dev_type)
275 continue;
276
277 score = 0;
278 if (t->parms.link != link)
279 score |= 1;
280 if (t->dev->type != dev_type)
281 score |= 2;
282 if (score == 0)
283 return t;
284
285 if (score < cand_score) {
286 cand = t;
287 cand_score = score;
288 }
289 }
290 165
291 for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { 166 if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
292 if ((local != t->parms.iph.saddr && 167 return -EINVAL;
293 (local != t->parms.iph.daddr ||
294 !ipv4_is_multicast(local))) ||
295 !(t->dev->flags & IFF_UP))
296 continue;
297
298 if (!ipgre_key_match(&t->parms, flags, key))
299 continue;
300
301 if (t->dev->type != ARPHRD_IPGRE &&
302 t->dev->type != dev_type)
303 continue;
304
305 score = 0;
306 if (t->parms.link != link)
307 score |= 1;
308 if (t->dev->type != dev_type)
309 score |= 2;
310 if (score == 0)
311 return t;
312
313 if (score < cand_score) {
314 cand = t;
315 cand_score = score;
316 }
317 }
318 168
319 for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { 169 greh = (struct gre_base_hdr *)((u8 *)iph + (iph->ihl << 2));
320 if (t->parms.i_key != key || 170 if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
321 !(t->dev->flags & IFF_UP)) 171 return -EINVAL;
322 continue;
323
324 if (t->dev->type != ARPHRD_IPGRE &&
325 t->dev->type != dev_type)
326 continue;
327
328 score = 0;
329 if (t->parms.link != link)
330 score |= 1;
331 if (t->dev->type != dev_type)
332 score |= 2;
333 if (score == 0)
334 return t;
335
336 if (score < cand_score) {
337 cand = t;
338 cand_score = score;
339 }
340 }
341 172
342 if (cand != NULL) 173 tpi->flags = gre_flags_to_tnl_flags(greh->flags);
343 return cand; 174 *hdr_len = ip_gre_calc_hlen(tpi->flags);
344 175
345 dev = ign->fb_tunnel_dev; 176 if (!pskb_may_pull(skb, *hdr_len))
346 if (dev->flags & IFF_UP) 177 return -EINVAL;
347 return netdev_priv(dev);
348 178
349 return NULL; 179 tpi->proto = greh->protocol;
350}
351 180
352static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, 181 options = (__be32 *)(greh + 1);
353 struct ip_tunnel_parm *parms) 182 if (greh->flags & GRE_CSUM) {
354{ 183 if (check_checksum(skb)) {
355 __be32 remote = parms->iph.daddr; 184 *csum_err = true;
356 __be32 local = parms->iph.saddr; 185 return -EINVAL;
357 __be32 key = parms->i_key; 186 }
358 unsigned int h = HASH(key); 187 options++;
359 int prio = 0;
360
361 if (local)
362 prio |= 1;
363 if (remote && !ipv4_is_multicast(remote)) {
364 prio |= 2;
365 h ^= HASH(remote);
366 } 188 }
367 189
368 return &ign->tunnels[prio][h]; 190 if (greh->flags & GRE_KEY) {
369} 191 tpi->key = *options;
370 192 options++;
371static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, 193 } else
372 struct ip_tunnel *t) 194 tpi->key = 0;
373{
374 return __ipgre_bucket(ign, &t->parms);
375}
376
377static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
378{
379 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
380 195
381 rcu_assign_pointer(t->next, rtnl_dereference(*tp)); 196 if (unlikely(greh->flags & GRE_SEQ)) {
382 rcu_assign_pointer(*tp, t); 197 tpi->seq = *options;
383} 198 options++;
199 } else
200 tpi->seq = 0;
384 201
385static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 202 /* WCCP version 1 and 2 protocol decoding.
386{ 203 * - Change protocol to IP
387 struct ip_tunnel __rcu **tp; 204 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
388 struct ip_tunnel *iter; 205 */
389 206 if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
390 for (tp = ipgre_bucket(ign, t); 207 tpi->proto = htons(ETH_P_IP);
391 (iter = rtnl_dereference(*tp)) != NULL; 208 if ((*(u8 *)options & 0xF0) != 0x40) {
392 tp = &iter->next) { 209 *hdr_len += 4;
393 if (t == iter) { 210 if (!pskb_may_pull(skb, *hdr_len))
394 rcu_assign_pointer(*tp, t->next); 211 return -EINVAL;
395 break;
396 } 212 }
397 } 213 }
398}
399
400static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
401 struct ip_tunnel_parm *parms,
402 int type)
403{
404 __be32 remote = parms->iph.daddr;
405 __be32 local = parms->iph.saddr;
406 __be32 key = parms->i_key;
407 int link = parms->link;
408 struct ip_tunnel *t;
409 struct ip_tunnel __rcu **tp;
410 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
411
412 for (tp = __ipgre_bucket(ign, parms);
413 (t = rtnl_dereference(*tp)) != NULL;
414 tp = &t->next)
415 if (local == t->parms.iph.saddr &&
416 remote == t->parms.iph.daddr &&
417 key == t->parms.i_key &&
418 link == t->parms.link &&
419 type == t->dev->type)
420 break;
421
422 return t;
423}
424
425static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
426 struct ip_tunnel_parm *parms, int create)
427{
428 struct ip_tunnel *t, *nt;
429 struct net_device *dev;
430 char name[IFNAMSIZ];
431 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
432
433 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
434 if (t || !create)
435 return t;
436
437 if (parms->name[0])
438 strlcpy(name, parms->name, IFNAMSIZ);
439 else
440 strcpy(name, "gre%d");
441
442 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
443 if (!dev)
444 return NULL;
445
446 dev_net_set(dev, net);
447
448 nt = netdev_priv(dev);
449 nt->parms = *parms;
450 dev->rtnl_link_ops = &ipgre_link_ops;
451 214
452 dev->mtu = ipgre_tunnel_bind_dev(dev); 215 return 0;
453
454 if (register_netdevice(dev) < 0)
455 goto failed_free;
456
457 /* Can use a lockless transmit, unless we generate output sequences */
458 if (!(nt->parms.o_flags & GRE_SEQ))
459 dev->features |= NETIF_F_LLTX;
460
461 dev_hold(dev);
462 ipgre_tunnel_link(ign, nt);
463 return nt;
464
465failed_free:
466 free_netdev(dev);
467 return NULL;
468}
469
470static void ipgre_tunnel_uninit(struct net_device *dev)
471{
472 struct net *net = dev_net(dev);
473 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
474
475 ipgre_tunnel_unlink(ign, netdev_priv(dev));
476 dev_put(dev);
477} 216}
478 217
479
480static void ipgre_err(struct sk_buff *skb, u32 info) 218static void ipgre_err(struct sk_buff *skb, u32 info)
481{ 219{
482 220
483/* All the routers (except for Linux) return only 221 /* All the routers (except for Linux) return only
484 8 bytes of packet payload. It means, that precise relaying of 222 8 bytes of packet payload. It means, that precise relaying of
485 ICMP in the real Internet is absolutely infeasible. 223 ICMP in the real Internet is absolutely infeasible.
486 224
487 Moreover, Cisco "wise men" put GRE key to the third word 225 Moreover, Cisco "wise men" put GRE key to the third word
488 in GRE header. It makes impossible maintaining even soft state for keyed 226 in GRE header. It makes impossible maintaining even soft
489 GRE tunnels with enabled checksum. Tell them "thank you". 227 state for keyed GRE tunnels with enabled checksum. Tell
490 228 them "thank you".
491 Well, I wonder, rfc1812 was written by Cisco employee,
492 what the hell these idiots break standards established
493 by themselves???
494 */
495 229
230 Well, I wonder, rfc1812 was written by Cisco employee,
231 what the hell these idiots break standards established
232 by themselves???
233 */
234 struct net *net = dev_net(skb->dev);
235 struct ip_tunnel_net *itn;
496 const struct iphdr *iph = (const struct iphdr *)skb->data; 236 const struct iphdr *iph = (const struct iphdr *)skb->data;
497 __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
498 int grehlen = (iph->ihl<<2) + 4;
499 const int type = icmp_hdr(skb)->type; 237 const int type = icmp_hdr(skb)->type;
500 const int code = icmp_hdr(skb)->code; 238 const int code = icmp_hdr(skb)->code;
501 struct ip_tunnel *t; 239 struct ip_tunnel *t;
502 __be16 flags; 240 struct tnl_ptk_info tpi;
503 __be32 key = 0; 241 int hdr_len;
242 bool csum_err = false;
504 243
505 flags = p[0]; 244 if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
506 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 245 if (!csum_err) /* ignore csum errors. */
507 if (flags&(GRE_VERSION|GRE_ROUTING))
508 return; 246 return;
509 if (flags&GRE_KEY) {
510 grehlen += 4;
511 if (flags&GRE_CSUM)
512 grehlen += 4;
513 }
514 } 247 }
515 248
516 /* If only 8 bytes returned, keyed message will be dropped here */
517 if (skb_headlen(skb) < grehlen)
518 return;
519
520 if (flags & GRE_KEY)
521 key = *(((__be32 *)p) + (grehlen / 4) - 1);
522
523 switch (type) { 249 switch (type) {
524 default: 250 default:
525 case ICMP_PARAMETERPROB: 251 case ICMP_PARAMETERPROB:
@@ -548,8 +274,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 		break;
 	}
 
-	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
-				flags, key, p[1]);
+	if (tpi.proto == htons(ETH_P_TEB))
+		itn = net_generic(net, gre_tap_net_id);
+	else
+		itn = net_generic(net, ipgre_net_id);
+
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
+			     iph->daddr, iph->saddr, tpi.key);
 
 	if (t == NULL)
 		return;
@@ -578,158 +309,33 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
578 t->err_time = jiffies; 309 t->err_time = jiffies;
579} 310}
580 311
581static inline u8
582ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
583{
584 u8 inner = 0;
585 if (skb->protocol == htons(ETH_P_IP))
586 inner = old_iph->tos;
587 else if (skb->protocol == htons(ETH_P_IPV6))
588 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
589 return INET_ECN_encapsulate(tos, inner);
590}
591
592static int ipgre_rcv(struct sk_buff *skb) 312static int ipgre_rcv(struct sk_buff *skb)
593{ 313{
314 struct net *net = dev_net(skb->dev);
315 struct ip_tunnel_net *itn;
594 const struct iphdr *iph; 316 const struct iphdr *iph;
595 u8 *h;
596 __be16 flags;
597 __sum16 csum = 0;
598 __be32 key = 0;
599 u32 seqno = 0;
600 struct ip_tunnel *tunnel; 317 struct ip_tunnel *tunnel;
601 int offset = 4; 318 struct tnl_ptk_info tpi;
602 __be16 gre_proto; 319 int hdr_len;
603 int err; 320 bool csum_err = false;
604 321
605 if (!pskb_may_pull(skb, 16)) 322 if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
606 goto drop; 323 goto drop;
607 324
608 iph = ip_hdr(skb); 325 if (tpi.proto == htons(ETH_P_TEB))
609 h = skb->data; 326 itn = net_generic(net, gre_tap_net_id);
610 flags = *(__be16 *)h; 327 else
611 328 itn = net_generic(net, ipgre_net_id);
612 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
613 /* - Version must be 0.
614 - We do not support routing headers.
615 */
616 if (flags&(GRE_VERSION|GRE_ROUTING))
617 goto drop;
618
619 if (flags&GRE_CSUM) {
620 switch (skb->ip_summed) {
621 case CHECKSUM_COMPLETE:
622 csum = csum_fold(skb->csum);
623 if (!csum)
624 break;
625 /* fall through */
626 case CHECKSUM_NONE:
627 skb->csum = 0;
628 csum = __skb_checksum_complete(skb);
629 skb->ip_summed = CHECKSUM_COMPLETE;
630 }
631 offset += 4;
632 }
633 if (flags&GRE_KEY) {
634 key = *(__be32 *)(h + offset);
635 offset += 4;
636 }
637 if (flags&GRE_SEQ) {
638 seqno = ntohl(*(__be32 *)(h + offset));
639 offset += 4;
640 }
641 }
642 329
643 gre_proto = *(__be16 *)(h + 2); 330 iph = ip_hdr(skb);
331 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
332 iph->saddr, iph->daddr, tpi.key);
644 333
645 tunnel = ipgre_tunnel_lookup(skb->dev,
646 iph->saddr, iph->daddr, flags, key,
647 gre_proto);
648 if (tunnel) { 334 if (tunnel) {
649 struct pcpu_tstats *tstats; 335 ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
650
651 secpath_reset(skb);
652
653 skb->protocol = gre_proto;
654 /* WCCP version 1 and 2 protocol decoding.
655 * - Change protocol to IP
656 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
657 */
658 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
659 skb->protocol = htons(ETH_P_IP);
660 if ((*(h + offset) & 0xF0) != 0x40)
661 offset += 4;
662 }
663
664 skb->mac_header = skb->network_header;
665 __pskb_pull(skb, offset);
666 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
667 skb->pkt_type = PACKET_HOST;
668#ifdef CONFIG_NET_IPGRE_BROADCAST
669 if (ipv4_is_multicast(iph->daddr)) {
670 /* Looped back packet, drop it! */
671 if (rt_is_output_route(skb_rtable(skb)))
672 goto drop;
673 tunnel->dev->stats.multicast++;
674 skb->pkt_type = PACKET_BROADCAST;
675 }
676#endif
677
678 if (((flags&GRE_CSUM) && csum) ||
679 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
680 tunnel->dev->stats.rx_crc_errors++;
681 tunnel->dev->stats.rx_errors++;
682 goto drop;
683 }
684 if (tunnel->parms.i_flags&GRE_SEQ) {
685 if (!(flags&GRE_SEQ) ||
686 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
687 tunnel->dev->stats.rx_fifo_errors++;
688 tunnel->dev->stats.rx_errors++;
689 goto drop;
690 }
691 tunnel->i_seqno = seqno + 1;
692 }
693
694 /* Warning: All skb pointers will be invalidated! */
695 if (tunnel->dev->type == ARPHRD_ETHER) {
696 if (!pskb_may_pull(skb, ETH_HLEN)) {
697 tunnel->dev->stats.rx_length_errors++;
698 tunnel->dev->stats.rx_errors++;
699 goto drop;
700 }
701
702 iph = ip_hdr(skb);
703 skb->protocol = eth_type_trans(skb, tunnel->dev);
704 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
705 }
706
707 __skb_tunnel_rx(skb, tunnel->dev);
708
709 skb_reset_network_header(skb);
710 err = IP_ECN_decapsulate(iph, skb);
711 if (unlikely(err)) {
712 if (log_ecn_error)
713 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
714 &iph->saddr, iph->tos);
715 if (err > 1) {
716 ++tunnel->dev->stats.rx_frame_errors;
717 ++tunnel->dev->stats.rx_errors;
718 goto drop;
719 }
720 }
721
722 tstats = this_cpu_ptr(tunnel->dev->tstats);
723 u64_stats_update_begin(&tstats->syncp);
724 tstats->rx_packets++;
725 tstats->rx_bytes += skb->len;
726 u64_stats_update_end(&tstats->syncp);
727
728 gro_cells_receive(&tunnel->gro_cells, skb);
729 return 0; 336 return 0;
730 } 337 }
731 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 338 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
732
733drop: 339drop:
734 kfree_skb(skb); 340 kfree_skb(skb);
735 return 0; 341 return 0;
@@ -746,7 +352,7 @@ static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff
 		skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
 		return skb;
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL &&
-		   tunnel->parms.o_flags&GRE_CSUM) {
+		   tunnel->parms.o_flags&TUNNEL_CSUM) {
 		err = skb_checksum_help(skb);
 		if (unlikely(err))
 			goto error;
@@ -760,494 +366,157 @@ error:
760 return ERR_PTR(err); 366 return ERR_PTR(err);
761} 367}
762 368
763static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 369static struct sk_buff *gre_build_header(struct sk_buff *skb,
370 const struct tnl_ptk_info *tpi,
371 int hdr_len)
764{ 372{
765 struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats); 373 struct gre_base_hdr *greh;
766 struct ip_tunnel *tunnel = netdev_priv(dev);
767 const struct iphdr *old_iph;
768 const struct iphdr *tiph;
769 struct flowi4 fl4;
770 u8 tos;
771 __be16 df;
772 struct rtable *rt; /* Route to the other host */
773 struct net_device *tdev; /* Device to other host */
774 struct iphdr *iph; /* Our new IP header */
775 unsigned int max_headroom; /* The extra header space needed */
776 int gre_hlen;
777 __be32 dst;
778 int mtu;
779 u8 ttl;
780 int err;
781 int pkt_len;
782
783 skb = handle_offloads(tunnel, skb);
784 if (IS_ERR(skb)) {
785 dev->stats.tx_dropped++;
786 return NETDEV_TX_OK;
787 }
788 374
789 if (!skb->encapsulation) { 375 skb_push(skb, hdr_len);
790 skb_reset_inner_headers(skb);
791 skb->encapsulation = 1;
792 }
793 376
794 old_iph = ip_hdr(skb); 377 greh = (struct gre_base_hdr *)skb->data;
378 greh->flags = tnl_flags_to_gre_flags(tpi->flags);
379 greh->protocol = tpi->proto;
795 380
796 if (dev->type == ARPHRD_ETHER) 381 if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
797 IPCB(skb)->flags = 0; 382 __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
798 383
799 if (dev->header_ops && dev->type == ARPHRD_IPGRE) { 384 if (tpi->flags&TUNNEL_SEQ) {
800 gre_hlen = 0; 385 *ptr = tpi->seq;
801 tiph = (const struct iphdr *)skb->data; 386 ptr--;
802 } else {
803 gre_hlen = tunnel->hlen;
804 tiph = &tunnel->parms.iph;
805 }
806
807 if ((dst = tiph->daddr) == 0) {
808 /* NBMA tunnel */
809
810 if (skb_dst(skb) == NULL) {
811 dev->stats.tx_fifo_errors++;
812 goto tx_error;
813 } 387 }
814 388 if (tpi->flags&TUNNEL_KEY) {
815 if (skb->protocol == htons(ETH_P_IP)) { 389 *ptr = tpi->key;
816 rt = skb_rtable(skb); 390 ptr--;
817 dst = rt_nexthop(rt, old_iph->daddr);
818 } 391 }
819#if IS_ENABLED(CONFIG_IPV6) 392 if (tpi->flags&TUNNEL_CSUM &&
820 else if (skb->protocol == htons(ETH_P_IPV6)) { 393 !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
821 const struct in6_addr *addr6; 394 *(__sum16 *)ptr = 0;
822 struct neighbour *neigh; 395 *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
823 bool do_tx_error_icmp; 396 skb->len, 0));
824 int addr_type;
825
826 neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
827 if (neigh == NULL)
828 goto tx_error;
829
830 addr6 = (const struct in6_addr *)&neigh->primary_key;
831 addr_type = ipv6_addr_type(addr6);
832
833 if (addr_type == IPV6_ADDR_ANY) {
834 addr6 = &ipv6_hdr(skb)->daddr;
835 addr_type = ipv6_addr_type(addr6);
836 }
837
838 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
839 do_tx_error_icmp = true;
840 else {
841 do_tx_error_icmp = false;
842 dst = addr6->s6_addr32[3];
843 }
844 neigh_release(neigh);
845 if (do_tx_error_icmp)
846 goto tx_error_icmp;
847 } 397 }
848#endif
849 else
850 goto tx_error;
851 } 398 }
852 399
853 ttl = tiph->ttl; 400 return skb;
854 tos = tiph->tos; 401}
855 if (tos & 0x1) {
856 tos &= ~0x1;
857 if (skb->protocol == htons(ETH_P_IP))
858 tos = old_iph->tos;
859 else if (skb->protocol == htons(ETH_P_IPV6))
860 tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
861 }
862 402
863 rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, 403static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
864 tunnel->parms.o_key, RT_TOS(tos), 404 const struct iphdr *tnl_params,
865 tunnel->parms.link); 405 __be16 proto)
866 if (IS_ERR(rt)) { 406{
867 dev->stats.tx_carrier_errors++; 407 struct ip_tunnel *tunnel = netdev_priv(dev);
868 goto tx_error; 408 struct tnl_ptk_info tpi;
869 }
870 tdev = rt->dst.dev;
871 409
872 if (tdev == dev) { 410 if (likely(!skb->encapsulation)) {
873 ip_rt_put(rt); 411 skb_reset_inner_headers(skb);
874 dev->stats.collisions++; 412 skb->encapsulation = 1;
875 goto tx_error;
876 } 413 }
877 414
878 df = tiph->frag_off; 415 tpi.flags = tunnel->parms.o_flags;
879 if (df) 416 tpi.proto = proto;
880 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; 417 tpi.key = tunnel->parms.o_key;
881 else 418 if (tunnel->parms.o_flags & TUNNEL_SEQ)
882 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 419 tunnel->o_seqno++;
883 420 tpi.seq = htonl(tunnel->o_seqno);
884 if (skb_dst(skb))
885 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
886
887 if (skb->protocol == htons(ETH_P_IP)) {
888 df |= (old_iph->frag_off&htons(IP_DF));
889 421
890 if (!skb_is_gso(skb) && 422 /* Push GRE header. */
891 (old_iph->frag_off&htons(IP_DF)) && 423 skb = gre_build_header(skb, &tpi, tunnel->hlen);
892 mtu < ntohs(old_iph->tot_len)) { 424 if (unlikely(!skb)) {
893 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 425 dev->stats.tx_dropped++;
894 ip_rt_put(rt); 426 return;
895 goto tx_error;
896 }
897 } 427 }
898#if IS_ENABLED(CONFIG_IPV6)
899 else if (skb->protocol == htons(ETH_P_IPV6)) {
900 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
901
902 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
903 if ((tunnel->parms.iph.daddr &&
904 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
905 rt6->rt6i_dst.plen == 128) {
906 rt6->rt6i_flags |= RTF_MODIFIED;
907 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
908 }
909 }
910 428
911 if (!skb_is_gso(skb) && 429 ip_tunnel_xmit(skb, dev, tnl_params);
912 mtu >= IPV6_MIN_MTU && 430}
913 mtu < skb->len - tunnel->hlen + gre_hlen) {
914 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
915 ip_rt_put(rt);
916 goto tx_error;
917 }
918 }
919#endif
920 431
921 if (tunnel->err_count > 0) { 432static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
922 if (time_before(jiffies, 433 struct net_device *dev)
923 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { 434{
924 tunnel->err_count--; 435 struct ip_tunnel *tunnel = netdev_priv(dev);
436 const struct iphdr *tnl_params;
925 437
926 dst_link_failure(skb); 438 skb = handle_offloads(tunnel, skb);
927 } else 439 if (IS_ERR(skb))
928 tunnel->err_count = 0; 440 goto out;
929 }
930 441
931 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; 442 if (dev->header_ops) {
932 443 /* Need space for new headers */
933 if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| 444 if (skb_cow_head(skb, dev->needed_headroom -
934 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { 445 (tunnel->hlen + sizeof(struct iphdr))));
935 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 446 goto free_skb;
936 if (max_headroom > dev->needed_headroom)
937 dev->needed_headroom = max_headroom;
938 if (!new_skb) {
939 ip_rt_put(rt);
940 dev->stats.tx_dropped++;
941 dev_kfree_skb(skb);
942 return NETDEV_TX_OK;
943 }
944 if (skb->sk)
945 skb_set_owner_w(new_skb, skb->sk);
946 dev_kfree_skb(skb);
947 skb = new_skb;
948 old_iph = ip_hdr(skb);
949 /* Warning : tiph value might point to freed memory */
950 }
951 447
952 skb_push(skb, gre_hlen); 448 tnl_params = (const struct iphdr *)skb->data;
953 skb_reset_network_header(skb);
954 skb_set_transport_header(skb, sizeof(*iph));
955 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
956 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
957 IPSKB_REROUTED);
958 skb_dst_drop(skb);
959 skb_dst_set(skb, &rt->dst);
960
961 /*
962 * Push down and install the IPIP header.
963 */
964 449
965 iph = ip_hdr(skb); 450 /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
966 iph->version = 4; 451 * to gre header.
967 iph->ihl = sizeof(struct iphdr) >> 2; 452 */
968 iph->frag_off = df; 453 skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
969 iph->protocol = IPPROTO_GRE; 454 } else {
970 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); 455 if (skb_cow_head(skb, dev->needed_headroom))
971 iph->daddr = fl4.daddr; 456 goto free_skb;
972 iph->saddr = fl4.saddr;
973 iph->ttl = ttl;
974
975 tunnel_ip_select_ident(skb, old_iph, &rt->dst);
976
977 if (ttl == 0) {
978 if (skb->protocol == htons(ETH_P_IP))
979 iph->ttl = old_iph->ttl;
980#if IS_ENABLED(CONFIG_IPV6)
981 else if (skb->protocol == htons(ETH_P_IPV6))
982 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
983#endif
984 else
985 iph->ttl = ip4_dst_hoplimit(&rt->dst);
986 }
987
988 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
989 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
990 htons(ETH_P_TEB) : skb->protocol;
991
992 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
993 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
994 457
995 if (tunnel->parms.o_flags&GRE_SEQ) { 458 tnl_params = &tunnel->parms.iph;
996 ++tunnel->o_seqno;
997 *ptr = htonl(tunnel->o_seqno);
998 ptr--;
999 }
1000 if (tunnel->parms.o_flags&GRE_KEY) {
1001 *ptr = tunnel->parms.o_key;
1002 ptr--;
1003 }
1004 /* Skip GRE checksum if skb is getting offloaded. */
1005 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE) &&
1006 (tunnel->parms.o_flags&GRE_CSUM)) {
1007 int offset = skb_transport_offset(skb);
1008
1009 if (skb_has_shared_frag(skb)) {
1010 err = __skb_linearize(skb);
1011 if (err)
1012 goto tx_error;
1013 }
1014
1015 *ptr = 0;
1016 *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset,
1017 skb->len - offset,
1018 0));
1019 }
1020 } 459 }
1021 460
1022 nf_reset(skb); 461 __gre_xmit(skb, dev, tnl_params, skb->protocol);
1023 462
1024 pkt_len = skb->len - skb_transport_offset(skb);
1025 err = ip_local_out(skb);
1026 if (likely(net_xmit_eval(err) == 0)) {
1027 u64_stats_update_begin(&tstats->syncp);
1028 tstats->tx_bytes += pkt_len;
1029 tstats->tx_packets++;
1030 u64_stats_update_end(&tstats->syncp);
1031 } else {
1032 dev->stats.tx_errors++;
1033 dev->stats.tx_aborted_errors++;
1034 }
1035 return NETDEV_TX_OK; 463 return NETDEV_TX_OK;
1036 464
1037#if IS_ENABLED(CONFIG_IPV6) 465free_skb:
1038tx_error_icmp:
1039 dst_link_failure(skb);
1040#endif
1041tx_error:
1042 dev->stats.tx_errors++;
1043 dev_kfree_skb(skb); 466 dev_kfree_skb(skb);
467out:
468 dev->stats.tx_dropped++;
1044 return NETDEV_TX_OK; 469 return NETDEV_TX_OK;
1045} 470}
1046 471
1047static int ipgre_tunnel_bind_dev(struct net_device *dev) 472static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
473 struct net_device *dev)
1048{ 474{
1049 struct net_device *tdev = NULL; 475 struct ip_tunnel *tunnel = netdev_priv(dev);
1050 struct ip_tunnel *tunnel;
1051 const struct iphdr *iph;
1052 int hlen = LL_MAX_HEADER;
1053 int mtu = ETH_DATA_LEN;
1054 int addend = sizeof(struct iphdr) + 4;
1055
1056 tunnel = netdev_priv(dev);
1057 iph = &tunnel->parms.iph;
1058
1059 /* Guess output device to choose reasonable mtu and needed_headroom */
1060
1061 if (iph->daddr) {
1062 struct flowi4 fl4;
1063 struct rtable *rt;
1064
1065 rt = ip_route_output_gre(dev_net(dev), &fl4,
1066 iph->daddr, iph->saddr,
1067 tunnel->parms.o_key,
1068 RT_TOS(iph->tos),
1069 tunnel->parms.link);
1070 if (!IS_ERR(rt)) {
1071 tdev = rt->dst.dev;
1072 ip_rt_put(rt);
1073 }
1074
1075 if (dev->type != ARPHRD_ETHER)
1076 dev->flags |= IFF_POINTOPOINT;
1077 }
1078 476
1079 if (!tdev && tunnel->parms.link) 477 skb = handle_offloads(tunnel, skb);
1080 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); 478 if (IS_ERR(skb))
479 goto out;
1081 480
1082 if (tdev) { 481 if (skb_cow_head(skb, dev->needed_headroom))
1083 hlen = tdev->hard_header_len + tdev->needed_headroom; 482 goto free_skb;
1084 mtu = tdev->mtu;
1085 }
1086 dev->iflink = tunnel->parms.link;
1087
1088 /* Precalculate GRE options length */
1089 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1090 if (tunnel->parms.o_flags&GRE_CSUM)
1091 addend += 4;
1092 if (tunnel->parms.o_flags&GRE_KEY)
1093 addend += 4;
1094 if (tunnel->parms.o_flags&GRE_SEQ)
1095 addend += 4;
1096 }
1097 dev->needed_headroom = addend + hlen;
1098 mtu -= dev->hard_header_len + addend;
1099 483
1100 if (mtu < 68) 484 __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
1101 mtu = 68;
1102 485
1103 tunnel->hlen = addend; 486 return NETDEV_TX_OK;
1104 /* TCP offload with GRE SEQ is not supported. */
1105 if (!(tunnel->parms.o_flags & GRE_SEQ)) {
1106 dev->features |= NETIF_F_GSO_SOFTWARE;
1107 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1108 }
1109 487
1110 return mtu; 488free_skb:
489 dev_kfree_skb(skb);
490out:
491 dev->stats.tx_dropped++;
492 return NETDEV_TX_OK;
1111} 493}
1112 494
1113static int 495static int ipgre_tunnel_ioctl(struct net_device *dev,
1114ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) 496 struct ifreq *ifr, int cmd)
1115{ 497{
1116 int err = 0; 498 int err = 0;
1117 struct ip_tunnel_parm p; 499 struct ip_tunnel_parm p;
1118 struct ip_tunnel *t;
1119 struct net *net = dev_net(dev);
1120 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1121
1122 switch (cmd) {
1123 case SIOCGETTUNNEL:
1124 t = NULL;
1125 if (dev == ign->fb_tunnel_dev) {
1126 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1127 err = -EFAULT;
1128 break;
1129 }
1130 t = ipgre_tunnel_locate(net, &p, 0);
1131 }
1132 if (t == NULL)
1133 t = netdev_priv(dev);
1134 memcpy(&p, &t->parms, sizeof(p));
1135 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1136 err = -EFAULT;
1137 break;
1138
1139 case SIOCADDTUNNEL:
1140 case SIOCCHGTUNNEL:
1141 err = -EPERM;
1142 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1143 goto done;
1144
1145 err = -EFAULT;
1146 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1147 goto done;
1148
1149 err = -EINVAL;
1150 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1151 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1152 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1153 goto done;
1154 if (p.iph.ttl)
1155 p.iph.frag_off |= htons(IP_DF);
1156
1157 if (!(p.i_flags&GRE_KEY))
1158 p.i_key = 0;
1159 if (!(p.o_flags&GRE_KEY))
1160 p.o_key = 0;
1161
1162 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1163
1164 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1165 if (t != NULL) {
1166 if (t->dev != dev) {
1167 err = -EEXIST;
1168 break;
1169 }
1170 } else {
1171 unsigned int nflags = 0;
1172
1173 t = netdev_priv(dev);
1174
1175 if (ipv4_is_multicast(p.iph.daddr))
1176 nflags = IFF_BROADCAST;
1177 else if (p.iph.daddr)
1178 nflags = IFF_POINTOPOINT;
1179
1180 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1181 err = -EINVAL;
1182 break;
1183 }
1184 ipgre_tunnel_unlink(ign, t);
1185 synchronize_net();
1186 t->parms.iph.saddr = p.iph.saddr;
1187 t->parms.iph.daddr = p.iph.daddr;
1188 t->parms.i_key = p.i_key;
1189 t->parms.o_key = p.o_key;
1190 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1191 memcpy(dev->broadcast, &p.iph.daddr, 4);
1192 ipgre_tunnel_link(ign, t);
1193 netdev_state_change(dev);
1194 }
1195 }
1196
1197 if (t) {
1198 err = 0;
1199 if (cmd == SIOCCHGTUNNEL) {
1200 t->parms.iph.ttl = p.iph.ttl;
1201 t->parms.iph.tos = p.iph.tos;
1202 t->parms.iph.frag_off = p.iph.frag_off;
1203 if (t->parms.link != p.link) {
1204 t->parms.link = p.link;
1205 dev->mtu = ipgre_tunnel_bind_dev(dev);
1206 netdev_state_change(dev);
1207 }
1208 }
1209 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1210 err = -EFAULT;
1211 } else
1212 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1213 break;
1214
1215 case SIOCDELTUNNEL:
1216 err = -EPERM;
1217 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1218 goto done;
1219
1220 if (dev == ign->fb_tunnel_dev) {
1221 err = -EFAULT;
1222 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1223 goto done;
1224 err = -ENOENT;
1225 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1226 goto done;
1227 err = -EPERM;
1228 if (t == netdev_priv(ign->fb_tunnel_dev))
1229 goto done;
1230 dev = t->dev;
1231 }
1232 unregister_netdevice(dev);
1233 err = 0;
1234 break;
1235 500
1236 default: 501 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1237 err = -EINVAL; 502 return -EFAULT;
503 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
504 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
505 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) {
506 return -EINVAL;
1238 } 507 }
508 p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
509 p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
1239 510
1240done: 511 err = ip_tunnel_ioctl(dev, &p, cmd);
1241 return err; 512 if (err)
1242} 513 return err;
1243 514
1244static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) 515 p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
1245{ 516 p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
1246 struct ip_tunnel *tunnel = netdev_priv(dev); 517
1247 if (new_mtu < 68 || 518 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1248 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) 519 return -EFAULT;
1249 return -EINVAL;
1250 dev->mtu = new_mtu;
1251 return 0; 520 return 0;
1252} 521}
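
The rewritten ioctl handler keeps the legacy (pre-netlink) configuration interface working: GRE wire flags are translated to the internal TUNNEL_* representation, the shared ip_tunnel_ioctl() does the actual work, and the flags are translated back before being copied out. A minimal user-space sketch of exercising that path is below; the device name "gre0" is the fallback device, error handling is trimmed, and this is an illustration rather than part of the patch.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <arpa/inet.h>
    #include <linux/ip.h>
    #include <linux/if_tunnel.h>

    int main(void)
    {
        struct ip_tunnel_parm p;
        struct ifreq ifr;
        struct in_addr a;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
            return 1;

        memset(&p, 0, sizeof(p));
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "gre0", IFNAMSIZ - 1);  /* GRE fallback device */
        ifr.ifr_data = (void *)&p;

        /* SIOCGETTUNNEL ends up in ipgre_tunnel_ioctl() -> ip_tunnel_ioctl() */
        if (ioctl(fd, SIOCGETTUNNEL, &ifr) < 0) {
            perror("SIOCGETTUNNEL");
            close(fd);
            return 1;
        }

        a.s_addr = p.iph.saddr;
        printf("local %s", inet_ntoa(a));
        a.s_addr = p.iph.daddr;
        printf(" remote %s ttl %u\n", inet_ntoa(a), p.iph.ttl);
        close(fd);
        return 0;
    }
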
1253 522
@@ -1277,25 +546,23 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1277 ... 546 ...
1278 ftp fec0:6666:6666::193.233.7.65 547 ftp fec0:6666:6666::193.233.7.65
1279 ... 548 ...
1280
1281 */ 549 */
1282
1283static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 550static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1284 unsigned short type, 551 unsigned short type,
1285 const void *daddr, const void *saddr, unsigned int len) 552 const void *daddr, const void *saddr, unsigned int len)
1286{ 553{
1287 struct ip_tunnel *t = netdev_priv(dev); 554 struct ip_tunnel *t = netdev_priv(dev);
1288 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 555 struct iphdr *iph;
1289 __be16 *p = (__be16 *)(iph+1); 556 struct gre_base_hdr *greh;
1290 557
1291 memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); 558 iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
1292 p[0] = t->parms.o_flags; 559 greh = (struct gre_base_hdr *)(iph+1);
1293 p[1] = htons(type); 560 greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
561 greh->protocol = htons(type);
1294 562
1295 /* 563 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1296 * Set the source hardware address.
1297 */
1298 564
565 /* Set the source hardware address. */
1299 if (saddr) 566 if (saddr)
1300 memcpy(&iph->saddr, saddr, 4); 567 memcpy(&iph->saddr, saddr, 4);
1301 if (daddr) 568 if (daddr)
@@ -1303,7 +570,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1303 if (iph->daddr) 570 if (iph->daddr)
1304 return t->hlen; 571 return t->hlen;
1305 572
1306 return -t->hlen; 573 return -(t->hlen + sizeof(*iph));
1307} 574}
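
ipgre_header() now pushes the outer IPv4 header plus a struct gre_base_hdr, so t->hlen covers only the GRE portion. The per-option sizes behind t->hlen follow the usual GRE layout; a small stand-alone sketch of that arithmetic follows (the option bit values are illustrative stand-ins, not the kernel constants):

    #include <stdio.h>

    /* illustrative stand-ins for the TUNNEL_* option bits */
    #define OPT_CSUM 0x1
    #define OPT_KEY  0x2
    #define OPT_SEQ  0x4

    /* GRE header length: 4-byte base header plus 4 bytes per enabled option */
    static int gre_hdr_len(unsigned int opts)
    {
        int len = 4;            /* flags + protocol */

        if (opts & OPT_CSUM)
            len += 4;           /* checksum + reserved */
        if (opts & OPT_KEY)
            len += 4;           /* key */
        if (opts & OPT_SEQ)
            len += 4;           /* sequence number */
        return len;
    }

    int main(void)
    {
        /* a keyed, sequenced tunnel: 4 + 4 + 4 = 12 bytes of GRE header */
        printf("%d\n", gre_hdr_len(OPT_KEY | OPT_SEQ));
        return 0;
    }
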
1308 575
1309static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) 576static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
@@ -1357,31 +624,21 @@ static int ipgre_close(struct net_device *dev)
1357 } 624 }
1358 return 0; 625 return 0;
1359} 626}
1360
1361#endif 627#endif
1362 628
1363static const struct net_device_ops ipgre_netdev_ops = { 629static const struct net_device_ops ipgre_netdev_ops = {
1364 .ndo_init = ipgre_tunnel_init, 630 .ndo_init = ipgre_tunnel_init,
1365 .ndo_uninit = ipgre_tunnel_uninit, 631 .ndo_uninit = ip_tunnel_uninit,
1366#ifdef CONFIG_NET_IPGRE_BROADCAST 632#ifdef CONFIG_NET_IPGRE_BROADCAST
1367 .ndo_open = ipgre_open, 633 .ndo_open = ipgre_open,
1368 .ndo_stop = ipgre_close, 634 .ndo_stop = ipgre_close,
1369#endif 635#endif
1370 .ndo_start_xmit = ipgre_tunnel_xmit, 636 .ndo_start_xmit = ipgre_xmit,
1371 .ndo_do_ioctl = ipgre_tunnel_ioctl, 637 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1372 .ndo_change_mtu = ipgre_tunnel_change_mtu, 638 .ndo_change_mtu = ip_tunnel_change_mtu,
1373 .ndo_get_stats64 = ipgre_get_stats64, 639 .ndo_get_stats64 = ip_tunnel_get_stats64,
1374}; 640};
1375 641
1376static void ipgre_dev_free(struct net_device *dev)
1377{
1378 struct ip_tunnel *tunnel = netdev_priv(dev);
1379
1380 gro_cells_destroy(&tunnel->gro_cells);
1381 free_percpu(dev->tstats);
1382 free_netdev(dev);
1383}
1384
1385#define GRE_FEATURES (NETIF_F_SG | \ 642#define GRE_FEATURES (NETIF_F_SG | \
1386 NETIF_F_FRAGLIST | \ 643 NETIF_F_FRAGLIST | \
1387 NETIF_F_HIGHDMA | \ 644 NETIF_F_HIGHDMA | \
@@ -1390,35 +647,49 @@ static void ipgre_dev_free(struct net_device *dev)
1390static void ipgre_tunnel_setup(struct net_device *dev) 647static void ipgre_tunnel_setup(struct net_device *dev)
1391{ 648{
1392 dev->netdev_ops = &ipgre_netdev_ops; 649 dev->netdev_ops = &ipgre_netdev_ops;
1393 dev->destructor = ipgre_dev_free; 650 ip_tunnel_setup(dev, ipgre_net_id);
651}
1394 652
1395 dev->type = ARPHRD_IPGRE; 653static void __gre_tunnel_init(struct net_device *dev)
1396 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 654{
655 struct ip_tunnel *tunnel;
656
657 tunnel = netdev_priv(dev);
658 tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
659 tunnel->parms.iph.protocol = IPPROTO_GRE;
660
661 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1397 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; 662 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1398 dev->flags = IFF_NOARP;
1399 dev->iflink = 0; 663 dev->iflink = 0;
1400 dev->addr_len = 4;
1401 dev->features |= NETIF_F_NETNS_LOCAL;
1402 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1403 664
1404 dev->features |= GRE_FEATURES; 665 dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
1405 dev->hw_features |= GRE_FEATURES; 666 dev->hw_features |= GRE_FEATURES;
667
668 if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
669 /* TCP offload with GRE SEQ is not supported. */
670 dev->features |= NETIF_F_GSO_SOFTWARE;
671 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
672 /* Can use a lockless transmit, unless we generate
673 * output sequences
674 */
675 dev->features |= NETIF_F_LLTX;
676 }
1406} 677}
1407 678
1408static int ipgre_tunnel_init(struct net_device *dev) 679static int ipgre_tunnel_init(struct net_device *dev)
1409{ 680{
1410 struct ip_tunnel *tunnel; 681 struct ip_tunnel *tunnel = netdev_priv(dev);
1411 struct iphdr *iph; 682 struct iphdr *iph = &tunnel->parms.iph;
1412 int err;
1413 683
1414 tunnel = netdev_priv(dev); 684 __gre_tunnel_init(dev);
1415 iph = &tunnel->parms.iph;
1416 685
1417 tunnel->dev = dev; 686 memcpy(dev->dev_addr, &iph->saddr, 4);
1418 strcpy(tunnel->parms.name, dev->name); 687 memcpy(dev->broadcast, &iph->daddr, 4);
1419 688
1420 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); 689 dev->type = ARPHRD_IPGRE;
1421 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 690 dev->flags = IFF_NOARP;
691 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
692 dev->addr_len = 4;
1422 693
1423 if (iph->daddr) { 694 if (iph->daddr) {
1424#ifdef CONFIG_NET_IPGRE_BROADCAST 695#ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -1432,106 +703,30 @@ static int ipgre_tunnel_init(struct net_device *dev)
1432 } else 703 } else
1433 dev->header_ops = &ipgre_header_ops; 704 dev->header_ops = &ipgre_header_ops;
1434 705
1435 dev->tstats = alloc_percpu(struct pcpu_tstats); 706 return ip_tunnel_init(dev);
1436 if (!dev->tstats)
1437 return -ENOMEM;
1438
1439 err = gro_cells_init(&tunnel->gro_cells, dev);
1440 if (err) {
1441 free_percpu(dev->tstats);
1442 return err;
1443 }
1444
1445 return 0;
1446} 707}
1447 708
1448static void ipgre_fb_tunnel_init(struct net_device *dev)
1449{
1450 struct ip_tunnel *tunnel = netdev_priv(dev);
1451 struct iphdr *iph = &tunnel->parms.iph;
1452
1453 tunnel->dev = dev;
1454 strcpy(tunnel->parms.name, dev->name);
1455
1456 iph->version = 4;
1457 iph->protocol = IPPROTO_GRE;
1458 iph->ihl = 5;
1459 tunnel->hlen = sizeof(struct iphdr) + 4;
1460
1461 dev_hold(dev);
1462}
1463
1464
1465static const struct gre_protocol ipgre_protocol = { 709static const struct gre_protocol ipgre_protocol = {
1466 .handler = ipgre_rcv, 710 .handler = ipgre_rcv,
1467 .err_handler = ipgre_err, 711 .err_handler = ipgre_err,
1468}; 712};
1469 713
1470static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1471{
1472 int prio;
1473
1474 for (prio = 0; prio < 4; prio++) {
1475 int h;
1476 for (h = 0; h < HASH_SIZE; h++) {
1477 struct ip_tunnel *t;
1478
1479 t = rtnl_dereference(ign->tunnels[prio][h]);
1480
1481 while (t != NULL) {
1482 unregister_netdevice_queue(t->dev, head);
1483 t = rtnl_dereference(t->next);
1484 }
1485 }
1486 }
1487}
1488
1489static int __net_init ipgre_init_net(struct net *net) 714static int __net_init ipgre_init_net(struct net *net)
1490{ 715{
1491 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 716 return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
1492 int err;
1493
1494 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1495 ipgre_tunnel_setup);
1496 if (!ign->fb_tunnel_dev) {
1497 err = -ENOMEM;
1498 goto err_alloc_dev;
1499 }
1500 dev_net_set(ign->fb_tunnel_dev, net);
1501
1502 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1503 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1504
1505 if ((err = register_netdev(ign->fb_tunnel_dev)))
1506 goto err_reg_dev;
1507
1508 rcu_assign_pointer(ign->tunnels_wc[0],
1509 netdev_priv(ign->fb_tunnel_dev));
1510 return 0;
1511
1512err_reg_dev:
1513 ipgre_dev_free(ign->fb_tunnel_dev);
1514err_alloc_dev:
1515 return err;
1516} 717}
1517 718
1518static void __net_exit ipgre_exit_net(struct net *net) 719static void __net_exit ipgre_exit_net(struct net *net)
1519{ 720{
1520 struct ipgre_net *ign; 721 struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
1521 LIST_HEAD(list); 722 ip_tunnel_delete_net(itn);
1522
1523 ign = net_generic(net, ipgre_net_id);
1524 rtnl_lock();
1525 ipgre_destroy_tunnels(ign, &list);
1526 unregister_netdevice_many(&list);
1527 rtnl_unlock();
1528} 723}
1529 724
1530static struct pernet_operations ipgre_net_ops = { 725static struct pernet_operations ipgre_net_ops = {
1531 .init = ipgre_init_net, 726 .init = ipgre_init_net,
1532 .exit = ipgre_exit_net, 727 .exit = ipgre_exit_net,
1533 .id = &ipgre_net_id, 728 .id = &ipgre_net_id,
1534 .size = sizeof(struct ipgre_net), 729 .size = sizeof(struct ip_tunnel_net),
1535}; 730};
1536 731
1537static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) 732static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1576,8 +771,8 @@ out:
1576 return ipgre_tunnel_validate(tb, data); 771 return ipgre_tunnel_validate(tb, data);
1577} 772}
1578 773
1579static void ipgre_netlink_parms(struct nlattr *data[], 774static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
1580 struct ip_tunnel_parm *parms) 775 struct ip_tunnel_parm *parms)
1581{ 776{
1582 memset(parms, 0, sizeof(*parms)); 777 memset(parms, 0, sizeof(*parms));
1583 778
@@ -1590,10 +785,10 @@ static void ipgre_netlink_parms(struct nlattr *data[],
1590 parms->link = nla_get_u32(data[IFLA_GRE_LINK]); 785 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1591 786
1592 if (data[IFLA_GRE_IFLAGS]) 787 if (data[IFLA_GRE_IFLAGS])
1593 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); 788 parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
1594 789
1595 if (data[IFLA_GRE_OFLAGS]) 790 if (data[IFLA_GRE_OFLAGS])
1596 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); 791 parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
1597 792
1598 if (data[IFLA_GRE_IKEY]) 793 if (data[IFLA_GRE_IKEY])
1599 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); 794 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1617,148 +812,46 @@ static void ipgre_netlink_parms(struct nlattr *data[],
1617 parms->iph.frag_off = htons(IP_DF); 812 parms->iph.frag_off = htons(IP_DF);
1618} 813}
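
The netlink-supplied IFLA_GRE_IFLAGS/IFLA_GRE_OFLAGS values are converted with gre_flags_to_tnl_flags() so the shared ip_tunnel code only ever sees TUNNEL_* bits. A hedged sketch of the kind of mapping involved is below; the bit values are assumptions for illustration only, and the real helper lives in the new ip_tunnels headers.

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative values only; the kernel keeps these in network byte order */
    #define GRE_CSUM_BIT 0x8000
    #define GRE_KEY_BIT  0x2000
    #define GRE_SEQ_BIT  0x1000
    #define TNL_CSUM     0x01
    #define TNL_KEY      0x04
    #define TNL_SEQ      0x08

    /* map GRE wire-format flag bits onto internal tunnel flag bits */
    static uint16_t gre_to_tnl(uint16_t gre)
    {
        uint16_t tnl = 0;

        if (gre & GRE_CSUM_BIT)
            tnl |= TNL_CSUM;
        if (gre & GRE_KEY_BIT)
            tnl |= TNL_KEY;
        if (gre & GRE_SEQ_BIT)
            tnl |= TNL_SEQ;
        return tnl;
    }

    int main(void)
    {
        printf("0x%02x\n", gre_to_tnl(GRE_KEY_BIT | GRE_SEQ_BIT)); /* 0x0c */
        return 0;
    }
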
1619 814
1620static int ipgre_tap_init(struct net_device *dev) 815static int gre_tap_init(struct net_device *dev)
1621{ 816{
1622 struct ip_tunnel *tunnel; 817 __gre_tunnel_init(dev);
1623
1624 tunnel = netdev_priv(dev);
1625
1626 tunnel->dev = dev;
1627 strcpy(tunnel->parms.name, dev->name);
1628
1629 ipgre_tunnel_bind_dev(dev);
1630 818
1631 dev->tstats = alloc_percpu(struct pcpu_tstats); 819 return ip_tunnel_init(dev);
1632 if (!dev->tstats)
1633 return -ENOMEM;
1634
1635 return 0;
1636} 820}
1637 821
1638static const struct net_device_ops ipgre_tap_netdev_ops = { 822static const struct net_device_ops gre_tap_netdev_ops = {
1639 .ndo_init = ipgre_tap_init, 823 .ndo_init = gre_tap_init,
1640 .ndo_uninit = ipgre_tunnel_uninit, 824 .ndo_uninit = ip_tunnel_uninit,
1641 .ndo_start_xmit = ipgre_tunnel_xmit, 825 .ndo_start_xmit = gre_tap_xmit,
1642 .ndo_set_mac_address = eth_mac_addr, 826 .ndo_set_mac_address = eth_mac_addr,
1643 .ndo_validate_addr = eth_validate_addr, 827 .ndo_validate_addr = eth_validate_addr,
1644 .ndo_change_mtu = ipgre_tunnel_change_mtu, 828 .ndo_change_mtu = ip_tunnel_change_mtu,
1645 .ndo_get_stats64 = ipgre_get_stats64, 829 .ndo_get_stats64 = ip_tunnel_get_stats64,
1646}; 830};
1647 831
1648static void ipgre_tap_setup(struct net_device *dev) 832static void ipgre_tap_setup(struct net_device *dev)
1649{ 833{
1650
1651 ether_setup(dev); 834 ether_setup(dev);
1652 835 dev->netdev_ops = &gre_tap_netdev_ops;
1653 dev->netdev_ops = &ipgre_tap_netdev_ops; 836 ip_tunnel_setup(dev, gre_tap_net_id);
1654 dev->destructor = ipgre_dev_free;
1655
1656 dev->iflink = 0;
1657 dev->features |= NETIF_F_NETNS_LOCAL;
1658
1659 dev->features |= GRE_FEATURES;
1660 dev->hw_features |= GRE_FEATURES;
1661} 837}
1662 838
1663static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], 839static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1664 struct nlattr *data[]) 840 struct nlattr *tb[], struct nlattr *data[])
1665{ 841{
1666 struct ip_tunnel *nt; 842 struct ip_tunnel_parm p;
1667 struct net *net = dev_net(dev);
1668 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1669 int mtu;
1670 int err;
1671
1672 nt = netdev_priv(dev);
1673 ipgre_netlink_parms(data, &nt->parms);
1674
1675 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1676 return -EEXIST;
1677
1678 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1679 eth_hw_addr_random(dev);
1680
1681 mtu = ipgre_tunnel_bind_dev(dev);
1682 if (!tb[IFLA_MTU])
1683 dev->mtu = mtu;
1684
1685 /* Can use a lockless transmit, unless we generate output sequences */
1686 if (!(nt->parms.o_flags & GRE_SEQ))
1687 dev->features |= NETIF_F_LLTX;
1688
1689 err = register_netdevice(dev);
1690 if (err)
1691 goto out;
1692
1693 dev_hold(dev);
1694 ipgre_tunnel_link(ign, nt);
1695 843
1696out: 844 ipgre_netlink_parms(data, tb, &p);
1697 return err; 845 return ip_tunnel_newlink(dev, tb, &p);
1698} 846}
1699 847
1700static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], 848static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1701 struct nlattr *data[]) 849 struct nlattr *data[])
1702{ 850{
1703 struct ip_tunnel *t, *nt;
1704 struct net *net = dev_net(dev);
1705 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1706 struct ip_tunnel_parm p; 851 struct ip_tunnel_parm p;
1707 int mtu;
1708
1709 if (dev == ign->fb_tunnel_dev)
1710 return -EINVAL;
1711
1712 nt = netdev_priv(dev);
1713 ipgre_netlink_parms(data, &p);
1714
1715 t = ipgre_tunnel_locate(net, &p, 0);
1716
1717 if (t) {
1718 if (t->dev != dev)
1719 return -EEXIST;
1720 } else {
1721 t = nt;
1722
1723 if (dev->type != ARPHRD_ETHER) {
1724 unsigned int nflags = 0;
1725
1726 if (ipv4_is_multicast(p.iph.daddr))
1727 nflags = IFF_BROADCAST;
1728 else if (p.iph.daddr)
1729 nflags = IFF_POINTOPOINT;
1730
1731 if ((dev->flags ^ nflags) &
1732 (IFF_POINTOPOINT | IFF_BROADCAST))
1733 return -EINVAL;
1734 }
1735 852
1736 ipgre_tunnel_unlink(ign, t); 853 ipgre_netlink_parms(data, tb, &p);
1737 t->parms.iph.saddr = p.iph.saddr; 854 return ip_tunnel_changelink(dev, tb, &p);
1738 t->parms.iph.daddr = p.iph.daddr;
1739 t->parms.i_key = p.i_key;
1740 if (dev->type != ARPHRD_ETHER) {
1741 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1742 memcpy(dev->broadcast, &p.iph.daddr, 4);
1743 }
1744 ipgre_tunnel_link(ign, t);
1745 netdev_state_change(dev);
1746 }
1747
1748 t->parms.o_key = p.o_key;
1749 t->parms.iph.ttl = p.iph.ttl;
1750 t->parms.iph.tos = p.iph.tos;
1751 t->parms.iph.frag_off = p.iph.frag_off;
1752
1753 if (t->parms.link != p.link) {
1754 t->parms.link = p.link;
1755 mtu = ipgre_tunnel_bind_dev(dev);
1756 if (!tb[IFLA_MTU])
1757 dev->mtu = mtu;
1758 netdev_state_change(dev);
1759 }
1760
1761 return 0;
1762} 855}
1763 856
1764static size_t ipgre_get_size(const struct net_device *dev) 857static size_t ipgre_get_size(const struct net_device *dev)
@@ -1793,8 +886,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1793 struct ip_tunnel_parm *p = &t->parms; 886 struct ip_tunnel_parm *p = &t->parms;
1794 887
1795 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || 888 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1796 nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || 889 nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
1797 nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || 890 nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
1798 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || 891 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1799 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || 892 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1800 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || 893 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
@@ -1832,6 +925,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1832 .validate = ipgre_tunnel_validate, 925 .validate = ipgre_tunnel_validate,
1833 .newlink = ipgre_newlink, 926 .newlink = ipgre_newlink,
1834 .changelink = ipgre_changelink, 927 .changelink = ipgre_changelink,
928 .dellink = ip_tunnel_dellink,
1835 .get_size = ipgre_get_size, 929 .get_size = ipgre_get_size,
1836 .fill_info = ipgre_fill_info, 930 .fill_info = ipgre_fill_info,
1837}; 931};
@@ -1845,13 +939,28 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1845 .validate = ipgre_tap_validate, 939 .validate = ipgre_tap_validate,
1846 .newlink = ipgre_newlink, 940 .newlink = ipgre_newlink,
1847 .changelink = ipgre_changelink, 941 .changelink = ipgre_changelink,
942 .dellink = ip_tunnel_dellink,
1848 .get_size = ipgre_get_size, 943 .get_size = ipgre_get_size,
1849 .fill_info = ipgre_fill_info, 944 .fill_info = ipgre_fill_info,
1850}; 945};
1851 946
1852/* 947static int __net_init ipgre_tap_init_net(struct net *net)
1853 * And now the modules code and kernel interface. 948{
1854 */ 949 return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
950}
951
952static void __net_exit ipgre_tap_exit_net(struct net *net)
953{
954 struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
955 ip_tunnel_delete_net(itn);
956}
957
958static struct pernet_operations ipgre_tap_net_ops = {
959 .init = ipgre_tap_init_net,
960 .exit = ipgre_tap_exit_net,
961 .id = &gre_tap_net_id,
962 .size = sizeof(struct ip_tunnel_net),
963};
1855 964
1856static int __init ipgre_init(void) 965static int __init ipgre_init(void)
1857{ 966{
@@ -1863,6 +972,10 @@ static int __init ipgre_init(void)
1863 if (err < 0) 972 if (err < 0)
1864 return err; 973 return err;
1865 974
975 err = register_pernet_device(&ipgre_tap_net_ops);
976 if (err < 0)
 977 goto pnet_tap_failed;
978
1866 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); 979 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1867 if (err < 0) { 980 if (err < 0) {
1868 pr_info("%s: can't add protocol\n", __func__); 981 pr_info("%s: can't add protocol\n", __func__);
@@ -1877,16 +990,17 @@ static int __init ipgre_init(void)
1877 if (err < 0) 990 if (err < 0)
1878 goto tap_ops_failed; 991 goto tap_ops_failed;
1879 992
1880out: 993 return 0;
1881 return err;
1882 994
1883tap_ops_failed: 995tap_ops_failed:
1884 rtnl_link_unregister(&ipgre_link_ops); 996 rtnl_link_unregister(&ipgre_link_ops);
1885rtnl_link_failed: 997rtnl_link_failed:
1886 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); 998 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1887add_proto_failed: 999add_proto_failed:
1000 unregister_pernet_device(&ipgre_tap_net_ops);
1001pnet_tap_failed:
1888 unregister_pernet_device(&ipgre_net_ops); 1002 unregister_pernet_device(&ipgre_net_ops);
1889 goto out; 1003 return err;
1890} 1004}
1891 1005
1892static void __exit ipgre_fini(void) 1006static void __exit ipgre_fini(void)
@@ -1895,6 +1009,7 @@ static void __exit ipgre_fini(void)
1895 rtnl_link_unregister(&ipgre_link_ops); 1009 rtnl_link_unregister(&ipgre_link_ops);
1896 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) 1010 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1897 pr_info("%s: can't remove protocol\n", __func__); 1011 pr_info("%s: can't remove protocol\n", __func__);
1012 unregister_pernet_device(&ipgre_tap_net_ops);
1898 unregister_pernet_device(&ipgre_net_ops); 1013 unregister_pernet_device(&ipgre_net_ops);
1899} 1014}
1900 1015
@@ -1904,3 +1019,4 @@ MODULE_LICENSE("GPL");
1904MODULE_ALIAS_RTNL_LINK("gre"); 1019MODULE_ALIAS_RTNL_LINK("gre");
1905MODULE_ALIAS_RTNL_LINK("gretap"); 1020MODULE_ALIAS_RTNL_LINK("gretap");
1906MODULE_ALIAS_NETDEV("gre0"); 1021MODULE_ALIAS_NETDEV("gre0");
1022MODULE_ALIAS_NETDEV("gretap0");
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 000000000000..9d96b6853f21
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1035 @@
1/*
2 * Copyright (c) 2013 Nicira, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/capability.h>
22#include <linux/module.h>
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/in6.h>
36#include <linux/inetdevice.h>
37#include <linux/igmp.h>
38#include <linux/netfilter_ipv4.h>
39#include <linux/etherdevice.h>
40#include <linux/if_ether.h>
41#include <linux/if_vlan.h>
42#include <linux/rculist.h>
43
44#include <net/sock.h>
45#include <net/ip.h>
46#include <net/icmp.h>
47#include <net/protocol.h>
48#include <net/ip_tunnels.h>
49#include <net/arp.h>
50#include <net/checksum.h>
51#include <net/dsfield.h>
52#include <net/inet_ecn.h>
53#include <net/xfrm.h>
54#include <net/net_namespace.h>
55#include <net/netns/generic.h>
56#include <net/rtnetlink.h>
57
58#if IS_ENABLED(CONFIG_IPV6)
59#include <net/ipv6.h>
60#include <net/ip6_fib.h>
61#include <net/ip6_route.h>
62#endif
63
64static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
65 __be32 key, __be32 remote)
66{
67 return hash_32((__force u32)key ^ (__force u32)remote,
68 IP_TNL_HASH_BITS);
69}
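
ip_tunnel_hash() folds the tunnel key and the remote address into a single word and lets hash_32() pick a bucket in the per-namespace table. A user-space sketch of the same computation is below; the multiplier and the 7-bit table size are assumptions based on the era's hash_32()/IP_TNL_HASH_BITS definitions.

    #include <stdint.h>
    #include <stdio.h>

    #define GOLDEN_RATIO_PRIME_32 0x9e370001U  /* assumed hash_32() constant */
    #define TNL_HASH_BITS 7                    /* assumed IP_TNL_HASH_BITS */

    static unsigned int hash_32(uint32_t val, unsigned int bits)
    {
        /* multiply by a golden-ratio prime, keep the top `bits` bits */
        return (uint32_t)(val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
    }

    int main(void)
    {
        uint32_t key = 42;
        uint32_t remote = 0xc0000201;          /* 192.0.2.1 in host order */

        printf("bucket %u of %u\n",
               hash_32(key ^ remote, TNL_HASH_BITS), 1u << TNL_HASH_BITS);
        return 0;
    }
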
70
 71/* Often-modified stats are per-CPU, the rest are shared (netdev->stats) */
72struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
73 struct rtnl_link_stats64 *tot)
74{
75 int i;
76
77 for_each_possible_cpu(i) {
78 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
79 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
80 unsigned int start;
81
82 do {
83 start = u64_stats_fetch_begin_bh(&tstats->syncp);
84 rx_packets = tstats->rx_packets;
85 tx_packets = tstats->tx_packets;
86 rx_bytes = tstats->rx_bytes;
87 tx_bytes = tstats->tx_bytes;
88 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
89
90 tot->rx_packets += rx_packets;
91 tot->tx_packets += tx_packets;
92 tot->rx_bytes += rx_bytes;
93 tot->tx_bytes += tx_bytes;
94 }
95
96 tot->multicast = dev->stats.multicast;
97
98 tot->rx_crc_errors = dev->stats.rx_crc_errors;
99 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
100 tot->rx_length_errors = dev->stats.rx_length_errors;
101 tot->rx_frame_errors = dev->stats.rx_frame_errors;
102 tot->rx_errors = dev->stats.rx_errors;
103
104 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
105 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
106 tot->tx_dropped = dev->stats.tx_dropped;
107 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
108 tot->tx_errors = dev->stats.tx_errors;
109
110 tot->collisions = dev->stats.collisions;
111
112 return tot;
113}
114EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
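
ip_tunnel_get_stats64() sums the per-CPU counters, re-reading each CPU's slot until its sequence counter is stable so 64-bit values stay consistent even on 32-bit hosts. A compilable sketch of that reader-side pattern follows, with illustrative names (the kernel uses u64_stats_fetch_begin_bh()/u64_stats_fetch_retry_bh()):

    #include <stdint.h>
    #include <stdio.h>

    struct cpu_tstats {
        uint64_t rx_packets;
        uint64_t rx_bytes;
        unsigned int seq;   /* writer keeps this odd while updating */
    };

    static void snapshot(const struct cpu_tstats *s,
                         uint64_t *pkts, uint64_t *bytes)
    {
        unsigned int start;

        do {
            start = __atomic_load_n(&s->seq, __ATOMIC_ACQUIRE);
            *pkts = s->rx_packets;
            *bytes = s->rx_bytes;
            /* retry if a writer was active or finished in between */
        } while ((start & 1) ||
                 __atomic_load_n(&s->seq, __ATOMIC_ACQUIRE) != start);
    }

    int main(void)
    {
        struct cpu_tstats s = { .rx_packets = 3, .rx_bytes = 4096, .seq = 2 };
        uint64_t pkts, bytes;

        snapshot(&s, &pkts, &bytes);
        printf("%llu packets, %llu bytes\n",
               (unsigned long long)pkts, (unsigned long long)bytes);
        return 0;
    }
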
115
116static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
117 __be16 flags, __be32 key)
118{
119 if (p->i_flags & TUNNEL_KEY) {
120 if (flags & TUNNEL_KEY)
121 return key == p->i_key;
122 else
123 /* key expected, none present */
124 return false;
125 } else
126 return !(flags & TUNNEL_KEY);
127}
128
129/* Fallback tunnel: no source, no destination, no key, no options
130
131 Tunnel hash table:
 132 We require an exact key match, i.e. if a key is present in the packet
 133 it will match only a tunnel with the same key; if it is not present,
 134 it will match only a keyless tunnel.
 135
 136 All keyless packets that do not match a configured keyless tunnel
 137 will match the fallback tunnel.
 138 Given src, dst and key, find the appropriate tunnel for input.
139*/
140struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
141 int link, __be16 flags,
142 __be32 remote, __be32 local,
143 __be32 key)
144{
145 unsigned int hash;
146 struct ip_tunnel *t, *cand = NULL;
147 struct hlist_head *head;
148
149 hash = ip_tunnel_hash(itn, key, remote);
150 head = &itn->tunnels[hash];
151
152 hlist_for_each_entry_rcu(t, head, hash_node) {
153 if (local != t->parms.iph.saddr ||
154 remote != t->parms.iph.daddr ||
155 !(t->dev->flags & IFF_UP))
156 continue;
157
158 if (!ip_tunnel_key_match(&t->parms, flags, key))
159 continue;
160
161 if (t->parms.link == link)
162 return t;
163 else
164 cand = t;
165 }
166
167 hlist_for_each_entry_rcu(t, head, hash_node) {
168 if (remote != t->parms.iph.daddr ||
169 !(t->dev->flags & IFF_UP))
170 continue;
171
172 if (!ip_tunnel_key_match(&t->parms, flags, key))
173 continue;
174
175 if (t->parms.link == link)
176 return t;
177 else if (!cand)
178 cand = t;
179 }
180
181 hash = ip_tunnel_hash(itn, key, 0);
182 head = &itn->tunnels[hash];
183
184 hlist_for_each_entry_rcu(t, head, hash_node) {
185 if ((local != t->parms.iph.saddr &&
186 (local != t->parms.iph.daddr ||
187 !ipv4_is_multicast(local))) ||
188 !(t->dev->flags & IFF_UP))
189 continue;
190
191 if (!ip_tunnel_key_match(&t->parms, flags, key))
192 continue;
193
194 if (t->parms.link == link)
195 return t;
196 else if (!cand)
197 cand = t;
198 }
199
200 if (flags & TUNNEL_NO_KEY)
201 goto skip_key_lookup;
202
203 hlist_for_each_entry_rcu(t, head, hash_node) {
204 if (t->parms.i_key != key ||
205 !(t->dev->flags & IFF_UP))
206 continue;
207
208 if (t->parms.link == link)
209 return t;
210 else if (!cand)
211 cand = t;
212 }
213
214skip_key_lookup:
215 if (cand)
216 return cand;
217
218 if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
219 return netdev_priv(itn->fb_tunnel_dev);
220
221
222 return NULL;
223}
224EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
225
226static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
227 struct ip_tunnel_parm *parms)
228{
229 unsigned int h;
230 __be32 remote;
231
232 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
233 remote = parms->iph.daddr;
234 else
235 remote = 0;
236
237 h = ip_tunnel_hash(itn, parms->i_key, remote);
238 return &itn->tunnels[h];
239}
240
241static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
242{
243 struct hlist_head *head = ip_bucket(itn, &t->parms);
244
245 hlist_add_head_rcu(&t->hash_node, head);
246}
247
248static void ip_tunnel_del(struct ip_tunnel *t)
249{
250 hlist_del_init_rcu(&t->hash_node);
251}
252
253static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
254 struct ip_tunnel_parm *parms,
255 int type)
256{
257 __be32 remote = parms->iph.daddr;
258 __be32 local = parms->iph.saddr;
259 __be32 key = parms->i_key;
260 int link = parms->link;
261 struct ip_tunnel *t = NULL;
262 struct hlist_head *head = ip_bucket(itn, parms);
263
264 hlist_for_each_entry_rcu(t, head, hash_node) {
265 if (local == t->parms.iph.saddr &&
266 remote == t->parms.iph.daddr &&
267 key == t->parms.i_key &&
268 link == t->parms.link &&
269 type == t->dev->type)
270 break;
271 }
272 return t;
273}
274
275static struct net_device *__ip_tunnel_create(struct net *net,
276 const struct rtnl_link_ops *ops,
277 struct ip_tunnel_parm *parms)
278{
279 int err;
280 struct ip_tunnel *tunnel;
281 struct net_device *dev;
282 char name[IFNAMSIZ];
283
284 if (parms->name[0])
285 strlcpy(name, parms->name, IFNAMSIZ);
286 else {
287 if (strlen(ops->kind) + 3 >= IFNAMSIZ) {
288 err = -E2BIG;
289 goto failed;
290 }
291 strlcpy(name, ops->kind, IFNAMSIZ);
292 strncat(name, "%d", 2);
293 }
294
295 ASSERT_RTNL();
296 dev = alloc_netdev(ops->priv_size, name, ops->setup);
297 if (!dev) {
298 err = -ENOMEM;
299 goto failed;
300 }
301 dev_net_set(dev, net);
302
303 dev->rtnl_link_ops = ops;
304
305 tunnel = netdev_priv(dev);
306 tunnel->parms = *parms;
307
308 err = register_netdevice(dev);
309 if (err)
310 goto failed_free;
311
312 return dev;
313
314failed_free:
315 free_netdev(dev);
316failed:
317 return ERR_PTR(err);
318}
319
320static inline struct rtable *ip_route_output_tunnel(struct net *net,
321 struct flowi4 *fl4,
322 int proto,
323 __be32 daddr, __be32 saddr,
324 __be32 key, __u8 tos, int oif)
325{
326 memset(fl4, 0, sizeof(*fl4));
327 fl4->flowi4_oif = oif;
328 fl4->daddr = daddr;
329 fl4->saddr = saddr;
330 fl4->flowi4_tos = tos;
331 fl4->flowi4_proto = proto;
332 fl4->fl4_gre_key = key;
333 return ip_route_output_key(net, fl4);
334}
335
336static int ip_tunnel_bind_dev(struct net_device *dev)
337{
338 struct net_device *tdev = NULL;
339 struct ip_tunnel *tunnel = netdev_priv(dev);
340 const struct iphdr *iph;
341 int hlen = LL_MAX_HEADER;
342 int mtu = ETH_DATA_LEN;
343 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
344
345 iph = &tunnel->parms.iph;
346
347 /* Guess output device to choose reasonable mtu and needed_headroom */
348 if (iph->daddr) {
349 struct flowi4 fl4;
350 struct rtable *rt;
351
352 rt = ip_route_output_tunnel(dev_net(dev), &fl4,
353 tunnel->parms.iph.protocol,
354 iph->daddr, iph->saddr,
355 tunnel->parms.o_key,
356 RT_TOS(iph->tos),
357 tunnel->parms.link);
358 if (!IS_ERR(rt)) {
359 tdev = rt->dst.dev;
360 ip_rt_put(rt);
361 }
362 if (dev->type != ARPHRD_ETHER)
363 dev->flags |= IFF_POINTOPOINT;
364 }
365
366 if (!tdev && tunnel->parms.link)
367 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
368
369 if (tdev) {
370 hlen = tdev->hard_header_len + tdev->needed_headroom;
371 mtu = tdev->mtu;
372 }
373 dev->iflink = tunnel->parms.link;
374
375 dev->needed_headroom = t_hlen + hlen;
376 mtu -= (dev->hard_header_len + t_hlen);
377
378 if (mtu < 68)
379 mtu = 68;
380
381 return mtu;
382}
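
ip_tunnel_bind_dev() guesses the tunnel MTU from the underlay device the route resolves to, subtracting the link header and the tunnel overhead. A worked example under the assumption of a keyed GRE tunnel routed over plain 1500-byte Ethernet:

    #include <stdio.h>

    int main(void)
    {
        int underlay_mtu = 1500;    /* tdev->mtu */
        int hard_header_len = 0;    /* ARPHRD_IPGRE device, no link header */
        int hlen = 4 + 4;           /* tunnel->hlen: GRE base + key */
        int t_hlen = hlen + 20;     /* plus the outer struct iphdr */
        int mtu = underlay_mtu - (hard_header_len + t_hlen);

        if (mtu < 68)               /* same floor as the kernel code */
            mtu = 68;
        printf("tunnel mtu = %d\n", mtu);   /* 1472 */
        return 0;
    }
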
383
384static struct ip_tunnel *ip_tunnel_create(struct net *net,
385 struct ip_tunnel_net *itn,
386 struct ip_tunnel_parm *parms)
387{
388 struct ip_tunnel *nt, *fbt;
389 struct net_device *dev;
390
391 BUG_ON(!itn->fb_tunnel_dev);
392 fbt = netdev_priv(itn->fb_tunnel_dev);
393 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
394 if (IS_ERR(dev))
395 return NULL;
396
397 dev->mtu = ip_tunnel_bind_dev(dev);
398
399 nt = netdev_priv(dev);
400 ip_tunnel_add(itn, nt);
401 return nt;
402}
403
404int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
405 const struct tnl_ptk_info *tpi, bool log_ecn_error)
406{
407 struct pcpu_tstats *tstats;
408 const struct iphdr *iph = ip_hdr(skb);
409 int err;
410
411 secpath_reset(skb);
412
413 skb->protocol = tpi->proto;
414
415 skb->mac_header = skb->network_header;
416 __pskb_pull(skb, tunnel->hlen);
417 skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
418#ifdef CONFIG_NET_IPGRE_BROADCAST
419 if (ipv4_is_multicast(iph->daddr)) {
420 /* Looped back packet, drop it! */
421 if (rt_is_output_route(skb_rtable(skb)))
422 goto drop;
423 tunnel->dev->stats.multicast++;
424 skb->pkt_type = PACKET_BROADCAST;
425 }
426#endif
427
428 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
429 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
430 tunnel->dev->stats.rx_crc_errors++;
431 tunnel->dev->stats.rx_errors++;
432 goto drop;
433 }
434
435 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
436 if (!(tpi->flags&TUNNEL_SEQ) ||
437 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
438 tunnel->dev->stats.rx_fifo_errors++;
439 tunnel->dev->stats.rx_errors++;
440 goto drop;
441 }
442 tunnel->i_seqno = ntohl(tpi->seq) + 1;
443 }
444
445 /* Warning: All skb pointers will be invalidated! */
446 if (tunnel->dev->type == ARPHRD_ETHER) {
447 if (!pskb_may_pull(skb, ETH_HLEN)) {
448 tunnel->dev->stats.rx_length_errors++;
449 tunnel->dev->stats.rx_errors++;
450 goto drop;
451 }
452
453 iph = ip_hdr(skb);
454 skb->protocol = eth_type_trans(skb, tunnel->dev);
455 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
456 }
457
458 skb->pkt_type = PACKET_HOST;
459 __skb_tunnel_rx(skb, tunnel->dev);
460
461 skb_reset_network_header(skb);
462 err = IP_ECN_decapsulate(iph, skb);
463 if (unlikely(err)) {
464 if (log_ecn_error)
465 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
466 &iph->saddr, iph->tos);
467 if (err > 1) {
468 ++tunnel->dev->stats.rx_frame_errors;
469 ++tunnel->dev->stats.rx_errors;
470 goto drop;
471 }
472 }
473
474 tstats = this_cpu_ptr(tunnel->dev->tstats);
475 u64_stats_update_begin(&tstats->syncp);
476 tstats->rx_packets++;
477 tstats->rx_bytes += skb->len;
478 u64_stats_update_end(&tstats->syncp);
479
480 gro_cells_receive(&tunnel->gro_cells, skb);
481 return 0;
482
483drop:
484 kfree_skb(skb);
485 return 0;
486}
487EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
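
The TUNNEL_SEQ handling above accepts a packet only if its sequence number is not behind the expected one; the signed 32-bit subtraction keeps that test correct across wraparound. A small sketch of the comparison:

    #include <stdint.h>
    #include <stdio.h>

    /* accept sequence numbers at or ahead of the expected one, even if the
     * 32-bit counter has wrapped */
    static int seq_ok(uint32_t rx_seq, uint32_t expected)
    {
        return (int32_t)(rx_seq - expected) >= 0;
    }

    int main(void)
    {
        printf("%d %d\n", seq_ok(5, 3), seq_ok(3, 5));   /* 1 0 */
        printf("%d\n", seq_ok(2, 0xfffffffe));           /* 1: wrapped ahead */
        return 0;
    }
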
488
489void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
490 const struct iphdr *tnl_params)
491{
492 struct ip_tunnel *tunnel = netdev_priv(dev);
493 const struct iphdr *inner_iph;
494 struct iphdr *iph;
495 struct flowi4 fl4;
496 u8 tos, ttl;
497 __be16 df;
498 struct rtable *rt; /* Route to the other host */
499 struct net_device *tdev; /* Device to other host */
500 unsigned int max_headroom; /* The extra header space needed */
501 __be32 dst;
502 int mtu;
503
504 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
505
506 dst = tnl_params->daddr;
507 if (dst == 0) {
508 /* NBMA tunnel */
509
510 if (skb_dst(skb) == NULL) {
511 dev->stats.tx_fifo_errors++;
512 goto tx_error;
513 }
514
515 if (skb->protocol == htons(ETH_P_IP)) {
516 rt = skb_rtable(skb);
517 dst = rt_nexthop(rt, inner_iph->daddr);
518 }
519#if IS_ENABLED(CONFIG_IPV6)
520 else if (skb->protocol == htons(ETH_P_IPV6)) {
521 const struct in6_addr *addr6;
522 struct neighbour *neigh;
523 bool do_tx_error_icmp;
524 int addr_type;
525
526 neigh = dst_neigh_lookup(skb_dst(skb),
527 &ipv6_hdr(skb)->daddr);
528 if (neigh == NULL)
529 goto tx_error;
530
531 addr6 = (const struct in6_addr *)&neigh->primary_key;
532 addr_type = ipv6_addr_type(addr6);
533
534 if (addr_type == IPV6_ADDR_ANY) {
535 addr6 = &ipv6_hdr(skb)->daddr;
536 addr_type = ipv6_addr_type(addr6);
537 }
538
539 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
540 do_tx_error_icmp = true;
541 else {
542 do_tx_error_icmp = false;
543 dst = addr6->s6_addr32[3];
544 }
545 neigh_release(neigh);
546 if (do_tx_error_icmp)
547 goto tx_error_icmp;
548 }
549#endif
550 else
551 goto tx_error;
552 }
553
554 tos = tnl_params->tos;
555 if (tos & 0x1) {
556 tos &= ~0x1;
557 if (skb->protocol == htons(ETH_P_IP))
558 tos = inner_iph->tos;
559 else if (skb->protocol == htons(ETH_P_IPV6))
560 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
561 }
562
563 rt = ip_route_output_tunnel(dev_net(dev), &fl4,
564 tunnel->parms.iph.protocol,
565 dst, tnl_params->saddr,
566 tunnel->parms.o_key,
567 RT_TOS(tos),
568 tunnel->parms.link);
569 if (IS_ERR(rt)) {
570 dev->stats.tx_carrier_errors++;
571 goto tx_error;
572 }
573 tdev = rt->dst.dev;
574
575 if (tdev == dev) {
576 ip_rt_put(rt);
577 dev->stats.collisions++;
578 goto tx_error;
579 }
580
581 df = tnl_params->frag_off;
582
583 if (df)
584 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
585 - sizeof(struct iphdr);
586 else
587 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
588
589 if (skb_dst(skb))
590 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
591
592 if (skb->protocol == htons(ETH_P_IP)) {
593 df |= (inner_iph->frag_off&htons(IP_DF));
594
595 if (!skb_is_gso(skb) &&
596 (inner_iph->frag_off&htons(IP_DF)) &&
597 mtu < ntohs(inner_iph->tot_len)) {
598 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
599 ip_rt_put(rt);
600 goto tx_error;
601 }
602 }
603#if IS_ENABLED(CONFIG_IPV6)
604 else if (skb->protocol == htons(ETH_P_IPV6)) {
605 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
606
607 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
608 mtu >= IPV6_MIN_MTU) {
609 if ((tunnel->parms.iph.daddr &&
610 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
611 rt6->rt6i_dst.plen == 128) {
612 rt6->rt6i_flags |= RTF_MODIFIED;
613 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
614 }
615 }
616
617 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
618 mtu < skb->len) {
619 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
620 ip_rt_put(rt);
621 goto tx_error;
622 }
623 }
624#endif
625
626 if (tunnel->err_count > 0) {
627 if (time_before(jiffies,
628 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
629 tunnel->err_count--;
630
631 dst_link_failure(skb);
632 } else
633 tunnel->err_count = 0;
634 }
635
636 ttl = tnl_params->ttl;
637 if (ttl == 0) {
638 if (skb->protocol == htons(ETH_P_IP))
639 ttl = inner_iph->ttl;
640#if IS_ENABLED(CONFIG_IPV6)
641 else if (skb->protocol == htons(ETH_P_IPV6))
642 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
643#endif
644 else
645 ttl = ip4_dst_hoplimit(&rt->dst);
646 }
647
648 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
649 + rt->dst.header_len;
650 if (max_headroom > dev->needed_headroom) {
651 dev->needed_headroom = max_headroom;
652 if (skb_cow_head(skb, dev->needed_headroom)) {
653 dev->stats.tx_dropped++;
654 dev_kfree_skb(skb);
655 return;
656 }
657 }
658
659 skb_dst_drop(skb);
660 skb_dst_set(skb, &rt->dst);
661 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
662
663 /* Push down and install the IP header. */
664 skb_push(skb, sizeof(struct iphdr));
665 skb_reset_network_header(skb);
666
667 iph = ip_hdr(skb);
668 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
669
670 iph->version = 4;
671 iph->ihl = sizeof(struct iphdr) >> 2;
672 iph->frag_off = df;
673 iph->protocol = tnl_params->protocol;
674 iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
675 iph->daddr = fl4.daddr;
676 iph->saddr = fl4.saddr;
677 iph->ttl = ttl;
678 tunnel_ip_select_ident(skb, inner_iph, &rt->dst);
679
680 iptunnel_xmit(skb, dev);
681 return;
682
683#if IS_ENABLED(CONFIG_IPV6)
684tx_error_icmp:
685 dst_link_failure(skb);
686#endif
687tx_error:
688 dev->stats.tx_errors++;
689 dev_kfree_skb(skb);
690}
691EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
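
For inner IPv4 traffic with DF set, ip_tunnel_xmit() compares the inner packet length against the route MTU minus the outer IPv4 header and answers with ICMP_FRAG_NEEDED when it does not fit. A worked example of that check under assumed sizes:

    #include <stdio.h>

    int main(void)
    {
        int dst_mtu = 1500;             /* dst_mtu(&rt->dst) */
        int hard_header_len = 0;        /* tunnel netdev */
        int mtu = dst_mtu - hard_header_len - 20;   /* minus outer iphdr */
        int inner_tot_len = 1490;       /* DF set on the inner packet */

        if (inner_tot_len > mtu)        /* 1490 > 1480 */
            printf("ICMP_FRAG_NEEDED, mtu=%d\n", mtu);
        return 0;
    }
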
692
693static void ip_tunnel_update(struct ip_tunnel_net *itn,
694 struct ip_tunnel *t,
695 struct net_device *dev,
696 struct ip_tunnel_parm *p,
697 bool set_mtu)
698{
699 ip_tunnel_del(t);
700 t->parms.iph.saddr = p->iph.saddr;
701 t->parms.iph.daddr = p->iph.daddr;
702 t->parms.i_key = p->i_key;
703 t->parms.o_key = p->o_key;
704 if (dev->type != ARPHRD_ETHER) {
705 memcpy(dev->dev_addr, &p->iph.saddr, 4);
706 memcpy(dev->broadcast, &p->iph.daddr, 4);
707 }
708 ip_tunnel_add(itn, t);
709
710 t->parms.iph.ttl = p->iph.ttl;
711 t->parms.iph.tos = p->iph.tos;
712 t->parms.iph.frag_off = p->iph.frag_off;
713
714 if (t->parms.link != p->link) {
715 int mtu;
716
717 t->parms.link = p->link;
718 mtu = ip_tunnel_bind_dev(dev);
719 if (set_mtu)
720 dev->mtu = mtu;
721 }
722 netdev_state_change(dev);
723}
724
725int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
726{
727 int err = 0;
728 struct ip_tunnel *t;
729 struct net *net = dev_net(dev);
730 struct ip_tunnel *tunnel = netdev_priv(dev);
731 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
732
733 BUG_ON(!itn->fb_tunnel_dev);
734 switch (cmd) {
735 case SIOCGETTUNNEL:
736 t = NULL;
737 if (dev == itn->fb_tunnel_dev)
738 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
739 if (t == NULL)
740 t = netdev_priv(dev);
741 memcpy(p, &t->parms, sizeof(*p));
742 break;
743
744 case SIOCADDTUNNEL:
745 case SIOCCHGTUNNEL:
746 err = -EPERM;
747 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
748 goto done;
749 if (p->iph.ttl)
750 p->iph.frag_off |= htons(IP_DF);
751 if (!(p->i_flags&TUNNEL_KEY))
752 p->i_key = 0;
753 if (!(p->o_flags&TUNNEL_KEY))
754 p->o_key = 0;
755
756 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
757
758 if (!t && (cmd == SIOCADDTUNNEL))
759 t = ip_tunnel_create(net, itn, p);
760
761 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
762 if (t != NULL) {
763 if (t->dev != dev) {
764 err = -EEXIST;
765 break;
766 }
767 } else {
768 unsigned int nflags = 0;
769
770 if (ipv4_is_multicast(p->iph.daddr))
771 nflags = IFF_BROADCAST;
772 else if (p->iph.daddr)
773 nflags = IFF_POINTOPOINT;
774
775 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
776 err = -EINVAL;
777 break;
778 }
779
780 t = netdev_priv(dev);
781 }
782 }
783
784 if (t) {
785 err = 0;
786 ip_tunnel_update(itn, t, dev, p, true);
787 } else
788 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
789 break;
790
791 case SIOCDELTUNNEL:
792 err = -EPERM;
793 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
794 goto done;
795
796 if (dev == itn->fb_tunnel_dev) {
797 err = -ENOENT;
798 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
799 if (t == NULL)
800 goto done;
801 err = -EPERM;
802 if (t == netdev_priv(itn->fb_tunnel_dev))
803 goto done;
804 dev = t->dev;
805 }
806 unregister_netdevice(dev);
807 err = 0;
808 break;
809
810 default:
811 err = -EINVAL;
812 }
813
814done:
815 return err;
816}
817EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
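
This shared helper now backs SIOCADDTUNNEL/SIOCCHGTUNNEL/SIOCDELTUNNEL for every protocol that hands it an ip_tunnel_parm. A hedged user-space sketch of creating an IPIP tunnel through the fallback tunl0 device follows; the addresses and the name "myipip0" are placeholders and CAP_NET_ADMIN is required.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <arpa/inet.h>
    #include <linux/ip.h>
    #include <linux/if_tunnel.h>

    int main(void)
    {
        struct ip_tunnel_parm p;
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
            return 1;

        memset(&p, 0, sizeof(p));
        strncpy(p.name, "myipip0", IFNAMSIZ - 1);   /* placeholder name */
        p.iph.version = 4;
        p.iph.ihl = 5;
        p.iph.protocol = IPPROTO_IPIP;
        p.iph.saddr = inet_addr("192.0.2.1");       /* placeholder local */
        p.iph.daddr = inet_addr("198.51.100.1");    /* placeholder remote */
        p.iph.ttl = 64;          /* non-zero ttl forces IP_DF, see above */

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ - 1);   /* IPIP fallback */
        ifr.ifr_data = (void *)&p;

        if (ioctl(fd, SIOCADDTUNNEL, &ifr) < 0)
            perror("SIOCADDTUNNEL");
        close(fd);
        return 0;
    }
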
818
819int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
820{
821 struct ip_tunnel *tunnel = netdev_priv(dev);
822 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
823
824 if (new_mtu < 68 ||
825 new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
826 return -EINVAL;
827 dev->mtu = new_mtu;
828 return 0;
829}
830EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
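
ip_tunnel_change_mtu() bounds the requested MTU between 68 and whatever still fits in a 16-bit total length once the outer headers are added back. For a keyed GRE tunnel with no link-layer header the ceiling works out as below (an illustration, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        int hard_header_len = 0;        /* ARPHRD_IPGRE */
        int t_hlen = (4 + 4) + 20;      /* GRE base + key, plus outer iphdr */

        printf("valid mtu range: 68..%d\n",
               0xFFF8 - hard_header_len - t_hlen);  /* 68..65500 */
        return 0;
    }
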
831
832static void ip_tunnel_dev_free(struct net_device *dev)
833{
834 struct ip_tunnel *tunnel = netdev_priv(dev);
835
836 gro_cells_destroy(&tunnel->gro_cells);
837 free_percpu(dev->tstats);
838 free_netdev(dev);
839}
840
841void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
842{
843 struct net *net = dev_net(dev);
844 struct ip_tunnel *tunnel = netdev_priv(dev);
845 struct ip_tunnel_net *itn;
846
847 itn = net_generic(net, tunnel->ip_tnl_net_id);
848
849 if (itn->fb_tunnel_dev != dev) {
850 ip_tunnel_del(netdev_priv(dev));
851 unregister_netdevice_queue(dev, head);
852 }
853}
854EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
855
856int __net_init ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
857 struct rtnl_link_ops *ops, char *devname)
858{
859 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
860 struct ip_tunnel_parm parms;
861
862 itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
863 if (!itn->tunnels)
864 return -ENOMEM;
865
866 if (!ops) {
867 itn->fb_tunnel_dev = NULL;
868 return 0;
869 }
870 memset(&parms, 0, sizeof(parms));
871 if (devname)
872 strlcpy(parms.name, devname, IFNAMSIZ);
873
874 rtnl_lock();
875 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
876 rtnl_unlock();
877 if (IS_ERR(itn->fb_tunnel_dev)) {
878 kfree(itn->tunnels);
879 return PTR_ERR(itn->fb_tunnel_dev);
880 }
881
882 return 0;
883}
884EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
885
886static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
887{
888 int h;
889
890 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
891 struct ip_tunnel *t;
892 struct hlist_node *n;
893 struct hlist_head *thead = &itn->tunnels[h];
894
895 hlist_for_each_entry_safe(t, n, thead, hash_node)
896 unregister_netdevice_queue(t->dev, head);
897 }
898 if (itn->fb_tunnel_dev)
899 unregister_netdevice_queue(itn->fb_tunnel_dev, head);
900}
901
902void __net_exit ip_tunnel_delete_net(struct ip_tunnel_net *itn)
903{
904 LIST_HEAD(list);
905
906 rtnl_lock();
907 ip_tunnel_destroy(itn, &list);
908 unregister_netdevice_many(&list);
909 rtnl_unlock();
910 kfree(itn->tunnels);
911}
912EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
913
914int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
915 struct ip_tunnel_parm *p)
916{
917 struct ip_tunnel *nt;
918 struct net *net = dev_net(dev);
919 struct ip_tunnel_net *itn;
920 int mtu;
921 int err;
922
923 nt = netdev_priv(dev);
924 itn = net_generic(net, nt->ip_tnl_net_id);
925
926 if (ip_tunnel_find(itn, p, dev->type))
927 return -EEXIST;
928
929 nt->parms = *p;
930 err = register_netdevice(dev);
931 if (err)
932 goto out;
933
934 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
935 eth_hw_addr_random(dev);
936
937 mtu = ip_tunnel_bind_dev(dev);
938 if (!tb[IFLA_MTU])
939 dev->mtu = mtu;
940
941 ip_tunnel_add(itn, nt);
942
943out:
944 return err;
945}
946EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
947
948int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
949 struct ip_tunnel_parm *p)
950{
951 struct ip_tunnel *t, *nt;
952 struct net *net = dev_net(dev);
953 struct ip_tunnel *tunnel = netdev_priv(dev);
954 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
955
956 if (dev == itn->fb_tunnel_dev)
957 return -EINVAL;
958
959 nt = netdev_priv(dev);
960
961 t = ip_tunnel_find(itn, p, dev->type);
962
963 if (t) {
964 if (t->dev != dev)
965 return -EEXIST;
966 } else {
967 t = nt;
968
969 if (dev->type != ARPHRD_ETHER) {
970 unsigned int nflags = 0;
971
972 if (ipv4_is_multicast(p->iph.daddr))
973 nflags = IFF_BROADCAST;
974 else if (p->iph.daddr)
975 nflags = IFF_POINTOPOINT;
976
977 if ((dev->flags ^ nflags) &
978 (IFF_POINTOPOINT | IFF_BROADCAST))
979 return -EINVAL;
980 }
981 }
982
983 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
984 return 0;
985}
986EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
987
988int ip_tunnel_init(struct net_device *dev)
989{
990 struct ip_tunnel *tunnel = netdev_priv(dev);
991 struct iphdr *iph = &tunnel->parms.iph;
992 int err;
993
994 dev->destructor = ip_tunnel_dev_free;
995 dev->tstats = alloc_percpu(struct pcpu_tstats);
996 if (!dev->tstats)
997 return -ENOMEM;
998
999 err = gro_cells_init(&tunnel->gro_cells, dev);
1000 if (err) {
1001 free_percpu(dev->tstats);
1002 return err;
1003 }
1004
1005 tunnel->dev = dev;
1006 strcpy(tunnel->parms.name, dev->name);
1007 iph->version = 4;
1008 iph->ihl = 5;
1009
1010 return 0;
1011}
1012EXPORT_SYMBOL_GPL(ip_tunnel_init);
1013
1014void ip_tunnel_uninit(struct net_device *dev)
1015{
1016 struct net *net = dev_net(dev);
1017 struct ip_tunnel *tunnel = netdev_priv(dev);
1018 struct ip_tunnel_net *itn;
1019
1020 itn = net_generic(net, tunnel->ip_tnl_net_id);
 1021 /* fb_tunnel_dev will be unregistered in the net-exit call. */
1022 if (itn->fb_tunnel_dev != dev)
1023 ip_tunnel_del(netdev_priv(dev));
1024}
1025EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1026
 1027/* Do the least required initialization; the rest is done in the tunnel_init call */
1028void ip_tunnel_setup(struct net_device *dev, int net_id)
1029{
1030 struct ip_tunnel *tunnel = netdev_priv(dev);
1031 tunnel->ip_tnl_net_id = net_id;
1032}
1033EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1034
1035MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index c3a4233c0ac2..9d2bdb2c1d3f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -38,7 +38,7 @@
38#include <net/sock.h> 38#include <net/sock.h>
39#include <net/ip.h> 39#include <net/ip.h>
40#include <net/icmp.h> 40#include <net/icmp.h>
41#include <net/ipip.h> 41#include <net/ip_tunnels.h>
42#include <net/inet_ecn.h> 42#include <net/inet_ecn.h>
43#include <net/xfrm.h> 43#include <net/xfrm.h>
44#include <net/net_namespace.h> 44#include <net/net_namespace.h>
@@ -82,44 +82,6 @@ static int vti_tunnel_bind_dev(struct net_device *dev);
82} while (0) 82} while (0)
83 83
84 84
85static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
86 struct rtnl_link_stats64 *tot)
87{
88 int i;
89
90 for_each_possible_cpu(i) {
91 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
92 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
93 unsigned int start;
94
95 do {
96 start = u64_stats_fetch_begin_bh(&tstats->syncp);
97 rx_packets = tstats->rx_packets;
98 tx_packets = tstats->tx_packets;
99 rx_bytes = tstats->rx_bytes;
100 tx_bytes = tstats->tx_bytes;
101 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
102
103 tot->rx_packets += rx_packets;
104 tot->tx_packets += tx_packets;
105 tot->rx_bytes += rx_bytes;
106 tot->tx_bytes += tx_bytes;
107 }
108
109 tot->multicast = dev->stats.multicast;
110 tot->rx_crc_errors = dev->stats.rx_crc_errors;
111 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
112 tot->rx_length_errors = dev->stats.rx_length_errors;
113 tot->rx_errors = dev->stats.rx_errors;
114 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
115 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
116 tot->tx_dropped = dev->stats.tx_dropped;
117 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
118 tot->tx_errors = dev->stats.tx_errors;
119
120 return tot;
121}
122
123static struct ip_tunnel *vti_tunnel_lookup(struct net *net, 85static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
124 __be32 remote, __be32 local) 86 __be32 remote, __be32 local)
125{ 87{
@@ -597,7 +559,7 @@ static const struct net_device_ops vti_netdev_ops = {
597 .ndo_start_xmit = vti_tunnel_xmit, 559 .ndo_start_xmit = vti_tunnel_xmit,
598 .ndo_do_ioctl = vti_tunnel_ioctl, 560 .ndo_do_ioctl = vti_tunnel_ioctl,
599 .ndo_change_mtu = vti_tunnel_change_mtu, 561 .ndo_change_mtu = vti_tunnel_change_mtu,
600 .ndo_get_stats64 = vti_get_stats64, 562 .ndo_get_stats64 = ip_tunnel_get_stats64,
601}; 563};
602 564
603static void vti_dev_free(struct net_device *dev) 565static void vti_dev_free(struct net_device *dev)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 8f024d41eefa..77bfcce64fe5 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -111,227 +111,21 @@
111#include <net/sock.h> 111#include <net/sock.h>
112#include <net/ip.h> 112#include <net/ip.h>
113#include <net/icmp.h> 113#include <net/icmp.h>
114#include <net/ipip.h> 114#include <net/ip_tunnels.h>
115#include <net/inet_ecn.h> 115#include <net/inet_ecn.h>
116#include <net/xfrm.h> 116#include <net/xfrm.h>
117#include <net/net_namespace.h> 117#include <net/net_namespace.h>
118#include <net/netns/generic.h> 118#include <net/netns/generic.h>
119 119
120#define HASH_SIZE 16
121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
122
123static bool log_ecn_error = true; 120static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644); 121module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 122MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126 123
127static int ipip_net_id __read_mostly; 124static int ipip_net_id __read_mostly;
128struct ipip_net {
129 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
130 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
131 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
132 struct ip_tunnel __rcu *tunnels_wc[1];
133 struct ip_tunnel __rcu **tunnels[4];
134
135 struct net_device *fb_tunnel_dev;
136};
137 125
138static int ipip_tunnel_init(struct net_device *dev); 126static int ipip_tunnel_init(struct net_device *dev);
139static void ipip_tunnel_setup(struct net_device *dev);
140static void ipip_dev_free(struct net_device *dev);
141static struct rtnl_link_ops ipip_link_ops __read_mostly; 127static struct rtnl_link_ops ipip_link_ops __read_mostly;
142 128
143static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
144 struct rtnl_link_stats64 *tot)
145{
146 int i;
147
148 for_each_possible_cpu(i) {
149 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
150 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
151 unsigned int start;
152
153 do {
154 start = u64_stats_fetch_begin_bh(&tstats->syncp);
155 rx_packets = tstats->rx_packets;
156 tx_packets = tstats->tx_packets;
157 rx_bytes = tstats->rx_bytes;
158 tx_bytes = tstats->tx_bytes;
159 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
160
161 tot->rx_packets += rx_packets;
162 tot->tx_packets += tx_packets;
163 tot->rx_bytes += rx_bytes;
164 tot->tx_bytes += tx_bytes;
165 }
166
167 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
168 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
169 tot->tx_dropped = dev->stats.tx_dropped;
170 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
171 tot->tx_errors = dev->stats.tx_errors;
172 tot->collisions = dev->stats.collisions;
173
174 return tot;
175}
176
177static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
178 __be32 remote, __be32 local)
179{
180 unsigned int h0 = HASH(remote);
181 unsigned int h1 = HASH(local);
182 struct ip_tunnel *t;
183 struct ipip_net *ipn = net_generic(net, ipip_net_id);
184
185 for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
186 if (local == t->parms.iph.saddr &&
187 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
188 return t;
189
190 for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
191 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
192 return t;
193
194 for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
195 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
196 return t;
197
198 t = rcu_dereference(ipn->tunnels_wc[0]);
199 if (t && (t->dev->flags&IFF_UP))
200 return t;
201 return NULL;
202}
203
204static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
205 struct ip_tunnel_parm *parms)
206{
207 __be32 remote = parms->iph.daddr;
208 __be32 local = parms->iph.saddr;
209 unsigned int h = 0;
210 int prio = 0;
211
212 if (remote) {
213 prio |= 2;
214 h ^= HASH(remote);
215 }
216 if (local) {
217 prio |= 1;
218 h ^= HASH(local);
219 }
220 return &ipn->tunnels[prio][h];
221}
222
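Note: the per-driver bucketing deleted here is exactly what the shared ip_tunnel code now provides for every tunnel type. Tunnels were classed by which endpoints are configured (wildcard, local only, remote only, remote plus local) and then spread over 16 buckets by a 4-bit hash of the addresses. A minimal user-space sketch of that arithmetic follows; the sample addresses are arbitrary placeholders, and HASH() mirrors the macro removed from this file.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the removed HASH()/prio logic: class 0 = wildcard, 1 = local only,
 * 2 = remote only, 3 = remote + local; within a class a 4-bit hash of the
 * endpoint addresses selects one of 16 buckets.
 */
#define HASH_SIZE 16
#define HASH(addr) ((((uint32_t)(addr)) ^ ((uint32_t)(addr) >> 4)) & 0xF)

int main(void)
{
	uint32_t remote = 0x0a000001;	/* 10.0.0.1, placeholder */
	uint32_t local  = 0xc0a80101;	/* 192.168.1.1, placeholder */
	unsigned int h = 0;
	int prio = 0;

	if (remote) { prio |= 2; h ^= HASH(remote); }
	if (local)  { prio |= 1; h ^= HASH(local); }

	printf("class=%d bucket=%u (of %d)\n", prio, h, HASH_SIZE);
	return 0;
}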
223static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
224 struct ip_tunnel *t)
225{
226 return __ipip_bucket(ipn, &t->parms);
227}
228
229static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
230{
231 struct ip_tunnel __rcu **tp;
232 struct ip_tunnel *iter;
233
234 for (tp = ipip_bucket(ipn, t);
235 (iter = rtnl_dereference(*tp)) != NULL;
236 tp = &iter->next) {
237 if (t == iter) {
238 rcu_assign_pointer(*tp, t->next);
239 break;
240 }
241 }
242}
243
244static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
245{
246 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
247
248 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
249 rcu_assign_pointer(*tp, t);
250}
251
252static int ipip_tunnel_create(struct net_device *dev)
253{
254 struct ip_tunnel *t = netdev_priv(dev);
255 struct net *net = dev_net(dev);
256 struct ipip_net *ipn = net_generic(net, ipip_net_id);
257 int err;
258
259 err = ipip_tunnel_init(dev);
260 if (err < 0)
261 goto out;
262
263 err = register_netdevice(dev);
264 if (err < 0)
265 goto out;
266
267 strcpy(t->parms.name, dev->name);
268 dev->rtnl_link_ops = &ipip_link_ops;
269
270 dev_hold(dev);
271 ipip_tunnel_link(ipn, t);
272 return 0;
273
274out:
275 return err;
276}
277
278static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
279 struct ip_tunnel_parm *parms, int create)
280{
281 __be32 remote = parms->iph.daddr;
282 __be32 local = parms->iph.saddr;
283 struct ip_tunnel *t, *nt;
284 struct ip_tunnel __rcu **tp;
285 struct net_device *dev;
286 char name[IFNAMSIZ];
287 struct ipip_net *ipn = net_generic(net, ipip_net_id);
288
289 for (tp = __ipip_bucket(ipn, parms);
290 (t = rtnl_dereference(*tp)) != NULL;
291 tp = &t->next) {
292 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
293 return t;
294 }
295 if (!create)
296 return NULL;
297
298 if (parms->name[0])
299 strlcpy(name, parms->name, IFNAMSIZ);
300 else
301 strcpy(name, "tunl%d");
302
303 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
304 if (dev == NULL)
305 return NULL;
306
307 dev_net_set(dev, net);
308
309 nt = netdev_priv(dev);
310 nt->parms = *parms;
311
312 if (ipip_tunnel_create(dev) < 0)
313 goto failed_free;
314
315 return nt;
316
317failed_free:
318 ipip_dev_free(dev);
319 return NULL;
320}
321
322/* called with RTNL */
323static void ipip_tunnel_uninit(struct net_device *dev)
324{
325 struct net *net = dev_net(dev);
326 struct ipip_net *ipn = net_generic(net, ipip_net_id);
327
328 if (dev == ipn->fb_tunnel_dev)
329 RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL);
330 else
331 ipip_tunnel_unlink(ipn, netdev_priv(dev));
332 dev_put(dev);
333}
334
335static int ipip_err(struct sk_buff *skb, u32 info) 129static int ipip_err(struct sk_buff *skb, u32 info)
336{ 130{
337 131
@@ -339,41 +133,17 @@ static int ipip_err(struct sk_buff *skb, u32 info)
339 8 bytes of packet payload. It means, that precise relaying of 133 8 bytes of packet payload. It means, that precise relaying of
340 ICMP in the real Internet is absolutely infeasible. 134 ICMP in the real Internet is absolutely infeasible.
341 */ 135 */
136 struct net *net = dev_net(skb->dev);
137 struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
342 const struct iphdr *iph = (const struct iphdr *)skb->data; 138 const struct iphdr *iph = (const struct iphdr *)skb->data;
343 const int type = icmp_hdr(skb)->type;
344 const int code = icmp_hdr(skb)->code;
345 struct ip_tunnel *t; 139 struct ip_tunnel *t;
346 int err; 140 int err;
347 141 const int type = icmp_hdr(skb)->type;
348 switch (type) { 142 const int code = icmp_hdr(skb)->code;
349 default:
350 case ICMP_PARAMETERPROB:
351 return 0;
352
353 case ICMP_DEST_UNREACH:
354 switch (code) {
355 case ICMP_SR_FAILED:
356 case ICMP_PORT_UNREACH:
357 /* Impossible event. */
358 return 0;
359 default:
360 /* All others are translated to HOST_UNREACH.
361 rfc2003 contains "deep thoughts" about NET_UNREACH,
362 I believe they are just ether pollution. --ANK
363 */
364 break;
365 }
366 break;
367 case ICMP_TIME_EXCEEDED:
368 if (code != ICMP_EXC_TTL)
369 return 0;
370 break;
371 case ICMP_REDIRECT:
372 break;
373 }
374 143
375 err = -ENOENT; 144 err = -ENOENT;
376 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 145 t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
146 iph->daddr, iph->saddr, 0);
377 if (t == NULL) 147 if (t == NULL)
378 goto out; 148 goto out;
379 149
@@ -403,53 +173,29 @@ static int ipip_err(struct sk_buff *skb, u32 info)
403 else 173 else
404 t->err_count = 1; 174 t->err_count = 1;
405 t->err_time = jiffies; 175 t->err_time = jiffies;
406out:
407 176
177out:
408 return err; 178 return err;
409} 179}
410 180
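Note: the "8 bytes of packet payload" remark kept at the top of ipip_err() is easy to verify. A classic ICMP error quotes the offending (outer) IP header plus only 8 bytes of its payload; for an IPIP packet that payload is the inner IP header, so the quote stops well before the inner source and destination addresses, which is why the error cannot be relayed precisely to the inner sender. A small standalone check of the offsets, assuming the usual struct iphdr layout:

#include <stdio.h>
#include <stddef.h>
#include <linux/ip.h>

/* The quoted payload covers only the first 8 bytes of the inner IP header,
 * while the inner addresses sit at offsets 12 and 16.
 */
int main(void)
{
	printf("bytes of inner header quoted: 8\n");
	printf("offset of inner saddr: %zu\n", offsetof(struct iphdr, saddr));
	printf("offset of inner daddr: %zu\n", offsetof(struct iphdr, daddr));
	return 0;
}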
181static const struct tnl_ptk_info tpi = {
182 /* no tunnel info required for ipip. */
183 .proto = htons(ETH_P_IP),
184};
185
411static int ipip_rcv(struct sk_buff *skb) 186static int ipip_rcv(struct sk_buff *skb)
412{ 187{
188 struct net *net = dev_net(skb->dev);
189 struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
413 struct ip_tunnel *tunnel; 190 struct ip_tunnel *tunnel;
414 const struct iphdr *iph = ip_hdr(skb); 191 const struct iphdr *iph = ip_hdr(skb);
415 int err;
416
417 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
418 if (tunnel != NULL) {
419 struct pcpu_tstats *tstats;
420 192
193 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
194 iph->saddr, iph->daddr, 0);
195 if (tunnel) {
421 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 196 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
422 goto drop; 197 goto drop;
423 198 return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
424 secpath_reset(skb);
425
426 skb->mac_header = skb->network_header;
427 skb_reset_network_header(skb);
428 skb->protocol = htons(ETH_P_IP);
429 skb->pkt_type = PACKET_HOST;
430
431 __skb_tunnel_rx(skb, tunnel->dev);
432
433 err = IP_ECN_decapsulate(iph, skb);
434 if (unlikely(err)) {
435 if (log_ecn_error)
436 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
437 &iph->saddr, iph->tos);
438 if (err > 1) {
439 ++tunnel->dev->stats.rx_frame_errors;
440 ++tunnel->dev->stats.rx_errors;
441 goto drop;
442 }
443 }
444
445 tstats = this_cpu_ptr(tunnel->dev->tstats);
446 u64_stats_update_begin(&tstats->syncp);
447 tstats->rx_packets++;
448 tstats->rx_bytes += skb->len;
449 u64_stats_update_end(&tstats->syncp);
450
451 netif_rx(skb);
452 return 0;
453 } 199 }
454 200
455 return -1; 201 return -1;
@@ -463,329 +209,64 @@ drop:
463 * This function assumes it is being called from dev_queue_xmit() 209 * This function assumes it is being called from dev_queue_xmit()
464 * and that skb is filled properly by that function. 210 * and that skb is filled properly by that function.
465 */ 211 */
466
467static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 212static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
468{ 213{
469 struct ip_tunnel *tunnel = netdev_priv(dev); 214 struct ip_tunnel *tunnel = netdev_priv(dev);
470 const struct iphdr *tiph = &tunnel->parms.iph; 215 const struct iphdr *tiph = &tunnel->parms.iph;
471 u8 tos = tunnel->parms.iph.tos;
472 __be16 df = tiph->frag_off;
473 struct rtable *rt; /* Route to the other host */
474 struct net_device *tdev; /* Device to other host */
475 const struct iphdr *old_iph;
476 struct iphdr *iph; /* Our new IP header */
477 unsigned int max_headroom; /* The extra header space needed */
478 __be32 dst = tiph->daddr;
479 struct flowi4 fl4;
480 int mtu;
481
482 if (skb->protocol != htons(ETH_P_IP))
483 goto tx_error;
484 216
485 if (skb->ip_summed == CHECKSUM_PARTIAL && 217 if (unlikely(skb->protocol != htons(ETH_P_IP)))
486 skb_checksum_help(skb))
487 goto tx_error; 218 goto tx_error;
488 219
489 old_iph = ip_hdr(skb); 220 if (likely(!skb->encapsulation)) {
490 221 skb_reset_inner_headers(skb);
491 if (tos & 1) 222 skb->encapsulation = 1;
492 tos = old_iph->tos;
493
494 if (!dst) {
495 /* NBMA tunnel */
496 if ((rt = skb_rtable(skb)) == NULL) {
497 dev->stats.tx_fifo_errors++;
498 goto tx_error;
499 }
500 dst = rt_nexthop(rt, old_iph->daddr);
501 } 223 }
502 224
503 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, 225 ip_tunnel_xmit(skb, dev, tiph);
504 dst, tiph->saddr,
505 0, 0,
506 IPPROTO_IPIP, RT_TOS(tos),
507 tunnel->parms.link);
508 if (IS_ERR(rt)) {
509 dev->stats.tx_carrier_errors++;
510 goto tx_error_icmp;
511 }
512 tdev = rt->dst.dev;
513
514 if (tdev == dev) {
515 ip_rt_put(rt);
516 dev->stats.collisions++;
517 goto tx_error;
518 }
519
520 df |= old_iph->frag_off & htons(IP_DF);
521
522 if (df) {
523 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
524
525 if (mtu < 68) {
526 dev->stats.collisions++;
527 ip_rt_put(rt);
528 goto tx_error;
529 }
530
531 if (skb_dst(skb))
532 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
533
534 if ((old_iph->frag_off & htons(IP_DF)) &&
535 mtu < ntohs(old_iph->tot_len)) {
536 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
537 htonl(mtu));
538 ip_rt_put(rt);
539 goto tx_error;
540 }
541 }
542
543 if (tunnel->err_count > 0) {
544 if (time_before(jiffies,
545 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
546 tunnel->err_count--;
547 dst_link_failure(skb);
548 } else
549 tunnel->err_count = 0;
550 }
551
552 /*
553 * Okay, now see if we can stuff it in the buffer as-is.
554 */
555 max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
556
557 if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
558 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
559 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
560 if (!new_skb) {
561 ip_rt_put(rt);
562 dev->stats.tx_dropped++;
563 dev_kfree_skb(skb);
564 return NETDEV_TX_OK;
565 }
566 if (skb->sk)
567 skb_set_owner_w(new_skb, skb->sk);
568 dev_kfree_skb(skb);
569 skb = new_skb;
570 old_iph = ip_hdr(skb);
571 }
572
573 skb->transport_header = skb->network_header;
574 skb_push(skb, sizeof(struct iphdr));
575 skb_reset_network_header(skb);
576 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
577 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
578 IPSKB_REROUTED);
579 skb_dst_drop(skb);
580 skb_dst_set(skb, &rt->dst);
581
582 /*
583 * Push down and install the IPIP header.
584 */
585
586 iph = ip_hdr(skb);
587 iph->version = 4;
588 iph->ihl = sizeof(struct iphdr)>>2;
589 iph->frag_off = df;
590 iph->protocol = IPPROTO_IPIP;
591 iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
592 iph->daddr = fl4.daddr;
593 iph->saddr = fl4.saddr;
594
595 if ((iph->ttl = tiph->ttl) == 0)
596 iph->ttl = old_iph->ttl;
597
598 iptunnel_xmit(skb, dev);
599 return NETDEV_TX_OK; 226 return NETDEV_TX_OK;
600 227
601tx_error_icmp:
602 dst_link_failure(skb);
603tx_error: 228tx_error:
604 dev->stats.tx_errors++; 229 dev->stats.tx_errors++;
605 dev_kfree_skb(skb); 230 dev_kfree_skb(skb);
606 return NETDEV_TX_OK; 231 return NETDEV_TX_OK;
607} 232}
608 233
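Note: the transmit path that moves into ip_tunnel_xmit() boils down to prefixing the inner packet with an outer IPv4 header whose protocol is IPPROTO_IPIP, copying DF and optionally inheriting TOS/TTL from the inner header, as the deleted code did. A rough user-space sketch of that header construction; the function name and endpoint parameters are illustrative only, and checksumming and ECN encapsulation are omitted.

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <linux/ip.h>

/* Build the outer IPv4 header an IPIP tunnel prepends to the inner packet. */
static void build_ipip_outer(struct iphdr *outer, const struct iphdr *inner,
			     uint32_t saddr, uint32_t daddr, uint8_t ttl)
{
	memset(outer, 0, sizeof(*outer));
	outer->version  = 4;
	outer->ihl      = sizeof(struct iphdr) >> 2;
	outer->frag_off = inner->frag_off & htons(0x4000);	/* copy DF bit */
	outer->protocol = IPPROTO_IPIP;
	outer->tos      = inner->tos;		/* inherit inner TOS */
	outer->ttl      = ttl ? ttl : inner->ttl;	/* 0 means inherit */
	outer->saddr    = saddr;
	outer->daddr    = daddr;
	outer->tot_len  = htons(ntohs(inner->tot_len) + sizeof(struct iphdr));
}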
609static void ipip_tunnel_bind_dev(struct net_device *dev)
610{
611 struct net_device *tdev = NULL;
612 struct ip_tunnel *tunnel;
613 const struct iphdr *iph;
614
615 tunnel = netdev_priv(dev);
616 iph = &tunnel->parms.iph;
617
618 if (iph->daddr) {
619 struct rtable *rt;
620 struct flowi4 fl4;
621
622 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
623 iph->daddr, iph->saddr,
624 0, 0,
625 IPPROTO_IPIP,
626 RT_TOS(iph->tos),
627 tunnel->parms.link);
628 if (!IS_ERR(rt)) {
629 tdev = rt->dst.dev;
630 ip_rt_put(rt);
631 }
632 dev->flags |= IFF_POINTOPOINT;
633 }
634
635 if (!tdev && tunnel->parms.link)
636 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
637
638 if (tdev) {
639 dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
640 dev->mtu = tdev->mtu - sizeof(struct iphdr);
641 }
642 dev->iflink = tunnel->parms.link;
643}
644
645static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
646{
647 struct net *net = dev_net(t->dev);
648 struct ipip_net *ipn = net_generic(net, ipip_net_id);
649
650 ipip_tunnel_unlink(ipn, t);
651 synchronize_net();
652 t->parms.iph.saddr = p->iph.saddr;
653 t->parms.iph.daddr = p->iph.daddr;
654 memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
655 memcpy(t->dev->broadcast, &p->iph.daddr, 4);
656 ipip_tunnel_link(ipn, t);
657 t->parms.iph.ttl = p->iph.ttl;
658 t->parms.iph.tos = p->iph.tos;
659 t->parms.iph.frag_off = p->iph.frag_off;
660 if (t->parms.link != p->link) {
661 t->parms.link = p->link;
662 ipip_tunnel_bind_dev(t->dev);
663 }
664 netdev_state_change(t->dev);
665}
666
667static int 234static int
668ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) 235ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
669{ 236{
670 int err = 0; 237 int err = 0;
671 struct ip_tunnel_parm p; 238 struct ip_tunnel_parm p;
672 struct ip_tunnel *t;
673 struct net *net = dev_net(dev);
674 struct ipip_net *ipn = net_generic(net, ipip_net_id);
675
676 switch (cmd) {
677 case SIOCGETTUNNEL:
678 t = NULL;
679 if (dev == ipn->fb_tunnel_dev) {
680 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
681 err = -EFAULT;
682 break;
683 }
684 t = ipip_tunnel_locate(net, &p, 0);
685 }
686 if (t == NULL)
687 t = netdev_priv(dev);
688 memcpy(&p, &t->parms, sizeof(p));
689 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
690 err = -EFAULT;
691 break;
692
693 case SIOCADDTUNNEL:
694 case SIOCCHGTUNNEL:
695 err = -EPERM;
696 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
697 goto done;
698
699 err = -EFAULT;
700 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
701 goto done;
702
703 err = -EINVAL;
704 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
705 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
706 goto done;
707 if (p.iph.ttl)
708 p.iph.frag_off |= htons(IP_DF);
709
710 t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
711
712 if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
713 if (t != NULL) {
714 if (t->dev != dev) {
715 err = -EEXIST;
716 break;
717 }
718 } else {
719 if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
720 (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
721 err = -EINVAL;
722 break;
723 }
724 t = netdev_priv(dev);
725 }
726
727 ipip_tunnel_update(t, &p);
728 }
729
730 if (t) {
731 err = 0;
732 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
733 err = -EFAULT;
734 } else
735 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
736 break;
737
738 case SIOCDELTUNNEL:
739 err = -EPERM;
740 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
741 goto done;
742
743 if (dev == ipn->fb_tunnel_dev) {
744 err = -EFAULT;
745 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
746 goto done;
747 err = -ENOENT;
748 if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
749 goto done;
750 err = -EPERM;
751 if (t->dev == ipn->fb_tunnel_dev)
752 goto done;
753 dev = t->dev;
754 }
755 unregister_netdevice(dev);
756 err = 0;
757 break;
758 239
759 default: 240 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
760 err = -EINVAL; 241 return -EFAULT;
761 }
762
763done:
764 return err;
765}
766 242
767static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) 243 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
768{ 244 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
769 if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) 245 return -EINVAL;
246 if (p.i_key || p.o_key || p.i_flags || p.o_flags)
770 return -EINVAL; 247 return -EINVAL;
771 dev->mtu = new_mtu; 248 if (p.iph.ttl)
249 p.iph.frag_off |= htons(IP_DF);
250
251 err = ip_tunnel_ioctl(dev, &p, cmd);
252 if (err)
253 return err;
254
255 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
256 return -EFAULT;
257
772 return 0; 258 return 0;
773} 259}
774 260
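Note: even after the rewrite, ipip tunnels are still created and queried through the same SIOCADDTUNNEL/SIOCGETTUNNEL interface on the fallback device; the handler above only validates struct ip_tunnel_parm and delegates to ip_tunnel_ioctl(). A minimal user-space sketch of the add path; the tunnel name and addresses are placeholders, and it needs CAP_NET_ADMIN to succeed.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if.h>
#include <linux/if_tunnel.h>

int main(void)
{
	struct ip_tunnel_parm p;
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&p, 0, sizeof(p));
	strncpy(p.name, "tunl1", IFNAMSIZ - 1);	/* example name */
	p.iph.version  = 4;
	p.iph.ihl      = 5;
	p.iph.protocol = IPPROTO_IPIP;
	p.iph.saddr    = inet_addr("192.0.2.1");	/* example local */
	p.iph.daddr    = inet_addr("198.51.100.1");	/* example remote */

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ - 1);	/* fallback device */
	ifr.ifr_ifru.ifru_data = (void *)&p;

	if (fd < 0 || ioctl(fd, SIOCADDTUNNEL, &ifr) < 0)
		perror("SIOCADDTUNNEL");
	return 0;
}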
775static const struct net_device_ops ipip_netdev_ops = { 261static const struct net_device_ops ipip_netdev_ops = {
776 .ndo_uninit = ipip_tunnel_uninit, 262 .ndo_init = ipip_tunnel_init,
263 .ndo_uninit = ip_tunnel_uninit,
777 .ndo_start_xmit = ipip_tunnel_xmit, 264 .ndo_start_xmit = ipip_tunnel_xmit,
778 .ndo_do_ioctl = ipip_tunnel_ioctl, 265 .ndo_do_ioctl = ipip_tunnel_ioctl,
779 .ndo_change_mtu = ipip_tunnel_change_mtu, 266 .ndo_change_mtu = ip_tunnel_change_mtu,
780 .ndo_get_stats64 = ipip_get_stats64, 267 .ndo_get_stats64 = ip_tunnel_get_stats64,
781}; 268};
782 269
783static void ipip_dev_free(struct net_device *dev)
784{
785 free_percpu(dev->tstats);
786 free_netdev(dev);
787}
788
789#define IPIP_FEATURES (NETIF_F_SG | \ 270#define IPIP_FEATURES (NETIF_F_SG | \
790 NETIF_F_FRAGLIST | \ 271 NETIF_F_FRAGLIST | \
791 NETIF_F_HIGHDMA | \ 272 NETIF_F_HIGHDMA | \
@@ -794,11 +275,8 @@ static void ipip_dev_free(struct net_device *dev)
794static void ipip_tunnel_setup(struct net_device *dev) 275static void ipip_tunnel_setup(struct net_device *dev)
795{ 276{
796 dev->netdev_ops = &ipip_netdev_ops; 277 dev->netdev_ops = &ipip_netdev_ops;
797 dev->destructor = ipip_dev_free;
798 278
799 dev->type = ARPHRD_TUNNEL; 279 dev->type = ARPHRD_TUNNEL;
800 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
801 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr);
802 dev->flags = IFF_NOARP; 280 dev->flags = IFF_NOARP;
803 dev->iflink = 0; 281 dev->iflink = 0;
804 dev->addr_len = 4; 282 dev->addr_len = 4;
@@ -808,46 +286,19 @@ static void ipip_tunnel_setup(struct net_device *dev)
808 286
809 dev->features |= IPIP_FEATURES; 287 dev->features |= IPIP_FEATURES;
810 dev->hw_features |= IPIP_FEATURES; 288 dev->hw_features |= IPIP_FEATURES;
289 ip_tunnel_setup(dev, ipip_net_id);
811} 290}
812 291
813static int ipip_tunnel_init(struct net_device *dev) 292static int ipip_tunnel_init(struct net_device *dev)
814{ 293{
815 struct ip_tunnel *tunnel = netdev_priv(dev); 294 struct ip_tunnel *tunnel = netdev_priv(dev);
816 295
817 tunnel->dev = dev;
818
819 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); 296 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
820 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 297 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
821 298
822 ipip_tunnel_bind_dev(dev); 299 tunnel->hlen = 0;
823 300 tunnel->parms.iph.protocol = IPPROTO_IPIP;
824 dev->tstats = alloc_percpu(struct pcpu_tstats); 301 return ip_tunnel_init(dev);
825 if (!dev->tstats)
826 return -ENOMEM;
827
828 return 0;
829}
830
831static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
832{
833 struct ip_tunnel *tunnel = netdev_priv(dev);
834 struct iphdr *iph = &tunnel->parms.iph;
835 struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
836
837 tunnel->dev = dev;
838 strcpy(tunnel->parms.name, dev->name);
839
840 iph->version = 4;
841 iph->protocol = IPPROTO_IPIP;
842 iph->ihl = 5;
843
844 dev->tstats = alloc_percpu(struct pcpu_tstats);
845 if (!dev->tstats)
846 return -ENOMEM;
847
848 dev_hold(dev);
849 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
850 return 0;
851} 302}
852 303
853static void ipip_netlink_parms(struct nlattr *data[], 304static void ipip_netlink_parms(struct nlattr *data[],
@@ -887,28 +338,16 @@ static void ipip_netlink_parms(struct nlattr *data[],
887static int ipip_newlink(struct net *src_net, struct net_device *dev, 338static int ipip_newlink(struct net *src_net, struct net_device *dev,
888 struct nlattr *tb[], struct nlattr *data[]) 339 struct nlattr *tb[], struct nlattr *data[])
889{ 340{
890 struct net *net = dev_net(dev); 341 struct ip_tunnel_parm p;
891 struct ip_tunnel *nt;
892
893 nt = netdev_priv(dev);
894 ipip_netlink_parms(data, &nt->parms);
895
896 if (ipip_tunnel_locate(net, &nt->parms, 0))
897 return -EEXIST;
898 342
899 return ipip_tunnel_create(dev); 343 ipip_netlink_parms(data, &p);
344 return ip_tunnel_newlink(dev, tb, &p);
900} 345}
901 346
902static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], 347static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
903 struct nlattr *data[]) 348 struct nlattr *data[])
904{ 349{
905 struct ip_tunnel *t;
906 struct ip_tunnel_parm p; 350 struct ip_tunnel_parm p;
907 struct net *net = dev_net(dev);
908 struct ipip_net *ipn = net_generic(net, ipip_net_id);
909
910 if (dev == ipn->fb_tunnel_dev)
911 return -EINVAL;
912 351
913 ipip_netlink_parms(data, &p); 352 ipip_netlink_parms(data, &p);
914 353
@@ -916,16 +355,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
916 (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) 355 (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
917 return -EINVAL; 356 return -EINVAL;
918 357
919 t = ipip_tunnel_locate(net, &p, 0); 358 return ip_tunnel_changelink(dev, tb, &p);
920
921 if (t) {
922 if (t->dev != dev)
923 return -EEXIST;
924 } else
925 t = netdev_priv(dev);
926
927 ipip_tunnel_update(t, &p);
928 return 0;
929} 359}
930 360
931static size_t ipip_get_size(const struct net_device *dev) 361static size_t ipip_get_size(const struct net_device *dev)
@@ -982,6 +412,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
982 .setup = ipip_tunnel_setup, 412 .setup = ipip_tunnel_setup,
983 .newlink = ipip_newlink, 413 .newlink = ipip_newlink,
984 .changelink = ipip_changelink, 414 .changelink = ipip_changelink,
415 .dellink = ip_tunnel_dellink,
985 .get_size = ipip_get_size, 416 .get_size = ipip_get_size,
986 .fill_info = ipip_fill_info, 417 .fill_info = ipip_fill_info,
987}; 418};
@@ -992,90 +423,29 @@ static struct xfrm_tunnel ipip_handler __read_mostly = {
992 .priority = 1, 423 .priority = 1,
993}; 424};
994 425
995static const char banner[] __initconst =
996 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
997
998static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
999{
1000 int prio;
1001
1002 for (prio = 1; prio < 4; prio++) {
1003 int h;
1004 for (h = 0; h < HASH_SIZE; h++) {
1005 struct ip_tunnel *t;
1006
1007 t = rtnl_dereference(ipn->tunnels[prio][h]);
1008 while (t != NULL) {
1009 unregister_netdevice_queue(t->dev, head);
1010 t = rtnl_dereference(t->next);
1011 }
1012 }
1013 }
1014}
1015
1016static int __net_init ipip_init_net(struct net *net) 426static int __net_init ipip_init_net(struct net *net)
1017{ 427{
1018 struct ipip_net *ipn = net_generic(net, ipip_net_id); 428 return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
1019 struct ip_tunnel *t;
1020 int err;
1021
1022 ipn->tunnels[0] = ipn->tunnels_wc;
1023 ipn->tunnels[1] = ipn->tunnels_l;
1024 ipn->tunnels[2] = ipn->tunnels_r;
1025 ipn->tunnels[3] = ipn->tunnels_r_l;
1026
1027 ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
1028 "tunl0",
1029 ipip_tunnel_setup);
1030 if (!ipn->fb_tunnel_dev) {
1031 err = -ENOMEM;
1032 goto err_alloc_dev;
1033 }
1034 dev_net_set(ipn->fb_tunnel_dev, net);
1035
1036 err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
1037 if (err)
1038 goto err_reg_dev;
1039
1040 if ((err = register_netdev(ipn->fb_tunnel_dev)))
1041 goto err_reg_dev;
1042
1043 t = netdev_priv(ipn->fb_tunnel_dev);
1044
1045 strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
1046 return 0;
1047
1048err_reg_dev:
1049 ipip_dev_free(ipn->fb_tunnel_dev);
1050err_alloc_dev:
1051 /* nothing */
1052 return err;
1053} 429}
1054 430
1055static void __net_exit ipip_exit_net(struct net *net) 431static void __net_exit ipip_exit_net(struct net *net)
1056{ 432{
1057 struct ipip_net *ipn = net_generic(net, ipip_net_id); 433 struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
1058 LIST_HEAD(list); 434 ip_tunnel_delete_net(itn);
1059
1060 rtnl_lock();
1061 ipip_destroy_tunnels(ipn, &list);
1062 unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
1063 unregister_netdevice_many(&list);
1064 rtnl_unlock();
1065} 435}
1066 436
1067static struct pernet_operations ipip_net_ops = { 437static struct pernet_operations ipip_net_ops = {
1068 .init = ipip_init_net, 438 .init = ipip_init_net,
1069 .exit = ipip_exit_net, 439 .exit = ipip_exit_net,
1070 .id = &ipip_net_id, 440 .id = &ipip_net_id,
1071 .size = sizeof(struct ipip_net), 441 .size = sizeof(struct ip_tunnel_net),
1072}; 442};
1073 443
1074static int __init ipip_init(void) 444static int __init ipip_init(void)
1075{ 445{
1076 int err; 446 int err;
1077 447
1078 printk(banner); 448 pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
1079 449
1080 err = register_pernet_device(&ipip_net_ops); 450 err = register_pernet_device(&ipip_net_ops);
1081 if (err < 0) 451 if (err < 0)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5f95b3aa579e..fd61fe16679f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -61,7 +61,7 @@
61#include <linux/netfilter_ipv4.h> 61#include <linux/netfilter_ipv4.h>
62#include <linux/compat.h> 62#include <linux/compat.h>
63#include <linux/export.h> 63#include <linux/export.h>
64#include <net/ipip.h> 64#include <net/ip_tunnels.h>
65#include <net/checksum.h> 65#include <net/checksum.h>
66#include <net/netlink.h> 66#include <net/netlink.h>
67#include <net/fib_rules.h> 67#include <net/fib_rules.h>
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 79ca5e70d497..eadab1ed6500 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -48,9 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net)
48 net->ipv4.arptable_filter = 48 net->ipv4.arptable_filter =
49 arpt_register_table(net, &packet_filter, repl); 49 arpt_register_table(net, &packet_filter, repl);
50 kfree(repl); 50 kfree(repl);
51 if (IS_ERR(net->ipv4.arptable_filter)) 51 return PTR_RET(net->ipv4.arptable_filter);
52 return PTR_ERR(net->ipv4.arptable_filter);
53 return 0;
54} 52}
55 53
56static void __net_exit arptable_filter_net_exit(struct net *net) 54static void __net_exit arptable_filter_net_exit(struct net *net)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 32030a24e776..b6f2ea174898 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -224,6 +224,8 @@ static const struct snmp_mib snmp4_net_list[] = {
224 SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), 224 SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
225 SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), 225 SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
226 SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), 226 SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
227 SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
228 SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
227 SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), 229 SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
228 SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), 230 SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
229 SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), 231 SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6e2851464f8f..550781a17b34 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2311,7 +2311,7 @@ nla_put_failure:
2311 return -EMSGSIZE; 2311 return -EMSGSIZE;
2312} 2312}
2313 2313
2314static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) 2314static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2315{ 2315{
2316 struct net *net = sock_net(in_skb->sk); 2316 struct net *net = sock_net(in_skb->sk);
2317 struct rtmsg *rtm; 2317 struct rtmsg *rtm;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index ef54377fb11c..7f4a5cb8f8d0 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -267,7 +267,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
267 struct ip_options *opt) 267 struct ip_options *opt)
268{ 268{
269 struct tcp_options_received tcp_opt; 269 struct tcp_options_received tcp_opt;
270 const u8 *hash_location;
271 struct inet_request_sock *ireq; 270 struct inet_request_sock *ireq;
272 struct tcp_request_sock *treq; 271 struct tcp_request_sock *treq;
273 struct tcp_sock *tp = tcp_sk(sk); 272 struct tcp_sock *tp = tcp_sk(sk);
@@ -294,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
294 293
295 /* check for timestamp cookie support */ 294 /* check for timestamp cookie support */
296 memset(&tcp_opt, 0, sizeof(tcp_opt)); 295 memset(&tcp_opt, 0, sizeof(tcp_opt));
297 tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); 296 tcp_parse_options(skb, &tcp_opt, 0, NULL);
298 297
299 if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) 298 if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
300 goto out; 299 goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29d9b8e..fa2f63fc453b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,7 +28,7 @@
28 28
29static int zero; 29static int zero;
30static int one = 1; 30static int one = 1;
31static int two = 2; 31static int four = 4;
32static int tcp_retr1_max = 255; 32static int tcp_retr1_max = 255;
33static int ip_local_port_range_min[] = { 1, 1 }; 33static int ip_local_port_range_min[] = { 1, 1 };
34static int ip_local_port_range_max[] = { 65535, 65535 }; 34static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -592,13 +592,6 @@ static struct ctl_table ipv4_table[] = {
592 .proc_handler = proc_dointvec 592 .proc_handler = proc_dointvec
593 }, 593 },
594 { 594 {
595 .procname = "tcp_frto_response",
596 .data = &sysctl_tcp_frto_response,
597 .maxlen = sizeof(int),
598 .mode = 0644,
599 .proc_handler = proc_dointvec
600 },
601 {
602 .procname = "tcp_low_latency", 595 .procname = "tcp_low_latency",
603 .data = &sysctl_tcp_low_latency, 596 .data = &sysctl_tcp_low_latency,
604 .maxlen = sizeof(int), 597 .maxlen = sizeof(int),
@@ -733,13 +726,6 @@ static struct ctl_table ipv4_table[] = {
733 .proc_handler = proc_dointvec, 726 .proc_handler = proc_dointvec,
734 }, 727 },
735 { 728 {
736 .procname = "tcp_cookie_size",
737 .data = &sysctl_tcp_cookie_size,
738 .maxlen = sizeof(int),
739 .mode = 0644,
740 .proc_handler = proc_dointvec
741 },
742 {
743 .procname = "tcp_thin_linear_timeouts", 729 .procname = "tcp_thin_linear_timeouts",
744 .data = &sysctl_tcp_thin_linear_timeouts, 730 .data = &sysctl_tcp_thin_linear_timeouts,
745 .maxlen = sizeof(int), 731 .maxlen = sizeof(int),
@@ -760,7 +746,7 @@ static struct ctl_table ipv4_table[] = {
760 .mode = 0644, 746 .mode = 0644,
761 .proc_handler = proc_dointvec_minmax, 747 .proc_handler = proc_dointvec_minmax,
762 .extra1 = &zero, 748 .extra1 = &zero,
763 .extra2 = &two, 749 .extra2 = &four,
764 }, 750 },
765 { 751 {
766 .procname = "udp_mem", 752 .procname = "udp_mem",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e22020790709..a96f7b586277 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -409,15 +409,6 @@ void tcp_init_sock(struct sock *sk)
409 409
410 icsk->icsk_sync_mss = tcp_sync_mss; 410 icsk->icsk_sync_mss = tcp_sync_mss;
411 411
412 /* TCP Cookie Transactions */
413 if (sysctl_tcp_cookie_size > 0) {
414 /* Default, cookies without s_data_payload. */
415 tp->cookie_values =
416 kzalloc(sizeof(*tp->cookie_values),
417 sk->sk_allocation);
418 if (tp->cookie_values != NULL)
419 kref_init(&tp->cookie_values->kref);
420 }
421 /* Presumed zeroed, in order of appearance: 412 /* Presumed zeroed, in order of appearance:
422 * cookie_in_always, cookie_out_never, 413 * cookie_in_always, cookie_out_never,
423 * s_data_constant, s_data_in, s_data_out 414 * s_data_constant, s_data_in, s_data_out
@@ -2397,92 +2388,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2397 release_sock(sk); 2388 release_sock(sk);
2398 return err; 2389 return err;
2399 } 2390 }
2400 case TCP_COOKIE_TRANSACTIONS: {
2401 struct tcp_cookie_transactions ctd;
2402 struct tcp_cookie_values *cvp = NULL;
2403
2404 if (sizeof(ctd) > optlen)
2405 return -EINVAL;
2406 if (copy_from_user(&ctd, optval, sizeof(ctd)))
2407 return -EFAULT;
2408
2409 if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
2410 ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
2411 return -EINVAL;
2412
2413 if (ctd.tcpct_cookie_desired == 0) {
2414 /* default to global value */
2415 } else if ((0x1 & ctd.tcpct_cookie_desired) ||
2416 ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
2417 ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
2418 return -EINVAL;
2419 }
2420
2421 if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
2422 /* Supercedes all other values */
2423 lock_sock(sk);
2424 if (tp->cookie_values != NULL) {
2425 kref_put(&tp->cookie_values->kref,
2426 tcp_cookie_values_release);
2427 tp->cookie_values = NULL;
2428 }
2429 tp->rx_opt.cookie_in_always = 0; /* false */
2430 tp->rx_opt.cookie_out_never = 1; /* true */
2431 release_sock(sk);
2432 return err;
2433 }
2434
2435 /* Allocate ancillary memory before locking.
2436 */
2437 if (ctd.tcpct_used > 0 ||
2438 (tp->cookie_values == NULL &&
2439 (sysctl_tcp_cookie_size > 0 ||
2440 ctd.tcpct_cookie_desired > 0 ||
2441 ctd.tcpct_s_data_desired > 0))) {
2442 cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
2443 GFP_KERNEL);
2444 if (cvp == NULL)
2445 return -ENOMEM;
2446
2447 kref_init(&cvp->kref);
2448 }
2449 lock_sock(sk);
2450 tp->rx_opt.cookie_in_always =
2451 (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
2452 tp->rx_opt.cookie_out_never = 0; /* false */
2453
2454 if (tp->cookie_values != NULL) {
2455 if (cvp != NULL) {
2456 /* Changed values are recorded by a changed
2457 * pointer, ensuring the cookie will differ,
2458 * without separately hashing each value later.
2459 */
2460 kref_put(&tp->cookie_values->kref,
2461 tcp_cookie_values_release);
2462 } else {
2463 cvp = tp->cookie_values;
2464 }
2465 }
2466
2467 if (cvp != NULL) {
2468 cvp->cookie_desired = ctd.tcpct_cookie_desired;
2469
2470 if (ctd.tcpct_used > 0) {
2471 memcpy(cvp->s_data_payload, ctd.tcpct_value,
2472 ctd.tcpct_used);
2473 cvp->s_data_desired = ctd.tcpct_used;
2474 cvp->s_data_constant = 1; /* true */
2475 } else {
2476 /* No constant payload data. */
2477 cvp->s_data_desired = ctd.tcpct_s_data_desired;
2478 cvp->s_data_constant = 0; /* false */
2479 }
2480
2481 tp->cookie_values = cvp;
2482 }
2483 release_sock(sk);
2484 return err;
2485 }
2486 default: 2391 default:
2487 /* fallthru */ 2392 /* fallthru */
2488 break; 2393 break;
@@ -2902,41 +2807,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2902 return -EFAULT; 2807 return -EFAULT;
2903 return 0; 2808 return 0;
2904 2809
2905 case TCP_COOKIE_TRANSACTIONS: {
2906 struct tcp_cookie_transactions ctd;
2907 struct tcp_cookie_values *cvp = tp->cookie_values;
2908
2909 if (get_user(len, optlen))
2910 return -EFAULT;
2911 if (len < sizeof(ctd))
2912 return -EINVAL;
2913
2914 memset(&ctd, 0, sizeof(ctd));
2915 ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
2916 TCP_COOKIE_IN_ALWAYS : 0)
2917 | (tp->rx_opt.cookie_out_never ?
2918 TCP_COOKIE_OUT_NEVER : 0);
2919
2920 if (cvp != NULL) {
2921 ctd.tcpct_flags |= (cvp->s_data_in ?
2922 TCP_S_DATA_IN : 0)
2923 | (cvp->s_data_out ?
2924 TCP_S_DATA_OUT : 0);
2925
2926 ctd.tcpct_cookie_desired = cvp->cookie_desired;
2927 ctd.tcpct_s_data_desired = cvp->s_data_desired;
2928
2929 memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
2930 cvp->cookie_pair_size);
2931 ctd.tcpct_used = cvp->cookie_pair_size;
2932 }
2933
2934 if (put_user(sizeof(ctd), optlen))
2935 return -EFAULT;
2936 if (copy_to_user(optval, &ctd, sizeof(ctd)))
2937 return -EFAULT;
2938 return 0;
2939 }
2940 case TCP_THIN_LINEAR_TIMEOUTS: 2810 case TCP_THIN_LINEAR_TIMEOUTS:
2941 val = tp->thin_lto; 2811 val = tp->thin_lto;
2942 break; 2812 break;
@@ -3044,6 +2914,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
3044 SKB_GSO_TCP_ECN | 2914 SKB_GSO_TCP_ECN |
3045 SKB_GSO_TCPV6 | 2915 SKB_GSO_TCPV6 |
3046 SKB_GSO_GRE | 2916 SKB_GSO_GRE |
2917 SKB_GSO_UDP_TUNNEL |
3047 0) || 2918 0) ||
3048 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) 2919 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
3049 goto out; 2920 goto out;
@@ -3408,134 +3279,6 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
3408 3279
3409#endif 3280#endif
3410 3281
3411/* Each Responder maintains up to two secret values concurrently for
3412 * efficient secret rollover. Each secret value has 4 states:
3413 *
3414 * Generating. (tcp_secret_generating != tcp_secret_primary)
3415 * Generates new Responder-Cookies, but not yet used for primary
3416 * verification. This is a short-term state, typically lasting only
3417 * one round trip time (RTT).
3418 *
3419 * Primary. (tcp_secret_generating == tcp_secret_primary)
3420 * Used both for generation and primary verification.
3421 *
3422 * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
3423 * Used for verification, until the first failure that can be
3424 * verified by the newer Generating secret. At that time, this
3425 * cookie's state is changed to Secondary, and the Generating
3426 * cookie's state is changed to Primary. This is a short-term state,
3427 * typically lasting only one round trip time (RTT).
3428 *
3429 * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
3430 * Used for secondary verification, after primary verification
3431 * failures. This state lasts no more than twice the Maximum Segment
3432 * Lifetime (2MSL). Then, the secret is discarded.
3433 */
3434struct tcp_cookie_secret {
3435 /* The secret is divided into two parts. The digest part is the
3436 * equivalent of previously hashing a secret and saving the state,
3437 * and serves as an initialization vector (IV). The message part
3438 * serves as the trailing secret.
3439 */
3440 u32 secrets[COOKIE_WORKSPACE_WORDS];
3441 unsigned long expires;
3442};
3443
3444#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
3445#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
3446#define TCP_SECRET_LIFE (HZ * 600)
3447
3448static struct tcp_cookie_secret tcp_secret_one;
3449static struct tcp_cookie_secret tcp_secret_two;
3450
3451/* Essentially a circular list, without dynamic allocation. */
3452static struct tcp_cookie_secret *tcp_secret_generating;
3453static struct tcp_cookie_secret *tcp_secret_primary;
3454static struct tcp_cookie_secret *tcp_secret_retiring;
3455static struct tcp_cookie_secret *tcp_secret_secondary;
3456
3457static DEFINE_SPINLOCK(tcp_secret_locker);
3458
3459/* Select a pseudo-random word in the cookie workspace.
3460 */
3461static inline u32 tcp_cookie_work(const u32 *ws, const int n)
3462{
3463 return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
3464}
3465
3466/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
3467 * Called in softirq context.
3468 * Returns: 0 for success.
3469 */
3470int tcp_cookie_generator(u32 *bakery)
3471{
3472 unsigned long jiffy = jiffies;
3473
3474 if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
3475 spin_lock_bh(&tcp_secret_locker);
3476 if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
3477 /* refreshed by another */
3478 memcpy(bakery,
3479 &tcp_secret_generating->secrets[0],
3480 COOKIE_WORKSPACE_WORDS);
3481 } else {
3482 /* still needs refreshing */
3483 get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
3484
3485 /* The first time, paranoia assumes that the
3486 * randomization function isn't as strong. But,
3487 * this secret initialization is delayed until
3488 * the last possible moment (packet arrival).
3489 * Although that time is observable, it is
3490 * unpredictably variable. Mash in the most
3491 * volatile clock bits available, and expire the
3492 * secret extra quickly.
3493 */
3494 if (unlikely(tcp_secret_primary->expires ==
3495 tcp_secret_secondary->expires)) {
3496 struct timespec tv;
3497
3498 getnstimeofday(&tv);
3499 bakery[COOKIE_DIGEST_WORDS+0] ^=
3500 (u32)tv.tv_nsec;
3501
3502 tcp_secret_secondary->expires = jiffy
3503 + TCP_SECRET_1MSL
3504 + (0x0f & tcp_cookie_work(bakery, 0));
3505 } else {
3506 tcp_secret_secondary->expires = jiffy
3507 + TCP_SECRET_LIFE
3508 + (0xff & tcp_cookie_work(bakery, 1));
3509 tcp_secret_primary->expires = jiffy
3510 + TCP_SECRET_2MSL
3511 + (0x1f & tcp_cookie_work(bakery, 2));
3512 }
3513 memcpy(&tcp_secret_secondary->secrets[0],
3514 bakery, COOKIE_WORKSPACE_WORDS);
3515
3516 rcu_assign_pointer(tcp_secret_generating,
3517 tcp_secret_secondary);
3518 rcu_assign_pointer(tcp_secret_retiring,
3519 tcp_secret_primary);
3520 /*
3521 * Neither call_rcu() nor synchronize_rcu() needed.
3522 * Retiring data is not freed. It is replaced after
3523 * further (locked) pointer updates, and a quiet time
3524 * (minimum 1MSL, maximum LIFE - 2MSL).
3525 */
3526 }
3527 spin_unlock_bh(&tcp_secret_locker);
3528 } else {
3529 rcu_read_lock_bh();
3530 memcpy(bakery,
3531 &rcu_dereference(tcp_secret_generating)->secrets[0],
3532 COOKIE_WORKSPACE_WORDS);
3533 rcu_read_unlock_bh();
3534 }
3535 return 0;
3536}
3537EXPORT_SYMBOL(tcp_cookie_generator);
3538
3539void tcp_done(struct sock *sk) 3282void tcp_done(struct sock *sk)
3540{ 3283{
3541 struct request_sock *req = tcp_sk(sk)->fastopen_rsk; 3284 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
@@ -3590,7 +3333,6 @@ void __init tcp_init(void)
3590 unsigned long limit; 3333 unsigned long limit;
3591 int max_rshare, max_wshare, cnt; 3334 int max_rshare, max_wshare, cnt;
3592 unsigned int i; 3335 unsigned int i;
3593 unsigned long jiffy = jiffies;
3594 3336
3595 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3337 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
3596 3338
@@ -3666,13 +3408,5 @@ void __init tcp_init(void)
3666 3408
3667 tcp_register_congestion_control(&tcp_reno); 3409 tcp_register_congestion_control(&tcp_reno);
3668 3410
3669 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
3670 memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
3671 tcp_secret_one.expires = jiffy; /* past due */
3672 tcp_secret_two.expires = jiffy; /* past due */
3673 tcp_secret_generating = &tcp_secret_one;
3674 tcp_secret_primary = &tcp_secret_one;
3675 tcp_secret_retiring = &tcp_secret_two;
3676 tcp_secret_secondary = &tcp_secret_two;
3677 tcp_tasklet_init(); 3411 tcp_tasklet_init();
3678} 3412}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3bd55bad230a..6d9ca35f0c35 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,12 +93,11 @@ int sysctl_tcp_stdurg __read_mostly;
93int sysctl_tcp_rfc1337 __read_mostly; 93int sysctl_tcp_rfc1337 __read_mostly;
94int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 94int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
95int sysctl_tcp_frto __read_mostly = 2; 95int sysctl_tcp_frto __read_mostly = 2;
96int sysctl_tcp_frto_response __read_mostly;
97 96
98int sysctl_tcp_thin_dupack __read_mostly; 97int sysctl_tcp_thin_dupack __read_mostly;
99 98
100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 99int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
101int sysctl_tcp_early_retrans __read_mostly = 2; 100int sysctl_tcp_early_retrans __read_mostly = 3;
102 101
103#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 102#define FLAG_DATA 0x01 /* Incoming frame contained data. */
104#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 103#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -108,17 +107,15 @@ int sysctl_tcp_early_retrans __read_mostly = 2;
108#define FLAG_DATA_SACKED 0x20 /* New SACK. */ 107#define FLAG_DATA_SACKED 0x20 /* New SACK. */
109#define FLAG_ECE 0x40 /* ECE in this ACK */ 108#define FLAG_ECE 0x40 /* ECE in this ACK */
110#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ 109#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
111#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ 110#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
112#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ 111#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
113#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ 112#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
114#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
115#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ 113#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
116 114
117#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) 115#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
118#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) 116#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
119#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) 117#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
120#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) 118#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
121#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
122 119
123#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) 120#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
124#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) 121#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
@@ -1159,10 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
1159 tcp_highest_sack_seq(tp))) 1156 tcp_highest_sack_seq(tp)))
1160 state->reord = min(fack_count, 1157 state->reord = min(fack_count,
1161 state->reord); 1158 state->reord);
1162 1159 if (!after(end_seq, tp->high_seq))
1163 /* SACK enhanced F-RTO (RFC4138; Appendix B) */ 1160 state->flag |= FLAG_ORIG_SACK_ACKED;
1164 if (!after(end_seq, tp->frto_highmark))
1165 state->flag |= FLAG_ONLY_ORIG_SACKED;
1166 } 1161 }
1167 1162
1168 if (sacked & TCPCB_LOST) { 1163 if (sacked & TCPCB_LOST) {
@@ -1555,7 +1550,6 @@ static int
1555tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1550tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1556 u32 prior_snd_una) 1551 u32 prior_snd_una)
1557{ 1552{
1558 const struct inet_connection_sock *icsk = inet_csk(sk);
1559 struct tcp_sock *tp = tcp_sk(sk); 1553 struct tcp_sock *tp = tcp_sk(sk);
1560 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1554 const unsigned char *ptr = (skb_transport_header(ack_skb) +
1561 TCP_SKB_CB(ack_skb)->sacked); 1555 TCP_SKB_CB(ack_skb)->sacked);
@@ -1728,12 +1722,6 @@ walk:
1728 start_seq, end_seq, dup_sack); 1722 start_seq, end_seq, dup_sack);
1729 1723
1730advance_sp: 1724advance_sp:
1731 /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
1732 * due to in-order walk
1733 */
1734 if (after(end_seq, tp->frto_highmark))
1735 state.flag &= ~FLAG_ONLY_ORIG_SACKED;
1736
1737 i++; 1725 i++;
1738 } 1726 }
1739 1727
@@ -1750,8 +1738,7 @@ advance_sp:
1750 tcp_verify_left_out(tp); 1738 tcp_verify_left_out(tp);
1751 1739
1752 if ((state.reord < tp->fackets_out) && 1740 if ((state.reord < tp->fackets_out) &&
1753 ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && 1741 ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
1754 (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
1755 tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); 1742 tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
1756 1743
1757out: 1744out:
@@ -1825,197 +1812,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1825 tp->sacked_out = 0; 1812 tp->sacked_out = 0;
1826} 1813}
1827 1814
1828static int tcp_is_sackfrto(const struct tcp_sock *tp)
1829{
1830 return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
1831}
1832
1833/* F-RTO can only be used if TCP has never retransmitted anything other than
1834 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
1835 */
1836bool tcp_use_frto(struct sock *sk)
1837{
1838 const struct tcp_sock *tp = tcp_sk(sk);
1839 const struct inet_connection_sock *icsk = inet_csk(sk);
1840 struct sk_buff *skb;
1841
1842 if (!sysctl_tcp_frto)
1843 return false;
1844
1845 /* MTU probe and F-RTO won't really play nicely along currently */
1846 if (icsk->icsk_mtup.probe_size)
1847 return false;
1848
1849 if (tcp_is_sackfrto(tp))
1850 return true;
1851
1852 /* Avoid expensive walking of rexmit queue if possible */
1853 if (tp->retrans_out > 1)
1854 return false;
1855
1856 skb = tcp_write_queue_head(sk);
1857 if (tcp_skb_is_last(sk, skb))
1858 return true;
1859 skb = tcp_write_queue_next(sk, skb); /* Skips head */
1860 tcp_for_write_queue_from(skb, sk) {
1861 if (skb == tcp_send_head(sk))
1862 break;
1863 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1864 return false;
1865 /* Short-circuit when first non-SACKed skb has been checked */
1866 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1867 break;
1868 }
1869 return true;
1870}
1871
1872/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
1873 * recovery a bit and use heuristics in tcp_process_frto() to detect if
1874 * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
1875 * keep retrans_out counting accurate (with SACK F-RTO, other than head
1876 * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
1877 * bits are handled if the Loss state is really to be entered (in
1878 * tcp_enter_frto_loss).
1879 *
1880 * Do like tcp_enter_loss() would; when RTO expires the second time it
1881 * does:
1882 * "Reduce ssthresh if it has not yet been made inside this window."
1883 */
1884void tcp_enter_frto(struct sock *sk)
1885{
1886 const struct inet_connection_sock *icsk = inet_csk(sk);
1887 struct tcp_sock *tp = tcp_sk(sk);
1888 struct sk_buff *skb;
1889
1890 if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
1891 tp->snd_una == tp->high_seq ||
1892 ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
1893 !icsk->icsk_retransmits)) {
1894 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1895 /* Our state is too optimistic in ssthresh() call because cwnd
1896 * is not reduced until tcp_enter_frto_loss() when previous F-RTO
1897 * recovery has not yet completed. Pattern would be this: RTO,
1898 * Cumulative ACK, RTO (2xRTO for the same segment does not end
1899 * up here twice).
1900 * RFC4138 should be more specific on what to do, even though
1901 * RTO is quite unlikely to occur after the first Cumulative ACK
1902 * due to back-off and complexity of triggering events ...
1903 */
1904 if (tp->frto_counter) {
1905 u32 stored_cwnd;
1906 stored_cwnd = tp->snd_cwnd;
1907 tp->snd_cwnd = 2;
1908 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1909 tp->snd_cwnd = stored_cwnd;
1910 } else {
1911 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1912 }
1913 /* ... in theory, cong.control module could do "any tricks" in
1914 * ssthresh(), which means that ca_state, lost bits and lost_out
1915 * counter would have to be faked before the call occurs. We
1916 * consider that too expensive, unlikely and hacky, so modules
1917 * using these in ssthresh() must deal these incompatibility
1918 * issues if they receives CA_EVENT_FRTO and frto_counter != 0
1919 */
1920 tcp_ca_event(sk, CA_EVENT_FRTO);
1921 }
1922
1923 tp->undo_marker = tp->snd_una;
1924 tp->undo_retrans = 0;
1925
1926 skb = tcp_write_queue_head(sk);
1927 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1928 tp->undo_marker = 0;
1929 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
1930 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1931 tp->retrans_out -= tcp_skb_pcount(skb);
1932 }
1933 tcp_verify_left_out(tp);
1934
1935 /* Too bad if TCP was application limited */
1936 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
1937
1938 /* Earlier loss recovery underway (see RFC4138; Appendix B).
1939 * The last condition is necessary at least in tp->frto_counter case.
1940 */
1941 if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
1942 ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
1943 after(tp->high_seq, tp->snd_una)) {
1944 tp->frto_highmark = tp->high_seq;
1945 } else {
1946 tp->frto_highmark = tp->snd_nxt;
1947 }
1948 tcp_set_ca_state(sk, TCP_CA_Disorder);
1949 tp->high_seq = tp->snd_nxt;
1950 tp->frto_counter = 1;
1951}
1952
1953/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
1954 * which indicates that we should follow the traditional RTO recovery,
1955 * i.e. mark everything lost and do go-back-N retransmission.
1956 */
1957static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
1958{
1959 struct tcp_sock *tp = tcp_sk(sk);
1960 struct sk_buff *skb;
1961
1962 tp->lost_out = 0;
1963 tp->retrans_out = 0;
1964 if (tcp_is_reno(tp))
1965 tcp_reset_reno_sack(tp);
1966
1967 tcp_for_write_queue(skb, sk) {
1968 if (skb == tcp_send_head(sk))
1969 break;
1970
1971 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1972 /*
1973 * Count the retransmission made on RTO correctly (only when
1974 * waiting for the first ACK and did not get it)...
1975 */
1976 if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
1977 /* For some reason this R-bit might get cleared? */
1978 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1979 tp->retrans_out += tcp_skb_pcount(skb);
1980 /* ...enter this if branch just for the first segment */
1981 flag |= FLAG_DATA_ACKED;
1982 } else {
1983 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1984 tp->undo_marker = 0;
1985 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1986 }
1987
1988 /* Marking forward transmissions that were made after RTO lost
1989 * can cause unnecessary retransmissions in some scenarios,
1990 * SACK blocks will mitigate that in some but not in all cases.
1991 * We used to not mark them but it was causing break-ups with
1992 * receivers that do only in-order receival.
1993 *
1994 * TODO: we could detect presence of such receiver and select
1995 * different behavior per flow.
1996 */
1997 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1998 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1999 tp->lost_out += tcp_skb_pcount(skb);
2000 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
2001 }
2002 }
2003 tcp_verify_left_out(tp);
2004
2005 tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
2006 tp->snd_cwnd_cnt = 0;
2007 tp->snd_cwnd_stamp = tcp_time_stamp;
2008 tp->frto_counter = 0;
2009
2010 tp->reordering = min_t(unsigned int, tp->reordering,
2011 sysctl_tcp_reordering);
2012 tcp_set_ca_state(sk, TCP_CA_Loss);
2013 tp->high_seq = tp->snd_nxt;
2014 TCP_ECN_queue_cwr(tp);
2015
2016 tcp_clear_all_retrans_hints(tp);
2017}
2018
2019static void tcp_clear_retrans_partial(struct tcp_sock *tp) 1815static void tcp_clear_retrans_partial(struct tcp_sock *tp)
2020{ 1816{
2021 tp->retrans_out = 0; 1817 tp->retrans_out = 0;
@@ -2042,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how)
2042 const struct inet_connection_sock *icsk = inet_csk(sk); 1838 const struct inet_connection_sock *icsk = inet_csk(sk);
2043 struct tcp_sock *tp = tcp_sk(sk); 1839 struct tcp_sock *tp = tcp_sk(sk);
2044 struct sk_buff *skb; 1840 struct sk_buff *skb;
1841 bool new_recovery = false;
2045 1842
2046 /* Reduce ssthresh if it has not yet been made inside this window. */ 1843 /* Reduce ssthresh if it has not yet been made inside this window. */
2047 if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1844 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1845 !after(tp->high_seq, tp->snd_una) ||
2048 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { 1846 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1847 new_recovery = true;
2049 tp->prior_ssthresh = tcp_current_ssthresh(sk); 1848 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2050 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 1849 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
2051 tcp_ca_event(sk, CA_EVENT_LOSS); 1850 tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2087,8 +1886,14 @@ void tcp_enter_loss(struct sock *sk, int how)
2087 tcp_set_ca_state(sk, TCP_CA_Loss); 1886 tcp_set_ca_state(sk, TCP_CA_Loss);
2088 tp->high_seq = tp->snd_nxt; 1887 tp->high_seq = tp->snd_nxt;
2089 TCP_ECN_queue_cwr(tp); 1888 TCP_ECN_queue_cwr(tp);
2090 /* Abort F-RTO algorithm if one is in progress */ 1889
2091 tp->frto_counter = 0; 1890 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
1891 * loss recovery is underway except recurring timeout(s) on
1892 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
1893 */
1894 tp->frto = sysctl_tcp_frto &&
1895 (new_recovery || icsk->icsk_retransmits) &&
1896 !inet_csk(sk)->icsk_mtup.probe_size;
2092} 1897}
2093 1898
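Note: the frto_counter state machine is replaced by a single flag armed while entering Loss. A condensed restatement of the new condition introduced above; the helper name and parameters are illustrative, not kernel API.

#include <stdbool.h>

/* RFC 5682 sec 3.1 step 1, as wired up in tcp_enter_loss(): attempt F-RTO
 * only when enabled by sysctl, when this timeout starts a new recovery
 * episode or is a recurring timeout on the same SND.UNA, and never while a
 * path-MTU probe is outstanding.
 */
static inline bool frto_should_arm(int sysctl_frto, bool new_recovery,
				   int retransmits, int mtu_probe_size)
{
	return sysctl_frto && (new_recovery || retransmits) && !mtu_probe_size;
}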
2094/* If ACK arrived pointing to a remembered SACK, it means that our 1899/* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2147,15 +1952,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2147 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples 1952 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2148 * available, or RTO is scheduled to fire first. 1953 * available, or RTO is scheduled to fire first.
2149 */ 1954 */
2150 if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) 1955 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
1956 (flag & FLAG_ECE) || !tp->srtt)
2151 return false; 1957 return false;
2152 1958
2153 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); 1959 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
2154 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) 1960 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2155 return false; 1961 return false;
2156 1962
2157 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); 1963 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2158 tp->early_retrans_delayed = 1; 1964 TCP_RTO_MAX);
2159 return true; 1965 return true;
2160} 1966}
2161 1967
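For reference, the delay armed in the hunk above works out to max(RTT/4, 2 ms): tp->srtt keeps the smoothed RTT scaled by 8, so (tp->srtt >> 5) is a quarter of the RTT in jiffies. A minimal user-space sketch of the same arithmetic, with illustrative millisecond values rather than kernel jiffies:

/* Sketch of the delayed early-retransmit timer: max(RTT/4, 2 ms).
 * In the kernel tp->srtt holds 8*RTT, so (tp->srtt >> 5) is RTT/4.
 */
#include <stdio.h>

static unsigned long er_delay_ms(unsigned long rtt_ms)
{
        unsigned long quarter_rtt = rtt_ms / 4;         /* (srtt >> 5) */
        return quarter_rtt > 2 ? quarter_rtt : 2;       /* msecs_to_jiffies(2) floor */
}

int main(void)
{
        /* A 40 ms path delays the early retransmit by 10 ms;
         * a 4 ms path hits the 2 ms floor.
         */
        printf("RTT 40ms -> delay %lums\n", er_delay_ms(40));
        printf("RTT  4ms -> delay %lums\n", er_delay_ms(4));
        return 0;
}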
@@ -2271,10 +2077,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2271 struct tcp_sock *tp = tcp_sk(sk); 2077 struct tcp_sock *tp = tcp_sk(sk);
2272 __u32 packets_out; 2078 __u32 packets_out;
2273 2079
2274 /* Do not perform any recovery during F-RTO algorithm */
2275 if (tp->frto_counter)
2276 return false;
2277
2278 /* Trick#1: The loss is proven. */ 2080 /* Trick#1: The loss is proven. */
2279 if (tp->lost_out) 2081 if (tp->lost_out)
2280 return true; 2082 return true;
@@ -2318,7 +2120,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2318 * interval if appropriate. 2120 * interval if appropriate.
2319 */ 2121 */
2320 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && 2122 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2321 (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && 2123 (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
2322 !tcp_may_send_now(sk)) 2124 !tcp_may_send_now(sk))
2323 return !tcp_pause_early_retransmit(sk, flag); 2125 return !tcp_pause_early_retransmit(sk, flag);
2324 2126
@@ -2635,12 +2437,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2635 return failed; 2437 return failed;
2636} 2438}
2637 2439
2638/* Undo during loss recovery after partial ACK. */ 2440/* Undo during loss recovery after partial ACK or using F-RTO. */
2639static bool tcp_try_undo_loss(struct sock *sk) 2441static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2640{ 2442{
2641 struct tcp_sock *tp = tcp_sk(sk); 2443 struct tcp_sock *tp = tcp_sk(sk);
2642 2444
2643 if (tcp_may_undo(tp)) { 2445 if (frto_undo || tcp_may_undo(tp)) {
2644 struct sk_buff *skb; 2446 struct sk_buff *skb;
2645 tcp_for_write_queue(skb, sk) { 2447 tcp_for_write_queue(skb, sk) {
2646 if (skb == tcp_send_head(sk)) 2448 if (skb == tcp_send_head(sk))
@@ -2654,9 +2456,12 @@ static bool tcp_try_undo_loss(struct sock *sk)
2654 tp->lost_out = 0; 2456 tp->lost_out = 0;
2655 tcp_undo_cwr(sk, true); 2457 tcp_undo_cwr(sk, true);
2656 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); 2458 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2459 if (frto_undo)
2460 NET_INC_STATS_BH(sock_net(sk),
2461 LINUX_MIB_TCPSPURIOUSRTOS);
2657 inet_csk(sk)->icsk_retransmits = 0; 2462 inet_csk(sk)->icsk_retransmits = 0;
2658 tp->undo_marker = 0; 2463 tp->undo_marker = 0;
2659 if (tcp_is_sack(tp)) 2464 if (frto_undo || tcp_is_sack(tp))
2660 tcp_set_ca_state(sk, TCP_CA_Open); 2465 tcp_set_ca_state(sk, TCP_CA_Open);
2661 return true; 2466 return true;
2662 } 2467 }
@@ -2678,6 +2483,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2678 struct tcp_sock *tp = tcp_sk(sk); 2483 struct tcp_sock *tp = tcp_sk(sk);
2679 2484
2680 tp->high_seq = tp->snd_nxt; 2485 tp->high_seq = tp->snd_nxt;
2486 tp->tlp_high_seq = 0;
2681 tp->snd_cwnd_cnt = 0; 2487 tp->snd_cwnd_cnt = 0;
2682 tp->prior_cwnd = tp->snd_cwnd; 2488 tp->prior_cwnd = tp->snd_cwnd;
2683 tp->prr_delivered = 0; 2489 tp->prr_delivered = 0;
@@ -2755,7 +2561,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
2755 2561
2756 tcp_verify_left_out(tp); 2562 tcp_verify_left_out(tp);
2757 2563
2758 if (!tp->frto_counter && !tcp_any_retrans_done(sk)) 2564 if (!tcp_any_retrans_done(sk))
2759 tp->retrans_stamp = 0; 2565 tp->retrans_stamp = 0;
2760 2566
2761 if (flag & FLAG_ECE) 2567 if (flag & FLAG_ECE)
@@ -2872,6 +2678,58 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2872 tcp_set_ca_state(sk, TCP_CA_Recovery); 2678 tcp_set_ca_state(sk, TCP_CA_Recovery);
2873} 2679}
2874 2680
2681/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
2682 * recovered or spurious. Otherwise retransmits more on partial ACKs.
2683 */
2684static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
2685{
2686 struct inet_connection_sock *icsk = inet_csk(sk);
2687 struct tcp_sock *tp = tcp_sk(sk);
2688 bool recovered = !before(tp->snd_una, tp->high_seq);
2689
2690 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2691 if (flag & FLAG_ORIG_SACK_ACKED) {
2692 /* Step 3.b. A timeout is spurious if not all data are
2693 * lost, i.e., never-retransmitted data are (s)acked.
2694 */
2695 tcp_try_undo_loss(sk, true);
2696 return;
2697 }
2698 if (after(tp->snd_nxt, tp->high_seq) &&
2699 (flag & FLAG_DATA_SACKED || is_dupack)) {
2700 tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
2701 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2702 tp->high_seq = tp->snd_nxt;
2703 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
2704 TCP_NAGLE_OFF);
2705 if (after(tp->snd_nxt, tp->high_seq))
2706 return; /* Step 2.b */
2707 tp->frto = 0;
2708 }
2709 }
2710
2711 if (recovered) {
2712 /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
2713 icsk->icsk_retransmits = 0;
2714 tcp_try_undo_recovery(sk);
2715 return;
2716 }
2717 if (flag & FLAG_DATA_ACKED)
2718 icsk->icsk_retransmits = 0;
2719 if (tcp_is_reno(tp)) {
2720 /* A Reno DUPACK means new data in F-RTO step 2.b above are
2721 * delivered. Lower inflight to clock out (re)transmissions.
2722 */
2723 if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2724 tcp_add_reno_sack(sk);
2725 else if (flag & FLAG_SND_UNA_ADVANCED)
2726 tcp_reset_reno_sack(tp);
2727 }
2728 if (tcp_try_undo_loss(sk, false))
2729 return;
2730 tcp_xmit_retransmit_queue(sk);
2731}
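The branch structure of tcp_process_loss() follows RFC 5682: an ACK that (s)acks never-retransmitted data proves the timeout spurious, while SACKed retransmissions or a dupack arriving after new data confirm real loss. A condensed user-space sketch of that decision, under the simplifying assumption that the relevant ACK flags have already been computed (names here are illustrative, not kernel APIs):

/* Rough sketch of the F-RTO verdict (RFC 5682 sec 3). */
#include <stdbool.h>
#include <stdio.h>

enum frto_verdict { FRTO_UNDECIDED, FRTO_SPURIOUS, FRTO_REAL_LOSS };

static enum frto_verdict frto_step(bool orig_sack_acked, bool sent_new_data,
                                   bool data_sacked_or_dupack)
{
        if (orig_sack_acked)
                return FRTO_SPURIOUS;           /* step 3.b: undo the RTO */
        if (sent_new_data && data_sacked_or_dupack)
                return FRTO_REAL_LOSS;          /* step 3.a: stay in loss recovery */
        return FRTO_UNDECIDED;                  /* step 2.b: send new data, wait */
}

int main(void)
{
        printf("%d\n", frto_step(true, false, false));  /* spurious timeout */
        printf("%d\n", frto_step(false, true, true));   /* genuine loss */
        return 0;
}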
2732
2875/* Process an event, which can update packets-in-flight not trivially. 2733/* Process an event, which can update packets-in-flight not trivially.
2876 * Main goal of this function is to calculate new estimate for left_out, 2734 * Main goal of this function is to calculate new estimate for left_out,
2877 * taking into account both packets sitting in receiver's buffer and 2735 * taking into account both packets sitting in receiver's buffer and
@@ -2918,12 +2776,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2918 tp->retrans_stamp = 0; 2776 tp->retrans_stamp = 0;
2919 } else if (!before(tp->snd_una, tp->high_seq)) { 2777 } else if (!before(tp->snd_una, tp->high_seq)) {
2920 switch (icsk->icsk_ca_state) { 2778 switch (icsk->icsk_ca_state) {
2921 case TCP_CA_Loss:
2922 icsk->icsk_retransmits = 0;
2923 if (tcp_try_undo_recovery(sk))
2924 return;
2925 break;
2926
2927 case TCP_CA_CWR: 2779 case TCP_CA_CWR:
2928 /* CWR is to be held something *above* high_seq 2780 /* CWR is to be held something *above* high_seq
2929 * is ACKed for CWR bit to reach receiver. */ 2781 * is ACKed for CWR bit to reach receiver. */
@@ -2954,18 +2806,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2954 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; 2806 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2955 break; 2807 break;
2956 case TCP_CA_Loss: 2808 case TCP_CA_Loss:
2957 if (flag & FLAG_DATA_ACKED) 2809 tcp_process_loss(sk, flag, is_dupack);
2958 icsk->icsk_retransmits = 0;
2959 if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
2960 tcp_reset_reno_sack(tp);
2961 if (!tcp_try_undo_loss(sk)) {
2962 tcp_moderate_cwnd(tp);
2963 tcp_xmit_retransmit_queue(sk);
2964 return;
2965 }
2966 if (icsk->icsk_ca_state != TCP_CA_Open) 2810 if (icsk->icsk_ca_state != TCP_CA_Open)
2967 return; 2811 return;
2968 /* Loss is undone; fall through to processing in Open state. */ 2812 /* Fall through to processing in Open state. */
2969 default: 2813 default:
2970 if (tcp_is_reno(tp)) { 2814 if (tcp_is_reno(tp)) {
2971 if (flag & FLAG_SND_UNA_ADVANCED) 2815 if (flag & FLAG_SND_UNA_ADVANCED)
@@ -3078,6 +2922,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
3078 */ 2922 */
3079void tcp_rearm_rto(struct sock *sk) 2923void tcp_rearm_rto(struct sock *sk)
3080{ 2924{
2925 const struct inet_connection_sock *icsk = inet_csk(sk);
3081 struct tcp_sock *tp = tcp_sk(sk); 2926 struct tcp_sock *tp = tcp_sk(sk);
3082 2927
3083 /* If the retrans timer is currently being used by Fast Open 2928 /* If the retrans timer is currently being used by Fast Open
@@ -3091,12 +2936,13 @@ void tcp_rearm_rto(struct sock *sk)
3091 } else { 2936 } else {
3092 u32 rto = inet_csk(sk)->icsk_rto; 2937 u32 rto = inet_csk(sk)->icsk_rto;
3093 /* Offset the time elapsed after installing regular RTO */ 2938 /* Offset the time elapsed after installing regular RTO */
3094 if (tp->early_retrans_delayed) { 2939 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2940 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3095 struct sk_buff *skb = tcp_write_queue_head(sk); 2941 struct sk_buff *skb = tcp_write_queue_head(sk);
3096 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; 2942 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
3097 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); 2943 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3098 /* delta may not be positive if the socket is locked 2944 /* delta may not be positive if the socket is locked
3099 * when the delayed ER timer fires and is rescheduled. 2945 * when the retrans timer fires and is rescheduled.
3100 */ 2946 */
3101 if (delta > 0) 2947 if (delta > 0)
3102 rto = delta; 2948 rto = delta;
@@ -3104,7 +2950,6 @@ void tcp_rearm_rto(struct sock *sk)
3104 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, 2950 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3105 TCP_RTO_MAX); 2951 TCP_RTO_MAX);
3106 } 2952 }
3107 tp->early_retrans_delayed = 0;
3108} 2953}
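The offset logic above keeps the retransmission timer anchored to the send time of the oldest outstanding skb rather than restarting a full RTO whenever the early-retransmit or loss-probe timer had been armed instead. A small stand-alone sketch with illustrative values (plain C, not kernel code):

/* Re-armed RTO = (head skb send time + RTO) - now, falling back to the
 * full RTO if that difference is not positive (e.g. the timer fired
 * while the socket was locked and got rescheduled).
 */
#include <stdio.h>

static unsigned int rearmed_rto_ms(unsigned int rto_ms,
                                   unsigned int head_sent_ms,
                                   unsigned int now_ms)
{
        int delta = (int)(head_sent_ms + rto_ms - now_ms);

        return delta > 0 ? (unsigned int)delta : rto_ms;
}

int main(void)
{
        /* RTO of 300 ms, head skb sent 120 ms ago: re-arm for the remaining 180 ms. */
        printf("%ums\n", rearmed_rto_ms(300, 1000, 1120));
        return 0;
}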
3109 2954
3110/* This function is called when the delayed ER timer fires. TCP enters 2955/* This function is called when the delayed ER timer fires. TCP enters
@@ -3192,8 +3037,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3192 flag |= FLAG_RETRANS_DATA_ACKED; 3037 flag |= FLAG_RETRANS_DATA_ACKED;
3193 ca_seq_rtt = -1; 3038 ca_seq_rtt = -1;
3194 seq_rtt = -1; 3039 seq_rtt = -1;
3195 if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
3196 flag |= FLAG_NONHEAD_RETRANS_ACKED;
3197 } else { 3040 } else {
3198 ca_seq_rtt = now - scb->when; 3041 ca_seq_rtt = now - scb->when;
3199 last_ackt = skb->tstamp; 3042 last_ackt = skb->tstamp;
@@ -3202,6 +3045,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3202 } 3045 }
3203 if (!(sacked & TCPCB_SACKED_ACKED)) 3046 if (!(sacked & TCPCB_SACKED_ACKED))
3204 reord = min(pkts_acked, reord); 3047 reord = min(pkts_acked, reord);
3048 if (!after(scb->end_seq, tp->high_seq))
3049 flag |= FLAG_ORIG_SACK_ACKED;
3205 } 3050 }
3206 3051
3207 if (sacked & TCPCB_SACKED_ACKED) 3052 if (sacked & TCPCB_SACKED_ACKED)
@@ -3402,150 +3247,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
3402 return flag; 3247 return flag;
3403} 3248}
3404 3249
3405/* A very conservative spurious RTO response algorithm: reduce cwnd and
3406 * continue in congestion avoidance.
3407 */
3408static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
3409{
3410 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
3411 tp->snd_cwnd_cnt = 0;
3412 TCP_ECN_queue_cwr(tp);
3413 tcp_moderate_cwnd(tp);
3414}
3415
3416/* A conservative spurious RTO response algorithm: reduce cwnd using
3417 * PRR and continue in congestion avoidance.
3418 */
3419static void tcp_cwr_spur_to_response(struct sock *sk)
3420{
3421 tcp_enter_cwr(sk, 0);
3422}
3423
3424static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3425{
3426 if (flag & FLAG_ECE)
3427 tcp_cwr_spur_to_response(sk);
3428 else
3429 tcp_undo_cwr(sk, true);
3430}
3431
3432/* F-RTO spurious RTO detection algorithm (RFC4138)
3433 *
3434 * F-RTO affects during two new ACKs following RTO (well, almost, see inline
3435 * comments). State (ACK number) is kept in frto_counter. When ACK advances
3436 * window (but not to or beyond highest sequence sent before RTO):
3437 * On First ACK, send two new segments out.
3438 * On Second ACK, RTO was likely spurious. Do spurious response (response
3439 * algorithm is not part of the F-RTO detection algorithm
3440 * given in RFC4138 but can be selected separately).
3441 * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
3442 * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
3443 * of Nagle, this is done using frto_counter states 2 and 3, when a new data
3444 * segment of any size sent during F-RTO, state 2 is upgraded to 3.
3445 *
3446 * Rationale: if the RTO was spurious, new ACKs should arrive from the
3447 * original window even after we transmit two new data segments.
3448 *
3449 * SACK version:
3450 * on first step, wait until first cumulative ACK arrives, then move to
3451 * the second step. In second step, the next ACK decides.
3452 *
3453 * F-RTO is implemented (mainly) in four functions:
3454 * - tcp_use_frto() is used to determine if TCP is can use F-RTO
3455 * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
3456 * called when tcp_use_frto() showed green light
3457 * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
3458 * - tcp_enter_frto_loss() is called if there is not enough evidence
3459 * to prove that the RTO is indeed spurious. It transfers the control
3460 * from F-RTO to the conventional RTO recovery
3461 */
3462static bool tcp_process_frto(struct sock *sk, int flag)
3463{
3464 struct tcp_sock *tp = tcp_sk(sk);
3465
3466 tcp_verify_left_out(tp);
3467
3468 /* Duplicate the behavior from Loss state (fastretrans_alert) */
3469 if (flag & FLAG_DATA_ACKED)
3470 inet_csk(sk)->icsk_retransmits = 0;
3471
3472 if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
3473 ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
3474 tp->undo_marker = 0;
3475
3476 if (!before(tp->snd_una, tp->frto_highmark)) {
3477 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
3478 return true;
3479 }
3480
3481 if (!tcp_is_sackfrto(tp)) {
3482 /* RFC4138 shortcoming in step 2; should also have case c):
3483 * ACK isn't duplicate nor advances window, e.g., opposite dir
3484 * data, winupdate
3485 */
3486 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
3487 return true;
3488
3489 if (!(flag & FLAG_DATA_ACKED)) {
3490 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
3491 flag);
3492 return true;
3493 }
3494 } else {
3495 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
3496 if (!tcp_packets_in_flight(tp)) {
3497 tcp_enter_frto_loss(sk, 2, flag);
3498 return true;
3499 }
3500
3501 /* Prevent sending of new data. */
3502 tp->snd_cwnd = min(tp->snd_cwnd,
3503 tcp_packets_in_flight(tp));
3504 return true;
3505 }
3506
3507 if ((tp->frto_counter >= 2) &&
3508 (!(flag & FLAG_FORWARD_PROGRESS) ||
3509 ((flag & FLAG_DATA_SACKED) &&
3510 !(flag & FLAG_ONLY_ORIG_SACKED)))) {
3511 /* RFC4138 shortcoming (see comment above) */
3512 if (!(flag & FLAG_FORWARD_PROGRESS) &&
3513 (flag & FLAG_NOT_DUP))
3514 return true;
3515
3516 tcp_enter_frto_loss(sk, 3, flag);
3517 return true;
3518 }
3519 }
3520
3521 if (tp->frto_counter == 1) {
3522 /* tcp_may_send_now needs to see updated state */
3523 tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
3524 tp->frto_counter = 2;
3525
3526 if (!tcp_may_send_now(sk))
3527 tcp_enter_frto_loss(sk, 2, flag);
3528
3529 return true;
3530 } else {
3531 switch (sysctl_tcp_frto_response) {
3532 case 2:
3533 tcp_undo_spur_to_response(sk, flag);
3534 break;
3535 case 1:
3536 tcp_conservative_spur_to_response(tp);
3537 break;
3538 default:
3539 tcp_cwr_spur_to_response(sk);
3540 break;
3541 }
3542 tp->frto_counter = 0;
3543 tp->undo_marker = 0;
3544 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
3545 }
3546 return false;
3547}
3548
3549/* RFC 5961 7 [ACK Throttling] */ 3250/* RFC 5961 7 [ACK Throttling] */
3550static void tcp_send_challenge_ack(struct sock *sk) 3251static void tcp_send_challenge_ack(struct sock *sk)
3551{ 3252{
@@ -3564,6 +3265,38 @@ static void tcp_send_challenge_ack(struct sock *sk)
3564 } 3265 }
3565} 3266}
3566 3267
3268/* This routine deals with acks during a TLP episode.
3269 * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
3270 */
3271static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3272{
3273 struct tcp_sock *tp = tcp_sk(sk);
3274 bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
3275 !(flag & (FLAG_SND_UNA_ADVANCED |
3276 FLAG_NOT_DUP | FLAG_DATA_SACKED));
3277
3278 /* Mark the end of TLP episode on receiving TLP dupack or when
3279 * ack is after tlp_high_seq.
3280 */
3281 if (is_tlp_dupack) {
3282 tp->tlp_high_seq = 0;
3283 return;
3284 }
3285
3286 if (after(ack, tp->tlp_high_seq)) {
3287 tp->tlp_high_seq = 0;
3288 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3289 if (!(flag & FLAG_DSACKING_ACK)) {
3290 tcp_init_cwnd_reduction(sk, true);
3291 tcp_set_ca_state(sk, TCP_CA_CWR);
3292 tcp_end_cwnd_reduction(sk);
3293 tcp_set_ca_state(sk, TCP_CA_Open);
3294 NET_INC_STATS_BH(sock_net(sk),
3295 LINUX_MIB_TCPLOSSPROBERECOVERY);
3296 }
3297 }
3298}
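Summarizing the episode bookkeeping above: a pure dupack for tlp_high_seq means only the probe itself was needed, so the TLP episode ends quietly; an ACK beyond tlp_high_seq means the probe masked a real tail loss, and a one-off CWR-style reduction is applied unless the probe was DSACKed. A rough user-space sketch of that classification (the enum and helper names are illustrative, not kernel APIs):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum tlp_outcome { TLP_IN_PROGRESS, TLP_DUPACK, TLP_REPAIRED_LOSS, TLP_DSACKED };

static enum tlp_outcome classify_tlp_ack(uint32_t ack, uint32_t tlp_high_seq,
                                         bool carries_new_info, bool dsack)
{
        if (ack == tlp_high_seq && !carries_new_info)
                return TLP_DUPACK;                      /* episode over, no cwnd change */
        if ((int32_t)(ack - tlp_high_seq) > 0)          /* after(ack, tlp_high_seq) */
                return dsack ? TLP_DSACKED : TLP_REPAIRED_LOSS;
        return TLP_IN_PROGRESS;
}

int main(void)
{
        printf("%d\n", classify_tlp_ack(1000, 1000, false, false)); /* TLP dupack */
        printf("%d\n", classify_tlp_ack(1500, 1000, true, false));  /* probe repaired a loss */
        return 0;
}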
3299
3567/* This routine deals with incoming acks, but not outgoing ones. */ 3300/* This routine deals with incoming acks, but not outgoing ones. */
3568static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3301static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3569{ 3302{
@@ -3578,7 +3311,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3578 int prior_packets; 3311 int prior_packets;
3579 int prior_sacked = tp->sacked_out; 3312 int prior_sacked = tp->sacked_out;
3580 int pkts_acked = 0; 3313 int pkts_acked = 0;
3581 bool frto_cwnd = false;
3582 3314
3583 /* If the ack is older than previous acks 3315 /* If the ack is older than previous acks
3584 * then we can probably ignore it. 3316 * then we can probably ignore it.
@@ -3598,7 +3330,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3598 if (after(ack, tp->snd_nxt)) 3330 if (after(ack, tp->snd_nxt))
3599 goto invalid_ack; 3331 goto invalid_ack;
3600 3332
3601 if (tp->early_retrans_delayed) 3333 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3334 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3602 tcp_rearm_rto(sk); 3335 tcp_rearm_rto(sk);
3603 3336
3604 if (after(ack, prior_snd_una)) 3337 if (after(ack, prior_snd_una))
@@ -3651,30 +3384,29 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3651 3384
3652 pkts_acked = prior_packets - tp->packets_out; 3385 pkts_acked = prior_packets - tp->packets_out;
3653 3386
3654 if (tp->frto_counter)
3655 frto_cwnd = tcp_process_frto(sk, flag);
3656 /* Guarantee sacktag reordering detection against wrap-arounds */
3657 if (before(tp->frto_highmark, tp->snd_una))
3658 tp->frto_highmark = 0;
3659
3660 if (tcp_ack_is_dubious(sk, flag)) { 3387 if (tcp_ack_is_dubious(sk, flag)) {
3661 /* Advance CWND, if state allows this. */ 3388 /* Advance CWND, if state allows this. */
3662 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && 3389 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
3663 tcp_may_raise_cwnd(sk, flag))
3664 tcp_cong_avoid(sk, ack, prior_in_flight); 3390 tcp_cong_avoid(sk, ack, prior_in_flight);
3665 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3391 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3666 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3392 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3667 is_dupack, flag); 3393 is_dupack, flag);
3668 } else { 3394 } else {
3669 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) 3395 if (flag & FLAG_DATA_ACKED)
3670 tcp_cong_avoid(sk, ack, prior_in_flight); 3396 tcp_cong_avoid(sk, ack, prior_in_flight);
3671 } 3397 }
3672 3398
3399 if (tp->tlp_high_seq)
3400 tcp_process_tlp_ack(sk, ack, flag);
3401
3673 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3402 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3674 struct dst_entry *dst = __sk_dst_get(sk); 3403 struct dst_entry *dst = __sk_dst_get(sk);
3675 if (dst) 3404 if (dst)
3676 dst_confirm(dst); 3405 dst_confirm(dst);
3677 } 3406 }
3407
3408 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3409 tcp_schedule_loss_probe(sk);
3678 return 1; 3410 return 1;
3679 3411
3680no_queue: 3412no_queue:
@@ -3688,6 +3420,9 @@ no_queue:
3688 */ 3420 */
3689 if (tcp_send_head(sk)) 3421 if (tcp_send_head(sk))
3690 tcp_ack_probe(sk); 3422 tcp_ack_probe(sk);
3423
3424 if (tp->tlp_high_seq)
3425 tcp_process_tlp_ack(sk, ack, flag);
3691 return 1; 3426 return 1;
3692 3427
3693invalid_ack: 3428invalid_ack:
@@ -3712,8 +3447,8 @@ old_ack:
3712 * But, this can also be called on packets in the established flow when 3447 * But, this can also be called on packets in the established flow when
3713 * the fast version below fails. 3448 * the fast version below fails.
3714 */ 3449 */
3715void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, 3450void tcp_parse_options(const struct sk_buff *skb,
3716 const u8 **hvpp, int estab, 3451 struct tcp_options_received *opt_rx, int estab,
3717 struct tcp_fastopen_cookie *foc) 3452 struct tcp_fastopen_cookie *foc)
3718{ 3453{
3719 const unsigned char *ptr; 3454 const unsigned char *ptr;
@@ -3797,31 +3532,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3797 */ 3532 */
3798 break; 3533 break;
3799#endif 3534#endif
3800 case TCPOPT_COOKIE:
3801 /* This option is variable length.
3802 */
3803 switch (opsize) {
3804 case TCPOLEN_COOKIE_BASE:
3805 /* not yet implemented */
3806 break;
3807 case TCPOLEN_COOKIE_PAIR:
3808 /* not yet implemented */
3809 break;
3810 case TCPOLEN_COOKIE_MIN+0:
3811 case TCPOLEN_COOKIE_MIN+2:
3812 case TCPOLEN_COOKIE_MIN+4:
3813 case TCPOLEN_COOKIE_MIN+6:
3814 case TCPOLEN_COOKIE_MAX:
3815 /* 16-bit multiple */
3816 opt_rx->cookie_plus = opsize;
3817 *hvpp = ptr;
3818 break;
3819 default:
3820 /* ignore option */
3821 break;
3822 }
3823 break;
3824
3825 case TCPOPT_EXP: 3535 case TCPOPT_EXP:
3826 /* Fast Open option shares code 254 using a 3536 /* Fast Open option shares code 254 using a
3827 * 16 bits magic number. It's valid only in 3537 * 16 bits magic number. It's valid only in
@@ -3867,8 +3577,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
3867 * If it is wrong it falls back on tcp_parse_options(). 3577 * If it is wrong it falls back on tcp_parse_options().
3868 */ 3578 */
3869static bool tcp_fast_parse_options(const struct sk_buff *skb, 3579static bool tcp_fast_parse_options(const struct sk_buff *skb,
3870 const struct tcphdr *th, 3580 const struct tcphdr *th, struct tcp_sock *tp)
3871 struct tcp_sock *tp, const u8 **hvpp)
3872{ 3581{
3873 /* In the spirit of fast parsing, compare doff directly to constant 3582 /* In the spirit of fast parsing, compare doff directly to constant
3874 * values. Because equality is used, short doff can be ignored here. 3583 * values. Because equality is used, short doff can be ignored here.
@@ -3882,7 +3591,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3882 return true; 3591 return true;
3883 } 3592 }
3884 3593
3885 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); 3594 tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3886 if (tp->rx_opt.saw_tstamp) 3595 if (tp->rx_opt.saw_tstamp)
3887 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 3596 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3888 3597
@@ -5263,12 +4972,10 @@ out:
5263static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 4972static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5264 const struct tcphdr *th, int syn_inerr) 4973 const struct tcphdr *th, int syn_inerr)
5265{ 4974{
5266 const u8 *hash_location;
5267 struct tcp_sock *tp = tcp_sk(sk); 4975 struct tcp_sock *tp = tcp_sk(sk);
5268 4976
5269 /* RFC1323: H1. Apply PAWS check first. */ 4977 /* RFC1323: H1. Apply PAWS check first. */
5270 if (tcp_fast_parse_options(skb, th, tp, &hash_location) && 4978 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
5271 tp->rx_opt.saw_tstamp &&
5272 tcp_paws_discard(sk, skb)) { 4979 tcp_paws_discard(sk, skb)) {
5273 if (!th->rst) { 4980 if (!th->rst) {
5274 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 4981 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5622,12 +5329,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5622 5329
5623 if (mss == tp->rx_opt.user_mss) { 5330 if (mss == tp->rx_opt.user_mss) {
5624 struct tcp_options_received opt; 5331 struct tcp_options_received opt;
5625 const u8 *hash_location;
5626 5332
5627 /* Get original SYNACK MSS value if user MSS sets mss_clamp */ 5333 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
5628 tcp_clear_options(&opt); 5334 tcp_clear_options(&opt);
5629 opt.user_mss = opt.mss_clamp = 0; 5335 opt.user_mss = opt.mss_clamp = 0;
5630 tcp_parse_options(synack, &opt, &hash_location, 0, NULL); 5336 tcp_parse_options(synack, &opt, 0, NULL);
5631 mss = opt.mss_clamp; 5337 mss = opt.mss_clamp;
5632 } 5338 }
5633 5339
@@ -5658,14 +5364,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5658static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5364static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5659 const struct tcphdr *th, unsigned int len) 5365 const struct tcphdr *th, unsigned int len)
5660{ 5366{
5661 const u8 *hash_location;
5662 struct inet_connection_sock *icsk = inet_csk(sk); 5367 struct inet_connection_sock *icsk = inet_csk(sk);
5663 struct tcp_sock *tp = tcp_sk(sk); 5368 struct tcp_sock *tp = tcp_sk(sk);
5664 struct tcp_cookie_values *cvp = tp->cookie_values;
5665 struct tcp_fastopen_cookie foc = { .len = -1 }; 5369 struct tcp_fastopen_cookie foc = { .len = -1 };
5666 int saved_clamp = tp->rx_opt.mss_clamp; 5370 int saved_clamp = tp->rx_opt.mss_clamp;
5667 5371
5668 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); 5372 tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
5669 if (tp->rx_opt.saw_tstamp) 5373 if (tp->rx_opt.saw_tstamp)
5670 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 5374 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5671 5375
@@ -5762,30 +5466,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5762 * is initialized. */ 5466 * is initialized. */
5763 tp->copied_seq = tp->rcv_nxt; 5467 tp->copied_seq = tp->rcv_nxt;
5764 5468
5765 if (cvp != NULL &&
5766 cvp->cookie_pair_size > 0 &&
5767 tp->rx_opt.cookie_plus > 0) {
5768 int cookie_size = tp->rx_opt.cookie_plus
5769 - TCPOLEN_COOKIE_BASE;
5770 int cookie_pair_size = cookie_size
5771 + cvp->cookie_desired;
5772
5773 /* A cookie extension option was sent and returned.
5774 * Note that each incoming SYNACK replaces the
5775 * Responder cookie. The initial exchange is most
5776 * fragile, as protection against spoofing relies
5777 * entirely upon the sequence and timestamp (above).
5778 * This replacement strategy allows the correct pair to
5779 * pass through, while any others will be filtered via
5780 * Responder verification later.
5781 */
5782 if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
5783 memcpy(&cvp->cookie_pair[cvp->cookie_desired],
5784 hash_location, cookie_size);
5785 cvp->cookie_pair_size = cookie_pair_size;
5786 }
5787 }
5788
5789 smp_mb(); 5469 smp_mb();
5790 5470
5791 tcp_finish_connect(sk, skb); 5471 tcp_finish_connect(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d09203c63264..2278669b1d85 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -838,7 +838,6 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
838 */ 838 */
839static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 839static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
840 struct request_sock *req, 840 struct request_sock *req,
841 struct request_values *rvp,
842 u16 queue_mapping, 841 u16 queue_mapping,
843 bool nocache) 842 bool nocache)
844{ 843{
@@ -851,7 +850,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
851 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 850 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
852 return -1; 851 return -1;
853 852
854 skb = tcp_make_synack(sk, dst, req, rvp, NULL); 853 skb = tcp_make_synack(sk, dst, req, NULL);
855 854
856 if (skb) { 855 if (skb) {
857 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 856 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -868,10 +867,9 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
868 return err; 867 return err;
869} 868}
870 869
871static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, 870static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
872 struct request_values *rvp)
873{ 871{
874 int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); 872 int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
875 873
876 if (!res) 874 if (!res)
877 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 875 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -1371,8 +1369,7 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1371static int tcp_v4_conn_req_fastopen(struct sock *sk, 1369static int tcp_v4_conn_req_fastopen(struct sock *sk,
1372 struct sk_buff *skb, 1370 struct sk_buff *skb,
1373 struct sk_buff *skb_synack, 1371 struct sk_buff *skb_synack,
1374 struct request_sock *req, 1372 struct request_sock *req)
1375 struct request_values *rvp)
1376{ 1373{
1377 struct tcp_sock *tp = tcp_sk(sk); 1374 struct tcp_sock *tp = tcp_sk(sk);
1378 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; 1375 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
@@ -1467,9 +1464,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
1467 1464
1468int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1465int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1469{ 1466{
1470 struct tcp_extend_values tmp_ext;
1471 struct tcp_options_received tmp_opt; 1467 struct tcp_options_received tmp_opt;
1472 const u8 *hash_location;
1473 struct request_sock *req; 1468 struct request_sock *req;
1474 struct inet_request_sock *ireq; 1469 struct inet_request_sock *ireq;
1475 struct tcp_sock *tp = tcp_sk(sk); 1470 struct tcp_sock *tp = tcp_sk(sk);
@@ -1519,42 +1514,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1519 tcp_clear_options(&tmp_opt); 1514 tcp_clear_options(&tmp_opt);
1520 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1515 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1521 tmp_opt.user_mss = tp->rx_opt.user_mss; 1516 tmp_opt.user_mss = tp->rx_opt.user_mss;
1522 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, 1517 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1523 want_cookie ? NULL : &foc);
1524
1525 if (tmp_opt.cookie_plus > 0 &&
1526 tmp_opt.saw_tstamp &&
1527 !tp->rx_opt.cookie_out_never &&
1528 (sysctl_tcp_cookie_size > 0 ||
1529 (tp->cookie_values != NULL &&
1530 tp->cookie_values->cookie_desired > 0))) {
1531 u8 *c;
1532 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1533 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1534
1535 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1536 goto drop_and_release;
1537
1538 /* Secret recipe starts with IP addresses */
1539 *mess++ ^= (__force u32)daddr;
1540 *mess++ ^= (__force u32)saddr;
1541
1542 /* plus variable length Initiator Cookie */
1543 c = (u8 *)mess;
1544 while (l-- > 0)
1545 *c++ ^= *hash_location++;
1546
1547 want_cookie = false; /* not our kind of cookie */
1548 tmp_ext.cookie_out_never = 0; /* false */
1549 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1550 } else if (!tp->rx_opt.cookie_in_always) {
1551 /* redundant indications, but ensure initialization. */
1552 tmp_ext.cookie_out_never = 1; /* true */
1553 tmp_ext.cookie_plus = 0;
1554 } else {
1555 goto drop_and_release;
1556 }
1557 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1558 1518
1559 if (want_cookie && !tmp_opt.saw_tstamp) 1519 if (want_cookie && !tmp_opt.saw_tstamp)
1560 tcp_clear_options(&tmp_opt); 1520 tcp_clear_options(&tmp_opt);
@@ -1636,7 +1596,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1636 * of tcp_v4_send_synack()->tcp_select_initial_window(). 1596 * of tcp_v4_send_synack()->tcp_select_initial_window().
1637 */ 1597 */
1638 skb_synack = tcp_make_synack(sk, dst, req, 1598 skb_synack = tcp_make_synack(sk, dst, req,
1639 (struct request_values *)&tmp_ext,
1640 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); 1599 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1641 1600
1642 if (skb_synack) { 1601 if (skb_synack) {
@@ -1660,8 +1619,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1660 if (fastopen_cookie_present(&foc) && foc.len != 0) 1619 if (fastopen_cookie_present(&foc) && foc.len != 0)
1661 NET_INC_STATS_BH(sock_net(sk), 1620 NET_INC_STATS_BH(sock_net(sk),
1662 LINUX_MIB_TCPFASTOPENPASSIVEFAIL); 1621 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1663 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req, 1622 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1664 (struct request_values *)&tmp_ext))
1665 goto drop_and_free; 1623 goto drop_and_free;
1666 1624
1667 return 0; 1625 return 0;
@@ -1950,6 +1908,50 @@ void tcp_v4_early_demux(struct sk_buff *skb)
1950 } 1908 }
1951} 1909}
1952 1910
1911/* Packet is added to VJ-style prequeue for processing in process
1912 * context, if a reader task is waiting. Apparently, this exciting
1913 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1914 * failed somewhere. Latency? Burstiness? Well, at least now we will
1915 * see, why it failed. 8)8) --ANK
1916 *
1917 */
1918bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1919{
1920 struct tcp_sock *tp = tcp_sk(sk);
1921
1922 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1923 return false;
1924
1925 if (skb->len <= tcp_hdrlen(skb) &&
1926 skb_queue_len(&tp->ucopy.prequeue) == 0)
1927 return false;
1928
1929 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1930 tp->ucopy.memory += skb->truesize;
1931 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1932 struct sk_buff *skb1;
1933
1934 BUG_ON(sock_owned_by_user(sk));
1935
1936 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1937 sk_backlog_rcv(sk, skb1);
1938 NET_INC_STATS_BH(sock_net(sk),
1939 LINUX_MIB_TCPPREQUEUEDROPPED);
1940 }
1941
1942 tp->ucopy.memory = 0;
1943 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1944 wake_up_interruptible_sync_poll(sk_sleep(sk),
1945 POLLIN | POLLRDNORM | POLLRDBAND);
1946 if (!inet_csk_ack_scheduled(sk))
1947 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1948 (3 * tcp_rto_min(sk)) / 4,
1949 TCP_RTO_MAX);
1950 }
1951 return true;
1952}
1953EXPORT_SYMBOL(tcp_prequeue);
1954
1953/* 1955/*
1954 * From tcp_input.c 1956 * From tcp_input.c
1955 */ 1957 */
@@ -2197,12 +2199,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
2197 if (inet_csk(sk)->icsk_bind_hash) 2199 if (inet_csk(sk)->icsk_bind_hash)
2198 inet_put_port(sk); 2200 inet_put_port(sk);
2199 2201
2200 /* TCP Cookie Transactions */
2201 if (tp->cookie_values != NULL) {
2202 kref_put(&tp->cookie_values->kref,
2203 tcp_cookie_values_release);
2204 tp->cookie_values = NULL;
2205 }
2206 BUG_ON(tp->fastopen_rsk != NULL); 2202 BUG_ON(tp->fastopen_rsk != NULL);
2207 2203
2208 /* If socket is aborted during connect operation */ 2204 /* If socket is aborted during connect operation */
@@ -2659,7 +2655,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2659 __u16 srcp = ntohs(inet->inet_sport); 2655 __u16 srcp = ntohs(inet->inet_sport);
2660 int rx_queue; 2656 int rx_queue;
2661 2657
2662 if (icsk->icsk_pending == ICSK_TIME_RETRANS) { 2658 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2659 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2660 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2663 timer_active = 1; 2661 timer_active = 1;
2664 timer_expires = icsk->icsk_timeout; 2662 timer_expires = icsk->icsk_timeout;
2665 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2663 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b83a49cc3816..05eaf8904613 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -93,13 +93,12 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
93 const struct tcphdr *th) 93 const struct tcphdr *th)
94{ 94{
95 struct tcp_options_received tmp_opt; 95 struct tcp_options_received tmp_opt;
96 const u8 *hash_location;
97 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 96 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
98 bool paws_reject = false; 97 bool paws_reject = false;
99 98
100 tmp_opt.saw_tstamp = 0; 99 tmp_opt.saw_tstamp = 0;
101 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 100 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
102 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 101 tcp_parse_options(skb, &tmp_opt, 0, NULL);
103 102
104 if (tmp_opt.saw_tstamp) { 103 if (tmp_opt.saw_tstamp) {
105 tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; 104 tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
@@ -388,32 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
388 struct tcp_request_sock *treq = tcp_rsk(req); 387 struct tcp_request_sock *treq = tcp_rsk(req);
389 struct inet_connection_sock *newicsk = inet_csk(newsk); 388 struct inet_connection_sock *newicsk = inet_csk(newsk);
390 struct tcp_sock *newtp = tcp_sk(newsk); 389 struct tcp_sock *newtp = tcp_sk(newsk);
391 struct tcp_sock *oldtp = tcp_sk(sk);
392 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
393
394 /* TCP Cookie Transactions require space for the cookie pair,
395 * as it differs for each connection. There is no need to
396 * copy any s_data_payload stored at the original socket.
397 * Failure will prevent resuming the connection.
398 *
399 * Presumed copied, in order of appearance:
400 * cookie_in_always, cookie_out_never
401 */
402 if (oldcvp != NULL) {
403 struct tcp_cookie_values *newcvp =
404 kzalloc(sizeof(*newtp->cookie_values),
405 GFP_ATOMIC);
406
407 if (newcvp != NULL) {
408 kref_init(&newcvp->kref);
409 newcvp->cookie_desired =
410 oldcvp->cookie_desired;
411 newtp->cookie_values = newcvp;
412 } else {
413 /* Not Yet Implemented */
414 newtp->cookie_values = NULL;
415 }
416 }
417 390
418 /* Now setup tcp_sock */ 391 /* Now setup tcp_sock */
419 newtp->pred_flags = 0; 392 newtp->pred_flags = 0;
@@ -422,8 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
422 newtp->rcv_nxt = treq->rcv_isn + 1; 395 newtp->rcv_nxt = treq->rcv_isn + 1;
423 396
424 newtp->snd_sml = newtp->snd_una = 397 newtp->snd_sml = newtp->snd_una =
425 newtp->snd_nxt = newtp->snd_up = 398 newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
426 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
427 399
428 tcp_prequeue_init(newtp); 400 tcp_prequeue_init(newtp);
429 INIT_LIST_HEAD(&newtp->tsq_node); 401 INIT_LIST_HEAD(&newtp->tsq_node);
@@ -440,6 +412,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
440 newtp->fackets_out = 0; 412 newtp->fackets_out = 0;
441 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 413 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
442 tcp_enable_early_retrans(newtp); 414 tcp_enable_early_retrans(newtp);
415 newtp->tlp_high_seq = 0;
443 416
444 /* So many TCP implementations out there (incorrectly) count the 417 /* So many TCP implementations out there (incorrectly) count the
445 * initial SYN frame in their delayed-ACK and congestion control 418 * initial SYN frame in their delayed-ACK and congestion control
@@ -449,9 +422,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
449 newtp->snd_cwnd = TCP_INIT_CWND; 422 newtp->snd_cwnd = TCP_INIT_CWND;
450 newtp->snd_cwnd_cnt = 0; 423 newtp->snd_cwnd_cnt = 0;
451 424
452 newtp->frto_counter = 0;
453 newtp->frto_highmark = 0;
454
455 if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && 425 if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
456 !try_module_get(newicsk->icsk_ca_ops->owner)) 426 !try_module_get(newicsk->icsk_ca_ops->owner))
457 newicsk->icsk_ca_ops = &tcp_init_congestion_ops; 427 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -459,8 +429,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
459 tcp_set_ca_state(newsk, TCP_CA_Open); 429 tcp_set_ca_state(newsk, TCP_CA_Open);
460 tcp_init_xmit_timers(newsk); 430 tcp_init_xmit_timers(newsk);
461 skb_queue_head_init(&newtp->out_of_order_queue); 431 skb_queue_head_init(&newtp->out_of_order_queue);
462 newtp->write_seq = newtp->pushed_seq = 432 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
463 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
464 433
465 newtp->rx_opt.saw_tstamp = 0; 434 newtp->rx_opt.saw_tstamp = 0;
466 435
@@ -537,7 +506,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
537 bool fastopen) 506 bool fastopen)
538{ 507{
539 struct tcp_options_received tmp_opt; 508 struct tcp_options_received tmp_opt;
540 const u8 *hash_location;
541 struct sock *child; 509 struct sock *child;
542 const struct tcphdr *th = tcp_hdr(skb); 510 const struct tcphdr *th = tcp_hdr(skb);
543 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 511 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
@@ -547,7 +515,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
547 515
548 tmp_opt.saw_tstamp = 0; 516 tmp_opt.saw_tstamp = 0;
549 if (th->doff > (sizeof(struct tcphdr)>>2)) { 517 if (th->doff > (sizeof(struct tcphdr)>>2)) {
550 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 518 tcp_parse_options(skb, &tmp_opt, 0, NULL);
551 519
552 if (tmp_opt.saw_tstamp) { 520 if (tmp_opt.saw_tstamp) {
553 tmp_opt.ts_recent = req->ts_recent; 521 tmp_opt.ts_recent = req->ts_recent;
@@ -647,7 +615,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
647 */ 615 */
648 if ((flg & TCP_FLAG_ACK) && !fastopen && 616 if ((flg & TCP_FLAG_ACK) && !fastopen &&
649 (TCP_SKB_CB(skb)->ack_seq != 617 (TCP_SKB_CB(skb)->ack_seq !=
650 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 618 tcp_rsk(req)->snt_isn + 1))
651 return sk; 619 return sk;
652 620
653 /* Also, it would be not so bad idea to check rcv_tsecr, which 621 /* Also, it would be not so bad idea to check rcv_tsecr, which
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5d0b4387cba6..af354c98fdb5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,27 +65,22 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
65/* By default, RFC2861 behavior. */ 65/* By default, RFC2861 behavior. */
66int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 66int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67 67
68int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
69EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
70
71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 68static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72 int push_one, gfp_t gfp); 69 int push_one, gfp_t gfp);
73 70
74/* Account for new data that has been sent to the network. */ 71/* Account for new data that has been sent to the network. */
75static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 72static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
76{ 73{
74 struct inet_connection_sock *icsk = inet_csk(sk);
77 struct tcp_sock *tp = tcp_sk(sk); 75 struct tcp_sock *tp = tcp_sk(sk);
78 unsigned int prior_packets = tp->packets_out; 76 unsigned int prior_packets = tp->packets_out;
79 77
80 tcp_advance_send_head(sk, skb); 78 tcp_advance_send_head(sk, skb);
81 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 79 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
82 80
83 /* Don't override Nagle indefinitely with F-RTO */
84 if (tp->frto_counter == 2)
85 tp->frto_counter = 3;
86
87 tp->packets_out += tcp_skb_pcount(skb); 81 tp->packets_out += tcp_skb_pcount(skb);
88 if (!prior_packets || tp->early_retrans_delayed) 82 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
83 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
89 tcp_rearm_rto(sk); 84 tcp_rearm_rto(sk);
90} 85}
91 86
@@ -384,7 +379,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
384#define OPTION_TS (1 << 1) 379#define OPTION_TS (1 << 1)
385#define OPTION_MD5 (1 << 2) 380#define OPTION_MD5 (1 << 2)
386#define OPTION_WSCALE (1 << 3) 381#define OPTION_WSCALE (1 << 3)
387#define OPTION_COOKIE_EXTENSION (1 << 4)
388#define OPTION_FAST_OPEN_COOKIE (1 << 8) 382#define OPTION_FAST_OPEN_COOKIE (1 << 8)
389 383
390struct tcp_out_options { 384struct tcp_out_options {
@@ -398,36 +392,6 @@ struct tcp_out_options {
398 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ 392 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
399}; 393};
400 394
401/* The sysctl int routines are generic, so check consistency here.
402 */
403static u8 tcp_cookie_size_check(u8 desired)
404{
405 int cookie_size;
406
407 if (desired > 0)
408 /* previously specified */
409 return desired;
410
411 cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
412 if (cookie_size <= 0)
413 /* no default specified */
414 return 0;
415
416 if (cookie_size <= TCP_COOKIE_MIN)
417 /* value too small, specify minimum */
418 return TCP_COOKIE_MIN;
419
420 if (cookie_size >= TCP_COOKIE_MAX)
421 /* value too large, specify maximum */
422 return TCP_COOKIE_MAX;
423
424 if (cookie_size & 1)
425 /* 8-bit multiple, illegal, fix it */
426 cookie_size++;
427
428 return (u8)cookie_size;
429}
430
431/* Write previously computed TCP options to the packet. 395/* Write previously computed TCP options to the packet.
432 * 396 *
433 * Beware: Something in the Internet is very sensitive to the ordering of 397 * Beware: Something in the Internet is very sensitive to the ordering of
@@ -446,27 +410,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
446{ 410{
447 u16 options = opts->options; /* mungable copy */ 411 u16 options = opts->options; /* mungable copy */
448 412
449 /* Having both authentication and cookies for security is redundant,
450 * and there's certainly not enough room. Instead, the cookie-less
451 * extension variant is proposed.
452 *
453 * Consider the pessimal case with authentication. The options
454 * could look like:
455 * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
456 */
457 if (unlikely(OPTION_MD5 & options)) { 413 if (unlikely(OPTION_MD5 & options)) {
458 if (unlikely(OPTION_COOKIE_EXTENSION & options)) { 414 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
459 *ptr++ = htonl((TCPOPT_COOKIE << 24) | 415 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
460 (TCPOLEN_COOKIE_BASE << 16) |
461 (TCPOPT_MD5SIG << 8) |
462 TCPOLEN_MD5SIG);
463 } else {
464 *ptr++ = htonl((TCPOPT_NOP << 24) |
465 (TCPOPT_NOP << 16) |
466 (TCPOPT_MD5SIG << 8) |
467 TCPOLEN_MD5SIG);
468 }
469 options &= ~OPTION_COOKIE_EXTENSION;
470 /* overload cookie hash location */ 416 /* overload cookie hash location */
471 opts->hash_location = (__u8 *)ptr; 417 opts->hash_location = (__u8 *)ptr;
472 ptr += 4; 418 ptr += 4;
@@ -495,44 +441,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
495 *ptr++ = htonl(opts->tsecr); 441 *ptr++ = htonl(opts->tsecr);
496 } 442 }
497 443
498 /* Specification requires after timestamp, so do it now.
499 *
500 * Consider the pessimal case without authentication. The options
501 * could look like:
502 * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
503 */
504 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
505 __u8 *cookie_copy = opts->hash_location;
506 u8 cookie_size = opts->hash_size;
507
508 /* 8-bit multiple handled in tcp_cookie_size_check() above,
509 * and elsewhere.
510 */
511 if (0x2 & cookie_size) {
512 __u8 *p = (__u8 *)ptr;
513
514 /* 16-bit multiple */
515 *p++ = TCPOPT_COOKIE;
516 *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
517 *p++ = *cookie_copy++;
518 *p++ = *cookie_copy++;
519 ptr++;
520 cookie_size -= 2;
521 } else {
522 /* 32-bit multiple */
523 *ptr++ = htonl(((TCPOPT_NOP << 24) |
524 (TCPOPT_NOP << 16) |
525 (TCPOPT_COOKIE << 8) |
526 TCPOLEN_COOKIE_BASE) +
527 cookie_size);
528 }
529
530 if (cookie_size > 0) {
531 memcpy(ptr, cookie_copy, cookie_size);
532 ptr += (cookie_size / 4);
533 }
534 }
535
536 if (unlikely(OPTION_SACK_ADVERTISE & options)) { 444 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
537 *ptr++ = htonl((TCPOPT_NOP << 24) | 445 *ptr++ = htonl((TCPOPT_NOP << 24) |
538 (TCPOPT_NOP << 16) | 446 (TCPOPT_NOP << 16) |
@@ -591,11 +499,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
591 struct tcp_md5sig_key **md5) 499 struct tcp_md5sig_key **md5)
592{ 500{
593 struct tcp_sock *tp = tcp_sk(sk); 501 struct tcp_sock *tp = tcp_sk(sk);
594 struct tcp_cookie_values *cvp = tp->cookie_values;
595 unsigned int remaining = MAX_TCP_OPTION_SPACE; 502 unsigned int remaining = MAX_TCP_OPTION_SPACE;
596 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
597 tcp_cookie_size_check(cvp->cookie_desired) :
598 0;
599 struct tcp_fastopen_request *fastopen = tp->fastopen_req; 503 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
600 504
601#ifdef CONFIG_TCP_MD5SIG 505#ifdef CONFIG_TCP_MD5SIG
@@ -647,52 +551,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
647 tp->syn_fastopen = 1; 551 tp->syn_fastopen = 1;
648 } 552 }
649 } 553 }
650 /* Note that timestamps are required by the specification.
651 *
652 * Odd numbers of bytes are prohibited by the specification, ensuring
653 * that the cookie is 16-bit aligned, and the resulting cookie pair is
654 * 32-bit aligned.
655 */
656 if (*md5 == NULL &&
657 (OPTION_TS & opts->options) &&
658 cookie_size > 0) {
659 int need = TCPOLEN_COOKIE_BASE + cookie_size;
660
661 if (0x2 & need) {
662 /* 32-bit multiple */
663 need += 2; /* NOPs */
664
665 if (need > remaining) {
666 /* try shrinking cookie to fit */
667 cookie_size -= 2;
668 need -= 4;
669 }
670 }
671 while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
672 cookie_size -= 4;
673 need -= 4;
674 }
675 if (TCP_COOKIE_MIN <= cookie_size) {
676 opts->options |= OPTION_COOKIE_EXTENSION;
677 opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
678 opts->hash_size = cookie_size;
679
680 /* Remember for future incarnations. */
681 cvp->cookie_desired = cookie_size;
682
683 if (cvp->cookie_desired != cvp->cookie_pair_size) {
684 /* Currently use random bytes as a nonce,
685 * assuming these are completely unpredictable
686 * by hostile users of the same system.
687 */
688 get_random_bytes(&cvp->cookie_pair[0],
689 cookie_size);
690 cvp->cookie_pair_size = cookie_size;
691 }
692 554
693 remaining -= need;
694 }
695 }
696 return MAX_TCP_OPTION_SPACE - remaining; 555 return MAX_TCP_OPTION_SPACE - remaining;
697} 556}
698 557
@@ -702,14 +561,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
702 unsigned int mss, struct sk_buff *skb, 561 unsigned int mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 562 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 563 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp,
706 struct tcp_fastopen_cookie *foc) 564 struct tcp_fastopen_cookie *foc)
707{ 565{
708 struct inet_request_sock *ireq = inet_rsk(req); 566 struct inet_request_sock *ireq = inet_rsk(req);
709 unsigned int remaining = MAX_TCP_OPTION_SPACE; 567 unsigned int remaining = MAX_TCP_OPTION_SPACE;
710 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
711 xvp->cookie_plus :
712 0;
713 568
714#ifdef CONFIG_TCP_MD5SIG 569#ifdef CONFIG_TCP_MD5SIG
715 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); 570 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
@@ -757,28 +612,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
757 remaining -= need; 612 remaining -= need;
758 } 613 }
759 } 614 }
760 /* Similar rationale to tcp_syn_options() applies here, too. 615
761 * If the <SYN> options fit, the same options should fit now!
762 */
763 if (*md5 == NULL &&
764 ireq->tstamp_ok &&
765 cookie_plus > TCPOLEN_COOKIE_BASE) {
766 int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
767
768 if (0x2 & need) {
769 /* 32-bit multiple */
770 need += 2; /* NOPs */
771 }
772 if (need <= remaining) {
773 opts->options |= OPTION_COOKIE_EXTENSION;
774 opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
775 remaining -= need;
776 } else {
777 /* There's no error return, so flag it. */
778 xvp->cookie_out_never = 1; /* true */
779 opts->hash_size = 0;
780 }
781 }
782 return MAX_TCP_OPTION_SPACE - remaining; 616 return MAX_TCP_OPTION_SPACE - remaining;
783} 617}
784 618
@@ -1632,11 +1466,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
1632 if (nonagle & TCP_NAGLE_PUSH) 1466 if (nonagle & TCP_NAGLE_PUSH)
1633 return true; 1467 return true;
1634 1468
1635 /* Don't use the nagle rule for urgent data (or for the final FIN). 1469 /* Don't use the nagle rule for urgent data (or for the final FIN). */
1636 * Nagle can be ignored during F-RTO too (see RFC4138). 1470 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1637 */
1638 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
1639 (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1640 return true; 1471 return true;
1641 1472
1642 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) 1473 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1961,6 +1792,9 @@ static int tcp_mtu_probe(struct sock *sk)
1961 * snd_up-64k-mss .. snd_up cannot be large. However, taking into 1792 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1962 * account rare use of URG, this is not a big flaw. 1793 * account rare use of URG, this is not a big flaw.
1963 * 1794 *
1795 * Send at most one packet when push_one > 0. Temporarily ignore
1796 * cwnd limit to force at most one packet out when push_one == 2.
1797 *
1964 * Returns true, if no segments are in flight and we have queued segments, 1798 * Returns true, if no segments are in flight and we have queued segments,
1965 * but cannot send anything now because of SWS or another problem. 1799 * but cannot send anything now because of SWS or another problem.
1966 */ 1800 */
@@ -1996,8 +1830,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1996 goto repair; /* Skip network transmission */ 1830 goto repair; /* Skip network transmission */
1997 1831
1998 cwnd_quota = tcp_cwnd_test(tp, skb); 1832 cwnd_quota = tcp_cwnd_test(tp, skb);
1999 if (!cwnd_quota) 1833 if (!cwnd_quota) {
2000 break; 1834 if (push_one == 2)
1835 /* Force out a loss probe pkt. */
1836 cwnd_quota = 1;
1837 else
1838 break;
1839 }
2001 1840
2002 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) 1841 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
2003 break; 1842 break;
@@ -2051,10 +1890,129 @@ repair:
2051 if (likely(sent_pkts)) { 1890 if (likely(sent_pkts)) {
2052 if (tcp_in_cwnd_reduction(sk)) 1891 if (tcp_in_cwnd_reduction(sk))
2053 tp->prr_out += sent_pkts; 1892 tp->prr_out += sent_pkts;
1893
1894 /* Send one loss probe per tail loss episode. */
1895 if (push_one != 2)
1896 tcp_schedule_loss_probe(sk);
2054 tcp_cwnd_validate(sk); 1897 tcp_cwnd_validate(sk);
2055 return false; 1898 return false;
2056 } 1899 }
2057 return !tp->packets_out && tcp_send_head(sk); 1900 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
1901}
1902
1903bool tcp_schedule_loss_probe(struct sock *sk)
1904{
1905 struct inet_connection_sock *icsk = inet_csk(sk);
1906 struct tcp_sock *tp = tcp_sk(sk);
1907 u32 timeout, tlp_time_stamp, rto_time_stamp;
1908 u32 rtt = tp->srtt >> 3;
1909
1910 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
1911 return false;
1912 /* No consecutive loss probes. */
1913 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
1914 tcp_rearm_rto(sk);
1915 return false;
1916 }
1917 /* Don't do any loss probe on a Fast Open connection before 3WHS
1918 * finishes.
1919 */
1920 if (sk->sk_state == TCP_SYN_RECV)
1921 return false;
1922
1923 /* TLP is only scheduled when next timer event is RTO. */
1924 if (icsk->icsk_pending != ICSK_TIME_RETRANS)
1925 return false;
1926
1927 /* Schedule a loss probe in 2*RTT for SACK capable connections
1928 * in Open state, that are either limited by cwnd or application.
1929 */
1930 if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
1931 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
1932 return false;
1933
1934 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
1935 tcp_send_head(sk))
1936 return false;
1937
1938 /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
1939 * for delayed ack when there's one outstanding packet.
1940 */
1941 timeout = rtt << 1;
1942 if (tp->packets_out == 1)
1943 timeout = max_t(u32, timeout,
1944 (rtt + (rtt >> 1) + TCP_DELACK_MAX));
1945 timeout = max_t(u32, timeout, msecs_to_jiffies(10));
1946
1947 /* If RTO is shorter, just schedule TLP in its place. */
1948 tlp_time_stamp = tcp_time_stamp + timeout;
1949 rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
1950 if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
1951 s32 delta = rto_time_stamp - tcp_time_stamp;
1952 if (delta > 0)
1953 timeout = delta;
1954 }
1955
1956 inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
1957 TCP_RTO_MAX);
1958 return true;
1959}
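
tcp_schedule_loss_probe() arms the probe timeout (PTO) at roughly 2*SRTT, stretched to 1.5*SRTT + TCP_DELACK_MAX when only one packet is outstanding (to leave room for a delayed ACK), floored at 10 ms, and clamped so it never fires later than the already pending RTO. A standalone sketch of that computation, working in milliseconds rather than jiffies and assuming TCP_DELACK_MAX corresponds to 200 ms:

/* Userspace sketch of the PTO computation above; srtt_ms plays the role of
 * tp->srtt >> 3, and the constants are assumptions for illustration only.
 */
#include <stdio.h>

#define TCP_DELACK_MAX_MS 200

static unsigned int tlp_timeout_ms(unsigned int srtt_ms,
				   unsigned int packets_out,
				   unsigned int rto_remaining_ms)
{
	unsigned int timeout = 2 * srtt_ms;

	/* With a single outstanding packet, allow for a delayed ACK. */
	if (packets_out == 1 &&
	    timeout < srtt_ms + srtt_ms / 2 + TCP_DELACK_MAX_MS)
		timeout = srtt_ms + srtt_ms / 2 + TCP_DELACK_MAX_MS;
	if (timeout < 10)
		timeout = 10;
	/* Never schedule the probe past the pending RTO. */
	if (timeout > rto_remaining_ms)
		timeout = rto_remaining_ms;
	return timeout;
}

int main(void)
{
	/* e.g. srtt = 40 ms, one packet in flight, 300 ms left until RTO */
	printf("PTO = %u ms\n", tlp_timeout_ms(40, 1, 300));	/* prints 260 */
	return 0;
}

Per the checks at the top of the function, the probe is only scheduled when sysctl_tcp_early_retrans is at least 3, the connection is SACK-enabled and in the Open congestion state, and the sender is currently cwnd- or application-limited.
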
1960
1961/* When probe timeout (PTO) fires, send a new segment if one exists, else
1962 * retransmit the last segment.
1963 */
1964void tcp_send_loss_probe(struct sock *sk)
1965{
1966 struct tcp_sock *tp = tcp_sk(sk);
1967 struct sk_buff *skb;
1968 int pcount;
1969 int mss = tcp_current_mss(sk);
1970 int err = -1;
1971
1972 if (tcp_send_head(sk) != NULL) {
1973 err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
1974 goto rearm_timer;
1975 }
1976
1977 /* At most one outstanding TLP retransmission. */
1978 if (tp->tlp_high_seq)
1979 goto rearm_timer;
1980
1981 /* Retransmit last segment. */
1982 skb = tcp_write_queue_tail(sk);
1983 if (WARN_ON(!skb))
1984 goto rearm_timer;
1985
1986 pcount = tcp_skb_pcount(skb);
1987 if (WARN_ON(!pcount))
1988 goto rearm_timer;
1989
1990 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
1991 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
1992 goto rearm_timer;
1993 skb = tcp_write_queue_tail(sk);
1994 }
1995
1996 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
1997 goto rearm_timer;
1998
1999 /* Probe with zero data doesn't trigger fast recovery. */
2000 if (skb->len > 0)
2001 err = __tcp_retransmit_skb(sk, skb);
2002
2003 /* Record snd_nxt for loss detection. */
2004 if (likely(!err))
2005 tp->tlp_high_seq = tp->snd_nxt;
2006
2007rearm_timer:
2008 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2009 inet_csk(sk)->icsk_rto,
2010 TCP_RTO_MAX);
2011
2012 if (likely(!err))
2013 NET_INC_STATS_BH(sock_net(sk),
2014 LINUX_MIB_TCPLOSSPROBES);
2015 return;
2058} 2016}
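
When the PTO fires, tcp_send_loss_probe() transmits new data if any is queued; otherwise it retransmits the tail of the write queue, fragmenting the tail skb first so the probe carries only the final MSS-sized chunk. snd_nxt is recorded in tlp_high_seq so the ACK side can later recognize a loss repaired by the probe. A hypothetical helper illustrating the split calculation performed above:

/* Sketch of the fragmentation decision in tcp_send_loss_probe(): given the
 * tail skb length and current MSS, compute how many bytes stay behind so the
 * probe retransmits only the last MSS-sized chunk.  Not kernel code.
 */
#include <stdio.h>

static unsigned int tlp_split_offset(unsigned int skb_len, unsigned int mss)
{
	unsigned int pcount = (skb_len + mss - 1) / mss;	/* ~tcp_skb_pcount() */

	if (pcount > 1 && skb_len > (pcount - 1) * mss)
		return (pcount - 1) * mss;	/* fragment here, probe the rest */
	return 0;				/* no split needed */
}

int main(void)
{
	/* A 3500-byte tail skb with a 1448-byte MSS is split at 2896,
	 * so the probe carries the remaining 604 bytes. */
	printf("split at %u\n", tlp_split_offset(3500, 1448));
	return 0;
}
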
2059 2017
2060/* Push out any pending frames which were held back due to 2018/* Push out any pending frames which were held back due to
@@ -2675,32 +2633,24 @@ int tcp_send_synack(struct sock *sk)
2675 * sk: listener socket 2633 * sk: listener socket
2676 * dst: dst entry attached to the SYNACK 2634 * dst: dst entry attached to the SYNACK
2677 * req: request_sock pointer 2635 * req: request_sock pointer
2678 * rvp: request_values pointer
2679 * 2636 *
2680 * Allocate one skb and build a SYNACK packet. 2637 * Allocate one skb and build a SYNACK packet.
2681 * @dst is consumed : Caller should not use it again. 2638 * @dst is consumed : Caller should not use it again.
2682 */ 2639 */
2683struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2640struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2684 struct request_sock *req, 2641 struct request_sock *req,
2685 struct request_values *rvp,
2686 struct tcp_fastopen_cookie *foc) 2642 struct tcp_fastopen_cookie *foc)
2687{ 2643{
2688 struct tcp_out_options opts; 2644 struct tcp_out_options opts;
2689 struct tcp_extend_values *xvp = tcp_xv(rvp);
2690 struct inet_request_sock *ireq = inet_rsk(req); 2645 struct inet_request_sock *ireq = inet_rsk(req);
2691 struct tcp_sock *tp = tcp_sk(sk); 2646 struct tcp_sock *tp = tcp_sk(sk);
2692 const struct tcp_cookie_values *cvp = tp->cookie_values;
2693 struct tcphdr *th; 2647 struct tcphdr *th;
2694 struct sk_buff *skb; 2648 struct sk_buff *skb;
2695 struct tcp_md5sig_key *md5; 2649 struct tcp_md5sig_key *md5;
2696 int tcp_header_size; 2650 int tcp_header_size;
2697 int mss; 2651 int mss;
2698 int s_data_desired = 0;
2699 2652
2700 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2653 skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC));
2701 s_data_desired = cvp->s_data_desired;
2702 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
2703 sk_gfp_atomic(sk, GFP_ATOMIC));
2704 if (unlikely(!skb)) { 2654 if (unlikely(!skb)) {
2705 dst_release(dst); 2655 dst_release(dst);
2706 return NULL; 2656 return NULL;
@@ -2742,9 +2692,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2742 else 2692 else
2743#endif 2693#endif
2744 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2694 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2745 tcp_header_size = tcp_synack_options(sk, req, mss, 2695 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
2746 skb, &opts, &md5, xvp, foc) 2696 foc) + sizeof(*th);
2747 + sizeof(*th);
2748 2697
2749 skb_push(skb, tcp_header_size); 2698 skb_push(skb, tcp_header_size);
2750 skb_reset_transport_header(skb); 2699 skb_reset_transport_header(skb);
@@ -2762,40 +2711,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2762 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, 2711 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2763 TCPHDR_SYN | TCPHDR_ACK); 2712 TCPHDR_SYN | TCPHDR_ACK);
2764 2713
2765 if (OPTION_COOKIE_EXTENSION & opts.options) {
2766 if (s_data_desired) {
2767 u8 *buf = skb_put(skb, s_data_desired);
2768
2769 /* copy data directly from the listening socket. */
2770 memcpy(buf, cvp->s_data_payload, s_data_desired);
2771 TCP_SKB_CB(skb)->end_seq += s_data_desired;
2772 }
2773
2774 if (opts.hash_size > 0) {
2775 __u32 workspace[SHA_WORKSPACE_WORDS];
2776 u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
2777 u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
2778
2779 /* Secret recipe depends on the Timestamp, (future)
2780 * Sequence and Acknowledgment Numbers, Initiator
2781 * Cookie, and others handled by IP variant caller.
2782 */
2783 *tail-- ^= opts.tsval;
2784 *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
2785 *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
2786
2787 /* recommended */
2788 *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
2789 *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
2790
2791 sha_transform((__u32 *)&xvp->cookie_bakery[0],
2792 (char *)mess,
2793 &workspace[0]);
2794 opts.hash_location =
2795 (__u8 *)&xvp->cookie_bakery[0];
2796 }
2797 }
2798
2799 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2714 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2800 /* XXX data is queued and acked as is. No buffer/window check */ 2715 /* XXX data is queued and acked as is. No buffer/window check */
2801 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); 2716 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b78aac30c498..4b85e6f636c9 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -342,10 +342,6 @@ void tcp_retransmit_timer(struct sock *sk)
342 struct tcp_sock *tp = tcp_sk(sk); 342 struct tcp_sock *tp = tcp_sk(sk);
343 struct inet_connection_sock *icsk = inet_csk(sk); 343 struct inet_connection_sock *icsk = inet_csk(sk);
344 344
345 if (tp->early_retrans_delayed) {
346 tcp_resume_early_retransmit(sk);
347 return;
348 }
349 if (tp->fastopen_rsk) { 345 if (tp->fastopen_rsk) {
350 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && 346 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1); 347 sk->sk_state != TCP_FIN_WAIT1);
@@ -360,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk)
360 356
361 WARN_ON(tcp_write_queue_empty(sk)); 357 WARN_ON(tcp_write_queue_empty(sk));
362 358
359 tp->tlp_high_seq = 0;
360
363 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && 361 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
364 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { 362 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
365 /* Receiver dastardly shrinks window. Our retransmits 363 /* Receiver dastardly shrinks window. Our retransmits
@@ -418,11 +416,7 @@ void tcp_retransmit_timer(struct sock *sk)
418 NET_INC_STATS_BH(sock_net(sk), mib_idx); 416 NET_INC_STATS_BH(sock_net(sk), mib_idx);
419 } 417 }
420 418
421 if (tcp_use_frto(sk)) { 419 tcp_enter_loss(sk, 0);
422 tcp_enter_frto(sk);
423 } else {
424 tcp_enter_loss(sk, 0);
425 }
426 420
427 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { 421 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
428 /* Retransmission failed because of local congestion, 422 /* Retransmission failed because of local congestion,
@@ -495,13 +489,20 @@ void tcp_write_timer_handler(struct sock *sk)
495 } 489 }
496 490
497 event = icsk->icsk_pending; 491 event = icsk->icsk_pending;
498 icsk->icsk_pending = 0;
499 492
500 switch (event) { 493 switch (event) {
494 case ICSK_TIME_EARLY_RETRANS:
495 tcp_resume_early_retransmit(sk);
496 break;
497 case ICSK_TIME_LOSS_PROBE:
498 tcp_send_loss_probe(sk);
499 break;
501 case ICSK_TIME_RETRANS: 500 case ICSK_TIME_RETRANS:
501 icsk->icsk_pending = 0;
502 tcp_retransmit_timer(sk); 502 tcp_retransmit_timer(sk);
503 break; 503 break;
504 case ICSK_TIME_PROBE0: 504 case ICSK_TIME_PROBE0:
505 icsk->icsk_pending = 0;
505 tcp_probe_timer(sk); 506 tcp_probe_timer(sk);
506 break; 507 break;
507 } 508 }
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 1b91bf48e277..76a1e23259e1 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
236 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); 236 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
237 break; 237 break;
238 238
239 case CA_EVENT_FRTO: 239 case CA_EVENT_LOSS:
240 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); 240 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
241 /* Update RTT_min when next ack arrives */ 241 /* Update RTT_min when next ack arrives */
242 w->reset_rtt_min = 1; 242 w->reset_rtt_min = 1;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0a073a263720..7117d1467b02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2279,31 +2279,88 @@ void __init udp_init(void)
2279 2279
2280int udp4_ufo_send_check(struct sk_buff *skb) 2280int udp4_ufo_send_check(struct sk_buff *skb)
2281{ 2281{
2282 const struct iphdr *iph; 2282 if (!pskb_may_pull(skb, sizeof(struct udphdr)))
2283 struct udphdr *uh;
2284
2285 if (!pskb_may_pull(skb, sizeof(*uh)))
2286 return -EINVAL; 2283 return -EINVAL;
2287 2284
2288 iph = ip_hdr(skb); 2285 if (likely(!skb->encapsulation)) {
2289 uh = udp_hdr(skb); 2286 const struct iphdr *iph;
2287 struct udphdr *uh;
2290 2288
2291 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, 2289 iph = ip_hdr(skb);
2292 IPPROTO_UDP, 0); 2290 uh = udp_hdr(skb);
2293 skb->csum_start = skb_transport_header(skb) - skb->head; 2291
2294 skb->csum_offset = offsetof(struct udphdr, check); 2292 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
2295 skb->ip_summed = CHECKSUM_PARTIAL; 2293 IPPROTO_UDP, 0);
2294 skb->csum_start = skb_transport_header(skb) - skb->head;
2295 skb->csum_offset = offsetof(struct udphdr, check);
2296 skb->ip_summed = CHECKSUM_PARTIAL;
2297 }
2296 return 0; 2298 return 0;
2297} 2299}
2298 2300
2301static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2302 netdev_features_t features)
2303{
2304 struct sk_buff *segs = ERR_PTR(-EINVAL);
2305 int mac_len = skb->mac_len;
2306 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
2307 int outer_hlen;
2308 netdev_features_t enc_features;
2309
2310 if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
2311 goto out;
2312
2313 skb->encapsulation = 0;
2314 __skb_pull(skb, tnl_hlen);
2315 skb_reset_mac_header(skb);
2316 skb_set_network_header(skb, skb_inner_network_offset(skb));
2317 skb->mac_len = skb_inner_network_offset(skb);
2318
2319 /* segment inner packet. */
2320 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
2321 segs = skb_mac_gso_segment(skb, enc_features);
2322 if (!segs || IS_ERR(segs))
2323 goto out;
2324
2325 outer_hlen = skb_tnl_header_len(skb);
2326 skb = segs;
2327 do {
2328 struct udphdr *uh;
2329 int udp_offset = outer_hlen - tnl_hlen;
2330
2331 skb->mac_len = mac_len;
2332
2333 skb_push(skb, outer_hlen);
2334 skb_reset_mac_header(skb);
2335 skb_set_network_header(skb, mac_len);
2336 skb_set_transport_header(skb, udp_offset);
2337 uh = udp_hdr(skb);
2338 uh->len = htons(skb->len - udp_offset);
2339
2340 /* csum segment if tunnel sets skb with csum. */
2341 if (unlikely(uh->check)) {
2342 struct iphdr *iph = ip_hdr(skb);
2343
2344 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
2345 skb->len - udp_offset,
2346 IPPROTO_UDP, 0);
2347 uh->check = csum_fold(skb_checksum(skb, udp_offset,
2348 skb->len - udp_offset, 0));
2349 if (uh->check == 0)
2350 uh->check = CSUM_MANGLED_0;
2351
2352 }
2353 skb->ip_summed = CHECKSUM_NONE;
2354 } while ((skb = skb->next));
2355out:
2356 return segs;
2357}
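
skb_udp_tunnel_segment() peels the outer headers off, GSO-segments the inner frame with skb_mac_gso_segment(), then pushes the outer headers back onto every resulting segment and rewrites the outer UDP length (and checksum, when one is in use). The offsets it relies on are: tnl_hlen, the outer UDP header plus tunnel header (inner MAC header minus transport header); outer_hlen, everything in front of the inner MAC header; and udp_offset = outer_hlen - tnl_hlen, where the outer UDP header starts within each segment. A worked example with assumed sizes for an Ethernet/IPv4/UDP frame carrying an 8-byte, VXLAN-sized tunnel header:

/* Worked example of the header arithmetic above.  The sizes are assumptions
 * for illustration; a plain userspace sketch, not kernel code.
 */
#include <stdio.h>

int main(void)
{
	unsigned int eth = 14, ip = 20, udp = 8, tun = 8;

	unsigned int tnl_hlen   = udp + tun;		/* inner mac - transport = 16 */
	unsigned int outer_hlen = eth + ip + udp + tun;	/* inner mac - outer mac = 50 */
	unsigned int udp_offset = outer_hlen - tnl_hlen;/* outer UDP hdr offset  = 34 */

	printf("tnl_hlen=%u outer_hlen=%u udp_offset=%u\n",
	       tnl_hlen, outer_hlen, udp_offset);
	return 0;
}
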
2358
2299struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 2359struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
2300 netdev_features_t features) 2360 netdev_features_t features)
2301{ 2361{
2302 struct sk_buff *segs = ERR_PTR(-EINVAL); 2362 struct sk_buff *segs = ERR_PTR(-EINVAL);
2303 unsigned int mss; 2363 unsigned int mss;
2304 int offset;
2305 __wsum csum;
2306
2307 mss = skb_shinfo(skb)->gso_size; 2364 mss = skb_shinfo(skb)->gso_size;
2308 if (unlikely(skb->len <= mss)) 2365 if (unlikely(skb->len <= mss))
2309 goto out; 2366 goto out;
@@ -2313,6 +2370,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
2313 int type = skb_shinfo(skb)->gso_type; 2370 int type = skb_shinfo(skb)->gso_type;
2314 2371
2315 if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | 2372 if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
2373 SKB_GSO_UDP_TUNNEL |
2316 SKB_GSO_GRE) || 2374 SKB_GSO_GRE) ||
2317 !(type & (SKB_GSO_UDP)))) 2375 !(type & (SKB_GSO_UDP))))
2318 goto out; 2376 goto out;
@@ -2323,20 +2381,27 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
2323 goto out; 2381 goto out;
2324 } 2382 }
2325 2383
2326 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
2327 * do checksum of UDP packets sent as multiple IP fragments.
2328 */
2329 offset = skb_checksum_start_offset(skb);
2330 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2331 offset += skb->csum_offset;
2332 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2333 skb->ip_summed = CHECKSUM_NONE;
2334
2335 /* Fragment the skb. IP headers of the fragments are updated in 2384 /* Fragment the skb. IP headers of the fragments are updated in
2336 * inet_gso_segment() 2385 * inet_gso_segment()
2337 */ 2386 */
2338 segs = skb_segment(skb, features); 2387 if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
2388 segs = skb_udp_tunnel_segment(skb, features);
2389 else {
2390 int offset;
2391 __wsum csum;
2392
2393 /* Do software UFO. Complete and fill in the UDP checksum as
2394 * HW cannot do checksum of UDP packets sent as multiple
2395 * IP fragments.
2396 */
2397 offset = skb_checksum_start_offset(skb);
2398 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2399 offset += skb->csum_offset;
2400 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2401 skb->ip_summed = CHECKSUM_NONE;
2402
2403 segs = skb_segment(skb, features);
2404 }
2339out: 2405out:
2340 return segs; 2406 return segs;
2341} 2407}
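
In the non-tunnel UFO path the UDP checksum is completed in software before skb_segment(), since hardware cannot checksum a datagram that leaves as multiple IP fragments. The tunnel path above does the equivalent per segment when an outer checksum is in use: it seeds uh->check with the pseudo-header sum (the complement of csum_tcpudp_magic()), folds the one's-complement sum of the whole outer UDP header and payload, and rewrites a zero result to CSUM_MANGLED_0, because an all-zero checksum means "no checksum" for UDP over IPv4. A minimal sketch of the final fold step, assuming it mirrors the kernel's csum_fold():

/* Fold a 32-bit running one's-complement sum (what skb_checksum() returns)
 * into the final 16-bit UDP checksum.  Illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t csum_fold_sketch(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold the high half down */
	sum += sum >> 16;			/* add any remaining carry */
	return (uint16_t)~sum;			/* one's-complement result */
}

int main(void)
{
	/* 0x1a2b3c4d folds to ~(0x3c4d + 0x1a2b) = 0xa987 */
	printf("0x%04x\n", csum_fold_sketch(0x1a2b3c4d));
	return 0;
}
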
2342