Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                |  16
-rw-r--r--  net/ipv4/af_inet.c                              |  12
-rw-r--r--  net/ipv4/arp.c                                  |   2
-rw-r--r--  net/ipv4/devinet.c                              |  17
-rw-r--r--  net/ipv4/esp4.c                                 |   2
-rw-r--r--  net/ipv4/fib_rules.c                            |  25
-rw-r--r--  net/ipv4/fib_trie.c                             |   5
-rw-r--r--  net/ipv4/igmp.c                                 |  80
-rw-r--r--  net/ipv4/ip_gre.c                               |   6
-rw-r--r--  net/ipv4/ip_input.c                             |   8
-rw-r--r--  net/ipv4/ip_output.c                            |   8
-rw-r--r--  net/ipv4/ip_tunnel.c                            |  71
-rw-r--r--  net/ipv4/ip_tunnel_core.c                       |  14
-rw-r--r--  net/ipv4/ip_vti.c                               | 528
-rw-r--r--  net/ipv4/ipip.c                                 |   8
-rw-r--r--  net/ipv4/ipmr.c                                 |  18
-rw-r--r--  net/ipv4/netfilter/Kconfig                      |  13
-rw-r--r--  net/ipv4/netfilter/Makefile                     |   1
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c            |   2
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c             |   2
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c                 |  21
-rw-r--r--  net/ipv4/netfilter/ipt_SYNPROXY.c               | 476
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c             |   2
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c             |   2
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c                |   2
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c                |   2
-rw-r--r--  net/ipv4/netfilter/iptable_security.c           |   2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c  |   7
-rw-r--r--  net/ipv4/ping.c                                 |   2
-rw-r--r--  net/ipv4/proc.c                                 |   9
-rw-r--r--  net/ipv4/raw.c                                  |   5
-rw-r--r--  net/ipv4/route.c                                |  24
-rw-r--r--  net/ipv4/syncookies.c                           |  29
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                      |  17
-rw-r--r--  net/ipv4/tcp.c                                  |  53
-rw-r--r--  net/ipv4/tcp_cubic.c                            |  12
-rw-r--r--  net/ipv4/tcp_fastopen.c                         |  13
-rw-r--r--  net/ipv4/tcp_input.c                            | 210
-rw-r--r--  net/ipv4/tcp_ipv4.c                             |  32
-rw-r--r--  net/ipv4/tcp_memcontrol.c                       |  12
-rw-r--r--  net/ipv4/tcp_metrics.c                          |  42
-rw-r--r--  net/ipv4/tcp_minisocks.c                        |   8
-rw-r--r--  net/ipv4/tcp_output.c                           |   9
-rw-r--r--  net/ipv4/tcp_probe.c                            |  87
-rw-r--r--  net/ipv4/udp.c                                  |  18
-rw-r--r--  net/ipv4/xfrm4_output.c                         |  16
-rw-r--r--  net/ipv4/xfrm4_state.c                          |   1
47 files changed, 1125 insertions, 826 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 37cf1a6ea3ad..05c57f0fcabe 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -259,22 +259,6 @@ config IP_PIMSM_V2
 	  gated-5). This routing protocol is not used widely, so say N unless
 	  you want to play with it.
 
-config ARPD
-	bool "IP: ARP daemon support"
-	---help---
-	  The kernel maintains an internal cache which maps IP addresses to
-	  hardware addresses on the local network, so that Ethernet
-	  frames are sent to the proper address on the physical networking
-	  layer. Normally, kernel uses the ARP protocol to resolve these
-	  mappings.
-
-	  Saying Y here adds support to have an user space daemon to do this
-	  resolution instead. This is useful for implementing an alternate
-	  address resolution protocol (e.g. NHRP on mGRE tunnels) and also for
-	  testing purposes.
-
-	  If unsure, say N.
-
 config SYN_COOKIES
 	bool "IP: TCP syncookie support"
 	---help---
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b4d0be2b7ce9..7a1874b7b8fd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1532,18 +1532,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 }
 EXPORT_SYMBOL_GPL(snmp_mib_init);
 
-void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
-{
-	int i;
-
-	BUG_ON(ptr == NULL);
-	for (i = 0; i < SNMP_ARRAY_SZ; i++) {
-		free_percpu(ptr[i]);
-		ptr[i] = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(snmp_mib_free);
-
 #ifdef CONFIG_IP_MULTICAST
 static const struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4429b013f269..7808093cede6 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -368,9 +368,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	} else {
 		probes -= neigh->parms->app_probes;
 		if (probes < 0) {
-#ifdef CONFIG_ARPD
 			neigh_app_ns(neigh);
-#endif
 			return;
 		}
 	}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 34ca6d5a3a4b..a1b5bcbd04ae 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = {
 		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
 	},
 };
 
@@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
 		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
 		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
 		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
 	},
 };
 
@@ -1126,10 +1130,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
 		if (len < (int) sizeof(ifr))
 			break;
 		memset(&ifr, 0, sizeof(struct ifreq));
-		if (ifa->ifa_label)
-			strcpy(ifr.ifr_name, ifa->ifa_label);
-		else
-			strcpy(ifr.ifr_name, dev->name);
+		strcpy(ifr.ifr_name, ifa->ifa_label);
 
 		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
 		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
@@ -2097,11 +2098,15 @@ static struct devinet_sysctl_table {
 	DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
 	DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
 	DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+	DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+				"force_igmp_version"),
+	DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+				"igmpv2_unsolicited_report_interval"),
+	DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+				"igmpv3_unsolicited_report_interval"),
 
 	DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 	DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-	DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
-				      "force_igmp_version"),
 	DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
 				      "promote_secondaries"),
 	DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
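
[Editor's example] The two new devinet entries above surface as ordinary per-device sysctls. A minimal userspace sketch of tuning one, assuming the usual /proc/sys/net/ipv4/conf/<dev>/ layout; the device name "eth0" and the 500 ms value are illustrative placeholders, not values from the patch:

#include <stdio.h>

int main(void)
{
	/* per-device knob added by the hunk above; interval is in ms */
	const char *path =
		"/proc/sys/net/ipv4/conf/eth0/igmpv3_unsolicited_report_interval";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", 500);	/* hypothetical 500 ms interval */
	fclose(f);
	return 0;
}
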
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ab3d814bc80a..109ee89f123e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -477,7 +477,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 	}
 
 	return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
-		 net_adj) & ~(align - 1)) + (net_adj - 2);
+		 net_adj) & ~(align - 1)) + net_adj - 2;
 }
 
 static void esp4_err(struct sk_buff *skb, u32 info)
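
[Editor's example] A worked instance of the esp4_get_mtu() expression above, with assumed sizes (all inputs below are placeholders, not values from this patch): the usable payload is rounded down to the cipher block size, then the network-header adjustment minus the 2 ESP trailer bytes (pad length + next header) is added back.

#include <stdio.h>

int main(void)
{
	int mtu = 1500;		/* assumed link MTU */
	int header_len = 36;	/* assumed outer IP + ESP header + IV */
	int authsize = 12;	/* assumed ICV size */
	int net_adj = 20;	/* assumed network-header adjustment */
	int align = 16;		/* assumed cipher block size */

	/* same shape as the kernel expression in the hunk above */
	int max_payload = ((mtu - header_len - authsize - net_adj) &
			   ~(align - 1)) + net_adj - 2;

	/* 1500-36-12-20 = 1432, rounded down to 1424, +20-2 = 1442 */
	printf("max ESP payload: %d\n", max_payload);
	return 0;
}
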
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 26aa65d1fce4..523be38e37de 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -101,6 +101,30 @@ errout:
 	return err;
 }
 
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+	struct fib_result *result = (struct fib_result *) arg->result;
+	struct net_device *dev = result->fi->fib_dev;
+
+	/* do not accept result if the route does
+	 * not meet the required prefix length
+	 */
+	if (result->prefixlen <= rule->suppress_prefixlen)
+		goto suppress_route;
+
+	/* do not accept result if the route uses a device
+	 * belonging to a forbidden interface group
+	 */
+	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+		goto suppress_route;
+
+	return false;
+
+suppress_route:
+	if (!(arg->flags & FIB_LOOKUP_NOREF))
+		fib_info_put(result->fi);
+	return true;
+}
 
 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 {
@@ -267,6 +291,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.rule_size	= sizeof(struct fib4_rule),
 	.addr_size	= sizeof(u32),
 	.action		= fib4_rule_action,
+	.suppress	= fib4_rule_suppress,
 	.match		= fib4_rule_match,
 	.configure	= fib4_rule_configure,
 	.delete		= fib4_rule_delete,
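
[Editor's example] A standalone restatement of the suppress logic registered above, with plain ints standing in for the kernel structures: a looked-up result is suppressed when its prefix is no longer than the rule's threshold, or when its egress device sits in a forbidden interface group. A sketch only; the real predicate operates on struct fib_rule and struct fib_result.

#include <stdbool.h>
#include <stdio.h>

static bool route_suppressed(int prefixlen, int suppress_prefixlen,
			     int dev_group, int suppress_ifgroup)
{
	if (prefixlen <= suppress_prefixlen)
		return true;	/* route not specific enough */
	if (suppress_ifgroup != -1 && dev_group == suppress_ifgroup)
		return true;	/* egress device in forbidden group */
	return false;
}

int main(void)
{
	/* a default route (prefixlen 0) is suppressed by a threshold of 0,
	 * so the lookup falls through to later rules
	 */
	printf("%d\n", route_suppressed(0, 0, 0, -1));	/* 1: suppressed */
	printf("%d\n", route_suppressed(24, 0, 0, -1));	/* 0: accepted */
	return 0;
}
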
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 108a1e9c9eac..3df6d3edb2a1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -71,7 +71,6 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
-#include <linux/prefetch.h>
 #include <linux/export.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
@@ -1761,10 +1760,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
 		if (!c)
 			continue;
 
-		if (IS_LEAF(c)) {
-			prefetch(rcu_dereference_rtnl(p->child[idx]));
+		if (IS_LEAF(c))
 			return (struct leaf *) c;
-		}
 
 		/* Rescan start scanning in new node */
 		p = (struct tnode *) c;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index cd71190d2962..d6c0e64ec97f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -88,6 +88,7 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/times.h>
+#include <linux/pkt_sched.h>
 
 #include <net/net_namespace.h>
 #include <net/arp.h>
@@ -113,7 +114,8 @@
 
 #define IGMP_V1_Router_Present_Timeout		(400*HZ)
 #define IGMP_V2_Router_Present_Timeout		(400*HZ)
-#define IGMP_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_V2_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_V3_Unsolicited_Report_Interval	(1*HZ)
 #define IGMP_Query_Response_Interval		(10*HZ)
 #define IGMP_Unsolicited_Report_Count		2
 
@@ -138,6 +140,29 @@
 	((in_dev)->mr_v2_seen && \
 	 time_before(jiffies, (in_dev)->mr_v2_seen)))
 
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+	int interval_ms, interval_jiffies;
+
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+		interval_ms = IN_DEV_CONF_GET(
+			in_dev,
+			IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+	else /* v3 */
+		interval_ms = IN_DEV_CONF_GET(
+			in_dev,
+			IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+	interval_jiffies = msecs_to_jiffies(interval_ms);
+
+	/* _timer functions can't handle a delay of 0 jiffies so ensure
+	 *  we always return a positive value.
+	 */
+	if (interval_jiffies <= 0)
+		interval_jiffies = 1;
+	return interval_jiffies;
+}
+
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
 static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
 static void igmpv3_clear_delrec(struct in_device *in_dev);
@@ -315,6 +340,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 		if (size < 256)
 			return NULL;
 	}
+	skb->priority = TC_PRIO_CONTROL;
 	igmp_skb_size(skb) = size;
 
 	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
@@ -670,6 +696,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 		ip_rt_put(rt);
 		return -1;
 	}
+	skb->priority = TC_PRIO_CONTROL;
 
 	skb_dst_set(skb, &rt->dst);
 
@@ -719,7 +746,8 @@ static void igmp_ifc_timer_expire(unsigned long data)
 	igmpv3_send_cr(in_dev);
 	if (in_dev->mr_ifc_count) {
 		in_dev->mr_ifc_count--;
-		igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+		igmp_ifc_start_timer(in_dev,
+				     unsolicited_report_interval(in_dev));
 	}
 	__in_dev_put(in_dev);
 }
@@ -744,7 +772,7 @@ static void igmp_timer_expire(unsigned long data)
 
 	if (im->unsolicit_count) {
 		im->unsolicit_count--;
-		igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+		igmp_start_timer(im, unsolicited_report_interval(in_dev));
 	}
 	im->reporter = 1;
 	spin_unlock(&im->lock);
@@ -1323,16 +1351,17 @@ out:
 EXPORT_SYMBOL(ip_mc_inc_group);
 
 /*
- *	Resend IGMP JOIN report; used for bonding.
- *	Called with rcu_read_lock()
+ *	Resend IGMP JOIN report; used by netdev notifier.
  */
-void ip_mc_rejoin_groups(struct in_device *in_dev)
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
 {
 #ifdef CONFIG_IP_MULTICAST
 	struct ip_mc_list *im;
 	int type;
 
-	for_each_pmc_rcu(in_dev, im) {
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
 
@@ -1349,7 +1378,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev)
 	}
 #endif
 }
-EXPORT_SYMBOL(ip_mc_rejoin_groups);
 
 /*
  *	A socket has left a multicast group on device dev
@@ -2735,8 +2763,42 @@ static struct pernet_operations igmp_net_ops = {
 	.exit = igmp_net_exit,
 };
 
+static int igmp_netdev_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct in_device *in_dev;
+
+	switch (event) {
+	case NETDEV_RESEND_IGMP:
+		in_dev = __in_dev_get_rtnl(dev);
+		if (in_dev)
+			ip_mc_rejoin_groups(in_dev);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+	.notifier_call = igmp_netdev_event,
+};
+
 int __init igmp_mc_proc_init(void)
 {
-	return register_pernet_subsys(&igmp_net_ops);
+	int err;
+
+	err = register_pernet_subsys(&igmp_net_ops);
+	if (err)
+		return err;
+	err = register_netdevice_notifier(&igmp_notifier);
+	if (err)
+		goto reg_notif_fail;
+	return 0;
+
+reg_notif_fail:
+	unregister_pernet_subsys(&igmp_net_ops);
+	return err;
 }
 #endif
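
[Editor's example] The clamp in unsolicited_report_interval() above matters because msecs_to_jiffies() can round a small configured interval down to zero, and the kernel timer helpers cannot take a zero delay. A portable sketch of the same idea, with an assumed HZ of 100 and a crude stand-in for msecs_to_jiffies(); values are illustrative, not from the patch.

#include <stdio.h>

#define HZ 100	/* assumed tick rate: one jiffy = 10 ms */

static int msecs_to_jiffies_clamped(int ms)
{
	int j = ms * HZ / 1000;	/* crude msecs_to_jiffies stand-in */

	return j <= 0 ? 1 : j;	/* never hand a 0-jiffy delay to a timer */
}

int main(void)
{
	printf("%d\n", msecs_to_jiffies_clamped(1000));	/* 100 jiffies */
	printf("%d\n", msecs_to_jiffies_clamped(3));	/* clamped to 1 */
	return 0;
}
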
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1f6eab66f7ce..d7aea4c5b940 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -383,7 +383,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
 	if (daddr)
 		memcpy(&iph->daddr, daddr, 4);
 	if (iph->daddr)
-		return t->hlen;
+		return t->hlen + sizeof(*iph);
 
 	return -(t->hlen + sizeof(*iph));
 }
@@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net)
 static void __net_exit ipgre_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipgre_link_ops);
 }
 
 static struct pernet_operations ipgre_net_ops = {
@@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net)
 static void __net_exit ipgre_tap_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
 }
 
 static struct pernet_operations ipgre_tap_net_ops = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 15e3e683adec..054a3e97d822 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -141,6 +141,7 @@
 #include <net/icmp.h>
 #include <net/raw.h>
 #include <net/checksum.h>
+#include <net/inet_ecn.h>
 #include <linux/netfilter_ipv4.h>
 #include <net/xfrm.h>
 #include <linux/mroute.h>
@@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	if (iph->ihl < 5 || iph->version != 4)
 		goto inhdr_error;
 
+	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+	IP_ADD_STATS_BH(dev_net(dev),
+			IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+			max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
 	if (!pskb_may_pull(skb, iph->ihl*4))
 		goto inhdr_error;
 
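
[Editor's example] The BUILD_BUG_ON lines above pin the MIB layout so that the two ECN bits of the TOS byte index four consecutive counters starting at the Not-ECT slot. A sketch of that indexing with the RFC 3168 codepoint values; the enum and macro names here are illustrative, not the kernel's.

#include <stdio.h>

enum {
	NOT_ECT = 0,	/* 00: not ECN-capable */
	ECT_1   = 1,	/* 01: ECN-capable transport (1) */
	ECT_0   = 2,	/* 10: ECN-capable transport (0) */
	CE      = 3,	/* 11: congestion experienced */
};

#define ECN_MASK 3	/* low two bits of the TOS byte */

int main(void)
{
	unsigned char tos = 0x1a;	/* sample TOS byte, low bits 10 */
	int slot = tos & ECN_MASK;	/* offset from the Not-ECT counter */

	printf("counter slot offset: %d (ECT_0)\n", slot);
	return 0;
}
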
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4bcabf3ab4ca..9ee17e3d11c3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -211,14 +211,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		return -EINVAL;
 }
 
-static inline int ip_skb_dst_mtu(struct sk_buff *skb)
-{
-	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
-
-	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
-	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
-}
-
 static int ip_finish_output(struct sk_buff *skb)
 {
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ca1cb2d5f6e2..ac9fabe0300f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	struct flowi4 fl4;
 	struct rtable *rt;
 
-	rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+	rt = ip_route_output_tunnel(tunnel->net, &fl4,
 				    tunnel->parms.iph.protocol,
 				    iph->daddr, iph->saddr,
 				    tunnel->parms.o_key,
@@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	}
 
 	if (!tdev && tunnel->parms.link)
-		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 
 	if (tdev) {
 		hlen = tdev->hard_header_len + tdev->needed_headroom;
@@ -454,15 +454,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	tstats->rx_bytes += skb->len;
 	u64_stats_update_end(&tstats->syncp);
 
-	if (tunnel->net != dev_net(tunnel->dev))
-		skb_scrub_packet(skb);
-
 	if (tunnel->dev->type == ARPHRD_ETHER) {
 		skb->protocol = eth_type_trans(skb, tunnel->dev);
 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 	} else {
 		skb->dev = tunnel->dev;
 	}
+
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
@@ -613,9 +613,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto tx_error;
 	}
 
-	if (tunnel->net != dev_net(dev))
-		skb_scrub_packet(skb);
-
 	if (tunnel->err_count > 0) {
 		if (time_before(jiffies,
 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
@@ -653,9 +650,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	err = iptunnel_xmit(dev_net(dev), rt, skb,
-			    fl4.saddr, fl4.daddr, protocol,
-			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
+	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
+			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df,
+			    !net_eq(tunnel->net, dev_net(dev)));
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 
 	return;
@@ -820,11 +817,10 @@ static void ip_tunnel_dev_free(struct net_device *dev)
 
 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 {
-	struct net *net = dev_net(dev);
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct ip_tunnel_net *itn;
 
-	itn = net_generic(net, tunnel->ip_tnl_net_id);
+	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 
 	if (itn->fb_tunnel_dev != dev) {
 		ip_tunnel_del(netdev_priv(dev));
@@ -838,56 +834,68 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 {
 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 	struct ip_tunnel_parm parms;
+	unsigned int i;
 
-	itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
-	if (!itn->tunnels)
-		return -ENOMEM;
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&itn->tunnels[i]);
 
 	if (!ops) {
 		itn->fb_tunnel_dev = NULL;
 		return 0;
 	}
+
 	memset(&parms, 0, sizeof(parms));
 	if (devname)
 		strlcpy(parms.name, devname, IFNAMSIZ);
 
 	rtnl_lock();
 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+	/* FB netdevice is special: we have one, and only one per netns.
+	 * Allowing to move it to another netns is clearly unsafe.
+	 */
+	if (!IS_ERR(itn->fb_tunnel_dev))
+		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 	rtnl_unlock();
-	if (IS_ERR(itn->fb_tunnel_dev)) {
-		kfree(itn->tunnels);
-		return PTR_ERR(itn->fb_tunnel_dev);
-	}
 
-	return 0;
+	return PTR_RET(itn->fb_tunnel_dev);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+			      struct rtnl_link_ops *ops)
 {
+	struct net *net = dev_net(itn->fb_tunnel_dev);
+	struct net_device *dev, *aux;
 	int h;
 
+	for_each_netdev_safe(net, dev, aux)
+		if (dev->rtnl_link_ops == ops)
+			unregister_netdevice_queue(dev, head);
+
 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
 		struct ip_tunnel *t;
 		struct hlist_node *n;
 		struct hlist_head *thead = &itn->tunnels[h];
 
 		hlist_for_each_entry_safe(t, n, thead, hash_node)
-			unregister_netdevice_queue(t->dev, head);
+			/* If dev is in the same netns, it has already
+			 * been added to the list by the previous loop.
+			 */
+			if (!net_eq(dev_net(t->dev), net))
+				unregister_netdevice_queue(t->dev, head);
 	}
 	if (itn->fb_tunnel_dev)
 		unregister_netdevice_queue(itn->fb_tunnel_dev, head);
 }
 
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
 {
 	LIST_HEAD(list);
 
 	rtnl_lock();
-	ip_tunnel_destroy(itn, &list);
+	ip_tunnel_destroy(itn, &list, ops);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
-	kfree(itn->tunnels);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
 
@@ -929,23 +937,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p)
 {
-	struct ip_tunnel *t, *nt;
-	struct net *net = dev_net(dev);
+	struct ip_tunnel *t;
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
 
 	if (dev == itn->fb_tunnel_dev)
 		return -EINVAL;
 
-	nt = netdev_priv(dev);
-
 	t = ip_tunnel_find(itn, p, dev->type);
 
 	if (t) {
 		if (t->dev != dev)
 			return -EEXIST;
 	} else {
-		t = nt;
+		t = tunnel;
 
 		if (dev->type != ARPHRD_ETHER) {
 			unsigned int nflags = 0;
@@ -984,6 +990,7 @@ int ip_tunnel_init(struct net_device *dev)
 	}
 
 	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
 	strcpy(tunnel->parms.name, dev->name);
 	iph->version = 4;
 	iph->ihl = 5;
@@ -994,8 +1001,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init);
 
 void ip_tunnel_uninit(struct net_device *dev)
 {
-	struct net *net = dev_net(dev);
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
 	struct ip_tunnel_net *itn;
 
 	itn = net_generic(net, tunnel->ip_tnl_net_id);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 7167b08977df..d6c856b17fd4 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -46,19 +46,17 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 
-int iptunnel_xmit(struct net *net, struct rtable *rt,
-		  struct sk_buff *skb,
+int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
-		  __u8 tos, __u8 ttl, __be16 df)
+		  __u8 tos, __u8 ttl, __be16 df, bool xnet)
 {
 	int pkt_len = skb->len;
 	struct iphdr *iph;
 	int err;
 
-	nf_reset(skb);
-	secpath_reset(skb);
+	skb_scrub_packet(skb, xnet);
+
 	skb->rxhash = 0;
-	skb_dst_drop(skb);
 	skb_dst_set(skb, &rt->dst);
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 
@@ -76,9 +74,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt,
 	iph->daddr	=	dst;
 	iph->saddr	=	src;
 	iph->ttl	=	ttl;
-	tunnel_ip_select_ident(skb,
-			       (const struct iphdr *)skb_inner_network_header(skb),
-			       &rt->dst);
+	__ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 
 	err = ip_local_out(skb);
 	if (unlikely(net_xmit_eval(err)))
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 17cc0ffa8c0d..e805e7b3030e 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -44,176 +44,10 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
-#define HASH_SIZE  16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
-
 static struct rtnl_link_ops vti_link_ops __read_mostly;
 
 static int vti_net_id __read_mostly;
-struct vti_net {
-	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
-	struct ip_tunnel __rcu *tunnels_wc[1];
-	struct ip_tunnel __rcu **tunnels[4];
-
-	struct net_device *fb_tunnel_dev;
-};
-
-static int vti_fb_tunnel_init(struct net_device *dev);
 static int vti_tunnel_init(struct net_device *dev);
-static void vti_tunnel_setup(struct net_device *dev);
-static void vti_dev_free(struct net_device *dev);
-static int vti_tunnel_bind_dev(struct net_device *dev);
-
-#define VTI_XMIT(stats1, stats2) do {				\
-	int err;						\
-	int pkt_len = skb->len;					\
-	err = dst_output(skb);					\
-	if (net_xmit_eval(err) == 0) {				\
-		u64_stats_update_begin(&(stats1)->syncp);	\
-		(stats1)->tx_bytes += pkt_len;			\
-		(stats1)->tx_packets++;				\
-		u64_stats_update_end(&(stats1)->syncp);		\
-	} else {						\
-		(stats2)->tx_errors++;				\
-		(stats2)->tx_aborted_errors++;			\
-	}							\
-} while (0)
-
-
-static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
-					   __be32 remote, __be32 local)
-{
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(local);
-	struct ip_tunnel *t;
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
-		if (local == t->parms.iph.saddr &&
-		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
-			return t;
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
-		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
-			return t;
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
-		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
-			return t;
-
-	for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
-		if (t && (t->dev->flags&IFF_UP))
-			return t;
-	return NULL;
-}
-
-static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
-					     struct ip_tunnel_parm *parms)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	unsigned h = 0;
-	int prio = 0;
-
-	if (remote) {
-		prio |= 2;
-		h ^= HASH(remote);
-	}
-	if (local) {
-		prio |= 1;
-		h ^= HASH(local);
-	}
-	return &ipn->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
-						  struct ip_tunnel *t)
-{
-	return __vti_bucket(ipn, &t->parms);
-}
-
-static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp;
-	struct ip_tunnel *iter;
-
-	for (tp = vti_bucket(ipn, t);
-	     (iter = rtnl_dereference(*tp)) != NULL;
-	     tp = &iter->next) {
-		if (t == iter) {
-			rcu_assign_pointer(*tp, t->next);
-			break;
-		}
-	}
-}
-
-static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
-{
-	struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
-
-	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
-	rcu_assign_pointer(*tp, t);
-}
-
-static struct ip_tunnel *vti_tunnel_locate(struct net *net,
-					   struct ip_tunnel_parm *parms,
-					   int create)
-{
-	__be32 remote = parms->iph.daddr;
-	__be32 local = parms->iph.saddr;
-	struct ip_tunnel *t, *nt;
-	struct ip_tunnel __rcu **tp;
-	struct net_device *dev;
-	char name[IFNAMSIZ];
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	for (tp = __vti_bucket(ipn, parms);
-	     (t = rtnl_dereference(*tp)) != NULL;
-	     tp = &t->next) {
-		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
-			return t;
-	}
-	if (!create)
-		return NULL;
-
-	if (parms->name[0])
-		strlcpy(name, parms->name, IFNAMSIZ);
-	else
-		strcpy(name, "vti%d");
-
-	dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
-	if (dev == NULL)
-		return NULL;
-
-	dev_net_set(dev, net);
-
-	nt = netdev_priv(dev);
-	nt->parms = *parms;
-	dev->rtnl_link_ops = &vti_link_ops;
-
-	vti_tunnel_bind_dev(dev);
-
-	if (register_netdevice(dev) < 0)
-		goto failed_free;
-
-	dev_hold(dev);
-	vti_tunnel_link(ipn, nt);
-	return nt;
-
-failed_free:
-	free_netdev(dev);
-	return NULL;
-}
-
-static void vti_tunnel_uninit(struct net_device *dev)
-{
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	vti_tunnel_unlink(ipn, netdev_priv(dev));
-	dev_put(dev);
-}
 
 static int vti_err(struct sk_buff *skb, u32 info)
 {
@@ -222,6 +56,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
 	 * 8 bytes of packet payload. It means, that precise relaying of
 	 * ICMP in the real Internet is absolutely infeasible.
 	 */
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
@@ -252,7 +88,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
 
 	err = -ENOENT;
 
-	t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			     iph->daddr, iph->saddr, 0);
 	if (t == NULL)
 		goto out;
 
@@ -281,8 +118,11 @@ static int vti_rcv(struct sk_buff *skb)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
-	tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
 
@@ -311,7 +151,6 @@ static int vti_rcv(struct sk_buff *skb)
 static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct pcpu_tstats *tstats;
 	struct iphdr  *tiph = &tunnel->parms.iph;
 	u8     tos;
 	struct rtable *rt;		/* Route to the other host */
@@ -319,6 +158,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct iphdr  *old_iph = ip_hdr(skb);
 	__be32 dst = tiph->daddr;
 	struct flowi4 fl4;
+	int err;
 
 	if (skb->protocol != htons(ETH_P_IP))
 		goto tx_error;
@@ -367,8 +207,10 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	nf_reset(skb);
 	skb->dev = skb_dst(skb)->dev;
 
-	tstats = this_cpu_ptr(dev->tstats);
-	VTI_XMIT(tstats, &dev->stats);
+	err = dst_output(skb);
+	if (net_xmit_eval(err) == 0)
+		err = skb->len;
+	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
@@ -379,198 +221,57 @@ tx_error:
 	return NETDEV_TX_OK;
 }
 
-static int vti_tunnel_bind_dev(struct net_device *dev)
-{
-	struct net_device *tdev = NULL;
-	struct ip_tunnel *tunnel;
-	struct iphdr *iph;
-
-	tunnel = netdev_priv(dev);
-	iph = &tunnel->parms.iph;
-
-	if (iph->daddr) {
-		struct rtable *rt;
-		struct flowi4 fl4;
-		memset(&fl4, 0, sizeof(fl4));
-		flowi4_init_output(&fl4, tunnel->parms.link,
-				   be32_to_cpu(tunnel->parms.i_key),
-				   RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-				   IPPROTO_IPIP, 0,
-				   iph->daddr, iph->saddr, 0, 0);
-		rt = ip_route_output_key(dev_net(dev), &fl4);
-		if (!IS_ERR(rt)) {
-			tdev = rt->dst.dev;
-			ip_rt_put(rt);
-		}
-		dev->flags |= IFF_POINTOPOINT;
-	}
-
-	if (!tdev && tunnel->parms.link)
-		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
-
-	if (tdev) {
-		dev->hard_header_len = tdev->hard_header_len +
-				       sizeof(struct iphdr);
-		dev->mtu = tdev->mtu;
-	}
-	dev->iflink = tunnel->parms.link;
-	return dev->mtu;
-}
-
 static int
 vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	int err = 0;
 	struct ip_tunnel_parm p;
-	struct ip_tunnel *t;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	switch (cmd) {
-	case SIOCGETTUNNEL:
-		t = NULL;
-		if (dev == ipn->fb_tunnel_dev) {
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
-					   sizeof(p))) {
-				err = -EFAULT;
-				break;
-			}
-			t = vti_tunnel_locate(net, &p, 0);
-		}
-		if (t == NULL)
-			t = netdev_priv(dev);
-		memcpy(&p, &t->parms, sizeof(p));
-		p.i_flags |= GRE_KEY | VTI_ISVTI;
-		p.o_flags |= GRE_KEY;
-		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
-			err = -EFAULT;
-		break;
-
-	case SIOCADDTUNNEL:
-	case SIOCCHGTUNNEL:
-		err = -EPERM;
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-			goto done;
 
-		err = -EFAULT;
-		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
-			goto done;
+	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		return -EFAULT;
 
-		err = -EINVAL;
+	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
 		    p.iph.ihl != 5)
-			goto done;
-
-		t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
-		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
-			if (t != NULL) {
-				if (t->dev != dev) {
-					err = -EEXIST;
-					break;
-				}
-			} else {
-				if (((dev->flags&IFF_POINTOPOINT) &&
-				    !p.iph.daddr) ||
-				    (!(dev->flags&IFF_POINTOPOINT) &&
-				    p.iph.daddr)) {
-					err = -EINVAL;
-					break;
-				}
-				t = netdev_priv(dev);
-				vti_tunnel_unlink(ipn, t);
-				synchronize_net();
-				t->parms.iph.saddr = p.iph.saddr;
-				t->parms.iph.daddr = p.iph.daddr;
-				t->parms.i_key = p.i_key;
-				t->parms.o_key = p.o_key;
-				t->parms.iph.protocol = IPPROTO_IPIP;
-				memcpy(dev->dev_addr, &p.iph.saddr, 4);
-				memcpy(dev->broadcast, &p.iph.daddr, 4);
-				vti_tunnel_link(ipn, t);
-				netdev_state_change(dev);
-			}
-		}
-
-		if (t) {
-			err = 0;
-			if (cmd == SIOCCHGTUNNEL) {
-				t->parms.i_key = p.i_key;
-				t->parms.o_key = p.o_key;
-				if (t->parms.link != p.link) {
-					t->parms.link = p.link;
-					vti_tunnel_bind_dev(dev);
-					netdev_state_change(dev);
-				}
-			}
-			p.i_flags |= GRE_KEY | VTI_ISVTI;
-			p.o_flags |= GRE_KEY;
-			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
-					 sizeof(p)))
-				err = -EFAULT;
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
-		break;
+			return -EINVAL;
+	}
 
-	case SIOCDELTUNNEL:
-		err = -EPERM;
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-			goto done;
-
-		if (dev == ipn->fb_tunnel_dev) {
-			err = -EFAULT;
-			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
-					   sizeof(p)))
-				goto done;
-			err = -ENOENT;
-
-			t = vti_tunnel_locate(net, &p, 0);
-			if (t == NULL)
-				goto done;
-			err = -EPERM;
-			if (t->dev == ipn->fb_tunnel_dev)
-				goto done;
-			dev = t->dev;
-		}
-		unregister_netdevice(dev);
-		err = 0;
-		break;
+	err = ip_tunnel_ioctl(dev, &p, cmd);
+	if (err)
+		return err;
 
-	default:
-		err = -EINVAL;
+	if (cmd != SIOCDELTUNNEL) {
+		p.i_flags |= GRE_KEY | VTI_ISVTI;
+		p.o_flags |= GRE_KEY;
 	}
 
-done:
-	return err;
-}
-
-static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if (new_mtu < 68 || new_mtu > 0xFFF8)
-		return -EINVAL;
-	dev->mtu = new_mtu;
+	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+		return -EFAULT;
 	return 0;
 }
 
 static const struct net_device_ops vti_netdev_ops = {
 	.ndo_init	= vti_tunnel_init,
-	.ndo_uninit	= vti_tunnel_uninit,
+	.ndo_uninit	= ip_tunnel_uninit,
 	.ndo_start_xmit	= vti_tunnel_xmit,
 	.ndo_do_ioctl	= vti_tunnel_ioctl,
-	.ndo_change_mtu	= vti_tunnel_change_mtu,
+	.ndo_change_mtu	= ip_tunnel_change_mtu,
 	.ndo_get_stats64 = ip_tunnel_get_stats64,
 };
 
-static void vti_dev_free(struct net_device *dev)
+static void vti_tunnel_setup(struct net_device *dev)
 {
-	free_percpu(dev->tstats);
-	free_netdev(dev);
+	dev->netdev_ops		= &vti_netdev_ops;
+	ip_tunnel_setup(dev, vti_net_id);
 }
 
-static void vti_tunnel_setup(struct net_device *dev)
+static int vti_tunnel_init(struct net_device *dev)
 {
-	dev->netdev_ops		= &vti_netdev_ops;
-	dev->destructor		= vti_dev_free;
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+
+	memcpy(dev->dev_addr, &iph->saddr, 4);
+	memcpy(dev->broadcast, &iph->daddr, 4);
 
 	dev->type		= ARPHRD_TUNNEL;
 	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
@@ -581,38 +282,18 @@ static void vti_tunnel_setup(struct net_device *dev)
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
-}
 
-static int vti_tunnel_init(struct net_device *dev)
-{
-	struct ip_tunnel *tunnel = netdev_priv(dev);
-
-	tunnel->dev = dev;
-	strcpy(tunnel->parms.name, dev->name);
-
-	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
-	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
-
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
-	if (!dev->tstats)
-		return -ENOMEM;
-
-	return 0;
+	return ip_tunnel_init(dev);
 }
 
-static int __net_init vti_fb_tunnel_init(struct net_device *dev)
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
 
 	iph->version		= 4;
 	iph->protocol		= IPPROTO_IPIP;
 	iph->ihl		= 5;
-
-	dev_hold(dev);
-	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
-	return 0;
 }
 
 static struct xfrm_tunnel vti_handler __read_mostly = {
@@ -621,76 +302,30 @@ static struct xfrm_tunnel vti_handler __read_mostly = {
 	.priority	=	1,
 };
 
-static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
-{
-	int prio;
-
-	for (prio = 1; prio < 4; prio++) {
-		int h;
-		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-
-			t = rtnl_dereference(ipn->tunnels[prio][h]);
-			while (t != NULL) {
-				unregister_netdevice_queue(t->dev, head);
-				t = rtnl_dereference(t->next);
-			}
-		}
-	}
-}
-
 static int __net_init vti_init_net(struct net *net)
 {
 	int err;
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-
-	ipn->tunnels[0] = ipn->tunnels_wc;
-	ipn->tunnels[1] = ipn->tunnels_l;
-	ipn->tunnels[2] = ipn->tunnels_r;
-	ipn->tunnels[3] = ipn->tunnels_r_l;
-
-	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
-					  "ip_vti0",
-					  vti_tunnel_setup);
-	if (!ipn->fb_tunnel_dev) {
-		err = -ENOMEM;
-		goto err_alloc_dev;
-	}
-	dev_net_set(ipn->fb_tunnel_dev, net);
-
-	err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
-	if (err)
-		goto err_reg_dev;
-	ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
+	struct ip_tunnel_net *itn;
 
-	err = register_netdev(ipn->fb_tunnel_dev);
+	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
 	if (err)
-		goto err_reg_dev;
+		return err;
+	itn = net_generic(net, vti_net_id);
+	vti_fb_tunnel_init(itn->fb_tunnel_dev);
 	return 0;
-
-err_reg_dev:
-	vti_dev_free(ipn->fb_tunnel_dev);
-err_alloc_dev:
-	/* nothing */
-	return err;
 }
 
 static void __net_exit vti_exit_net(struct net *net)
 {
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-	LIST_HEAD(list);
-
-	rtnl_lock();
-	vti_destroy_tunnels(ipn, &list);
-	unregister_netdevice_many(&list);
-	rtnl_unlock();
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	ip_tunnel_delete_net(itn, &vti_link_ops);
 }
 
 static struct pernet_operations vti_net_ops = {
 	.init = vti_init_net,
 	.exit = vti_exit_net,
 	.id   = &vti_net_id,
-	.size = sizeof(struct vti_net),
+	.size = sizeof(struct ip_tunnel_net),
 };
 
 static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -728,78 +363,19 @@ static void vti_netlink_parms(struct nlattr *data[],
 static int vti_newlink(struct net *src_net, struct net_device *dev,
 		       struct nlattr *tb[], struct nlattr *data[])
 {
-	struct ip_tunnel *nt;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
-	int mtu;
-	int err;
-
-	nt = netdev_priv(dev);
-	vti_netlink_parms(data, &nt->parms);
-
-	if (vti_tunnel_locate(net, &nt->parms, 0))
-		return -EEXIST;
+	struct ip_tunnel_parm parms;
 
-	mtu = vti_tunnel_bind_dev(dev);
-	if (!tb[IFLA_MTU])
-		dev->mtu = mtu;
-
-	err = register_netdevice(dev);
-	if (err)
-		goto out;
-
-	dev_hold(dev);
-	vti_tunnel_link(ipn, nt);
-
-out:
-	return err;
+	vti_netlink_parms(data, &parms);
+	return ip_tunnel_newlink(dev, tb, &parms);
 }
 
 static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
 			  struct nlattr *data[])
 {
-	struct ip_tunnel *t, *nt;
-	struct net *net = dev_net(dev);
-	struct vti_net *ipn = net_generic(net, vti_net_id);
 	struct ip_tunnel_parm p;
-	int mtu;
-
-	if (dev == ipn->fb_tunnel_dev)
-		return -EINVAL;
 
-	nt = netdev_priv(dev);
 	vti_netlink_parms(data, &p);
-
-	t = vti_tunnel_locate(net, &p, 0);
-
-	if (t) {
-		if (t->dev != dev)
-			return -EEXIST;
-	} else {
-		t = nt;
-
-		vti_tunnel_unlink(ipn, t);
-		t->parms.iph.saddr = p.iph.saddr;
-		t->parms.iph.daddr = p.iph.daddr;
-		t->parms.i_key = p.i_key;
-		t->parms.o_key = p.o_key;
-		if (dev->type != ARPHRD_ETHER) {
-			memcpy(dev->dev_addr, &p.iph.saddr, 4);
-			memcpy(dev->broadcast, &p.iph.daddr, 4);
-		}
-		vti_tunnel_link(ipn, t);
-		netdev_state_change(dev);
-	}
-
-	if (t->parms.link != p.link) {
-		t->parms.link = p.link;
-		mtu = vti_tunnel_bind_dev(dev);
-		if (!tb[IFLA_MTU])
-			dev->mtu = mtu;
-		netdev_state_change(dev);
-	}
-
-	return 0;
+	return ip_tunnel_changelink(dev, tb, &p);
 }
 
 static size_t vti_get_size(const struct net_device *dev)
@@ -865,7 +441,7 @@ static int __init vti_init(void)
 	err = xfrm4_mode_tunnel_input_register(&vti_handler);
 	if (err < 0) {
 		unregister_pernet_device(&vti_net_ops);
-		pr_info(KERN_INFO "vti init: can't register tunnel\n");
+		pr_info("vti init: can't register tunnel\n");
 	}
 
 	err = rtnl_link_register(&vti_link_ops);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 51fc2a1dcdd3..7f80fb4b82d3 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -190,15 +190,14 @@ static int ipip_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
 
-	if (iptunnel_pull_header(skb, 0, tpi.proto))
-		goto drop;
-
 	iph = ip_hdr(skb);
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			iph->saddr, iph->daddr, 0);
 	if (tunnel) {
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
+		if (iptunnel_pull_header(skb, 0, tpi.proto))
+			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
 	}
 
@@ -286,7 +285,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
-	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 
@@ -437,7 +435,7 @@ static int __net_init ipip_init_net(struct net *net)
 static void __net_exit ipip_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
-	ip_tunnel_delete_net(itn);
+	ip_tunnel_delete_net(itn, &ipip_link_ops);
 }
 
 static struct pernet_operations ipip_net_ops = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 132a09664704..9ae54b09254f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -127,9 +127,9 @@ static struct kmem_cache *mrt_cachep __read_mostly;
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
 static void ipmr_free_table(struct mr_table *mrt);
 
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
-			 struct sk_buff *skb, struct mfc_cache *cache,
-			 int local);
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct sk_buff *skb, struct mfc_cache *cache,
+			  int local);
 static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
@@ -1795,9 +1795,9 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
 
 /* "local" means that we should preserve one skb (for local delivery) */
 
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
-			 struct sk_buff *skb, struct mfc_cache *cache,
-			 int local)
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct sk_buff *skb, struct mfc_cache *cache,
+			  int local)
 {
 	int psend = -1;
 	int vif, ct;
@@ -1903,14 +1903,13 @@ last_forward:
 			ipmr_queue_xmit(net, mrt, skb2, cache, psend);
 		} else {
 			ipmr_queue_xmit(net, mrt, skb, cache, psend);
-			return 0;
+			return;
 		}
 	}
 
 dont_forward:
 	if (!local)
 		kfree_skb(skb);
-	return 0;
 }
 
 static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
@@ -2068,9 +2067,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
2068 skb_reset_network_header(skb); 2067 skb_reset_network_header(skb);
2069 skb->protocol = htons(ETH_P_IP); 2068 skb->protocol = htons(ETH_P_IP);
2070 skb->ip_summed = CHECKSUM_NONE; 2069 skb->ip_summed = CHECKSUM_NONE;
2071 skb->pkt_type = PACKET_HOST;
2072 2070
2073 skb_tunnel_rx(skb, reg_dev); 2071 skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
2074 2072
2075 netif_rx(skb); 2073 netif_rx(skb);
2076 2074
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4e9028017428..1657e39b291f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT
110 110
111 To compile it as a module, choose M here. If unsure, say N. 111 To compile it as a module, choose M here. If unsure, say N.
112 112
113config IP_NF_TARGET_SYNPROXY
114 tristate "SYNPROXY target support"
115 depends on NF_CONNTRACK && NETFILTER_ADVANCED
116 select NETFILTER_SYNPROXY
117 select SYN_COOKIES
118 help
119 The SYNPROXY target allows you to intercept TCP connections and
120 establish them using syncookies before they are passed on to the
121 server. This avoids conntrack and server resource usage
122 during SYN-flood attacks.
123
124 To compile it as a module, choose M here. If unsure, say N.
125
113config IP_NF_TARGET_ULOG 126config IP_NF_TARGET_ULOG
114 tristate "ULOG target support (obsolete)" 127 tristate "ULOG target support (obsolete)"
115 default m if NETFILTER_ADVANCED=n 128 default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 007b128eecc9..3622b248b6dd 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
49obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
49obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 50obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
50 51
51# generic ARP tables 52# generic ARP tables
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index eadab1ed6500..a865f6f94013 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -48,7 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net)
48 net->ipv4.arptable_filter = 48 net->ipv4.arptable_filter =
49 arpt_register_table(net, &packet_filter, repl); 49 arpt_register_table(net, &packet_filter, repl);
50 kfree(repl); 50 kfree(repl);
51 return PTR_RET(net->ipv4.arptable_filter); 51 return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
52} 52}
53 53
54static void __net_exit arptable_filter_net_exit(struct net *net) 54static void __net_exit arptable_filter_net_exit(struct net *net)
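The PTR_RET() call sites here (and the identical ones in the iptable_* init paths further down) are switched to the clearer name PTR_ERR_OR_ZERO(). For readers outside the tree, a minimal user-space rendition of the helper's semantics, assuming the usual top-4095-addresses errno encoding; the names below are local stand-ins, not kernel API:

#include <stdio.h>

#define MAX_ERRNO	4095

static int is_err_ptr(const void *ptr)
{
	/* error "pointers" live in the top MAX_ERRNO addresses */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static long ptr_err_or_zero(const void *ptr)
{
	/* encoded -errno if the pointer is an error cookie, else 0 */
	return is_err_ptr(ptr) ? (long)ptr : 0;
}

int main(void)
{
	void *table = (void *)(long)-22;	/* as if ERR_PTR(-EINVAL) */

	printf("%ld\n", ptr_err_or_zero(table));	/* -22 */
	printf("%ld\n", ptr_err_or_zero(&table));	/* 0   */
	return 0;
}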
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 30e4de940567..00352ce0f0de 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this,
118 NF_CT_ASSERT(dev->ifindex != 0); 118 NF_CT_ASSERT(dev->ifindex != 0);
119 119
120 nf_ct_iterate_cleanup(net, device_cmp, 120 nf_ct_iterate_cleanup(net, device_cmp,
121 (void *)(long)dev->ifindex); 121 (void *)(long)dev->ifindex, 0, 0);
122 } 122 }
123 123
124 return NOTIFY_DONE; 124 return NOTIFY_DONE;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 04b18c1ac345..b969131ad1c1 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -119,7 +119,26 @@ static void send_reset(struct sk_buff *oldskb, int hook)
119 119
120 nf_ct_attach(nskb, oldskb); 120 nf_ct_attach(nskb, oldskb);
121 121
122 ip_local_out(nskb); 122#ifdef CONFIG_BRIDGE_NETFILTER
123 /* If we use ip_local_out for bridged traffic, the MAC source on
124 * the RST will be ours, instead of the destination's. This confuses
125 * some routers/firewalls, and they drop the packet. So we need to
126 * build the eth header using the original destination's MAC as the
127 * source, and send the RST packet directly.
128 */
129 if (oldskb->nf_bridge) {
130 struct ethhdr *oeth = eth_hdr(oldskb);
131 nskb->dev = oldskb->nf_bridge->physindev;
132 niph->tot_len = htons(nskb->len);
133 ip_send_check(niph);
134 if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
135 oeth->h_source, oeth->h_dest, nskb->len) < 0)
136 goto free_nskb;
137 dev_queue_xmit(nskb);
138 } else
139#endif
140 ip_local_out(nskb);
141
123 return; 142 return;
124 143
125 free_nskb: 144 free_nskb:
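The comment above is the whole story: for bridged traffic the RST must be crafted with the original destination's MAC as its source and handed straight to dev_queue_xmit(). A user-space sketch of just the address swap, with made-up addresses and a simplified header struct (not the kernel's struct ethhdr):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct eth { uint8_t h_dest[6], h_source[6]; uint16_t h_proto; };

/* answer as the original target: its MAC becomes our source address */
static void build_rst_eth(const struct eth *orig, struct eth *rst)
{
	memcpy(rst->h_source, orig->h_dest, 6);
	memcpy(rst->h_dest, orig->h_source, 6);
	rst->h_proto = orig->h_proto;
}

int main(void)
{
	struct eth in = {
		.h_dest   = { 0x02, 0, 0, 0, 0, 0x01 },	/* bridged target */
		.h_source = { 0x02, 0, 0, 0, 0, 0x02 },	/* offending sender */
		.h_proto  = 8,					/* stand-in value */
	};
	struct eth out;

	build_rst_eth(&in, &out);
	printf("RST %02x:...:%02x -> %02x:...:%02x\n",
	       out.h_source[0], out.h_source[5],
	       out.h_dest[0], out.h_dest[5]);
	return 0;
}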
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 000000000000..67e17dcda65e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,476 @@
1/*
2 * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <net/tcp.h>
12
13#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/netfilter/x_tables.h>
15#include <linux/netfilter/xt_SYNPROXY.h>
16#include <net/netfilter/nf_conntrack.h>
17#include <net/netfilter/nf_conntrack_seqadj.h>
18#include <net/netfilter/nf_conntrack_synproxy.h>
19
20static struct iphdr *
21synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
22{
23 struct iphdr *iph;
24
25 skb_reset_network_header(skb);
26 iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
27 iph->version = 4;
28 iph->ihl = sizeof(*iph) / 4;
29 iph->tos = 0;
30 iph->id = 0;
31 iph->frag_off = htons(IP_DF);
32 iph->ttl = sysctl_ip_default_ttl;
33 iph->protocol = IPPROTO_TCP;
34 iph->check = 0;
35 iph->saddr = saddr;
36 iph->daddr = daddr;
37
38 return iph;
39}
40
41static void
42synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
43 struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
44 struct iphdr *niph, struct tcphdr *nth,
45 unsigned int tcp_hdr_size)
46{
47 nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
48 nskb->ip_summed = CHECKSUM_PARTIAL;
49 nskb->csum_start = (unsigned char *)nth - nskb->head;
50 nskb->csum_offset = offsetof(struct tcphdr, check);
51
52 skb_dst_set_noref(nskb, skb_dst(skb));
53 nskb->protocol = htons(ETH_P_IP);
54 if (ip_route_me_harder(nskb, RTN_UNSPEC))
55 goto free_nskb;
56
57 if (nfct) {
58 nskb->nfct = nfct;
59 nskb->nfctinfo = ctinfo;
60 nf_conntrack_get(nfct);
61 }
62
63 ip_local_out(nskb);
64 return;
65
66free_nskb:
67 kfree_skb(nskb);
68}
69
70static void
71synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
72 const struct synproxy_options *opts)
73{
74 struct sk_buff *nskb;
75 struct iphdr *iph, *niph;
76 struct tcphdr *nth;
77 unsigned int tcp_hdr_size;
78 u16 mss = opts->mss;
79
80 iph = ip_hdr(skb);
81
82 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
83 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
84 GFP_ATOMIC);
85 if (nskb == NULL)
86 return;
87 skb_reserve(nskb, MAX_TCP_HEADER);
88
89 niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
90
91 skb_reset_transport_header(nskb);
92 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
93 nth->source = th->dest;
94 nth->dest = th->source;
95 nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
96 nth->ack_seq = htonl(ntohl(th->seq) + 1);
97 tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
98 if (opts->options & XT_SYNPROXY_OPT_ECN)
99 tcp_flag_word(nth) |= TCP_FLAG_ECE;
100 nth->doff = tcp_hdr_size / 4;
101 nth->window = 0;
102 nth->check = 0;
103 nth->urg_ptr = 0;
104
105 synproxy_build_options(nth, opts);
106
107 synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
108 niph, nth, tcp_hdr_size);
109}
110
111static void
112synproxy_send_server_syn(const struct synproxy_net *snet,
113 const struct sk_buff *skb, const struct tcphdr *th,
114 const struct synproxy_options *opts, u32 recv_seq)
115{
116 struct sk_buff *nskb;
117 struct iphdr *iph, *niph;
118 struct tcphdr *nth;
119 unsigned int tcp_hdr_size;
120
121 iph = ip_hdr(skb);
122
123 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
124 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
125 GFP_ATOMIC);
126 if (nskb == NULL)
127 return;
128 skb_reserve(nskb, MAX_TCP_HEADER);
129
130 niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
131
132 skb_reset_transport_header(nskb);
133 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
134 nth->source = th->source;
135 nth->dest = th->dest;
136 nth->seq = htonl(recv_seq - 1);
137 /* ack_seq is used to relay our ISN to the synproxy hook to initialize
138 * sequence number translation once a connection tracking entry exists.
139 */
140 nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
141 tcp_flag_word(nth) = TCP_FLAG_SYN;
142 if (opts->options & XT_SYNPROXY_OPT_ECN)
143 tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
144 nth->doff = tcp_hdr_size / 4;
145 nth->window = th->window;
146 nth->check = 0;
147 nth->urg_ptr = 0;
148
149 synproxy_build_options(nth, opts);
150
151 synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
152 niph, nth, tcp_hdr_size);
153}
154
155static void
156synproxy_send_server_ack(const struct synproxy_net *snet,
157 const struct ip_ct_tcp *state,
158 const struct sk_buff *skb, const struct tcphdr *th,
159 const struct synproxy_options *opts)
160{
161 struct sk_buff *nskb;
162 struct iphdr *iph, *niph;
163 struct tcphdr *nth;
164 unsigned int tcp_hdr_size;
165
166 iph = ip_hdr(skb);
167
168 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
169 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
170 GFP_ATOMIC);
171 if (nskb == NULL)
172 return;
173 skb_reserve(nskb, MAX_TCP_HEADER);
174
175 niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
176
177 skb_reset_transport_header(nskb);
178 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
179 nth->source = th->dest;
180 nth->dest = th->source;
181 nth->seq = htonl(ntohl(th->ack_seq));
182 nth->ack_seq = htonl(ntohl(th->seq) + 1);
183 tcp_flag_word(nth) = TCP_FLAG_ACK;
184 nth->doff = tcp_hdr_size / 4;
185 nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
186 nth->check = 0;
187 nth->urg_ptr = 0;
188
189 synproxy_build_options(nth, opts);
190
191 synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
192}
193
194static void
195synproxy_send_client_ack(const struct synproxy_net *snet,
196 const struct sk_buff *skb, const struct tcphdr *th,
197 const struct synproxy_options *opts)
198{
199 struct sk_buff *nskb;
200 struct iphdr *iph, *niph;
201 struct tcphdr *nth;
202 unsigned int tcp_hdr_size;
203
204 iph = ip_hdr(skb);
205
206 tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
207 nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
208 GFP_ATOMIC);
209 if (nskb == NULL)
210 return;
211 skb_reserve(nskb, MAX_TCP_HEADER);
212
213 niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
214
215 skb_reset_transport_header(nskb);
216 nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
217 nth->source = th->source;
218 nth->dest = th->dest;
219 nth->seq = htonl(ntohl(th->seq) + 1);
220 nth->ack_seq = th->ack_seq;
221 tcp_flag_word(nth) = TCP_FLAG_ACK;
222 nth->doff = tcp_hdr_size / 4;
223 nth->window = ntohs(htons(th->window) >> opts->wscale);
224 nth->check = 0;
225 nth->urg_ptr = 0;
226
227 synproxy_build_options(nth, opts);
228
229 synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
230}
231
232static bool
233synproxy_recv_client_ack(const struct synproxy_net *snet,
234 const struct sk_buff *skb, const struct tcphdr *th,
235 struct synproxy_options *opts, u32 recv_seq)
236{
237 int mss;
238
239 mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
240 if (mss == 0) {
241 this_cpu_inc(snet->stats->cookie_invalid);
242 return false;
243 }
244
245 this_cpu_inc(snet->stats->cookie_valid);
246 opts->mss = mss;
247
248 if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
249 synproxy_check_timestamp_cookie(opts);
250
251 synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
252 return true;
253}
254
255static unsigned int
256synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
257{
258 const struct xt_synproxy_info *info = par->targinfo;
259 struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
260 struct synproxy_options opts = {};
261 struct tcphdr *th, _th;
262
263 if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
264 return NF_DROP;
265
266 th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
267 if (th == NULL)
268 return NF_DROP;
269
270 synproxy_parse_options(skb, par->thoff, th, &opts);
271
272 if (th->syn && !(th->ack || th->fin || th->rst)) {
273 /* Initial SYN from client */
274 this_cpu_inc(snet->stats->syn_received);
275
276 if (th->ece && th->cwr)
277 opts.options |= XT_SYNPROXY_OPT_ECN;
278
279 opts.options &= info->options;
280 if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
281 synproxy_init_timestamp_cookie(info, &opts);
282 else
283 opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
284 XT_SYNPROXY_OPT_SACK_PERM |
285 XT_SYNPROXY_OPT_ECN);
286
287 synproxy_send_client_synack(skb, th, &opts);
288 return NF_DROP;
289
290 } else if (th->ack && !(th->fin || th->rst || th->syn)) {
291 /* ACK from client */
292 synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
293 return NF_DROP;
294 }
295
296 return XT_CONTINUE;
297}
298
299static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
300 struct sk_buff *skb,
301 const struct net_device *in,
302 const struct net_device *out,
303 int (*okfn)(struct sk_buff *))
304{
305 struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
306 enum ip_conntrack_info ctinfo;
307 struct nf_conn *ct;
308 struct nf_conn_synproxy *synproxy;
309 struct synproxy_options opts = {};
310 const struct ip_ct_tcp *state;
311 struct tcphdr *th, _th;
312 unsigned int thoff;
313
314 ct = nf_ct_get(skb, &ctinfo);
315 if (ct == NULL)
316 return NF_ACCEPT;
317
318 synproxy = nfct_synproxy(ct);
319 if (synproxy == NULL)
320 return NF_ACCEPT;
321
322 if (nf_is_loopback_packet(skb))
323 return NF_ACCEPT;
324
325 thoff = ip_hdrlen(skb);
326 th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
327 if (th == NULL)
328 return NF_DROP;
329
330 state = &ct->proto.tcp;
331 switch (state->state) {
332 case TCP_CONNTRACK_CLOSE:
333 if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
334 nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
335 ntohl(th->seq) + 1);
336 break;
337 }
338
339 if (!th->syn || th->ack ||
340 CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
341 break;
342
343 /* Reopened connection - reset the sequence number and timestamp
344 * adjustments, they will get initialized once the connection is
345 * reestablished.
346 */
347 nf_ct_seqadj_init(ct, ctinfo, 0);
348 synproxy->tsoff = 0;
349 this_cpu_inc(snet->stats->conn_reopened);
350
351 /* fall through */
352 case TCP_CONNTRACK_SYN_SENT:
353 synproxy_parse_options(skb, thoff, th, &opts);
354
355 if (!th->syn && th->ack &&
356 CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
357 /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
358 * therefore we need to add 1 to make the SYN sequence
359 * number match the one of the first SYN.
360 */
361 if (synproxy_recv_client_ack(snet, skb, th, &opts,
362 ntohl(th->seq) + 1))
363 this_cpu_inc(snet->stats->cookie_retrans);
364
365 return NF_DROP;
366 }
367
368 synproxy->isn = ntohl(th->ack_seq);
369 if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
370 synproxy->its = opts.tsecr;
371 break;
372 case TCP_CONNTRACK_SYN_RECV:
373 if (!th->syn || !th->ack)
374 break;
375
376 synproxy_parse_options(skb, thoff, th, &opts);
377 if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
378 synproxy->tsoff = opts.tsval - synproxy->its;
379
380 opts.options &= ~(XT_SYNPROXY_OPT_MSS |
381 XT_SYNPROXY_OPT_WSCALE |
382 XT_SYNPROXY_OPT_SACK_PERM);
383
384 swap(opts.tsval, opts.tsecr);
385 synproxy_send_server_ack(snet, state, skb, th, &opts);
386
387 nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
388
389 swap(opts.tsval, opts.tsecr);
390 synproxy_send_client_ack(snet, skb, th, &opts);
391
392 consume_skb(skb);
393 return NF_STOLEN;
394 default:
395 break;
396 }
397
398 synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
399 return NF_ACCEPT;
400}
401
402static int synproxy_tg4_check(const struct xt_tgchk_param *par)
403{
404 const struct ipt_entry *e = par->entryinfo;
405
406 if (e->ip.proto != IPPROTO_TCP ||
407 e->ip.invflags & XT_INV_PROTO)
408 return -EINVAL;
409
410 return nf_ct_l3proto_try_module_get(par->family);
411}
412
413static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
414{
415 nf_ct_l3proto_module_put(par->family);
416}
417
418static struct xt_target synproxy_tg4_reg __read_mostly = {
419 .name = "SYNPROXY",
420 .family = NFPROTO_IPV4,
421 .target = synproxy_tg4,
422 .targetsize = sizeof(struct xt_synproxy_info),
423 .checkentry = synproxy_tg4_check,
424 .destroy = synproxy_tg4_destroy,
425 .me = THIS_MODULE,
426};
427
428static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
429 {
430 .hook = ipv4_synproxy_hook,
431 .owner = THIS_MODULE,
432 .pf = NFPROTO_IPV4,
433 .hooknum = NF_INET_LOCAL_IN,
434 .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
435 },
436 {
437 .hook = ipv4_synproxy_hook,
438 .owner = THIS_MODULE,
439 .pf = NFPROTO_IPV4,
440 .hooknum = NF_INET_POST_ROUTING,
441 .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
442 },
443};
444
445static int __init synproxy_tg4_init(void)
446{
447 int err;
448
449 err = nf_register_hooks(ipv4_synproxy_ops,
450 ARRAY_SIZE(ipv4_synproxy_ops));
451 if (err < 0)
452 goto err1;
453
454 err = xt_register_target(&synproxy_tg4_reg);
455 if (err < 0)
456 goto err2;
457
458 return 0;
459
460err2:
461 nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
462err1:
463 return err;
464}
465
466static void __exit synproxy_tg4_exit(void)
467{
468 xt_unregister_target(&synproxy_tg4_reg);
469 nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
470}
471
472module_init(synproxy_tg4_init);
473module_exit(synproxy_tg4_exit);
474
475MODULE_LICENSE("GPL");
476MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
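The ISN relayed through ack_seq (see the comment in synproxy_send_server_syn() above) feeds nf_ct_seqadj_init(): once the real server has chosen its own ISN, every server-to-client sequence number is shifted by the difference so the client keeps seeing the cookie-based numbering. A small stand-alone illustration with invented ISNs:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t cookie_isn = 0x9ab41000;	/* proxy's SYN-ACK seq to client */
	uint32_t server_isn = 0x1c0de400;	/* real server's SYN-ACK seq */
	uint32_t seqoff = cookie_isn - server_isn; /* seqadj offset */
	uint32_t server_seq = server_isn + 1;	/* server's first data byte */

	/* all arithmetic is mod 2^32, matching TCP sequence space */
	printf("client sees seq %#x\n", (unsigned)(server_seq + seqoff));
	return 0;
}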
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 6b3da5cf54e9..50af5b45c050 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -69,7 +69,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
69 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
70 ipt_register_table(net, &packet_filter, repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl); 71 kfree(repl);
72 return PTR_RET(net->ipv4.iptable_filter); 72 return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
73} 73}
74 74
75static void __net_exit iptable_filter_net_exit(struct net *net) 75static void __net_exit iptable_filter_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index cba5658ec82c..0d8cd82e0fad 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -107,7 +107,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
107 net->ipv4.iptable_mangle = 107 net->ipv4.iptable_mangle =
108 ipt_register_table(net, &packet_mangler, repl); 108 ipt_register_table(net, &packet_mangler, repl);
109 kfree(repl); 109 kfree(repl);
110 return PTR_RET(net->ipv4.iptable_mangle); 110 return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
111} 111}
112 112
113static void __net_exit iptable_mangle_net_exit(struct net *net) 113static void __net_exit iptable_mangle_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 6383273d54e1..683bfaffed65 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -292,7 +292,7 @@ static int __net_init iptable_nat_net_init(struct net *net)
292 return -ENOMEM; 292 return -ENOMEM;
293 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); 293 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
294 kfree(repl); 294 kfree(repl);
295 return PTR_RET(net->ipv4.nat_table); 295 return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
296} 296}
297 297
298static void __net_exit iptable_nat_net_exit(struct net *net) 298static void __net_exit iptable_nat_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 03d9696d3c6e..1f82aea11df6 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,7 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
48 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
49 ipt_register_table(net, &packet_raw, repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl); 50 kfree(repl);
51 return PTR_RET(net->ipv4.iptable_raw); 51 return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
52} 52}
53 53
54static void __net_exit iptable_raw_net_exit(struct net *net) 54static void __net_exit iptable_raw_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index b283d8e2601a..f867a8d38bf7 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,7 +66,7 @@ static int __net_init iptable_security_net_init(struct net *net)
66 net->ipv4.iptable_security = 66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl); 67 ipt_register_table(net, &security_table, repl);
68 kfree(repl); 68 kfree(repl);
69 return PTR_RET(net->ipv4.iptable_security); 69 return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
70} 70}
71 71
72static void __net_exit iptable_security_net_exit(struct net *net) 72static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 0a2e0e3e95ba..86f5b34a4ed1 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -25,6 +25,7 @@
25#include <net/netfilter/nf_conntrack_l3proto.h> 25#include <net/netfilter/nf_conntrack_l3proto.h>
26#include <net/netfilter/nf_conntrack_zones.h> 26#include <net/netfilter/nf_conntrack_zones.h>
27#include <net/netfilter/nf_conntrack_core.h> 27#include <net/netfilter/nf_conntrack_core.h>
28#include <net/netfilter/nf_conntrack_seqadj.h>
28#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 29#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
29#include <net/netfilter/nf_nat_helper.h> 30#include <net/netfilter/nf_nat_helper.h>
30#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 31#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
@@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
136 /* adjust seqs for loopback traffic only in outgoing direction */ 137 /* adjust seqs for loopback traffic only in outgoing direction */
137 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 138 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
138 !nf_is_loopback_packet(skb)) { 139 !nf_is_loopback_packet(skb)) {
139 typeof(nf_nat_seq_adjust_hook) seq_adjust; 140 if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
140
141 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
142 if (!seq_adjust ||
143 !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
144 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 141 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
145 return NF_DROP; 142 return NF_DROP;
146 } 143 }
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 746427c9e719..d7d9882d4cae 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1082,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
1082 __u16 srcp = ntohs(inet->inet_sport); 1082 __u16 srcp = ntohs(inet->inet_sport);
1083 1083
1084 seq_printf(f, "%5d: %08X:%04X %08X:%04X" 1084 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
1085 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", 1085 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
1086 bucket, src, srcp, dest, destp, sp->sk_state, 1086 bucket, src, srcp, dest, destp, sp->sk_state,
1087 sk_wmem_alloc_get(sp), 1087 sk_wmem_alloc_get(sp),
1088 sk_rmem_alloc_get(sp), 1088 sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 6577a1149a47..4a0335854b89 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
111 SNMP_MIB_SENTINEL 111 SNMP_MIB_SENTINEL
112}; 112};
113 113
114/* Following RFC4293 items are displayed in /proc/net/netstat */ 114/* Following items are displayed in /proc/net/netstat */
115static const struct snmp_mib snmp4_ipextstats_list[] = { 115static const struct snmp_mib snmp4_ipextstats_list[] = {
116 SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), 116 SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
117 SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), 117 SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
@@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
125 SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), 125 SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
126 SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), 126 SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
127 SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), 127 SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
128 /* Non RFC4293 fields */
128 SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), 129 SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
130 SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
131 SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
132 SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
133 SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
129 SNMP_MIB_SENTINEL 134 SNMP_MIB_SENTINEL
130}; 135};
131 136
@@ -273,7 +278,7 @@ static const struct snmp_mib snmp4_net_list[] = {
273 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), 278 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
274 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), 279 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
275 SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), 280 SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
276 SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), 281 SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
277 SNMP_MIB_SENTINEL 282 SNMP_MIB_SENTINEL
278}; 283};
279 284
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index dd44e0ab600c..a86c7ae71881 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -571,7 +571,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
571 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, 571 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
572 RT_SCOPE_UNIVERSE, 572 RT_SCOPE_UNIVERSE,
573 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 573 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
574 inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, 574 inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP |
575 (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
575 daddr, saddr, 0, 0); 576 daddr, saddr, 0, 0);
576 577
577 if (!inet->hdrincl) { 578 if (!inet->hdrincl) {
@@ -987,7 +988,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
987 srcp = inet->inet_num; 988 srcp = inet->inet_num;
988 989
989 seq_printf(seq, "%4d: %08X:%04X %08X:%04X" 990 seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
990 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", 991 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
991 i, src, srcp, dest, destp, sp->sk_state, 992 i, src, srcp, dest, destp, sp->sk_state,
992 sk_wmem_alloc_get(sp), 993 sk_wmem_alloc_get(sp),
993 sk_rmem_alloc_get(sp), 994 sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a9a54a236832..727f4365bcdf 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,7 +112,8 @@
112#define RT_FL_TOS(oldflp4) \ 112#define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) 113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 114
115#define IP_MAX_MTU 0xFFF0 115/* IPv4 datagram length is stored into 16bit field (tot_len) */
116#define IP_MAX_MTU 0xFFFF
116 117
117#define RT_GC_TIMEOUT (300*HZ) 118#define RT_GC_TIMEOUT (300*HZ)
118 119
@@ -435,12 +436,12 @@ static inline int ip_rt_proc_init(void)
435 436
436static inline bool rt_is_expired(const struct rtable *rth) 437static inline bool rt_is_expired(const struct rtable *rth)
437{ 438{
438 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 439 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
439} 440}
440 441
441void rt_cache_flush(struct net *net) 442void rt_cache_flush(struct net *net)
442{ 443{
443 rt_genid_bump(net); 444 rt_genid_bump_ipv4(net);
444} 445}
445 446
446static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 447static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1227 mtu = 576; 1228 mtu = 576;
1228 } 1229 }
1229 1230
1230 if (mtu > IP_MAX_MTU) 1231 return min_t(unsigned int, mtu, IP_MAX_MTU);
1231 mtu = IP_MAX_MTU;
1232
1233 return mtu;
1234} 1232}
1235 1233
1236static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) 1234static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
@@ -1458,7 +1456,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1458#endif 1456#endif
1459 rth->dst.output = ip_rt_bug; 1457 rth->dst.output = ip_rt_bug;
1460 1458
1461 rth->rt_genid = rt_genid(dev_net(dev)); 1459 rth->rt_genid = rt_genid_ipv4(dev_net(dev));
1462 rth->rt_flags = RTCF_MULTICAST; 1460 rth->rt_flags = RTCF_MULTICAST;
1463 rth->rt_type = RTN_MULTICAST; 1461 rth->rt_type = RTN_MULTICAST;
1464 rth->rt_is_input= 1; 1462 rth->rt_is_input= 1;
@@ -1589,7 +1587,7 @@ static int __mkroute_input(struct sk_buff *skb,
1589 goto cleanup; 1587 goto cleanup;
1590 } 1588 }
1591 1589
1592 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 1590 rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1593 rth->rt_flags = flags; 1591 rth->rt_flags = flags;
1594 rth->rt_type = res->type; 1592 rth->rt_type = res->type;
1595 rth->rt_is_input = 1; 1593 rth->rt_is_input = 1;
@@ -1760,7 +1758,7 @@ local_input:
1760 rth->dst.tclassid = itag; 1758 rth->dst.tclassid = itag;
1761#endif 1759#endif
1762 1760
1763 rth->rt_genid = rt_genid(net); 1761 rth->rt_genid = rt_genid_ipv4(net);
1764 rth->rt_flags = flags|RTCF_LOCAL; 1762 rth->rt_flags = flags|RTCF_LOCAL;
1765 rth->rt_type = res.type; 1763 rth->rt_type = res.type;
1766 rth->rt_is_input = 1; 1764 rth->rt_is_input = 1;
@@ -1945,7 +1943,7 @@ add:
1945 1943
1946 rth->dst.output = ip_output; 1944 rth->dst.output = ip_output;
1947 1945
1948 rth->rt_genid = rt_genid(dev_net(dev_out)); 1946 rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1949 rth->rt_flags = flags; 1947 rth->rt_flags = flags;
1950 rth->rt_type = type; 1948 rth->rt_type = type;
1951 rth->rt_is_input = 0; 1949 rth->rt_is_input = 0;
@@ -2227,7 +2225,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2227 rt->rt_iif = ort->rt_iif; 2225 rt->rt_iif = ort->rt_iif;
2228 rt->rt_pmtu = ort->rt_pmtu; 2226 rt->rt_pmtu = ort->rt_pmtu;
2229 2227
2230 rt->rt_genid = rt_genid(net); 2228 rt->rt_genid = rt_genid_ipv4(net);
2231 rt->rt_flags = ort->rt_flags; 2229 rt->rt_flags = ort->rt_flags;
2232 rt->rt_type = ort->rt_type; 2230 rt->rt_type = ort->rt_type;
2233 rt->rt_gateway = ort->rt_gateway; 2231 rt->rt_gateway = ort->rt_gateway;
@@ -2665,7 +2663,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
2665 2663
2666static __net_init int rt_genid_init(struct net *net) 2664static __net_init int rt_genid_init(struct net *net)
2667{ 2665{
2668 atomic_set(&net->rt_genid, 0); 2666 atomic_set(&net->ipv4.rt_genid, 0);
2669 atomic_set(&net->fnhe_genid, 0); 2667 atomic_set(&net->fnhe_genid, 0);
2670 get_random_bytes(&net->ipv4.dev_addr_genid, 2668 get_random_bytes(&net->ipv4.dev_addr_genid,
2671 sizeof(net->ipv4.dev_addr_genid)); 2669 sizeof(net->ipv4.dev_addr_genid));
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b05c96e7af8b..14a15c49129d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -160,26 +160,33 @@ static __u16 const msstab[] = {
160 * Generate a syncookie. mssp points to the mss, which is returned 160 * Generate a syncookie. mssp points to the mss, which is returned
161 * rounded down to the value encoded in the cookie. 161 * rounded down to the value encoded in the cookie.
162 */ 162 */
163__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) 163u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
164 u16 *mssp)
164{ 165{
165 const struct iphdr *iph = ip_hdr(skb);
166 const struct tcphdr *th = tcp_hdr(skb);
167 int mssind; 166 int mssind;
168 const __u16 mss = *mssp; 167 const __u16 mss = *mssp;
169 168
170 tcp_synq_overflow(sk);
171
172 for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) 169 for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
173 if (mss >= msstab[mssind]) 170 if (mss >= msstab[mssind])
174 break; 171 break;
175 *mssp = msstab[mssind]; 172 *mssp = msstab[mssind];
176 173
177 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
178
179 return secure_tcp_syn_cookie(iph->saddr, iph->daddr, 174 return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
180 th->source, th->dest, ntohl(th->seq), 175 th->source, th->dest, ntohl(th->seq),
181 jiffies / (HZ * 60), mssind); 176 jiffies / (HZ * 60), mssind);
182} 177}
178EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
179
180__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
181{
182 const struct iphdr *iph = ip_hdr(skb);
183 const struct tcphdr *th = tcp_hdr(skb);
184
185 tcp_synq_overflow(sk);
186 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
187
188 return __cookie_v4_init_sequence(iph, th, mssp);
189}
183 190
184/* 191/*
185 * This (misnamed) value is the age of syncookie which is permitted. 192 * This (misnamed) value is the age of syncookie which is permitted.
@@ -192,10 +199,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
192 * Check if an ack sequence number is a valid syncookie. 199 * Check if an ack sequence number is a valid syncookie.
193 * Return the decoded mss if it is, or 0 if not. 200 * Return the decoded mss if it is, or 0 if not.
194 */ 201 */
195static inline int cookie_check(struct sk_buff *skb, __u32 cookie) 202int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
203 u32 cookie)
196{ 204{
197 const struct iphdr *iph = ip_hdr(skb);
198 const struct tcphdr *th = tcp_hdr(skb);
199 __u32 seq = ntohl(th->seq) - 1; 205 __u32 seq = ntohl(th->seq) - 1;
200 __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, 206 __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
201 th->source, th->dest, seq, 207 th->source, th->dest, seq,
@@ -204,6 +210,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
204 210
205 return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; 211 return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
206} 212}
213EXPORT_SYMBOL_GPL(__cookie_v4_check);
207 214
208static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 215static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
209 struct request_sock *req, 216 struct request_sock *req,
@@ -284,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
284 goto out; 291 goto out;
285 292
286 if (tcp_synq_no_recent_overflow(sk) || 293 if (tcp_synq_no_recent_overflow(sk) ||
287 (mss = cookie_check(skb, cookie)) == 0) { 294 (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) {
288 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); 295 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
289 goto out; 296 goto out;
290 } 297 }
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 610e324348d1..540279f4c531 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
29static int zero; 29static int zero;
30static int one = 1; 30static int one = 1;
31static int four = 4; 31static int four = 4;
32static int gso_max_segs = GSO_MAX_SEGS;
32static int tcp_retr1_max = 255; 33static int tcp_retr1_max = 255;
33static int ip_local_port_range_min[] = { 1, 1 }; 34static int ip_local_port_range_min[] = { 1, 1 };
34static int ip_local_port_range_max[] = { 65535, 65535 }; 35static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -559,6 +560,13 @@ static struct ctl_table ipv4_table[] = {
559 .extra1 = &one, 560 .extra1 = &one,
560 }, 561 },
561 { 562 {
563 .procname = "tcp_notsent_lowat",
564 .data = &sysctl_tcp_notsent_lowat,
565 .maxlen = sizeof(sysctl_tcp_notsent_lowat),
566 .mode = 0644,
567 .proc_handler = proc_dointvec,
568 },
569 {
562 .procname = "tcp_rmem", 570 .procname = "tcp_rmem",
563 .data = &sysctl_tcp_rmem, 571 .data = &sysctl_tcp_rmem,
564 .maxlen = sizeof(sysctl_tcp_rmem), 572 .maxlen = sizeof(sysctl_tcp_rmem),
@@ -754,6 +762,15 @@ static struct ctl_table ipv4_table[] = {
754 .extra2 = &four, 762 .extra2 = &four,
755 }, 763 },
756 { 764 {
765 .procname = "tcp_min_tso_segs",
766 .data = &sysctl_tcp_min_tso_segs,
767 .maxlen = sizeof(int),
768 .mode = 0644,
769 .proc_handler = proc_dointvec_minmax,
770 .extra1 = &zero,
771 .extra2 = &gso_max_segs,
772 },
773 {
757 .procname = "udp_mem", 774 .procname = "udp_mem",
758 .data = &sysctl_udp_mem, 775 .data = &sysctl_udp_mem,
759 .maxlen = sizeof(sysctl_udp_mem), 776 .maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5423223e93c2..6e5617b9f9db 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
283 283
284int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; 284int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285 285
286int sysctl_tcp_min_tso_segs __read_mostly = 2;
287
286struct percpu_counter tcp_orphan_count; 288struct percpu_counter tcp_orphan_count;
287EXPORT_SYMBOL_GPL(tcp_orphan_count); 289EXPORT_SYMBOL_GPL(tcp_orphan_count);
288 290
@@ -410,10 +412,6 @@ void tcp_init_sock(struct sock *sk)
410 412
411 icsk->icsk_sync_mss = tcp_sync_mss; 413 icsk->icsk_sync_mss = tcp_sync_mss;
412 414
413 /* Presumed zeroed, in order of appearance:
414 * cookie_in_always, cookie_out_never,
415 * s_data_constant, s_data_in, s_data_out
416 */
417 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 415 sk->sk_sndbuf = sysctl_tcp_wmem[1];
418 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 416 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
419 417
@@ -499,7 +497,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
499 mask |= POLLIN | POLLRDNORM; 497 mask |= POLLIN | POLLRDNORM;
500 498
501 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 499 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
502 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { 500 if (sk_stream_is_writeable(sk)) {
503 mask |= POLLOUT | POLLWRNORM; 501 mask |= POLLOUT | POLLWRNORM;
504 } else { /* send SIGIO later */ 502 } else { /* send SIGIO later */
505 set_bit(SOCK_ASYNC_NOSPACE, 503 set_bit(SOCK_ASYNC_NOSPACE,
@@ -510,7 +508,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
510 * wspace test but before the flags are set, 508 * wspace test but before the flags are set,
511 * IO signal will be lost. 509 * IO signal will be lost.
512 */ 510 */
513 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 511 if (sk_stream_is_writeable(sk))
514 mask |= POLLOUT | POLLWRNORM; 512 mask |= POLLOUT | POLLWRNORM;
515 } 513 }
516 } else 514 } else
@@ -789,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
789 xmit_size_goal = mss_now; 787 xmit_size_goal = mss_now;
790 788
791 if (large_allowed && sk_can_gso(sk)) { 789 if (large_allowed && sk_can_gso(sk)) {
792 xmit_size_goal = ((sk->sk_gso_max_size - 1) - 790 u32 gso_size, hlen;
793 inet_csk(sk)->icsk_af_ops->net_header_len - 791
794 inet_csk(sk)->icsk_ext_hdr_len - 792 /* Maybe we should/could use sk->sk_prot->max_header here? */
795 tp->tcp_header_len); 793 hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
794 inet_csk(sk)->icsk_ext_hdr_len +
795 tp->tcp_header_len;
796
797 /* Goal is to send at least one packet per ms,
798 * not one big TSO packet every 100 ms.
799 * This preserves ACK clocking and is consistent
800 * with the tcp_tso_should_defer() heuristic.
801 */
802 gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
803 gso_size = max_t(u32, gso_size,
804 sysctl_tcp_min_tso_segs * mss_now);
805
806 xmit_size_goal = min_t(u32, gso_size,
807 sk->sk_gso_max_size - 1 - hlen);
796 808
797 /* TSQ: try to have two TSO segments in flight 809 /* TSQ: try to have at least two segments in flight
810 * (one in NIC TX ring, another in Qdisc)
811 */
798 xmit_size_goal = min_t(u32, xmit_size_goal, 812 xmit_size_goal = min_t(u32, xmit_size_goal,
799 sysctl_tcp_limit_output_bytes >> 1); 813 sysctl_tcp_limit_output_bytes >> 1);
800 814
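The new size-goal arithmetic is worth a worked example: with the 2 * MSEC_PER_SEC divisor, the goal is roughly the number of bytes the pacing rate covers in half a millisecond, floored at tcp_min_tso_segs full-sized segments. A user-space restatement; the sample numbers are invented:

#include <stdio.h>

#define MSEC_PER_SEC 1000UL

int main(void)
{
	unsigned long pacing_rate = 12500000;	/* 12.5 MB/s, i.e. 100 Mbit/s */
	unsigned int mss_now = 1448;
	unsigned int min_tso_segs = 2;		/* sysctl_tcp_min_tso_segs */
	unsigned long gso_size;

	gso_size = pacing_rate / (2 * MSEC_PER_SEC);	/* bytes per 0.5 ms */
	if (gso_size < (unsigned long)min_tso_segs * mss_now)
		gso_size = (unsigned long)min_tso_segs * mss_now;

	printf("size goal ~%lu bytes (~%lu segments)\n",
	       gso_size, gso_size / mss_now);	/* ~6250 bytes, ~4 segments */
	return 0;
}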
@@ -1121,6 +1135,13 @@ new_segment:
1121 goto wait_for_memory; 1135 goto wait_for_memory;
1122 1136
1123 /* 1137 /*
1138 * All packets are restored as if they have
1139 * already been sent.
1140 */
1141 if (tp->repair)
1142 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1143
1144 /*
1124 * Check whether we can use HW checksum. 1145 * Check whether we can use HW checksum.
1125 */ 1146 */
1126 if (sk->sk_route_caps & NETIF_F_ALL_CSUM) 1147 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
@@ -2447,10 +2468,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2447 case TCP_THIN_DUPACK: 2468 case TCP_THIN_DUPACK:
2448 if (val < 0 || val > 1) 2469 if (val < 0 || val > 1)
2449 err = -EINVAL; 2470 err = -EINVAL;
2450 else 2471 else {
2451 tp->thin_dupack = val; 2472 tp->thin_dupack = val;
2452 if (tp->thin_dupack) 2473 if (tp->thin_dupack)
2453 tcp_disable_early_retrans(tp); 2474 tcp_disable_early_retrans(tp);
2475 }
2454 break; 2476 break;
2455 2477
2456 case TCP_REPAIR: 2478 case TCP_REPAIR:
@@ -2631,6 +2653,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2631 else 2653 else
2632 tp->tsoffset = val - tcp_time_stamp; 2654 tp->tsoffset = val - tcp_time_stamp;
2633 break; 2655 break;
2656 case TCP_NOTSENT_LOWAT:
2657 tp->notsent_lowat = val;
2658 sk->sk_write_space(sk);
2659 break;
2634 default: 2660 default:
2635 err = -ENOPROTOOPT; 2661 err = -ENOPROTOOPT;
2636 break; 2662 break;
@@ -2847,6 +2873,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2847 case TCP_TIMESTAMP: 2873 case TCP_TIMESTAMP:
2848 val = tcp_time_stamp + tp->tsoffset; 2874 val = tcp_time_stamp + tp->tsoffset;
2849 break; 2875 break;
2876 case TCP_NOTSENT_LOWAT:
2877 val = tp->notsent_lowat;
2878 break;
2850 default: 2879 default:
2851 return -ENOPROTOOPT; 2880 return -ENOPROTOOPT;
2852 } 2881 }
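TCP_NOTSENT_LOWAT (together with the tcp_notsent_lowat sysctl added earlier in this series) bounds how much not-yet-sent data a socket may buffer before it stops reporting itself writeable. A minimal usage sketch; the 128 KB value is arbitrary, and the fallback define mirrors the kernel uapi constant in case older libc headers lack it:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_NOTSENT_LOWAT
#define TCP_NOTSENT_LOWAT 25	/* value from the kernel uapi */
#endif

/* Cap unsent data buffered in the write queue at ~128 KB; beyond
 * that, poll() stops signalling POLLOUT until the queue drains. */
int set_notsent_lowat(int fd)
{
	int lowat = 128 * 1024;

	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &lowat, sizeof(lowat));
}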
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index a9077f441cb2..b6ae92a51f58 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -206,8 +206,8 @@ static u32 cubic_root(u64 a)
206 */ 206 */
207static inline void bictcp_update(struct bictcp *ca, u32 cwnd) 207static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
208{ 208{
209 u64 offs; 209 u32 delta, bic_target, max_cnt;
210 u32 delta, t, bic_target, max_cnt; 210 u64 offs, t;
211 211
212 ca->ack_cnt++; /* count the number of ACKs */ 212 ca->ack_cnt++; /* count the number of ACKs */
213 213
@@ -250,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
250 * if the cwnd < 1 million packets !!! 250 * if the cwnd < 1 million packets !!!
251 */ 251 */
252 252
253 t = (s32)(tcp_time_stamp - ca->epoch_start);
254 t += msecs_to_jiffies(ca->delay_min >> 3);
253 /* change the unit from HZ to bictcp_HZ */ 255 /* change the unit from HZ to bictcp_HZ */
254 t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) 256 t <<= BICTCP_HZ;
255 - ca->epoch_start) << BICTCP_HZ) / HZ; 257 do_div(t, HZ);
256 258
257 if (t < ca->bic_K) /* t - K */ 259 if (t < ca->bic_K) /* t - K */
258 offs = ca->bic_K - t; 260 offs = ca->bic_K - t;
@@ -414,7 +416,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
414 return; 416 return;
415 417
416 /* Discard delay samples right after fast recovery */ 418 /* Discard delay samples right after fast recovery */
417 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) 419 if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
418 return; 420 return;
419 421
420 delay = (rtt_us << 3) / USEC_PER_MSEC; 422 delay = (rtt_us << 3) / USEC_PER_MSEC;
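Two fixes land in tcp_cubic.c: the epoch_start check keeps a zeroed epoch from discarding valid delay samples, and the cubic time t is widened to 64 bits before the HZ-to-BICTCP_HZ conversion, since (t << 10) overflows 32 bits after roughly 70 minutes of jiffies at HZ=1000. A stand-alone rendition of the conversion with an assumed epoch age:

#include <stdio.h>

#define HZ		1000
#define BICTCP_HZ	10	/* BIC time unit: 2^10 = 1024 Hz */

int main(void)
{
	unsigned long long t = 5000000;	/* ~83 min in jiffies at HZ=1000 */

	t <<= BICTCP_HZ;	/* 5.12e9: already past 2^32, hence u64 */
	t /= HZ;		/* stands in for do_div(t, HZ) */

	printf("t = %llu (1/1024 s units)\n", t);
	return 0;
}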
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 8f7ef0ad80e5..ab7bd35bb312 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -58,23 +58,22 @@ error: kfree(ctx);
58 return err; 58 return err;
59} 59}
60 60
61/* Computes the fastopen cookie for the peer. 61/* Computes the fastopen cookie for the IP path.
62 * The peer address is a 128 bits long (pad with zeros for IPv4). 62 * The path is 128 bits long (padded with zeros for IPv4).
63 * 63 *
64 * The caller must check foc->len to determine if a valid cookie 64 * The caller must check foc->len to determine if a valid cookie
65 * has been generated successfully. 65 * has been generated successfully.
66*/ 66*/
67void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) 67void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
68 struct tcp_fastopen_cookie *foc)
68{ 69{
69 __be32 peer_addr[4] = { addr, 0, 0, 0 }; 70 __be32 path[4] = { src, dst, 0, 0 };
70 struct tcp_fastopen_context *ctx; 71 struct tcp_fastopen_context *ctx;
71 72
72 rcu_read_lock(); 73 rcu_read_lock();
73 ctx = rcu_dereference(tcp_fastopen_ctx); 74 ctx = rcu_dereference(tcp_fastopen_ctx);
74 if (ctx) { 75 if (ctx) {
75 crypto_cipher_encrypt_one(ctx->tfm, 76 crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path);
76 foc->val,
77 (__u8 *)peer_addr);
78 foc->len = TCP_FASTOPEN_COOKIE_SIZE; 77 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
79 } 78 }
80 rcu_read_unlock(); 79 rcu_read_unlock();
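The Fast Open cookie is now bound to the full IP path rather than only the peer: the block handed to the cipher (AES in practice) becomes { src, dst, 0, 0 }. A sketch of just the block layout, using documentation addresses:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t path[4];

	path[0] = inet_addr("192.0.2.1");	/* source */
	path[1] = inet_addr("198.51.100.2");	/* destination (new) */
	path[2] = path[3] = 0;			/* IPv4: upper 64 bits padded */

	printf("cipher input: %08x %08x %08x %08x\n",
	       (unsigned)path[0], (unsigned)path[1],
	       (unsigned)path[2], (unsigned)path[3]);
	return 0;
}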
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28af45abe062..1969e16d936d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
688 } 688 }
689} 689}
690 690
691/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
692 * Note: TCP stack does not yet implement pacing.
693 * FQ packet scheduler can be used to implement cheap but effective
694 * TCP pacing, to smooth the burst on large writes when packets
695 * in flight is significantly lower than cwnd (or rwin)
696 */
697static void tcp_update_pacing_rate(struct sock *sk)
698{
699 const struct tcp_sock *tp = tcp_sk(sk);
700 u64 rate;
701
702 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
703 rate = (u64)tp->mss_cache * 2 * (HZ << 3);
704
705 rate *= max(tp->snd_cwnd, tp->packets_out);
706
707 /* Correction for small srtt: minimum srtt being 8 (1 jiffy << 3),
708 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms).
709 * We probably need usec resolution in the future.
710 * Note: this also takes care of the possible srtt=0 case,
711 * when tcp_rtt_estimator() has not yet been called.
712 */
713 if (tp->srtt > 8 + 2)
714 do_div(rate, tp->srtt);
715
716 sk->sk_pacing_rate = min_t(u64, rate, ~0U);
717}
718
691/* Calculate rto without backoff. This is the second half of Van Jacobson's 719/* Calculate rto without backoff. This is the second half of Van Jacobson's
692 * routine referred to above. 720 * routine referred to above.
693 */ 721 */
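Plugging numbers into the pacing formula helps: rate = 2 * mss * max(cwnd, packets_out) / srtt, with srtt kept in jiffies << 3 as in the kernel. The sketch below assumes HZ=1000 and invented socket values:

#include <stdio.h>

#define HZ 1000

int main(void)
{
	unsigned long long rate;
	unsigned int mss_cache = 1448;
	unsigned int snd_cwnd = 10;
	unsigned int srtt = (HZ / 10) << 3;	/* 100 ms in srtt units */

	rate = (unsigned long long)mss_cache * 2 * (HZ << 3);
	rate *= snd_cwnd;			/* max(cwnd, packets_out) */
	if (srtt > 8 + 2)
		rate /= srtt;			/* do_div() in the kernel */

	printf("pacing rate ~%llu bytes/sec\n", rate);	/* ~289600 */
	return 0;
}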
@@ -1048,6 +1076,7 @@ struct tcp_sacktag_state {
1048 int reord; 1076 int reord;
1049 int fack_count; 1077 int fack_count;
1050 int flag; 1078 int flag;
1079 s32 rtt; /* RTT measured by SACKing never-retransmitted data */
1051}; 1080};
1052 1081
1053/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1082/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1108,7 +1137,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1108static u8 tcp_sacktag_one(struct sock *sk, 1137static u8 tcp_sacktag_one(struct sock *sk,
1109 struct tcp_sacktag_state *state, u8 sacked, 1138 struct tcp_sacktag_state *state, u8 sacked,
1110 u32 start_seq, u32 end_seq, 1139 u32 start_seq, u32 end_seq,
1111 bool dup_sack, int pcount) 1140 int dup_sack, int pcount, u32 xmit_time)
1112{ 1141{
1113 struct tcp_sock *tp = tcp_sk(sk); 1142 struct tcp_sock *tp = tcp_sk(sk);
1114 int fack_count = state->fack_count; 1143 int fack_count = state->fack_count;
@@ -1148,6 +1177,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
1148 state->reord); 1177 state->reord);
1149 if (!after(end_seq, tp->high_seq)) 1178 if (!after(end_seq, tp->high_seq))
1150 state->flag |= FLAG_ORIG_SACK_ACKED; 1179 state->flag |= FLAG_ORIG_SACK_ACKED;
1180 /* Pick the earliest sequence sacked for RTT */
1181 if (state->rtt < 0)
1182 state->rtt = tcp_time_stamp - xmit_time;
1151 } 1183 }
1152 1184
1153 if (sacked & TCPCB_LOST) { 1185 if (sacked & TCPCB_LOST) {
@@ -1205,7 +1237,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1205 * tcp_highest_sack_seq() when skb is highest_sack. 1237 * tcp_highest_sack_seq() when skb is highest_sack.
1206 */ 1238 */
1207 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1239 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1208 start_seq, end_seq, dup_sack, pcount); 1240 start_seq, end_seq, dup_sack, pcount,
1241 TCP_SKB_CB(skb)->when);
1209 1242
1210 if (skb == tp->lost_skb_hint) 1243 if (skb == tp->lost_skb_hint)
1211 tp->lost_cnt_hint += pcount; 1244 tp->lost_cnt_hint += pcount;
@@ -1479,7 +1512,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1479 TCP_SKB_CB(skb)->seq, 1512 TCP_SKB_CB(skb)->seq,
1480 TCP_SKB_CB(skb)->end_seq, 1513 TCP_SKB_CB(skb)->end_seq,
1481 dup_sack, 1514 dup_sack,
1482 tcp_skb_pcount(skb)); 1515 tcp_skb_pcount(skb),
1516 TCP_SKB_CB(skb)->when);
1483 1517
1484 if (!before(TCP_SKB_CB(skb)->seq, 1518 if (!before(TCP_SKB_CB(skb)->seq,
1485 tcp_highest_sack_seq(tp))) 1519 tcp_highest_sack_seq(tp)))
@@ -1536,7 +1570,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
1536 1570
1537static int 1571static int
1538tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1572tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1539 u32 prior_snd_una) 1573 u32 prior_snd_una, s32 *sack_rtt)
1540{ 1574{
1541 struct tcp_sock *tp = tcp_sk(sk); 1575 struct tcp_sock *tp = tcp_sk(sk);
1542 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1576 const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,6 +1588,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1554 1588
1555 state.flag = 0; 1589 state.flag = 0;
1556 state.reord = tp->packets_out; 1590 state.reord = tp->packets_out;
1591 state.rtt = -1;
1557 1592
1558 if (!tp->sacked_out) { 1593 if (!tp->sacked_out) {
1559 if (WARN_ON(tp->fackets_out)) 1594 if (WARN_ON(tp->fackets_out))
@@ -1737,6 +1772,7 @@ out:
1737 WARN_ON((int)tp->retrans_out < 0); 1772 WARN_ON((int)tp->retrans_out < 0);
1738 WARN_ON((int)tcp_packets_in_flight(tp) < 0); 1773 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1739#endif 1774#endif
1775 *sack_rtt = state.rtt;
1740 return state.flag; 1776 return state.flag;
1741} 1777}
1742 1778
@@ -1869,8 +1905,13 @@ void tcp_enter_loss(struct sock *sk, int how)
1869 } 1905 }
1870 tcp_verify_left_out(tp); 1906 tcp_verify_left_out(tp);
1871 1907
1872 tp->reordering = min_t(unsigned int, tp->reordering, 1908 /* A timeout in disordered state after receiving substantial DUPACKs
1873 sysctl_tcp_reordering); 1909 * suggests that the degree of reordering is over-estimated.
1910 */
1911 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
1912 tp->sacked_out >= sysctl_tcp_reordering)
1913 tp->reordering = min_t(unsigned int, tp->reordering,
1914 sysctl_tcp_reordering);
1874 tcp_set_ca_state(sk, TCP_CA_Loss); 1915 tcp_set_ca_state(sk, TCP_CA_Loss);
1875 tp->high_seq = tp->snd_nxt; 1916 tp->high_seq = tp->snd_nxt;
1876 TCP_ECN_queue_cwr(tp); 1917 TCP_ECN_queue_cwr(tp);
@@ -2472,8 +2513,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
2472 2513
2473 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2514 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2474 tcp_try_keep_open(sk); 2515 tcp_try_keep_open(sk);
2475 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2476 tcp_moderate_cwnd(tp);
2477 } else { 2516 } else {
2478 tcp_cwnd_reduction(sk, prior_unsacked, 0); 2517 tcp_cwnd_reduction(sk, prior_unsacked, 0);
2479 } 2518 }
@@ -2792,65 +2831,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2792 tcp_xmit_retransmit_queue(sk); 2831 tcp_xmit_retransmit_queue(sk);
2793} 2832}
2794 2833
2795void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) 2834static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2835 s32 seq_rtt, s32 sack_rtt)
2796{ 2836{
2797 tcp_rtt_estimator(sk, seq_rtt); 2837 const struct tcp_sock *tp = tcp_sk(sk);
2798 tcp_set_rto(sk); 2838
2799 inet_csk(sk)->icsk_backoff = 0; 2839 /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
2800} 2840 * broken middle-boxes or peers may corrupt TS-ECR fields. But
2801EXPORT_SYMBOL(tcp_valid_rtt_meas); 2841 * Karn's algorithm forbids taking RTT if some retransmitted data
2842 * is acked (RFC6298).
2843 */
2844 if (flag & FLAG_RETRANS_DATA_ACKED)
2845 seq_rtt = -1;
2846
2847 if (seq_rtt < 0)
2848 seq_rtt = sack_rtt;
2802 2849
2803/* Read draft-ietf-tcplw-high-performance before mucking
2804 * with this code. (Supersedes RFC1323)
2805 */
2806static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
2807{
2808 /* RTTM Rule: A TSecr value received in a segment is used to 2850 /* RTTM Rule: A TSecr value received in a segment is used to
2809 * update the averaged RTT measurement only if the segment 2851 * update the averaged RTT measurement only if the segment
2810 * acknowledges some new data, i.e., only if it advances the 2852 * acknowledges some new data, i.e., only if it advances the
2811 * left edge of the send window. 2853 * left edge of the send window.
2812 *
2813 * See draft-ietf-tcplw-high-performance-00, section 3.3. 2854 * See draft-ietf-tcplw-high-performance-00, section 3.3.
2814 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
2815 *
2816 * Changed: reset backoff as soon as we see the first valid sample.
2817 * If we do not, we get strongly overestimated rto. With timestamps
2818 * samples are accepted even from very old segments: f.e., when rtt=1
2819 * increases to 8, we retransmit 5 times and after 8 seconds delayed
2820 * answer arrives rto becomes 120 seconds! If at least one of segments
2821 * in window is lost... Voila. --ANK (010210)
2822 */ 2855 */
2823 struct tcp_sock *tp = tcp_sk(sk); 2856 if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2824 2857 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
2825 tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2826}
2827 2858
2828static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) 2859 if (seq_rtt < 0)
2829{ 2860 return false;
2830 /* We don't have a timestamp. Can only use
2831 * packets that are not retransmitted to determine
2832 * rtt estimates. Also, we must not reset the
2833 * backoff for rto until we get a non-retransmitted
2834 * packet. This allows us to deal with a situation
2835 * where the network delay has increased suddenly.
2836 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2837 */
2838 2861
2839 if (flag & FLAG_RETRANS_DATA_ACKED) 2862 tcp_rtt_estimator(sk, seq_rtt);
2840 return; 2863 tcp_set_rto(sk);
2841 2864
2842 tcp_valid_rtt_meas(sk, seq_rtt); 2865 /* RFC6298: only reset backoff on valid RTT measurement. */
2866 inet_csk(sk)->icsk_backoff = 0;
2867 return true;
2843} 2868}
2844 2869
2845static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, 2870/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
2846 const s32 seq_rtt) 2871static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2847{ 2872{
2848 const struct tcp_sock *tp = tcp_sk(sk); 2873 struct tcp_sock *tp = tcp_sk(sk);
2849 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 2874 s32 seq_rtt = -1;
2850 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 2875
2851 tcp_ack_saw_tstamp(sk, flag); 2876 if (tp->lsndtime && !tp->total_retrans)
2852 else if (seq_rtt >= 0) 2877 seq_rtt = tcp_time_stamp - tp->lsndtime;
2853 tcp_ack_no_tstamp(sk, seq_rtt, flag); 2878 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
2854} 2879}
2855 2880
2856static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) 2881static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
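[Editor's note] The new tcp_ack_update_rtt() collapses the old timestamp/no-timestamp paths into one ordered preference: RTT from the ACK's own timing first, the SACK-derived RTT second, TS-ECR last, with Karn's rule vetoing any sample that covers retransmitted data. A minimal user-space sketch of that selection order (the flag value mirrors tcp_input.c; this is an illustration, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	#define FLAG_RETRANS_DATA_ACKED 0x08	/* value mirrors tcp_input.c */

	/* Returns true and stores the chosen sample when a valid RTT exists. */
	static bool pick_rtt(int flag, long seq_rtt, long sack_rtt,
			     long tsecr_rtt, long *sample)
	{
		if (flag & FLAG_RETRANS_DATA_ACKED)
			seq_rtt = -1;		/* Karn: retransmit makes timing ambiguous */
		if (seq_rtt < 0)
			seq_rtt = sack_rtt;	/* prefer SACK timing over TS-ECR */
		if (seq_rtt < 0)
			seq_rtt = tsecr_rtt;	/* last resort: timestamp echo */
		if (seq_rtt < 0)
			return false;		/* no usable sample: keep backoff */
		*sample = seq_rtt;
		return true;
	}

	int main(void)
	{
		long rtt;

		/* A retransmission was ACKed, but SACK still timed a valid sample. */
		if (pick_rtt(FLAG_RETRANS_DATA_ACKED, 40, 55, 70, &rtt))
			printf("RTT sample: %ld\n", rtt);	/* prints 55 */
		return 0;
	}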
@@ -2939,7 +2964,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
2939 * arrived at the other end. 2964 * arrived at the other end.
2940 */ 2965 */
2941static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 2966static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
2942 u32 prior_snd_una) 2967 u32 prior_snd_una, s32 sack_rtt)
2943{ 2968{
2944 struct tcp_sock *tp = tcp_sk(sk); 2969 struct tcp_sock *tp = tcp_sk(sk);
2945 const struct inet_connection_sock *icsk = inet_csk(sk); 2970 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2978,8 +3003,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
2978 if (sacked & TCPCB_SACKED_RETRANS) 3003 if (sacked & TCPCB_SACKED_RETRANS)
2979 tp->retrans_out -= acked_pcount; 3004 tp->retrans_out -= acked_pcount;
2980 flag |= FLAG_RETRANS_DATA_ACKED; 3005 flag |= FLAG_RETRANS_DATA_ACKED;
2981 ca_seq_rtt = -1;
2982 seq_rtt = -1;
2983 } else { 3006 } else {
2984 ca_seq_rtt = now - scb->when; 3007 ca_seq_rtt = now - scb->when;
2985 last_ackt = skb->tstamp; 3008 last_ackt = skb->tstamp;
@@ -3031,6 +3054,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3031 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3054 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3032 flag |= FLAG_SACK_RENEGING; 3055 flag |= FLAG_SACK_RENEGING;
3033 3056
3057 if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
3058 (flag & FLAG_ACKED))
3059 tcp_rearm_rto(sk);
3060
3034 if (flag & FLAG_ACKED) { 3061 if (flag & FLAG_ACKED) {
3035 const struct tcp_congestion_ops *ca_ops 3062 const struct tcp_congestion_ops *ca_ops
3036 = inet_csk(sk)->icsk_ca_ops; 3063 = inet_csk(sk)->icsk_ca_ops;
@@ -3040,9 +3067,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3040 tcp_mtup_probe_success(sk); 3067 tcp_mtup_probe_success(sk);
3041 } 3068 }
3042 3069
3043 tcp_ack_update_rtt(sk, flag, seq_rtt);
3044 tcp_rearm_rto(sk);
3045
3046 if (tcp_is_reno(tp)) { 3070 if (tcp_is_reno(tp)) {
3047 tcp_remove_reno_sacks(sk, pkts_acked); 3071 tcp_remove_reno_sacks(sk, pkts_acked);
3048 } else { 3072 } else {
@@ -3130,11 +3154,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3130 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3154 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3131} 3155}
3132 3156
 3157/* Decide whether to run the increase function of congestion control. */
3133static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3158static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3134{ 3159{
3135 const struct tcp_sock *tp = tcp_sk(sk); 3160 if (tcp_in_cwnd_reduction(sk))
3136 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3161 return false;
3137 !tcp_in_cwnd_reduction(sk); 3162
3163 /* If reordering is high then always grow cwnd whenever data is
3164 * delivered regardless of its ordering. Otherwise stay conservative
3165 * and only grow cwnd on in-order delivery in Open state, and retain
3166 * cwnd in Disordered state (RFC5681). A stretched ACK with
3167 * new SACK or ECE mark may first advance cwnd here and later reduce
3168 * cwnd in tcp_fastretrans_alert() based on more states.
3169 */
3170 if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
3171 return flag & FLAG_FORWARD_PROGRESS;
3172
3173 return inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
3174 flag & FLAG_DATA_ACKED;
3138} 3175}
3139 3176
3140/* Check that window update is acceptable. 3177/* Check that window update is acceptable.
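[Editor's note] The rewritten tcp_may_raise_cwnd() reads as a three-step decision: never grow during cwnd reduction, grow on any forward progress when reordering is heavy, otherwise grow only on in-order delivery in Open state. A stand-alone sketch (the flag bit values here are illustrative, not the kernel's):

	#include <stdbool.h>

	#define FLAG_DATA_ACKED		0x01	/* illustrative bit values */
	#define FLAG_FORWARD_PROGRESS	0x02	/* snd_una advanced or data SACKed */

	enum ca_state { CA_OPEN, CA_DISORDER, CA_CWR, CA_RECOVERY, CA_LOSS };

	static bool may_raise_cwnd(bool in_cwnd_reduction, enum ca_state state,
				   unsigned int reordering,
				   unsigned int sysctl_reordering, int flag)
	{
		if (in_cwnd_reduction)			/* CWR/Recovery: never grow */
			return false;
		if (reordering > sysctl_reordering)	/* heavy reordering: grow on */
			return flag & FLAG_FORWARD_PROGRESS; /* any forward progress */
		return state == CA_OPEN && (flag & FLAG_DATA_ACKED);
	}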
@@ -3269,11 +3306,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3269 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3306 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3270 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3307 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3271 bool is_dupack = false; 3308 bool is_dupack = false;
3272 u32 prior_in_flight; 3309 u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
3273 u32 prior_fackets; 3310 u32 prior_fackets;
3274 int prior_packets = tp->packets_out; 3311 int prior_packets = tp->packets_out;
3275 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3312 const int prior_unsacked = tp->packets_out - tp->sacked_out;
3276 int acked = 0; /* Number of packets newly acked */ 3313 int acked = 0; /* Number of packets newly acked */
3314 s32 sack_rtt = -1;
3277 3315
3278 /* If the ack is older than previous acks 3316 /* If the ack is older than previous acks
3279 * then we can probably ignore it. 3317 * then we can probably ignore it.
@@ -3330,7 +3368,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3330 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); 3368 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3331 3369
3332 if (TCP_SKB_CB(skb)->sacked) 3370 if (TCP_SKB_CB(skb)->sacked)
3333 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3371 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3372 &sack_rtt);
3334 3373
3335 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3374 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
3336 flag |= FLAG_ECE; 3375 flag |= FLAG_ECE;
@@ -3349,21 +3388,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3349 3388
3350 /* See if we can take anything off of the retransmit queue. */ 3389 /* See if we can take anything off of the retransmit queue. */
3351 acked = tp->packets_out; 3390 acked = tp->packets_out;
3352 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3391 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
3353 acked -= tp->packets_out; 3392 acked -= tp->packets_out;
3354 3393
3394 /* Advance cwnd if state allows */
3395 if (tcp_may_raise_cwnd(sk, flag))
3396 tcp_cong_avoid(sk, ack, prior_in_flight);
3397
3355 if (tcp_ack_is_dubious(sk, flag)) { 3398 if (tcp_ack_is_dubious(sk, flag)) {
3356 /* Advance CWND, if state allows this. */
3357 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
3358 tcp_cong_avoid(sk, ack, prior_in_flight);
3359 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3399 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3360 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3400 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3361 is_dupack, flag); 3401 is_dupack, flag);
3362 } else {
3363 if (flag & FLAG_DATA_ACKED)
3364 tcp_cong_avoid(sk, ack, prior_in_flight);
3365 } 3402 }
3366
3367 if (tp->tlp_high_seq) 3403 if (tp->tlp_high_seq)
3368 tcp_process_tlp_ack(sk, ack, flag); 3404 tcp_process_tlp_ack(sk, ack, flag);
3369 3405
@@ -3375,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3375 3411
3376 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3412 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3377 tcp_schedule_loss_probe(sk); 3413 tcp_schedule_loss_probe(sk);
3414 if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
3415 tcp_update_pacing_rate(sk);
3378 return 1; 3416 return 1;
3379 3417
3380no_queue: 3418no_queue:
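[Editor's note] tcp_update_pacing_rate() itself is outside this hunk; the guard above merely avoids recomputing the rate unless srtt or cwnd actually moved. Conceptually the rate is one cwnd of data per smoothed RTT, scaled up to leave headroom; a toy computation under that assumption (the real helper works in kernel time units and applies its own scaling):

	#include <stdint.h>
	#include <stdio.h>

	/* Toy pacing rate: send one cwnd per smoothed RTT, with 2x headroom
	 * so pacing does not itself throttle cwnd growth. Assumed shape only.
	 */
	static uint64_t pacing_rate_bps(uint32_t mss, uint32_t cwnd, double srtt_sec)
	{
		if (srtt_sec <= 0.0)
			return 0;
		return (uint64_t)(2.0 * 8.0 * mss * cwnd / srtt_sec);
	}

	int main(void)
	{
		/* 1448-byte MSS, cwnd of 10, 50 ms smoothed RTT. */
		printf("%llu bit/s\n",
		       (unsigned long long)pacing_rate_bps(1448, 10, 0.050));
		return 0;
	}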
@@ -3402,7 +3440,8 @@ old_ack:
3402 * If data was DSACKed, see if we can undo a cwnd reduction. 3440 * If data was DSACKed, see if we can undo a cwnd reduction.
3403 */ 3441 */
3404 if (TCP_SKB_CB(skb)->sacked) { 3442 if (TCP_SKB_CB(skb)->sacked) {
3405 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3443 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3444 &sack_rtt);
3406 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3445 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3407 is_dupack, flag); 3446 is_dupack, flag);
3408 } 3447 }
@@ -3535,7 +3574,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
3535 ++ptr; 3574 ++ptr;
3536 tp->rx_opt.rcv_tsval = ntohl(*ptr); 3575 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3537 ++ptr; 3576 ++ptr;
3538 tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; 3577 if (*ptr)
3578 tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
3579 else
3580 tp->rx_opt.rcv_tsecr = 0;
3539 return true; 3581 return true;
3540 } 3582 }
3541 return false; 3583 return false;
@@ -3560,7 +3602,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3560 } 3602 }
3561 3603
3562 tcp_parse_options(skb, &tp->rx_opt, 1, NULL); 3604 tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3563 if (tp->rx_opt.saw_tstamp) 3605 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3564 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 3606 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3565 3607
3566 return true; 3608 return true;
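[Editor's note] Both parsing paths now treat a zero TSecr as "no echo": RFC 1323 allows a peer to echo zero, and subtracting tsoffset from it would otherwise manufacture a bogus value that the new tcp_ack_update_rtt() would consume. A stand-alone sketch of the guard:

	#include <stdint.h>

	struct rx_opt_sketch {
		int	 saw_tstamp;
		uint32_t rcv_tsecr;
	};

	/* Apply the per-connection timestamp offset only to a real echo;
	 * a zero TSecr means the peer echoed nothing usable (RFC 1323).
	 */
	static void adjust_tsecr(struct rx_opt_sketch *rx, uint32_t tsoffset)
	{
		if (rx->saw_tstamp && rx->rcv_tsecr)
			rx->rcv_tsecr -= tsoffset;
	}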
@@ -5010,8 +5052,8 @@ discard:
5010 * the rest is checked inline. Fast processing is turned on in 5052 * the rest is checked inline. Fast processing is turned on in
5011 * tcp_data_queue when everything is OK. 5053 * tcp_data_queue when everything is OK.
5012 */ 5054 */
5013int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, 5055void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5014 const struct tcphdr *th, unsigned int len) 5056 const struct tcphdr *th, unsigned int len)
5015{ 5057{
5016 struct tcp_sock *tp = tcp_sk(sk); 5058 struct tcp_sock *tp = tcp_sk(sk);
5017 5059
@@ -5088,7 +5130,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5088 tcp_ack(sk, skb, 0); 5130 tcp_ack(sk, skb, 0);
5089 __kfree_skb(skb); 5131 __kfree_skb(skb);
5090 tcp_data_snd_check(sk); 5132 tcp_data_snd_check(sk);
5091 return 0; 5133 return;
5092 } else { /* Header too small */ 5134 } else { /* Header too small */
5093 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5135 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5094 goto discard; 5136 goto discard;
@@ -5181,7 +5223,7 @@ no_ack:
5181 if (eaten) 5223 if (eaten)
5182 kfree_skb_partial(skb, fragstolen); 5224 kfree_skb_partial(skb, fragstolen);
5183 sk->sk_data_ready(sk, 0); 5225 sk->sk_data_ready(sk, 0);
5184 return 0; 5226 return;
5185 } 5227 }
5186 } 5228 }
5187 5229
@@ -5197,7 +5239,7 @@ slow_path:
5197 */ 5239 */
5198 5240
5199 if (!tcp_validate_incoming(sk, skb, th, 1)) 5241 if (!tcp_validate_incoming(sk, skb, th, 1))
5200 return 0; 5242 return;
5201 5243
5202step5: 5244step5:
5203 if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) 5245 if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
@@ -5213,7 +5255,7 @@ step5:
5213 5255
5214 tcp_data_snd_check(sk); 5256 tcp_data_snd_check(sk);
5215 tcp_ack_snd_check(sk); 5257 tcp_ack_snd_check(sk);
5216 return 0; 5258 return;
5217 5259
5218csum_error: 5260csum_error:
5219 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); 5261 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
@@ -5221,7 +5263,6 @@ csum_error:
5221 5263
5222discard: 5264discard:
5223 __kfree_skb(skb); 5265 __kfree_skb(skb);
5224 return 0;
5225} 5266}
5226EXPORT_SYMBOL(tcp_rcv_established); 5267EXPORT_SYMBOL(tcp_rcv_established);
5227 5268
@@ -5316,7 +5357,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5316 int saved_clamp = tp->rx_opt.mss_clamp; 5357 int saved_clamp = tp->rx_opt.mss_clamp;
5317 5358
5318 tcp_parse_options(skb, &tp->rx_opt, 0, &foc); 5359 tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
5319 if (tp->rx_opt.saw_tstamp) 5360 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
5320 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 5361 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5321 5362
5322 if (th->ack) { 5363 if (th->ack) {
@@ -5624,9 +5665,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5624 * so release it. 5665 * so release it.
5625 */ 5666 */
5626 if (req) { 5667 if (req) {
5627 tcp_synack_rtt_meas(sk, req);
5628 tp->total_retrans = req->num_retrans; 5668 tp->total_retrans = req->num_retrans;
5629
5630 reqsk_fastopen_remove(sk, req, false); 5669 reqsk_fastopen_remove(sk, req, false);
5631 } else { 5670 } else {
5632 /* Make sure socket is routed, for correct metrics. */ 5671 /* Make sure socket is routed, for correct metrics. */
@@ -5651,6 +5690,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5651 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 5690 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5652 tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; 5691 tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
5653 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5692 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5693 tcp_synack_rtt_meas(sk, req);
5654 5694
5655 if (tp->rx_opt.tstamp_ok) 5695 if (tp->rx_opt.tstamp_ok)
5656 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5696 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
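[Editor's note] With snt_synack now carried into the child socket as lsndtime (see the tcp_minisocks.c hunk further down), the SYNACK RTT is taken only after the connection state is fully initialized, and tcp_synack_rtt_meas() refuses the sample when the SYNACK was retransmitted, again per Karn. A compact model of that rule:

	#include <stdint.h>

	/* Returns the SYNACK RTT sample in ticks, or -1 when none is valid:
	 * no send timestamp was recorded, or the SYNACK was retransmitted.
	 */
	static int32_t synack_rtt(uint32_t now, uint32_t lsndtime,
				  uint32_t total_retrans)
	{
		if (!lsndtime || total_retrans)
			return -1;
		return (int32_t)(now - lsndtime);
	}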
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b299da5ff499..b14266bb91eb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
821 */ 821 */
822static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 822static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
823 struct request_sock *req, 823 struct request_sock *req,
824 u16 queue_mapping, 824 u16 queue_mapping)
825 bool nocache)
826{ 825{
827 const struct inet_request_sock *ireq = inet_rsk(req); 826 const struct inet_request_sock *ireq = inet_rsk(req);
828 struct flowi4 fl4; 827 struct flowi4 fl4;
@@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
852 851
853static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) 852static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
854{ 853{
855 int res = tcp_v4_send_synack(sk, NULL, req, 0, false); 854 int res = tcp_v4_send_synack(sk, NULL, req, 0);
856 855
857 if (!res) 856 if (!res)
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 857 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -890,7 +889,7 @@ bool tcp_syn_flood_action(struct sock *sk,
890 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); 889 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
891 890
892 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; 891 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
893 if (!lopt->synflood_warned) { 892 if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
894 lopt->synflood_warned = 1; 893 lopt->synflood_warned = 1;
895 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", 894 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
896 proto, ntohs(tcp_hdr(skb)->dest), msg); 895 proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -1316,9 +1315,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1316 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 1315 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1317 return true; 1316 return true;
1318 } 1317 }
1318
1319 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { 1319 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1320 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { 1320 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1321 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); 1321 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1322 ip_hdr(skb)->daddr, valid_foc);
1322 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || 1323 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1323 memcmp(&foc->val[0], &valid_foc->val[0], 1324 memcmp(&foc->val[0], &valid_foc->val[0],
1324 TCP_FASTOPEN_COOKIE_SIZE) != 0) 1325 TCP_FASTOPEN_COOKIE_SIZE) != 0)
@@ -1329,14 +1330,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1329 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 1330 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1330 return true; 1331 return true;
1331 } else if (foc->len == 0) { /* Client requesting a cookie */ 1332 } else if (foc->len == 0) { /* Client requesting a cookie */
1332 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); 1333 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1334 ip_hdr(skb)->daddr, valid_foc);
1333 NET_INC_STATS_BH(sock_net(sk), 1335 NET_INC_STATS_BH(sock_net(sk),
1334 LINUX_MIB_TCPFASTOPENCOOKIEREQD); 1336 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1335 } else { 1337 } else {
1336 /* Client sent a cookie with wrong size. Treat it 1338 /* Client sent a cookie with wrong size. Treat it
1337 * the same as invalid and return a valid one. 1339 * the same as invalid and return a valid one.
1338 */ 1340 */
1339 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); 1341 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1342 ip_hdr(skb)->daddr, valid_foc);
1340 } 1343 }
1341 return false; 1344 return false;
1342} 1345}
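[Editor's note] tcp_fastopen_cookie_gen() now takes both the source and destination address, binding the cookie to the full address pair rather than the client address alone. A toy stand-in that mixes both addresses under a server secret (the kernel derives the real cookie with a block cipher; this FNV-style hash is purely illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Toy 64-bit cookie over (saddr, daddr, secret); illustration only. */
	static uint64_t toy_fastopen_cookie(uint32_t saddr, uint32_t daddr,
					    uint64_t secret)
	{
		uint64_t h = secret ^ 0x9e3779b97f4a7c15ULL;

		h ^= saddr; h *= 0x100000001b3ULL;	/* FNV-style mixing */
		h ^= daddr; h *= 0x100000001b3ULL;
		return h ^ (h >> 29);
	}

	int main(void)
	{
		printf("%016llx\n", (unsigned long long)
		       toy_fastopen_cookie(0xc0a80001, 0x08080808, 42));
		return 0;
	}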
@@ -1462,7 +1465,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462 * limitations, they conserve resources and peer is 1465 * limitations, they conserve resources and peer is
1463 * evidently real one. 1466 * evidently real one.
1464 */ 1467 */
1465 if (inet_csk_reqsk_queue_is_full(sk) && !isn) { 1468 if ((sysctl_tcp_syncookies == 2 ||
1469 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
1466 want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); 1470 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1467 if (!want_cookie) 1471 if (!want_cookie)
1468 goto drop; 1472 goto drop;
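[Editor's note] sysctl_tcp_syncookies == 2 unconditionally forces cookie mode: the listener behaves as if its request queue were always full, and the "possible SYN flooding" warning above is suppressed because cookies are then expected rather than a flood symptom. A condensed model of the decision (ignoring the !isn and CONFIG_SYN_COOKIES details):

	#include <stdbool.h>

	/* Mode 2 forces cookies even with room in the queue; mode 1 only
	 * falls back to cookies under queue pressure; mode 0 never does
	 * (over-pressure SYNs are simply dropped instead).
	 */
	static bool want_syncookie(int sysctl_tcp_syncookies, bool reqsk_queue_full)
	{
		if (sysctl_tcp_syncookies == 2 || reqsk_queue_full)
			return sysctl_tcp_syncookies != 0;
		return false;
	}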
@@ -1671,8 +1675,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1671 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1675 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1672 1676
1673 tcp_initialize_rcv_mss(newsk); 1677 tcp_initialize_rcv_mss(newsk);
1674 tcp_synack_rtt_meas(newsk, req);
1675 newtp->total_retrans = req->num_retrans;
1676 1678
1677#ifdef CONFIG_TCP_MD5SIG 1679#ifdef CONFIG_TCP_MD5SIG
1678 /* Copy over the MD5 key from the original socket */ 1680 /* Copy over the MD5 key from the original socket */
@@ -1797,10 +1799,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1797 sk->sk_rx_dst = NULL; 1799 sk->sk_rx_dst = NULL;
1798 } 1800 }
1799 } 1801 }
1800 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1802 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1801 rsk = sk;
1802 goto reset;
1803 }
1804 return 0; 1803 return 0;
1805 } 1804 }
1806 1805
@@ -2605,7 +2604,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2605 long delta = req->expires - jiffies; 2604 long delta = req->expires - jiffies;
2606 2605
2607 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2606 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2608 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2607 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
2609 i, 2608 i,
2610 ireq->loc_addr, 2609 ireq->loc_addr,
2611 ntohs(inet_sk(sk)->inet_sport), 2610 ntohs(inet_sk(sk)->inet_sport),
@@ -2663,7 +2662,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2663 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2662 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2664 2663
2665 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2664 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2666 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", 2665 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
2667 i, src, srcp, dest, destp, sk->sk_state, 2666 i, src, srcp, dest, destp, sk->sk_state,
2668 tp->write_seq - tp->snd_una, 2667 tp->write_seq - tp->snd_una,
2669 rx_queue, 2668 rx_queue,
@@ -2802,6 +2801,7 @@ struct proto tcp_prot = {
2802 .unhash = inet_unhash, 2801 .unhash = inet_unhash,
2803 .get_port = inet_csk_get_port, 2802 .get_port = inet_csk_get_port,
2804 .enter_memory_pressure = tcp_enter_memory_pressure, 2803 .enter_memory_pressure = tcp_enter_memory_pressure,
2804 .stream_memory_free = tcp_stream_memory_free,
2805 .sockets_allocated = &tcp_sockets_allocated, 2805 .sockets_allocated = &tcp_sockets_allocated,
2806 .orphan_count = &tcp_orphan_count, 2806 .orphan_count = &tcp_orphan_count,
2807 .memory_allocated = &tcp_memory_allocated, 2807 .memory_allocated = &tcp_memory_allocated,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index da14436c1735..8a57d79b0b16 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
132 return 0; 132 return 0;
133} 133}
134 134
135static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, 135static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
136 const char *buffer) 136 const char *buffer)
137{ 137{
138 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 138 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
139 unsigned long long val; 139 unsigned long long val;
140 int ret = 0; 140 int ret = 0;
141 141
@@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg)
180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); 180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
181} 181}
182 182
183static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) 183static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
184{ 184{
185 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 185 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
186 u64 val; 186 u64 val;
187 187
188 switch (cft->private) { 188 switch (cft->private) {
@@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
202 return val; 202 return val;
203} 203}
204 204
205static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) 205static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
206{ 206{
207 struct mem_cgroup *memcg; 207 struct mem_cgroup *memcg;
208 struct tcp_memcontrol *tcp; 208 struct tcp_memcontrol *tcp;
209 struct cg_proto *cg_proto; 209 struct cg_proto *cg_proto;
210 210
211 memcg = mem_cgroup_from_cont(cont); 211 memcg = mem_cgroup_from_css(css);
212 cg_proto = tcp_prot.proto_cgroup(memcg); 212 cg_proto = tcp_prot.proto_cgroup(memcg);
213 if (!cg_proto) 213 if (!cg_proto)
214 return 0; 214 return 0;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f6a005c485a9..4a22f3e715df 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk)
443 struct dst_entry *dst = __sk_dst_get(sk); 443 struct dst_entry *dst = __sk_dst_get(sk);
444 struct tcp_sock *tp = tcp_sk(sk); 444 struct tcp_sock *tp = tcp_sk(sk);
445 struct tcp_metrics_block *tm; 445 struct tcp_metrics_block *tm;
446 u32 val; 446 u32 val, crtt = 0; /* cached RTT scaled by 8 */
447 447
448 if (dst == NULL) 448 if (dst == NULL)
449 goto reset; 449 goto reset;
@@ -478,15 +478,19 @@ void tcp_init_metrics(struct sock *sk)
478 tp->reordering = val; 478 tp->reordering = val;
479 } 479 }
480 480
481 val = tcp_metric_get(tm, TCP_METRIC_RTT); 481 crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
482 if (val == 0 || tp->srtt == 0) { 482 rcu_read_unlock();
483 rcu_read_unlock(); 483reset:
484 goto reset; 484 /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
485 } 485 * to seed the RTO for later data packets because SYN packets are
486 /* Initial rtt is determined from SYN,SYN-ACK. 486 * small. Use the per-dst cached values to seed the RTO but keep
487 * The segment is small and rtt may appear much 487 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
488 * less than real one. Use per-dst memory 488 * Later the RTO will be updated immediately upon obtaining the first
489 * to make it more realistic. 489 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
490 * influences the first RTO but not later RTT estimation.
491 *
492 * But if RTT is not available from the SYN (due to retransmits or
493 * syn cookies) or the cache, force a conservative 3secs timeout.
490 * 494 *
491 * A bit of theory. RTT is time passed after "normal" sized packet 495 * A bit of theory. RTT is time passed after "normal" sized packet
492 * is sent until it is ACKed. In normal circumstances sending small 496 * is sent until it is ACKed. In normal circumstances sending small
@@ -497,21 +501,9 @@ void tcp_init_metrics(struct sock *sk)
497 * to low value, and then abruptly stops to do it and starts to delay 501 * to low value, and then abruptly stops to do it and starts to delay
498 * ACKs, wait for troubles. 502 * ACKs, wait for troubles.
499 */ 503 */
500 val = msecs_to_jiffies(val); 504 if (crtt > tp->srtt) {
501 if (val > tp->srtt) { 505 inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
502 tp->srtt = val; 506 } else if (tp->srtt == 0) {
503 tp->rtt_seq = tp->snd_nxt;
504 }
505 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
506 if (val > tp->mdev) {
507 tp->mdev = val;
508 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
509 }
510 rcu_read_unlock();
511
512 tcp_set_rto(sk);
513reset:
514 if (tp->srtt == 0) {
515 /* RFC6298: 5.7 We've failed to get a valid RTT sample from 507 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
516 * 3WHS. This is most likely due to retransmission, 508 * 3WHS. This is most likely due to retransmission,
 517 * including a spurious one. Reset the RTO back to 3secs 509
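[Editor's note] The rewritten path never touches srtt/mdev from the cache; it only seeds the first RTO. When the cached RTT (crtt) exceeds whatever the 3WHS measured, the RTO becomes crtt plus a variance cushion of at least the minimum RTO; with no sample at all the code keeps the RFC 6298 3 s fallback. As arithmetic (units simplified; the kernel keeps crtt and srtt scaled by 8):

	#include <stdint.h>

	#define TCP_TIMEOUT_FALLBACK_TICKS	(3 * 1000)	/* 3 s, RFC 6298 5.7 */

	/* Seed the first RTO from a cached RTT without polluting the RTT
	 * estimator state; the cushion is crtt/4 or rto_min, whichever is
	 * larger, mirroring crtt + max(crtt >> 2, tcp_rto_min(sk)) above.
	 */
	static uint32_t seed_rto(uint32_t crtt, uint32_t srtt, uint32_t rto_min,
				 uint32_t cur_rto)
	{
		if (crtt > srtt)
			return crtt + (crtt / 4 > rto_min ? crtt / 4 : rto_min);
		if (srtt == 0)
			return TCP_TIMEOUT_FALLBACK_TICKS;
		return cur_rto;	/* the 3WHS sample already produced a usable RTO */
	}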
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ab1c08658528..58a3e69aef64 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
411 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 411 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
412 tcp_enable_early_retrans(newtp); 412 tcp_enable_early_retrans(newtp);
413 newtp->tlp_high_seq = 0; 413 newtp->tlp_high_seq = 0;
414 newtp->lsndtime = treq->snt_synack;
415 newtp->total_retrans = req->num_retrans;
414 416
415 /* So many TCP implementations out there (incorrectly) count the 417 /* So many TCP implementations out there (incorrectly) count the
416 * initial SYN frame in their delayed-ACK and congestion control 418 * initial SYN frame in their delayed-ACK and congestion control
@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
666 if (!(flg & TCP_FLAG_ACK)) 668 if (!(flg & TCP_FLAG_ACK))
667 return NULL; 669 return NULL;
668 670
669 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
670 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
671 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
672 else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
673 tcp_rsk(req)->snt_synack = 0;
674
675 /* For Fast Open no more processing is needed (sk is the 671 /* For Fast Open no more processing is needed (sk is the
676 * child socket). 672 * child socket).
677 */ 673 */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 92fde8d1aa82..7c83cb8bf137 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
65/* By default, RFC2861 behavior. */ 65/* By default, RFC2861 behavior. */
66int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 66int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67 67
68unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
69EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
70
68static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
69 int push_one, gfp_t gfp); 72 int push_one, gfp_t gfp);
70 73
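[Editor's note] sysctl_tcp_notsent_lowat backs the stream_memory_free hook added to tcp_prot in the tcp_ipv4.c hunk earlier: a socket only reports as writable while its not-yet-sent backlog stays below the low-water mark, capping per-socket write-queue memory. The check amounts to (a sketch; UINT32_MAX preserves the old always-writable behavior):

	#include <stdbool.h>
	#include <stdint.h>

	/* Writable only while bytes queued but not yet transmitted stay
	 * under the low-water mark.
	 */
	static bool stream_memory_free(uint32_t write_seq, uint32_t snd_nxt,
				       uint32_t notsent_lowat)
	{
		return (uint32_t)(write_seq - snd_nxt) < notsent_lowat;
	}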
@@ -1628,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1628 1631
1629 /* If a full-sized TSO skb can be sent, do it. */ 1632 /* If a full-sized TSO skb can be sent, do it. */
1630 if (limit >= min_t(unsigned int, sk->sk_gso_max_size, 1633 if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1631 sk->sk_gso_max_segs * tp->mss_cache)) 1634 tp->xmit_size_goal_segs * tp->mss_cache))
1632 goto send_now; 1635 goto send_now;
1633 1636
1634 /* Middle in queue won't get any more data, full sendable already? */ 1637 /* Middle in queue won't get any more data, full sendable already? */
@@ -2670,7 +2673,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2670 int tcp_header_size; 2673 int tcp_header_size;
2671 int mss; 2674 int mss;
2672 2675
2673 skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); 2676 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
2674 if (unlikely(!skb)) { 2677 if (unlikely(!skb)) {
2675 dst_release(dst); 2678 dst_release(dst);
2676 return NULL; 2679 return NULL;
@@ -2814,6 +2817,8 @@ void tcp_connect_init(struct sock *sk)
2814 2817
2815 if (likely(!tp->repair)) 2818 if (likely(!tp->repair))
2816 tp->rcv_nxt = 0; 2819 tp->rcv_nxt = 0;
2820 else
2821 tp->rcv_tstamp = tcp_time_stamp;
2817 tp->rcv_wup = tp->rcv_nxt; 2822 tp->rcv_wup = tp->rcv_nxt;
2818 tp->copied_seq = tp->rcv_nxt; 2823 tp->copied_seq = tp->rcv_nxt;
2819 2824
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index d4943f67aff2..611beab38a00 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -46,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096;
46MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); 46MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
47module_param(bufsize, uint, 0); 47module_param(bufsize, uint, 0);
48 48
49static unsigned int fwmark __read_mostly = 0;
50MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
51module_param(fwmark, uint, 0);
52
49static int full __read_mostly; 53static int full __read_mostly;
50MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); 54MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
51module_param(full, int, 0); 55module_param(full, int, 0);
@@ -54,12 +58,16 @@ static const char procname[] = "tcpprobe";
54 58
55struct tcp_log { 59struct tcp_log {
56 ktime_t tstamp; 60 ktime_t tstamp;
57 __be32 saddr, daddr; 61 union {
58 __be16 sport, dport; 62 struct sockaddr raw;
63 struct sockaddr_in v4;
64 struct sockaddr_in6 v6;
65 } src, dst;
59 u16 length; 66 u16 length;
60 u32 snd_nxt; 67 u32 snd_nxt;
61 u32 snd_una; 68 u32 snd_una;
62 u32 snd_wnd; 69 u32 snd_wnd;
70 u32 rcv_wnd;
63 u32 snd_cwnd; 71 u32 snd_cwnd;
64 u32 ssthresh; 72 u32 ssthresh;
65 u32 srtt; 73 u32 srtt;
@@ -86,19 +94,45 @@ static inline int tcp_probe_avail(void)
86 return bufsize - tcp_probe_used() - 1; 94 return bufsize - tcp_probe_used() - 1;
87} 95}
88 96
97#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
98 do { \
99 si4.sin_family = AF_INET; \
100 si4.sin_port = inet->inet_##mem##port; \
101 si4.sin_addr.s_addr = inet->inet_##mem##addr; \
 102 } while (0)
103
104#if IS_ENABLED(CONFIG_IPV6)
105#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \
106 do { \
107 struct ipv6_pinfo *pi6 = inet->pinet6; \
108 si6.sin6_family = AF_INET6; \
109 si6.sin6_port = inet->inet_##mem##port; \
110 si6.sin6_addr = pi6->mem##addr; \
111 si6.sin6_flowinfo = 0; /* No need here. */ \
112 si6.sin6_scope_id = 0; /* No need here. */ \
113 } while (0)
114#else
115#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \
116 do { \
117 memset(&si6, 0, sizeof(si6)); \
118 } while (0)
119#endif
120
89/* 121/*
90 * Hook inserted to be called before each receive packet. 122 * Hook inserted to be called before each receive packet.
91 * Note: arguments must match tcp_rcv_established()! 123 * Note: arguments must match tcp_rcv_established()!
92 */ 124 */
93static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, 125static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
94 struct tcphdr *th, unsigned int len) 126 const struct tcphdr *th, unsigned int len)
95{ 127{
96 const struct tcp_sock *tp = tcp_sk(sk); 128 const struct tcp_sock *tp = tcp_sk(sk);
97 const struct inet_sock *inet = inet_sk(sk); 129 const struct inet_sock *inet = inet_sk(sk);
98 130
99 /* Only update if port matches */ 131 /* Only update if port or skb mark matches */
100 if ((port == 0 || ntohs(inet->inet_dport) == port || 132 if (((port == 0 && fwmark == 0) ||
101 ntohs(inet->inet_sport) == port) && 133 ntohs(inet->inet_dport) == port ||
134 ntohs(inet->inet_sport) == port ||
135 (fwmark > 0 && skb->mark == fwmark)) &&
102 (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { 136 (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
103 137
104 spin_lock(&tcp_probe.lock); 138 spin_lock(&tcp_probe.lock);
@@ -107,15 +141,25 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
107 struct tcp_log *p = tcp_probe.log + tcp_probe.head; 141 struct tcp_log *p = tcp_probe.log + tcp_probe.head;
108 142
109 p->tstamp = ktime_get(); 143 p->tstamp = ktime_get();
110 p->saddr = inet->inet_saddr; 144 switch (sk->sk_family) {
111 p->sport = inet->inet_sport; 145 case AF_INET:
112 p->daddr = inet->inet_daddr; 146 tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
113 p->dport = inet->inet_dport; 147 tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
148 break;
149 case AF_INET6:
150 tcp_probe_copy_fl_to_si6(inet, p->src.v6, s);
151 tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d);
152 break;
153 default:
154 BUG();
155 }
156
114 p->length = skb->len; 157 p->length = skb->len;
115 p->snd_nxt = tp->snd_nxt; 158 p->snd_nxt = tp->snd_nxt;
116 p->snd_una = tp->snd_una; 159 p->snd_una = tp->snd_una;
117 p->snd_cwnd = tp->snd_cwnd; 160 p->snd_cwnd = tp->snd_cwnd;
118 p->snd_wnd = tp->snd_wnd; 161 p->snd_wnd = tp->snd_wnd;
162 p->rcv_wnd = tp->rcv_wnd;
119 p->ssthresh = tcp_current_ssthresh(sk); 163 p->ssthresh = tcp_current_ssthresh(sk);
120 p->srtt = tp->srtt >> 3; 164 p->srtt = tp->srtt >> 3;
121 165
@@ -128,7 +172,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
128 } 172 }
129 173
130 jprobe_return(); 174 jprobe_return();
131 return 0;
132} 175}
133 176
134static struct jprobe tcp_jprobe = { 177static struct jprobe tcp_jprobe = {
@@ -157,13 +200,11 @@ static int tcpprobe_sprint(char *tbuf, int n)
157 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 200 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
158 201
159 return scnprintf(tbuf, n, 202 return scnprintf(tbuf, n,
160 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", 203 "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
161 (unsigned long) tv.tv_sec, 204 (unsigned long) tv.tv_sec,
162 (unsigned long) tv.tv_nsec, 205 (unsigned long) tv.tv_nsec,
163 &p->saddr, ntohs(p->sport), 206 &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
164 &p->daddr, ntohs(p->dport), 207 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
165 p->length, p->snd_nxt, p->snd_una,
166 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt);
167} 208}
168 209
169static ssize_t tcpprobe_read(struct file *file, char __user *buf, 210static ssize_t tcpprobe_read(struct file *file, char __user *buf,
@@ -176,7 +217,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
176 return -EINVAL; 217 return -EINVAL;
177 218
178 while (cnt < len) { 219 while (cnt < len) {
179 char tbuf[164]; 220 char tbuf[256];
180 int width; 221 int width;
181 222
182 /* Wait for data in buffer */ 223 /* Wait for data in buffer */
@@ -223,6 +264,13 @@ static __init int tcpprobe_init(void)
223{ 264{
224 int ret = -ENOMEM; 265 int ret = -ENOMEM;
225 266
 267 /* Warning: if the function signature of tcp_rcv_established
268 * has been changed, you also have to change the signature of
269 * jtcp_rcv_established, otherwise you end up right here!
270 */
271 BUILD_BUG_ON(__same_type(tcp_rcv_established,
272 jtcp_rcv_established) == 0);
273
226 init_waitqueue_head(&tcp_probe.wait); 274 init_waitqueue_head(&tcp_probe.wait);
227 spin_lock_init(&tcp_probe.lock); 275 spin_lock_init(&tcp_probe.lock);
228 276
@@ -241,7 +289,8 @@ static __init int tcpprobe_init(void)
241 if (ret) 289 if (ret)
242 goto err1; 290 goto err1;
243 291
244 pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); 292 pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
293 port, fwmark, bufsize);
245 return 0; 294 return 0;
246 err1: 295 err1:
247 remove_proc_entry(procname, init_net.proc_net); 296 remove_proc_entry(procname, init_net.proc_net);
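[Editor's note] Storing each endpoint as a sockaddr union lets one tcp_probe log record hold either address family, and the kernel's %pISpc format specifier prints whichever is populated. A user-space equivalent using inet_ntop() (the union layout mirrors the patch; the print helper is a portable stand-in for %pISpc):

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>

	union log_addr {
		struct sockaddr		raw;
		struct sockaddr_in	v4;
		struct sockaddr_in6	v6;
	};

	/* Print "addr:port" for whichever family the record carries. */
	static void print_endpoint(const union log_addr *a)
	{
		char buf[INET6_ADDRSTRLEN];

		if (a->raw.sa_family == AF_INET) {
			inet_ntop(AF_INET, &a->v4.sin_addr, buf, sizeof(buf));
			printf("%s:%u", buf, ntohs(a->v4.sin_port));
		} else if (a->raw.sa_family == AF_INET6) {
			inet_ntop(AF_INET6, &a->v6.sin6_addr, buf, sizeof(buf));
			printf("[%s]:%u", buf, ntohs(a->v6.sin6_port));
		}
	}

	int main(void)
	{
		union log_addr a;

		memset(&a, 0, sizeof(a));
		a.v4.sin_family = AF_INET;
		a.v4.sin_port = htons(80);
		inet_pton(AF_INET, "192.0.2.1", &a.v4.sin_addr);
		print_endpoint(&a);
		putchar('\n');
		return 0;
	}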
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 766e6bab9113..74d2c95db57f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
704 * @src: source IP address 704 * @src: source IP address
705 * @dst: destination IP address 705 * @dst: destination IP address
706 */ 706 */
707static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) 707void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
708{ 708{
709 struct udphdr *uh = udp_hdr(skb); 709 struct udphdr *uh = udp_hdr(skb);
710 struct sk_buff *frags = skb_shinfo(skb)->frag_list; 710 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
@@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
740 uh->check = CSUM_MANGLED_0; 740 uh->check = CSUM_MANGLED_0;
741 } 741 }
742} 742}
743EXPORT_SYMBOL_GPL(udp4_hwcsum);
743 744
744static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) 745static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
745{ 746{
@@ -2158,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2158 __u16 srcp = ntohs(inet->inet_sport); 2159 __u16 srcp = ntohs(inet->inet_sport);
2159 2160
2160 seq_printf(f, "%5d: %08X:%04X %08X:%04X" 2161 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
2161 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", 2162 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
2162 bucket, src, srcp, dest, destp, sp->sk_state, 2163 bucket, src, srcp, dest, destp, sp->sk_state,
2163 sk_wmem_alloc_get(sp), 2164 sk_wmem_alloc_get(sp),
2164 sk_rmem_alloc_get(sp), 2165 sk_rmem_alloc_get(sp),
@@ -2336,7 +2337,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2336 uh->len = htons(skb->len - udp_offset); 2337 uh->len = htons(skb->len - udp_offset);
2337 2338
2338 /* csum segment if tunnel sets skb with csum. */ 2339 /* csum segment if tunnel sets skb with csum. */
2339 if (unlikely(uh->check)) { 2340 if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) {
2340 struct iphdr *iph = ip_hdr(skb); 2341 struct iphdr *iph = ip_hdr(skb);
2341 2342
2342 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 2343 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
@@ -2347,7 +2348,18 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2347 if (uh->check == 0) 2348 if (uh->check == 0)
2348 uh->check = CSUM_MANGLED_0; 2349 uh->check = CSUM_MANGLED_0;
2349 2350
2351 } else if (protocol == htons(ETH_P_IPV6)) {
2352 struct ipv6hdr *ipv6h = ipv6_hdr(skb);
2353 u32 len = skb->len - udp_offset;
2354
2355 uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
2356 len, IPPROTO_UDP, 0);
2357 uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
2358 if (uh->check == 0)
2359 uh->check = CSUM_MANGLED_0;
2360 skb->ip_summed = CHECKSUM_NONE;
2350 } 2361 }
2362
2351 skb->protocol = protocol; 2363 skb->protocol = protocol;
2352 } while ((skb = skb->next)); 2364 } while ((skb = skb->next));
2353out: 2365out:
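[Editor's note] For IPv6 the UDP checksum is mandatory, so each GSO segment now gets a fresh checksum seeded from the IPv6 pseudo-header. A self-contained sketch of the same arithmetic: a one's-complement sum over the pseudo-header plus the datagram, with a computed zero transmitted as 0xFFFF (the role CSUM_MANGLED_0 plays above):

	#include <stddef.h>
	#include <stdint.h>

	/* Fold a 32-bit one's-complement accumulator to 16 bits. */
	static uint16_t csum_fold32(uint32_t sum)
	{
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)sum;
	}

	static uint32_t sum_bytes(uint32_t sum, const uint8_t *p, size_t len)
	{
		while (len > 1) {
			sum += (uint32_t)p[0] << 8 | p[1];
			p += 2;
			len -= 2;
		}
		if (len)
			sum += (uint32_t)p[0] << 8;	/* pad odd byte with zero */
		return sum;
	}

	/* UDP checksum over the IPv6 pseudo-header (RFC 2460 s8.1) plus the
	 * datagram; the checksum field inside udp[] must be zero while
	 * summing. next header = 17 (IPPROTO_UDP).
	 */
	static uint16_t udp6_checksum(const uint8_t saddr[16],
				      const uint8_t daddr[16],
				      const uint8_t *udp, uint32_t udp_len)
	{
		uint32_t sum = 0;
		uint16_t csum;

		sum = sum_bytes(sum, saddr, 16);
		sum = sum_bytes(sum, daddr, 16);
		sum += udp_len;			/* upper-layer packet length */
		sum += 17;			/* next header = IPPROTO_UDP */
		sum = sum_bytes(sum, udp, udp_len);

		csum = ~csum_fold32(sum);
		return csum ? csum : 0xffff;	/* zero is mangled on the wire */
	}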
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 327a617d594c..baa0f63731fd 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -21,7 +21,6 @@
21static int xfrm4_tunnel_check_size(struct sk_buff *skb) 21static int xfrm4_tunnel_check_size(struct sk_buff *skb)
22{ 22{
23 int mtu, ret = 0; 23 int mtu, ret = 0;
24 struct dst_entry *dst;
25 24
26 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) 25 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
27 goto out; 26 goto out;
@@ -29,12 +28,10 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
29 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) 28 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
30 goto out; 29 goto out;
31 30
32 dst = skb_dst(skb); 31 mtu = dst_mtu(skb_dst(skb));
33 mtu = dst_mtu(dst);
34 if (skb->len > mtu) { 32 if (skb->len > mtu) {
35 if (skb->sk) 33 if (skb->sk)
36 ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, 34 xfrm_local_error(skb, mtu);
37 inet_sk(skb->sk)->inet_dport, mtu);
38 else 35 else
39 icmp_send(skb, ICMP_DEST_UNREACH, 36 icmp_send(skb, ICMP_DEST_UNREACH,
40 ICMP_FRAG_NEEDED, htonl(mtu)); 37 ICMP_FRAG_NEEDED, htonl(mtu));
@@ -99,3 +96,12 @@ int xfrm4_output(struct sk_buff *skb)
99 x->outer_mode->afinfo->output_finish, 96 x->outer_mode->afinfo->output_finish,
100 !(IPCB(skb)->flags & IPSKB_REROUTED)); 97 !(IPCB(skb)->flags & IPSKB_REROUTED));
101} 98}
99
100void xfrm4_local_error(struct sk_buff *skb, u32 mtu)
101{
102 struct iphdr *hdr;
103
104 hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
105 ip_local_error(skb->sk, EMSGSIZE, hdr->daddr,
106 inet_sk(skb->sk)->inet_dport, mtu);
107}
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 9258e751baba..0b2a0641526a 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -83,6 +83,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
83 .extract_input = xfrm4_extract_input, 83 .extract_input = xfrm4_extract_input,
84 .extract_output = xfrm4_extract_output, 84 .extract_output = xfrm4_extract_output,
85 .transport_finish = xfrm4_transport_finish, 85 .transport_finish = xfrm4_transport_finish,
86 .local_error = xfrm4_local_error,
86}; 87};
87 88
88void __init xfrm4_state_init(void) 89void __init xfrm4_state_init(void)