diff options
Diffstat (limited to 'net/ipv4')
47 files changed, 1125 insertions, 826 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 37cf1a6ea3ad..05c57f0fcabe 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -259,22 +259,6 @@ config IP_PIMSM_V2 | |||
259 | gated-5). This routing protocol is not used widely, so say N unless | 259 | gated-5). This routing protocol is not used widely, so say N unless |
260 | you want to play with it. | 260 | you want to play with it. |
261 | 261 | ||
262 | config ARPD | ||
263 | bool "IP: ARP daemon support" | ||
264 | ---help--- | ||
265 | The kernel maintains an internal cache which maps IP addresses to | ||
266 | hardware addresses on the local network, so that Ethernet | ||
267 | frames are sent to the proper address on the physical networking | ||
268 | layer. Normally, kernel uses the ARP protocol to resolve these | ||
269 | mappings. | ||
270 | |||
271 | Saying Y here adds support to have an user space daemon to do this | ||
272 | resolution instead. This is useful for implementing an alternate | ||
273 | address resolution protocol (e.g. NHRP on mGRE tunnels) and also for | ||
274 | testing purposes. | ||
275 | |||
276 | If unsure, say N. | ||
277 | |||
278 | config SYN_COOKIES | 262 | config SYN_COOKIES |
279 | bool "IP: TCP syncookie support" | 263 | bool "IP: TCP syncookie support" |
280 | ---help--- | 264 | ---help--- |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b4d0be2b7ce9..7a1874b7b8fd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -1532,18 +1532,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) | |||
1532 | } | 1532 | } |
1533 | EXPORT_SYMBOL_GPL(snmp_mib_init); | 1533 | EXPORT_SYMBOL_GPL(snmp_mib_init); |
1534 | 1534 | ||
1535 | void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) | ||
1536 | { | ||
1537 | int i; | ||
1538 | |||
1539 | BUG_ON(ptr == NULL); | ||
1540 | for (i = 0; i < SNMP_ARRAY_SZ; i++) { | ||
1541 | free_percpu(ptr[i]); | ||
1542 | ptr[i] = NULL; | ||
1543 | } | ||
1544 | } | ||
1545 | EXPORT_SYMBOL_GPL(snmp_mib_free); | ||
1546 | |||
1547 | #ifdef CONFIG_IP_MULTICAST | 1535 | #ifdef CONFIG_IP_MULTICAST |
1548 | static const struct net_protocol igmp_protocol = { | 1536 | static const struct net_protocol igmp_protocol = { |
1549 | .handler = igmp_rcv, | 1537 | .handler = igmp_rcv, |
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 4429b013f269..7808093cede6 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c | |||
@@ -368,9 +368,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | |||
368 | } else { | 368 | } else { |
369 | probes -= neigh->parms->app_probes; | 369 | probes -= neigh->parms->app_probes; |
370 | if (probes < 0) { | 370 | if (probes < 0) { |
371 | #ifdef CONFIG_ARPD | ||
372 | neigh_app_ns(neigh); | 371 | neigh_app_ns(neigh); |
373 | #endif | ||
374 | return; | 372 | return; |
375 | } | 373 | } |
376 | } | 374 | } |
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 34ca6d5a3a4b..a1b5bcbd04ae 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
@@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = { | |||
73 | [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, | 73 | [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, |
74 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, | 74 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, |
75 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, | 75 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, |
76 | [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, | ||
77 | [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, | ||
76 | }, | 78 | }, |
77 | }; | 79 | }; |
78 | 80 | ||
@@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = { | |||
83 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, | 85 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, |
84 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, | 86 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, |
85 | [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, | 87 | [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, |
88 | [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, | ||
89 | [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, | ||
86 | }, | 90 | }, |
87 | }; | 91 | }; |
88 | 92 | ||
@@ -1126,10 +1130,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) | |||
1126 | if (len < (int) sizeof(ifr)) | 1130 | if (len < (int) sizeof(ifr)) |
1127 | break; | 1131 | break; |
1128 | memset(&ifr, 0, sizeof(struct ifreq)); | 1132 | memset(&ifr, 0, sizeof(struct ifreq)); |
1129 | if (ifa->ifa_label) | 1133 | strcpy(ifr.ifr_name, ifa->ifa_label); |
1130 | strcpy(ifr.ifr_name, ifa->ifa_label); | ||
1131 | else | ||
1132 | strcpy(ifr.ifr_name, dev->name); | ||
1133 | 1134 | ||
1134 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; | 1135 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; |
1135 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = | 1136 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = |
@@ -2097,11 +2098,15 @@ static struct devinet_sysctl_table { | |||
2097 | DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), | 2098 | DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), |
2098 | DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), | 2099 | DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), |
2099 | DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), | 2100 | DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), |
2101 | DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, | ||
2102 | "force_igmp_version"), | ||
2103 | DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, | ||
2104 | "igmpv2_unsolicited_report_interval"), | ||
2105 | DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, | ||
2106 | "igmpv3_unsolicited_report_interval"), | ||
2100 | 2107 | ||
2101 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), | 2108 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), |
2102 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), | 2109 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), |
2103 | DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, | ||
2104 | "force_igmp_version"), | ||
2105 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, | 2110 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, |
2106 | "promote_secondaries"), | 2111 | "promote_secondaries"), |
2107 | DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, | 2112 | DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, |
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ab3d814bc80a..109ee89f123e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c | |||
@@ -477,7 +477,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) | |||
477 | } | 477 | } |
478 | 478 | ||
479 | return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - | 479 | return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - |
480 | net_adj) & ~(align - 1)) + (net_adj - 2); | 480 | net_adj) & ~(align - 1)) + net_adj - 2; |
481 | } | 481 | } |
482 | 482 | ||
483 | static void esp4_err(struct sk_buff *skb, u32 info) | 483 | static void esp4_err(struct sk_buff *skb, u32 info) |
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 26aa65d1fce4..523be38e37de 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c | |||
@@ -101,6 +101,30 @@ errout: | |||
101 | return err; | 101 | return err; |
102 | } | 102 | } |
103 | 103 | ||
104 | static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) | ||
105 | { | ||
106 | struct fib_result *result = (struct fib_result *) arg->result; | ||
107 | struct net_device *dev = result->fi->fib_dev; | ||
108 | |||
109 | /* do not accept result if the route does | ||
110 | * not meet the required prefix length | ||
111 | */ | ||
112 | if (result->prefixlen <= rule->suppress_prefixlen) | ||
113 | goto suppress_route; | ||
114 | |||
115 | /* do not accept result if the route uses a device | ||
116 | * belonging to a forbidden interface group | ||
117 | */ | ||
118 | if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) | ||
119 | goto suppress_route; | ||
120 | |||
121 | return false; | ||
122 | |||
123 | suppress_route: | ||
124 | if (!(arg->flags & FIB_LOOKUP_NOREF)) | ||
125 | fib_info_put(result->fi); | ||
126 | return true; | ||
127 | } | ||
104 | 128 | ||
105 | static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) | 129 | static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) |
106 | { | 130 | { |
@@ -267,6 +291,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { | |||
267 | .rule_size = sizeof(struct fib4_rule), | 291 | .rule_size = sizeof(struct fib4_rule), |
268 | .addr_size = sizeof(u32), | 292 | .addr_size = sizeof(u32), |
269 | .action = fib4_rule_action, | 293 | .action = fib4_rule_action, |
294 | .suppress = fib4_rule_suppress, | ||
270 | .match = fib4_rule_match, | 295 | .match = fib4_rule_match, |
271 | .configure = fib4_rule_configure, | 296 | .configure = fib4_rule_configure, |
272 | .delete = fib4_rule_delete, | 297 | .delete = fib4_rule_delete, |
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 108a1e9c9eac..3df6d3edb2a1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c | |||
@@ -71,7 +71,6 @@ | |||
71 | #include <linux/init.h> | 71 | #include <linux/init.h> |
72 | #include <linux/list.h> | 72 | #include <linux/list.h> |
73 | #include <linux/slab.h> | 73 | #include <linux/slab.h> |
74 | #include <linux/prefetch.h> | ||
75 | #include <linux/export.h> | 74 | #include <linux/export.h> |
76 | #include <net/net_namespace.h> | 75 | #include <net/net_namespace.h> |
77 | #include <net/ip.h> | 76 | #include <net/ip.h> |
@@ -1761,10 +1760,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) | |||
1761 | if (!c) | 1760 | if (!c) |
1762 | continue; | 1761 | continue; |
1763 | 1762 | ||
1764 | if (IS_LEAF(c)) { | 1763 | if (IS_LEAF(c)) |
1765 | prefetch(rcu_dereference_rtnl(p->child[idx])); | ||
1766 | return (struct leaf *) c; | 1764 | return (struct leaf *) c; |
1767 | } | ||
1768 | 1765 | ||
1769 | /* Rescan start scanning in new node */ | 1766 | /* Rescan start scanning in new node */ |
1770 | p = (struct tnode *) c; | 1767 | p = (struct tnode *) c; |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cd71190d2962..d6c0e64ec97f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -88,6 +88,7 @@ | |||
88 | #include <linux/if_arp.h> | 88 | #include <linux/if_arp.h> |
89 | #include <linux/rtnetlink.h> | 89 | #include <linux/rtnetlink.h> |
90 | #include <linux/times.h> | 90 | #include <linux/times.h> |
91 | #include <linux/pkt_sched.h> | ||
91 | 92 | ||
92 | #include <net/net_namespace.h> | 93 | #include <net/net_namespace.h> |
93 | #include <net/arp.h> | 94 | #include <net/arp.h> |
@@ -113,7 +114,8 @@ | |||
113 | 114 | ||
114 | #define IGMP_V1_Router_Present_Timeout (400*HZ) | 115 | #define IGMP_V1_Router_Present_Timeout (400*HZ) |
115 | #define IGMP_V2_Router_Present_Timeout (400*HZ) | 116 | #define IGMP_V2_Router_Present_Timeout (400*HZ) |
116 | #define IGMP_Unsolicited_Report_Interval (10*HZ) | 117 | #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) |
118 | #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) | ||
117 | #define IGMP_Query_Response_Interval (10*HZ) | 119 | #define IGMP_Query_Response_Interval (10*HZ) |
118 | #define IGMP_Unsolicited_Report_Count 2 | 120 | #define IGMP_Unsolicited_Report_Count 2 |
119 | 121 | ||
@@ -138,6 +140,29 @@ | |||
138 | ((in_dev)->mr_v2_seen && \ | 140 | ((in_dev)->mr_v2_seen && \ |
139 | time_before(jiffies, (in_dev)->mr_v2_seen))) | 141 | time_before(jiffies, (in_dev)->mr_v2_seen))) |
140 | 142 | ||
143 | static int unsolicited_report_interval(struct in_device *in_dev) | ||
144 | { | ||
145 | int interval_ms, interval_jiffies; | ||
146 | |||
147 | if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) | ||
148 | interval_ms = IN_DEV_CONF_GET( | ||
149 | in_dev, | ||
150 | IGMPV2_UNSOLICITED_REPORT_INTERVAL); | ||
151 | else /* v3 */ | ||
152 | interval_ms = IN_DEV_CONF_GET( | ||
153 | in_dev, | ||
154 | IGMPV3_UNSOLICITED_REPORT_INTERVAL); | ||
155 | |||
156 | interval_jiffies = msecs_to_jiffies(interval_ms); | ||
157 | |||
158 | /* _timer functions can't handle a delay of 0 jiffies so ensure | ||
159 | * we always return a positive value. | ||
160 | */ | ||
161 | if (interval_jiffies <= 0) | ||
162 | interval_jiffies = 1; | ||
163 | return interval_jiffies; | ||
164 | } | ||
165 | |||
141 | static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); | 166 | static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); |
142 | static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); | 167 | static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); |
143 | static void igmpv3_clear_delrec(struct in_device *in_dev); | 168 | static void igmpv3_clear_delrec(struct in_device *in_dev); |
@@ -315,6 +340,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
315 | if (size < 256) | 340 | if (size < 256) |
316 | return NULL; | 341 | return NULL; |
317 | } | 342 | } |
343 | skb->priority = TC_PRIO_CONTROL; | ||
318 | igmp_skb_size(skb) = size; | 344 | igmp_skb_size(skb) = size; |
319 | 345 | ||
320 | rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, | 346 | rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, |
@@ -670,6 +696,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
670 | ip_rt_put(rt); | 696 | ip_rt_put(rt); |
671 | return -1; | 697 | return -1; |
672 | } | 698 | } |
699 | skb->priority = TC_PRIO_CONTROL; | ||
673 | 700 | ||
674 | skb_dst_set(skb, &rt->dst); | 701 | skb_dst_set(skb, &rt->dst); |
675 | 702 | ||
@@ -719,7 +746,8 @@ static void igmp_ifc_timer_expire(unsigned long data) | |||
719 | igmpv3_send_cr(in_dev); | 746 | igmpv3_send_cr(in_dev); |
720 | if (in_dev->mr_ifc_count) { | 747 | if (in_dev->mr_ifc_count) { |
721 | in_dev->mr_ifc_count--; | 748 | in_dev->mr_ifc_count--; |
722 | igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); | 749 | igmp_ifc_start_timer(in_dev, |
750 | unsolicited_report_interval(in_dev)); | ||
723 | } | 751 | } |
724 | __in_dev_put(in_dev); | 752 | __in_dev_put(in_dev); |
725 | } | 753 | } |
@@ -744,7 +772,7 @@ static void igmp_timer_expire(unsigned long data) | |||
744 | 772 | ||
745 | if (im->unsolicit_count) { | 773 | if (im->unsolicit_count) { |
746 | im->unsolicit_count--; | 774 | im->unsolicit_count--; |
747 | igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); | 775 | igmp_start_timer(im, unsolicited_report_interval(in_dev)); |
748 | } | 776 | } |
749 | im->reporter = 1; | 777 | im->reporter = 1; |
750 | spin_unlock(&im->lock); | 778 | spin_unlock(&im->lock); |
@@ -1323,16 +1351,17 @@ out: | |||
1323 | EXPORT_SYMBOL(ip_mc_inc_group); | 1351 | EXPORT_SYMBOL(ip_mc_inc_group); |
1324 | 1352 | ||
1325 | /* | 1353 | /* |
1326 | * Resend IGMP JOIN report; used for bonding. | 1354 | * Resend IGMP JOIN report; used by netdev notifier. |
1327 | * Called with rcu_read_lock() | ||
1328 | */ | 1355 | */ |
1329 | void ip_mc_rejoin_groups(struct in_device *in_dev) | 1356 | static void ip_mc_rejoin_groups(struct in_device *in_dev) |
1330 | { | 1357 | { |
1331 | #ifdef CONFIG_IP_MULTICAST | 1358 | #ifdef CONFIG_IP_MULTICAST |
1332 | struct ip_mc_list *im; | 1359 | struct ip_mc_list *im; |
1333 | int type; | 1360 | int type; |
1334 | 1361 | ||
1335 | for_each_pmc_rcu(in_dev, im) { | 1362 | ASSERT_RTNL(); |
1363 | |||
1364 | for_each_pmc_rtnl(in_dev, im) { | ||
1336 | if (im->multiaddr == IGMP_ALL_HOSTS) | 1365 | if (im->multiaddr == IGMP_ALL_HOSTS) |
1337 | continue; | 1366 | continue; |
1338 | 1367 | ||
@@ -1349,7 +1378,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev) | |||
1349 | } | 1378 | } |
1350 | #endif | 1379 | #endif |
1351 | } | 1380 | } |
1352 | EXPORT_SYMBOL(ip_mc_rejoin_groups); | ||
1353 | 1381 | ||
1354 | /* | 1382 | /* |
1355 | * A socket has left a multicast group on device dev | 1383 | * A socket has left a multicast group on device dev |
@@ -2735,8 +2763,42 @@ static struct pernet_operations igmp_net_ops = { | |||
2735 | .exit = igmp_net_exit, | 2763 | .exit = igmp_net_exit, |
2736 | }; | 2764 | }; |
2737 | 2765 | ||
2766 | static int igmp_netdev_event(struct notifier_block *this, | ||
2767 | unsigned long event, void *ptr) | ||
2768 | { | ||
2769 | struct net_device *dev = netdev_notifier_info_to_dev(ptr); | ||
2770 | struct in_device *in_dev; | ||
2771 | |||
2772 | switch (event) { | ||
2773 | case NETDEV_RESEND_IGMP: | ||
2774 | in_dev = __in_dev_get_rtnl(dev); | ||
2775 | if (in_dev) | ||
2776 | ip_mc_rejoin_groups(in_dev); | ||
2777 | break; | ||
2778 | default: | ||
2779 | break; | ||
2780 | } | ||
2781 | return NOTIFY_DONE; | ||
2782 | } | ||
2783 | |||
2784 | static struct notifier_block igmp_notifier = { | ||
2785 | .notifier_call = igmp_netdev_event, | ||
2786 | }; | ||
2787 | |||
2738 | int __init igmp_mc_proc_init(void) | 2788 | int __init igmp_mc_proc_init(void) |
2739 | { | 2789 | { |
2740 | return register_pernet_subsys(&igmp_net_ops); | 2790 | int err; |
2791 | |||
2792 | err = register_pernet_subsys(&igmp_net_ops); | ||
2793 | if (err) | ||
2794 | return err; | ||
2795 | err = register_netdevice_notifier(&igmp_notifier); | ||
2796 | if (err) | ||
2797 | goto reg_notif_fail; | ||
2798 | return 0; | ||
2799 | |||
2800 | reg_notif_fail: | ||
2801 | unregister_pernet_subsys(&igmp_net_ops); | ||
2802 | return err; | ||
2741 | } | 2803 | } |
2742 | #endif | 2804 | #endif |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1f6eab66f7ce..d7aea4c5b940 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -383,7 +383,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, | |||
383 | if (daddr) | 383 | if (daddr) |
384 | memcpy(&iph->daddr, daddr, 4); | 384 | memcpy(&iph->daddr, daddr, 4); |
385 | if (iph->daddr) | 385 | if (iph->daddr) |
386 | return t->hlen; | 386 | return t->hlen + sizeof(*iph); |
387 | 387 | ||
388 | return -(t->hlen + sizeof(*iph)); | 388 | return -(t->hlen + sizeof(*iph)); |
389 | } | 389 | } |
@@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net) | |||
534 | static void __net_exit ipgre_exit_net(struct net *net) | 534 | static void __net_exit ipgre_exit_net(struct net *net) |
535 | { | 535 | { |
536 | struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); | 536 | struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); |
537 | ip_tunnel_delete_net(itn); | 537 | ip_tunnel_delete_net(itn, &ipgre_link_ops); |
538 | } | 538 | } |
539 | 539 | ||
540 | static struct pernet_operations ipgre_net_ops = { | 540 | static struct pernet_operations ipgre_net_ops = { |
@@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net) | |||
767 | static void __net_exit ipgre_tap_exit_net(struct net *net) | 767 | static void __net_exit ipgre_tap_exit_net(struct net *net) |
768 | { | 768 | { |
769 | struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); | 769 | struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); |
770 | ip_tunnel_delete_net(itn); | 770 | ip_tunnel_delete_net(itn, &ipgre_tap_ops); |
771 | } | 771 | } |
772 | 772 | ||
773 | static struct pernet_operations ipgre_tap_net_ops = { | 773 | static struct pernet_operations ipgre_tap_net_ops = { |
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 15e3e683adec..054a3e97d822 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -141,6 +141,7 @@ | |||
141 | #include <net/icmp.h> | 141 | #include <net/icmp.h> |
142 | #include <net/raw.h> | 142 | #include <net/raw.h> |
143 | #include <net/checksum.h> | 143 | #include <net/checksum.h> |
144 | #include <net/inet_ecn.h> | ||
144 | #include <linux/netfilter_ipv4.h> | 145 | #include <linux/netfilter_ipv4.h> |
145 | #include <net/xfrm.h> | 146 | #include <net/xfrm.h> |
146 | #include <linux/mroute.h> | 147 | #include <linux/mroute.h> |
@@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, | |||
410 | if (iph->ihl < 5 || iph->version != 4) | 411 | if (iph->ihl < 5 || iph->version != 4) |
411 | goto inhdr_error; | 412 | goto inhdr_error; |
412 | 413 | ||
414 | BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); | ||
415 | BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); | ||
416 | BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); | ||
417 | IP_ADD_STATS_BH(dev_net(dev), | ||
418 | IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), | ||
419 | max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); | ||
420 | |||
413 | if (!pskb_may_pull(skb, iph->ihl*4)) | 421 | if (!pskb_may_pull(skb, iph->ihl*4)) |
414 | goto inhdr_error; | 422 | goto inhdr_error; |
415 | 423 | ||
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4bcabf3ab4ca..9ee17e3d11c3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -211,14 +211,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
211 | return -EINVAL; | 211 | return -EINVAL; |
212 | } | 212 | } |
213 | 213 | ||
214 | static inline int ip_skb_dst_mtu(struct sk_buff *skb) | ||
215 | { | ||
216 | struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; | ||
217 | |||
218 | return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? | ||
219 | skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); | ||
220 | } | ||
221 | |||
222 | static int ip_finish_output(struct sk_buff *skb) | 214 | static int ip_finish_output(struct sk_buff *skb) |
223 | { | 215 | { |
224 | #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) | 216 | #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) |
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ca1cb2d5f6e2..ac9fabe0300f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c | |||
@@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) | |||
350 | struct flowi4 fl4; | 350 | struct flowi4 fl4; |
351 | struct rtable *rt; | 351 | struct rtable *rt; |
352 | 352 | ||
353 | rt = ip_route_output_tunnel(dev_net(dev), &fl4, | 353 | rt = ip_route_output_tunnel(tunnel->net, &fl4, |
354 | tunnel->parms.iph.protocol, | 354 | tunnel->parms.iph.protocol, |
355 | iph->daddr, iph->saddr, | 355 | iph->daddr, iph->saddr, |
356 | tunnel->parms.o_key, | 356 | tunnel->parms.o_key, |
@@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) | |||
365 | } | 365 | } |
366 | 366 | ||
367 | if (!tdev && tunnel->parms.link) | 367 | if (!tdev && tunnel->parms.link) |
368 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | 368 | tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); |
369 | 369 | ||
370 | if (tdev) { | 370 | if (tdev) { |
371 | hlen = tdev->hard_header_len + tdev->needed_headroom; | 371 | hlen = tdev->hard_header_len + tdev->needed_headroom; |
@@ -454,15 +454,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, | |||
454 | tstats->rx_bytes += skb->len; | 454 | tstats->rx_bytes += skb->len; |
455 | u64_stats_update_end(&tstats->syncp); | 455 | u64_stats_update_end(&tstats->syncp); |
456 | 456 | ||
457 | if (tunnel->net != dev_net(tunnel->dev)) | ||
458 | skb_scrub_packet(skb); | ||
459 | |||
460 | if (tunnel->dev->type == ARPHRD_ETHER) { | 457 | if (tunnel->dev->type == ARPHRD_ETHER) { |
461 | skb->protocol = eth_type_trans(skb, tunnel->dev); | 458 | skb->protocol = eth_type_trans(skb, tunnel->dev); |
462 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); | 459 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); |
463 | } else { | 460 | } else { |
464 | skb->dev = tunnel->dev; | 461 | skb->dev = tunnel->dev; |
465 | } | 462 | } |
463 | |||
464 | skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); | ||
465 | |||
466 | gro_cells_receive(&tunnel->gro_cells, skb); | 466 | gro_cells_receive(&tunnel->gro_cells, skb); |
467 | return 0; | 467 | return 0; |
468 | 468 | ||
@@ -613,9 +613,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | |||
613 | goto tx_error; | 613 | goto tx_error; |
614 | } | 614 | } |
615 | 615 | ||
616 | if (tunnel->net != dev_net(dev)) | ||
617 | skb_scrub_packet(skb); | ||
618 | |||
619 | if (tunnel->err_count > 0) { | 616 | if (tunnel->err_count > 0) { |
620 | if (time_before(jiffies, | 617 | if (time_before(jiffies, |
621 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { | 618 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { |
@@ -653,9 +650,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | |||
653 | } | 650 | } |
654 | } | 651 | } |
655 | 652 | ||
656 | err = iptunnel_xmit(dev_net(dev), rt, skb, | 653 | err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, |
657 | fl4.saddr, fl4.daddr, protocol, | 654 | ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df, |
658 | ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df); | 655 | !net_eq(tunnel->net, dev_net(dev))); |
659 | iptunnel_xmit_stats(err, &dev->stats, dev->tstats); | 656 | iptunnel_xmit_stats(err, &dev->stats, dev->tstats); |
660 | 657 | ||
661 | return; | 658 | return; |
@@ -820,11 +817,10 @@ static void ip_tunnel_dev_free(struct net_device *dev) | |||
820 | 817 | ||
821 | void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) | 818 | void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) |
822 | { | 819 | { |
823 | struct net *net = dev_net(dev); | ||
824 | struct ip_tunnel *tunnel = netdev_priv(dev); | 820 | struct ip_tunnel *tunnel = netdev_priv(dev); |
825 | struct ip_tunnel_net *itn; | 821 | struct ip_tunnel_net *itn; |
826 | 822 | ||
827 | itn = net_generic(net, tunnel->ip_tnl_net_id); | 823 | itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); |
828 | 824 | ||
829 | if (itn->fb_tunnel_dev != dev) { | 825 | if (itn->fb_tunnel_dev != dev) { |
830 | ip_tunnel_del(netdev_priv(dev)); | 826 | ip_tunnel_del(netdev_priv(dev)); |
@@ -838,56 +834,68 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, | |||
838 | { | 834 | { |
839 | struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); | 835 | struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); |
840 | struct ip_tunnel_parm parms; | 836 | struct ip_tunnel_parm parms; |
837 | unsigned int i; | ||
841 | 838 | ||
842 | itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL); | 839 | for (i = 0; i < IP_TNL_HASH_SIZE; i++) |
843 | if (!itn->tunnels) | 840 | INIT_HLIST_HEAD(&itn->tunnels[i]); |
844 | return -ENOMEM; | ||
845 | 841 | ||
846 | if (!ops) { | 842 | if (!ops) { |
847 | itn->fb_tunnel_dev = NULL; | 843 | itn->fb_tunnel_dev = NULL; |
848 | return 0; | 844 | return 0; |
849 | } | 845 | } |
846 | |||
850 | memset(&parms, 0, sizeof(parms)); | 847 | memset(&parms, 0, sizeof(parms)); |
851 | if (devname) | 848 | if (devname) |
852 | strlcpy(parms.name, devname, IFNAMSIZ); | 849 | strlcpy(parms.name, devname, IFNAMSIZ); |
853 | 850 | ||
854 | rtnl_lock(); | 851 | rtnl_lock(); |
855 | itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); | 852 | itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); |
853 | /* FB netdevice is special: we have one, and only one per netns. | ||
854 | * Allowing to move it to another netns is clearly unsafe. | ||
855 | */ | ||
856 | if (!IS_ERR(itn->fb_tunnel_dev)) | ||
857 | itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; | ||
856 | rtnl_unlock(); | 858 | rtnl_unlock(); |
857 | if (IS_ERR(itn->fb_tunnel_dev)) { | ||
858 | kfree(itn->tunnels); | ||
859 | return PTR_ERR(itn->fb_tunnel_dev); | ||
860 | } | ||
861 | 859 | ||
862 | return 0; | 860 | return PTR_RET(itn->fb_tunnel_dev); |
863 | } | 861 | } |
864 | EXPORT_SYMBOL_GPL(ip_tunnel_init_net); | 862 | EXPORT_SYMBOL_GPL(ip_tunnel_init_net); |
865 | 863 | ||
866 | static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head) | 864 | static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, |
865 | struct rtnl_link_ops *ops) | ||
867 | { | 866 | { |
867 | struct net *net = dev_net(itn->fb_tunnel_dev); | ||
868 | struct net_device *dev, *aux; | ||
868 | int h; | 869 | int h; |
869 | 870 | ||
871 | for_each_netdev_safe(net, dev, aux) | ||
872 | if (dev->rtnl_link_ops == ops) | ||
873 | unregister_netdevice_queue(dev, head); | ||
874 | |||
870 | for (h = 0; h < IP_TNL_HASH_SIZE; h++) { | 875 | for (h = 0; h < IP_TNL_HASH_SIZE; h++) { |
871 | struct ip_tunnel *t; | 876 | struct ip_tunnel *t; |
872 | struct hlist_node *n; | 877 | struct hlist_node *n; |
873 | struct hlist_head *thead = &itn->tunnels[h]; | 878 | struct hlist_head *thead = &itn->tunnels[h]; |
874 | 879 | ||
875 | hlist_for_each_entry_safe(t, n, thead, hash_node) | 880 | hlist_for_each_entry_safe(t, n, thead, hash_node) |
876 | unregister_netdevice_queue(t->dev, head); | 881 | /* If dev is in the same netns, it has already |
882 | * been added to the list by the previous loop. | ||
883 | */ | ||
884 | if (!net_eq(dev_net(t->dev), net)) | ||
885 | unregister_netdevice_queue(t->dev, head); | ||
877 | } | 886 | } |
878 | if (itn->fb_tunnel_dev) | 887 | if (itn->fb_tunnel_dev) |
879 | unregister_netdevice_queue(itn->fb_tunnel_dev, head); | 888 | unregister_netdevice_queue(itn->fb_tunnel_dev, head); |
880 | } | 889 | } |
881 | 890 | ||
882 | void ip_tunnel_delete_net(struct ip_tunnel_net *itn) | 891 | void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) |
883 | { | 892 | { |
884 | LIST_HEAD(list); | 893 | LIST_HEAD(list); |
885 | 894 | ||
886 | rtnl_lock(); | 895 | rtnl_lock(); |
887 | ip_tunnel_destroy(itn, &list); | 896 | ip_tunnel_destroy(itn, &list, ops); |
888 | unregister_netdevice_many(&list); | 897 | unregister_netdevice_many(&list); |
889 | rtnl_unlock(); | 898 | rtnl_unlock(); |
890 | kfree(itn->tunnels); | ||
891 | } | 899 | } |
892 | EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); | 900 | EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); |
893 | 901 | ||
@@ -929,23 +937,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink); | |||
929 | int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], | 937 | int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], |
930 | struct ip_tunnel_parm *p) | 938 | struct ip_tunnel_parm *p) |
931 | { | 939 | { |
932 | struct ip_tunnel *t, *nt; | 940 | struct ip_tunnel *t; |
933 | struct net *net = dev_net(dev); | ||
934 | struct ip_tunnel *tunnel = netdev_priv(dev); | 941 | struct ip_tunnel *tunnel = netdev_priv(dev); |
942 | struct net *net = tunnel->net; | ||
935 | struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); | 943 | struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); |
936 | 944 | ||
937 | if (dev == itn->fb_tunnel_dev) | 945 | if (dev == itn->fb_tunnel_dev) |
938 | return -EINVAL; | 946 | return -EINVAL; |
939 | 947 | ||
940 | nt = netdev_priv(dev); | ||
941 | |||
942 | t = ip_tunnel_find(itn, p, dev->type); | 948 | t = ip_tunnel_find(itn, p, dev->type); |
943 | 949 | ||
944 | if (t) { | 950 | if (t) { |
945 | if (t->dev != dev) | 951 | if (t->dev != dev) |
946 | return -EEXIST; | 952 | return -EEXIST; |
947 | } else { | 953 | } else { |
948 | t = nt; | 954 | t = tunnel; |
949 | 955 | ||
950 | if (dev->type != ARPHRD_ETHER) { | 956 | if (dev->type != ARPHRD_ETHER) { |
951 | unsigned int nflags = 0; | 957 | unsigned int nflags = 0; |
@@ -984,6 +990,7 @@ int ip_tunnel_init(struct net_device *dev) | |||
984 | } | 990 | } |
985 | 991 | ||
986 | tunnel->dev = dev; | 992 | tunnel->dev = dev; |
993 | tunnel->net = dev_net(dev); | ||
987 | strcpy(tunnel->parms.name, dev->name); | 994 | strcpy(tunnel->parms.name, dev->name); |
988 | iph->version = 4; | 995 | iph->version = 4; |
989 | iph->ihl = 5; | 996 | iph->ihl = 5; |
@@ -994,8 +1001,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init); | |||
994 | 1001 | ||
995 | void ip_tunnel_uninit(struct net_device *dev) | 1002 | void ip_tunnel_uninit(struct net_device *dev) |
996 | { | 1003 | { |
997 | struct net *net = dev_net(dev); | ||
998 | struct ip_tunnel *tunnel = netdev_priv(dev); | 1004 | struct ip_tunnel *tunnel = netdev_priv(dev); |
1005 | struct net *net = tunnel->net; | ||
999 | struct ip_tunnel_net *itn; | 1006 | struct ip_tunnel_net *itn; |
1000 | 1007 | ||
1001 | itn = net_generic(net, tunnel->ip_tnl_net_id); | 1008 | itn = net_generic(net, tunnel->ip_tnl_net_id); |
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 7167b08977df..d6c856b17fd4 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c | |||
@@ -46,19 +46,17 @@ | |||
46 | #include <net/netns/generic.h> | 46 | #include <net/netns/generic.h> |
47 | #include <net/rtnetlink.h> | 47 | #include <net/rtnetlink.h> |
48 | 48 | ||
49 | int iptunnel_xmit(struct net *net, struct rtable *rt, | 49 | int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, |
50 | struct sk_buff *skb, | ||
51 | __be32 src, __be32 dst, __u8 proto, | 50 | __be32 src, __be32 dst, __u8 proto, |
52 | __u8 tos, __u8 ttl, __be16 df) | 51 | __u8 tos, __u8 ttl, __be16 df, bool xnet) |
53 | { | 52 | { |
54 | int pkt_len = skb->len; | 53 | int pkt_len = skb->len; |
55 | struct iphdr *iph; | 54 | struct iphdr *iph; |
56 | int err; | 55 | int err; |
57 | 56 | ||
58 | nf_reset(skb); | 57 | skb_scrub_packet(skb, xnet); |
59 | secpath_reset(skb); | 58 | |
60 | skb->rxhash = 0; | 59 | skb->rxhash = 0; |
61 | skb_dst_drop(skb); | ||
62 | skb_dst_set(skb, &rt->dst); | 60 | skb_dst_set(skb, &rt->dst); |
63 | memset(IPCB(skb), 0, sizeof(*IPCB(skb))); | 61 | memset(IPCB(skb), 0, sizeof(*IPCB(skb))); |
64 | 62 | ||
@@ -76,9 +74,7 @@ int iptunnel_xmit(struct net *net, struct rtable *rt, | |||
76 | iph->daddr = dst; | 74 | iph->daddr = dst; |
77 | iph->saddr = src; | 75 | iph->saddr = src; |
78 | iph->ttl = ttl; | 76 | iph->ttl = ttl; |
79 | tunnel_ip_select_ident(skb, | 77 | __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); |
80 | (const struct iphdr *)skb_inner_network_header(skb), | ||
81 | &rt->dst); | ||
82 | 78 | ||
83 | err = ip_local_out(skb); | 79 | err = ip_local_out(skb); |
84 | if (unlikely(net_xmit_eval(err))) | 80 | if (unlikely(net_xmit_eval(err))) |
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 17cc0ffa8c0d..e805e7b3030e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c | |||
@@ -44,176 +44,10 @@ | |||
44 | #include <net/net_namespace.h> | 44 | #include <net/net_namespace.h> |
45 | #include <net/netns/generic.h> | 45 | #include <net/netns/generic.h> |
46 | 46 | ||
47 | #define HASH_SIZE 16 | ||
48 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1)) | ||
49 | |||
50 | static struct rtnl_link_ops vti_link_ops __read_mostly; | 47 | static struct rtnl_link_ops vti_link_ops __read_mostly; |
51 | 48 | ||
52 | static int vti_net_id __read_mostly; | 49 | static int vti_net_id __read_mostly; |
53 | struct vti_net { | ||
54 | struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; | ||
55 | struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; | ||
56 | struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; | ||
57 | struct ip_tunnel __rcu *tunnels_wc[1]; | ||
58 | struct ip_tunnel __rcu **tunnels[4]; | ||
59 | |||
60 | struct net_device *fb_tunnel_dev; | ||
61 | }; | ||
62 | |||
63 | static int vti_fb_tunnel_init(struct net_device *dev); | ||
64 | static int vti_tunnel_init(struct net_device *dev); | 50 | static int vti_tunnel_init(struct net_device *dev); |
65 | static void vti_tunnel_setup(struct net_device *dev); | ||
66 | static void vti_dev_free(struct net_device *dev); | ||
67 | static int vti_tunnel_bind_dev(struct net_device *dev); | ||
68 | |||
69 | #define VTI_XMIT(stats1, stats2) do { \ | ||
70 | int err; \ | ||
71 | int pkt_len = skb->len; \ | ||
72 | err = dst_output(skb); \ | ||
73 | if (net_xmit_eval(err) == 0) { \ | ||
74 | u64_stats_update_begin(&(stats1)->syncp); \ | ||
75 | (stats1)->tx_bytes += pkt_len; \ | ||
76 | (stats1)->tx_packets++; \ | ||
77 | u64_stats_update_end(&(stats1)->syncp); \ | ||
78 | } else { \ | ||
79 | (stats2)->tx_errors++; \ | ||
80 | (stats2)->tx_aborted_errors++; \ | ||
81 | } \ | ||
82 | } while (0) | ||
83 | |||
84 | |||
85 | static struct ip_tunnel *vti_tunnel_lookup(struct net *net, | ||
86 | __be32 remote, __be32 local) | ||
87 | { | ||
88 | unsigned h0 = HASH(remote); | ||
89 | unsigned h1 = HASH(local); | ||
90 | struct ip_tunnel *t; | ||
91 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
92 | |||
93 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) | ||
94 | if (local == t->parms.iph.saddr && | ||
95 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
96 | return t; | ||
97 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) | ||
98 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
99 | return t; | ||
100 | |||
101 | for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) | ||
102 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | ||
103 | return t; | ||
104 | |||
105 | for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0]) | ||
106 | if (t && (t->dev->flags&IFF_UP)) | ||
107 | return t; | ||
108 | return NULL; | ||
109 | } | ||
110 | |||
111 | static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn, | ||
112 | struct ip_tunnel_parm *parms) | ||
113 | { | ||
114 | __be32 remote = parms->iph.daddr; | ||
115 | __be32 local = parms->iph.saddr; | ||
116 | unsigned h = 0; | ||
117 | int prio = 0; | ||
118 | |||
119 | if (remote) { | ||
120 | prio |= 2; | ||
121 | h ^= HASH(remote); | ||
122 | } | ||
123 | if (local) { | ||
124 | prio |= 1; | ||
125 | h ^= HASH(local); | ||
126 | } | ||
127 | return &ipn->tunnels[prio][h]; | ||
128 | } | ||
129 | |||
130 | static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn, | ||
131 | struct ip_tunnel *t) | ||
132 | { | ||
133 | return __vti_bucket(ipn, &t->parms); | ||
134 | } | ||
135 | |||
136 | static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t) | ||
137 | { | ||
138 | struct ip_tunnel __rcu **tp; | ||
139 | struct ip_tunnel *iter; | ||
140 | |||
141 | for (tp = vti_bucket(ipn, t); | ||
142 | (iter = rtnl_dereference(*tp)) != NULL; | ||
143 | tp = &iter->next) { | ||
144 | if (t == iter) { | ||
145 | rcu_assign_pointer(*tp, t->next); | ||
146 | break; | ||
147 | } | ||
148 | } | ||
149 | } | ||
150 | |||
151 | static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t) | ||
152 | { | ||
153 | struct ip_tunnel __rcu **tp = vti_bucket(ipn, t); | ||
154 | |||
155 | rcu_assign_pointer(t->next, rtnl_dereference(*tp)); | ||
156 | rcu_assign_pointer(*tp, t); | ||
157 | } | ||
158 | |||
159 | static struct ip_tunnel *vti_tunnel_locate(struct net *net, | ||
160 | struct ip_tunnel_parm *parms, | ||
161 | int create) | ||
162 | { | ||
163 | __be32 remote = parms->iph.daddr; | ||
164 | __be32 local = parms->iph.saddr; | ||
165 | struct ip_tunnel *t, *nt; | ||
166 | struct ip_tunnel __rcu **tp; | ||
167 | struct net_device *dev; | ||
168 | char name[IFNAMSIZ]; | ||
169 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
170 | |||
171 | for (tp = __vti_bucket(ipn, parms); | ||
172 | (t = rtnl_dereference(*tp)) != NULL; | ||
173 | tp = &t->next) { | ||
174 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | ||
175 | return t; | ||
176 | } | ||
177 | if (!create) | ||
178 | return NULL; | ||
179 | |||
180 | if (parms->name[0]) | ||
181 | strlcpy(name, parms->name, IFNAMSIZ); | ||
182 | else | ||
183 | strcpy(name, "vti%d"); | ||
184 | |||
185 | dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup); | ||
186 | if (dev == NULL) | ||
187 | return NULL; | ||
188 | |||
189 | dev_net_set(dev, net); | ||
190 | |||
191 | nt = netdev_priv(dev); | ||
192 | nt->parms = *parms; | ||
193 | dev->rtnl_link_ops = &vti_link_ops; | ||
194 | |||
195 | vti_tunnel_bind_dev(dev); | ||
196 | |||
197 | if (register_netdevice(dev) < 0) | ||
198 | goto failed_free; | ||
199 | |||
200 | dev_hold(dev); | ||
201 | vti_tunnel_link(ipn, nt); | ||
202 | return nt; | ||
203 | |||
204 | failed_free: | ||
205 | free_netdev(dev); | ||
206 | return NULL; | ||
207 | } | ||
208 | |||
209 | static void vti_tunnel_uninit(struct net_device *dev) | ||
210 | { | ||
211 | struct net *net = dev_net(dev); | ||
212 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
213 | |||
214 | vti_tunnel_unlink(ipn, netdev_priv(dev)); | ||
215 | dev_put(dev); | ||
216 | } | ||
217 | 51 | ||
218 | static int vti_err(struct sk_buff *skb, u32 info) | 52 | static int vti_err(struct sk_buff *skb, u32 info) |
219 | { | 53 | { |
@@ -222,6 +56,8 @@ static int vti_err(struct sk_buff *skb, u32 info) | |||
222 | * 8 bytes of packet payload. It means, that precise relaying of | 56 | * 8 bytes of packet payload. It means, that precise relaying of |
223 | * ICMP in the real Internet is absolutely infeasible. | 57 | * ICMP in the real Internet is absolutely infeasible. |
224 | */ | 58 | */ |
59 | struct net *net = dev_net(skb->dev); | ||
60 | struct ip_tunnel_net *itn = net_generic(net, vti_net_id); | ||
225 | struct iphdr *iph = (struct iphdr *)skb->data; | 61 | struct iphdr *iph = (struct iphdr *)skb->data; |
226 | const int type = icmp_hdr(skb)->type; | 62 | const int type = icmp_hdr(skb)->type; |
227 | const int code = icmp_hdr(skb)->code; | 63 | const int code = icmp_hdr(skb)->code; |
@@ -252,7 +88,8 @@ static int vti_err(struct sk_buff *skb, u32 info) | |||
252 | 88 | ||
253 | err = -ENOENT; | 89 | err = -ENOENT; |
254 | 90 | ||
255 | t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | 91 | t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, |
92 | iph->daddr, iph->saddr, 0); | ||
256 | if (t == NULL) | 93 | if (t == NULL) |
257 | goto out; | 94 | goto out; |
258 | 95 | ||
@@ -281,8 +118,11 @@ static int vti_rcv(struct sk_buff *skb) | |||
281 | { | 118 | { |
282 | struct ip_tunnel *tunnel; | 119 | struct ip_tunnel *tunnel; |
283 | const struct iphdr *iph = ip_hdr(skb); | 120 | const struct iphdr *iph = ip_hdr(skb); |
121 | struct net *net = dev_net(skb->dev); | ||
122 | struct ip_tunnel_net *itn = net_generic(net, vti_net_id); | ||
284 | 123 | ||
285 | tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); | 124 | tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, |
125 | iph->saddr, iph->daddr, 0); | ||
286 | if (tunnel != NULL) { | 126 | if (tunnel != NULL) { |
287 | struct pcpu_tstats *tstats; | 127 | struct pcpu_tstats *tstats; |
288 | 128 | ||
@@ -311,7 +151,6 @@ static int vti_rcv(struct sk_buff *skb) | |||
311 | static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 151 | static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
312 | { | 152 | { |
313 | struct ip_tunnel *tunnel = netdev_priv(dev); | 153 | struct ip_tunnel *tunnel = netdev_priv(dev); |
314 | struct pcpu_tstats *tstats; | ||
315 | struct iphdr *tiph = &tunnel->parms.iph; | 154 | struct iphdr *tiph = &tunnel->parms.iph; |
316 | u8 tos; | 155 | u8 tos; |
317 | struct rtable *rt; /* Route to the other host */ | 156 | struct rtable *rt; /* Route to the other host */ |
@@ -319,6 +158,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
319 | struct iphdr *old_iph = ip_hdr(skb); | 158 | struct iphdr *old_iph = ip_hdr(skb); |
320 | __be32 dst = tiph->daddr; | 159 | __be32 dst = tiph->daddr; |
321 | struct flowi4 fl4; | 160 | struct flowi4 fl4; |
161 | int err; | ||
322 | 162 | ||
323 | if (skb->protocol != htons(ETH_P_IP)) | 163 | if (skb->protocol != htons(ETH_P_IP)) |
324 | goto tx_error; | 164 | goto tx_error; |
@@ -367,8 +207,10 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
367 | nf_reset(skb); | 207 | nf_reset(skb); |
368 | skb->dev = skb_dst(skb)->dev; | 208 | skb->dev = skb_dst(skb)->dev; |
369 | 209 | ||
370 | tstats = this_cpu_ptr(dev->tstats); | 210 | err = dst_output(skb); |
371 | VTI_XMIT(tstats, &dev->stats); | 211 | if (net_xmit_eval(err) == 0) |
212 | err = skb->len; | ||
213 | iptunnel_xmit_stats(err, &dev->stats, dev->tstats); | ||
372 | return NETDEV_TX_OK; | 214 | return NETDEV_TX_OK; |
373 | 215 | ||
374 | tx_error_icmp: | 216 | tx_error_icmp: |
@@ -379,198 +221,57 @@ tx_error: | |||
379 | return NETDEV_TX_OK; | 221 | return NETDEV_TX_OK; |
380 | } | 222 | } |
381 | 223 | ||
382 | static int vti_tunnel_bind_dev(struct net_device *dev) | ||
383 | { | ||
384 | struct net_device *tdev = NULL; | ||
385 | struct ip_tunnel *tunnel; | ||
386 | struct iphdr *iph; | ||
387 | |||
388 | tunnel = netdev_priv(dev); | ||
389 | iph = &tunnel->parms.iph; | ||
390 | |||
391 | if (iph->daddr) { | ||
392 | struct rtable *rt; | ||
393 | struct flowi4 fl4; | ||
394 | memset(&fl4, 0, sizeof(fl4)); | ||
395 | flowi4_init_output(&fl4, tunnel->parms.link, | ||
396 | be32_to_cpu(tunnel->parms.i_key), | ||
397 | RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, | ||
398 | IPPROTO_IPIP, 0, | ||
399 | iph->daddr, iph->saddr, 0, 0); | ||
400 | rt = ip_route_output_key(dev_net(dev), &fl4); | ||
401 | if (!IS_ERR(rt)) { | ||
402 | tdev = rt->dst.dev; | ||
403 | ip_rt_put(rt); | ||
404 | } | ||
405 | dev->flags |= IFF_POINTOPOINT; | ||
406 | } | ||
407 | |||
408 | if (!tdev && tunnel->parms.link) | ||
409 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | ||
410 | |||
411 | if (tdev) { | ||
412 | dev->hard_header_len = tdev->hard_header_len + | ||
413 | sizeof(struct iphdr); | ||
414 | dev->mtu = tdev->mtu; | ||
415 | } | ||
416 | dev->iflink = tunnel->parms.link; | ||
417 | return dev->mtu; | ||
418 | } | ||
419 | |||
420 | static int | 224 | static int |
421 | vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) | 225 | vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) |
422 | { | 226 | { |
423 | int err = 0; | 227 | int err = 0; |
424 | struct ip_tunnel_parm p; | 228 | struct ip_tunnel_parm p; |
425 | struct ip_tunnel *t; | ||
426 | struct net *net = dev_net(dev); | ||
427 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
428 | |||
429 | switch (cmd) { | ||
430 | case SIOCGETTUNNEL: | ||
431 | t = NULL; | ||
432 | if (dev == ipn->fb_tunnel_dev) { | ||
433 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
434 | sizeof(p))) { | ||
435 | err = -EFAULT; | ||
436 | break; | ||
437 | } | ||
438 | t = vti_tunnel_locate(net, &p, 0); | ||
439 | } | ||
440 | if (t == NULL) | ||
441 | t = netdev_priv(dev); | ||
442 | memcpy(&p, &t->parms, sizeof(p)); | ||
443 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
444 | p.o_flags |= GRE_KEY; | ||
445 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | ||
446 | err = -EFAULT; | ||
447 | break; | ||
448 | |||
449 | case SIOCADDTUNNEL: | ||
450 | case SIOCCHGTUNNEL: | ||
451 | err = -EPERM; | ||
452 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
453 | goto done; | ||
454 | 229 | ||
455 | err = -EFAULT; | 230 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
456 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | 231 | return -EFAULT; |
457 | goto done; | ||
458 | 232 | ||
459 | err = -EINVAL; | 233 | if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { |
460 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || | 234 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || |
461 | p.iph.ihl != 5) | 235 | p.iph.ihl != 5) |
462 | goto done; | 236 | return -EINVAL; |
463 | 237 | } | |
464 | t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | ||
465 | |||
466 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | ||
467 | if (t != NULL) { | ||
468 | if (t->dev != dev) { | ||
469 | err = -EEXIST; | ||
470 | break; | ||
471 | } | ||
472 | } else { | ||
473 | if (((dev->flags&IFF_POINTOPOINT) && | ||
474 | !p.iph.daddr) || | ||
475 | (!(dev->flags&IFF_POINTOPOINT) && | ||
476 | p.iph.daddr)) { | ||
477 | err = -EINVAL; | ||
478 | break; | ||
479 | } | ||
480 | t = netdev_priv(dev); | ||
481 | vti_tunnel_unlink(ipn, t); | ||
482 | synchronize_net(); | ||
483 | t->parms.iph.saddr = p.iph.saddr; | ||
484 | t->parms.iph.daddr = p.iph.daddr; | ||
485 | t->parms.i_key = p.i_key; | ||
486 | t->parms.o_key = p.o_key; | ||
487 | t->parms.iph.protocol = IPPROTO_IPIP; | ||
488 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
489 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
490 | vti_tunnel_link(ipn, t); | ||
491 | netdev_state_change(dev); | ||
492 | } | ||
493 | } | ||
494 | |||
495 | if (t) { | ||
496 | err = 0; | ||
497 | if (cmd == SIOCCHGTUNNEL) { | ||
498 | t->parms.i_key = p.i_key; | ||
499 | t->parms.o_key = p.o_key; | ||
500 | if (t->parms.link != p.link) { | ||
501 | t->parms.link = p.link; | ||
502 | vti_tunnel_bind_dev(dev); | ||
503 | netdev_state_change(dev); | ||
504 | } | ||
505 | } | ||
506 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
507 | p.o_flags |= GRE_KEY; | ||
508 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, | ||
509 | sizeof(p))) | ||
510 | err = -EFAULT; | ||
511 | } else | ||
512 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | ||
513 | break; | ||
514 | 238 | ||
515 | case SIOCDELTUNNEL: | 239 | err = ip_tunnel_ioctl(dev, &p, cmd); |
516 | err = -EPERM; | 240 | if (err) |
517 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 241 | return err; |
518 | goto done; | ||
519 | |||
520 | if (dev == ipn->fb_tunnel_dev) { | ||
521 | err = -EFAULT; | ||
522 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
523 | sizeof(p))) | ||
524 | goto done; | ||
525 | err = -ENOENT; | ||
526 | |||
527 | t = vti_tunnel_locate(net, &p, 0); | ||
528 | if (t == NULL) | ||
529 | goto done; | ||
530 | err = -EPERM; | ||
531 | if (t->dev == ipn->fb_tunnel_dev) | ||
532 | goto done; | ||
533 | dev = t->dev; | ||
534 | } | ||
535 | unregister_netdevice(dev); | ||
536 | err = 0; | ||
537 | break; | ||
538 | 242 | ||
539 | default: | 243 | if (cmd != SIOCDELTUNNEL) { |
540 | err = -EINVAL; | 244 | p.i_flags |= GRE_KEY | VTI_ISVTI; |
245 | p.o_flags |= GRE_KEY; | ||
541 | } | 246 | } |
542 | 247 | ||
543 | done: | 248 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) |
544 | return err; | 249 | return -EFAULT; |
545 | } | ||
546 | |||
547 | static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu) | ||
548 | { | ||
549 | if (new_mtu < 68 || new_mtu > 0xFFF8) | ||
550 | return -EINVAL; | ||
551 | dev->mtu = new_mtu; | ||
552 | return 0; | 250 | return 0; |
553 | } | 251 | } |
554 | 252 | ||
555 | static const struct net_device_ops vti_netdev_ops = { | 253 | static const struct net_device_ops vti_netdev_ops = { |
556 | .ndo_init = vti_tunnel_init, | 254 | .ndo_init = vti_tunnel_init, |
557 | .ndo_uninit = vti_tunnel_uninit, | 255 | .ndo_uninit = ip_tunnel_uninit, |
558 | .ndo_start_xmit = vti_tunnel_xmit, | 256 | .ndo_start_xmit = vti_tunnel_xmit, |
559 | .ndo_do_ioctl = vti_tunnel_ioctl, | 257 | .ndo_do_ioctl = vti_tunnel_ioctl, |
560 | .ndo_change_mtu = vti_tunnel_change_mtu, | 258 | .ndo_change_mtu = ip_tunnel_change_mtu, |
561 | .ndo_get_stats64 = ip_tunnel_get_stats64, | 259 | .ndo_get_stats64 = ip_tunnel_get_stats64, |
562 | }; | 260 | }; |
563 | 261 | ||
564 | static void vti_dev_free(struct net_device *dev) | 262 | static void vti_tunnel_setup(struct net_device *dev) |
565 | { | 263 | { |
566 | free_percpu(dev->tstats); | 264 | dev->netdev_ops = &vti_netdev_ops; |
567 | free_netdev(dev); | 265 | ip_tunnel_setup(dev, vti_net_id); |
568 | } | 266 | } |
569 | 267 | ||
570 | static void vti_tunnel_setup(struct net_device *dev) | 268 | static int vti_tunnel_init(struct net_device *dev) |
571 | { | 269 | { |
572 | dev->netdev_ops = &vti_netdev_ops; | 270 | struct ip_tunnel *tunnel = netdev_priv(dev); |
573 | dev->destructor = vti_dev_free; | 271 | struct iphdr *iph = &tunnel->parms.iph; |
272 | |||
273 | memcpy(dev->dev_addr, &iph->saddr, 4); | ||
274 | memcpy(dev->broadcast, &iph->daddr, 4); | ||
574 | 275 | ||
575 | dev->type = ARPHRD_TUNNEL; | 276 | dev->type = ARPHRD_TUNNEL; |
576 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | 277 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); |
@@ -581,38 +282,18 @@ static void vti_tunnel_setup(struct net_device *dev) | |||
581 | dev->features |= NETIF_F_NETNS_LOCAL; | 282 | dev->features |= NETIF_F_NETNS_LOCAL; |
582 | dev->features |= NETIF_F_LLTX; | 283 | dev->features |= NETIF_F_LLTX; |
583 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 284 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
584 | } | ||
585 | 285 | ||
586 | static int vti_tunnel_init(struct net_device *dev) | 286 | return ip_tunnel_init(dev); |
587 | { | ||
588 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
589 | |||
590 | tunnel->dev = dev; | ||
591 | strcpy(tunnel->parms.name, dev->name); | ||
592 | |||
593 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | ||
594 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | ||
595 | |||
596 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
597 | if (!dev->tstats) | ||
598 | return -ENOMEM; | ||
599 | |||
600 | return 0; | ||
601 | } | 287 | } |
602 | 288 | ||
603 | static int __net_init vti_fb_tunnel_init(struct net_device *dev) | 289 | static void __net_init vti_fb_tunnel_init(struct net_device *dev) |
604 | { | 290 | { |
605 | struct ip_tunnel *tunnel = netdev_priv(dev); | 291 | struct ip_tunnel *tunnel = netdev_priv(dev); |
606 | struct iphdr *iph = &tunnel->parms.iph; | 292 | struct iphdr *iph = &tunnel->parms.iph; |
607 | struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id); | ||
608 | 293 | ||
609 | iph->version = 4; | 294 | iph->version = 4; |
610 | iph->protocol = IPPROTO_IPIP; | 295 | iph->protocol = IPPROTO_IPIP; |
611 | iph->ihl = 5; | 296 | iph->ihl = 5; |
612 | |||
613 | dev_hold(dev); | ||
614 | rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); | ||
615 | return 0; | ||
616 | } | 297 | } |
617 | 298 | ||
618 | static struct xfrm_tunnel vti_handler __read_mostly = { | 299 | static struct xfrm_tunnel vti_handler __read_mostly = { |
@@ -621,76 +302,30 @@ static struct xfrm_tunnel vti_handler __read_mostly = { | |||
621 | .priority = 1, | 302 | .priority = 1, |
622 | }; | 303 | }; |
623 | 304 | ||
624 | static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head) | ||
625 | { | ||
626 | int prio; | ||
627 | |||
628 | for (prio = 1; prio < 4; prio++) { | ||
629 | int h; | ||
630 | for (h = 0; h < HASH_SIZE; h++) { | ||
631 | struct ip_tunnel *t; | ||
632 | |||
633 | t = rtnl_dereference(ipn->tunnels[prio][h]); | ||
634 | while (t != NULL) { | ||
635 | unregister_netdevice_queue(t->dev, head); | ||
636 | t = rtnl_dereference(t->next); | ||
637 | } | ||
638 | } | ||
639 | } | ||
640 | } | ||
641 | |||
642 | static int __net_init vti_init_net(struct net *net) | 305 | static int __net_init vti_init_net(struct net *net) |
643 | { | 306 | { |
644 | int err; | 307 | int err; |
645 | struct vti_net *ipn = net_generic(net, vti_net_id); | 308 | struct ip_tunnel_net *itn; |
646 | |||
647 | ipn->tunnels[0] = ipn->tunnels_wc; | ||
648 | ipn->tunnels[1] = ipn->tunnels_l; | ||
649 | ipn->tunnels[2] = ipn->tunnels_r; | ||
650 | ipn->tunnels[3] = ipn->tunnels_r_l; | ||
651 | |||
652 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), | ||
653 | "ip_vti0", | ||
654 | vti_tunnel_setup); | ||
655 | if (!ipn->fb_tunnel_dev) { | ||
656 | err = -ENOMEM; | ||
657 | goto err_alloc_dev; | ||
658 | } | ||
659 | dev_net_set(ipn->fb_tunnel_dev, net); | ||
660 | |||
661 | err = vti_fb_tunnel_init(ipn->fb_tunnel_dev); | ||
662 | if (err) | ||
663 | goto err_reg_dev; | ||
664 | ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops; | ||
665 | 309 | ||
666 | err = register_netdev(ipn->fb_tunnel_dev); | 310 | err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); |
667 | if (err) | 311 | if (err) |
668 | goto err_reg_dev; | 312 | return err; |
313 | itn = net_generic(net, vti_net_id); | ||
314 | vti_fb_tunnel_init(itn->fb_tunnel_dev); | ||
669 | return 0; | 315 | return 0; |
670 | |||
671 | err_reg_dev: | ||
672 | vti_dev_free(ipn->fb_tunnel_dev); | ||
673 | err_alloc_dev: | ||
674 | /* nothing */ | ||
675 | return err; | ||
676 | } | 316 | } |
677 | 317 | ||
678 | static void __net_exit vti_exit_net(struct net *net) | 318 | static void __net_exit vti_exit_net(struct net *net) |
679 | { | 319 | { |
680 | struct vti_net *ipn = net_generic(net, vti_net_id); | 320 | struct ip_tunnel_net *itn = net_generic(net, vti_net_id); |
681 | LIST_HEAD(list); | 321 | ip_tunnel_delete_net(itn, &vti_link_ops); |
682 | |||
683 | rtnl_lock(); | ||
684 | vti_destroy_tunnels(ipn, &list); | ||
685 | unregister_netdevice_many(&list); | ||
686 | rtnl_unlock(); | ||
687 | } | 322 | } |
688 | 323 | ||
689 | static struct pernet_operations vti_net_ops = { | 324 | static struct pernet_operations vti_net_ops = { |
690 | .init = vti_init_net, | 325 | .init = vti_init_net, |
691 | .exit = vti_exit_net, | 326 | .exit = vti_exit_net, |
692 | .id = &vti_net_id, | 327 | .id = &vti_net_id, |
693 | .size = sizeof(struct vti_net), | 328 | .size = sizeof(struct ip_tunnel_net), |
694 | }; | 329 | }; |
695 | 330 | ||
696 | static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) | 331 | static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) |
@@ -728,78 +363,19 @@ static void vti_netlink_parms(struct nlattr *data[], | |||
728 | static int vti_newlink(struct net *src_net, struct net_device *dev, | 363 | static int vti_newlink(struct net *src_net, struct net_device *dev, |
729 | struct nlattr *tb[], struct nlattr *data[]) | 364 | struct nlattr *tb[], struct nlattr *data[]) |
730 | { | 365 | { |
731 | struct ip_tunnel *nt; | 366 | struct ip_tunnel_parm parms; |
732 | struct net *net = dev_net(dev); | ||
733 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
734 | int mtu; | ||
735 | int err; | ||
736 | |||
737 | nt = netdev_priv(dev); | ||
738 | vti_netlink_parms(data, &nt->parms); | ||
739 | |||
740 | if (vti_tunnel_locate(net, &nt->parms, 0)) | ||
741 | return -EEXIST; | ||
742 | 367 | ||
743 | mtu = vti_tunnel_bind_dev(dev); | 368 | vti_netlink_parms(data, &parms); |
744 | if (!tb[IFLA_MTU]) | 369 | return ip_tunnel_newlink(dev, tb, &parms); |
745 | dev->mtu = mtu; | ||
746 | |||
747 | err = register_netdevice(dev); | ||
748 | if (err) | ||
749 | goto out; | ||
750 | |||
751 | dev_hold(dev); | ||
752 | vti_tunnel_link(ipn, nt); | ||
753 | |||
754 | out: | ||
755 | return err; | ||
756 | } | 370 | } |
757 | 371 | ||
758 | static int vti_changelink(struct net_device *dev, struct nlattr *tb[], | 372 | static int vti_changelink(struct net_device *dev, struct nlattr *tb[], |
759 | struct nlattr *data[]) | 373 | struct nlattr *data[]) |
760 | { | 374 | { |
761 | struct ip_tunnel *t, *nt; | ||
762 | struct net *net = dev_net(dev); | ||
763 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
764 | struct ip_tunnel_parm p; | 375 | struct ip_tunnel_parm p; |
765 | int mtu; | ||
766 | |||
767 | if (dev == ipn->fb_tunnel_dev) | ||
768 | return -EINVAL; | ||
769 | 376 | ||
770 | nt = netdev_priv(dev); | ||
771 | vti_netlink_parms(data, &p); | 377 | vti_netlink_parms(data, &p); |
772 | 378 | return ip_tunnel_changelink(dev, tb, &p); | |
773 | t = vti_tunnel_locate(net, &p, 0); | ||
774 | |||
775 | if (t) { | ||
776 | if (t->dev != dev) | ||
777 | return -EEXIST; | ||
778 | } else { | ||
779 | t = nt; | ||
780 | |||
781 | vti_tunnel_unlink(ipn, t); | ||
782 | t->parms.iph.saddr = p.iph.saddr; | ||
783 | t->parms.iph.daddr = p.iph.daddr; | ||
784 | t->parms.i_key = p.i_key; | ||
785 | t->parms.o_key = p.o_key; | ||
786 | if (dev->type != ARPHRD_ETHER) { | ||
787 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
788 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
789 | } | ||
790 | vti_tunnel_link(ipn, t); | ||
791 | netdev_state_change(dev); | ||
792 | } | ||
793 | |||
794 | if (t->parms.link != p.link) { | ||
795 | t->parms.link = p.link; | ||
796 | mtu = vti_tunnel_bind_dev(dev); | ||
797 | if (!tb[IFLA_MTU]) | ||
798 | dev->mtu = mtu; | ||
799 | netdev_state_change(dev); | ||
800 | } | ||
801 | |||
802 | return 0; | ||
803 | } | 379 | } |
804 | 380 | ||
805 | static size_t vti_get_size(const struct net_device *dev) | 381 | static size_t vti_get_size(const struct net_device *dev) |
@@ -865,7 +441,7 @@ static int __init vti_init(void) | |||
865 | err = xfrm4_mode_tunnel_input_register(&vti_handler); | 441 | err = xfrm4_mode_tunnel_input_register(&vti_handler); |
866 | if (err < 0) { | 442 | if (err < 0) { |
867 | unregister_pernet_device(&vti_net_ops); | 443 | unregister_pernet_device(&vti_net_ops); |
868 | pr_info(KERN_INFO "vti init: can't register tunnel\n"); | 444 | pr_info("vti init: can't register tunnel\n"); |
869 | } | 445 | } |
870 | 446 | ||
871 | err = rtnl_link_register(&vti_link_ops); | 447 | err = rtnl_link_register(&vti_link_ops); |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 51fc2a1dcdd3..7f80fb4b82d3 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -190,15 +190,14 @@ static int ipip_rcv(struct sk_buff *skb) | |||
190 | struct ip_tunnel *tunnel; | 190 | struct ip_tunnel *tunnel; |
191 | const struct iphdr *iph; | 191 | const struct iphdr *iph; |
192 | 192 | ||
193 | if (iptunnel_pull_header(skb, 0, tpi.proto)) | ||
194 | goto drop; | ||
195 | |||
196 | iph = ip_hdr(skb); | 193 | iph = ip_hdr(skb); |
197 | tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, | 194 | tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, |
198 | iph->saddr, iph->daddr, 0); | 195 | iph->saddr, iph->daddr, 0); |
199 | if (tunnel) { | 196 | if (tunnel) { |
200 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) | 197 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) |
201 | goto drop; | 198 | goto drop; |
199 | if (iptunnel_pull_header(skb, 0, tpi.proto)) | ||
200 | goto drop; | ||
202 | return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); | 201 | return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); |
203 | } | 202 | } |
204 | 203 | ||
@@ -286,7 +285,6 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
286 | dev->flags = IFF_NOARP; | 285 | dev->flags = IFF_NOARP; |
287 | dev->iflink = 0; | 286 | dev->iflink = 0; |
288 | dev->addr_len = 4; | 287 | dev->addr_len = 4; |
289 | dev->features |= NETIF_F_NETNS_LOCAL; | ||
290 | dev->features |= NETIF_F_LLTX; | 288 | dev->features |= NETIF_F_LLTX; |
291 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 289 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
292 | 290 | ||
@@ -437,7 +435,7 @@ static int __net_init ipip_init_net(struct net *net) | |||
437 | static void __net_exit ipip_exit_net(struct net *net) | 435 | static void __net_exit ipip_exit_net(struct net *net) |
438 | { | 436 | { |
439 | struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); | 437 | struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); |
440 | ip_tunnel_delete_net(itn); | 438 | ip_tunnel_delete_net(itn, &ipip_link_ops); |
441 | } | 439 | } |
442 | 440 | ||
443 | static struct pernet_operations ipip_net_ops = { | 441 | static struct pernet_operations ipip_net_ops = { |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 132a09664704..9ae54b09254f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -127,9 +127,9 @@ static struct kmem_cache *mrt_cachep __read_mostly; | |||
127 | static struct mr_table *ipmr_new_table(struct net *net, u32 id); | 127 | static struct mr_table *ipmr_new_table(struct net *net, u32 id); |
128 | static void ipmr_free_table(struct mr_table *mrt); | 128 | static void ipmr_free_table(struct mr_table *mrt); |
129 | 129 | ||
130 | static int ip_mr_forward(struct net *net, struct mr_table *mrt, | 130 | static void ip_mr_forward(struct net *net, struct mr_table *mrt, |
131 | struct sk_buff *skb, struct mfc_cache *cache, | 131 | struct sk_buff *skb, struct mfc_cache *cache, |
132 | int local); | 132 | int local); |
133 | static int ipmr_cache_report(struct mr_table *mrt, | 133 | static int ipmr_cache_report(struct mr_table *mrt, |
134 | struct sk_buff *pkt, vifi_t vifi, int assert); | 134 | struct sk_buff *pkt, vifi_t vifi, int assert); |
135 | static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | 135 | static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, |
@@ -1795,9 +1795,9 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) | |||
1795 | 1795 | ||
1796 | /* "local" means that we should preserve one skb (for local delivery) */ | 1796 | /* "local" means that we should preserve one skb (for local delivery) */ |
1797 | 1797 | ||
1798 | static int ip_mr_forward(struct net *net, struct mr_table *mrt, | 1798 | static void ip_mr_forward(struct net *net, struct mr_table *mrt, |
1799 | struct sk_buff *skb, struct mfc_cache *cache, | 1799 | struct sk_buff *skb, struct mfc_cache *cache, |
1800 | int local) | 1800 | int local) |
1801 | { | 1801 | { |
1802 | int psend = -1; | 1802 | int psend = -1; |
1803 | int vif, ct; | 1803 | int vif, ct; |
@@ -1903,14 +1903,13 @@ last_forward: | |||
1903 | ipmr_queue_xmit(net, mrt, skb2, cache, psend); | 1903 | ipmr_queue_xmit(net, mrt, skb2, cache, psend); |
1904 | } else { | 1904 | } else { |
1905 | ipmr_queue_xmit(net, mrt, skb, cache, psend); | 1905 | ipmr_queue_xmit(net, mrt, skb, cache, psend); |
1906 | return 0; | 1906 | return; |
1907 | } | 1907 | } |
1908 | } | 1908 | } |
1909 | 1909 | ||
1910 | dont_forward: | 1910 | dont_forward: |
1911 | if (!local) | 1911 | if (!local) |
1912 | kfree_skb(skb); | 1912 | kfree_skb(skb); |
1913 | return 0; | ||
1914 | } | 1913 | } |
1915 | 1914 | ||
1916 | static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) | 1915 | static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) |
@@ -2068,9 +2067,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, | |||
2068 | skb_reset_network_header(skb); | 2067 | skb_reset_network_header(skb); |
2069 | skb->protocol = htons(ETH_P_IP); | 2068 | skb->protocol = htons(ETH_P_IP); |
2070 | skb->ip_summed = CHECKSUM_NONE; | 2069 | skb->ip_summed = CHECKSUM_NONE; |
2071 | skb->pkt_type = PACKET_HOST; | ||
2072 | 2070 | ||
2073 | skb_tunnel_rx(skb, reg_dev); | 2071 | skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); |
2074 | 2072 | ||
2075 | netif_rx(skb); | 2073 | netif_rx(skb); |
2076 | 2074 | ||
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4e9028017428..1657e39b291f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT | |||
110 | 110 | ||
111 | To compile it as a module, choose M here. If unsure, say N. | 111 | To compile it as a module, choose M here. If unsure, say N. |
112 | 112 | ||
113 | config IP_NF_TARGET_SYNPROXY | ||
114 | tristate "SYNPROXY target support" | ||
115 | depends on NF_CONNTRACK && NETFILTER_ADVANCED | ||
116 | select NETFILTER_SYNPROXY | ||
117 | select SYN_COOKIES | ||
118 | help | ||
119 | The SYNPROXY target allows you to intercept TCP connections and | ||
120 | establish them using syncookies before they are passed on to the | ||
121 | server. This allows to avoid conntrack and server resource usage | ||
122 | during SYN-flood attacks. | ||
123 | |||
124 | To compile it as a module, choose M here. If unsure, say N. | ||
125 | |||
113 | config IP_NF_TARGET_ULOG | 126 | config IP_NF_TARGET_ULOG |
114 | tristate "ULOG target support (obsolete)" | 127 | tristate "ULOG target support (obsolete)" |
115 | default m if NETFILTER_ADVANCED=n | 128 | default m if NETFILTER_ADVANCED=n |
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 007b128eecc9..3622b248b6dd 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o | |||
46 | obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o | 46 | obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o |
47 | obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o | 47 | obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o |
48 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o | 48 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o |
49 | obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o | ||
49 | obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o | 50 | obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o |
50 | 51 | ||
51 | # generic ARP tables | 52 | # generic ARP tables |
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index eadab1ed6500..a865f6f94013 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c | |||
@@ -48,7 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net) | |||
48 | net->ipv4.arptable_filter = | 48 | net->ipv4.arptable_filter = |
49 | arpt_register_table(net, &packet_filter, repl); | 49 | arpt_register_table(net, &packet_filter, repl); |
50 | kfree(repl); | 50 | kfree(repl); |
51 | return PTR_RET(net->ipv4.arptable_filter); | 51 | return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter); |
52 | } | 52 | } |
53 | 53 | ||
54 | static void __net_exit arptable_filter_net_exit(struct net *net) | 54 | static void __net_exit arptable_filter_net_exit(struct net *net) |
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 30e4de940567..00352ce0f0de 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
@@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this, | |||
118 | NF_CT_ASSERT(dev->ifindex != 0); | 118 | NF_CT_ASSERT(dev->ifindex != 0); |
119 | 119 | ||
120 | nf_ct_iterate_cleanup(net, device_cmp, | 120 | nf_ct_iterate_cleanup(net, device_cmp, |
121 | (void *)(long)dev->ifindex); | 121 | (void *)(long)dev->ifindex, 0, 0); |
122 | } | 122 | } |
123 | 123 | ||
124 | return NOTIFY_DONE; | 124 | return NOTIFY_DONE; |
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 04b18c1ac345..b969131ad1c1 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
@@ -119,7 +119,26 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
119 | 119 | ||
120 | nf_ct_attach(nskb, oldskb); | 120 | nf_ct_attach(nskb, oldskb); |
121 | 121 | ||
122 | ip_local_out(nskb); | 122 | #ifdef CONFIG_BRIDGE_NETFILTER |
123 | /* If we use ip_local_out for bridged traffic, the MAC source on | ||
124 | * the RST will be ours, instead of the destination's. This confuses | ||
125 | * some routers/firewalls, and they drop the packet. So we need to | ||
126 | * build the eth header using the original destination's MAC as the | ||
127 | * source, and send the RST packet directly. | ||
128 | */ | ||
129 | if (oldskb->nf_bridge) { | ||
130 | struct ethhdr *oeth = eth_hdr(oldskb); | ||
131 | nskb->dev = oldskb->nf_bridge->physindev; | ||
132 | niph->tot_len = htons(nskb->len); | ||
133 | ip_send_check(niph); | ||
134 | if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), | ||
135 | oeth->h_source, oeth->h_dest, nskb->len) < 0) | ||
136 | goto free_nskb; | ||
137 | dev_queue_xmit(nskb); | ||
138 | } else | ||
139 | #endif | ||
140 | ip_local_out(nskb); | ||
141 | |||
123 | return; | 142 | return; |
124 | 143 | ||
125 | free_nskb: | 144 | free_nskb: |
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c new file mode 100644 index 000000000000..67e17dcda65e --- /dev/null +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c | |||
@@ -0,0 +1,476 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | */ | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/skbuff.h> | ||
11 | #include <net/tcp.h> | ||
12 | |||
13 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
14 | #include <linux/netfilter/x_tables.h> | ||
15 | #include <linux/netfilter/xt_SYNPROXY.h> | ||
16 | #include <net/netfilter/nf_conntrack.h> | ||
17 | #include <net/netfilter/nf_conntrack_seqadj.h> | ||
18 | #include <net/netfilter/nf_conntrack_synproxy.h> | ||
19 | |||
20 | static struct iphdr * | ||
21 | synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) | ||
22 | { | ||
23 | struct iphdr *iph; | ||
24 | |||
25 | skb_reset_network_header(skb); | ||
26 | iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); | ||
27 | iph->version = 4; | ||
28 | iph->ihl = sizeof(*iph) / 4; | ||
29 | iph->tos = 0; | ||
30 | iph->id = 0; | ||
31 | iph->frag_off = htons(IP_DF); | ||
32 | iph->ttl = sysctl_ip_default_ttl; | ||
33 | iph->protocol = IPPROTO_TCP; | ||
34 | iph->check = 0; | ||
35 | iph->saddr = saddr; | ||
36 | iph->daddr = daddr; | ||
37 | |||
38 | return iph; | ||
39 | } | ||
40 | |||
41 | static void | ||
42 | synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, | ||
43 | struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, | ||
44 | struct iphdr *niph, struct tcphdr *nth, | ||
45 | unsigned int tcp_hdr_size) | ||
46 | { | ||
47 | nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); | ||
48 | nskb->ip_summed = CHECKSUM_PARTIAL; | ||
49 | nskb->csum_start = (unsigned char *)nth - nskb->head; | ||
50 | nskb->csum_offset = offsetof(struct tcphdr, check); | ||
51 | |||
52 | skb_dst_set_noref(nskb, skb_dst(skb)); | ||
53 | nskb->protocol = htons(ETH_P_IP); | ||
54 | if (ip_route_me_harder(nskb, RTN_UNSPEC)) | ||
55 | goto free_nskb; | ||
56 | |||
57 | if (nfct) { | ||
58 | nskb->nfct = nfct; | ||
59 | nskb->nfctinfo = ctinfo; | ||
60 | nf_conntrack_get(nfct); | ||
61 | } | ||
62 | |||
63 | ip_local_out(nskb); | ||
64 | return; | ||
65 | |||
66 | free_nskb: | ||
67 | kfree_skb(nskb); | ||
68 | } | ||
69 | |||
70 | static void | ||
71 | synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, | ||
72 | const struct synproxy_options *opts) | ||
73 | { | ||
74 | struct sk_buff *nskb; | ||
75 | struct iphdr *iph, *niph; | ||
76 | struct tcphdr *nth; | ||
77 | unsigned int tcp_hdr_size; | ||
78 | u16 mss = opts->mss; | ||
79 | |||
80 | iph = ip_hdr(skb); | ||
81 | |||
82 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
83 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
84 | GFP_ATOMIC); | ||
85 | if (nskb == NULL) | ||
86 | return; | ||
87 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
88 | |||
89 | niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); | ||
90 | |||
91 | skb_reset_transport_header(nskb); | ||
92 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
93 | nth->source = th->dest; | ||
94 | nth->dest = th->source; | ||
95 | nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); | ||
96 | nth->ack_seq = htonl(ntohl(th->seq) + 1); | ||
97 | tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; | ||
98 | if (opts->options & XT_SYNPROXY_OPT_ECN) | ||
99 | tcp_flag_word(nth) |= TCP_FLAG_ECE; | ||
100 | nth->doff = tcp_hdr_size / 4; | ||
101 | nth->window = 0; | ||
102 | nth->check = 0; | ||
103 | nth->urg_ptr = 0; | ||
104 | |||
105 | synproxy_build_options(nth, opts); | ||
106 | |||
107 | synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, | ||
108 | niph, nth, tcp_hdr_size); | ||
109 | } | ||
110 | |||
111 | static void | ||
112 | synproxy_send_server_syn(const struct synproxy_net *snet, | ||
113 | const struct sk_buff *skb, const struct tcphdr *th, | ||
114 | const struct synproxy_options *opts, u32 recv_seq) | ||
115 | { | ||
116 | struct sk_buff *nskb; | ||
117 | struct iphdr *iph, *niph; | ||
118 | struct tcphdr *nth; | ||
119 | unsigned int tcp_hdr_size; | ||
120 | |||
121 | iph = ip_hdr(skb); | ||
122 | |||
123 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
124 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
125 | GFP_ATOMIC); | ||
126 | if (nskb == NULL) | ||
127 | return; | ||
128 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
129 | |||
130 | niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); | ||
131 | |||
132 | skb_reset_transport_header(nskb); | ||
133 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
134 | nth->source = th->source; | ||
135 | nth->dest = th->dest; | ||
136 | nth->seq = htonl(recv_seq - 1); | ||
137 | /* ack_seq is used to relay our ISN to the synproxy hook to initialize | ||
138 | * sequence number translation once a connection tracking entry exists. | ||
139 | */ | ||
140 | nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); | ||
141 | tcp_flag_word(nth) = TCP_FLAG_SYN; | ||
142 | if (opts->options & XT_SYNPROXY_OPT_ECN) | ||
143 | tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; | ||
144 | nth->doff = tcp_hdr_size / 4; | ||
145 | nth->window = th->window; | ||
146 | nth->check = 0; | ||
147 | nth->urg_ptr = 0; | ||
148 | |||
149 | synproxy_build_options(nth, opts); | ||
150 | |||
151 | synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, | ||
152 | niph, nth, tcp_hdr_size); | ||
153 | } | ||
154 | |||
155 | static void | ||
156 | synproxy_send_server_ack(const struct synproxy_net *snet, | ||
157 | const struct ip_ct_tcp *state, | ||
158 | const struct sk_buff *skb, const struct tcphdr *th, | ||
159 | const struct synproxy_options *opts) | ||
160 | { | ||
161 | struct sk_buff *nskb; | ||
162 | struct iphdr *iph, *niph; | ||
163 | struct tcphdr *nth; | ||
164 | unsigned int tcp_hdr_size; | ||
165 | |||
166 | iph = ip_hdr(skb); | ||
167 | |||
168 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
169 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
170 | GFP_ATOMIC); | ||
171 | if (nskb == NULL) | ||
172 | return; | ||
173 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
174 | |||
175 | niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); | ||
176 | |||
177 | skb_reset_transport_header(nskb); | ||
178 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
179 | nth->source = th->dest; | ||
180 | nth->dest = th->source; | ||
181 | nth->seq = htonl(ntohl(th->ack_seq)); | ||
182 | nth->ack_seq = htonl(ntohl(th->seq) + 1); | ||
183 | tcp_flag_word(nth) = TCP_FLAG_ACK; | ||
184 | nth->doff = tcp_hdr_size / 4; | ||
185 | nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); | ||
186 | nth->check = 0; | ||
187 | nth->urg_ptr = 0; | ||
188 | |||
189 | synproxy_build_options(nth, opts); | ||
190 | |||
191 | synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); | ||
192 | } | ||
193 | |||
194 | static void | ||
195 | synproxy_send_client_ack(const struct synproxy_net *snet, | ||
196 | const struct sk_buff *skb, const struct tcphdr *th, | ||
197 | const struct synproxy_options *opts) | ||
198 | { | ||
199 | struct sk_buff *nskb; | ||
200 | struct iphdr *iph, *niph; | ||
201 | struct tcphdr *nth; | ||
202 | unsigned int tcp_hdr_size; | ||
203 | |||
204 | iph = ip_hdr(skb); | ||
205 | |||
206 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
207 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
208 | GFP_ATOMIC); | ||
209 | if (nskb == NULL) | ||
210 | return; | ||
211 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
212 | |||
213 | niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); | ||
214 | |||
215 | skb_reset_transport_header(nskb); | ||
216 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
217 | nth->source = th->source; | ||
218 | nth->dest = th->dest; | ||
219 | nth->seq = htonl(ntohl(th->seq) + 1); | ||
220 | nth->ack_seq = th->ack_seq; | ||
221 | tcp_flag_word(nth) = TCP_FLAG_ACK; | ||
222 | nth->doff = tcp_hdr_size / 4; | ||
223 | nth->window = ntohs(htons(th->window) >> opts->wscale); | ||
224 | nth->check = 0; | ||
225 | nth->urg_ptr = 0; | ||
226 | |||
227 | synproxy_build_options(nth, opts); | ||
228 | |||
229 | synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); | ||
230 | } | ||
231 | |||
232 | static bool | ||
233 | synproxy_recv_client_ack(const struct synproxy_net *snet, | ||
234 | const struct sk_buff *skb, const struct tcphdr *th, | ||
235 | struct synproxy_options *opts, u32 recv_seq) | ||
236 | { | ||
237 | int mss; | ||
238 | |||
239 | mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); | ||
240 | if (mss == 0) { | ||
241 | this_cpu_inc(snet->stats->cookie_invalid); | ||
242 | return false; | ||
243 | } | ||
244 | |||
245 | this_cpu_inc(snet->stats->cookie_valid); | ||
246 | opts->mss = mss; | ||
247 | |||
248 | if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
249 | synproxy_check_timestamp_cookie(opts); | ||
250 | |||
251 | synproxy_send_server_syn(snet, skb, th, opts, recv_seq); | ||
252 | return true; | ||
253 | } | ||
254 | |||
255 | static unsigned int | ||
256 | synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) | ||
257 | { | ||
258 | const struct xt_synproxy_info *info = par->targinfo; | ||
259 | struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); | ||
260 | struct synproxy_options opts = {}; | ||
261 | struct tcphdr *th, _th; | ||
262 | |||
263 | if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) | ||
264 | return NF_DROP; | ||
265 | |||
266 | th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); | ||
267 | if (th == NULL) | ||
268 | return NF_DROP; | ||
269 | |||
270 | synproxy_parse_options(skb, par->thoff, th, &opts); | ||
271 | |||
272 | if (th->syn && !(th->ack || th->fin || th->rst)) { | ||
273 | /* Initial SYN from client */ | ||
274 | this_cpu_inc(snet->stats->syn_received); | ||
275 | |||
276 | if (th->ece && th->cwr) | ||
277 | opts.options |= XT_SYNPROXY_OPT_ECN; | ||
278 | |||
279 | opts.options &= info->options; | ||
280 | if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
281 | synproxy_init_timestamp_cookie(info, &opts); | ||
282 | else | ||
283 | opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | | ||
284 | XT_SYNPROXY_OPT_SACK_PERM | | ||
285 | XT_SYNPROXY_OPT_ECN); | ||
286 | |||
287 | synproxy_send_client_synack(skb, th, &opts); | ||
288 | return NF_DROP; | ||
289 | |||
290 | } else if (th->ack && !(th->fin || th->rst || th->syn)) { | ||
291 | /* ACK from client */ | ||
292 | synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); | ||
293 | return NF_DROP; | ||
294 | } | ||
295 | |||
296 | return XT_CONTINUE; | ||
297 | } | ||
298 | |||
299 | static unsigned int ipv4_synproxy_hook(unsigned int hooknum, | ||
300 | struct sk_buff *skb, | ||
301 | const struct net_device *in, | ||
302 | const struct net_device *out, | ||
303 | int (*okfn)(struct sk_buff *)) | ||
304 | { | ||
305 | struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); | ||
306 | enum ip_conntrack_info ctinfo; | ||
307 | struct nf_conn *ct; | ||
308 | struct nf_conn_synproxy *synproxy; | ||
309 | struct synproxy_options opts = {}; | ||
310 | const struct ip_ct_tcp *state; | ||
311 | struct tcphdr *th, _th; | ||
312 | unsigned int thoff; | ||
313 | |||
314 | ct = nf_ct_get(skb, &ctinfo); | ||
315 | if (ct == NULL) | ||
316 | return NF_ACCEPT; | ||
317 | |||
318 | synproxy = nfct_synproxy(ct); | ||
319 | if (synproxy == NULL) | ||
320 | return NF_ACCEPT; | ||
321 | |||
322 | if (nf_is_loopback_packet(skb)) | ||
323 | return NF_ACCEPT; | ||
324 | |||
325 | thoff = ip_hdrlen(skb); | ||
326 | th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); | ||
327 | if (th == NULL) | ||
328 | return NF_DROP; | ||
329 | |||
330 | state = &ct->proto.tcp; | ||
331 | switch (state->state) { | ||
332 | case TCP_CONNTRACK_CLOSE: | ||
333 | if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { | ||
334 | nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - | ||
335 | ntohl(th->seq) + 1); | ||
336 | break; | ||
337 | } | ||
338 | |||
339 | if (!th->syn || th->ack || | ||
340 | CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) | ||
341 | break; | ||
342 | |||
343 | /* Reopened connection - reset the sequence number and timestamp | ||
344 | * adjustments, they will get initialized once the connection is | ||
345 | * reestablished. | ||
346 | */ | ||
347 | nf_ct_seqadj_init(ct, ctinfo, 0); | ||
348 | synproxy->tsoff = 0; | ||
349 | this_cpu_inc(snet->stats->conn_reopened); | ||
350 | |||
351 | /* fall through */ | ||
352 | case TCP_CONNTRACK_SYN_SENT: | ||
353 | synproxy_parse_options(skb, thoff, th, &opts); | ||
354 | |||
355 | if (!th->syn && th->ack && | ||
356 | CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { | ||
357 | /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, | ||
358 | * therefore we need to add 1 to make the SYN sequence | ||
359 | * number match the one of first SYN. | ||
360 | */ | ||
361 | if (synproxy_recv_client_ack(snet, skb, th, &opts, | ||
362 | ntohl(th->seq) + 1)) | ||
363 | this_cpu_inc(snet->stats->cookie_retrans); | ||
364 | |||
365 | return NF_DROP; | ||
366 | } | ||
367 | |||
368 | synproxy->isn = ntohl(th->ack_seq); | ||
369 | if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
370 | synproxy->its = opts.tsecr; | ||
371 | break; | ||
372 | case TCP_CONNTRACK_SYN_RECV: | ||
373 | if (!th->syn || !th->ack) | ||
374 | break; | ||
375 | |||
376 | synproxy_parse_options(skb, thoff, th, &opts); | ||
377 | if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
378 | synproxy->tsoff = opts.tsval - synproxy->its; | ||
379 | |||
380 | opts.options &= ~(XT_SYNPROXY_OPT_MSS | | ||
381 | XT_SYNPROXY_OPT_WSCALE | | ||
382 | XT_SYNPROXY_OPT_SACK_PERM); | ||
383 | |||
384 | swap(opts.tsval, opts.tsecr); | ||
385 | synproxy_send_server_ack(snet, state, skb, th, &opts); | ||
386 | |||
387 | nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); | ||
388 | |||
389 | swap(opts.tsval, opts.tsecr); | ||
390 | synproxy_send_client_ack(snet, skb, th, &opts); | ||
391 | |||
392 | consume_skb(skb); | ||
393 | return NF_STOLEN; | ||
394 | default: | ||
395 | break; | ||
396 | } | ||
397 | |||
398 | synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); | ||
399 | return NF_ACCEPT; | ||
400 | } | ||
401 | |||
402 | static int synproxy_tg4_check(const struct xt_tgchk_param *par) | ||
403 | { | ||
404 | const struct ipt_entry *e = par->entryinfo; | ||
405 | |||
406 | if (e->ip.proto != IPPROTO_TCP || | ||
407 | e->ip.invflags & XT_INV_PROTO) | ||
408 | return -EINVAL; | ||
409 | |||
410 | return nf_ct_l3proto_try_module_get(par->family); | ||
411 | } | ||
412 | |||
413 | static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) | ||
414 | { | ||
415 | nf_ct_l3proto_module_put(par->family); | ||
416 | } | ||
417 | |||
418 | static struct xt_target synproxy_tg4_reg __read_mostly = { | ||
419 | .name = "SYNPROXY", | ||
420 | .family = NFPROTO_IPV4, | ||
421 | .target = synproxy_tg4, | ||
422 | .targetsize = sizeof(struct xt_synproxy_info), | ||
423 | .checkentry = synproxy_tg4_check, | ||
424 | .destroy = synproxy_tg4_destroy, | ||
425 | .me = THIS_MODULE, | ||
426 | }; | ||
427 | |||
428 | static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { | ||
429 | { | ||
430 | .hook = ipv4_synproxy_hook, | ||
431 | .owner = THIS_MODULE, | ||
432 | .pf = NFPROTO_IPV4, | ||
433 | .hooknum = NF_INET_LOCAL_IN, | ||
434 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, | ||
435 | }, | ||
436 | { | ||
437 | .hook = ipv4_synproxy_hook, | ||
438 | .owner = THIS_MODULE, | ||
439 | .pf = NFPROTO_IPV4, | ||
440 | .hooknum = NF_INET_POST_ROUTING, | ||
441 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, | ||
442 | }, | ||
443 | }; | ||
444 | |||
445 | static int __init synproxy_tg4_init(void) | ||
446 | { | ||
447 | int err; | ||
448 | |||
449 | err = nf_register_hooks(ipv4_synproxy_ops, | ||
450 | ARRAY_SIZE(ipv4_synproxy_ops)); | ||
451 | if (err < 0) | ||
452 | goto err1; | ||
453 | |||
454 | err = xt_register_target(&synproxy_tg4_reg); | ||
455 | if (err < 0) | ||
456 | goto err2; | ||
457 | |||
458 | return 0; | ||
459 | |||
460 | err2: | ||
461 | nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); | ||
462 | err1: | ||
463 | return err; | ||
464 | } | ||
465 | |||
466 | static void __exit synproxy_tg4_exit(void) | ||
467 | { | ||
468 | xt_unregister_target(&synproxy_tg4_reg); | ||
469 | nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); | ||
470 | } | ||
471 | |||
472 | module_init(synproxy_tg4_init); | ||
473 | module_exit(synproxy_tg4_exit); | ||
474 | |||
475 | MODULE_LICENSE("GPL"); | ||
476 | MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); | ||
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 6b3da5cf54e9..50af5b45c050 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c | |||
@@ -69,7 +69,7 @@ static int __net_init iptable_filter_net_init(struct net *net) | |||
69 | net->ipv4.iptable_filter = | 69 | net->ipv4.iptable_filter = |
70 | ipt_register_table(net, &packet_filter, repl); | 70 | ipt_register_table(net, &packet_filter, repl); |
71 | kfree(repl); | 71 | kfree(repl); |
72 | return PTR_RET(net->ipv4.iptable_filter); | 72 | return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); |
73 | } | 73 | } |
74 | 74 | ||
75 | static void __net_exit iptable_filter_net_exit(struct net *net) | 75 | static void __net_exit iptable_filter_net_exit(struct net *net) |
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index cba5658ec82c..0d8cd82e0fad 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c | |||
@@ -107,7 +107,7 @@ static int __net_init iptable_mangle_net_init(struct net *net) | |||
107 | net->ipv4.iptable_mangle = | 107 | net->ipv4.iptable_mangle = |
108 | ipt_register_table(net, &packet_mangler, repl); | 108 | ipt_register_table(net, &packet_mangler, repl); |
109 | kfree(repl); | 109 | kfree(repl); |
110 | return PTR_RET(net->ipv4.iptable_mangle); | 110 | return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle); |
111 | } | 111 | } |
112 | 112 | ||
113 | static void __net_exit iptable_mangle_net_exit(struct net *net) | 113 | static void __net_exit iptable_mangle_net_exit(struct net *net) |
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 6383273d54e1..683bfaffed65 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c | |||
@@ -292,7 +292,7 @@ static int __net_init iptable_nat_net_init(struct net *net) | |||
292 | return -ENOMEM; | 292 | return -ENOMEM; |
293 | net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); | 293 | net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); |
294 | kfree(repl); | 294 | kfree(repl); |
295 | return PTR_RET(net->ipv4.nat_table); | 295 | return PTR_ERR_OR_ZERO(net->ipv4.nat_table); |
296 | } | 296 | } |
297 | 297 | ||
298 | static void __net_exit iptable_nat_net_exit(struct net *net) | 298 | static void __net_exit iptable_nat_net_exit(struct net *net) |
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 03d9696d3c6e..1f82aea11df6 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c | |||
@@ -48,7 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net) | |||
48 | net->ipv4.iptable_raw = | 48 | net->ipv4.iptable_raw = |
49 | ipt_register_table(net, &packet_raw, repl); | 49 | ipt_register_table(net, &packet_raw, repl); |
50 | kfree(repl); | 50 | kfree(repl); |
51 | return PTR_RET(net->ipv4.iptable_raw); | 51 | return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw); |
52 | } | 52 | } |
53 | 53 | ||
54 | static void __net_exit iptable_raw_net_exit(struct net *net) | 54 | static void __net_exit iptable_raw_net_exit(struct net *net) |
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index b283d8e2601a..f867a8d38bf7 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c | |||
@@ -66,7 +66,7 @@ static int __net_init iptable_security_net_init(struct net *net) | |||
66 | net->ipv4.iptable_security = | 66 | net->ipv4.iptable_security = |
67 | ipt_register_table(net, &security_table, repl); | 67 | ipt_register_table(net, &security_table, repl); |
68 | kfree(repl); | 68 | kfree(repl); |
69 | return PTR_RET(net->ipv4.iptable_security); | 69 | return PTR_ERR_OR_ZERO(net->ipv4.iptable_security); |
70 | } | 70 | } |
71 | 71 | ||
72 | static void __net_exit iptable_security_net_exit(struct net *net) | 72 | static void __net_exit iptable_security_net_exit(struct net *net) |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 0a2e0e3e95ba..86f5b34a4ed1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <net/netfilter/nf_conntrack_l3proto.h> | 25 | #include <net/netfilter/nf_conntrack_l3proto.h> |
26 | #include <net/netfilter/nf_conntrack_zones.h> | 26 | #include <net/netfilter/nf_conntrack_zones.h> |
27 | #include <net/netfilter/nf_conntrack_core.h> | 27 | #include <net/netfilter/nf_conntrack_core.h> |
28 | #include <net/netfilter/nf_conntrack_seqadj.h> | ||
28 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> | 29 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> |
29 | #include <net/netfilter/nf_nat_helper.h> | 30 | #include <net/netfilter/nf_nat_helper.h> |
30 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> | 31 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> |
@@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, | |||
136 | /* adjust seqs for loopback traffic only in outgoing direction */ | 137 | /* adjust seqs for loopback traffic only in outgoing direction */ |
137 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && | 138 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && |
138 | !nf_is_loopback_packet(skb)) { | 139 | !nf_is_loopback_packet(skb)) { |
139 | typeof(nf_nat_seq_adjust_hook) seq_adjust; | 140 | if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { |
140 | |||
141 | seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); | ||
142 | if (!seq_adjust || | ||
143 | !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { | ||
144 | NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); | 141 | NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); |
145 | return NF_DROP; | 142 | return NF_DROP; |
146 | } | 143 | } |
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 746427c9e719..d7d9882d4cae 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c | |||
@@ -1082,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, | |||
1082 | __u16 srcp = ntohs(inet->inet_sport); | 1082 | __u16 srcp = ntohs(inet->inet_sport); |
1083 | 1083 | ||
1084 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" | 1084 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" |
1085 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", | 1085 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", |
1086 | bucket, src, srcp, dest, destp, sp->sk_state, | 1086 | bucket, src, srcp, dest, destp, sp->sk_state, |
1087 | sk_wmem_alloc_get(sp), | 1087 | sk_wmem_alloc_get(sp), |
1088 | sk_rmem_alloc_get(sp), | 1088 | sk_rmem_alloc_get(sp), |
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6577a1149a47..4a0335854b89 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { | |||
111 | SNMP_MIB_SENTINEL | 111 | SNMP_MIB_SENTINEL |
112 | }; | 112 | }; |
113 | 113 | ||
114 | /* Following RFC4293 items are displayed in /proc/net/netstat */ | 114 | /* Following items are displayed in /proc/net/netstat */ |
115 | static const struct snmp_mib snmp4_ipextstats_list[] = { | 115 | static const struct snmp_mib snmp4_ipextstats_list[] = { |
116 | SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), | 116 | SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), |
117 | SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), | 117 | SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), |
@@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { | |||
125 | SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), | 125 | SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), |
126 | SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), | 126 | SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), |
127 | SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), | 127 | SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), |
128 | /* Non RFC4293 fields */ | ||
128 | SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), | 129 | SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), |
130 | SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), | ||
131 | SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), | ||
132 | SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), | ||
133 | SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), | ||
129 | SNMP_MIB_SENTINEL | 134 | SNMP_MIB_SENTINEL |
130 | }; | 135 | }; |
131 | 136 | ||
@@ -273,7 +278,7 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
273 | SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), | 278 | SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), |
274 | SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), | 279 | SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), |
275 | SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), | 280 | SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), |
276 | SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), | 281 | SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), |
277 | SNMP_MIB_SENTINEL | 282 | SNMP_MIB_SENTINEL |
278 | }; | 283 | }; |
279 | 284 | ||
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dd44e0ab600c..a86c7ae71881 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -571,7 +571,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
571 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, | 571 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, |
572 | RT_SCOPE_UNIVERSE, | 572 | RT_SCOPE_UNIVERSE, |
573 | inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, | 573 | inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, |
574 | inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, | 574 | inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | |
575 | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), | ||
575 | daddr, saddr, 0, 0); | 576 | daddr, saddr, 0, 0); |
576 | 577 | ||
577 | if (!inet->hdrincl) { | 578 | if (!inet->hdrincl) { |
@@ -987,7 +988,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) | |||
987 | srcp = inet->inet_num; | 988 | srcp = inet->inet_num; |
988 | 989 | ||
989 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" | 990 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" |
990 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", | 991 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", |
991 | i, src, srcp, dest, destp, sp->sk_state, | 992 | i, src, srcp, dest, destp, sp->sk_state, |
992 | sk_wmem_alloc_get(sp), | 993 | sk_wmem_alloc_get(sp), |
993 | sk_rmem_alloc_get(sp), | 994 | sk_rmem_alloc_get(sp), |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a9a54a236832..727f4365bcdf 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -112,7 +112,8 @@ | |||
112 | #define RT_FL_TOS(oldflp4) \ | 112 | #define RT_FL_TOS(oldflp4) \ |
113 | ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) | 113 | ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) |
114 | 114 | ||
115 | #define IP_MAX_MTU 0xFFF0 | 115 | /* IPv4 datagram length is stored into 16bit field (tot_len) */ |
116 | #define IP_MAX_MTU 0xFFFF | ||
116 | 117 | ||
117 | #define RT_GC_TIMEOUT (300*HZ) | 118 | #define RT_GC_TIMEOUT (300*HZ) |
118 | 119 | ||
@@ -435,12 +436,12 @@ static inline int ip_rt_proc_init(void) | |||
435 | 436 | ||
436 | static inline bool rt_is_expired(const struct rtable *rth) | 437 | static inline bool rt_is_expired(const struct rtable *rth) |
437 | { | 438 | { |
438 | return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); | 439 | return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); |
439 | } | 440 | } |
440 | 441 | ||
441 | void rt_cache_flush(struct net *net) | 442 | void rt_cache_flush(struct net *net) |
442 | { | 443 | { |
443 | rt_genid_bump(net); | 444 | rt_genid_bump_ipv4(net); |
444 | } | 445 | } |
445 | 446 | ||
446 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, | 447 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, |
@@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) | |||
1227 | mtu = 576; | 1228 | mtu = 576; |
1228 | } | 1229 | } |
1229 | 1230 | ||
1230 | if (mtu > IP_MAX_MTU) | 1231 | return min_t(unsigned int, mtu, IP_MAX_MTU); |
1231 | mtu = IP_MAX_MTU; | ||
1232 | |||
1233 | return mtu; | ||
1234 | } | 1232 | } |
1235 | 1233 | ||
1236 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) | 1234 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) |
@@ -1458,7 +1456,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
1458 | #endif | 1456 | #endif |
1459 | rth->dst.output = ip_rt_bug; | 1457 | rth->dst.output = ip_rt_bug; |
1460 | 1458 | ||
1461 | rth->rt_genid = rt_genid(dev_net(dev)); | 1459 | rth->rt_genid = rt_genid_ipv4(dev_net(dev)); |
1462 | rth->rt_flags = RTCF_MULTICAST; | 1460 | rth->rt_flags = RTCF_MULTICAST; |
1463 | rth->rt_type = RTN_MULTICAST; | 1461 | rth->rt_type = RTN_MULTICAST; |
1464 | rth->rt_is_input= 1; | 1462 | rth->rt_is_input= 1; |
@@ -1589,7 +1587,7 @@ static int __mkroute_input(struct sk_buff *skb, | |||
1589 | goto cleanup; | 1587 | goto cleanup; |
1590 | } | 1588 | } |
1591 | 1589 | ||
1592 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); | 1590 | rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); |
1593 | rth->rt_flags = flags; | 1591 | rth->rt_flags = flags; |
1594 | rth->rt_type = res->type; | 1592 | rth->rt_type = res->type; |
1595 | rth->rt_is_input = 1; | 1593 | rth->rt_is_input = 1; |
@@ -1760,7 +1758,7 @@ local_input: | |||
1760 | rth->dst.tclassid = itag; | 1758 | rth->dst.tclassid = itag; |
1761 | #endif | 1759 | #endif |
1762 | 1760 | ||
1763 | rth->rt_genid = rt_genid(net); | 1761 | rth->rt_genid = rt_genid_ipv4(net); |
1764 | rth->rt_flags = flags|RTCF_LOCAL; | 1762 | rth->rt_flags = flags|RTCF_LOCAL; |
1765 | rth->rt_type = res.type; | 1763 | rth->rt_type = res.type; |
1766 | rth->rt_is_input = 1; | 1764 | rth->rt_is_input = 1; |
@@ -1945,7 +1943,7 @@ add: | |||
1945 | 1943 | ||
1946 | rth->dst.output = ip_output; | 1944 | rth->dst.output = ip_output; |
1947 | 1945 | ||
1948 | rth->rt_genid = rt_genid(dev_net(dev_out)); | 1946 | rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); |
1949 | rth->rt_flags = flags; | 1947 | rth->rt_flags = flags; |
1950 | rth->rt_type = type; | 1948 | rth->rt_type = type; |
1951 | rth->rt_is_input = 0; | 1949 | rth->rt_is_input = 0; |
@@ -2227,7 +2225,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or | |||
2227 | rt->rt_iif = ort->rt_iif; | 2225 | rt->rt_iif = ort->rt_iif; |
2228 | rt->rt_pmtu = ort->rt_pmtu; | 2226 | rt->rt_pmtu = ort->rt_pmtu; |
2229 | 2227 | ||
2230 | rt->rt_genid = rt_genid(net); | 2228 | rt->rt_genid = rt_genid_ipv4(net); |
2231 | rt->rt_flags = ort->rt_flags; | 2229 | rt->rt_flags = ort->rt_flags; |
2232 | rt->rt_type = ort->rt_type; | 2230 | rt->rt_type = ort->rt_type; |
2233 | rt->rt_gateway = ort->rt_gateway; | 2231 | rt->rt_gateway = ort->rt_gateway; |
@@ -2665,7 +2663,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { | |||
2665 | 2663 | ||
2666 | static __net_init int rt_genid_init(struct net *net) | 2664 | static __net_init int rt_genid_init(struct net *net) |
2667 | { | 2665 | { |
2668 | atomic_set(&net->rt_genid, 0); | 2666 | atomic_set(&net->ipv4.rt_genid, 0); |
2669 | atomic_set(&net->fnhe_genid, 0); | 2667 | atomic_set(&net->fnhe_genid, 0); |
2670 | get_random_bytes(&net->ipv4.dev_addr_genid, | 2668 | get_random_bytes(&net->ipv4.dev_addr_genid, |
2671 | sizeof(net->ipv4.dev_addr_genid)); | 2669 | sizeof(net->ipv4.dev_addr_genid)); |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b05c96e7af8b..14a15c49129d 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -160,26 +160,33 @@ static __u16 const msstab[] = { | |||
160 | * Generate a syncookie. mssp points to the mss, which is returned | 160 | * Generate a syncookie. mssp points to the mss, which is returned |
161 | * rounded down to the value encoded in the cookie. | 161 | * rounded down to the value encoded in the cookie. |
162 | */ | 162 | */ |
163 | __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) | 163 | u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, |
164 | u16 *mssp) | ||
164 | { | 165 | { |
165 | const struct iphdr *iph = ip_hdr(skb); | ||
166 | const struct tcphdr *th = tcp_hdr(skb); | ||
167 | int mssind; | 166 | int mssind; |
168 | const __u16 mss = *mssp; | 167 | const __u16 mss = *mssp; |
169 | 168 | ||
170 | tcp_synq_overflow(sk); | ||
171 | |||
172 | for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) | 169 | for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) |
173 | if (mss >= msstab[mssind]) | 170 | if (mss >= msstab[mssind]) |
174 | break; | 171 | break; |
175 | *mssp = msstab[mssind]; | 172 | *mssp = msstab[mssind]; |
176 | 173 | ||
177 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); | ||
178 | |||
179 | return secure_tcp_syn_cookie(iph->saddr, iph->daddr, | 174 | return secure_tcp_syn_cookie(iph->saddr, iph->daddr, |
180 | th->source, th->dest, ntohl(th->seq), | 175 | th->source, th->dest, ntohl(th->seq), |
181 | jiffies / (HZ * 60), mssind); | 176 | jiffies / (HZ * 60), mssind); |
182 | } | 177 | } |
178 | EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); | ||
179 | |||
180 | __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) | ||
181 | { | ||
182 | const struct iphdr *iph = ip_hdr(skb); | ||
183 | const struct tcphdr *th = tcp_hdr(skb); | ||
184 | |||
185 | tcp_synq_overflow(sk); | ||
186 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); | ||
187 | |||
188 | return __cookie_v4_init_sequence(iph, th, mssp); | ||
189 | } | ||
183 | 190 | ||
184 | /* | 191 | /* |
185 | * This (misnamed) value is the age of syncookie which is permitted. | 192 | * This (misnamed) value is the age of syncookie which is permitted. |
@@ -192,10 +199,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) | |||
192 | * Check if a ack sequence number is a valid syncookie. | 199 | * Check if a ack sequence number is a valid syncookie. |
193 | * Return the decoded mss if it is, or 0 if not. | 200 | * Return the decoded mss if it is, or 0 if not. |
194 | */ | 201 | */ |
195 | static inline int cookie_check(struct sk_buff *skb, __u32 cookie) | 202 | int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, |
203 | u32 cookie) | ||
196 | { | 204 | { |
197 | const struct iphdr *iph = ip_hdr(skb); | ||
198 | const struct tcphdr *th = tcp_hdr(skb); | ||
199 | __u32 seq = ntohl(th->seq) - 1; | 205 | __u32 seq = ntohl(th->seq) - 1; |
200 | __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, | 206 | __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, |
201 | th->source, th->dest, seq, | 207 | th->source, th->dest, seq, |
@@ -204,6 +210,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) | |||
204 | 210 | ||
205 | return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; | 211 | return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; |
206 | } | 212 | } |
213 | EXPORT_SYMBOL_GPL(__cookie_v4_check); | ||
207 | 214 | ||
208 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, | 215 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, |
209 | struct request_sock *req, | 216 | struct request_sock *req, |
@@ -284,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
284 | goto out; | 291 | goto out; |
285 | 292 | ||
286 | if (tcp_synq_no_recent_overflow(sk) || | 293 | if (tcp_synq_no_recent_overflow(sk) || |
287 | (mss = cookie_check(skb, cookie)) == 0) { | 294 | (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { |
288 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); | 295 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); |
289 | goto out; | 296 | goto out; |
290 | } | 297 | } |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 610e324348d1..540279f4c531 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -29,6 +29,7 @@ | |||
29 | static int zero; | 29 | static int zero; |
30 | static int one = 1; | 30 | static int one = 1; |
31 | static int four = 4; | 31 | static int four = 4; |
32 | static int gso_max_segs = GSO_MAX_SEGS; | ||
32 | static int tcp_retr1_max = 255; | 33 | static int tcp_retr1_max = 255; |
33 | static int ip_local_port_range_min[] = { 1, 1 }; | 34 | static int ip_local_port_range_min[] = { 1, 1 }; |
34 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 35 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
@@ -559,6 +560,13 @@ static struct ctl_table ipv4_table[] = { | |||
559 | .extra1 = &one, | 560 | .extra1 = &one, |
560 | }, | 561 | }, |
561 | { | 562 | { |
563 | .procname = "tcp_notsent_lowat", | ||
564 | .data = &sysctl_tcp_notsent_lowat, | ||
565 | .maxlen = sizeof(sysctl_tcp_notsent_lowat), | ||
566 | .mode = 0644, | ||
567 | .proc_handler = proc_dointvec, | ||
568 | }, | ||
569 | { | ||
562 | .procname = "tcp_rmem", | 570 | .procname = "tcp_rmem", |
563 | .data = &sysctl_tcp_rmem, | 571 | .data = &sysctl_tcp_rmem, |
564 | .maxlen = sizeof(sysctl_tcp_rmem), | 572 | .maxlen = sizeof(sysctl_tcp_rmem), |
@@ -754,6 +762,15 @@ static struct ctl_table ipv4_table[] = { | |||
754 | .extra2 = &four, | 762 | .extra2 = &four, |
755 | }, | 763 | }, |
756 | { | 764 | { |
765 | .procname = "tcp_min_tso_segs", | ||
766 | .data = &sysctl_tcp_min_tso_segs, | ||
767 | .maxlen = sizeof(int), | ||
768 | .mode = 0644, | ||
769 | .proc_handler = proc_dointvec_minmax, | ||
770 | .extra1 = &zero, | ||
771 | .extra2 = &gso_max_segs, | ||
772 | }, | ||
773 | { | ||
757 | .procname = "udp_mem", | 774 | .procname = "udp_mem", |
758 | .data = &sysctl_udp_mem, | 775 | .data = &sysctl_udp_mem, |
759 | .maxlen = sizeof(sysctl_udp_mem), | 776 | .maxlen = sizeof(sysctl_udp_mem), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5423223e93c2..6e5617b9f9db 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -283,6 +283,8 @@ | |||
283 | 283 | ||
284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | 284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; |
285 | 285 | ||
286 | int sysctl_tcp_min_tso_segs __read_mostly = 2; | ||
287 | |||
286 | struct percpu_counter tcp_orphan_count; | 288 | struct percpu_counter tcp_orphan_count; |
287 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 289 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
288 | 290 | ||
@@ -410,10 +412,6 @@ void tcp_init_sock(struct sock *sk) | |||
410 | 412 | ||
411 | icsk->icsk_sync_mss = tcp_sync_mss; | 413 | icsk->icsk_sync_mss = tcp_sync_mss; |
412 | 414 | ||
413 | /* Presumed zeroed, in order of appearance: | ||
414 | * cookie_in_always, cookie_out_never, | ||
415 | * s_data_constant, s_data_in, s_data_out | ||
416 | */ | ||
417 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; | 415 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; |
418 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; | 416 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; |
419 | 417 | ||
@@ -499,7 +497,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
499 | mask |= POLLIN | POLLRDNORM; | 497 | mask |= POLLIN | POLLRDNORM; |
500 | 498 | ||
501 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { | 499 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { |
502 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { | 500 | if (sk_stream_is_writeable(sk)) { |
503 | mask |= POLLOUT | POLLWRNORM; | 501 | mask |= POLLOUT | POLLWRNORM; |
504 | } else { /* send SIGIO later */ | 502 | } else { /* send SIGIO later */ |
505 | set_bit(SOCK_ASYNC_NOSPACE, | 503 | set_bit(SOCK_ASYNC_NOSPACE, |
@@ -510,7 +508,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
510 | * wspace test but before the flags are set, | 508 | * wspace test but before the flags are set, |
511 | * IO signal will be lost. | 509 | * IO signal will be lost. |
512 | */ | 510 | */ |
513 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) | 511 | if (sk_stream_is_writeable(sk)) |
514 | mask |= POLLOUT | POLLWRNORM; | 512 | mask |= POLLOUT | POLLWRNORM; |
515 | } | 513 | } |
516 | } else | 514 | } else |
@@ -789,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
789 | xmit_size_goal = mss_now; | 787 | xmit_size_goal = mss_now; |
790 | 788 | ||
791 | if (large_allowed && sk_can_gso(sk)) { | 789 | if (large_allowed && sk_can_gso(sk)) { |
792 | xmit_size_goal = ((sk->sk_gso_max_size - 1) - | 790 | u32 gso_size, hlen; |
793 | inet_csk(sk)->icsk_af_ops->net_header_len - | 791 | |
794 | inet_csk(sk)->icsk_ext_hdr_len - | 792 | /* Maybe we should/could use sk->sk_prot->max_header here ? */ |
795 | tp->tcp_header_len); | 793 | hlen = inet_csk(sk)->icsk_af_ops->net_header_len + |
794 | inet_csk(sk)->icsk_ext_hdr_len + | ||
795 | tp->tcp_header_len; | ||
796 | |||
797 | /* Goal is to send at least one packet per ms, | ||
798 | * not one big TSO packet every 100 ms. | ||
799 | * This preserves ACK clocking and is consistent | ||
800 | * with tcp_tso_should_defer() heuristic. | ||
801 | */ | ||
802 | gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); | ||
803 | gso_size = max_t(u32, gso_size, | ||
804 | sysctl_tcp_min_tso_segs * mss_now); | ||
805 | |||
806 | xmit_size_goal = min_t(u32, gso_size, | ||
807 | sk->sk_gso_max_size - 1 - hlen); | ||
796 | 808 | ||
797 | /* TSQ : try to have two TSO segments in flight */ | 809 | /* TSQ : try to have at least two segments in flight |
810 | * (one in NIC TX ring, another in Qdisc) | ||
811 | */ | ||
798 | xmit_size_goal = min_t(u32, xmit_size_goal, | 812 | xmit_size_goal = min_t(u32, xmit_size_goal, |
799 | sysctl_tcp_limit_output_bytes >> 1); | 813 | sysctl_tcp_limit_output_bytes >> 1); |
800 | 814 | ||
@@ -1121,6 +1135,13 @@ new_segment: | |||
1121 | goto wait_for_memory; | 1135 | goto wait_for_memory; |
1122 | 1136 | ||
1123 | /* | 1137 | /* |
1138 | * All packets are restored as if they have | ||
1139 | * already been sent. | ||
1140 | */ | ||
1141 | if (tp->repair) | ||
1142 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | ||
1143 | |||
1144 | /* | ||
1124 | * Check whether we can use HW checksum. | 1145 | * Check whether we can use HW checksum. |
1125 | */ | 1146 | */ |
1126 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) | 1147 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) |
@@ -2447,10 +2468,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2447 | case TCP_THIN_DUPACK: | 2468 | case TCP_THIN_DUPACK: |
2448 | if (val < 0 || val > 1) | 2469 | if (val < 0 || val > 1) |
2449 | err = -EINVAL; | 2470 | err = -EINVAL; |
2450 | else | 2471 | else { |
2451 | tp->thin_dupack = val; | 2472 | tp->thin_dupack = val; |
2452 | if (tp->thin_dupack) | 2473 | if (tp->thin_dupack) |
2453 | tcp_disable_early_retrans(tp); | 2474 | tcp_disable_early_retrans(tp); |
2475 | } | ||
2454 | break; | 2476 | break; |
2455 | 2477 | ||
2456 | case TCP_REPAIR: | 2478 | case TCP_REPAIR: |
@@ -2631,6 +2653,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2631 | else | 2653 | else |
2632 | tp->tsoffset = val - tcp_time_stamp; | 2654 | tp->tsoffset = val - tcp_time_stamp; |
2633 | break; | 2655 | break; |
2656 | case TCP_NOTSENT_LOWAT: | ||
2657 | tp->notsent_lowat = val; | ||
2658 | sk->sk_write_space(sk); | ||
2659 | break; | ||
2634 | default: | 2660 | default: |
2635 | err = -ENOPROTOOPT; | 2661 | err = -ENOPROTOOPT; |
2636 | break; | 2662 | break; |
@@ -2847,6 +2873,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
2847 | case TCP_TIMESTAMP: | 2873 | case TCP_TIMESTAMP: |
2848 | val = tcp_time_stamp + tp->tsoffset; | 2874 | val = tcp_time_stamp + tp->tsoffset; |
2849 | break; | 2875 | break; |
2876 | case TCP_NOTSENT_LOWAT: | ||
2877 | val = tp->notsent_lowat; | ||
2878 | break; | ||
2850 | default: | 2879 | default: |
2851 | return -ENOPROTOOPT; | 2880 | return -ENOPROTOOPT; |
2852 | } | 2881 | } |
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9077f441cb2..b6ae92a51f58 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
@@ -206,8 +206,8 @@ static u32 cubic_root(u64 a) | |||
206 | */ | 206 | */ |
207 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | 207 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd) |
208 | { | 208 | { |
209 | u64 offs; | 209 | u32 delta, bic_target, max_cnt; |
210 | u32 delta, t, bic_target, max_cnt; | 210 | u64 offs, t; |
211 | 211 | ||
212 | ca->ack_cnt++; /* count the number of ACKs */ | 212 | ca->ack_cnt++; /* count the number of ACKs */ |
213 | 213 | ||
@@ -250,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
250 | * if the cwnd < 1 million packets !!! | 250 | * if the cwnd < 1 million packets !!! |
251 | */ | 251 | */ |
252 | 252 | ||
253 | t = (s32)(tcp_time_stamp - ca->epoch_start); | ||
254 | t += msecs_to_jiffies(ca->delay_min >> 3); | ||
253 | /* change the unit from HZ to bictcp_HZ */ | 255 | /* change the unit from HZ to bictcp_HZ */ |
254 | t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) | 256 | t <<= BICTCP_HZ; |
255 | - ca->epoch_start) << BICTCP_HZ) / HZ; | 257 | do_div(t, HZ); |
256 | 258 | ||
257 | if (t < ca->bic_K) /* t - K */ | 259 | if (t < ca->bic_K) /* t - K */ |
258 | offs = ca->bic_K - t; | 260 | offs = ca->bic_K - t; |
@@ -414,7 +416,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) | |||
414 | return; | 416 | return; |
415 | 417 | ||
416 | /* Discard delay samples right after fast recovery */ | 418 | /* Discard delay samples right after fast recovery */ |
417 | if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) | 419 | if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) |
418 | return; | 420 | return; |
419 | 421 | ||
420 | delay = (rtt_us << 3) / USEC_PER_MSEC; | 422 | delay = (rtt_us << 3) / USEC_PER_MSEC; |
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8f7ef0ad80e5..ab7bd35bb312 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c | |||
@@ -58,23 +58,22 @@ error: kfree(ctx); | |||
58 | return err; | 58 | return err; |
59 | } | 59 | } |
60 | 60 | ||
61 | /* Computes the fastopen cookie for the peer. | 61 | /* Computes the fastopen cookie for the IP path. |
62 | * The peer address is a 128 bits long (pad with zeros for IPv4). | 62 | * The path is a 128 bits long (pad with zeros for IPv4). |
63 | * | 63 | * |
64 | * The caller must check foc->len to determine if a valid cookie | 64 | * The caller must check foc->len to determine if a valid cookie |
65 | * has been generated successfully. | 65 | * has been generated successfully. |
66 | */ | 66 | */ |
67 | void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) | 67 | void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, |
68 | struct tcp_fastopen_cookie *foc) | ||
68 | { | 69 | { |
69 | __be32 peer_addr[4] = { addr, 0, 0, 0 }; | 70 | __be32 path[4] = { src, dst, 0, 0 }; |
70 | struct tcp_fastopen_context *ctx; | 71 | struct tcp_fastopen_context *ctx; |
71 | 72 | ||
72 | rcu_read_lock(); | 73 | rcu_read_lock(); |
73 | ctx = rcu_dereference(tcp_fastopen_ctx); | 74 | ctx = rcu_dereference(tcp_fastopen_ctx); |
74 | if (ctx) { | 75 | if (ctx) { |
75 | crypto_cipher_encrypt_one(ctx->tfm, | 76 | crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); |
76 | foc->val, | ||
77 | (__u8 *)peer_addr); | ||
78 | foc->len = TCP_FASTOPEN_COOKIE_SIZE; | 77 | foc->len = TCP_FASTOPEN_COOKIE_SIZE; |
79 | } | 78 | } |
80 | rcu_read_unlock(); | 79 | rcu_read_unlock(); |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28af45abe062..1969e16d936d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
688 | } | 688 | } |
689 | } | 689 | } |
690 | 690 | ||
691 | /* Set the sk_pacing_rate to allow proper sizing of TSO packets. | ||
692 | * Note: TCP stack does not yet implement pacing. | ||
693 | * FQ packet scheduler can be used to implement cheap but effective | ||
694 | * TCP pacing, to smooth the burst on large writes when packets | ||
695 | * in flight is significantly lower than cwnd (or rwin) | ||
696 | */ | ||
697 | static void tcp_update_pacing_rate(struct sock *sk) | ||
698 | { | ||
699 | const struct tcp_sock *tp = tcp_sk(sk); | ||
700 | u64 rate; | ||
701 | |||
702 | /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ | ||
703 | rate = (u64)tp->mss_cache * 2 * (HZ << 3); | ||
704 | |||
705 | rate *= max(tp->snd_cwnd, tp->packets_out); | ||
706 | |||
707 | /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), | ||
708 | * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) | ||
709 | * We probably need usec resolution in the future. | ||
710 | * Note: This also takes care of possible srtt=0 case, | ||
711 | * when tcp_rtt_estimator() was not yet called. | ||
712 | */ | ||
713 | if (tp->srtt > 8 + 2) | ||
714 | do_div(rate, tp->srtt); | ||
715 | |||
716 | sk->sk_pacing_rate = min_t(u64, rate, ~0U); | ||
717 | } | ||
718 | |||
691 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 719 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
692 | * routine referred to above. | 720 | * routine referred to above. |
693 | */ | 721 | */ |
@@ -1048,6 +1076,7 @@ struct tcp_sacktag_state { | |||
1048 | int reord; | 1076 | int reord; |
1049 | int fack_count; | 1077 | int fack_count; |
1050 | int flag; | 1078 | int flag; |
1079 | s32 rtt; /* RTT measured by SACKing never-retransmitted data */ | ||
1051 | }; | 1080 | }; |
1052 | 1081 | ||
1053 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, | 1082 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
@@ -1108,7 +1137,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | |||
1108 | static u8 tcp_sacktag_one(struct sock *sk, | 1137 | static u8 tcp_sacktag_one(struct sock *sk, |
1109 | struct tcp_sacktag_state *state, u8 sacked, | 1138 | struct tcp_sacktag_state *state, u8 sacked, |
1110 | u32 start_seq, u32 end_seq, | 1139 | u32 start_seq, u32 end_seq, |
1111 | bool dup_sack, int pcount) | 1140 | int dup_sack, int pcount, u32 xmit_time) |
1112 | { | 1141 | { |
1113 | struct tcp_sock *tp = tcp_sk(sk); | 1142 | struct tcp_sock *tp = tcp_sk(sk); |
1114 | int fack_count = state->fack_count; | 1143 | int fack_count = state->fack_count; |
@@ -1148,6 +1177,9 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1148 | state->reord); | 1177 | state->reord); |
1149 | if (!after(end_seq, tp->high_seq)) | 1178 | if (!after(end_seq, tp->high_seq)) |
1150 | state->flag |= FLAG_ORIG_SACK_ACKED; | 1179 | state->flag |= FLAG_ORIG_SACK_ACKED; |
1180 | /* Pick the earliest sequence sacked for RTT */ | ||
1181 | if (state->rtt < 0) | ||
1182 | state->rtt = tcp_time_stamp - xmit_time; | ||
1151 | } | 1183 | } |
1152 | 1184 | ||
1153 | if (sacked & TCPCB_LOST) { | 1185 | if (sacked & TCPCB_LOST) { |
@@ -1205,7 +1237,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1205 | * tcp_highest_sack_seq() when skb is highest_sack. | 1237 | * tcp_highest_sack_seq() when skb is highest_sack. |
1206 | */ | 1238 | */ |
1207 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, | 1239 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
1208 | start_seq, end_seq, dup_sack, pcount); | 1240 | start_seq, end_seq, dup_sack, pcount, |
1241 | TCP_SKB_CB(skb)->when); | ||
1209 | 1242 | ||
1210 | if (skb == tp->lost_skb_hint) | 1243 | if (skb == tp->lost_skb_hint) |
1211 | tp->lost_cnt_hint += pcount; | 1244 | tp->lost_cnt_hint += pcount; |
@@ -1479,7 +1512,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1479 | TCP_SKB_CB(skb)->seq, | 1512 | TCP_SKB_CB(skb)->seq, |
1480 | TCP_SKB_CB(skb)->end_seq, | 1513 | TCP_SKB_CB(skb)->end_seq, |
1481 | dup_sack, | 1514 | dup_sack, |
1482 | tcp_skb_pcount(skb)); | 1515 | tcp_skb_pcount(skb), |
1516 | TCP_SKB_CB(skb)->when); | ||
1483 | 1517 | ||
1484 | if (!before(TCP_SKB_CB(skb)->seq, | 1518 | if (!before(TCP_SKB_CB(skb)->seq, |
1485 | tcp_highest_sack_seq(tp))) | 1519 | tcp_highest_sack_seq(tp))) |
@@ -1536,7 +1570,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl | |||
1536 | 1570 | ||
1537 | static int | 1571 | static int |
1538 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | 1572 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, |
1539 | u32 prior_snd_una) | 1573 | u32 prior_snd_una, s32 *sack_rtt) |
1540 | { | 1574 | { |
1541 | struct tcp_sock *tp = tcp_sk(sk); | 1575 | struct tcp_sock *tp = tcp_sk(sk); |
1542 | const unsigned char *ptr = (skb_transport_header(ack_skb) + | 1576 | const unsigned char *ptr = (skb_transport_header(ack_skb) + |
@@ -1554,6 +1588,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1554 | 1588 | ||
1555 | state.flag = 0; | 1589 | state.flag = 0; |
1556 | state.reord = tp->packets_out; | 1590 | state.reord = tp->packets_out; |
1591 | state.rtt = -1; | ||
1557 | 1592 | ||
1558 | if (!tp->sacked_out) { | 1593 | if (!tp->sacked_out) { |
1559 | if (WARN_ON(tp->fackets_out)) | 1594 | if (WARN_ON(tp->fackets_out)) |
@@ -1737,6 +1772,7 @@ out: | |||
1737 | WARN_ON((int)tp->retrans_out < 0); | 1772 | WARN_ON((int)tp->retrans_out < 0); |
1738 | WARN_ON((int)tcp_packets_in_flight(tp) < 0); | 1773 | WARN_ON((int)tcp_packets_in_flight(tp) < 0); |
1739 | #endif | 1774 | #endif |
1775 | *sack_rtt = state.rtt; | ||
1740 | return state.flag; | 1776 | return state.flag; |
1741 | } | 1777 | } |
1742 | 1778 | ||
@@ -1869,8 +1905,13 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1869 | } | 1905 | } |
1870 | tcp_verify_left_out(tp); | 1906 | tcp_verify_left_out(tp); |
1871 | 1907 | ||
1872 | tp->reordering = min_t(unsigned int, tp->reordering, | 1908 | /* Timeout in disordered state after receiving substantial DUPACKs |
1873 | sysctl_tcp_reordering); | 1909 | * suggests that the degree of reordering is over-estimated. |
1910 | */ | ||
1911 | if (icsk->icsk_ca_state <= TCP_CA_Disorder && | ||
1912 | tp->sacked_out >= sysctl_tcp_reordering) | ||
1913 | tp->reordering = min_t(unsigned int, tp->reordering, | ||
1914 | sysctl_tcp_reordering); | ||
1874 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1915 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1875 | tp->high_seq = tp->snd_nxt; | 1916 | tp->high_seq = tp->snd_nxt; |
1876 | TCP_ECN_queue_cwr(tp); | 1917 | TCP_ECN_queue_cwr(tp); |
@@ -2472,8 +2513,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) | |||
2472 | 2513 | ||
2473 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { | 2514 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { |
2474 | tcp_try_keep_open(sk); | 2515 | tcp_try_keep_open(sk); |
2475 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) | ||
2476 | tcp_moderate_cwnd(tp); | ||
2477 | } else { | 2516 | } else { |
2478 | tcp_cwnd_reduction(sk, prior_unsacked, 0); | 2517 | tcp_cwnd_reduction(sk, prior_unsacked, 0); |
2479 | } | 2518 | } |
@@ -2792,65 +2831,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, | |||
2792 | tcp_xmit_retransmit_queue(sk); | 2831 | tcp_xmit_retransmit_queue(sk); |
2793 | } | 2832 | } |
2794 | 2833 | ||
2795 | void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) | 2834 | static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, |
2835 | s32 seq_rtt, s32 sack_rtt) | ||
2796 | { | 2836 | { |
2797 | tcp_rtt_estimator(sk, seq_rtt); | 2837 | const struct tcp_sock *tp = tcp_sk(sk); |
2798 | tcp_set_rto(sk); | 2838 | |
2799 | inet_csk(sk)->icsk_backoff = 0; | 2839 | /* Prefer RTT measured from ACK's timing to TS-ECR. This is because |
2800 | } | 2840 | * broken middle-boxes or peers may corrupt TS-ECR fields. But |
2801 | EXPORT_SYMBOL(tcp_valid_rtt_meas); | 2841 | * Karn's algorithm forbids taking RTT if some retransmitted data |
2842 | * is acked (RFC6298). | ||
2843 | */ | ||
2844 | if (flag & FLAG_RETRANS_DATA_ACKED) | ||
2845 | seq_rtt = -1; | ||
2846 | |||
2847 | if (seq_rtt < 0) | ||
2848 | seq_rtt = sack_rtt; | ||
2802 | 2849 | ||
2803 | /* Read draft-ietf-tcplw-high-performance before mucking | ||
2804 | * with this code. (Supersedes RFC1323) | ||
2805 | */ | ||
2806 | static void tcp_ack_saw_tstamp(struct sock *sk, int flag) | ||
2807 | { | ||
2808 | /* RTTM Rule: A TSecr value received in a segment is used to | 2850 | /* RTTM Rule: A TSecr value received in a segment is used to |
2809 | * update the averaged RTT measurement only if the segment | 2851 | * update the averaged RTT measurement only if the segment |
2810 | * acknowledges some new data, i.e., only if it advances the | 2852 | * acknowledges some new data, i.e., only if it advances the |
2811 | * left edge of the send window. | 2853 | * left edge of the send window. |
2812 | * | ||
2813 | * See draft-ietf-tcplw-high-performance-00, section 3.3. | 2854 | * See draft-ietf-tcplw-high-performance-00, section 3.3. |
2814 | * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> | ||
2815 | * | ||
2816 | * Changed: reset backoff as soon as we see the first valid sample. | ||
2817 | * If we do not, we get strongly overestimated rto. With timestamps | ||
2818 | * samples are accepted even from very old segments: f.e., when rtt=1 | ||
2819 | * increases to 8, we retransmit 5 times and after 8 seconds delayed | ||
2820 | * answer arrives rto becomes 120 seconds! If at least one of segments | ||
2821 | * in window is lost... Voila. --ANK (010210) | ||
2822 | */ | 2855 | */ |
2823 | struct tcp_sock *tp = tcp_sk(sk); | 2856 | if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
2824 | 2857 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | |
2825 | tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); | ||
2826 | } | ||
2827 | 2858 | ||
2828 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) | 2859 | if (seq_rtt < 0) |
2829 | { | 2860 | return false; |
2830 | /* We don't have a timestamp. Can only use | ||
2831 | * packets that are not retransmitted to determine | ||
2832 | * rtt estimates. Also, we must not reset the | ||
2833 | * backoff for rto until we get a non-retransmitted | ||
2834 | * packet. This allows us to deal with a situation | ||
2835 | * where the network delay has increased suddenly. | ||
2836 | * I.e. Karn's algorithm. (SIGCOMM '87, p5.) | ||
2837 | */ | ||
2838 | 2861 | ||
2839 | if (flag & FLAG_RETRANS_DATA_ACKED) | 2862 | tcp_rtt_estimator(sk, seq_rtt); |
2840 | return; | 2863 | tcp_set_rto(sk); |
2841 | 2864 | ||
2842 | tcp_valid_rtt_meas(sk, seq_rtt); | 2865 | /* RFC6298: only reset backoff on valid RTT measurement. */ |
2866 | inet_csk(sk)->icsk_backoff = 0; | ||
2867 | return true; | ||
2843 | } | 2868 | } |
2844 | 2869 | ||
2845 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, | 2870 | /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ |
2846 | const s32 seq_rtt) | 2871 | static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) |
2847 | { | 2872 | { |
2848 | const struct tcp_sock *tp = tcp_sk(sk); | 2873 | struct tcp_sock *tp = tcp_sk(sk); |
2849 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 2874 | s32 seq_rtt = -1; |
2850 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 2875 | |
2851 | tcp_ack_saw_tstamp(sk, flag); | 2876 | if (tp->lsndtime && !tp->total_retrans) |
2852 | else if (seq_rtt >= 0) | 2877 | seq_rtt = tcp_time_stamp - tp->lsndtime; |
2853 | tcp_ack_no_tstamp(sk, seq_rtt, flag); | 2878 | tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); |
2854 | } | 2879 | } |
2855 | 2880 | ||
2856 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | 2881 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) |
@@ -2939,7 +2964,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) | |||
2939 | * arrived at the other end. | 2964 | * arrived at the other end. |
2940 | */ | 2965 | */ |
2941 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | 2966 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, |
2942 | u32 prior_snd_una) | 2967 | u32 prior_snd_una, s32 sack_rtt) |
2943 | { | 2968 | { |
2944 | struct tcp_sock *tp = tcp_sk(sk); | 2969 | struct tcp_sock *tp = tcp_sk(sk); |
2945 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2970 | const struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -2978,8 +3003,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
2978 | if (sacked & TCPCB_SACKED_RETRANS) | 3003 | if (sacked & TCPCB_SACKED_RETRANS) |
2979 | tp->retrans_out -= acked_pcount; | 3004 | tp->retrans_out -= acked_pcount; |
2980 | flag |= FLAG_RETRANS_DATA_ACKED; | 3005 | flag |= FLAG_RETRANS_DATA_ACKED; |
2981 | ca_seq_rtt = -1; | ||
2982 | seq_rtt = -1; | ||
2983 | } else { | 3006 | } else { |
2984 | ca_seq_rtt = now - scb->when; | 3007 | ca_seq_rtt = now - scb->when; |
2985 | last_ackt = skb->tstamp; | 3008 | last_ackt = skb->tstamp; |
@@ -3031,6 +3054,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3031 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | 3054 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
3032 | flag |= FLAG_SACK_RENEGING; | 3055 | flag |= FLAG_SACK_RENEGING; |
3033 | 3056 | ||
3057 | if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || | ||
3058 | (flag & FLAG_ACKED)) | ||
3059 | tcp_rearm_rto(sk); | ||
3060 | |||
3034 | if (flag & FLAG_ACKED) { | 3061 | if (flag & FLAG_ACKED) { |
3035 | const struct tcp_congestion_ops *ca_ops | 3062 | const struct tcp_congestion_ops *ca_ops |
3036 | = inet_csk(sk)->icsk_ca_ops; | 3063 | = inet_csk(sk)->icsk_ca_ops; |
@@ -3040,9 +3067,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3040 | tcp_mtup_probe_success(sk); | 3067 | tcp_mtup_probe_success(sk); |
3041 | } | 3068 | } |
3042 | 3069 | ||
3043 | tcp_ack_update_rtt(sk, flag, seq_rtt); | ||
3044 | tcp_rearm_rto(sk); | ||
3045 | |||
3046 | if (tcp_is_reno(tp)) { | 3070 | if (tcp_is_reno(tp)) { |
3047 | tcp_remove_reno_sacks(sk, pkts_acked); | 3071 | tcp_remove_reno_sacks(sk, pkts_acked); |
3048 | } else { | 3072 | } else { |
@@ -3130,11 +3154,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) | |||
3130 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; | 3154 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; |
3131 | } | 3155 | } |
3132 | 3156 | ||
3157 | /* Decide whether to run the increase function of congestion control. */ | ||
3133 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) | 3158 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
3134 | { | 3159 | { |
3135 | const struct tcp_sock *tp = tcp_sk(sk); | 3160 | if (tcp_in_cwnd_reduction(sk)) |
3136 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && | 3161 | return false; |
3137 | !tcp_in_cwnd_reduction(sk); | 3162 | |
3163 | /* If reordering is high then always grow cwnd whenever data is | ||
3164 | * delivered regardless of its ordering. Otherwise stay conservative | ||
3165 | * and only grow cwnd on in-order delivery in Open state, and retain | ||
3166 | * cwnd in Disordered state (RFC5681). A stretched ACK with | ||
3167 | * new SACK or ECE mark may first advance cwnd here and later reduce | ||
3168 | * cwnd in tcp_fastretrans_alert() based on more states. | ||
3169 | */ | ||
3170 | if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) | ||
3171 | return flag & FLAG_FORWARD_PROGRESS; | ||
3172 | |||
3173 | return inet_csk(sk)->icsk_ca_state == TCP_CA_Open && | ||
3174 | flag & FLAG_DATA_ACKED; | ||
3138 | } | 3175 | } |
3139 | 3176 | ||
3140 | /* Check that window update is acceptable. | 3177 | /* Check that window update is acceptable. |
@@ -3269,11 +3306,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3269 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3306 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
3270 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3307 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
3271 | bool is_dupack = false; | 3308 | bool is_dupack = false; |
3272 | u32 prior_in_flight; | 3309 | u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; |
3273 | u32 prior_fackets; | 3310 | u32 prior_fackets; |
3274 | int prior_packets = tp->packets_out; | 3311 | int prior_packets = tp->packets_out; |
3275 | const int prior_unsacked = tp->packets_out - tp->sacked_out; | 3312 | const int prior_unsacked = tp->packets_out - tp->sacked_out; |
3276 | int acked = 0; /* Number of packets newly acked */ | 3313 | int acked = 0; /* Number of packets newly acked */ |
3314 | s32 sack_rtt = -1; | ||
3277 | 3315 | ||
3278 | /* If the ack is older than previous acks | 3316 | /* If the ack is older than previous acks |
3279 | * then we can probably ignore it. | 3317 | * then we can probably ignore it. |
@@ -3330,7 +3368,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3330 | flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); | 3368 | flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); |
3331 | 3369 | ||
3332 | if (TCP_SKB_CB(skb)->sacked) | 3370 | if (TCP_SKB_CB(skb)->sacked) |
3333 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); | 3371 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3372 | &sack_rtt); | ||
3334 | 3373 | ||
3335 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) | 3374 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) |
3336 | flag |= FLAG_ECE; | 3375 | flag |= FLAG_ECE; |
@@ -3349,21 +3388,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3349 | 3388 | ||
3350 | /* See if we can take anything off of the retransmit queue. */ | 3389 | /* See if we can take anything off of the retransmit queue. */ |
3351 | acked = tp->packets_out; | 3390 | acked = tp->packets_out; |
3352 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); | 3391 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); |
3353 | acked -= tp->packets_out; | 3392 | acked -= tp->packets_out; |
3354 | 3393 | ||
3394 | /* Advance cwnd if state allows */ | ||
3395 | if (tcp_may_raise_cwnd(sk, flag)) | ||
3396 | tcp_cong_avoid(sk, ack, prior_in_flight); | ||
3397 | |||
3355 | if (tcp_ack_is_dubious(sk, flag)) { | 3398 | if (tcp_ack_is_dubious(sk, flag)) { |
3356 | /* Advance CWND, if state allows this. */ | ||
3357 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) | ||
3358 | tcp_cong_avoid(sk, ack, prior_in_flight); | ||
3359 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | 3399 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
3360 | tcp_fastretrans_alert(sk, acked, prior_unsacked, | 3400 | tcp_fastretrans_alert(sk, acked, prior_unsacked, |
3361 | is_dupack, flag); | 3401 | is_dupack, flag); |
3362 | } else { | ||
3363 | if (flag & FLAG_DATA_ACKED) | ||
3364 | tcp_cong_avoid(sk, ack, prior_in_flight); | ||
3365 | } | 3402 | } |
3366 | |||
3367 | if (tp->tlp_high_seq) | 3403 | if (tp->tlp_high_seq) |
3368 | tcp_process_tlp_ack(sk, ack, flag); | 3404 | tcp_process_tlp_ack(sk, ack, flag); |
3369 | 3405 | ||
@@ -3375,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3375 | 3411 | ||
3376 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) | 3412 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) |
3377 | tcp_schedule_loss_probe(sk); | 3413 | tcp_schedule_loss_probe(sk); |
3414 | if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) | ||
3415 | tcp_update_pacing_rate(sk); | ||
3378 | return 1; | 3416 | return 1; |
3379 | 3417 | ||
3380 | no_queue: | 3418 | no_queue: |
@@ -3402,7 +3440,8 @@ old_ack: | |||
3402 | * If data was DSACKed, see if we can undo a cwnd reduction. | 3440 | * If data was DSACKed, see if we can undo a cwnd reduction. |
3403 | */ | 3441 | */ |
3404 | if (TCP_SKB_CB(skb)->sacked) { | 3442 | if (TCP_SKB_CB(skb)->sacked) { |
3405 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); | 3443 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3444 | &sack_rtt); | ||
3406 | tcp_fastretrans_alert(sk, acked, prior_unsacked, | 3445 | tcp_fastretrans_alert(sk, acked, prior_unsacked, |
3407 | is_dupack, flag); | 3446 | is_dupack, flag); |
3408 | } | 3447 | } |
@@ -3535,7 +3574,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr | |||
3535 | ++ptr; | 3574 | ++ptr; |
3536 | tp->rx_opt.rcv_tsval = ntohl(*ptr); | 3575 | tp->rx_opt.rcv_tsval = ntohl(*ptr); |
3537 | ++ptr; | 3576 | ++ptr; |
3538 | tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; | 3577 | if (*ptr) |
3578 | tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; | ||
3579 | else | ||
3580 | tp->rx_opt.rcv_tsecr = 0; | ||
3539 | return true; | 3581 | return true; |
3540 | } | 3582 | } |
3541 | return false; | 3583 | return false; |
@@ -3560,7 +3602,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, | |||
3560 | } | 3602 | } |
3561 | 3603 | ||
3562 | tcp_parse_options(skb, &tp->rx_opt, 1, NULL); | 3604 | tcp_parse_options(skb, &tp->rx_opt, 1, NULL); |
3563 | if (tp->rx_opt.saw_tstamp) | 3605 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
3564 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; | 3606 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; |
3565 | 3607 | ||
3566 | return true; | 3608 | return true; |
@@ -5010,8 +5052,8 @@ discard: | |||
5010 | * the rest is checked inline. Fast processing is turned on in | 5052 | * the rest is checked inline. Fast processing is turned on in |
5011 | * tcp_data_queue when everything is OK. | 5053 | * tcp_data_queue when everything is OK. |
5012 | */ | 5054 | */ |
5013 | int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | 5055 | void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, |
5014 | const struct tcphdr *th, unsigned int len) | 5056 | const struct tcphdr *th, unsigned int len) |
5015 | { | 5057 | { |
5016 | struct tcp_sock *tp = tcp_sk(sk); | 5058 | struct tcp_sock *tp = tcp_sk(sk); |
5017 | 5059 | ||
@@ -5088,7 +5130,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5088 | tcp_ack(sk, skb, 0); | 5130 | tcp_ack(sk, skb, 0); |
5089 | __kfree_skb(skb); | 5131 | __kfree_skb(skb); |
5090 | tcp_data_snd_check(sk); | 5132 | tcp_data_snd_check(sk); |
5091 | return 0; | 5133 | return; |
5092 | } else { /* Header too small */ | 5134 | } else { /* Header too small */ |
5093 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | 5135 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); |
5094 | goto discard; | 5136 | goto discard; |
@@ -5181,7 +5223,7 @@ no_ack: | |||
5181 | if (eaten) | 5223 | if (eaten) |
5182 | kfree_skb_partial(skb, fragstolen); | 5224 | kfree_skb_partial(skb, fragstolen); |
5183 | sk->sk_data_ready(sk, 0); | 5225 | sk->sk_data_ready(sk, 0); |
5184 | return 0; | 5226 | return; |
5185 | } | 5227 | } |
5186 | } | 5228 | } |
5187 | 5229 | ||
@@ -5197,7 +5239,7 @@ slow_path: | |||
5197 | */ | 5239 | */ |
5198 | 5240 | ||
5199 | if (!tcp_validate_incoming(sk, skb, th, 1)) | 5241 | if (!tcp_validate_incoming(sk, skb, th, 1)) |
5200 | return 0; | 5242 | return; |
5201 | 5243 | ||
5202 | step5: | 5244 | step5: |
5203 | if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) | 5245 | if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) |
@@ -5213,7 +5255,7 @@ step5: | |||
5213 | 5255 | ||
5214 | tcp_data_snd_check(sk); | 5256 | tcp_data_snd_check(sk); |
5215 | tcp_ack_snd_check(sk); | 5257 | tcp_ack_snd_check(sk); |
5216 | return 0; | 5258 | return; |
5217 | 5259 | ||
5218 | csum_error: | 5260 | csum_error: |
5219 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); | 5261 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); |
@@ -5221,7 +5263,6 @@ csum_error: | |||
5221 | 5263 | ||
5222 | discard: | 5264 | discard: |
5223 | __kfree_skb(skb); | 5265 | __kfree_skb(skb); |
5224 | return 0; | ||
5225 | } | 5266 | } |
5226 | EXPORT_SYMBOL(tcp_rcv_established); | 5267 | EXPORT_SYMBOL(tcp_rcv_established); |
5227 | 5268 | ||
@@ -5316,7 +5357,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5316 | int saved_clamp = tp->rx_opt.mss_clamp; | 5357 | int saved_clamp = tp->rx_opt.mss_clamp; |
5317 | 5358 | ||
5318 | tcp_parse_options(skb, &tp->rx_opt, 0, &foc); | 5359 | tcp_parse_options(skb, &tp->rx_opt, 0, &foc); |
5319 | if (tp->rx_opt.saw_tstamp) | 5360 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
5320 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; | 5361 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; |
5321 | 5362 | ||
5322 | if (th->ack) { | 5363 | if (th->ack) { |
@@ -5624,9 +5665,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5624 | * so release it. | 5665 | * so release it. |
5625 | */ | 5666 | */ |
5626 | if (req) { | 5667 | if (req) { |
5627 | tcp_synack_rtt_meas(sk, req); | ||
5628 | tp->total_retrans = req->num_retrans; | 5668 | tp->total_retrans = req->num_retrans; |
5629 | |||
5630 | reqsk_fastopen_remove(sk, req, false); | 5669 | reqsk_fastopen_remove(sk, req, false); |
5631 | } else { | 5670 | } else { |
5632 | /* Make sure socket is routed, for correct metrics. */ | 5671 | /* Make sure socket is routed, for correct metrics. */ |
@@ -5651,6 +5690,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5651 | tp->snd_una = TCP_SKB_CB(skb)->ack_seq; | 5690 | tp->snd_una = TCP_SKB_CB(skb)->ack_seq; |
5652 | tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; | 5691 | tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; |
5653 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | 5692 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
5693 | tcp_synack_rtt_meas(sk, req); | ||
5654 | 5694 | ||
5655 | if (tp->rx_opt.tstamp_ok) | 5695 | if (tp->rx_opt.tstamp_ok) |
5656 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 5696 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b299da5ff499..b14266bb91eb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
821 | */ | 821 | */ |
822 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | 822 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, |
823 | struct request_sock *req, | 823 | struct request_sock *req, |
824 | u16 queue_mapping, | 824 | u16 queue_mapping) |
825 | bool nocache) | ||
826 | { | 825 | { |
827 | const struct inet_request_sock *ireq = inet_rsk(req); | 826 | const struct inet_request_sock *ireq = inet_rsk(req); |
828 | struct flowi4 fl4; | 827 | struct flowi4 fl4; |
@@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
852 | 851 | ||
853 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) | 852 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) |
854 | { | 853 | { |
855 | int res = tcp_v4_send_synack(sk, NULL, req, 0, false); | 854 | int res = tcp_v4_send_synack(sk, NULL, req, 0); |
856 | 855 | ||
857 | if (!res) | 856 | if (!res) |
858 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); | 857 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
@@ -890,7 +889,7 @@ bool tcp_syn_flood_action(struct sock *sk, | |||
890 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); | 889 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); |
891 | 890 | ||
892 | lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; | 891 | lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; |
893 | if (!lopt->synflood_warned) { | 892 | if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { |
894 | lopt->synflood_warned = 1; | 893 | lopt->synflood_warned = 1; |
895 | pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", | 894 | pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", |
896 | proto, ntohs(tcp_hdr(skb)->dest), msg); | 895 | proto, ntohs(tcp_hdr(skb)->dest), msg); |
@@ -1316,9 +1315,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, | |||
1316 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 1315 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
1317 | return true; | 1316 | return true; |
1318 | } | 1317 | } |
1318 | |||
1319 | if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { | 1319 | if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { |
1320 | if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { | 1320 | if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { |
1321 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | 1321 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, |
1322 | ip_hdr(skb)->daddr, valid_foc); | ||
1322 | if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || | 1323 | if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || |
1323 | memcmp(&foc->val[0], &valid_foc->val[0], | 1324 | memcmp(&foc->val[0], &valid_foc->val[0], |
1324 | TCP_FASTOPEN_COOKIE_SIZE) != 0) | 1325 | TCP_FASTOPEN_COOKIE_SIZE) != 0) |
@@ -1329,14 +1330,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, | |||
1329 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 1330 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
1330 | return true; | 1331 | return true; |
1331 | } else if (foc->len == 0) { /* Client requesting a cookie */ | 1332 | } else if (foc->len == 0) { /* Client requesting a cookie */ |
1332 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | 1333 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, |
1334 | ip_hdr(skb)->daddr, valid_foc); | ||
1333 | NET_INC_STATS_BH(sock_net(sk), | 1335 | NET_INC_STATS_BH(sock_net(sk), |
1334 | LINUX_MIB_TCPFASTOPENCOOKIEREQD); | 1336 | LINUX_MIB_TCPFASTOPENCOOKIEREQD); |
1335 | } else { | 1337 | } else { |
1336 | /* Client sent a cookie with wrong size. Treat it | 1338 | /* Client sent a cookie with wrong size. Treat it |
1337 | * the same as invalid and return a valid one. | 1339 | * the same as invalid and return a valid one. |
1338 | */ | 1340 | */ |
1339 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | 1341 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, |
1342 | ip_hdr(skb)->daddr, valid_foc); | ||
1340 | } | 1343 | } |
1341 | return false; | 1344 | return false; |
1342 | } | 1345 | } |
@@ -1462,7 +1465,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1462 | * limitations, they conserve resources and peer is | 1465 | * limitations, they conserve resources and peer is |
1463 | * evidently real one. | 1466 | * evidently real one. |
1464 | */ | 1467 | */ |
1465 | if (inet_csk_reqsk_queue_is_full(sk) && !isn) { | 1468 | if ((sysctl_tcp_syncookies == 2 || |
1469 | inet_csk_reqsk_queue_is_full(sk)) && !isn) { | ||
1466 | want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); | 1470 | want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); |
1467 | if (!want_cookie) | 1471 | if (!want_cookie) |
1468 | goto drop; | 1472 | goto drop; |
@@ -1671,8 +1675,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1671 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; | 1675 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; |
1672 | 1676 | ||
1673 | tcp_initialize_rcv_mss(newsk); | 1677 | tcp_initialize_rcv_mss(newsk); |
1674 | tcp_synack_rtt_meas(newsk, req); | ||
1675 | newtp->total_retrans = req->num_retrans; | ||
1676 | 1678 | ||
1677 | #ifdef CONFIG_TCP_MD5SIG | 1679 | #ifdef CONFIG_TCP_MD5SIG |
1678 | /* Copy over the MD5 key from the original socket */ | 1680 | /* Copy over the MD5 key from the original socket */ |
@@ -1797,10 +1799,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
1797 | sk->sk_rx_dst = NULL; | 1799 | sk->sk_rx_dst = NULL; |
1798 | } | 1800 | } |
1799 | } | 1801 | } |
1800 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { | 1802 | tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); |
1801 | rsk = sk; | ||
1802 | goto reset; | ||
1803 | } | ||
1804 | return 0; | 1803 | return 0; |
1805 | } | 1804 | } |
1806 | 1805 | ||
@@ -2605,7 +2604,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, | |||
2605 | long delta = req->expires - jiffies; | 2604 | long delta = req->expires - jiffies; |
2606 | 2605 | ||
2607 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" | 2606 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" |
2608 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", | 2607 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n", |
2609 | i, | 2608 | i, |
2610 | ireq->loc_addr, | 2609 | ireq->loc_addr, |
2611 | ntohs(inet_sk(sk)->inet_sport), | 2610 | ntohs(inet_sk(sk)->inet_sport), |
@@ -2663,7 +2662,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
2663 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); | 2662 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); |
2664 | 2663 | ||
2665 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " | 2664 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " |
2666 | "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", | 2665 | "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n", |
2667 | i, src, srcp, dest, destp, sk->sk_state, | 2666 | i, src, srcp, dest, destp, sk->sk_state, |
2668 | tp->write_seq - tp->snd_una, | 2667 | tp->write_seq - tp->snd_una, |
2669 | rx_queue, | 2668 | rx_queue, |
@@ -2802,6 +2801,7 @@ struct proto tcp_prot = { | |||
2802 | .unhash = inet_unhash, | 2801 | .unhash = inet_unhash, |
2803 | .get_port = inet_csk_get_port, | 2802 | .get_port = inet_csk_get_port, |
2804 | .enter_memory_pressure = tcp_enter_memory_pressure, | 2803 | .enter_memory_pressure = tcp_enter_memory_pressure, |
2804 | .stream_memory_free = tcp_stream_memory_free, | ||
2805 | .sockets_allocated = &tcp_sockets_allocated, | 2805 | .sockets_allocated = &tcp_sockets_allocated, |
2806 | .orphan_count = &tcp_orphan_count, | 2806 | .orphan_count = &tcp_orphan_count, |
2807 | .memory_allocated = &tcp_memory_allocated, | 2807 | .memory_allocated = &tcp_memory_allocated, |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index da14436c1735..8a57d79b0b16 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
@@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
132 | return 0; | 132 | return 0; |
133 | } | 133 | } |
134 | 134 | ||
135 | static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, | 135 | static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, |
136 | const char *buffer) | 136 | const char *buffer) |
137 | { | 137 | { |
138 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 138 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
139 | unsigned long long val; | 139 | unsigned long long val; |
140 | int ret = 0; | 140 | int ret = 0; |
141 | 141 | ||
@@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg) | |||
180 | return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); | 180 | return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); |
181 | } | 181 | } |
182 | 182 | ||
183 | static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) | 183 | static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) |
184 | { | 184 | { |
185 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 185 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
186 | u64 val; | 186 | u64 val; |
187 | 187 | ||
188 | switch (cft->private) { | 188 | switch (cft->private) { |
@@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
202 | return val; | 202 | return val; |
203 | } | 203 | } |
204 | 204 | ||
205 | static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) | 205 | static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) |
206 | { | 206 | { |
207 | struct mem_cgroup *memcg; | 207 | struct mem_cgroup *memcg; |
208 | struct tcp_memcontrol *tcp; | 208 | struct tcp_memcontrol *tcp; |
209 | struct cg_proto *cg_proto; | 209 | struct cg_proto *cg_proto; |
210 | 210 | ||
211 | memcg = mem_cgroup_from_cont(cont); | 211 | memcg = mem_cgroup_from_css(css); |
212 | cg_proto = tcp_prot.proto_cgroup(memcg); | 212 | cg_proto = tcp_prot.proto_cgroup(memcg); |
213 | if (!cg_proto) | 213 | if (!cg_proto) |
214 | return 0; | 214 | return 0; |
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f6a005c485a9..4a22f3e715df 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c | |||
@@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk) | |||
443 | struct dst_entry *dst = __sk_dst_get(sk); | 443 | struct dst_entry *dst = __sk_dst_get(sk); |
444 | struct tcp_sock *tp = tcp_sk(sk); | 444 | struct tcp_sock *tp = tcp_sk(sk); |
445 | struct tcp_metrics_block *tm; | 445 | struct tcp_metrics_block *tm; |
446 | u32 val; | 446 | u32 val, crtt = 0; /* cached RTT scaled by 8 */ |
447 | 447 | ||
448 | if (dst == NULL) | 448 | if (dst == NULL) |
449 | goto reset; | 449 | goto reset; |
@@ -478,15 +478,19 @@ void tcp_init_metrics(struct sock *sk) | |||
478 | tp->reordering = val; | 478 | tp->reordering = val; |
479 | } | 479 | } |
480 | 480 | ||
481 | val = tcp_metric_get(tm, TCP_METRIC_RTT); | 481 | crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); |
482 | if (val == 0 || tp->srtt == 0) { | 482 | rcu_read_unlock(); |
483 | rcu_read_unlock(); | 483 | reset: |
484 | goto reset; | 484 | /* The initial RTT measurement from the SYN/SYN-ACK is not ideal |
485 | } | 485 | * to seed the RTO for later data packets because SYN packets are |
486 | /* Initial rtt is determined from SYN,SYN-ACK. | 486 | * small. Use the per-dst cached values to seed the RTO but keep |
487 | * The segment is small and rtt may appear much | 487 | * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). |
488 | * less than real one. Use per-dst memory | 488 | * Later the RTO will be updated immediately upon obtaining the first |
489 | * to make it more realistic. | 489 | * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only |
490 | * influences the first RTO but not later RTT estimation. | ||
491 | * | ||
492 | * But if RTT is not available from the SYN (due to retransmits or | ||
493 | * syn cookies) or the cache, force a conservative 3secs timeout. | ||
490 | * | 494 | * |
491 | * A bit of theory. RTT is time passed after "normal" sized packet | 495 | * A bit of theory. RTT is time passed after "normal" sized packet |
492 | * is sent until it is ACKed. In normal circumstances sending small | 496 | * is sent until it is ACKed. In normal circumstances sending small |
@@ -497,21 +501,9 @@ void tcp_init_metrics(struct sock *sk) | |||
497 | * to low value, and then abruptly stops to do it and starts to delay | 501 | * to low value, and then abruptly stops to do it and starts to delay |
498 | * ACKs, wait for troubles. | 502 | * ACKs, wait for troubles. |
499 | */ | 503 | */ |
500 | val = msecs_to_jiffies(val); | 504 | if (crtt > tp->srtt) { |
501 | if (val > tp->srtt) { | 505 | inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk)); |
502 | tp->srtt = val; | 506 | } else if (tp->srtt == 0) { |
503 | tp->rtt_seq = tp->snd_nxt; | ||
504 | } | ||
505 | val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
506 | if (val > tp->mdev) { | ||
507 | tp->mdev = val; | ||
508 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
509 | } | ||
510 | rcu_read_unlock(); | ||
511 | |||
512 | tcp_set_rto(sk); | ||
513 | reset: | ||
514 | if (tp->srtt == 0) { | ||
515 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | 507 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from |
516 | * 3WHS. This is most likely due to retransmission, | 508 | * 3WHS. This is most likely due to retransmission, |
517 | * including spurious one. Reset the RTO back to 3secs | 509 | * including spurious one. Reset the RTO back to 3secs |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ab1c08658528..58a3e69aef64 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
411 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | 411 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
412 | tcp_enable_early_retrans(newtp); | 412 | tcp_enable_early_retrans(newtp); |
413 | newtp->tlp_high_seq = 0; | 413 | newtp->tlp_high_seq = 0; |
414 | newtp->lsndtime = treq->snt_synack; | ||
415 | newtp->total_retrans = req->num_retrans; | ||
414 | 416 | ||
415 | /* So many TCP implementations out there (incorrectly) count the | 417 | /* So many TCP implementations out there (incorrectly) count the |
416 | * initial SYN frame in their delayed-ACK and congestion control | 418 | * initial SYN frame in their delayed-ACK and congestion control |
@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
666 | if (!(flg & TCP_FLAG_ACK)) | 668 | if (!(flg & TCP_FLAG_ACK)) |
667 | return NULL; | 669 | return NULL; |
668 | 670 | ||
669 | /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ | ||
670 | if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) | ||
671 | tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; | ||
672 | else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */ | ||
673 | tcp_rsk(req)->snt_synack = 0; | ||
674 | |||
675 | /* For Fast Open no more processing is needed (sk is the | 671 | /* For Fast Open no more processing is needed (sk is the |
676 | * child socket). | 672 | * child socket). |
677 | */ | 673 | */ |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 92fde8d1aa82..7c83cb8bf137 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; | |||
65 | /* By default, RFC2861 behavior. */ | 65 | /* By default, RFC2861 behavior. */ |
66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | 66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
67 | 67 | ||
68 | unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; | ||
69 | EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); | ||
70 | |||
68 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | 71 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
69 | int push_one, gfp_t gfp); | 72 | int push_one, gfp_t gfp); |
70 | 73 | ||
@@ -1628,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1628 | 1631 | ||
1629 | /* If a full-sized TSO skb can be sent, do it. */ | 1632 | /* If a full-sized TSO skb can be sent, do it. */ |
1630 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, | 1633 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, |
1631 | sk->sk_gso_max_segs * tp->mss_cache)) | 1634 | tp->xmit_size_goal_segs * tp->mss_cache)) |
1632 | goto send_now; | 1635 | goto send_now; |
1633 | 1636 | ||
1634 | /* Middle in queue won't get any more data, full sendable already? */ | 1637 | /* Middle in queue won't get any more data, full sendable already? */ |
@@ -2670,7 +2673,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2670 | int tcp_header_size; | 2673 | int tcp_header_size; |
2671 | int mss; | 2674 | int mss; |
2672 | 2675 | ||
2673 | skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); | 2676 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); |
2674 | if (unlikely(!skb)) { | 2677 | if (unlikely(!skb)) { |
2675 | dst_release(dst); | 2678 | dst_release(dst); |
2676 | return NULL; | 2679 | return NULL; |
@@ -2814,6 +2817,8 @@ void tcp_connect_init(struct sock *sk) | |||
2814 | 2817 | ||
2815 | if (likely(!tp->repair)) | 2818 | if (likely(!tp->repair)) |
2816 | tp->rcv_nxt = 0; | 2819 | tp->rcv_nxt = 0; |
2820 | else | ||
2821 | tp->rcv_tstamp = tcp_time_stamp; | ||
2817 | tp->rcv_wup = tp->rcv_nxt; | 2822 | tp->rcv_wup = tp->rcv_nxt; |
2818 | tp->copied_seq = tp->rcv_nxt; | 2823 | tp->copied_seq = tp->rcv_nxt; |
2819 | 2824 | ||
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index d4943f67aff2..611beab38a00 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c | |||
@@ -46,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096; | |||
46 | MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); | 46 | MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); |
47 | module_param(bufsize, uint, 0); | 47 | module_param(bufsize, uint, 0); |
48 | 48 | ||
49 | static unsigned int fwmark __read_mostly = 0; | ||
50 | MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); | ||
51 | module_param(fwmark, uint, 0); | ||
52 | |||
49 | static int full __read_mostly; | 53 | static int full __read_mostly; |
50 | MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); | 54 | MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); |
51 | module_param(full, int, 0); | 55 | module_param(full, int, 0); |
@@ -54,12 +58,16 @@ static const char procname[] = "tcpprobe"; | |||
54 | 58 | ||
55 | struct tcp_log { | 59 | struct tcp_log { |
56 | ktime_t tstamp; | 60 | ktime_t tstamp; |
57 | __be32 saddr, daddr; | 61 | union { |
58 | __be16 sport, dport; | 62 | struct sockaddr raw; |
63 | struct sockaddr_in v4; | ||
64 | struct sockaddr_in6 v6; | ||
65 | } src, dst; | ||
59 | u16 length; | 66 | u16 length; |
60 | u32 snd_nxt; | 67 | u32 snd_nxt; |
61 | u32 snd_una; | 68 | u32 snd_una; |
62 | u32 snd_wnd; | 69 | u32 snd_wnd; |
70 | u32 rcv_wnd; | ||
63 | u32 snd_cwnd; | 71 | u32 snd_cwnd; |
64 | u32 ssthresh; | 72 | u32 ssthresh; |
65 | u32 srtt; | 73 | u32 srtt; |
@@ -86,19 +94,45 @@ static inline int tcp_probe_avail(void) | |||
86 | return bufsize - tcp_probe_used() - 1; | 94 | return bufsize - tcp_probe_used() - 1; |
87 | } | 95 | } |
88 | 96 | ||
97 | #define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ | ||
98 | do { \ | ||
99 | si4.sin_family = AF_INET; \ | ||
100 | si4.sin_port = inet->inet_##mem##port; \ | ||
101 | si4.sin_addr.s_addr = inet->inet_##mem##addr; \ | ||
102 | } while (0) \ | ||
103 | |||
104 | #if IS_ENABLED(CONFIG_IPV6) | ||
105 | #define tcp_probe_copy_fl_to_si6(inet, si6, mem) \ | ||
106 | do { \ | ||
107 | struct ipv6_pinfo *pi6 = inet->pinet6; \ | ||
108 | si6.sin6_family = AF_INET6; \ | ||
109 | si6.sin6_port = inet->inet_##mem##port; \ | ||
110 | si6.sin6_addr = pi6->mem##addr; \ | ||
111 | si6.sin6_flowinfo = 0; /* No need here. */ \ | ||
112 | si6.sin6_scope_id = 0; /* No need here. */ \ | ||
113 | } while (0) | ||
114 | #else | ||
115 | #define tcp_probe_copy_fl_to_si6(fl, si6, mem) \ | ||
116 | do { \ | ||
117 | memset(&si6, 0, sizeof(si6)); \ | ||
118 | } while (0) | ||
119 | #endif | ||
120 | |||
89 | /* | 121 | /* |
90 | * Hook inserted to be called before each receive packet. | 122 | * Hook inserted to be called before each receive packet. |
91 | * Note: arguments must match tcp_rcv_established()! | 123 | * Note: arguments must match tcp_rcv_established()! |
92 | */ | 124 | */ |
93 | static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | 125 | static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, |
94 | struct tcphdr *th, unsigned int len) | 126 | const struct tcphdr *th, unsigned int len) |
95 | { | 127 | { |
96 | const struct tcp_sock *tp = tcp_sk(sk); | 128 | const struct tcp_sock *tp = tcp_sk(sk); |
97 | const struct inet_sock *inet = inet_sk(sk); | 129 | const struct inet_sock *inet = inet_sk(sk); |
98 | 130 | ||
99 | /* Only update if port matches */ | 131 | /* Only update if port or skb mark matches */ |
100 | if ((port == 0 || ntohs(inet->inet_dport) == port || | 132 | if (((port == 0 && fwmark == 0) || |
101 | ntohs(inet->inet_sport) == port) && | 133 | ntohs(inet->inet_dport) == port || |
134 | ntohs(inet->inet_sport) == port || | ||
135 | (fwmark > 0 && skb->mark == fwmark)) && | ||
102 | (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { | 136 | (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { |
103 | 137 | ||
104 | spin_lock(&tcp_probe.lock); | 138 | spin_lock(&tcp_probe.lock); |
@@ -107,15 +141,25 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
107 | struct tcp_log *p = tcp_probe.log + tcp_probe.head; | 141 | struct tcp_log *p = tcp_probe.log + tcp_probe.head; |
108 | 142 | ||
109 | p->tstamp = ktime_get(); | 143 | p->tstamp = ktime_get(); |
110 | p->saddr = inet->inet_saddr; | 144 | switch (sk->sk_family) { |
111 | p->sport = inet->inet_sport; | 145 | case AF_INET: |
112 | p->daddr = inet->inet_daddr; | 146 | tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); |
113 | p->dport = inet->inet_dport; | 147 | tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); |
148 | break; | ||
149 | case AF_INET6: | ||
150 | tcp_probe_copy_fl_to_si6(inet, p->src.v6, s); | ||
151 | tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d); | ||
152 | break; | ||
153 | default: | ||
154 | BUG(); | ||
155 | } | ||
156 | |||
114 | p->length = skb->len; | 157 | p->length = skb->len; |
115 | p->snd_nxt = tp->snd_nxt; | 158 | p->snd_nxt = tp->snd_nxt; |
116 | p->snd_una = tp->snd_una; | 159 | p->snd_una = tp->snd_una; |
117 | p->snd_cwnd = tp->snd_cwnd; | 160 | p->snd_cwnd = tp->snd_cwnd; |
118 | p->snd_wnd = tp->snd_wnd; | 161 | p->snd_wnd = tp->snd_wnd; |
162 | p->rcv_wnd = tp->rcv_wnd; | ||
119 | p->ssthresh = tcp_current_ssthresh(sk); | 163 | p->ssthresh = tcp_current_ssthresh(sk); |
120 | p->srtt = tp->srtt >> 3; | 164 | p->srtt = tp->srtt >> 3; |
121 | 165 | ||
@@ -128,7 +172,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
128 | } | 172 | } |
129 | 173 | ||
130 | jprobe_return(); | 174 | jprobe_return(); |
131 | return 0; | ||
132 | } | 175 | } |
133 | 176 | ||
134 | static struct jprobe tcp_jprobe = { | 177 | static struct jprobe tcp_jprobe = { |
@@ -157,13 +200,11 @@ static int tcpprobe_sprint(char *tbuf, int n) | |||
157 | = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); | 200 | = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); |
158 | 201 | ||
159 | return scnprintf(tbuf, n, | 202 | return scnprintf(tbuf, n, |
160 | "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", | 203 | "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", |
161 | (unsigned long) tv.tv_sec, | 204 | (unsigned long) tv.tv_sec, |
162 | (unsigned long) tv.tv_nsec, | 205 | (unsigned long) tv.tv_nsec, |
163 | &p->saddr, ntohs(p->sport), | 206 | &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, |
164 | &p->daddr, ntohs(p->dport), | 207 | p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); |
165 | p->length, p->snd_nxt, p->snd_una, | ||
166 | p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); | ||
167 | } | 208 | } |
168 | 209 | ||
169 | static ssize_t tcpprobe_read(struct file *file, char __user *buf, | 210 | static ssize_t tcpprobe_read(struct file *file, char __user *buf, |
@@ -176,7 +217,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, | |||
176 | return -EINVAL; | 217 | return -EINVAL; |
177 | 218 | ||
178 | while (cnt < len) { | 219 | while (cnt < len) { |
179 | char tbuf[164]; | 220 | char tbuf[256]; |
180 | int width; | 221 | int width; |
181 | 222 | ||
182 | /* Wait for data in buffer */ | 223 | /* Wait for data in buffer */ |
@@ -223,6 +264,13 @@ static __init int tcpprobe_init(void) | |||
223 | { | 264 | { |
224 | int ret = -ENOMEM; | 265 | int ret = -ENOMEM; |
225 | 266 | ||
267 | /* Warning: if the function signature of tcp_rcv_established, | ||
268 | * has been changed, you also have to change the signature of | ||
269 | * jtcp_rcv_established, otherwise you end up right here! | ||
270 | */ | ||
271 | BUILD_BUG_ON(__same_type(tcp_rcv_established, | ||
272 | jtcp_rcv_established) == 0); | ||
273 | |||
226 | init_waitqueue_head(&tcp_probe.wait); | 274 | init_waitqueue_head(&tcp_probe.wait); |
227 | spin_lock_init(&tcp_probe.lock); | 275 | spin_lock_init(&tcp_probe.lock); |
228 | 276 | ||
@@ -241,7 +289,8 @@ static __init int tcpprobe_init(void) | |||
241 | if (ret) | 289 | if (ret) |
242 | goto err1; | 290 | goto err1; |
243 | 291 | ||
244 | pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); | 292 | pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", |
293 | port, fwmark, bufsize); | ||
245 | return 0; | 294 | return 0; |
246 | err1: | 295 | err1: |
247 | remove_proc_entry(procname, init_net.proc_net); | 296 | remove_proc_entry(procname, init_net.proc_net); |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 766e6bab9113..74d2c95db57f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames); | |||
704 | * @src: source IP address | 704 | * @src: source IP address |
705 | * @dst: destination IP address | 705 | * @dst: destination IP address |
706 | */ | 706 | */ |
707 | static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) | 707 | void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) |
708 | { | 708 | { |
709 | struct udphdr *uh = udp_hdr(skb); | 709 | struct udphdr *uh = udp_hdr(skb); |
710 | struct sk_buff *frags = skb_shinfo(skb)->frag_list; | 710 | struct sk_buff *frags = skb_shinfo(skb)->frag_list; |
@@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) | |||
740 | uh->check = CSUM_MANGLED_0; | 740 | uh->check = CSUM_MANGLED_0; |
741 | } | 741 | } |
742 | } | 742 | } |
743 | EXPORT_SYMBOL_GPL(udp4_hwcsum); | ||
743 | 744 | ||
744 | static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) | 745 | static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) |
745 | { | 746 | { |
@@ -2158,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, | |||
2158 | __u16 srcp = ntohs(inet->inet_sport); | 2159 | __u16 srcp = ntohs(inet->inet_sport); |
2159 | 2160 | ||
2160 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" | 2161 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" |
2161 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", | 2162 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", |
2162 | bucket, src, srcp, dest, destp, sp->sk_state, | 2163 | bucket, src, srcp, dest, destp, sp->sk_state, |
2163 | sk_wmem_alloc_get(sp), | 2164 | sk_wmem_alloc_get(sp), |
2164 | sk_rmem_alloc_get(sp), | 2165 | sk_rmem_alloc_get(sp), |
@@ -2336,7 +2337,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | |||
2336 | uh->len = htons(skb->len - udp_offset); | 2337 | uh->len = htons(skb->len - udp_offset); |
2337 | 2338 | ||
2338 | /* csum segment if tunnel sets skb with csum. */ | 2339 | /* csum segment if tunnel sets skb with csum. */ |
2339 | if (unlikely(uh->check)) { | 2340 | if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) { |
2340 | struct iphdr *iph = ip_hdr(skb); | 2341 | struct iphdr *iph = ip_hdr(skb); |
2341 | 2342 | ||
2342 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, | 2343 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, |
@@ -2347,7 +2348,18 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | |||
2347 | if (uh->check == 0) | 2348 | if (uh->check == 0) |
2348 | uh->check = CSUM_MANGLED_0; | 2349 | uh->check = CSUM_MANGLED_0; |
2349 | 2350 | ||
2351 | } else if (protocol == htons(ETH_P_IPV6)) { | ||
2352 | struct ipv6hdr *ipv6h = ipv6_hdr(skb); | ||
2353 | u32 len = skb->len - udp_offset; | ||
2354 | |||
2355 | uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, | ||
2356 | len, IPPROTO_UDP, 0); | ||
2357 | uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0)); | ||
2358 | if (uh->check == 0) | ||
2359 | uh->check = CSUM_MANGLED_0; | ||
2360 | skb->ip_summed = CHECKSUM_NONE; | ||
2350 | } | 2361 | } |
2362 | |||
2351 | skb->protocol = protocol; | 2363 | skb->protocol = protocol; |
2352 | } while ((skb = skb->next)); | 2364 | } while ((skb = skb->next)); |
2353 | out: | 2365 | out: |
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 327a617d594c..baa0f63731fd 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c | |||
@@ -21,7 +21,6 @@ | |||
21 | static int xfrm4_tunnel_check_size(struct sk_buff *skb) | 21 | static int xfrm4_tunnel_check_size(struct sk_buff *skb) |
22 | { | 22 | { |
23 | int mtu, ret = 0; | 23 | int mtu, ret = 0; |
24 | struct dst_entry *dst; | ||
25 | 24 | ||
26 | if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) | 25 | if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) |
27 | goto out; | 26 | goto out; |
@@ -29,12 +28,10 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) | |||
29 | if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) | 28 | if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) |
30 | goto out; | 29 | goto out; |
31 | 30 | ||
32 | dst = skb_dst(skb); | 31 | mtu = dst_mtu(skb_dst(skb)); |
33 | mtu = dst_mtu(dst); | ||
34 | if (skb->len > mtu) { | 32 | if (skb->len > mtu) { |
35 | if (skb->sk) | 33 | if (skb->sk) |
36 | ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, | 34 | xfrm_local_error(skb, mtu); |
37 | inet_sk(skb->sk)->inet_dport, mtu); | ||
38 | else | 35 | else |
39 | icmp_send(skb, ICMP_DEST_UNREACH, | 36 | icmp_send(skb, ICMP_DEST_UNREACH, |
40 | ICMP_FRAG_NEEDED, htonl(mtu)); | 37 | ICMP_FRAG_NEEDED, htonl(mtu)); |
@@ -99,3 +96,12 @@ int xfrm4_output(struct sk_buff *skb) | |||
99 | x->outer_mode->afinfo->output_finish, | 96 | x->outer_mode->afinfo->output_finish, |
100 | !(IPCB(skb)->flags & IPSKB_REROUTED)); | 97 | !(IPCB(skb)->flags & IPSKB_REROUTED)); |
101 | } | 98 | } |
99 | |||
100 | void xfrm4_local_error(struct sk_buff *skb, u32 mtu) | ||
101 | { | ||
102 | struct iphdr *hdr; | ||
103 | |||
104 | hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); | ||
105 | ip_local_error(skb->sk, EMSGSIZE, hdr->daddr, | ||
106 | inet_sk(skb->sk)->inet_dport, mtu); | ||
107 | } | ||
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 9258e751baba..0b2a0641526a 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
@@ -83,6 +83,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { | |||
83 | .extract_input = xfrm4_extract_input, | 83 | .extract_input = xfrm4_extract_input, |
84 | .extract_output = xfrm4_extract_output, | 84 | .extract_output = xfrm4_extract_output, |
85 | .transport_finish = xfrm4_transport_finish, | 85 | .transport_finish = xfrm4_transport_finish, |
86 | .local_error = xfrm4_local_error, | ||
86 | }; | 87 | }; |
87 | 88 | ||
88 | void __init xfrm4_state_init(void) | 89 | void __init xfrm4_state_init(void) |