diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 17:54:29 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 17:54:29 -0400 |
| commit | cc998ff8811530be521f6b316f37ab7676a07938 (patch) | |
| tree | a054b3bf4b2ef406bf756a6cfc9be2f9115f17ae /net/ipv4 | |
| parent | 57d730924d5cc2c3e280af16a9306587c3a511db (diff) | |
| parent | 0d40f75bdab241868c0eb6f97aef9f8b3a66f7b3 (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:
"Noteworthy changes this time around:
1) Multicast rejoin support for team driver, from Jiri Pirko.
2) Centralize and simplify TCP RTT measurement handling in order to
reduce the impact of bad RTO seeding from SYN/ACKs. Also, when
both timestamps and local RTT measurements are available prefer
the latter because there are broken middleware devices which
scramble the timestamp.
From Yuchung Cheng.
3) Add TCP_NOTSENT_LOWAT socket option to limit the amount of kernel
memory consumed to queue up unsent user data. From Eric Dumazet.
4) Add a "physical port ID" abstraction for network devices, from
Jiri Pirko.
5) Add a "suppress" operation to influence fib_rules lookups, from
Stefan Tomanek.
6) Add a networking development FAQ, from Paul Gortmaker.
7) Extend the information provided by tcp_probe and add ipv6 support,
from Daniel Borkmann.
8) Use RCU locking more extensively in openvswitch data paths, from
Pravin B Shelar.
9) Add SCTP support to openvswitch, from Joe Stringer.
10) Add EF10 chip support to SFC driver, from Ben Hutchings.
11) Add new SYNPROXY netfilter target, from Patrick McHardy.
12) Compute a rate approximation for sending in TCP sockets, and use
this to more intelligently coalesce TSO frames. Furthermore, add
a new packet scheduler which takes advantage of this estimate when
available. From Eric Dumazet.
13) Allow AF_PACKET fanouts with random selection, from Daniel
Borkmann.
14) Add ipv6 support to vxlan driver, from Cong Wang"
Resolved conflicts as per discussion.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1218 commits)
openvswitch: Fix alignment of struct sw_flow_key.
netfilter: Fix build errors with xt_socket.c
tcp: Add missing braces to do_tcp_setsockopt
caif: Add missing braces to multiline if in cfctrl_linkup_request
bnx2x: Add missing braces in bnx2x:bnx2x_link_initialize
vxlan: Fix kernel panic on device delete.
net: mvneta: implement ->ndo_do_ioctl() to support PHY ioctls
net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
icplus: Use netif_running to determine device state
ethernet/arc/arc_emac: Fix huge delays in large file copies
tuntap: orphan frags before trying to set tx timestamp
tuntap: purge socket error queue on detach
qlcnic: use standard NAPI weights
ipv6:introduce function to find route for redirect
bnx2x: VF RSS support - VF side
bnx2x: VF RSS support - PF side
vxlan: Notify drivers for listening UDP port changes
net: usbnet: update addr_assign_type if appropriate
driver/net: enic: update enic maintainers and driver
driver/net: enic: Exposing symbols for Cisco's low latency driver
...
Diffstat (limited to 'net/ipv4')
34 files changed, 1069 insertions, 778 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 37cf1a6ea3ad..05c57f0fcabe 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
| @@ -259,22 +259,6 @@ config IP_PIMSM_V2 | |||
| 259 | gated-5). This routing protocol is not used widely, so say N unless | 259 | gated-5). This routing protocol is not used widely, so say N unless |
| 260 | you want to play with it. | 260 | you want to play with it. |
| 261 | 261 | ||
| 262 | config ARPD | ||
| 263 | bool "IP: ARP daemon support" | ||
| 264 | ---help--- | ||
| 265 | The kernel maintains an internal cache which maps IP addresses to | ||
| 266 | hardware addresses on the local network, so that Ethernet | ||
| 267 | frames are sent to the proper address on the physical networking | ||
| 268 | layer. Normally, kernel uses the ARP protocol to resolve these | ||
| 269 | mappings. | ||
| 270 | |||
| 271 | Saying Y here adds support to have an user space daemon to do this | ||
| 272 | resolution instead. This is useful for implementing an alternate | ||
| 273 | address resolution protocol (e.g. NHRP on mGRE tunnels) and also for | ||
| 274 | testing purposes. | ||
| 275 | |||
| 276 | If unsure, say N. | ||
| 277 | |||
| 278 | config SYN_COOKIES | 262 | config SYN_COOKIES |
| 279 | bool "IP: TCP syncookie support" | 263 | bool "IP: TCP syncookie support" |
| 280 | ---help--- | 264 | ---help--- |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b4d0be2b7ce9..7a1874b7b8fd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
| @@ -1532,18 +1532,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) | |||
| 1532 | } | 1532 | } |
| 1533 | EXPORT_SYMBOL_GPL(snmp_mib_init); | 1533 | EXPORT_SYMBOL_GPL(snmp_mib_init); |
| 1534 | 1534 | ||
| 1535 | void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) | ||
| 1536 | { | ||
| 1537 | int i; | ||
| 1538 | |||
| 1539 | BUG_ON(ptr == NULL); | ||
| 1540 | for (i = 0; i < SNMP_ARRAY_SZ; i++) { | ||
| 1541 | free_percpu(ptr[i]); | ||
| 1542 | ptr[i] = NULL; | ||
| 1543 | } | ||
| 1544 | } | ||
| 1545 | EXPORT_SYMBOL_GPL(snmp_mib_free); | ||
| 1546 | |||
| 1547 | #ifdef CONFIG_IP_MULTICAST | 1535 | #ifdef CONFIG_IP_MULTICAST |
| 1548 | static const struct net_protocol igmp_protocol = { | 1536 | static const struct net_protocol igmp_protocol = { |
| 1549 | .handler = igmp_rcv, | 1537 | .handler = igmp_rcv, |
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 4429b013f269..7808093cede6 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c | |||
| @@ -368,9 +368,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | |||
| 368 | } else { | 368 | } else { |
| 369 | probes -= neigh->parms->app_probes; | 369 | probes -= neigh->parms->app_probes; |
| 370 | if (probes < 0) { | 370 | if (probes < 0) { |
| 371 | #ifdef CONFIG_ARPD | ||
| 372 | neigh_app_ns(neigh); | 371 | neigh_app_ns(neigh); |
| 373 | #endif | ||
| 374 | return; | 372 | return; |
| 375 | } | 373 | } |
| 376 | } | 374 | } |
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 34ca6d5a3a4b..a1b5bcbd04ae 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
| @@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = { | |||
| 73 | [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, | 73 | [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, |
| 74 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, | 74 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, |
| 75 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, | 75 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, |
| 76 | [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, | ||
| 77 | [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, | ||
| 76 | }, | 78 | }, |
| 77 | }; | 79 | }; |
| 78 | 80 | ||
| @@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = { | |||
| 83 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, | 85 | [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, |
| 84 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, | 86 | [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, |
| 85 | [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, | 87 | [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, |
| 88 | [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, | ||
| 89 | [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, | ||
| 86 | }, | 90 | }, |
| 87 | }; | 91 | }; |
| 88 | 92 | ||
| @@ -1126,10 +1130,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) | |||
| 1126 | if (len < (int) sizeof(ifr)) | 1130 | if (len < (int) sizeof(ifr)) |
| 1127 | break; | 1131 | break; |
| 1128 | memset(&ifr, 0, sizeof(struct ifreq)); | 1132 | memset(&ifr, 0, sizeof(struct ifreq)); |
| 1129 | if (ifa->ifa_label) | 1133 | strcpy(ifr.ifr_name, ifa->ifa_label); |
| 1130 | strcpy(ifr.ifr_name, ifa->ifa_label); | ||
| 1131 | else | ||
| 1132 | strcpy(ifr.ifr_name, dev->name); | ||
| 1133 | 1134 | ||
| 1134 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; | 1135 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; |
| 1135 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = | 1136 | (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = |
| @@ -2097,11 +2098,15 @@ static struct devinet_sysctl_table { | |||
| 2097 | DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), | 2098 | DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), |
| 2098 | DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), | 2099 | DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), |
| 2099 | DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), | 2100 | DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), |
| 2101 | DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, | ||
| 2102 | "force_igmp_version"), | ||
| 2103 | DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, | ||
| 2104 | "igmpv2_unsolicited_report_interval"), | ||
| 2105 | DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, | ||
| 2106 | "igmpv3_unsolicited_report_interval"), | ||
| 2100 | 2107 | ||
| 2101 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), | 2108 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), |
| 2102 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), | 2109 | DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), |
| 2103 | DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, | ||
| 2104 | "force_igmp_version"), | ||
| 2105 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, | 2110 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, |
| 2106 | "promote_secondaries"), | 2111 | "promote_secondaries"), |
| 2107 | DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, | 2112 | DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, |
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 26aa65d1fce4..523be38e37de 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c | |||
| @@ -101,6 +101,30 @@ errout: | |||
| 101 | return err; | 101 | return err; |
| 102 | } | 102 | } |
| 103 | 103 | ||
| 104 | static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) | ||
| 105 | { | ||
| 106 | struct fib_result *result = (struct fib_result *) arg->result; | ||
| 107 | struct net_device *dev = result->fi->fib_dev; | ||
| 108 | |||
| 109 | /* do not accept result if the route does | ||
| 110 | * not meet the required prefix length | ||
| 111 | */ | ||
| 112 | if (result->prefixlen <= rule->suppress_prefixlen) | ||
| 113 | goto suppress_route; | ||
| 114 | |||
| 115 | /* do not accept result if the route uses a device | ||
| 116 | * belonging to a forbidden interface group | ||
| 117 | */ | ||
| 118 | if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) | ||
| 119 | goto suppress_route; | ||
| 120 | |||
| 121 | return false; | ||
| 122 | |||
| 123 | suppress_route: | ||
| 124 | if (!(arg->flags & FIB_LOOKUP_NOREF)) | ||
| 125 | fib_info_put(result->fi); | ||
| 126 | return true; | ||
| 127 | } | ||
| 104 | 128 | ||
| 105 | static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) | 129 | static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) |
| 106 | { | 130 | { |
| @@ -267,6 +291,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { | |||
| 267 | .rule_size = sizeof(struct fib4_rule), | 291 | .rule_size = sizeof(struct fib4_rule), |
| 268 | .addr_size = sizeof(u32), | 292 | .addr_size = sizeof(u32), |
| 269 | .action = fib4_rule_action, | 293 | .action = fib4_rule_action, |
| 294 | .suppress = fib4_rule_suppress, | ||
| 270 | .match = fib4_rule_match, | 295 | .match = fib4_rule_match, |
| 271 | .configure = fib4_rule_configure, | 296 | .configure = fib4_rule_configure, |
| 272 | .delete = fib4_rule_delete, | 297 | .delete = fib4_rule_delete, |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cd71190d2962..d6c0e64ec97f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
| @@ -88,6 +88,7 @@ | |||
| 88 | #include <linux/if_arp.h> | 88 | #include <linux/if_arp.h> |
| 89 | #include <linux/rtnetlink.h> | 89 | #include <linux/rtnetlink.h> |
| 90 | #include <linux/times.h> | 90 | #include <linux/times.h> |
| 91 | #include <linux/pkt_sched.h> | ||
| 91 | 92 | ||
| 92 | #include <net/net_namespace.h> | 93 | #include <net/net_namespace.h> |
| 93 | #include <net/arp.h> | 94 | #include <net/arp.h> |
| @@ -113,7 +114,8 @@ | |||
| 113 | 114 | ||
| 114 | #define IGMP_V1_Router_Present_Timeout (400*HZ) | 115 | #define IGMP_V1_Router_Present_Timeout (400*HZ) |
| 115 | #define IGMP_V2_Router_Present_Timeout (400*HZ) | 116 | #define IGMP_V2_Router_Present_Timeout (400*HZ) |
| 116 | #define IGMP_Unsolicited_Report_Interval (10*HZ) | 117 | #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) |
| 118 | #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) | ||
| 117 | #define IGMP_Query_Response_Interval (10*HZ) | 119 | #define IGMP_Query_Response_Interval (10*HZ) |
| 118 | #define IGMP_Unsolicited_Report_Count 2 | 120 | #define IGMP_Unsolicited_Report_Count 2 |
| 119 | 121 | ||
| @@ -138,6 +140,29 @@ | |||
| 138 | ((in_dev)->mr_v2_seen && \ | 140 | ((in_dev)->mr_v2_seen && \ |
| 139 | time_before(jiffies, (in_dev)->mr_v2_seen))) | 141 | time_before(jiffies, (in_dev)->mr_v2_seen))) |
| 140 | 142 | ||
| 143 | static int unsolicited_report_interval(struct in_device *in_dev) | ||
| 144 | { | ||
| 145 | int interval_ms, interval_jiffies; | ||
| 146 | |||
| 147 | if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) | ||
| 148 | interval_ms = IN_DEV_CONF_GET( | ||
| 149 | in_dev, | ||
| 150 | IGMPV2_UNSOLICITED_REPORT_INTERVAL); | ||
| 151 | else /* v3 */ | ||
| 152 | interval_ms = IN_DEV_CONF_GET( | ||
| 153 | in_dev, | ||
| 154 | IGMPV3_UNSOLICITED_REPORT_INTERVAL); | ||
| 155 | |||
| 156 | interval_jiffies = msecs_to_jiffies(interval_ms); | ||
| 157 | |||
| 158 | /* _timer functions can't handle a delay of 0 jiffies so ensure | ||
| 159 | * we always return a positive value. | ||
| 160 | */ | ||
| 161 | if (interval_jiffies <= 0) | ||
| 162 | interval_jiffies = 1; | ||
| 163 | return interval_jiffies; | ||
| 164 | } | ||
| 165 | |||
| 141 | static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); | 166 | static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); |
| 142 | static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); | 167 | static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); |
| 143 | static void igmpv3_clear_delrec(struct in_device *in_dev); | 168 | static void igmpv3_clear_delrec(struct in_device *in_dev); |
| @@ -315,6 +340,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
| 315 | if (size < 256) | 340 | if (size < 256) |
| 316 | return NULL; | 341 | return NULL; |
| 317 | } | 342 | } |
| 343 | skb->priority = TC_PRIO_CONTROL; | ||
| 318 | igmp_skb_size(skb) = size; | 344 | igmp_skb_size(skb) = size; |
| 319 | 345 | ||
| 320 | rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, | 346 | rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, |
| @@ -670,6 +696,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
| 670 | ip_rt_put(rt); | 696 | ip_rt_put(rt); |
| 671 | return -1; | 697 | return -1; |
| 672 | } | 698 | } |
| 699 | skb->priority = TC_PRIO_CONTROL; | ||
| 673 | 700 | ||
| 674 | skb_dst_set(skb, &rt->dst); | 701 | skb_dst_set(skb, &rt->dst); |
| 675 | 702 | ||
| @@ -719,7 +746,8 @@ static void igmp_ifc_timer_expire(unsigned long data) | |||
| 719 | igmpv3_send_cr(in_dev); | 746 | igmpv3_send_cr(in_dev); |
| 720 | if (in_dev->mr_ifc_count) { | 747 | if (in_dev->mr_ifc_count) { |
| 721 | in_dev->mr_ifc_count--; | 748 | in_dev->mr_ifc_count--; |
| 722 | igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); | 749 | igmp_ifc_start_timer(in_dev, |
| 750 | unsolicited_report_interval(in_dev)); | ||
| 723 | } | 751 | } |
| 724 | __in_dev_put(in_dev); | 752 | __in_dev_put(in_dev); |
| 725 | } | 753 | } |
| @@ -744,7 +772,7 @@ static void igmp_timer_expire(unsigned long data) | |||
| 744 | 772 | ||
| 745 | if (im->unsolicit_count) { | 773 | if (im->unsolicit_count) { |
| 746 | im->unsolicit_count--; | 774 | im->unsolicit_count--; |
| 747 | igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); | 775 | igmp_start_timer(im, unsolicited_report_interval(in_dev)); |
| 748 | } | 776 | } |
| 749 | im->reporter = 1; | 777 | im->reporter = 1; |
| 750 | spin_unlock(&im->lock); | 778 | spin_unlock(&im->lock); |
| @@ -1323,16 +1351,17 @@ out: | |||
| 1323 | EXPORT_SYMBOL(ip_mc_inc_group); | 1351 | EXPORT_SYMBOL(ip_mc_inc_group); |
| 1324 | 1352 | ||
| 1325 | /* | 1353 | /* |
| 1326 | * Resend IGMP JOIN report; used for bonding. | 1354 | * Resend IGMP JOIN report; used by netdev notifier. |
| 1327 | * Called with rcu_read_lock() | ||
| 1328 | */ | 1355 | */ |
| 1329 | void ip_mc_rejoin_groups(struct in_device *in_dev) | 1356 | static void ip_mc_rejoin_groups(struct in_device *in_dev) |
| 1330 | { | 1357 | { |
| 1331 | #ifdef CONFIG_IP_MULTICAST | 1358 | #ifdef CONFIG_IP_MULTICAST |
| 1332 | struct ip_mc_list *im; | 1359 | struct ip_mc_list *im; |
| 1333 | int type; | 1360 | int type; |
| 1334 | 1361 | ||
| 1335 | for_each_pmc_rcu(in_dev, im) { | 1362 | ASSERT_RTNL(); |
| 1363 | |||
| 1364 | for_each_pmc_rtnl(in_dev, im) { | ||
| 1336 | if (im->multiaddr == IGMP_ALL_HOSTS) | 1365 | if (im->multiaddr == IGMP_ALL_HOSTS) |
| 1337 | continue; | 1366 | continue; |
| 1338 | 1367 | ||
| @@ -1349,7 +1378,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev) | |||
| 1349 | } | 1378 | } |
| 1350 | #endif | 1379 | #endif |
| 1351 | } | 1380 | } |
| 1352 | EXPORT_SYMBOL(ip_mc_rejoin_groups); | ||
| 1353 | 1381 | ||
| 1354 | /* | 1382 | /* |
| 1355 | * A socket has left a multicast group on device dev | 1383 | * A socket has left a multicast group on device dev |
| @@ -2735,8 +2763,42 @@ static struct pernet_operations igmp_net_ops = { | |||
| 2735 | .exit = igmp_net_exit, | 2763 | .exit = igmp_net_exit, |
| 2736 | }; | 2764 | }; |
| 2737 | 2765 | ||
| 2766 | static int igmp_netdev_event(struct notifier_block *this, | ||
| 2767 | unsigned long event, void *ptr) | ||
| 2768 | { | ||
| 2769 | struct net_device *dev = netdev_notifier_info_to_dev(ptr); | ||
| 2770 | struct in_device *in_dev; | ||
| 2771 | |||
| 2772 | switch (event) { | ||
| 2773 | case NETDEV_RESEND_IGMP: | ||
| 2774 | in_dev = __in_dev_get_rtnl(dev); | ||
| 2775 | if (in_dev) | ||
| 2776 | ip_mc_rejoin_groups(in_dev); | ||
| 2777 | break; | ||
| 2778 | default: | ||
| 2779 | break; | ||
| 2780 | } | ||
| 2781 | return NOTIFY_DONE; | ||
| 2782 | } | ||
| 2783 | |||
| 2784 | static struct notifier_block igmp_notifier = { | ||
| 2785 | .notifier_call = igmp_netdev_event, | ||
| 2786 | }; | ||
| 2787 | |||
| 2738 | int __init igmp_mc_proc_init(void) | 2788 | int __init igmp_mc_proc_init(void) |
| 2739 | { | 2789 | { |
| 2740 | return register_pernet_subsys(&igmp_net_ops); | 2790 | int err; |
| 2791 | |||
| 2792 | err = register_pernet_subsys(&igmp_net_ops); | ||
| 2793 | if (err) | ||
| 2794 | return err; | ||
| 2795 | err = register_netdevice_notifier(&igmp_notifier); | ||
| 2796 | if (err) | ||
| 2797 | goto reg_notif_fail; | ||
| 2798 | return 0; | ||
| 2799 | |||
| 2800 | reg_notif_fail: | ||
| 2801 | unregister_pernet_subsys(&igmp_net_ops); | ||
| 2802 | return err; | ||
| 2741 | } | 2803 | } |
| 2742 | #endif | 2804 | #endif |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8d6939eeb492..d7aea4c5b940 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
| @@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net) | |||
| 534 | static void __net_exit ipgre_exit_net(struct net *net) | 534 | static void __net_exit ipgre_exit_net(struct net *net) |
| 535 | { | 535 | { |
| 536 | struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); | 536 | struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); |
| 537 | ip_tunnel_delete_net(itn); | 537 | ip_tunnel_delete_net(itn, &ipgre_link_ops); |
| 538 | } | 538 | } |
| 539 | 539 | ||
| 540 | static struct pernet_operations ipgre_net_ops = { | 540 | static struct pernet_operations ipgre_net_ops = { |
| @@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net) | |||
| 767 | static void __net_exit ipgre_tap_exit_net(struct net *net) | 767 | static void __net_exit ipgre_tap_exit_net(struct net *net) |
| 768 | { | 768 | { |
| 769 | struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); | 769 | struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); |
| 770 | ip_tunnel_delete_net(itn); | 770 | ip_tunnel_delete_net(itn, &ipgre_tap_ops); |
| 771 | } | 771 | } |
| 772 | 772 | ||
| 773 | static struct pernet_operations ipgre_tap_net_ops = { | 773 | static struct pernet_operations ipgre_tap_net_ops = { |
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 15e3e683adec..054a3e97d822 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
| @@ -141,6 +141,7 @@ | |||
| 141 | #include <net/icmp.h> | 141 | #include <net/icmp.h> |
| 142 | #include <net/raw.h> | 142 | #include <net/raw.h> |
| 143 | #include <net/checksum.h> | 143 | #include <net/checksum.h> |
| 144 | #include <net/inet_ecn.h> | ||
| 144 | #include <linux/netfilter_ipv4.h> | 145 | #include <linux/netfilter_ipv4.h> |
| 145 | #include <net/xfrm.h> | 146 | #include <net/xfrm.h> |
| 146 | #include <linux/mroute.h> | 147 | #include <linux/mroute.h> |
| @@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, | |||
| 410 | if (iph->ihl < 5 || iph->version != 4) | 411 | if (iph->ihl < 5 || iph->version != 4) |
| 411 | goto inhdr_error; | 412 | goto inhdr_error; |
| 412 | 413 | ||
| 414 | BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); | ||
| 415 | BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); | ||
| 416 | BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); | ||
| 417 | IP_ADD_STATS_BH(dev_net(dev), | ||
| 418 | IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), | ||
| 419 | max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); | ||
| 420 | |||
| 413 | if (!pskb_may_pull(skb, iph->ihl*4)) | 421 | if (!pskb_may_pull(skb, iph->ihl*4)) |
| 414 | goto inhdr_error; | 422 | goto inhdr_error; |
| 415 | 423 | ||
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ca1cb2d5f6e2..ac9fabe0300f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c | |||
| @@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) | |||
| 350 | struct flowi4 fl4; | 350 | struct flowi4 fl4; |
| 351 | struct rtable *rt; | 351 | struct rtable *rt; |
| 352 | 352 | ||
| 353 | rt = ip_route_output_tunnel(dev_net(dev), &fl4, | 353 | rt = ip_route_output_tunnel(tunnel->net, &fl4, |
| 354 | tunnel->parms.iph.protocol, | 354 | tunnel->parms.iph.protocol, |
| 355 | iph->daddr, iph->saddr, | 355 | iph->daddr, iph->saddr, |
| 356 | tunnel->parms.o_key, | 356 | tunnel->parms.o_key, |
| @@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) | |||
| 365 | } | 365 | } |
| 366 | 366 | ||
| 367 | if (!tdev && tunnel->parms.link) | 367 | if (!tdev && tunnel->parms.link) |
| 368 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | 368 | tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); |
| 369 | 369 | ||
| 370 | if (tdev) { | 370 | if (tdev) { |
| 371 | hlen = tdev->hard_header_len + tdev->needed_headroom; | 371 | hlen = tdev->hard_header_len + tdev->needed_headroom; |
| @@ -454,15 +454,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, | |||
| 454 | tstats->rx_bytes += skb->len; | 454 | tstats->rx_bytes += skb->len; |
| 455 | u64_stats_update_end(&tstats->syncp); | 455 | u64_stats_update_end(&tstats->syncp); |
| 456 | 456 | ||
| 457 | if (tunnel->net != dev_net(tunnel->dev)) | ||
| 458 | skb_scrub_packet(skb); | ||
| 459 | |||
| 460 | if (tunnel->dev->type == ARPHRD_ETHER) { | 457 | if (tunnel->dev->type == ARPHRD_ETHER) { |
| 461 | skb->protocol = eth_type_trans(skb, tunnel->dev); | 458 | skb->protocol = eth_type_trans(skb, tunnel->dev); |
| 462 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); | 459 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); |
| 463 | } else { | 460 | } else { |
| 464 | skb->dev = tunnel->dev; | 461 | skb->dev = tunnel->dev; |
| 465 | } | 462 | } |
| 463 | |||
| 464 | skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); | ||
| 465 | |||
| 466 | gro_cells_receive(&tunnel->gro_cells, skb); | 466 | gro_cells_receive(&tunnel->gro_cells, skb); |
| 467 | return 0; | 467 | return 0; |
| 468 | 468 | ||
| @@ -613,9 +613,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | |||
| 613 | goto tx_error; | 613 | goto tx_error; |
| 614 | } | 614 | } |
| 615 | 615 | ||
| 616 | if (tunnel->net != dev_net(dev)) | ||
| 617 | skb_scrub_packet(skb); | ||
| 618 | |||
| 619 | if (tunnel->err_count > 0) { | 616 | if (tunnel->err_count > 0) { |
| 620 | if (time_before(jiffies, | 617 | if (time_before(jiffies, |
| 621 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { | 618 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { |
| @@ -653,9 +650,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, | |||
| 653 | } | 650 | } |
| 654 | } | 651 | } |
| 655 | 652 | ||
| 656 | err = iptunnel_xmit(dev_net(dev), rt, skb, | 653 | err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, |
| 657 | fl4.saddr, fl4.daddr, protocol, | 654 | ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df, |
| 658 | ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df); | 655 | !net_eq(tunnel->net, dev_net(dev))); |
| 659 | iptunnel_xmit_stats(err, &dev->stats, dev->tstats); | 656 | iptunnel_xmit_stats(err, &dev->stats, dev->tstats); |
| 660 | 657 | ||
| 661 | return; | 658 | return; |
| @@ -820,11 +817,10 @@ static void ip_tunnel_dev_free(struct net_device *dev) | |||
| 820 | 817 | ||
| 821 | void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) | 818 | void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) |
| 822 | { | 819 | { |
| 823 | struct net *net = dev_net(dev); | ||
| 824 | struct ip_tunnel *tunnel = netdev_priv(dev); | 820 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 825 | struct ip_tunnel_net *itn; | 821 | struct ip_tunnel_net *itn; |
| 826 | 822 | ||
| 827 | itn = net_generic(net, tunnel->ip_tnl_net_id); | 823 | itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); |
| 828 | 824 | ||
| 829 | if (itn->fb_tunnel_dev != dev) { | 825 | if (itn->fb_tunnel_dev != dev) { |
| 830 | ip_tunnel_del(netdev_priv(dev)); | 826 | ip_tunnel_del(netdev_priv(dev)); |
| @@ -838,56 +834,68 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, | |||
| 838 | { | 834 | { |
| 839 | struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); | 835 | struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); |
| 840 | struct ip_tunnel_parm parms; | 836 | struct ip_tunnel_parm parms; |
| 837 | unsigned int i; | ||
| 841 | 838 | ||
| 842 | itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL); | 839 | for (i = 0; i < IP_TNL_HASH_SIZE; i++) |
| 843 | if (!itn->tunnels) | 840 | INIT_HLIST_HEAD(&itn->tunnels[i]); |
| 844 | return -ENOMEM; | ||
| 845 | 841 | ||
| 846 | if (!ops) { | 842 | if (!ops) { |
| 847 | itn->fb_tunnel_dev = NULL; | 843 | itn->fb_tunnel_dev = NULL; |
| 848 | return 0; | 844 | return 0; |
| 849 | } | 845 | } |
| 846 | |||
| 850 | memset(&parms, 0, sizeof(parms)); | 847 | memset(&parms, 0, sizeof(parms)); |
| 851 | if (devname) | 848 | if (devname) |
| 852 | strlcpy(parms.name, devname, IFNAMSIZ); | 849 | strlcpy(parms.name, devname, IFNAMSIZ); |
| 853 | 850 | ||
| 854 | rtnl_lock(); | 851 | rtnl_lock(); |
| 855 | itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); | 852 | itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); |
| 853 | /* FB netdevice is special: we have one, and only one per netns. | ||
| 854 | * Allowing to move it to another netns is clearly unsafe. | ||
| 855 | */ | ||
| 856 | if (!IS_ERR(itn->fb_tunnel_dev)) | ||
| 857 | itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; | ||
| 856 | rtnl_unlock(); | 858 | rtnl_unlock(); |
| 857 | if (IS_ERR(itn->fb_tunnel_dev)) { | ||
| 858 | kfree(itn->tunnels); | ||
| 859 | return PTR_ERR(itn->fb_tunnel_dev); | ||
| 860 | } | ||
| 861 | 859 | ||
| 862 | return 0; | 860 | return PTR_RET(itn->fb_tunnel_dev); |
| 863 | } | 861 | } |
| 864 | EXPORT_SYMBOL_GPL(ip_tunnel_init_net); | 862 | EXPORT_SYMBOL_GPL(ip_tunnel_init_net); |
| 865 | 863 | ||
| 866 | static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head) | 864 | static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, |
| 865 | struct rtnl_link_ops *ops) | ||
| 867 | { | 866 | { |
| 867 | struct net *net = dev_net(itn->fb_tunnel_dev); | ||
| 868 | struct net_device *dev, *aux; | ||
| 868 | int h; | 869 | int h; |
| 869 | 870 | ||
| 871 | for_each_netdev_safe(net, dev, aux) | ||
| 872 | if (dev->rtnl_link_ops == ops) | ||
| 873 | unregister_netdevice_queue(dev, head); | ||
| 874 | |||
| 870 | for (h = 0; h < IP_TNL_HASH_SIZE; h++) { | 875 | for (h = 0; h < IP_TNL_HASH_SIZE; h++) { |
| 871 | struct ip_tunnel *t; | 876 | struct ip_tunnel *t; |
| 872 | struct hlist_node *n; | 877 | struct hlist_node *n; |
| 873 | struct hlist_head *thead = &itn->tunnels[h]; | 878 | struct hlist_head *thead = &itn->tunnels[h]; |
| 874 | 879 | ||
| 875 | hlist_for_each_entry_safe(t, n, thead, hash_node) | 880 | hlist_for_each_entry_safe(t, n, thead, hash_node) |
| 876 | unregister_netdevice_queue(t->dev, head); | 881 | /* If dev is in the same netns, it has already |
| 882 | * been added to the list by the previous loop. | ||
| 883 | */ | ||
| 884 | if (!net_eq(dev_net(t->dev), net)) | ||
| 885 | unregister_netdevice_queue(t->dev, head); | ||
| 877 | } | 886 | } |
| 878 | if (itn->fb_tunnel_dev) | 887 | if (itn->fb_tunnel_dev) |
| 879 | unregister_netdevice_queue(itn->fb_tunnel_dev, head); | 888 | unregister_netdevice_queue(itn->fb_tunnel_dev, head); |
| 880 | } | 889 | } |
| 881 | 890 | ||
| 882 | void ip_tunnel_delete_net(struct ip_tunnel_net *itn) | 891 | void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) |
| 883 | { | 892 | { |
| 884 | LIST_HEAD(list); | 893 | LIST_HEAD(list); |
| 885 | 894 | ||
| 886 | rtnl_lock(); | 895 | rtnl_lock(); |
| 887 | ip_tunnel_destroy(itn, &list); | 896 | ip_tunnel_destroy(itn, &list, ops); |
| 888 | unregister_netdevice_many(&list); | 897 | unregister_netdevice_many(&list); |
| 889 | rtnl_unlock(); | 898 | rtnl_unlock(); |
| 890 | kfree(itn->tunnels); | ||
| 891 | } | 899 | } |
| 892 | EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); | 900 | EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); |
| 893 | 901 | ||
| @@ -929,23 +937,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink); | |||
| 929 | int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], | 937 | int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], |
| 930 | struct ip_tunnel_parm *p) | 938 | struct ip_tunnel_parm *p) |
| 931 | { | 939 | { |
| 932 | struct ip_tunnel *t, *nt; | 940 | struct ip_tunnel *t; |
| 933 | struct net *net = dev_net(dev); | ||
| 934 | struct ip_tunnel *tunnel = netdev_priv(dev); | 941 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 942 | struct net *net = tunnel->net; | ||
| 935 | struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); | 943 | struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); |
| 936 | 944 | ||
| 937 | if (dev == itn->fb_tunnel_dev) | 945 | if (dev == itn->fb_tunnel_dev) |
| 938 | return -EINVAL; | 946 | return -EINVAL; |
| 939 | 947 | ||
| 940 | nt = netdev_priv(dev); | ||
| 941 | |||
| 942 | t = ip_tunnel_find(itn, p, dev->type); | 948 | t = ip_tunnel_find(itn, p, dev->type); |
| 943 | 949 | ||
| 944 | if (t) { | 950 | if (t) { |
| 945 | if (t->dev != dev) | 951 | if (t->dev != dev) |
| 946 | return -EEXIST; | 952 | return -EEXIST; |
| 947 | } else { | 953 | } else { |
| 948 | t = nt; | 954 | t = tunnel; |
| 949 | 955 | ||
| 950 | if (dev->type != ARPHRD_ETHER) { | 956 | if (dev->type != ARPHRD_ETHER) { |
| 951 | unsigned int nflags = 0; | 957 | unsigned int nflags = 0; |
| @@ -984,6 +990,7 @@ int ip_tunnel_init(struct net_device *dev) | |||
| 984 | } | 990 | } |
| 985 | 991 | ||
| 986 | tunnel->dev = dev; | 992 | tunnel->dev = dev; |
| 993 | tunnel->net = dev_net(dev); | ||
| 987 | strcpy(tunnel->parms.name, dev->name); | 994 | strcpy(tunnel->parms.name, dev->name); |
| 988 | iph->version = 4; | 995 | iph->version = 4; |
| 989 | iph->ihl = 5; | 996 | iph->ihl = 5; |
| @@ -994,8 +1001,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init); | |||
| 994 | 1001 | ||
| 995 | void ip_tunnel_uninit(struct net_device *dev) | 1002 | void ip_tunnel_uninit(struct net_device *dev) |
| 996 | { | 1003 | { |
| 997 | struct net *net = dev_net(dev); | ||
| 998 | struct ip_tunnel *tunnel = netdev_priv(dev); | 1004 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 1005 | struct net *net = tunnel->net; | ||
| 999 | struct ip_tunnel_net *itn; | 1006 | struct ip_tunnel_net *itn; |
| 1000 | 1007 | ||
| 1001 | itn = net_generic(net, tunnel->ip_tnl_net_id); | 1008 | itn = net_generic(net, tunnel->ip_tnl_net_id); |
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 850525b34899..d6c856b17fd4 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c | |||
| @@ -46,19 +46,17 @@ | |||
| 46 | #include <net/netns/generic.h> | 46 | #include <net/netns/generic.h> |
| 47 | #include <net/rtnetlink.h> | 47 | #include <net/rtnetlink.h> |
| 48 | 48 | ||
| 49 | int iptunnel_xmit(struct net *net, struct rtable *rt, | 49 | int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, |
| 50 | struct sk_buff *skb, | ||
| 51 | __be32 src, __be32 dst, __u8 proto, | 50 | __be32 src, __be32 dst, __u8 proto, |
| 52 | __u8 tos, __u8 ttl, __be16 df) | 51 | __u8 tos, __u8 ttl, __be16 df, bool xnet) |
| 53 | { | 52 | { |
| 54 | int pkt_len = skb->len; | 53 | int pkt_len = skb->len; |
| 55 | struct iphdr *iph; | 54 | struct iphdr *iph; |
| 56 | int err; | 55 | int err; |
| 57 | 56 | ||
| 58 | nf_reset(skb); | 57 | skb_scrub_packet(skb, xnet); |
| 59 | secpath_reset(skb); | 58 | |
| 60 | skb->rxhash = 0; | 59 | skb->rxhash = 0; |
| 61 | skb_dst_drop(skb); | ||
| 62 | skb_dst_set(skb, &rt->dst); | 60 | skb_dst_set(skb, &rt->dst); |
| 63 | memset(IPCB(skb), 0, sizeof(*IPCB(skb))); | 61 | memset(IPCB(skb), 0, sizeof(*IPCB(skb))); |
| 64 | 62 | ||
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 17cc0ffa8c0d..e805e7b3030e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c | |||
| @@ -44,176 +44,10 @@ | |||
| 44 | #include <net/net_namespace.h> | 44 | #include <net/net_namespace.h> |
| 45 | #include <net/netns/generic.h> | 45 | #include <net/netns/generic.h> |
| 46 | 46 | ||
| 47 | #define HASH_SIZE 16 | ||
| 48 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1)) | ||
| 49 | |||
| 50 | static struct rtnl_link_ops vti_link_ops __read_mostly; | 47 | static struct rtnl_link_ops vti_link_ops __read_mostly; |
| 51 | 48 | ||
| 52 | static int vti_net_id __read_mostly; | 49 | static int vti_net_id __read_mostly; |
| 53 | struct vti_net { | ||
| 54 | struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; | ||
| 55 | struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; | ||
| 56 | struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; | ||
| 57 | struct ip_tunnel __rcu *tunnels_wc[1]; | ||
| 58 | struct ip_tunnel __rcu **tunnels[4]; | ||
| 59 | |||
| 60 | struct net_device *fb_tunnel_dev; | ||
| 61 | }; | ||
| 62 | |||
| 63 | static int vti_fb_tunnel_init(struct net_device *dev); | ||
| 64 | static int vti_tunnel_init(struct net_device *dev); | 50 | static int vti_tunnel_init(struct net_device *dev); |
| 65 | static void vti_tunnel_setup(struct net_device *dev); | ||
| 66 | static void vti_dev_free(struct net_device *dev); | ||
| 67 | static int vti_tunnel_bind_dev(struct net_device *dev); | ||
| 68 | |||
| 69 | #define VTI_XMIT(stats1, stats2) do { \ | ||
| 70 | int err; \ | ||
| 71 | int pkt_len = skb->len; \ | ||
| 72 | err = dst_output(skb); \ | ||
| 73 | if (net_xmit_eval(err) == 0) { \ | ||
| 74 | u64_stats_update_begin(&(stats1)->syncp); \ | ||
| 75 | (stats1)->tx_bytes += pkt_len; \ | ||
| 76 | (stats1)->tx_packets++; \ | ||
| 77 | u64_stats_update_end(&(stats1)->syncp); \ | ||
| 78 | } else { \ | ||
| 79 | (stats2)->tx_errors++; \ | ||
| 80 | (stats2)->tx_aborted_errors++; \ | ||
| 81 | } \ | ||
| 82 | } while (0) | ||
| 83 | |||
| 84 | |||
| 85 | static struct ip_tunnel *vti_tunnel_lookup(struct net *net, | ||
| 86 | __be32 remote, __be32 local) | ||
| 87 | { | ||
| 88 | unsigned h0 = HASH(remote); | ||
| 89 | unsigned h1 = HASH(local); | ||
| 90 | struct ip_tunnel *t; | ||
| 91 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 92 | |||
| 93 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) | ||
| 94 | if (local == t->parms.iph.saddr && | ||
| 95 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
| 96 | return t; | ||
| 97 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) | ||
| 98 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
| 99 | return t; | ||
| 100 | |||
| 101 | for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) | ||
| 102 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | ||
| 103 | return t; | ||
| 104 | |||
| 105 | for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0]) | ||
| 106 | if (t && (t->dev->flags&IFF_UP)) | ||
| 107 | return t; | ||
| 108 | return NULL; | ||
| 109 | } | ||
| 110 | |||
| 111 | static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn, | ||
| 112 | struct ip_tunnel_parm *parms) | ||
| 113 | { | ||
| 114 | __be32 remote = parms->iph.daddr; | ||
| 115 | __be32 local = parms->iph.saddr; | ||
| 116 | unsigned h = 0; | ||
| 117 | int prio = 0; | ||
| 118 | |||
| 119 | if (remote) { | ||
| 120 | prio |= 2; | ||
| 121 | h ^= HASH(remote); | ||
| 122 | } | ||
| 123 | if (local) { | ||
| 124 | prio |= 1; | ||
| 125 | h ^= HASH(local); | ||
| 126 | } | ||
| 127 | return &ipn->tunnels[prio][h]; | ||
| 128 | } | ||
| 129 | |||
| 130 | static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn, | ||
| 131 | struct ip_tunnel *t) | ||
| 132 | { | ||
| 133 | return __vti_bucket(ipn, &t->parms); | ||
| 134 | } | ||
| 135 | |||
| 136 | static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t) | ||
| 137 | { | ||
| 138 | struct ip_tunnel __rcu **tp; | ||
| 139 | struct ip_tunnel *iter; | ||
| 140 | |||
| 141 | for (tp = vti_bucket(ipn, t); | ||
| 142 | (iter = rtnl_dereference(*tp)) != NULL; | ||
| 143 | tp = &iter->next) { | ||
| 144 | if (t == iter) { | ||
| 145 | rcu_assign_pointer(*tp, t->next); | ||
| 146 | break; | ||
| 147 | } | ||
| 148 | } | ||
| 149 | } | ||
| 150 | |||
| 151 | static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t) | ||
| 152 | { | ||
| 153 | struct ip_tunnel __rcu **tp = vti_bucket(ipn, t); | ||
| 154 | |||
| 155 | rcu_assign_pointer(t->next, rtnl_dereference(*tp)); | ||
| 156 | rcu_assign_pointer(*tp, t); | ||
| 157 | } | ||
| 158 | |||
| 159 | static struct ip_tunnel *vti_tunnel_locate(struct net *net, | ||
| 160 | struct ip_tunnel_parm *parms, | ||
| 161 | int create) | ||
| 162 | { | ||
| 163 | __be32 remote = parms->iph.daddr; | ||
| 164 | __be32 local = parms->iph.saddr; | ||
| 165 | struct ip_tunnel *t, *nt; | ||
| 166 | struct ip_tunnel __rcu **tp; | ||
| 167 | struct net_device *dev; | ||
| 168 | char name[IFNAMSIZ]; | ||
| 169 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 170 | |||
| 171 | for (tp = __vti_bucket(ipn, parms); | ||
| 172 | (t = rtnl_dereference(*tp)) != NULL; | ||
| 173 | tp = &t->next) { | ||
| 174 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | ||
| 175 | return t; | ||
| 176 | } | ||
| 177 | if (!create) | ||
| 178 | return NULL; | ||
| 179 | |||
| 180 | if (parms->name[0]) | ||
| 181 | strlcpy(name, parms->name, IFNAMSIZ); | ||
| 182 | else | ||
| 183 | strcpy(name, "vti%d"); | ||
| 184 | |||
| 185 | dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup); | ||
| 186 | if (dev == NULL) | ||
| 187 | return NULL; | ||
| 188 | |||
| 189 | dev_net_set(dev, net); | ||
| 190 | |||
| 191 | nt = netdev_priv(dev); | ||
| 192 | nt->parms = *parms; | ||
| 193 | dev->rtnl_link_ops = &vti_link_ops; | ||
| 194 | |||
| 195 | vti_tunnel_bind_dev(dev); | ||
| 196 | |||
| 197 | if (register_netdevice(dev) < 0) | ||
| 198 | goto failed_free; | ||
| 199 | |||
| 200 | dev_hold(dev); | ||
| 201 | vti_tunnel_link(ipn, nt); | ||
| 202 | return nt; | ||
| 203 | |||
| 204 | failed_free: | ||
| 205 | free_netdev(dev); | ||
| 206 | return NULL; | ||
| 207 | } | ||
| 208 | |||
| 209 | static void vti_tunnel_uninit(struct net_device *dev) | ||
| 210 | { | ||
| 211 | struct net *net = dev_net(dev); | ||
| 212 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 213 | |||
| 214 | vti_tunnel_unlink(ipn, netdev_priv(dev)); | ||
| 215 | dev_put(dev); | ||
| 216 | } | ||
| 217 | 51 | ||
| 218 | static int vti_err(struct sk_buff *skb, u32 info) | 52 | static int vti_err(struct sk_buff *skb, u32 info) |
| 219 | { | 53 | { |
| @@ -222,6 +56,8 @@ static int vti_err(struct sk_buff *skb, u32 info) | |||
| 222 | * 8 bytes of packet payload. It means, that precise relaying of | 56 | * 8 bytes of packet payload. It means, that precise relaying of |
| 223 | * ICMP in the real Internet is absolutely infeasible. | 57 | * ICMP in the real Internet is absolutely infeasible. |
| 224 | */ | 58 | */ |
| 59 | struct net *net = dev_net(skb->dev); | ||
| 60 | struct ip_tunnel_net *itn = net_generic(net, vti_net_id); | ||
| 225 | struct iphdr *iph = (struct iphdr *)skb->data; | 61 | struct iphdr *iph = (struct iphdr *)skb->data; |
| 226 | const int type = icmp_hdr(skb)->type; | 62 | const int type = icmp_hdr(skb)->type; |
| 227 | const int code = icmp_hdr(skb)->code; | 63 | const int code = icmp_hdr(skb)->code; |
| @@ -252,7 +88,8 @@ static int vti_err(struct sk_buff *skb, u32 info) | |||
| 252 | 88 | ||
| 253 | err = -ENOENT; | 89 | err = -ENOENT; |
| 254 | 90 | ||
| 255 | t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | 91 | t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, |
| 92 | iph->daddr, iph->saddr, 0); | ||
| 256 | if (t == NULL) | 93 | if (t == NULL) |
| 257 | goto out; | 94 | goto out; |
| 258 | 95 | ||
| @@ -281,8 +118,11 @@ static int vti_rcv(struct sk_buff *skb) | |||
| 281 | { | 118 | { |
| 282 | struct ip_tunnel *tunnel; | 119 | struct ip_tunnel *tunnel; |
| 283 | const struct iphdr *iph = ip_hdr(skb); | 120 | const struct iphdr *iph = ip_hdr(skb); |
| 121 | struct net *net = dev_net(skb->dev); | ||
| 122 | struct ip_tunnel_net *itn = net_generic(net, vti_net_id); | ||
| 284 | 123 | ||
| 285 | tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); | 124 | tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, |
| 125 | iph->saddr, iph->daddr, 0); | ||
| 286 | if (tunnel != NULL) { | 126 | if (tunnel != NULL) { |
| 287 | struct pcpu_tstats *tstats; | 127 | struct pcpu_tstats *tstats; |
| 288 | 128 | ||
| @@ -311,7 +151,6 @@ static int vti_rcv(struct sk_buff *skb) | |||
| 311 | static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 151 | static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
| 312 | { | 152 | { |
| 313 | struct ip_tunnel *tunnel = netdev_priv(dev); | 153 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 314 | struct pcpu_tstats *tstats; | ||
| 315 | struct iphdr *tiph = &tunnel->parms.iph; | 154 | struct iphdr *tiph = &tunnel->parms.iph; |
| 316 | u8 tos; | 155 | u8 tos; |
| 317 | struct rtable *rt; /* Route to the other host */ | 156 | struct rtable *rt; /* Route to the other host */ |
| @@ -319,6 +158,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 319 | struct iphdr *old_iph = ip_hdr(skb); | 158 | struct iphdr *old_iph = ip_hdr(skb); |
| 320 | __be32 dst = tiph->daddr; | 159 | __be32 dst = tiph->daddr; |
| 321 | struct flowi4 fl4; | 160 | struct flowi4 fl4; |
| 161 | int err; | ||
| 322 | 162 | ||
| 323 | if (skb->protocol != htons(ETH_P_IP)) | 163 | if (skb->protocol != htons(ETH_P_IP)) |
| 324 | goto tx_error; | 164 | goto tx_error; |
| @@ -367,8 +207,10 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 367 | nf_reset(skb); | 207 | nf_reset(skb); |
| 368 | skb->dev = skb_dst(skb)->dev; | 208 | skb->dev = skb_dst(skb)->dev; |
| 369 | 209 | ||
| 370 | tstats = this_cpu_ptr(dev->tstats); | 210 | err = dst_output(skb); |
| 371 | VTI_XMIT(tstats, &dev->stats); | 211 | if (net_xmit_eval(err) == 0) |
| 212 | err = skb->len; | ||
| 213 | iptunnel_xmit_stats(err, &dev->stats, dev->tstats); | ||
| 372 | return NETDEV_TX_OK; | 214 | return NETDEV_TX_OK; |
| 373 | 215 | ||
| 374 | tx_error_icmp: | 216 | tx_error_icmp: |
| @@ -379,198 +221,57 @@ tx_error: | |||
| 379 | return NETDEV_TX_OK; | 221 | return NETDEV_TX_OK; |
| 380 | } | 222 | } |
| 381 | 223 | ||
| 382 | static int vti_tunnel_bind_dev(struct net_device *dev) | ||
| 383 | { | ||
| 384 | struct net_device *tdev = NULL; | ||
| 385 | struct ip_tunnel *tunnel; | ||
| 386 | struct iphdr *iph; | ||
| 387 | |||
| 388 | tunnel = netdev_priv(dev); | ||
| 389 | iph = &tunnel->parms.iph; | ||
| 390 | |||
| 391 | if (iph->daddr) { | ||
| 392 | struct rtable *rt; | ||
| 393 | struct flowi4 fl4; | ||
| 394 | memset(&fl4, 0, sizeof(fl4)); | ||
| 395 | flowi4_init_output(&fl4, tunnel->parms.link, | ||
| 396 | be32_to_cpu(tunnel->parms.i_key), | ||
| 397 | RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, | ||
| 398 | IPPROTO_IPIP, 0, | ||
| 399 | iph->daddr, iph->saddr, 0, 0); | ||
| 400 | rt = ip_route_output_key(dev_net(dev), &fl4); | ||
| 401 | if (!IS_ERR(rt)) { | ||
| 402 | tdev = rt->dst.dev; | ||
| 403 | ip_rt_put(rt); | ||
| 404 | } | ||
| 405 | dev->flags |= IFF_POINTOPOINT; | ||
| 406 | } | ||
| 407 | |||
| 408 | if (!tdev && tunnel->parms.link) | ||
| 409 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | ||
| 410 | |||
| 411 | if (tdev) { | ||
| 412 | dev->hard_header_len = tdev->hard_header_len + | ||
| 413 | sizeof(struct iphdr); | ||
| 414 | dev->mtu = tdev->mtu; | ||
| 415 | } | ||
| 416 | dev->iflink = tunnel->parms.link; | ||
| 417 | return dev->mtu; | ||
| 418 | } | ||
| 419 | |||
| 420 | static int | 224 | static int |
| 421 | vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) | 225 | vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) |
| 422 | { | 226 | { |
| 423 | int err = 0; | 227 | int err = 0; |
| 424 | struct ip_tunnel_parm p; | 228 | struct ip_tunnel_parm p; |
| 425 | struct ip_tunnel *t; | ||
| 426 | struct net *net = dev_net(dev); | ||
| 427 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 428 | |||
| 429 | switch (cmd) { | ||
| 430 | case SIOCGETTUNNEL: | ||
| 431 | t = NULL; | ||
| 432 | if (dev == ipn->fb_tunnel_dev) { | ||
| 433 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
| 434 | sizeof(p))) { | ||
| 435 | err = -EFAULT; | ||
| 436 | break; | ||
| 437 | } | ||
| 438 | t = vti_tunnel_locate(net, &p, 0); | ||
| 439 | } | ||
| 440 | if (t == NULL) | ||
| 441 | t = netdev_priv(dev); | ||
| 442 | memcpy(&p, &t->parms, sizeof(p)); | ||
| 443 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
| 444 | p.o_flags |= GRE_KEY; | ||
| 445 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | ||
| 446 | err = -EFAULT; | ||
| 447 | break; | ||
| 448 | |||
| 449 | case SIOCADDTUNNEL: | ||
| 450 | case SIOCCHGTUNNEL: | ||
| 451 | err = -EPERM; | ||
| 452 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
| 453 | goto done; | ||
| 454 | 229 | ||
| 455 | err = -EFAULT; | 230 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) |
| 456 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | 231 | return -EFAULT; |
| 457 | goto done; | ||
| 458 | 232 | ||
| 459 | err = -EINVAL; | 233 | if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { |
| 460 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || | 234 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || |
| 461 | p.iph.ihl != 5) | 235 | p.iph.ihl != 5) |
| 462 | goto done; | 236 | return -EINVAL; |
| 463 | 237 | } | |
| 464 | t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | ||
| 465 | |||
| 466 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | ||
| 467 | if (t != NULL) { | ||
| 468 | if (t->dev != dev) { | ||
| 469 | err = -EEXIST; | ||
| 470 | break; | ||
| 471 | } | ||
| 472 | } else { | ||
| 473 | if (((dev->flags&IFF_POINTOPOINT) && | ||
| 474 | !p.iph.daddr) || | ||
| 475 | (!(dev->flags&IFF_POINTOPOINT) && | ||
| 476 | p.iph.daddr)) { | ||
| 477 | err = -EINVAL; | ||
| 478 | break; | ||
| 479 | } | ||
| 480 | t = netdev_priv(dev); | ||
| 481 | vti_tunnel_unlink(ipn, t); | ||
| 482 | synchronize_net(); | ||
| 483 | t->parms.iph.saddr = p.iph.saddr; | ||
| 484 | t->parms.iph.daddr = p.iph.daddr; | ||
| 485 | t->parms.i_key = p.i_key; | ||
| 486 | t->parms.o_key = p.o_key; | ||
| 487 | t->parms.iph.protocol = IPPROTO_IPIP; | ||
| 488 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
| 489 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
| 490 | vti_tunnel_link(ipn, t); | ||
| 491 | netdev_state_change(dev); | ||
| 492 | } | ||
| 493 | } | ||
| 494 | |||
| 495 | if (t) { | ||
| 496 | err = 0; | ||
| 497 | if (cmd == SIOCCHGTUNNEL) { | ||
| 498 | t->parms.i_key = p.i_key; | ||
| 499 | t->parms.o_key = p.o_key; | ||
| 500 | if (t->parms.link != p.link) { | ||
| 501 | t->parms.link = p.link; | ||
| 502 | vti_tunnel_bind_dev(dev); | ||
| 503 | netdev_state_change(dev); | ||
| 504 | } | ||
| 505 | } | ||
| 506 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
| 507 | p.o_flags |= GRE_KEY; | ||
| 508 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, | ||
| 509 | sizeof(p))) | ||
| 510 | err = -EFAULT; | ||
| 511 | } else | ||
| 512 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | ||
| 513 | break; | ||
| 514 | 238 | ||
| 515 | case SIOCDELTUNNEL: | 239 | err = ip_tunnel_ioctl(dev, &p, cmd); |
| 516 | err = -EPERM; | 240 | if (err) |
| 517 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 241 | return err; |
| 518 | goto done; | ||
| 519 | |||
| 520 | if (dev == ipn->fb_tunnel_dev) { | ||
| 521 | err = -EFAULT; | ||
| 522 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
| 523 | sizeof(p))) | ||
| 524 | goto done; | ||
| 525 | err = -ENOENT; | ||
| 526 | |||
| 527 | t = vti_tunnel_locate(net, &p, 0); | ||
| 528 | if (t == NULL) | ||
| 529 | goto done; | ||
| 530 | err = -EPERM; | ||
| 531 | if (t->dev == ipn->fb_tunnel_dev) | ||
| 532 | goto done; | ||
| 533 | dev = t->dev; | ||
| 534 | } | ||
| 535 | unregister_netdevice(dev); | ||
| 536 | err = 0; | ||
| 537 | break; | ||
| 538 | 242 | ||
| 539 | default: | 243 | if (cmd != SIOCDELTUNNEL) { |
| 540 | err = -EINVAL; | 244 | p.i_flags |= GRE_KEY | VTI_ISVTI; |
| 245 | p.o_flags |= GRE_KEY; | ||
| 541 | } | 246 | } |
| 542 | 247 | ||
| 543 | done: | 248 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) |
| 544 | return err; | 249 | return -EFAULT; |
| 545 | } | ||
| 546 | |||
| 547 | static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu) | ||
| 548 | { | ||
| 549 | if (new_mtu < 68 || new_mtu > 0xFFF8) | ||
| 550 | return -EINVAL; | ||
| 551 | dev->mtu = new_mtu; | ||
| 552 | return 0; | 250 | return 0; |
| 553 | } | 251 | } |
| 554 | 252 | ||
| 555 | static const struct net_device_ops vti_netdev_ops = { | 253 | static const struct net_device_ops vti_netdev_ops = { |
| 556 | .ndo_init = vti_tunnel_init, | 254 | .ndo_init = vti_tunnel_init, |
| 557 | .ndo_uninit = vti_tunnel_uninit, | 255 | .ndo_uninit = ip_tunnel_uninit, |
| 558 | .ndo_start_xmit = vti_tunnel_xmit, | 256 | .ndo_start_xmit = vti_tunnel_xmit, |
| 559 | .ndo_do_ioctl = vti_tunnel_ioctl, | 257 | .ndo_do_ioctl = vti_tunnel_ioctl, |
| 560 | .ndo_change_mtu = vti_tunnel_change_mtu, | 258 | .ndo_change_mtu = ip_tunnel_change_mtu, |
| 561 | .ndo_get_stats64 = ip_tunnel_get_stats64, | 259 | .ndo_get_stats64 = ip_tunnel_get_stats64, |
| 562 | }; | 260 | }; |
| 563 | 261 | ||
| 564 | static void vti_dev_free(struct net_device *dev) | 262 | static void vti_tunnel_setup(struct net_device *dev) |
| 565 | { | 263 | { |
| 566 | free_percpu(dev->tstats); | 264 | dev->netdev_ops = &vti_netdev_ops; |
| 567 | free_netdev(dev); | 265 | ip_tunnel_setup(dev, vti_net_id); |
| 568 | } | 266 | } |
| 569 | 267 | ||
| 570 | static void vti_tunnel_setup(struct net_device *dev) | 268 | static int vti_tunnel_init(struct net_device *dev) |
| 571 | { | 269 | { |
| 572 | dev->netdev_ops = &vti_netdev_ops; | 270 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 573 | dev->destructor = vti_dev_free; | 271 | struct iphdr *iph = &tunnel->parms.iph; |
| 272 | |||
| 273 | memcpy(dev->dev_addr, &iph->saddr, 4); | ||
| 274 | memcpy(dev->broadcast, &iph->daddr, 4); | ||
| 574 | 275 | ||
| 575 | dev->type = ARPHRD_TUNNEL; | 276 | dev->type = ARPHRD_TUNNEL; |
| 576 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | 277 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); |
| @@ -581,38 +282,18 @@ static void vti_tunnel_setup(struct net_device *dev) | |||
| 581 | dev->features |= NETIF_F_NETNS_LOCAL; | 282 | dev->features |= NETIF_F_NETNS_LOCAL; |
| 582 | dev->features |= NETIF_F_LLTX; | 283 | dev->features |= NETIF_F_LLTX; |
| 583 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 284 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
| 584 | } | ||
| 585 | 285 | ||
| 586 | static int vti_tunnel_init(struct net_device *dev) | 286 | return ip_tunnel_init(dev); |
| 587 | { | ||
| 588 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
| 589 | |||
| 590 | tunnel->dev = dev; | ||
| 591 | strcpy(tunnel->parms.name, dev->name); | ||
| 592 | |||
| 593 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | ||
| 594 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | ||
| 595 | |||
| 596 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
| 597 | if (!dev->tstats) | ||
| 598 | return -ENOMEM; | ||
| 599 | |||
| 600 | return 0; | ||
| 601 | } | 287 | } |
| 602 | 288 | ||
| 603 | static int __net_init vti_fb_tunnel_init(struct net_device *dev) | 289 | static void __net_init vti_fb_tunnel_init(struct net_device *dev) |
| 604 | { | 290 | { |
| 605 | struct ip_tunnel *tunnel = netdev_priv(dev); | 291 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 606 | struct iphdr *iph = &tunnel->parms.iph; | 292 | struct iphdr *iph = &tunnel->parms.iph; |
| 607 | struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id); | ||
| 608 | 293 | ||
| 609 | iph->version = 4; | 294 | iph->version = 4; |
| 610 | iph->protocol = IPPROTO_IPIP; | 295 | iph->protocol = IPPROTO_IPIP; |
| 611 | iph->ihl = 5; | 296 | iph->ihl = 5; |
| 612 | |||
| 613 | dev_hold(dev); | ||
| 614 | rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); | ||
| 615 | return 0; | ||
| 616 | } | 297 | } |
| 617 | 298 | ||
| 618 | static struct xfrm_tunnel vti_handler __read_mostly = { | 299 | static struct xfrm_tunnel vti_handler __read_mostly = { |
| @@ -621,76 +302,30 @@ static struct xfrm_tunnel vti_handler __read_mostly = { | |||
| 621 | .priority = 1, | 302 | .priority = 1, |
| 622 | }; | 303 | }; |
| 623 | 304 | ||
| 624 | static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head) | ||
| 625 | { | ||
| 626 | int prio; | ||
| 627 | |||
| 628 | for (prio = 1; prio < 4; prio++) { | ||
| 629 | int h; | ||
| 630 | for (h = 0; h < HASH_SIZE; h++) { | ||
| 631 | struct ip_tunnel *t; | ||
| 632 | |||
| 633 | t = rtnl_dereference(ipn->tunnels[prio][h]); | ||
| 634 | while (t != NULL) { | ||
| 635 | unregister_netdevice_queue(t->dev, head); | ||
| 636 | t = rtnl_dereference(t->next); | ||
| 637 | } | ||
| 638 | } | ||
| 639 | } | ||
| 640 | } | ||
| 641 | |||
| 642 | static int __net_init vti_init_net(struct net *net) | 305 | static int __net_init vti_init_net(struct net *net) |
| 643 | { | 306 | { |
| 644 | int err; | 307 | int err; |
| 645 | struct vti_net *ipn = net_generic(net, vti_net_id); | 308 | struct ip_tunnel_net *itn; |
| 646 | |||
| 647 | ipn->tunnels[0] = ipn->tunnels_wc; | ||
| 648 | ipn->tunnels[1] = ipn->tunnels_l; | ||
| 649 | ipn->tunnels[2] = ipn->tunnels_r; | ||
| 650 | ipn->tunnels[3] = ipn->tunnels_r_l; | ||
| 651 | |||
| 652 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), | ||
| 653 | "ip_vti0", | ||
| 654 | vti_tunnel_setup); | ||
| 655 | if (!ipn->fb_tunnel_dev) { | ||
| 656 | err = -ENOMEM; | ||
| 657 | goto err_alloc_dev; | ||
| 658 | } | ||
| 659 | dev_net_set(ipn->fb_tunnel_dev, net); | ||
| 660 | |||
| 661 | err = vti_fb_tunnel_init(ipn->fb_tunnel_dev); | ||
| 662 | if (err) | ||
| 663 | goto err_reg_dev; | ||
| 664 | ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops; | ||
| 665 | 309 | ||
| 666 | err = register_netdev(ipn->fb_tunnel_dev); | 310 | err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); |
| 667 | if (err) | 311 | if (err) |
| 668 | goto err_reg_dev; | 312 | return err; |
| 313 | itn = net_generic(net, vti_net_id); | ||
| 314 | vti_fb_tunnel_init(itn->fb_tunnel_dev); | ||
| 669 | return 0; | 315 | return 0; |
| 670 | |||
| 671 | err_reg_dev: | ||
| 672 | vti_dev_free(ipn->fb_tunnel_dev); | ||
| 673 | err_alloc_dev: | ||
| 674 | /* nothing */ | ||
| 675 | return err; | ||
| 676 | } | 316 | } |
| 677 | 317 | ||
| 678 | static void __net_exit vti_exit_net(struct net *net) | 318 | static void __net_exit vti_exit_net(struct net *net) |
| 679 | { | 319 | { |
| 680 | struct vti_net *ipn = net_generic(net, vti_net_id); | 320 | struct ip_tunnel_net *itn = net_generic(net, vti_net_id); |
| 681 | LIST_HEAD(list); | 321 | ip_tunnel_delete_net(itn, &vti_link_ops); |
| 682 | |||
| 683 | rtnl_lock(); | ||
| 684 | vti_destroy_tunnels(ipn, &list); | ||
| 685 | unregister_netdevice_many(&list); | ||
| 686 | rtnl_unlock(); | ||
| 687 | } | 322 | } |
| 688 | 323 | ||
| 689 | static struct pernet_operations vti_net_ops = { | 324 | static struct pernet_operations vti_net_ops = { |
| 690 | .init = vti_init_net, | 325 | .init = vti_init_net, |
| 691 | .exit = vti_exit_net, | 326 | .exit = vti_exit_net, |
| 692 | .id = &vti_net_id, | 327 | .id = &vti_net_id, |
| 693 | .size = sizeof(struct vti_net), | 328 | .size = sizeof(struct ip_tunnel_net), |
| 694 | }; | 329 | }; |
| 695 | 330 | ||
| 696 | static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) | 331 | static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) |
| @@ -728,78 +363,19 @@ static void vti_netlink_parms(struct nlattr *data[], | |||
| 728 | static int vti_newlink(struct net *src_net, struct net_device *dev, | 363 | static int vti_newlink(struct net *src_net, struct net_device *dev, |
| 729 | struct nlattr *tb[], struct nlattr *data[]) | 364 | struct nlattr *tb[], struct nlattr *data[]) |
| 730 | { | 365 | { |
| 731 | struct ip_tunnel *nt; | 366 | struct ip_tunnel_parm parms; |
| 732 | struct net *net = dev_net(dev); | ||
| 733 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 734 | int mtu; | ||
| 735 | int err; | ||
| 736 | |||
| 737 | nt = netdev_priv(dev); | ||
| 738 | vti_netlink_parms(data, &nt->parms); | ||
| 739 | |||
| 740 | if (vti_tunnel_locate(net, &nt->parms, 0)) | ||
| 741 | return -EEXIST; | ||
| 742 | 367 | ||
| 743 | mtu = vti_tunnel_bind_dev(dev); | 368 | vti_netlink_parms(data, &parms); |
| 744 | if (!tb[IFLA_MTU]) | 369 | return ip_tunnel_newlink(dev, tb, &parms); |
| 745 | dev->mtu = mtu; | ||
| 746 | |||
| 747 | err = register_netdevice(dev); | ||
| 748 | if (err) | ||
| 749 | goto out; | ||
| 750 | |||
| 751 | dev_hold(dev); | ||
| 752 | vti_tunnel_link(ipn, nt); | ||
| 753 | |||
| 754 | out: | ||
| 755 | return err; | ||
| 756 | } | 370 | } |
| 757 | 371 | ||
| 758 | static int vti_changelink(struct net_device *dev, struct nlattr *tb[], | 372 | static int vti_changelink(struct net_device *dev, struct nlattr *tb[], |
| 759 | struct nlattr *data[]) | 373 | struct nlattr *data[]) |
| 760 | { | 374 | { |
| 761 | struct ip_tunnel *t, *nt; | ||
| 762 | struct net *net = dev_net(dev); | ||
| 763 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 764 | struct ip_tunnel_parm p; | 375 | struct ip_tunnel_parm p; |
| 765 | int mtu; | ||
| 766 | |||
| 767 | if (dev == ipn->fb_tunnel_dev) | ||
| 768 | return -EINVAL; | ||
| 769 | 376 | ||
| 770 | nt = netdev_priv(dev); | ||
| 771 | vti_netlink_parms(data, &p); | 377 | vti_netlink_parms(data, &p); |
| 772 | 378 | return ip_tunnel_changelink(dev, tb, &p); | |
| 773 | t = vti_tunnel_locate(net, &p, 0); | ||
| 774 | |||
| 775 | if (t) { | ||
| 776 | if (t->dev != dev) | ||
| 777 | return -EEXIST; | ||
| 778 | } else { | ||
| 779 | t = nt; | ||
| 780 | |||
| 781 | vti_tunnel_unlink(ipn, t); | ||
| 782 | t->parms.iph.saddr = p.iph.saddr; | ||
| 783 | t->parms.iph.daddr = p.iph.daddr; | ||
| 784 | t->parms.i_key = p.i_key; | ||
| 785 | t->parms.o_key = p.o_key; | ||
| 786 | if (dev->type != ARPHRD_ETHER) { | ||
| 787 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
| 788 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
| 789 | } | ||
| 790 | vti_tunnel_link(ipn, t); | ||
| 791 | netdev_state_change(dev); | ||
| 792 | } | ||
| 793 | |||
| 794 | if (t->parms.link != p.link) { | ||
| 795 | t->parms.link = p.link; | ||
| 796 | mtu = vti_tunnel_bind_dev(dev); | ||
| 797 | if (!tb[IFLA_MTU]) | ||
| 798 | dev->mtu = mtu; | ||
| 799 | netdev_state_change(dev); | ||
| 800 | } | ||
| 801 | |||
| 802 | return 0; | ||
| 803 | } | 379 | } |
| 804 | 380 | ||
| 805 | static size_t vti_get_size(const struct net_device *dev) | 381 | static size_t vti_get_size(const struct net_device *dev) |
| @@ -865,7 +441,7 @@ static int __init vti_init(void) | |||
| 865 | err = xfrm4_mode_tunnel_input_register(&vti_handler); | 441 | err = xfrm4_mode_tunnel_input_register(&vti_handler); |
| 866 | if (err < 0) { | 442 | if (err < 0) { |
| 867 | unregister_pernet_device(&vti_net_ops); | 443 | unregister_pernet_device(&vti_net_ops); |
| 868 | pr_info(KERN_INFO "vti init: can't register tunnel\n"); | 444 | pr_info("vti init: can't register tunnel\n"); |
| 869 | } | 445 | } |
| 870 | 446 | ||
| 871 | err = rtnl_link_register(&vti_link_ops); | 447 | err = rtnl_link_register(&vti_link_ops); |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index b3ac3c3f6219..7f80fb4b82d3 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
| @@ -285,7 +285,6 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
| 285 | dev->flags = IFF_NOARP; | 285 | dev->flags = IFF_NOARP; |
| 286 | dev->iflink = 0; | 286 | dev->iflink = 0; |
| 287 | dev->addr_len = 4; | 287 | dev->addr_len = 4; |
| 288 | dev->features |= NETIF_F_NETNS_LOCAL; | ||
| 289 | dev->features |= NETIF_F_LLTX; | 288 | dev->features |= NETIF_F_LLTX; |
| 290 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 289 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
| 291 | 290 | ||
| @@ -436,7 +435,7 @@ static int __net_init ipip_init_net(struct net *net) | |||
| 436 | static void __net_exit ipip_exit_net(struct net *net) | 435 | static void __net_exit ipip_exit_net(struct net *net) |
| 437 | { | 436 | { |
| 438 | struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); | 437 | struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); |
| 439 | ip_tunnel_delete_net(itn); | 438 | ip_tunnel_delete_net(itn, &ipip_link_ops); |
| 440 | } | 439 | } |
| 441 | 440 | ||
| 442 | static struct pernet_operations ipip_net_ops = { | 441 | static struct pernet_operations ipip_net_ops = { |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 132a09664704..9ae54b09254f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
| @@ -127,9 +127,9 @@ static struct kmem_cache *mrt_cachep __read_mostly; | |||
| 127 | static struct mr_table *ipmr_new_table(struct net *net, u32 id); | 127 | static struct mr_table *ipmr_new_table(struct net *net, u32 id); |
| 128 | static void ipmr_free_table(struct mr_table *mrt); | 128 | static void ipmr_free_table(struct mr_table *mrt); |
| 129 | 129 | ||
| 130 | static int ip_mr_forward(struct net *net, struct mr_table *mrt, | 130 | static void ip_mr_forward(struct net *net, struct mr_table *mrt, |
| 131 | struct sk_buff *skb, struct mfc_cache *cache, | 131 | struct sk_buff *skb, struct mfc_cache *cache, |
| 132 | int local); | 132 | int local); |
| 133 | static int ipmr_cache_report(struct mr_table *mrt, | 133 | static int ipmr_cache_report(struct mr_table *mrt, |
| 134 | struct sk_buff *pkt, vifi_t vifi, int assert); | 134 | struct sk_buff *pkt, vifi_t vifi, int assert); |
| 135 | static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | 135 | static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, |
| @@ -1795,9 +1795,9 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) | |||
| 1795 | 1795 | ||
| 1796 | /* "local" means that we should preserve one skb (for local delivery) */ | 1796 | /* "local" means that we should preserve one skb (for local delivery) */ |
| 1797 | 1797 | ||
| 1798 | static int ip_mr_forward(struct net *net, struct mr_table *mrt, | 1798 | static void ip_mr_forward(struct net *net, struct mr_table *mrt, |
| 1799 | struct sk_buff *skb, struct mfc_cache *cache, | 1799 | struct sk_buff *skb, struct mfc_cache *cache, |
| 1800 | int local) | 1800 | int local) |
| 1801 | { | 1801 | { |
| 1802 | int psend = -1; | 1802 | int psend = -1; |
| 1803 | int vif, ct; | 1803 | int vif, ct; |
| @@ -1903,14 +1903,13 @@ last_forward: | |||
| 1903 | ipmr_queue_xmit(net, mrt, skb2, cache, psend); | 1903 | ipmr_queue_xmit(net, mrt, skb2, cache, psend); |
| 1904 | } else { | 1904 | } else { |
| 1905 | ipmr_queue_xmit(net, mrt, skb, cache, psend); | 1905 | ipmr_queue_xmit(net, mrt, skb, cache, psend); |
| 1906 | return 0; | 1906 | return; |
| 1907 | } | 1907 | } |
| 1908 | } | 1908 | } |
| 1909 | 1909 | ||
| 1910 | dont_forward: | 1910 | dont_forward: |
| 1911 | if (!local) | 1911 | if (!local) |
| 1912 | kfree_skb(skb); | 1912 | kfree_skb(skb); |
| 1913 | return 0; | ||
| 1914 | } | 1913 | } |
| 1915 | 1914 | ||
| 1916 | static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) | 1915 | static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) |
| @@ -2068,9 +2067,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, | |||
| 2068 | skb_reset_network_header(skb); | 2067 | skb_reset_network_header(skb); |
| 2069 | skb->protocol = htons(ETH_P_IP); | 2068 | skb->protocol = htons(ETH_P_IP); |
| 2070 | skb->ip_summed = CHECKSUM_NONE; | 2069 | skb->ip_summed = CHECKSUM_NONE; |
| 2071 | skb->pkt_type = PACKET_HOST; | ||
| 2072 | 2070 | ||
| 2073 | skb_tunnel_rx(skb, reg_dev); | 2071 | skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); |
| 2074 | 2072 | ||
| 2075 | netif_rx(skb); | 2073 | netif_rx(skb); |
| 2076 | 2074 | ||
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4e9028017428..1657e39b291f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
| @@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT | |||
| 110 | 110 | ||
| 111 | To compile it as a module, choose M here. If unsure, say N. | 111 | To compile it as a module, choose M here. If unsure, say N. |
| 112 | 112 | ||
| 113 | config IP_NF_TARGET_SYNPROXY | ||
| 114 | tristate "SYNPROXY target support" | ||
| 115 | depends on NF_CONNTRACK && NETFILTER_ADVANCED | ||
| 116 | select NETFILTER_SYNPROXY | ||
| 117 | select SYN_COOKIES | ||
| 118 | help | ||
| 119 | The SYNPROXY target allows you to intercept TCP connections and | ||
| 120 | establish them using syncookies before they are passed on to the | ||
| 121 | server. This makes it possible to avoid conntrack and server resource usage | ||
| 122 | during SYN-flood attacks. | ||
| 123 | |||
| 124 | To compile it as a module, choose M here. If unsure, say N. | ||
| 125 | |||
| 113 | config IP_NF_TARGET_ULOG | 126 | config IP_NF_TARGET_ULOG |
| 114 | tristate "ULOG target support (obsolete)" | 127 | tristate "ULOG target support (obsolete)" |
| 115 | default m if NETFILTER_ADVANCED=n | 128 | default m if NETFILTER_ADVANCED=n |
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 007b128eecc9..3622b248b6dd 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
| @@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o | |||
| 46 | obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o | 46 | obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o |
| 47 | obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o | 47 | obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o |
| 48 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o | 48 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o |
| 49 | obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o | ||
| 49 | obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o | 50 | obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o |
| 50 | 51 | ||
| 51 | # generic ARP tables | 52 | # generic ARP tables |
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 30e4de940567..00352ce0f0de 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
| @@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this, | |||
| 118 | NF_CT_ASSERT(dev->ifindex != 0); | 118 | NF_CT_ASSERT(dev->ifindex != 0); |
| 119 | 119 | ||
| 120 | nf_ct_iterate_cleanup(net, device_cmp, | 120 | nf_ct_iterate_cleanup(net, device_cmp, |
| 121 | (void *)(long)dev->ifindex); | 121 | (void *)(long)dev->ifindex, 0, 0); |
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | return NOTIFY_DONE; | 124 | return NOTIFY_DONE; |
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 04b18c1ac345..b969131ad1c1 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
| @@ -119,7 +119,26 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
| 119 | 119 | ||
| 120 | nf_ct_attach(nskb, oldskb); | 120 | nf_ct_attach(nskb, oldskb); |
| 121 | 121 | ||
| 122 | ip_local_out(nskb); | 122 | #ifdef CONFIG_BRIDGE_NETFILTER |
| 123 | /* If we use ip_local_out for bridged traffic, the MAC source on | ||
| 124 | * the RST will be ours, instead of the destination's. This confuses | ||
| 125 | * some routers/firewalls, and they drop the packet. So we need to | ||
| 126 | * build the eth header using the original destination's MAC as the | ||
| 127 | * source, and send the RST packet directly. | ||
| 128 | */ | ||
| 129 | if (oldskb->nf_bridge) { | ||
| 130 | struct ethhdr *oeth = eth_hdr(oldskb); | ||
| 131 | nskb->dev = oldskb->nf_bridge->physindev; | ||
| 132 | niph->tot_len = htons(nskb->len); | ||
| 133 | ip_send_check(niph); | ||
| 134 | if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), | ||
| 135 | oeth->h_source, oeth->h_dest, nskb->len) < 0) | ||
| 136 | goto free_nskb; | ||
| 137 | dev_queue_xmit(nskb); | ||
| 138 | } else | ||
| 139 | #endif | ||
| 140 | ip_local_out(nskb); | ||
| 141 | |||
| 123 | return; | 142 | return; |
| 124 | 143 | ||
| 125 | free_nskb: | 144 | free_nskb: |
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c new file mode 100644 index 000000000000..67e17dcda65e --- /dev/null +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c | |||
| @@ -0,0 +1,476 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License version 2 as | ||
| 6 | * published by the Free Software Foundation. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/module.h> | ||
| 10 | #include <linux/skbuff.h> | ||
| 11 | #include <net/tcp.h> | ||
| 12 | |||
| 13 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 14 | #include <linux/netfilter/x_tables.h> | ||
| 15 | #include <linux/netfilter/xt_SYNPROXY.h> | ||
| 16 | #include <net/netfilter/nf_conntrack.h> | ||
| 17 | #include <net/netfilter/nf_conntrack_seqadj.h> | ||
| 18 | #include <net/netfilter/nf_conntrack_synproxy.h> | ||
| 19 | |||
| 20 | static struct iphdr * | ||
| 21 | synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) | ||
| 22 | { | ||
| 23 | struct iphdr *iph; | ||
| 24 | |||
| 25 | skb_reset_network_header(skb); | ||
| 26 | iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); | ||
| 27 | iph->version = 4; | ||
| 28 | iph->ihl = sizeof(*iph) / 4; | ||
| 29 | iph->tos = 0; | ||
| 30 | iph->id = 0; | ||
| 31 | iph->frag_off = htons(IP_DF); | ||
| 32 | iph->ttl = sysctl_ip_default_ttl; | ||
| 33 | iph->protocol = IPPROTO_TCP; | ||
| 34 | iph->check = 0; | ||
| 35 | iph->saddr = saddr; | ||
| 36 | iph->daddr = daddr; | ||
| 37 | |||
| 38 | return iph; | ||
| 39 | } | ||
| 40 | |||
| 41 | static void | ||
| 42 | synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, | ||
| 43 | struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, | ||
| 44 | struct iphdr *niph, struct tcphdr *nth, | ||
| 45 | unsigned int tcp_hdr_size) | ||
| 46 | { | ||
| 47 | nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); | ||
| 48 | nskb->ip_summed = CHECKSUM_PARTIAL; | ||
| 49 | nskb->csum_start = (unsigned char *)nth - nskb->head; | ||
| 50 | nskb->csum_offset = offsetof(struct tcphdr, check); | ||
| 51 | |||
| 52 | skb_dst_set_noref(nskb, skb_dst(skb)); | ||
| 53 | nskb->protocol = htons(ETH_P_IP); | ||
| 54 | if (ip_route_me_harder(nskb, RTN_UNSPEC)) | ||
| 55 | goto free_nskb; | ||
| 56 | |||
| 57 | if (nfct) { | ||
| 58 | nskb->nfct = nfct; | ||
| 59 | nskb->nfctinfo = ctinfo; | ||
| 60 | nf_conntrack_get(nfct); | ||
| 61 | } | ||
| 62 | |||
| 63 | ip_local_out(nskb); | ||
| 64 | return; | ||
| 65 | |||
| 66 | free_nskb: | ||
| 67 | kfree_skb(nskb); | ||
| 68 | } | ||
| 69 | |||
| 70 | static void | ||
| 71 | synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, | ||
| 72 | const struct synproxy_options *opts) | ||
| 73 | { | ||
| 74 | struct sk_buff *nskb; | ||
| 75 | struct iphdr *iph, *niph; | ||
| 76 | struct tcphdr *nth; | ||
| 77 | unsigned int tcp_hdr_size; | ||
| 78 | u16 mss = opts->mss; | ||
| 79 | |||
| 80 | iph = ip_hdr(skb); | ||
| 81 | |||
| 82 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
| 83 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
| 84 | GFP_ATOMIC); | ||
| 85 | if (nskb == NULL) | ||
| 86 | return; | ||
| 87 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
| 88 | |||
| 89 | niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); | ||
| 90 | |||
| 91 | skb_reset_transport_header(nskb); | ||
| 92 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
| 93 | nth->source = th->dest; | ||
| 94 | nth->dest = th->source; | ||
| 95 | nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); | ||
| 96 | nth->ack_seq = htonl(ntohl(th->seq) + 1); | ||
| 97 | tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; | ||
| 98 | if (opts->options & XT_SYNPROXY_OPT_ECN) | ||
| 99 | tcp_flag_word(nth) |= TCP_FLAG_ECE; | ||
| 100 | nth->doff = tcp_hdr_size / 4; | ||
| 101 | nth->window = 0; | ||
| 102 | nth->check = 0; | ||
| 103 | nth->urg_ptr = 0; | ||
| 104 | |||
| 105 | synproxy_build_options(nth, opts); | ||
| 106 | |||
| 107 | synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, | ||
| 108 | niph, nth, tcp_hdr_size); | ||
| 109 | } | ||
| 110 | |||
| 111 | static void | ||
| 112 | synproxy_send_server_syn(const struct synproxy_net *snet, | ||
| 113 | const struct sk_buff *skb, const struct tcphdr *th, | ||
| 114 | const struct synproxy_options *opts, u32 recv_seq) | ||
| 115 | { | ||
| 116 | struct sk_buff *nskb; | ||
| 117 | struct iphdr *iph, *niph; | ||
| 118 | struct tcphdr *nth; | ||
| 119 | unsigned int tcp_hdr_size; | ||
| 120 | |||
| 121 | iph = ip_hdr(skb); | ||
| 122 | |||
| 123 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
| 124 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
| 125 | GFP_ATOMIC); | ||
| 126 | if (nskb == NULL) | ||
| 127 | return; | ||
| 128 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
| 129 | |||
| 130 | niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); | ||
| 131 | |||
| 132 | skb_reset_transport_header(nskb); | ||
| 133 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
| 134 | nth->source = th->source; | ||
| 135 | nth->dest = th->dest; | ||
| 136 | nth->seq = htonl(recv_seq - 1); | ||
| 137 | /* ack_seq is used to relay our ISN to the synproxy hook to initialize | ||
| 138 | * sequence number translation once a connection tracking entry exists. | ||
| 139 | */ | ||
| 140 | nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); | ||
| 141 | tcp_flag_word(nth) = TCP_FLAG_SYN; | ||
| 142 | if (opts->options & XT_SYNPROXY_OPT_ECN) | ||
| 143 | tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; | ||
| 144 | nth->doff = tcp_hdr_size / 4; | ||
| 145 | nth->window = th->window; | ||
| 146 | nth->check = 0; | ||
| 147 | nth->urg_ptr = 0; | ||
| 148 | |||
| 149 | synproxy_build_options(nth, opts); | ||
| 150 | |||
| 151 | synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, | ||
| 152 | niph, nth, tcp_hdr_size); | ||
| 153 | } | ||
| 154 | |||
| 155 | static void | ||
| 156 | synproxy_send_server_ack(const struct synproxy_net *snet, | ||
| 157 | const struct ip_ct_tcp *state, | ||
| 158 | const struct sk_buff *skb, const struct tcphdr *th, | ||
| 159 | const struct synproxy_options *opts) | ||
| 160 | { | ||
| 161 | struct sk_buff *nskb; | ||
| 162 | struct iphdr *iph, *niph; | ||
| 163 | struct tcphdr *nth; | ||
| 164 | unsigned int tcp_hdr_size; | ||
| 165 | |||
| 166 | iph = ip_hdr(skb); | ||
| 167 | |||
| 168 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
| 169 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
| 170 | GFP_ATOMIC); | ||
| 171 | if (nskb == NULL) | ||
| 172 | return; | ||
| 173 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
| 174 | |||
| 175 | niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); | ||
| 176 | |||
| 177 | skb_reset_transport_header(nskb); | ||
| 178 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
| 179 | nth->source = th->dest; | ||
| 180 | nth->dest = th->source; | ||
| 181 | nth->seq = htonl(ntohl(th->ack_seq)); | ||
| 182 | nth->ack_seq = htonl(ntohl(th->seq) + 1); | ||
| 183 | tcp_flag_word(nth) = TCP_FLAG_ACK; | ||
| 184 | nth->doff = tcp_hdr_size / 4; | ||
| 185 | nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); | ||
| 186 | nth->check = 0; | ||
| 187 | nth->urg_ptr = 0; | ||
| 188 | |||
| 189 | synproxy_build_options(nth, opts); | ||
| 190 | |||
| 191 | synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); | ||
| 192 | } | ||
| 193 | |||
| 194 | static void | ||
| 195 | synproxy_send_client_ack(const struct synproxy_net *snet, | ||
| 196 | const struct sk_buff *skb, const struct tcphdr *th, | ||
| 197 | const struct synproxy_options *opts) | ||
| 198 | { | ||
| 199 | struct sk_buff *nskb; | ||
| 200 | struct iphdr *iph, *niph; | ||
| 201 | struct tcphdr *nth; | ||
| 202 | unsigned int tcp_hdr_size; | ||
| 203 | |||
| 204 | iph = ip_hdr(skb); | ||
| 205 | |||
| 206 | tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); | ||
| 207 | nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, | ||
| 208 | GFP_ATOMIC); | ||
| 209 | if (nskb == NULL) | ||
| 210 | return; | ||
| 211 | skb_reserve(nskb, MAX_TCP_HEADER); | ||
| 212 | |||
| 213 | niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); | ||
| 214 | |||
| 215 | skb_reset_transport_header(nskb); | ||
| 216 | nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); | ||
| 217 | nth->source = th->source; | ||
| 218 | nth->dest = th->dest; | ||
| 219 | nth->seq = htonl(ntohl(th->seq) + 1); | ||
| 220 | nth->ack_seq = th->ack_seq; | ||
| 221 | tcp_flag_word(nth) = TCP_FLAG_ACK; | ||
| 222 | nth->doff = tcp_hdr_size / 4; | ||
| 223 | nth->window = ntohs(htons(th->window) >> opts->wscale); | ||
| 224 | nth->check = 0; | ||
| 225 | nth->urg_ptr = 0; | ||
| 226 | |||
| 227 | synproxy_build_options(nth, opts); | ||
| 228 | |||
| 229 | synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); | ||
| 230 | } | ||
| 231 | |||
| 232 | static bool | ||
| 233 | synproxy_recv_client_ack(const struct synproxy_net *snet, | ||
| 234 | const struct sk_buff *skb, const struct tcphdr *th, | ||
| 235 | struct synproxy_options *opts, u32 recv_seq) | ||
| 236 | { | ||
| 237 | int mss; | ||
| 238 | |||
| 239 | mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); | ||
| 240 | if (mss == 0) { | ||
| 241 | this_cpu_inc(snet->stats->cookie_invalid); | ||
| 242 | return false; | ||
| 243 | } | ||
| 244 | |||
| 245 | this_cpu_inc(snet->stats->cookie_valid); | ||
| 246 | opts->mss = mss; | ||
| 247 | |||
| 248 | if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
| 249 | synproxy_check_timestamp_cookie(opts); | ||
| 250 | |||
| 251 | synproxy_send_server_syn(snet, skb, th, opts, recv_seq); | ||
| 252 | return true; | ||
| 253 | } | ||
| 254 | |||
| 255 | static unsigned int | ||
| 256 | synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) | ||
| 257 | { | ||
| 258 | const struct xt_synproxy_info *info = par->targinfo; | ||
| 259 | struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); | ||
| 260 | struct synproxy_options opts = {}; | ||
| 261 | struct tcphdr *th, _th; | ||
| 262 | |||
| 263 | if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) | ||
| 264 | return NF_DROP; | ||
| 265 | |||
| 266 | th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); | ||
| 267 | if (th == NULL) | ||
| 268 | return NF_DROP; | ||
| 269 | |||
| 270 | synproxy_parse_options(skb, par->thoff, th, &opts); | ||
| 271 | |||
| 272 | if (th->syn && !(th->ack || th->fin || th->rst)) { | ||
| 273 | /* Initial SYN from client */ | ||
| 274 | this_cpu_inc(snet->stats->syn_received); | ||
| 275 | |||
| 276 | if (th->ece && th->cwr) | ||
| 277 | opts.options |= XT_SYNPROXY_OPT_ECN; | ||
| 278 | |||
| 279 | opts.options &= info->options; | ||
| 280 | if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
| 281 | synproxy_init_timestamp_cookie(info, &opts); | ||
| 282 | else | ||
| 283 | opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | | ||
| 284 | XT_SYNPROXY_OPT_SACK_PERM | | ||
| 285 | XT_SYNPROXY_OPT_ECN); | ||
| 286 | |||
| 287 | synproxy_send_client_synack(skb, th, &opts); | ||
| 288 | return NF_DROP; | ||
| 289 | |||
| 290 | } else if (th->ack && !(th->fin || th->rst || th->syn)) { | ||
| 291 | /* ACK from client */ | ||
| 292 | synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); | ||
| 293 | return NF_DROP; | ||
| 294 | } | ||
| 295 | |||
| 296 | return XT_CONTINUE; | ||
| 297 | } | ||
| 298 | |||
| 299 | static unsigned int ipv4_synproxy_hook(unsigned int hooknum, | ||
| 300 | struct sk_buff *skb, | ||
| 301 | const struct net_device *in, | ||
| 302 | const struct net_device *out, | ||
| 303 | int (*okfn)(struct sk_buff *)) | ||
| 304 | { | ||
| 305 | struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); | ||
| 306 | enum ip_conntrack_info ctinfo; | ||
| 307 | struct nf_conn *ct; | ||
| 308 | struct nf_conn_synproxy *synproxy; | ||
| 309 | struct synproxy_options opts = {}; | ||
| 310 | const struct ip_ct_tcp *state; | ||
| 311 | struct tcphdr *th, _th; | ||
| 312 | unsigned int thoff; | ||
| 313 | |||
| 314 | ct = nf_ct_get(skb, &ctinfo); | ||
| 315 | if (ct == NULL) | ||
| 316 | return NF_ACCEPT; | ||
| 317 | |||
| 318 | synproxy = nfct_synproxy(ct); | ||
| 319 | if (synproxy == NULL) | ||
| 320 | return NF_ACCEPT; | ||
| 321 | |||
| 322 | if (nf_is_loopback_packet(skb)) | ||
| 323 | return NF_ACCEPT; | ||
| 324 | |||
| 325 | thoff = ip_hdrlen(skb); | ||
| 326 | th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); | ||
| 327 | if (th == NULL) | ||
| 328 | return NF_DROP; | ||
| 329 | |||
| 330 | state = &ct->proto.tcp; | ||
| 331 | switch (state->state) { | ||
| 332 | case TCP_CONNTRACK_CLOSE: | ||
| 333 | if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { | ||
| 334 | nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - | ||
| 335 | ntohl(th->seq) + 1); | ||
| 336 | break; | ||
| 337 | } | ||
| 338 | |||
| 339 | if (!th->syn || th->ack || | ||
| 340 | CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) | ||
| 341 | break; | ||
| 342 | |||
| 343 | /* Reopened connection - reset the sequence number and timestamp | ||
| 344 | * adjustments, they will get initialized once the connection is | ||
| 345 | * reestablished. | ||
| 346 | */ | ||
| 347 | nf_ct_seqadj_init(ct, ctinfo, 0); | ||
| 348 | synproxy->tsoff = 0; | ||
| 349 | this_cpu_inc(snet->stats->conn_reopened); | ||
| 350 | |||
| 351 | /* fall through */ | ||
| 352 | case TCP_CONNTRACK_SYN_SENT: | ||
| 353 | synproxy_parse_options(skb, thoff, th, &opts); | ||
| 354 | |||
| 355 | if (!th->syn && th->ack && | ||
| 356 | CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { | ||
| 357 | /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, | ||
| 358 | * therefore we need to add 1 to make the SYN sequence | ||
| 359 | * number match the one of first SYN. | ||
| 360 | */ | ||
| 361 | if (synproxy_recv_client_ack(snet, skb, th, &opts, | ||
| 362 | ntohl(th->seq) + 1)) | ||
| 363 | this_cpu_inc(snet->stats->cookie_retrans); | ||
| 364 | |||
| 365 | return NF_DROP; | ||
| 366 | } | ||
| 367 | |||
| 368 | synproxy->isn = ntohl(th->ack_seq); | ||
| 369 | if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
| 370 | synproxy->its = opts.tsecr; | ||
| 371 | break; | ||
| 372 | case TCP_CONNTRACK_SYN_RECV: | ||
| 373 | if (!th->syn || !th->ack) | ||
| 374 | break; | ||
| 375 | |||
| 376 | synproxy_parse_options(skb, thoff, th, &opts); | ||
| 377 | if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) | ||
| 378 | synproxy->tsoff = opts.tsval - synproxy->its; | ||
| 379 | |||
| 380 | opts.options &= ~(XT_SYNPROXY_OPT_MSS | | ||
| 381 | XT_SYNPROXY_OPT_WSCALE | | ||
| 382 | XT_SYNPROXY_OPT_SACK_PERM); | ||
| 383 | |||
| 384 | swap(opts.tsval, opts.tsecr); | ||
| 385 | synproxy_send_server_ack(snet, state, skb, th, &opts); | ||
| 386 | |||
| 387 | nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); | ||
| 388 | |||
| 389 | swap(opts.tsval, opts.tsecr); | ||
| 390 | synproxy_send_client_ack(snet, skb, th, &opts); | ||
| 391 | |||
| 392 | consume_skb(skb); | ||
| 393 | return NF_STOLEN; | ||
| 394 | default: | ||
| 395 | break; | ||
| 396 | } | ||
| 397 | |||
| 398 | synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); | ||
| 399 | return NF_ACCEPT; | ||
| 400 | } | ||
| 401 | |||
| 402 | static int synproxy_tg4_check(const struct xt_tgchk_param *par) | ||
| 403 | { | ||
| 404 | const struct ipt_entry *e = par->entryinfo; | ||
| 405 | |||
| 406 | if (e->ip.proto != IPPROTO_TCP || | ||
| 407 | e->ip.invflags & XT_INV_PROTO) | ||
| 408 | return -EINVAL; | ||
| 409 | |||
| 410 | return nf_ct_l3proto_try_module_get(par->family); | ||
| 411 | } | ||
| 412 | |||
| 413 | static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) | ||
| 414 | { | ||
| 415 | nf_ct_l3proto_module_put(par->family); | ||
| 416 | } | ||
| 417 | |||
| 418 | static struct xt_target synproxy_tg4_reg __read_mostly = { | ||
| 419 | .name = "SYNPROXY", | ||
| 420 | .family = NFPROTO_IPV4, | ||
| 421 | .target = synproxy_tg4, | ||
| 422 | .targetsize = sizeof(struct xt_synproxy_info), | ||
| 423 | .checkentry = synproxy_tg4_check, | ||
| 424 | .destroy = synproxy_tg4_destroy, | ||
| 425 | .me = THIS_MODULE, | ||
| 426 | }; | ||
| 427 | |||
| 428 | static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { | ||
| 429 | { | ||
| 430 | .hook = ipv4_synproxy_hook, | ||
| 431 | .owner = THIS_MODULE, | ||
| 432 | .pf = NFPROTO_IPV4, | ||
| 433 | .hooknum = NF_INET_LOCAL_IN, | ||
| 434 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, | ||
| 435 | }, | ||
| 436 | { | ||
| 437 | .hook = ipv4_synproxy_hook, | ||
| 438 | .owner = THIS_MODULE, | ||
| 439 | .pf = NFPROTO_IPV4, | ||
| 440 | .hooknum = NF_INET_POST_ROUTING, | ||
| 441 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, | ||
| 442 | }, | ||
| 443 | }; | ||
| 444 | |||
| 445 | static int __init synproxy_tg4_init(void) | ||
| 446 | { | ||
| 447 | int err; | ||
| 448 | |||
| 449 | err = nf_register_hooks(ipv4_synproxy_ops, | ||
| 450 | ARRAY_SIZE(ipv4_synproxy_ops)); | ||
| 451 | if (err < 0) | ||
| 452 | goto err1; | ||
| 453 | |||
| 454 | err = xt_register_target(&synproxy_tg4_reg); | ||
| 455 | if (err < 0) | ||
| 456 | goto err2; | ||
| 457 | |||
| 458 | return 0; | ||
| 459 | |||
| 460 | err2: | ||
| 461 | nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); | ||
| 462 | err1: | ||
| 463 | return err; | ||
| 464 | } | ||
| 465 | |||
| 466 | static void __exit synproxy_tg4_exit(void) | ||
| 467 | { | ||
| 468 | xt_unregister_target(&synproxy_tg4_reg); | ||
| 469 | nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); | ||
| 470 | } | ||
| 471 | |||
| 472 | module_init(synproxy_tg4_init); | ||
| 473 | module_exit(synproxy_tg4_exit); | ||
| 474 | |||
| 475 | MODULE_LICENSE("GPL"); | ||
| 476 | MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); | ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 0a2e0e3e95ba..86f5b34a4ed1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <net/netfilter/nf_conntrack_l3proto.h> | 25 | #include <net/netfilter/nf_conntrack_l3proto.h> |
| 26 | #include <net/netfilter/nf_conntrack_zones.h> | 26 | #include <net/netfilter/nf_conntrack_zones.h> |
| 27 | #include <net/netfilter/nf_conntrack_core.h> | 27 | #include <net/netfilter/nf_conntrack_core.h> |
| 28 | #include <net/netfilter/nf_conntrack_seqadj.h> | ||
| 28 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> | 29 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> |
| 29 | #include <net/netfilter/nf_nat_helper.h> | 30 | #include <net/netfilter/nf_nat_helper.h> |
| 30 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> | 31 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> |
| @@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, | |||
| 136 | /* adjust seqs for loopback traffic only in outgoing direction */ | 137 | /* adjust seqs for loopback traffic only in outgoing direction */ |
| 137 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && | 138 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && |
| 138 | !nf_is_loopback_packet(skb)) { | 139 | !nf_is_loopback_packet(skb)) { |
| 139 | typeof(nf_nat_seq_adjust_hook) seq_adjust; | 140 | if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { |
| 140 | |||
| 141 | seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); | ||
| 142 | if (!seq_adjust || | ||
| 143 | !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { | ||
| 144 | NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); | 141 | NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); |
| 145 | return NF_DROP; | 142 | return NF_DROP; |
| 146 | } | 143 | } |
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 746427c9e719..d7d9882d4cae 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c | |||
| @@ -1082,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, | |||
| 1082 | __u16 srcp = ntohs(inet->inet_sport); | 1082 | __u16 srcp = ntohs(inet->inet_sport); |
| 1083 | 1083 | ||
| 1084 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" | 1084 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" |
| 1085 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", | 1085 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", |
| 1086 | bucket, src, srcp, dest, destp, sp->sk_state, | 1086 | bucket, src, srcp, dest, destp, sp->sk_state, |
| 1087 | sk_wmem_alloc_get(sp), | 1087 | sk_wmem_alloc_get(sp), |
| 1088 | sk_rmem_alloc_get(sp), | 1088 | sk_rmem_alloc_get(sp), |
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 463bd1273346..4a0335854b89 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
| @@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { | |||
| 111 | SNMP_MIB_SENTINEL | 111 | SNMP_MIB_SENTINEL |
| 112 | }; | 112 | }; |
| 113 | 113 | ||
| 114 | /* Following RFC4293 items are displayed in /proc/net/netstat */ | 114 | /* Following items are displayed in /proc/net/netstat */ |
| 115 | static const struct snmp_mib snmp4_ipextstats_list[] = { | 115 | static const struct snmp_mib snmp4_ipextstats_list[] = { |
| 116 | SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), | 116 | SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), |
| 117 | SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), | 117 | SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), |
| @@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { | |||
| 125 | SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), | 125 | SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), |
| 126 | SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), | 126 | SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), |
| 127 | SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), | 127 | SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), |
| 128 | /* Non RFC4293 fields */ | ||
| 128 | SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), | 129 | SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), |
| 130 | SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), | ||
| 131 | SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), | ||
| 132 | SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), | ||
| 133 | SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), | ||
| 129 | SNMP_MIB_SENTINEL | 134 | SNMP_MIB_SENTINEL |
| 130 | }; | 135 | }; |
| 131 | 136 | ||
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 61e60d67adca..a86c7ae71881 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
| @@ -988,7 +988,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) | |||
| 988 | srcp = inet->inet_num; | 988 | srcp = inet->inet_num; |
| 989 | 989 | ||
| 990 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" | 990 | seq_printf(seq, "%4d: %08X:%04X %08X:%04X" |
| 991 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", | 991 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", |
| 992 | i, src, srcp, dest, destp, sp->sk_state, | 992 | i, src, srcp, dest, destp, sp->sk_state, |
| 993 | sk_wmem_alloc_get(sp), | 993 | sk_wmem_alloc_get(sp), |
| 994 | sk_rmem_alloc_get(sp), | 994 | sk_rmem_alloc_get(sp), |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a9a54a236832..727f4365bcdf 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
| @@ -112,7 +112,8 @@ | |||
| 112 | #define RT_FL_TOS(oldflp4) \ | 112 | #define RT_FL_TOS(oldflp4) \ |
| 113 | ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) | 113 | ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) |
| 114 | 114 | ||
| 115 | #define IP_MAX_MTU 0xFFF0 | 115 | /* IPv4 datagram length is stored into 16bit field (tot_len) */ |
| 116 | #define IP_MAX_MTU 0xFFFF | ||
| 116 | 117 | ||
| 117 | #define RT_GC_TIMEOUT (300*HZ) | 118 | #define RT_GC_TIMEOUT (300*HZ) |
| 118 | 119 | ||
| @@ -435,12 +436,12 @@ static inline int ip_rt_proc_init(void) | |||
| 435 | 436 | ||
| 436 | static inline bool rt_is_expired(const struct rtable *rth) | 437 | static inline bool rt_is_expired(const struct rtable *rth) |
| 437 | { | 438 | { |
| 438 | return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); | 439 | return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); |
| 439 | } | 440 | } |
| 440 | 441 | ||
| 441 | void rt_cache_flush(struct net *net) | 442 | void rt_cache_flush(struct net *net) |
| 442 | { | 443 | { |
| 443 | rt_genid_bump(net); | 444 | rt_genid_bump_ipv4(net); |
| 444 | } | 445 | } |
| 445 | 446 | ||
| 446 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, | 447 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, |
| @@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) | |||
| 1227 | mtu = 576; | 1228 | mtu = 576; |
| 1228 | } | 1229 | } |
| 1229 | 1230 | ||
| 1230 | if (mtu > IP_MAX_MTU) | 1231 | return min_t(unsigned int, mtu, IP_MAX_MTU); |
| 1231 | mtu = IP_MAX_MTU; | ||
| 1232 | |||
| 1233 | return mtu; | ||
| 1234 | } | 1232 | } |
| 1235 | 1233 | ||
| 1236 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) | 1234 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) |
| @@ -1458,7 +1456,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1458 | #endif | 1456 | #endif |
| 1459 | rth->dst.output = ip_rt_bug; | 1457 | rth->dst.output = ip_rt_bug; |
| 1460 | 1458 | ||
| 1461 | rth->rt_genid = rt_genid(dev_net(dev)); | 1459 | rth->rt_genid = rt_genid_ipv4(dev_net(dev)); |
| 1462 | rth->rt_flags = RTCF_MULTICAST; | 1460 | rth->rt_flags = RTCF_MULTICAST; |
| 1463 | rth->rt_type = RTN_MULTICAST; | 1461 | rth->rt_type = RTN_MULTICAST; |
| 1464 | rth->rt_is_input= 1; | 1462 | rth->rt_is_input= 1; |
| @@ -1589,7 +1587,7 @@ static int __mkroute_input(struct sk_buff *skb, | |||
| 1589 | goto cleanup; | 1587 | goto cleanup; |
| 1590 | } | 1588 | } |
| 1591 | 1589 | ||
| 1592 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); | 1590 | rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); |
| 1593 | rth->rt_flags = flags; | 1591 | rth->rt_flags = flags; |
| 1594 | rth->rt_type = res->type; | 1592 | rth->rt_type = res->type; |
| 1595 | rth->rt_is_input = 1; | 1593 | rth->rt_is_input = 1; |
| @@ -1760,7 +1758,7 @@ local_input: | |||
| 1760 | rth->dst.tclassid = itag; | 1758 | rth->dst.tclassid = itag; |
| 1761 | #endif | 1759 | #endif |
| 1762 | 1760 | ||
| 1763 | rth->rt_genid = rt_genid(net); | 1761 | rth->rt_genid = rt_genid_ipv4(net); |
| 1764 | rth->rt_flags = flags|RTCF_LOCAL; | 1762 | rth->rt_flags = flags|RTCF_LOCAL; |
| 1765 | rth->rt_type = res.type; | 1763 | rth->rt_type = res.type; |
| 1766 | rth->rt_is_input = 1; | 1764 | rth->rt_is_input = 1; |
| @@ -1945,7 +1943,7 @@ add: | |||
| 1945 | 1943 | ||
| 1946 | rth->dst.output = ip_output; | 1944 | rth->dst.output = ip_output; |
| 1947 | 1945 | ||
| 1948 | rth->rt_genid = rt_genid(dev_net(dev_out)); | 1946 | rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); |
| 1949 | rth->rt_flags = flags; | 1947 | rth->rt_flags = flags; |
| 1950 | rth->rt_type = type; | 1948 | rth->rt_type = type; |
| 1951 | rth->rt_is_input = 0; | 1949 | rth->rt_is_input = 0; |
| @@ -2227,7 +2225,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or | |||
| 2227 | rt->rt_iif = ort->rt_iif; | 2225 | rt->rt_iif = ort->rt_iif; |
| 2228 | rt->rt_pmtu = ort->rt_pmtu; | 2226 | rt->rt_pmtu = ort->rt_pmtu; |
| 2229 | 2227 | ||
| 2230 | rt->rt_genid = rt_genid(net); | 2228 | rt->rt_genid = rt_genid_ipv4(net); |
| 2231 | rt->rt_flags = ort->rt_flags; | 2229 | rt->rt_flags = ort->rt_flags; |
| 2232 | rt->rt_type = ort->rt_type; | 2230 | rt->rt_type = ort->rt_type; |
| 2233 | rt->rt_gateway = ort->rt_gateway; | 2231 | rt->rt_gateway = ort->rt_gateway; |
| @@ -2665,7 +2663,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { | |||
| 2665 | 2663 | ||
| 2666 | static __net_init int rt_genid_init(struct net *net) | 2664 | static __net_init int rt_genid_init(struct net *net) |
| 2667 | { | 2665 | { |
| 2668 | atomic_set(&net->rt_genid, 0); | 2666 | atomic_set(&net->ipv4.rt_genid, 0); |
| 2669 | atomic_set(&net->fnhe_genid, 0); | 2667 | atomic_set(&net->fnhe_genid, 0); |
| 2670 | get_random_bytes(&net->ipv4.dev_addr_genid, | 2668 | get_random_bytes(&net->ipv4.dev_addr_genid, |
| 2671 | sizeof(net->ipv4.dev_addr_genid)); | 2669 | sizeof(net->ipv4.dev_addr_genid)); |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b05c96e7af8b..14a15c49129d 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
| @@ -160,26 +160,33 @@ static __u16 const msstab[] = { | |||
| 160 | * Generate a syncookie. mssp points to the mss, which is returned | 160 | * Generate a syncookie. mssp points to the mss, which is returned |
| 161 | * rounded down to the value encoded in the cookie. | 161 | * rounded down to the value encoded in the cookie. |
| 162 | */ | 162 | */ |
| 163 | __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) | 163 | u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, |
| 164 | u16 *mssp) | ||
| 164 | { | 165 | { |
| 165 | const struct iphdr *iph = ip_hdr(skb); | ||
| 166 | const struct tcphdr *th = tcp_hdr(skb); | ||
| 167 | int mssind; | 166 | int mssind; |
| 168 | const __u16 mss = *mssp; | 167 | const __u16 mss = *mssp; |
| 169 | 168 | ||
| 170 | tcp_synq_overflow(sk); | ||
| 171 | |||
| 172 | for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) | 169 | for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) |
| 173 | if (mss >= msstab[mssind]) | 170 | if (mss >= msstab[mssind]) |
| 174 | break; | 171 | break; |
| 175 | *mssp = msstab[mssind]; | 172 | *mssp = msstab[mssind]; |
| 176 | 173 | ||
| 177 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); | ||
| 178 | |||
| 179 | return secure_tcp_syn_cookie(iph->saddr, iph->daddr, | 174 | return secure_tcp_syn_cookie(iph->saddr, iph->daddr, |
| 180 | th->source, th->dest, ntohl(th->seq), | 175 | th->source, th->dest, ntohl(th->seq), |
| 181 | jiffies / (HZ * 60), mssind); | 176 | jiffies / (HZ * 60), mssind); |
| 182 | } | 177 | } |
| 178 | EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); | ||
| 179 | |||
| 180 | __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) | ||
| 181 | { | ||
| 182 | const struct iphdr *iph = ip_hdr(skb); | ||
| 183 | const struct tcphdr *th = tcp_hdr(skb); | ||
| 184 | |||
| 185 | tcp_synq_overflow(sk); | ||
| 186 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); | ||
| 187 | |||
| 188 | return __cookie_v4_init_sequence(iph, th, mssp); | ||
| 189 | } | ||
| 183 | 190 | ||
| 184 | /* | 191 | /* |
| 185 | * This (misnamed) value is the age of syncookie which is permitted. | 192 | * This (misnamed) value is the age of syncookie which is permitted. |
| @@ -192,10 +199,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) | |||
| 192 | * Check if an ack sequence number is a valid syncookie. | 199 | * Check if an ack sequence number is a valid syncookie. |
| 193 | * Return the decoded mss if it is, or 0 if not. | 200 | * Return the decoded mss if it is, or 0 if not. |
| 194 | */ | 201 | */ |
| 195 | static inline int cookie_check(struct sk_buff *skb, __u32 cookie) | 202 | int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, |
| 203 | u32 cookie) | ||
| 196 | { | 204 | { |
| 197 | const struct iphdr *iph = ip_hdr(skb); | ||
| 198 | const struct tcphdr *th = tcp_hdr(skb); | ||
| 199 | __u32 seq = ntohl(th->seq) - 1; | 205 | __u32 seq = ntohl(th->seq) - 1; |
| 200 | __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, | 206 | __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, |
| 201 | th->source, th->dest, seq, | 207 | th->source, th->dest, seq, |
| @@ -204,6 +210,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) | |||
| 204 | 210 | ||
| 205 | return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; | 211 | return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; |
| 206 | } | 212 | } |
| 213 | EXPORT_SYMBOL_GPL(__cookie_v4_check); | ||
| 207 | 214 | ||
| 208 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, | 215 | static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, |
| 209 | struct request_sock *req, | 216 | struct request_sock *req, |
| @@ -284,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 284 | goto out; | 291 | goto out; |
| 285 | 292 | ||
| 286 | if (tcp_synq_no_recent_overflow(sk) || | 293 | if (tcp_synq_no_recent_overflow(sk) || |
| 287 | (mss = cookie_check(skb, cookie)) == 0) { | 294 | (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { |
| 288 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); | 295 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); |
| 289 | goto out; | 296 | goto out; |
| 290 | } | 297 | } |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 610e324348d1..540279f4c531 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | static int zero; | 29 | static int zero; |
| 30 | static int one = 1; | 30 | static int one = 1; |
| 31 | static int four = 4; | 31 | static int four = 4; |
| 32 | static int gso_max_segs = GSO_MAX_SEGS; | ||
| 32 | static int tcp_retr1_max = 255; | 33 | static int tcp_retr1_max = 255; |
| 33 | static int ip_local_port_range_min[] = { 1, 1 }; | 34 | static int ip_local_port_range_min[] = { 1, 1 }; |
| 34 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 35 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
| @@ -559,6 +560,13 @@ static struct ctl_table ipv4_table[] = { | |||
| 559 | .extra1 = &one, | 560 | .extra1 = &one, |
| 560 | }, | 561 | }, |
| 561 | { | 562 | { |
| 563 | .procname = "tcp_notsent_lowat", | ||
| 564 | .data = &sysctl_tcp_notsent_lowat, | ||
| 565 | .maxlen = sizeof(sysctl_tcp_notsent_lowat), | ||
| 566 | .mode = 0644, | ||
| 567 | .proc_handler = proc_dointvec, | ||
| 568 | }, | ||
| 569 | { | ||
| 562 | .procname = "tcp_rmem", | 570 | .procname = "tcp_rmem", |
| 563 | .data = &sysctl_tcp_rmem, | 571 | .data = &sysctl_tcp_rmem, |
| 564 | .maxlen = sizeof(sysctl_tcp_rmem), | 572 | .maxlen = sizeof(sysctl_tcp_rmem), |
| @@ -754,6 +762,15 @@ static struct ctl_table ipv4_table[] = { | |||
| 754 | .extra2 = &four, | 762 | .extra2 = &four, |
| 755 | }, | 763 | }, |
| 756 | { | 764 | { |
| 765 | .procname = "tcp_min_tso_segs", | ||
| 766 | .data = &sysctl_tcp_min_tso_segs, | ||
| 767 | .maxlen = sizeof(int), | ||
| 768 | .mode = 0644, | ||
| 769 | .proc_handler = proc_dointvec_minmax, | ||
| 770 | .extra1 = &zero, | ||
| 771 | .extra2 = &gso_max_segs, | ||
| 772 | }, | ||
| 773 | { | ||
| 757 | .procname = "udp_mem", | 774 | .procname = "udp_mem", |
| 758 | .data = &sysctl_udp_mem, | 775 | .data = &sysctl_udp_mem, |
| 759 | .maxlen = sizeof(sysctl_udp_mem), | 776 | .maxlen = sizeof(sysctl_udp_mem), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b2f6c74861af..6e5617b9f9db 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
| @@ -283,6 +283,8 @@ | |||
| 283 | 283 | ||
| 284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | 284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; |
| 285 | 285 | ||
| 286 | int sysctl_tcp_min_tso_segs __read_mostly = 2; | ||
| 287 | |||
| 286 | struct percpu_counter tcp_orphan_count; | 288 | struct percpu_counter tcp_orphan_count; |
| 287 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 289 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
| 288 | 290 | ||
| @@ -410,10 +412,6 @@ void tcp_init_sock(struct sock *sk) | |||
| 410 | 412 | ||
| 411 | icsk->icsk_sync_mss = tcp_sync_mss; | 413 | icsk->icsk_sync_mss = tcp_sync_mss; |
| 412 | 414 | ||
| 413 | /* Presumed zeroed, in order of appearance: | ||
| 414 | * cookie_in_always, cookie_out_never, | ||
| 415 | * s_data_constant, s_data_in, s_data_out | ||
| 416 | */ | ||
| 417 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; | 415 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; |
| 418 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; | 416 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; |
| 419 | 417 | ||
| @@ -499,7 +497,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
| 499 | mask |= POLLIN | POLLRDNORM; | 497 | mask |= POLLIN | POLLRDNORM; |
| 500 | 498 | ||
| 501 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { | 499 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { |
| 502 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { | 500 | if (sk_stream_is_writeable(sk)) { |
| 503 | mask |= POLLOUT | POLLWRNORM; | 501 | mask |= POLLOUT | POLLWRNORM; |
| 504 | } else { /* send SIGIO later */ | 502 | } else { /* send SIGIO later */ |
| 505 | set_bit(SOCK_ASYNC_NOSPACE, | 503 | set_bit(SOCK_ASYNC_NOSPACE, |
| @@ -510,7 +508,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
| 510 | * wspace test but before the flags are set, | 508 | * wspace test but before the flags are set, |
| 511 | * IO signal will be lost. | 509 | * IO signal will be lost. |
| 512 | */ | 510 | */ |
| 513 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) | 511 | if (sk_stream_is_writeable(sk)) |
| 514 | mask |= POLLOUT | POLLWRNORM; | 512 | mask |= POLLOUT | POLLWRNORM; |
| 515 | } | 513 | } |
| 516 | } else | 514 | } else |
| @@ -789,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
| 789 | xmit_size_goal = mss_now; | 787 | xmit_size_goal = mss_now; |
| 790 | 788 | ||
| 791 | if (large_allowed && sk_can_gso(sk)) { | 789 | if (large_allowed && sk_can_gso(sk)) { |
| 792 | xmit_size_goal = ((sk->sk_gso_max_size - 1) - | 790 | u32 gso_size, hlen; |
| 793 | inet_csk(sk)->icsk_af_ops->net_header_len - | 791 | |
| 794 | inet_csk(sk)->icsk_ext_hdr_len - | 792 | /* Maybe we should/could use sk->sk_prot->max_header here ? */ |
| 795 | tp->tcp_header_len); | 793 | hlen = inet_csk(sk)->icsk_af_ops->net_header_len + |
| 794 | inet_csk(sk)->icsk_ext_hdr_len + | ||
| 795 | tp->tcp_header_len; | ||
| 796 | |||
| 797 | /* Goal is to send at least one packet per ms, | ||
| 798 | * not one big TSO packet every 100 ms. | ||
| 799 | * This preserves ACK clocking and is consistent | ||
| 800 | * with tcp_tso_should_defer() heuristic. | ||
| 801 | */ | ||
| 802 | gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); | ||
| 803 | gso_size = max_t(u32, gso_size, | ||
| 804 | sysctl_tcp_min_tso_segs * mss_now); | ||
| 805 | |||
| 806 | xmit_size_goal = min_t(u32, gso_size, | ||
| 807 | sk->sk_gso_max_size - 1 - hlen); | ||
| 796 | 808 | ||
| 797 | /* TSQ : try to have two TSO segments in flight */ | 809 | /* TSQ : try to have at least two segments in flight |
| 810 | * (one in NIC TX ring, another in Qdisc) | ||
| 811 | */ | ||
| 798 | xmit_size_goal = min_t(u32, xmit_size_goal, | 812 | xmit_size_goal = min_t(u32, xmit_size_goal, |
| 799 | sysctl_tcp_limit_output_bytes >> 1); | 813 | sysctl_tcp_limit_output_bytes >> 1); |
| 800 | 814 | ||
| @@ -2454,10 +2468,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
| 2454 | case TCP_THIN_DUPACK: | 2468 | case TCP_THIN_DUPACK: |
| 2455 | if (val < 0 || val > 1) | 2469 | if (val < 0 || val > 1) |
| 2456 | err = -EINVAL; | 2470 | err = -EINVAL; |
| 2457 | else | 2471 | else { |
| 2458 | tp->thin_dupack = val; | 2472 | tp->thin_dupack = val; |
| 2459 | if (tp->thin_dupack) | 2473 | if (tp->thin_dupack) |
| 2460 | tcp_disable_early_retrans(tp); | 2474 | tcp_disable_early_retrans(tp); |
| 2475 | } | ||
| 2461 | break; | 2476 | break; |
| 2462 | 2477 | ||
| 2463 | case TCP_REPAIR: | 2478 | case TCP_REPAIR: |
| @@ -2638,6 +2653,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
| 2638 | else | 2653 | else |
| 2639 | tp->tsoffset = val - tcp_time_stamp; | 2654 | tp->tsoffset = val - tcp_time_stamp; |
| 2640 | break; | 2655 | break; |
| 2656 | case TCP_NOTSENT_LOWAT: | ||
| 2657 | tp->notsent_lowat = val; | ||
| 2658 | sk->sk_write_space(sk); | ||
| 2659 | break; | ||
| 2641 | default: | 2660 | default: |
| 2642 | err = -ENOPROTOOPT; | 2661 | err = -ENOPROTOOPT; |
| 2643 | break; | 2662 | break; |
| @@ -2854,6 +2873,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
| 2854 | case TCP_TIMESTAMP: | 2873 | case TCP_TIMESTAMP: |
| 2855 | val = tcp_time_stamp + tp->tsoffset; | 2874 | val = tcp_time_stamp + tp->tsoffset; |
| 2856 | break; | 2875 | break; |
| 2876 | case TCP_NOTSENT_LOWAT: | ||
| 2877 | val = tp->notsent_lowat; | ||
| 2878 | break; | ||
| 2857 | default: | 2879 | default: |
| 2858 | return -ENOPROTOOPT; | 2880 | return -ENOPROTOOPT; |
| 2859 | } | 2881 | } |
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8f7ef0ad80e5..ab7bd35bb312 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c | |||
| @@ -58,23 +58,22 @@ error: kfree(ctx); | |||
| 58 | return err; | 58 | return err; |
| 59 | } | 59 | } |
| 60 | 60 | ||
| 61 | /* Computes the fastopen cookie for the peer. | 61 | /* Computes the fastopen cookie for the IP path. |
| 62 | * The peer address is a 128 bits long (pad with zeros for IPv4). | 62 | * The path is 128 bits long (pad with zeros for IPv4). |
| 63 | * | 63 | * |
| 64 | * The caller must check foc->len to determine if a valid cookie | 64 | * The caller must check foc->len to determine if a valid cookie |
| 65 | * has been generated successfully. | 65 | * has been generated successfully. |
| 66 | */ | 66 | */ |
| 67 | void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) | 67 | void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, |
| 68 | struct tcp_fastopen_cookie *foc) | ||
| 68 | { | 69 | { |
| 69 | __be32 peer_addr[4] = { addr, 0, 0, 0 }; | 70 | __be32 path[4] = { src, dst, 0, 0 }; |
| 70 | struct tcp_fastopen_context *ctx; | 71 | struct tcp_fastopen_context *ctx; |
| 71 | 72 | ||
| 72 | rcu_read_lock(); | 73 | rcu_read_lock(); |
| 73 | ctx = rcu_dereference(tcp_fastopen_ctx); | 74 | ctx = rcu_dereference(tcp_fastopen_ctx); |
| 74 | if (ctx) { | 75 | if (ctx) { |
| 75 | crypto_cipher_encrypt_one(ctx->tfm, | 76 | crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); |
| 76 | foc->val, | ||
| 77 | (__u8 *)peer_addr); | ||
| 78 | foc->len = TCP_FASTOPEN_COOKIE_SIZE; | 77 | foc->len = TCP_FASTOPEN_COOKIE_SIZE; |
| 79 | } | 78 | } |
| 80 | rcu_read_unlock(); | 79 | rcu_read_unlock(); |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ca2139a130b..1969e16d936d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
| 688 | } | 688 | } |
| 689 | } | 689 | } |
| 690 | 690 | ||
| 691 | /* Set the sk_pacing_rate to allow proper sizing of TSO packets. | ||
| 692 | * Note: TCP stack does not yet implement pacing. | ||
| 693 | * FQ packet scheduler can be used to implement cheap but effective | ||
| 694 | * TCP pacing, to smooth the burst on large writes when packets | ||
| 695 | * in flight is significantly lower than cwnd (or rwin) | ||
| 696 | */ | ||
| 697 | static void tcp_update_pacing_rate(struct sock *sk) | ||
| 698 | { | ||
| 699 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 700 | u64 rate; | ||
| 701 | |||
| 702 | /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ | ||
| 703 | rate = (u64)tp->mss_cache * 2 * (HZ << 3); | ||
| 704 | |||
| 705 | rate *= max(tp->snd_cwnd, tp->packets_out); | ||
| 706 | |||
| 707 | /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), | ||
| 708 | * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) | ||
| 709 | * We probably need usec resolution in the future. | ||
| 710 | * Note: This also takes care of possible srtt=0 case, | ||
| 711 | * when tcp_rtt_estimator() was not yet called. | ||
| 712 | */ | ||
| 713 | if (tp->srtt > 8 + 2) | ||
| 714 | do_div(rate, tp->srtt); | ||
| 715 | |||
| 716 | sk->sk_pacing_rate = min_t(u64, rate, ~0U); | ||
| 717 | } | ||
| 718 | |||
| 691 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 719 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
| 692 | * routine referred to above. | 720 | * routine referred to above. |
| 693 | */ | 721 | */ |
| @@ -1048,6 +1076,7 @@ struct tcp_sacktag_state { | |||
| 1048 | int reord; | 1076 | int reord; |
| 1049 | int fack_count; | 1077 | int fack_count; |
| 1050 | int flag; | 1078 | int flag; |
| 1079 | s32 rtt; /* RTT measured by SACKing never-retransmitted data */ | ||
| 1051 | }; | 1080 | }; |
| 1052 | 1081 | ||
| 1053 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, | 1082 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
| @@ -1108,7 +1137,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | |||
| 1108 | static u8 tcp_sacktag_one(struct sock *sk, | 1137 | static u8 tcp_sacktag_one(struct sock *sk, |
| 1109 | struct tcp_sacktag_state *state, u8 sacked, | 1138 | struct tcp_sacktag_state *state, u8 sacked, |
| 1110 | u32 start_seq, u32 end_seq, | 1139 | u32 start_seq, u32 end_seq, |
| 1111 | bool dup_sack, int pcount) | 1140 | int dup_sack, int pcount, u32 xmit_time) |
| 1112 | { | 1141 | { |
| 1113 | struct tcp_sock *tp = tcp_sk(sk); | 1142 | struct tcp_sock *tp = tcp_sk(sk); |
| 1114 | int fack_count = state->fack_count; | 1143 | int fack_count = state->fack_count; |
| @@ -1148,6 +1177,9 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
| 1148 | state->reord); | 1177 | state->reord); |
| 1149 | if (!after(end_seq, tp->high_seq)) | 1178 | if (!after(end_seq, tp->high_seq)) |
| 1150 | state->flag |= FLAG_ORIG_SACK_ACKED; | 1179 | state->flag |= FLAG_ORIG_SACK_ACKED; |
| 1180 | /* Pick the earliest sequence sacked for RTT */ | ||
| 1181 | if (state->rtt < 0) | ||
| 1182 | state->rtt = tcp_time_stamp - xmit_time; | ||
| 1151 | } | 1183 | } |
| 1152 | 1184 | ||
| 1153 | if (sacked & TCPCB_LOST) { | 1185 | if (sacked & TCPCB_LOST) { |
| @@ -1205,7 +1237,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
| 1205 | * tcp_highest_sack_seq() when skb is highest_sack. | 1237 | * tcp_highest_sack_seq() when skb is highest_sack. |
| 1206 | */ | 1238 | */ |
| 1207 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, | 1239 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
| 1208 | start_seq, end_seq, dup_sack, pcount); | 1240 | start_seq, end_seq, dup_sack, pcount, |
| 1241 | TCP_SKB_CB(skb)->when); | ||
| 1209 | 1242 | ||
| 1210 | if (skb == tp->lost_skb_hint) | 1243 | if (skb == tp->lost_skb_hint) |
| 1211 | tp->lost_cnt_hint += pcount; | 1244 | tp->lost_cnt_hint += pcount; |
| @@ -1479,7 +1512,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
| 1479 | TCP_SKB_CB(skb)->seq, | 1512 | TCP_SKB_CB(skb)->seq, |
| 1480 | TCP_SKB_CB(skb)->end_seq, | 1513 | TCP_SKB_CB(skb)->end_seq, |
| 1481 | dup_sack, | 1514 | dup_sack, |
| 1482 | tcp_skb_pcount(skb)); | 1515 | tcp_skb_pcount(skb), |
| 1516 | TCP_SKB_CB(skb)->when); | ||
| 1483 | 1517 | ||
| 1484 | if (!before(TCP_SKB_CB(skb)->seq, | 1518 | if (!before(TCP_SKB_CB(skb)->seq, |
| 1485 | tcp_highest_sack_seq(tp))) | 1519 | tcp_highest_sack_seq(tp))) |
| @@ -1536,7 +1570,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl | |||
| 1536 | 1570 | ||
| 1537 | static int | 1571 | static int |
| 1538 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | 1572 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, |
| 1539 | u32 prior_snd_una) | 1573 | u32 prior_snd_una, s32 *sack_rtt) |
| 1540 | { | 1574 | { |
| 1541 | struct tcp_sock *tp = tcp_sk(sk); | 1575 | struct tcp_sock *tp = tcp_sk(sk); |
| 1542 | const unsigned char *ptr = (skb_transport_header(ack_skb) + | 1576 | const unsigned char *ptr = (skb_transport_header(ack_skb) + |
| @@ -1554,6 +1588,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
| 1554 | 1588 | ||
| 1555 | state.flag = 0; | 1589 | state.flag = 0; |
| 1556 | state.reord = tp->packets_out; | 1590 | state.reord = tp->packets_out; |
| 1591 | state.rtt = -1; | ||
| 1557 | 1592 | ||
| 1558 | if (!tp->sacked_out) { | 1593 | if (!tp->sacked_out) { |
| 1559 | if (WARN_ON(tp->fackets_out)) | 1594 | if (WARN_ON(tp->fackets_out)) |
| @@ -1737,6 +1772,7 @@ out: | |||
| 1737 | WARN_ON((int)tp->retrans_out < 0); | 1772 | WARN_ON((int)tp->retrans_out < 0); |
| 1738 | WARN_ON((int)tcp_packets_in_flight(tp) < 0); | 1773 | WARN_ON((int)tcp_packets_in_flight(tp) < 0); |
| 1739 | #endif | 1774 | #endif |
| 1775 | *sack_rtt = state.rtt; | ||
| 1740 | return state.flag; | 1776 | return state.flag; |
| 1741 | } | 1777 | } |
| 1742 | 1778 | ||
| @@ -1869,8 +1905,13 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
| 1869 | } | 1905 | } |
| 1870 | tcp_verify_left_out(tp); | 1906 | tcp_verify_left_out(tp); |
| 1871 | 1907 | ||
| 1872 | tp->reordering = min_t(unsigned int, tp->reordering, | 1908 | /* Timeout in disordered state after receiving substantial DUPACKs |
| 1873 | sysctl_tcp_reordering); | 1909 | * suggests that the degree of reordering is over-estimated. |
| 1910 | */ | ||
| 1911 | if (icsk->icsk_ca_state <= TCP_CA_Disorder && | ||
| 1912 | tp->sacked_out >= sysctl_tcp_reordering) | ||
| 1913 | tp->reordering = min_t(unsigned int, tp->reordering, | ||
| 1914 | sysctl_tcp_reordering); | ||
| 1874 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1915 | tcp_set_ca_state(sk, TCP_CA_Loss); |
| 1875 | tp->high_seq = tp->snd_nxt; | 1916 | tp->high_seq = tp->snd_nxt; |
| 1876 | TCP_ECN_queue_cwr(tp); | 1917 | TCP_ECN_queue_cwr(tp); |
| @@ -2472,8 +2513,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) | |||
| 2472 | 2513 | ||
| 2473 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { | 2514 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { |
| 2474 | tcp_try_keep_open(sk); | 2515 | tcp_try_keep_open(sk); |
| 2475 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) | ||
| 2476 | tcp_moderate_cwnd(tp); | ||
| 2477 | } else { | 2516 | } else { |
| 2478 | tcp_cwnd_reduction(sk, prior_unsacked, 0); | 2517 | tcp_cwnd_reduction(sk, prior_unsacked, 0); |
| 2479 | } | 2518 | } |
| @@ -2792,65 +2831,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, | |||
| 2792 | tcp_xmit_retransmit_queue(sk); | 2831 | tcp_xmit_retransmit_queue(sk); |
| 2793 | } | 2832 | } |
| 2794 | 2833 | ||
| 2795 | void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) | 2834 | static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, |
| 2835 | s32 seq_rtt, s32 sack_rtt) | ||
| 2796 | { | 2836 | { |
| 2797 | tcp_rtt_estimator(sk, seq_rtt); | 2837 | const struct tcp_sock *tp = tcp_sk(sk); |
| 2798 | tcp_set_rto(sk); | 2838 | |
| 2799 | inet_csk(sk)->icsk_backoff = 0; | 2839 | /* Prefer RTT measured from ACK's timing to TS-ECR. This is because |
| 2800 | } | 2840 | * broken middle-boxes or peers may corrupt TS-ECR fields. But |
| 2801 | EXPORT_SYMBOL(tcp_valid_rtt_meas); | 2841 | * Karn's algorithm forbids taking RTT if some retransmitted data |
| 2842 | * is acked (RFC6298). | ||
| 2843 | */ | ||
| 2844 | if (flag & FLAG_RETRANS_DATA_ACKED) | ||
| 2845 | seq_rtt = -1; | ||
| 2846 | |||
| 2847 | if (seq_rtt < 0) | ||
| 2848 | seq_rtt = sack_rtt; | ||
| 2802 | 2849 | ||
| 2803 | /* Read draft-ietf-tcplw-high-performance before mucking | ||
| 2804 | * with this code. (Supersedes RFC1323) | ||
| 2805 | */ | ||
| 2806 | static void tcp_ack_saw_tstamp(struct sock *sk, int flag) | ||
| 2807 | { | ||
| 2808 | /* RTTM Rule: A TSecr value received in a segment is used to | 2850 | /* RTTM Rule: A TSecr value received in a segment is used to |
| 2809 | * update the averaged RTT measurement only if the segment | 2851 | * update the averaged RTT measurement only if the segment |
| 2810 | * acknowledges some new data, i.e., only if it advances the | 2852 | * acknowledges some new data, i.e., only if it advances the |
| 2811 | * left edge of the send window. | 2853 | * left edge of the send window. |
| 2812 | * | ||
| 2813 | * See draft-ietf-tcplw-high-performance-00, section 3.3. | 2854 | * See draft-ietf-tcplw-high-performance-00, section 3.3. |
| 2814 | * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> | ||
| 2815 | * | ||
| 2816 | * Changed: reset backoff as soon as we see the first valid sample. | ||
| 2817 | * If we do not, we get strongly overestimated rto. With timestamps | ||
| 2818 | * samples are accepted even from very old segments: f.e., when rtt=1 | ||
| 2819 | * increases to 8, we retransmit 5 times and after 8 seconds delayed | ||
| 2820 | * answer arrives rto becomes 120 seconds! If at least one of segments | ||
| 2821 | * in window is lost... Voila. --ANK (010210) | ||
| 2822 | */ | 2855 | */ |
| 2823 | struct tcp_sock *tp = tcp_sk(sk); | 2856 | if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 2824 | 2857 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | |
| 2825 | tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); | ||
| 2826 | } | ||
| 2827 | 2858 | ||
| 2828 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) | 2859 | if (seq_rtt < 0) |
| 2829 | { | 2860 | return false; |
| 2830 | /* We don't have a timestamp. Can only use | ||
| 2831 | * packets that are not retransmitted to determine | ||
| 2832 | * rtt estimates. Also, we must not reset the | ||
| 2833 | * backoff for rto until we get a non-retransmitted | ||
| 2834 | * packet. This allows us to deal with a situation | ||
| 2835 | * where the network delay has increased suddenly. | ||
| 2836 | * I.e. Karn's algorithm. (SIGCOMM '87, p5.) | ||
| 2837 | */ | ||
| 2838 | 2861 | ||
| 2839 | if (flag & FLAG_RETRANS_DATA_ACKED) | 2862 | tcp_rtt_estimator(sk, seq_rtt); |
| 2840 | return; | 2863 | tcp_set_rto(sk); |
| 2841 | 2864 | ||
| 2842 | tcp_valid_rtt_meas(sk, seq_rtt); | 2865 | /* RFC6298: only reset backoff on valid RTT measurement. */ |
| 2866 | inet_csk(sk)->icsk_backoff = 0; | ||
| 2867 | return true; | ||
| 2843 | } | 2868 | } |
| 2844 | 2869 | ||
| 2845 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, | 2870 | /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ |
| 2846 | const s32 seq_rtt) | 2871 | static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) |
| 2847 | { | 2872 | { |
| 2848 | const struct tcp_sock *tp = tcp_sk(sk); | 2873 | struct tcp_sock *tp = tcp_sk(sk); |
| 2849 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 2874 | s32 seq_rtt = -1; |
| 2850 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 2875 | |
| 2851 | tcp_ack_saw_tstamp(sk, flag); | 2876 | if (tp->lsndtime && !tp->total_retrans) |
| 2852 | else if (seq_rtt >= 0) | 2877 | seq_rtt = tcp_time_stamp - tp->lsndtime; |
| 2853 | tcp_ack_no_tstamp(sk, seq_rtt, flag); | 2878 | tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); |
| 2854 | } | 2879 | } |
| 2855 | 2880 | ||
| 2856 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | 2881 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) |
| @@ -2939,7 +2964,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) | |||
| 2939 | * arrived at the other end. | 2964 | * arrived at the other end. |
| 2940 | */ | 2965 | */ |
| 2941 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | 2966 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, |
| 2942 | u32 prior_snd_una) | 2967 | u32 prior_snd_una, s32 sack_rtt) |
| 2943 | { | 2968 | { |
| 2944 | struct tcp_sock *tp = tcp_sk(sk); | 2969 | struct tcp_sock *tp = tcp_sk(sk); |
| 2945 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2970 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| @@ -2978,8 +3003,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
| 2978 | if (sacked & TCPCB_SACKED_RETRANS) | 3003 | if (sacked & TCPCB_SACKED_RETRANS) |
| 2979 | tp->retrans_out -= acked_pcount; | 3004 | tp->retrans_out -= acked_pcount; |
| 2980 | flag |= FLAG_RETRANS_DATA_ACKED; | 3005 | flag |= FLAG_RETRANS_DATA_ACKED; |
| 2981 | ca_seq_rtt = -1; | ||
| 2982 | seq_rtt = -1; | ||
| 2983 | } else { | 3006 | } else { |
| 2984 | ca_seq_rtt = now - scb->when; | 3007 | ca_seq_rtt = now - scb->when; |
| 2985 | last_ackt = skb->tstamp; | 3008 | last_ackt = skb->tstamp; |
| @@ -3031,6 +3054,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
| 3031 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | 3054 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
| 3032 | flag |= FLAG_SACK_RENEGING; | 3055 | flag |= FLAG_SACK_RENEGING; |
| 3033 | 3056 | ||
| 3057 | if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || | ||
| 3058 | (flag & FLAG_ACKED)) | ||
| 3059 | tcp_rearm_rto(sk); | ||
| 3060 | |||
| 3034 | if (flag & FLAG_ACKED) { | 3061 | if (flag & FLAG_ACKED) { |
| 3035 | const struct tcp_congestion_ops *ca_ops | 3062 | const struct tcp_congestion_ops *ca_ops |
| 3036 | = inet_csk(sk)->icsk_ca_ops; | 3063 | = inet_csk(sk)->icsk_ca_ops; |
| @@ -3040,9 +3067,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
| 3040 | tcp_mtup_probe_success(sk); | 3067 | tcp_mtup_probe_success(sk); |
| 3041 | } | 3068 | } |
| 3042 | 3069 | ||
| 3043 | tcp_ack_update_rtt(sk, flag, seq_rtt); | ||
| 3044 | tcp_rearm_rto(sk); | ||
| 3045 | |||
| 3046 | if (tcp_is_reno(tp)) { | 3070 | if (tcp_is_reno(tp)) { |
| 3047 | tcp_remove_reno_sacks(sk, pkts_acked); | 3071 | tcp_remove_reno_sacks(sk, pkts_acked); |
| 3048 | } else { | 3072 | } else { |
| @@ -3130,11 +3154,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) | |||
| 3130 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; | 3154 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; |
| 3131 | } | 3155 | } |
| 3132 | 3156 | ||
| 3157 | /* Decide whether to run the increase function of congestion control. */ | ||
| 3133 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) | 3158 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
| 3134 | { | 3159 | { |
| 3135 | const struct tcp_sock *tp = tcp_sk(sk); | 3160 | if (tcp_in_cwnd_reduction(sk)) |
| 3136 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && | 3161 | return false; |
| 3137 | !tcp_in_cwnd_reduction(sk); | 3162 | |
| 3163 | /* If reordering is high then always grow cwnd whenever data is | ||
| 3164 | * delivered regardless of its ordering. Otherwise stay conservative | ||
| 3165 | * and only grow cwnd on in-order delivery in Open state, and retain | ||
| 3166 | * cwnd in Disordered state (RFC5681). A stretched ACK with | ||
| 3167 | * new SACK or ECE mark may first advance cwnd here and later reduce | ||
| 3168 | * cwnd in tcp_fastretrans_alert() based on more states. | ||
| 3169 | */ | ||
| 3170 | if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) | ||
| 3171 | return flag & FLAG_FORWARD_PROGRESS; | ||
| 3172 | |||
| 3173 | return inet_csk(sk)->icsk_ca_state == TCP_CA_Open && | ||
| 3174 | flag & FLAG_DATA_ACKED; | ||
| 3138 | } | 3175 | } |
| 3139 | 3176 | ||
| 3140 | /* Check that window update is acceptable. | 3177 | /* Check that window update is acceptable. |
| @@ -3269,11 +3306,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
| 3269 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3306 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
| 3270 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3307 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 3271 | bool is_dupack = false; | 3308 | bool is_dupack = false; |
| 3272 | u32 prior_in_flight; | 3309 | u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; |
| 3273 | u32 prior_fackets; | 3310 | u32 prior_fackets; |
| 3274 | int prior_packets = tp->packets_out; | 3311 | int prior_packets = tp->packets_out; |
| 3275 | const int prior_unsacked = tp->packets_out - tp->sacked_out; | 3312 | const int prior_unsacked = tp->packets_out - tp->sacked_out; |
| 3276 | int acked = 0; /* Number of packets newly acked */ | 3313 | int acked = 0; /* Number of packets newly acked */ |
| 3314 | s32 sack_rtt = -1; | ||
| 3277 | 3315 | ||
| 3278 | /* If the ack is older than previous acks | 3316 | /* If the ack is older than previous acks |
| 3279 | * then we can probably ignore it. | 3317 | * then we can probably ignore it. |
| @@ -3330,7 +3368,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
| 3330 | flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); | 3368 | flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); |
| 3331 | 3369 | ||
| 3332 | if (TCP_SKB_CB(skb)->sacked) | 3370 | if (TCP_SKB_CB(skb)->sacked) |
| 3333 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); | 3371 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
| 3372 | &sack_rtt); | ||
| 3334 | 3373 | ||
| 3335 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) | 3374 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) |
| 3336 | flag |= FLAG_ECE; | 3375 | flag |= FLAG_ECE; |
| @@ -3349,21 +3388,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
| 3349 | 3388 | ||
| 3350 | /* See if we can take anything off of the retransmit queue. */ | 3389 | /* See if we can take anything off of the retransmit queue. */ |
| 3351 | acked = tp->packets_out; | 3390 | acked = tp->packets_out; |
| 3352 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); | 3391 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); |
| 3353 | acked -= tp->packets_out; | 3392 | acked -= tp->packets_out; |
| 3354 | 3393 | ||
| 3394 | /* Advance cwnd if state allows */ | ||
| 3395 | if (tcp_may_raise_cwnd(sk, flag)) | ||
| 3396 | tcp_cong_avoid(sk, ack, prior_in_flight); | ||
| 3397 | |||
| 3355 | if (tcp_ack_is_dubious(sk, flag)) { | 3398 | if (tcp_ack_is_dubious(sk, flag)) { |
| 3356 | /* Advance CWND, if state allows this. */ | ||
| 3357 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) | ||
| 3358 | tcp_cong_avoid(sk, ack, prior_in_flight); | ||
| 3359 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | 3399 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
| 3360 | tcp_fastretrans_alert(sk, acked, prior_unsacked, | 3400 | tcp_fastretrans_alert(sk, acked, prior_unsacked, |
| 3361 | is_dupack, flag); | 3401 | is_dupack, flag); |
| 3362 | } else { | ||
| 3363 | if (flag & FLAG_DATA_ACKED) | ||
| 3364 | tcp_cong_avoid(sk, ack, prior_in_flight); | ||
| 3365 | } | 3402 | } |
| 3366 | |||
| 3367 | if (tp->tlp_high_seq) | 3403 | if (tp->tlp_high_seq) |
| 3368 | tcp_process_tlp_ack(sk, ack, flag); | 3404 | tcp_process_tlp_ack(sk, ack, flag); |
| 3369 | 3405 | ||
| @@ -3375,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
| 3375 | 3411 | ||
| 3376 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) | 3412 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) |
| 3377 | tcp_schedule_loss_probe(sk); | 3413 | tcp_schedule_loss_probe(sk); |
| 3414 | if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) | ||
| 3415 | tcp_update_pacing_rate(sk); | ||
| 3378 | return 1; | 3416 | return 1; |
| 3379 | 3417 | ||
| 3380 | no_queue: | 3418 | no_queue: |
| @@ -3402,7 +3440,8 @@ old_ack: | |||
| 3402 | * If data was DSACKed, see if we can undo a cwnd reduction. | 3440 | * If data was DSACKed, see if we can undo a cwnd reduction. |
| 3403 | */ | 3441 | */ |
| 3404 | if (TCP_SKB_CB(skb)->sacked) { | 3442 | if (TCP_SKB_CB(skb)->sacked) { |
| 3405 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); | 3443 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
| 3444 | &sack_rtt); | ||
| 3406 | tcp_fastretrans_alert(sk, acked, prior_unsacked, | 3445 | tcp_fastretrans_alert(sk, acked, prior_unsacked, |
| 3407 | is_dupack, flag); | 3446 | is_dupack, flag); |
| 3408 | } | 3447 | } |
| @@ -5013,8 +5052,8 @@ discard: | |||
| 5013 | * the rest is checked inline. Fast processing is turned on in | 5052 | * the rest is checked inline. Fast processing is turned on in |
| 5014 | * tcp_data_queue when everything is OK. | 5053 | * tcp_data_queue when everything is OK. |
| 5015 | */ | 5054 | */ |
| 5016 | int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | 5055 | void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, |
| 5017 | const struct tcphdr *th, unsigned int len) | 5056 | const struct tcphdr *th, unsigned int len) |
| 5018 | { | 5057 | { |
| 5019 | struct tcp_sock *tp = tcp_sk(sk); | 5058 | struct tcp_sock *tp = tcp_sk(sk); |
| 5020 | 5059 | ||
| @@ -5091,7 +5130,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 5091 | tcp_ack(sk, skb, 0); | 5130 | tcp_ack(sk, skb, 0); |
| 5092 | __kfree_skb(skb); | 5131 | __kfree_skb(skb); |
| 5093 | tcp_data_snd_check(sk); | 5132 | tcp_data_snd_check(sk); |
| 5094 | return 0; | 5133 | return; |
| 5095 | } else { /* Header too small */ | 5134 | } else { /* Header too small */ |
| 5096 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | 5135 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); |
| 5097 | goto discard; | 5136 | goto discard; |
| @@ -5184,7 +5223,7 @@ no_ack: | |||
| 5184 | if (eaten) | 5223 | if (eaten) |
| 5185 | kfree_skb_partial(skb, fragstolen); | 5224 | kfree_skb_partial(skb, fragstolen); |
| 5186 | sk->sk_data_ready(sk, 0); | 5225 | sk->sk_data_ready(sk, 0); |
| 5187 | return 0; | 5226 | return; |
| 5188 | } | 5227 | } |
| 5189 | } | 5228 | } |
| 5190 | 5229 | ||
| @@ -5200,7 +5239,7 @@ slow_path: | |||
| 5200 | */ | 5239 | */ |
| 5201 | 5240 | ||
| 5202 | if (!tcp_validate_incoming(sk, skb, th, 1)) | 5241 | if (!tcp_validate_incoming(sk, skb, th, 1)) |
| 5203 | return 0; | 5242 | return; |
| 5204 | 5243 | ||
| 5205 | step5: | 5244 | step5: |
| 5206 | if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) | 5245 | if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) |
| @@ -5216,7 +5255,7 @@ step5: | |||
| 5216 | 5255 | ||
| 5217 | tcp_data_snd_check(sk); | 5256 | tcp_data_snd_check(sk); |
| 5218 | tcp_ack_snd_check(sk); | 5257 | tcp_ack_snd_check(sk); |
| 5219 | return 0; | 5258 | return; |
| 5220 | 5259 | ||
| 5221 | csum_error: | 5260 | csum_error: |
| 5222 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); | 5261 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); |
| @@ -5224,7 +5263,6 @@ csum_error: | |||
| 5224 | 5263 | ||
| 5225 | discard: | 5264 | discard: |
| 5226 | __kfree_skb(skb); | 5265 | __kfree_skb(skb); |
| 5227 | return 0; | ||
| 5228 | } | 5266 | } |
| 5229 | EXPORT_SYMBOL(tcp_rcv_established); | 5267 | EXPORT_SYMBOL(tcp_rcv_established); |
| 5230 | 5268 | ||
| @@ -5627,9 +5665,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5627 | * so release it. | 5665 | * so release it. |
| 5628 | */ | 5666 | */ |
| 5629 | if (req) { | 5667 | if (req) { |
| 5630 | tcp_synack_rtt_meas(sk, req); | ||
| 5631 | tp->total_retrans = req->num_retrans; | 5668 | tp->total_retrans = req->num_retrans; |
| 5632 | |||
| 5633 | reqsk_fastopen_remove(sk, req, false); | 5669 | reqsk_fastopen_remove(sk, req, false); |
| 5634 | } else { | 5670 | } else { |
| 5635 | /* Make sure socket is routed, for correct metrics. */ | 5671 | /* Make sure socket is routed, for correct metrics. */ |
| @@ -5654,6 +5690,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5654 | tp->snd_una = TCP_SKB_CB(skb)->ack_seq; | 5690 | tp->snd_una = TCP_SKB_CB(skb)->ack_seq; |
| 5655 | tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; | 5691 | tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; |
| 5656 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | 5692 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
| 5693 | tcp_synack_rtt_meas(sk, req); | ||
| 5657 | 5694 | ||
| 5658 | if (tp->rx_opt.tstamp_ok) | 5695 | if (tp->rx_opt.tstamp_ok) |
| 5659 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 5696 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b299da5ff499..b14266bb91eb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
| @@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
| 821 | */ | 821 | */ |
| 822 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | 822 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, |
| 823 | struct request_sock *req, | 823 | struct request_sock *req, |
| 824 | u16 queue_mapping, | 824 | u16 queue_mapping) |
| 825 | bool nocache) | ||
| 826 | { | 825 | { |
| 827 | const struct inet_request_sock *ireq = inet_rsk(req); | 826 | const struct inet_request_sock *ireq = inet_rsk(req); |
| 828 | struct flowi4 fl4; | 827 | struct flowi4 fl4; |
| @@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
| 852 | 851 | ||
| 853 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) | 852 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) |
| 854 | { | 853 | { |
| 855 | int res = tcp_v4_send_synack(sk, NULL, req, 0, false); | 854 | int res = tcp_v4_send_synack(sk, NULL, req, 0); |
| 856 | 855 | ||
| 857 | if (!res) | 856 | if (!res) |
| 858 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); | 857 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
| @@ -890,7 +889,7 @@ bool tcp_syn_flood_action(struct sock *sk, | |||
| 890 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); | 889 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); |
| 891 | 890 | ||
| 892 | lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; | 891 | lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; |
| 893 | if (!lopt->synflood_warned) { | 892 | if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { |
| 894 | lopt->synflood_warned = 1; | 893 | lopt->synflood_warned = 1; |
| 895 | pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", | 894 | pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", |
| 896 | proto, ntohs(tcp_hdr(skb)->dest), msg); | 895 | proto, ntohs(tcp_hdr(skb)->dest), msg); |
| @@ -1316,9 +1315,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, | |||
| 1316 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 1315 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
| 1317 | return true; | 1316 | return true; |
| 1318 | } | 1317 | } |
| 1318 | |||
| 1319 | if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { | 1319 | if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { |
| 1320 | if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { | 1320 | if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { |
| 1321 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | 1321 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, |
| 1322 | ip_hdr(skb)->daddr, valid_foc); | ||
| 1322 | if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || | 1323 | if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || |
| 1323 | memcmp(&foc->val[0], &valid_foc->val[0], | 1324 | memcmp(&foc->val[0], &valid_foc->val[0], |
| 1324 | TCP_FASTOPEN_COOKIE_SIZE) != 0) | 1325 | TCP_FASTOPEN_COOKIE_SIZE) != 0) |
| @@ -1329,14 +1330,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, | |||
| 1329 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 1330 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
| 1330 | return true; | 1331 | return true; |
| 1331 | } else if (foc->len == 0) { /* Client requesting a cookie */ | 1332 | } else if (foc->len == 0) { /* Client requesting a cookie */ |
| 1332 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | 1333 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, |
| 1334 | ip_hdr(skb)->daddr, valid_foc); | ||
| 1333 | NET_INC_STATS_BH(sock_net(sk), | 1335 | NET_INC_STATS_BH(sock_net(sk), |
| 1334 | LINUX_MIB_TCPFASTOPENCOOKIEREQD); | 1336 | LINUX_MIB_TCPFASTOPENCOOKIEREQD); |
| 1335 | } else { | 1337 | } else { |
| 1336 | /* Client sent a cookie with wrong size. Treat it | 1338 | /* Client sent a cookie with wrong size. Treat it |
| 1337 | * the same as invalid and return a valid one. | 1339 | * the same as invalid and return a valid one. |
| 1338 | */ | 1340 | */ |
| 1339 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | 1341 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, |
| 1342 | ip_hdr(skb)->daddr, valid_foc); | ||
| 1340 | } | 1343 | } |
| 1341 | return false; | 1344 | return false; |
| 1342 | } | 1345 | } |
| @@ -1462,7 +1465,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1462 | * limitations, they conserve resources and peer is | 1465 | * limitations, they conserve resources and peer is |
| 1463 | * evidently real one. | 1466 | * evidently real one. |
| 1464 | */ | 1467 | */ |
| 1465 | if (inet_csk_reqsk_queue_is_full(sk) && !isn) { | 1468 | if ((sysctl_tcp_syncookies == 2 || |
| 1469 | inet_csk_reqsk_queue_is_full(sk)) && !isn) { | ||
| 1466 | want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); | 1470 | want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); |
| 1467 | if (!want_cookie) | 1471 | if (!want_cookie) |
| 1468 | goto drop; | 1472 | goto drop; |
| @@ -1671,8 +1675,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1671 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; | 1675 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; |
| 1672 | 1676 | ||
| 1673 | tcp_initialize_rcv_mss(newsk); | 1677 | tcp_initialize_rcv_mss(newsk); |
| 1674 | tcp_synack_rtt_meas(newsk, req); | ||
| 1675 | newtp->total_retrans = req->num_retrans; | ||
| 1676 | 1678 | ||
| 1677 | #ifdef CONFIG_TCP_MD5SIG | 1679 | #ifdef CONFIG_TCP_MD5SIG |
| 1678 | /* Copy over the MD5 key from the original socket */ | 1680 | /* Copy over the MD5 key from the original socket */ |
| @@ -1797,10 +1799,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
| 1797 | sk->sk_rx_dst = NULL; | 1799 | sk->sk_rx_dst = NULL; |
| 1798 | } | 1800 | } |
| 1799 | } | 1801 | } |
| 1800 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { | 1802 | tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); |
| 1801 | rsk = sk; | ||
| 1802 | goto reset; | ||
| 1803 | } | ||
| 1804 | return 0; | 1803 | return 0; |
| 1805 | } | 1804 | } |
| 1806 | 1805 | ||
| @@ -2605,7 +2604,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, | |||
| 2605 | long delta = req->expires - jiffies; | 2604 | long delta = req->expires - jiffies; |
| 2606 | 2605 | ||
| 2607 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" | 2606 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" |
| 2608 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", | 2607 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n", |
| 2609 | i, | 2608 | i, |
| 2610 | ireq->loc_addr, | 2609 | ireq->loc_addr, |
| 2611 | ntohs(inet_sk(sk)->inet_sport), | 2610 | ntohs(inet_sk(sk)->inet_sport), |
| @@ -2663,7 +2662,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
| 2663 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); | 2662 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); |
| 2664 | 2663 | ||
| 2665 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " | 2664 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " |
| 2666 | "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", | 2665 | "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n", |
| 2667 | i, src, srcp, dest, destp, sk->sk_state, | 2666 | i, src, srcp, dest, destp, sk->sk_state, |
| 2668 | tp->write_seq - tp->snd_una, | 2667 | tp->write_seq - tp->snd_una, |
| 2669 | rx_queue, | 2668 | rx_queue, |
| @@ -2802,6 +2801,7 @@ struct proto tcp_prot = { | |||
| 2802 | .unhash = inet_unhash, | 2801 | .unhash = inet_unhash, |
| 2803 | .get_port = inet_csk_get_port, | 2802 | .get_port = inet_csk_get_port, |
| 2804 | .enter_memory_pressure = tcp_enter_memory_pressure, | 2803 | .enter_memory_pressure = tcp_enter_memory_pressure, |
| 2804 | .stream_memory_free = tcp_stream_memory_free, | ||
| 2805 | .sockets_allocated = &tcp_sockets_allocated, | 2805 | .sockets_allocated = &tcp_sockets_allocated, |
| 2806 | .orphan_count = &tcp_orphan_count, | 2806 | .orphan_count = &tcp_orphan_count, |
| 2807 | .memory_allocated = &tcp_memory_allocated, | 2807 | .memory_allocated = &tcp_memory_allocated, |
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f6a005c485a9..4a22f3e715df 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c | |||
| @@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk) | |||
| 443 | struct dst_entry *dst = __sk_dst_get(sk); | 443 | struct dst_entry *dst = __sk_dst_get(sk); |
| 444 | struct tcp_sock *tp = tcp_sk(sk); | 444 | struct tcp_sock *tp = tcp_sk(sk); |
| 445 | struct tcp_metrics_block *tm; | 445 | struct tcp_metrics_block *tm; |
| 446 | u32 val; | 446 | u32 val, crtt = 0; /* cached RTT scaled by 8 */ |
| 447 | 447 | ||
| 448 | if (dst == NULL) | 448 | if (dst == NULL) |
| 449 | goto reset; | 449 | goto reset; |
| @@ -478,15 +478,19 @@ void tcp_init_metrics(struct sock *sk) | |||
| 478 | tp->reordering = val; | 478 | tp->reordering = val; |
| 479 | } | 479 | } |
| 480 | 480 | ||
| 481 | val = tcp_metric_get(tm, TCP_METRIC_RTT); | 481 | crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); |
| 482 | if (val == 0 || tp->srtt == 0) { | 482 | rcu_read_unlock(); |
| 483 | rcu_read_unlock(); | 483 | reset: |
| 484 | goto reset; | 484 | /* The initial RTT measurement from the SYN/SYN-ACK is not ideal |
| 485 | } | 485 | * to seed the RTO for later data packets because SYN packets are |
| 486 | /* Initial rtt is determined from SYN,SYN-ACK. | 486 | * small. Use the per-dst cached values to seed the RTO but keep |
| 487 | * The segment is small and rtt may appear much | 487 | * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). |
| 488 | * less than real one. Use per-dst memory | 488 | * Later the RTO will be updated immediately upon obtaining the first |
| 489 | * to make it more realistic. | 489 | * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only |
| 490 | * influences the first RTO but not later RTT estimation. | ||
| 491 | * | ||
| 492 | * But if RTT is not available from the SYN (due to retransmits or | ||
| 493 | * syn cookies) or the cache, force a conservative 3secs timeout. | ||
| 490 | * | 494 | * |
| 491 | * A bit of theory. RTT is time passed after "normal" sized packet | 495 | * A bit of theory. RTT is time passed after "normal" sized packet |
| 492 | * is sent until it is ACKed. In normal circumstances sending small | 496 | * is sent until it is ACKed. In normal circumstances sending small |
| @@ -497,21 +501,9 @@ void tcp_init_metrics(struct sock *sk) | |||
| 497 | * to low value, and then abruptly stops to do it and starts to delay | 501 | * to low value, and then abruptly stops to do it and starts to delay |
| 498 | * ACKs, wait for troubles. | 502 | * ACKs, wait for troubles. |
| 499 | */ | 503 | */ |
| 500 | val = msecs_to_jiffies(val); | 504 | if (crtt > tp->srtt) { |
| 501 | if (val > tp->srtt) { | 505 | inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk)); |
| 502 | tp->srtt = val; | 506 | } else if (tp->srtt == 0) { |
| 503 | tp->rtt_seq = tp->snd_nxt; | ||
| 504 | } | ||
| 505 | val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
| 506 | if (val > tp->mdev) { | ||
| 507 | tp->mdev = val; | ||
| 508 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
| 509 | } | ||
| 510 | rcu_read_unlock(); | ||
| 511 | |||
| 512 | tcp_set_rto(sk); | ||
| 513 | reset: | ||
| 514 | if (tp->srtt == 0) { | ||
| 515 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | 507 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from |
| 516 | * 3WHS. This is most likely due to retransmission, | 508 | * 3WHS. This is most likely due to retransmission, |
| 517 | * including spurious one. Reset the RTO back to 3secs | 509 | * including spurious one. Reset the RTO back to 3secs |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ab1c08658528..58a3e69aef64 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
| @@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 411 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | 411 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
| 412 | tcp_enable_early_retrans(newtp); | 412 | tcp_enable_early_retrans(newtp); |
| 413 | newtp->tlp_high_seq = 0; | 413 | newtp->tlp_high_seq = 0; |
| 414 | newtp->lsndtime = treq->snt_synack; | ||
| 415 | newtp->total_retrans = req->num_retrans; | ||
| 414 | 416 | ||
| 415 | /* So many TCP implementations out there (incorrectly) count the | 417 | /* So many TCP implementations out there (incorrectly) count the |
| 416 | * initial SYN frame in their delayed-ACK and congestion control | 418 | * initial SYN frame in their delayed-ACK and congestion control |
| @@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
| 666 | if (!(flg & TCP_FLAG_ACK)) | 668 | if (!(flg & TCP_FLAG_ACK)) |
| 667 | return NULL; | 669 | return NULL; |
| 668 | 670 | ||
| 669 | /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ | ||
| 670 | if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) | ||
| 671 | tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; | ||
| 672 | else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */ | ||
| 673 | tcp_rsk(req)->snt_synack = 0; | ||
| 674 | |||
| 675 | /* For Fast Open no more processing is needed (sk is the | 671 | /* For Fast Open no more processing is needed (sk is the |
| 676 | * child socket). | 672 | * child socket). |
| 677 | */ | 673 | */ |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 170737a9d56d..7c83cb8bf137 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
| @@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; | |||
| 65 | /* By default, RFC2861 behavior. */ | 65 | /* By default, RFC2861 behavior. */ |
| 66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | 66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
| 67 | 67 | ||
| 68 | unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; | ||
| 69 | EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); | ||
| 70 | |||
| 68 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | 71 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
| 69 | int push_one, gfp_t gfp); | 72 | int push_one, gfp_t gfp); |
| 70 | 73 | ||
| @@ -1628,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
| 1628 | 1631 | ||
| 1629 | /* If a full-sized TSO skb can be sent, do it. */ | 1632 | /* If a full-sized TSO skb can be sent, do it. */ |
| 1630 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, | 1633 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, |
| 1631 | sk->sk_gso_max_segs * tp->mss_cache)) | 1634 | tp->xmit_size_goal_segs * tp->mss_cache)) |
| 1632 | goto send_now; | 1635 | goto send_now; |
| 1633 | 1636 | ||
| 1634 | /* Middle in queue won't get any more data, full sendable already? */ | 1637 | /* Middle in queue won't get any more data, full sendable already? */ |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index d4943f67aff2..611beab38a00 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c | |||
| @@ -46,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096; | |||
| 46 | MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); | 46 | MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); |
| 47 | module_param(bufsize, uint, 0); | 47 | module_param(bufsize, uint, 0); |
| 48 | 48 | ||
| 49 | static unsigned int fwmark __read_mostly = 0; | ||
| 50 | MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); | ||
| 51 | module_param(fwmark, uint, 0); | ||
| 52 | |||
| 49 | static int full __read_mostly; | 53 | static int full __read_mostly; |
| 50 | MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); | 54 | MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); |
| 51 | module_param(full, int, 0); | 55 | module_param(full, int, 0); |
| @@ -54,12 +58,16 @@ static const char procname[] = "tcpprobe"; | |||
| 54 | 58 | ||
| 55 | struct tcp_log { | 59 | struct tcp_log { |
| 56 | ktime_t tstamp; | 60 | ktime_t tstamp; |
| 57 | __be32 saddr, daddr; | 61 | union { |
| 58 | __be16 sport, dport; | 62 | struct sockaddr raw; |
| 63 | struct sockaddr_in v4; | ||
| 64 | struct sockaddr_in6 v6; | ||
| 65 | } src, dst; | ||
| 59 | u16 length; | 66 | u16 length; |
| 60 | u32 snd_nxt; | 67 | u32 snd_nxt; |
| 61 | u32 snd_una; | 68 | u32 snd_una; |
| 62 | u32 snd_wnd; | 69 | u32 snd_wnd; |
| 70 | u32 rcv_wnd; | ||
| 63 | u32 snd_cwnd; | 71 | u32 snd_cwnd; |
| 64 | u32 ssthresh; | 72 | u32 ssthresh; |
| 65 | u32 srtt; | 73 | u32 srtt; |
| @@ -86,19 +94,45 @@ static inline int tcp_probe_avail(void) | |||
| 86 | return bufsize - tcp_probe_used() - 1; | 94 | return bufsize - tcp_probe_used() - 1; |
| 87 | } | 95 | } |
| 88 | 96 | ||
| 97 | #define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ | ||
| 98 | do { \ | ||
| 99 | si4.sin_family = AF_INET; \ | ||
| 100 | si4.sin_port = inet->inet_##mem##port; \ | ||
| 101 | si4.sin_addr.s_addr = inet->inet_##mem##addr; \ | ||
| 102 | } while (0) \ | ||
| 103 | |||
| 104 | #if IS_ENABLED(CONFIG_IPV6) | ||
| 105 | #define tcp_probe_copy_fl_to_si6(inet, si6, mem) \ | ||
| 106 | do { \ | ||
| 107 | struct ipv6_pinfo *pi6 = inet->pinet6; \ | ||
| 108 | si6.sin6_family = AF_INET6; \ | ||
| 109 | si6.sin6_port = inet->inet_##mem##port; \ | ||
| 110 | si6.sin6_addr = pi6->mem##addr; \ | ||
| 111 | si6.sin6_flowinfo = 0; /* No need here. */ \ | ||
| 112 | si6.sin6_scope_id = 0; /* No need here. */ \ | ||
| 113 | } while (0) | ||
| 114 | #else | ||
| 115 | #define tcp_probe_copy_fl_to_si6(fl, si6, mem) \ | ||
| 116 | do { \ | ||
| 117 | memset(&si6, 0, sizeof(si6)); \ | ||
| 118 | } while (0) | ||
| 119 | #endif | ||
| 120 | |||
| 89 | /* | 121 | /* |
| 90 | * Hook inserted to be called before each receive packet. | 122 | * Hook inserted to be called before each receive packet. |
| 91 | * Note: arguments must match tcp_rcv_established()! | 123 | * Note: arguments must match tcp_rcv_established()! |
| 92 | */ | 124 | */ |
| 93 | static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | 125 | static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, |
| 94 | struct tcphdr *th, unsigned int len) | 126 | const struct tcphdr *th, unsigned int len) |
| 95 | { | 127 | { |
| 96 | const struct tcp_sock *tp = tcp_sk(sk); | 128 | const struct tcp_sock *tp = tcp_sk(sk); |
| 97 | const struct inet_sock *inet = inet_sk(sk); | 129 | const struct inet_sock *inet = inet_sk(sk); |
| 98 | 130 | ||
| 99 | /* Only update if port matches */ | 131 | /* Only update if port or skb mark matches */ |
| 100 | if ((port == 0 || ntohs(inet->inet_dport) == port || | 132 | if (((port == 0 && fwmark == 0) || |
| 101 | ntohs(inet->inet_sport) == port) && | 133 | ntohs(inet->inet_dport) == port || |
| 134 | ntohs(inet->inet_sport) == port || | ||
| 135 | (fwmark > 0 && skb->mark == fwmark)) && | ||
| 102 | (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { | 136 | (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { |
| 103 | 137 | ||
| 104 | spin_lock(&tcp_probe.lock); | 138 | spin_lock(&tcp_probe.lock); |
| @@ -107,15 +141,25 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 107 | struct tcp_log *p = tcp_probe.log + tcp_probe.head; | 141 | struct tcp_log *p = tcp_probe.log + tcp_probe.head; |
| 108 | 142 | ||
| 109 | p->tstamp = ktime_get(); | 143 | p->tstamp = ktime_get(); |
| 110 | p->saddr = inet->inet_saddr; | 144 | switch (sk->sk_family) { |
| 111 | p->sport = inet->inet_sport; | 145 | case AF_INET: |
| 112 | p->daddr = inet->inet_daddr; | 146 | tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); |
| 113 | p->dport = inet->inet_dport; | 147 | tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); |
| 148 | break; | ||
| 149 | case AF_INET6: | ||
| 150 | tcp_probe_copy_fl_to_si6(inet, p->src.v6, s); | ||
| 151 | tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d); | ||
| 152 | break; | ||
| 153 | default: | ||
| 154 | BUG(); | ||
| 155 | } | ||
| 156 | |||
| 114 | p->length = skb->len; | 157 | p->length = skb->len; |
| 115 | p->snd_nxt = tp->snd_nxt; | 158 | p->snd_nxt = tp->snd_nxt; |
| 116 | p->snd_una = tp->snd_una; | 159 | p->snd_una = tp->snd_una; |
| 117 | p->snd_cwnd = tp->snd_cwnd; | 160 | p->snd_cwnd = tp->snd_cwnd; |
| 118 | p->snd_wnd = tp->snd_wnd; | 161 | p->snd_wnd = tp->snd_wnd; |
| 162 | p->rcv_wnd = tp->rcv_wnd; | ||
| 119 | p->ssthresh = tcp_current_ssthresh(sk); | 163 | p->ssthresh = tcp_current_ssthresh(sk); |
| 120 | p->srtt = tp->srtt >> 3; | 164 | p->srtt = tp->srtt >> 3; |
| 121 | 165 | ||
| @@ -128,7 +172,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 128 | } | 172 | } |
| 129 | 173 | ||
| 130 | jprobe_return(); | 174 | jprobe_return(); |
| 131 | return 0; | ||
| 132 | } | 175 | } |
| 133 | 176 | ||
| 134 | static struct jprobe tcp_jprobe = { | 177 | static struct jprobe tcp_jprobe = { |
| @@ -157,13 +200,11 @@ static int tcpprobe_sprint(char *tbuf, int n) | |||
| 157 | = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); | 200 | = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); |
| 158 | 201 | ||
| 159 | return scnprintf(tbuf, n, | 202 | return scnprintf(tbuf, n, |
| 160 | "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", | 203 | "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", |
| 161 | (unsigned long) tv.tv_sec, | 204 | (unsigned long) tv.tv_sec, |
| 162 | (unsigned long) tv.tv_nsec, | 205 | (unsigned long) tv.tv_nsec, |
| 163 | &p->saddr, ntohs(p->sport), | 206 | &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, |
| 164 | &p->daddr, ntohs(p->dport), | 207 | p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); |
| 165 | p->length, p->snd_nxt, p->snd_una, | ||
| 166 | p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); | ||
| 167 | } | 208 | } |
| 168 | 209 | ||
| 169 | static ssize_t tcpprobe_read(struct file *file, char __user *buf, | 210 | static ssize_t tcpprobe_read(struct file *file, char __user *buf, |
| @@ -176,7 +217,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, | |||
| 176 | return -EINVAL; | 217 | return -EINVAL; |
| 177 | 218 | ||
| 178 | while (cnt < len) { | 219 | while (cnt < len) { |
| 179 | char tbuf[164]; | 220 | char tbuf[256]; |
| 180 | int width; | 221 | int width; |
| 181 | 222 | ||
| 182 | /* Wait for data in buffer */ | 223 | /* Wait for data in buffer */ |
| @@ -223,6 +264,13 @@ static __init int tcpprobe_init(void) | |||
| 223 | { | 264 | { |
| 224 | int ret = -ENOMEM; | 265 | int ret = -ENOMEM; |
| 225 | 266 | ||
| 267 | /* Warning: if the function signature of tcp_rcv_established, | ||
| 268 | * has been changed, you also have to change the signature of | ||
| 269 | * jtcp_rcv_established, otherwise you end up right here! | ||
| 270 | */ | ||
| 271 | BUILD_BUG_ON(__same_type(tcp_rcv_established, | ||
| 272 | jtcp_rcv_established) == 0); | ||
| 273 | |||
| 226 | init_waitqueue_head(&tcp_probe.wait); | 274 | init_waitqueue_head(&tcp_probe.wait); |
| 227 | spin_lock_init(&tcp_probe.lock); | 275 | spin_lock_init(&tcp_probe.lock); |
| 228 | 276 | ||
| @@ -241,7 +289,8 @@ static __init int tcpprobe_init(void) | |||
| 241 | if (ret) | 289 | if (ret) |
| 242 | goto err1; | 290 | goto err1; |
| 243 | 291 | ||
| 244 | pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); | 292 | pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", |
| 293 | port, fwmark, bufsize); | ||
| 245 | return 0; | 294 | return 0; |
| 246 | err1: | 295 | err1: |
| 247 | remove_proc_entry(procname, init_net.proc_net); | 296 | remove_proc_entry(procname, init_net.proc_net); |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 766e6bab9113..74d2c95db57f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
| @@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames); | |||
| 704 | * @src: source IP address | 704 | * @src: source IP address |
| 705 | * @dst: destination IP address | 705 | * @dst: destination IP address |
| 706 | */ | 706 | */ |
| 707 | static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) | 707 | void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) |
| 708 | { | 708 | { |
| 709 | struct udphdr *uh = udp_hdr(skb); | 709 | struct udphdr *uh = udp_hdr(skb); |
| 710 | struct sk_buff *frags = skb_shinfo(skb)->frag_list; | 710 | struct sk_buff *frags = skb_shinfo(skb)->frag_list; |
| @@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) | |||
| 740 | uh->check = CSUM_MANGLED_0; | 740 | uh->check = CSUM_MANGLED_0; |
| 741 | } | 741 | } |
| 742 | } | 742 | } |
| 743 | EXPORT_SYMBOL_GPL(udp4_hwcsum); | ||
| 743 | 744 | ||
| 744 | static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) | 745 | static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) |
| 745 | { | 746 | { |
| @@ -2158,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, | |||
| 2158 | __u16 srcp = ntohs(inet->inet_sport); | 2159 | __u16 srcp = ntohs(inet->inet_sport); |
| 2159 | 2160 | ||
| 2160 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" | 2161 | seq_printf(f, "%5d: %08X:%04X %08X:%04X" |
| 2161 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", | 2162 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", |
| 2162 | bucket, src, srcp, dest, destp, sp->sk_state, | 2163 | bucket, src, srcp, dest, destp, sp->sk_state, |
| 2163 | sk_wmem_alloc_get(sp), | 2164 | sk_wmem_alloc_get(sp), |
| 2164 | sk_rmem_alloc_get(sp), | 2165 | sk_rmem_alloc_get(sp), |
| @@ -2336,7 +2337,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | |||
| 2336 | uh->len = htons(skb->len - udp_offset); | 2337 | uh->len = htons(skb->len - udp_offset); |
| 2337 | 2338 | ||
| 2338 | /* csum segment if tunnel sets skb with csum. */ | 2339 | /* csum segment if tunnel sets skb with csum. */ |
| 2339 | if (unlikely(uh->check)) { | 2340 | if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) { |
| 2340 | struct iphdr *iph = ip_hdr(skb); | 2341 | struct iphdr *iph = ip_hdr(skb); |
| 2341 | 2342 | ||
| 2342 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, | 2343 | uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, |
| @@ -2347,7 +2348,18 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, | |||
| 2347 | if (uh->check == 0) | 2348 | if (uh->check == 0) |
| 2348 | uh->check = CSUM_MANGLED_0; | 2349 | uh->check = CSUM_MANGLED_0; |
| 2349 | 2350 | ||
| 2351 | } else if (protocol == htons(ETH_P_IPV6)) { | ||
| 2352 | struct ipv6hdr *ipv6h = ipv6_hdr(skb); | ||
| 2353 | u32 len = skb->len - udp_offset; | ||
| 2354 | |||
| 2355 | uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, | ||
| 2356 | len, IPPROTO_UDP, 0); | ||
| 2357 | uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0)); | ||
| 2358 | if (uh->check == 0) | ||
| 2359 | uh->check = CSUM_MANGLED_0; | ||
| 2360 | skb->ip_summed = CHECKSUM_NONE; | ||
| 2350 | } | 2361 | } |
| 2362 | |||
| 2351 | skb->protocol = protocol; | 2363 | skb->protocol = protocol; |
| 2352 | } while ((skb = skb->next)); | 2364 | } while ((skb = skb->next)); |
| 2353 | out: | 2365 | out: |
