Diffstat (limited to 'net/ipv4')
48 files changed, 1667 insertions, 1159 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..5a23e8b37106 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
	 ip_output.o ip_sockglue.o inet_hashtables.o \
	 inet_timewait_sock.o inet_connection_sock.o \
	 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	 tcp_minisocks.o tcp_cong.o \
+	 tcp_minisocks.o tcp_cong.o tcp_metrics.o \
	 datagram.o raw.o udp.o udplite.o \
	 arp.o icmp.o devinet.o af_inet.o igmp.o \
	 fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c8f7aee587d1..07a02f6e9696 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)

 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -242,20 +243,18 @@ void build_ehash_secret(void)
 }
 EXPORT_SYMBOL(build_ehash_secret);

-static inline int inet_netns_ok(struct net *net, int protocol)
+static inline int inet_netns_ok(struct net *net, __u8 protocol)
 {
-	int hash;
 	const struct net_protocol *ipprot;

 	if (net_eq(net, &init_net))
 		return 1;

-	hash = protocol & (MAX_INET_PROTOS - 1);
-	ipprot = rcu_dereference(inet_protos[hash]);
-
-	if (ipprot == NULL)
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot == NULL) {
 		/* raw IP is OK */
 		return 1;
+	}
 	return ipprot->netns_ok;
 }

@@ -553,7 +552,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,

 	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
-	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+	return sk->sk_prot->connect(sk, uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_dgram_connect);

@@ -1216,8 +1215,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);

 static int inet_gso_send_check(struct sk_buff *skb)
 {
-	const struct iphdr *iph;
 	const struct net_protocol *ops;
+	const struct iphdr *iph;
 	int proto;
 	int ihl;
 	int err = -EINVAL;
@@ -1236,7 +1235,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	err = -EPROTONOSUPPORT;

 	rcu_read_lock();
@@ -1253,8 +1252,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	struct iphdr *iph;
 	const struct net_protocol *ops;
+	struct iphdr *iph;
 	int proto;
 	int ihl;
 	int id;
@@ -1286,7 +1285,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
 	id = ntohs(iph->id);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	segs = ERR_PTR(-EPROTONOSUPPORT);

 	rcu_read_lock();
@@ -1340,7 +1339,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 		goto out;
 	}

-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;

 	rcu_read_lock();
 	ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1397,11 @@ out:

 static int inet_gro_complete(struct sk_buff *skb)
 {
-	const struct net_protocol *ops;
+	__be16 newlen = htons(skb->len - skb_network_offset(skb));
 	struct iphdr *iph = ip_hdr(skb);
-	int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	const struct net_protocol *ops;
+	int proto = iph->protocol;
 	int err = -ENOSYS;
-	__be16 newlen = htons(skb->len - skb_network_offset(skb));

 	csum_replace2(&iph->check, iph->tot_len, newlen);
 	iph->tot_len = newlen;
@@ -1520,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
 #endif

 static const struct net_protocol tcp_protocol = {
-	.handler = tcp_v4_rcv,
-	.err_handler = tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment = tcp_tso_segment,
-	.gro_receive = tcp4_gro_receive,
-	.gro_complete = tcp4_gro_complete,
-	.no_policy = 1,
-	.netns_ok = 1,
+	.early_demux = tcp_v4_early_demux,
+	.handler = tcp_v4_rcv,
+	.err_handler = tcp_v4_err,
+	.gso_send_check = tcp_v4_gso_send_check,
+	.gso_segment = tcp_tso_segment,
+	.gro_receive = tcp4_gro_receive,
+	.gro_complete = tcp4_gro_complete,
+	.no_policy = 1,
+	.netns_ok = 1,
 };

 static const struct net_protocol udp_protocol = {
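Across the af_inet.c hunks above, every `proto & (MAX_INET_PROTOS - 1)` mask disappears and inet_netns_ok() starts taking a __u8. A minimal sketch of the dispatch pattern this converges on, assuming (not shown in this excerpt) that MAX_INET_PROTOS is raised to 256 elsewhere in the series, so every 8-bit protocol number is a valid index:

    /* Sketch only, not kernel source: with MAX_INET_PROTOS == 256 a __u8
     * protocol value can never index out of bounds, so the masking that
     * used to fold the protocol space into a smaller table is a no-op.
     */
    #define MAX_INET_PROTOS 256

    static const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];

    static int inet_dispatch_sketch(struct sk_buff *skb, __u8 protocol)
    {
            const struct net_protocol *ipprot;
            int ret = -EPROTONOSUPPORT;

            rcu_read_lock();
            ipprot = rcu_dereference(inet_protos[protocol]); /* no masking */
            if (ipprot)
                    ret = ipprot->handler(skb);
            rcu_read_unlock();
            return ret;
    }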
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e8f2617ecd47..916d5ecaf6c6 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -408,6 +408,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 		return;
 	pr_debug("pmtu discovery on SA AH/%08x/%08x\n",
 		 ntohl(ah->spi), ntohl(iph->daddr));
+	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
 }

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index cda37be02f8d..2e560f0c757d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -790,7 +790,8 @@ static int arp_process(struct sk_buff *skb)
 	 * Check for bad requests for 127.x.x.x and requests for multicast
 	 * addresses.  If this is one such, delete it.
 	 */
-	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+	if (ipv4_is_multicast(tip) ||
+	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
 		goto out;

 	/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 10e15a144e95..44bf82e3aef7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1500,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write,

 		if (cnf == net->ipv4.devconf_dflt)
 			devinet_copy_dflt_conf(net, i);
-		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1)
+		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
 			if ((new_value == 0) && (old_value != 0))
 				rt_cache_flush(net, 0);
 	}
@@ -1617,6 +1618,8 @@ static struct devinet_sysctl_table {
 					      "force_igmp_version"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
 					      "promote_secondaries"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+					      "route_localnet"),
 	},
 };

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index cb982a61536f..7b95b49a36ce 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -494,6 +494,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 		return;
 	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
 		 ntohl(esph->spi), ntohl(iph->daddr));
+	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
 }

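The ah4.c and esp4.c hunks give both IPsec error handlers the same one-line addition: on a PMTU-related ICMP error they now push the reported MTU into the routing layer themselves via ipv4_update_pmtu(). A hedged sketch of that shape, taking only the argument order (skb, net, mtu, oif, mark, protocol, flow_flags) from the calls above; the surrounding checks are illustrative, not copied from the patch:

    /* Sketch of an err_handler following the pattern above. */
    static void proto_err_sketch(struct sk_buff *skb, u32 info)
    {
            struct net *net = dev_net(skb->dev);

            if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
                icmp_hdr(skb)->code == ICMP_FRAG_NEEDED)
                    /* info carries the next-hop MTU reported by ICMP */
                    ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
    }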
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411fa37c..81f85716a894 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -31,6 +31,7 @@
 #include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
+#include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	tb = fib_trie_table(id);
 	if (!tb)
 		return NULL;
+
+	switch (id) {
+	case RT_TABLE_LOCAL:
+		net->ipv4.fib_local = tb;
+		break;
+
+	case RT_TABLE_MAIN:
+		net->ipv4.fib_main = tb;
+		break;
+
+	case RT_TABLE_DEFAULT:
+		net->ipv4.fib_default = tb;
+		break;
+
+	default:
+		break;
+	}
+
 	h = id & (FIB_TABLE_HASHSZ - 1);
 	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 	return tb;
@@ -180,6 +199,43 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 }
 EXPORT_SYMBOL(inet_dev_addr_type);

+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct fib_result res;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	struct net *net;
+	int scope;
+
+	rt = skb_rtable(skb);
+	if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
+		return ip_hdr(skb)->daddr;
+
+	in_dev = __in_dev_get_rcu(dev);
+	BUG_ON(!in_dev);
+
+	net = dev_net(dev);
+
+	scope = RT_SCOPE_UNIVERSE;
+	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+		fl4.flowi4_oif = 0;
+		fl4.flowi4_iif = net->loopback_dev->ifindex;
+		fl4.daddr = ip_hdr(skb)->saddr;
+		fl4.saddr = 0;
+		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+		fl4.flowi4_scope = scope;
+		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+		if (!fib_lookup(net, &fl4, &res))
+			return FIB_RES_PREFSRC(net, res);
+	} else {
+		scope = RT_SCOPE_LINK;
+	}
+
+	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
@@ -188,17 +244,15 @@ EXPORT_SYMBOL(inet_dev_addr_type);
  * - check, that packet arrived from expected physical interface.
  * called with rcu_read_lock()
  */
-int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
-			int oif, struct net_device *dev, __be32 *spec_dst,
-			u32 *itag)
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+				 u8 tos, int oif, struct net_device *dev,
+				 int rpf, struct in_device *idev, u32 *itag)
 {
-	struct in_device *in_dev;
-	struct flowi4 fl4;
+	int ret, no_addr, accept_local;
 	struct fib_result res;
-	int no_addr, rpf, accept_local;
-	bool dev_match;
-	int ret;
+	struct flowi4 fl4;
 	struct net *net;
+	bool dev_match;

 	fl4.flowi4_oif = 0;
 	fl4.flowi4_iif = oif;
@@ -207,20 +261,11 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
 	fl4.flowi4_tos = tos;
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;

-	no_addr = rpf = accept_local = 0;
-	in_dev = __in_dev_get_rcu(dev);
-	if (in_dev) {
-		no_addr = in_dev->ifa_list == NULL;
-
-		/* Ignore rp_filter for packets protected by IPsec. */
-		rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
-
-		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
-		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
-	}
-
-	if (in_dev == NULL)
-		goto e_inval;
+	no_addr = accept_local = 0;
+	no_addr = idev->ifa_list == NULL;
+
+	accept_local = IN_DEV_ACCEPT_LOCAL(idev);
+	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;

 	net = dev_net(dev);
 	if (fib_lookup(net, &fl4, &res))
@@ -229,7 +274,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
 		if (res.type != RTN_LOCAL || !accept_local)
 			goto e_inval;
 	}
-	*spec_dst = FIB_RES_PREFSRC(net, res);
 	fib_combine_itag(itag, &res);
 	dev_match = false;

@@ -258,17 +302,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,

 	ret = 0;
 	if (fib_lookup(net, &fl4, &res) == 0) {
-		if (res.type == RTN_UNICAST) {
-			*spec_dst = FIB_RES_PREFSRC(net, res);
+		if (res.type == RTN_UNICAST)
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
-		}
 	}
 	return ret;

 last_resort:
 	if (rpf)
 		goto e_rpf;
-	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	*itag = 0;
 	return 0;

@@ -278,6 +319,20 @@ e_rpf:
 	return -EXDEV;
 }

+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+			u8 tos, int oif, struct net_device *dev,
+			struct in_device *idev, u32 *itag)
+{
+	int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+	if (!r && !fib_num_tclassid_users(dev_net(dev))) {
+		*itag = 0;
+		return 0;
+	}
+	return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+}
+
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
 {
 	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -935,8 +990,11 @@ static void nl_fib_input(struct sk_buff *skb)
 static int __net_init nl_fib_lookup_init(struct net *net)
 {
 	struct sock *sk;
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
-				   nl_fib_input, NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input = nl_fib_input,
+	};
+
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
@@ -1090,6 +1148,9 @@ static int __net_init fib_net_init(struct net *net)
 {
 	int error;

+#ifdef CONFIG_IP_ROUTE_CLASSID
+	net->ipv4.fib_num_tclassid_users = 0;
+#endif
 	error = ip_fib_net_init(net);
 	if (error < 0)
 		goto out;
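Two additions carry the weight of this file's changes: fib_compute_spec_dst() recomputes on demand what used to be cached per-route as spec_dst (for unicast it is simply the packet's destination; for broadcast/multicast it asks the FIB which source address this host would use toward the sender), and fib_validate_source() becomes a thin wrapper that skips the full __fib_validate_source() lookup when nothing needs it. A condensed restatement of that fast path, using the names from the hunks above (not a drop-in copy):

    int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                            u8 tos, int oif, struct net_device *dev,
                            struct in_device *idev, u32 *itag)
    {
            /* rp_filter is ignored for packets protected by IPsec */
            int rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);

            if (!rpf && !fib_num_tclassid_users(dev_net(dev))) {
                    *itag = 0;      /* no rp_filter, no tclassid users: accept */
                    return 0;       /* cheap path, no FIB lookup at all */
            }
            return __fib_validate_source(skb, src, dst, tos, oif, dev,
                                         rpf, idev, itag);
    }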
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2d043f71ef70..c06da93b0b70 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -54,7 +54,7 @@ u32 fib_rules_tclass(const struct fib_result *res)
 }
 #endif

-int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_lookup_arg arg = {
 		.result = res,
@@ -67,7 +67,7 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)

 	return err;
 }
-EXPORT_SYMBOL_GPL(fib_lookup);
+EXPORT_SYMBOL_GPL(__fib_lookup);

 static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
@@ -169,8 +169,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 		rule4->dst = nla_get_be32(tb[FRA_DST]);

 #ifdef CONFIG_IP_ROUTE_CLASSID
-	if (tb[FRA_FLOW])
+	if (tb[FRA_FLOW]) {
 		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+		if (rule4->tclassid)
+			net->ipv4.fib_num_tclassid_users++;
+	}
 #endif

 	rule4->src_len = frh->src_len;
@@ -179,11 +182,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	rule4->dstmask = inet_make_mask(rule4->dst_len);
 	rule4->tos = frh->tos;

+	net->ipv4.fib_has_custom_rules = true;
 	err = 0;
 errout:
 	return err;
 }

+static void fib4_rule_delete(struct fib_rule *rule)
+{
+	struct net *net = rule->fr_net;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+	if (rule4->tclassid)
+		net->ipv4.fib_num_tclassid_users--;
+#endif
+	net->ipv4.fib_has_custom_rules = true;
+}
+
 static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 			     struct nlattr **tb)
 {
@@ -256,6 +272,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
 	.action = fib4_rule_action,
 	.match = fib4_rule_match,
 	.configure = fib4_rule_configure,
+	.delete = fib4_rule_delete,
 	.compare = fib4_rule_compare,
 	.fill = fib4_rule_fill,
 	.default_pref = fib_default_rule_pref,
@@ -295,6 +312,7 @@ int __net_init fib4_rules_init(struct net *net)
 	if (err < 0)
 		goto fail;
 	net->ipv4.rules_ops = ops;
+	net->ipv4.fib_has_custom_rules = false;
 	return 0;

 fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e5b7182fa099..d71bfbdc0bf4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -163,6 +163,12 @@ void free_fib_info(struct fib_info *fi)
 		return;
 	}
 	fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	change_nexthops(fi) {
+		if (nexthop_nh->nh_tclassid)
+			fi->fib_net->ipv4.fib_num_tclassid_users--;
+	} endfor_nexthops(fi);
+#endif
 	call_rcu(&fi->rcu, free_fib_info_rcu);
 }

@@ -421,6 +427,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+			if (nexthop_nh->nh_tclassid)
+				fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
 		}

@@ -779,9 +787,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			int type = nla_type(nla);

 			if (type) {
+				u32 val;
+
 				if (type > RTAX_MAX)
 					goto err_inval;
-				fi->fib_metrics[type - 1] = nla_get_u32(nla);
+				val = nla_get_u32(nla);
+				if (type == RTAX_ADVMSS && val > 65535 - 40)
+					val = 65535 - 40;
+				if (type == RTAX_MTU && val > 65535 - 15)
+					val = 65535 - 15;
+				fi->fib_metrics[type - 1] = val;
 			}
 		}
 	}
@@ -810,6 +825,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	nh->nh_flags = cfg->fc_flags;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	nh->nh_tclassid = cfg->fc_flow;
+	if (nh->nh_tclassid)
+		fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	nh->nh_weight = 1;
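The new clamps in fib_create_info() bound user-supplied metrics by IPv4 arithmetic: tot_len is a 16-bit field and a TCP MSS rides on top of at least 40 bytes of IP plus TCP headers, so RTAX_ADVMSS is capped at 65535 - 40; RTAX_MTU gets a similar cap at 65535 - 15 to leave headroom in 16-bit length math. The same check as a standalone helper (the helper name is illustrative, not from the patch):

    /* Illustrative equivalent of the inline clamps added above. */
    static u32 fib_metric_clamp_sketch(int type, u32 val)
    {
            if (type == RTAX_ADVMSS && val > 65535 - 40)
                    val = 65535 - 40;   /* 16-bit packet minus IP+TCP headers */
            if (type == RTAX_MTU && val > 65535 - 15)
                    val = 65535 - 15;
            return val;
    }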
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 30b88d7b4bd6..9b0f25930fbc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
-		tn = (struct tnode *) resize(t, (struct tnode *)tn);
+		tn = (struct tnode *)resize(t, tn);

-		tnode_put_child_reorg((struct tnode *)tp, cindex,
+		tnode_put_child_reorg(tp, cindex,
 				      (struct rt_trie_node *)tn, wasfull);

 		tp = node_parent((struct rt_trie_node *) tn);
@@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)

 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
-		tn = (struct tnode *)resize(t, (struct tnode *)tn);
+		tn = (struct tnode *)resize(t, tn);

 	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 	tnode_free_flush();
@@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		node_set_parent((struct rt_trie_node *)l, tp);

 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
+		put_child(t, tp, cindex, (struct rt_trie_node *)l);
 	} else {
 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
 		/*
@@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)

 		if (tp) {
 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(t, (struct tnode *)tp, cindex,
-				  (struct rt_trie_node *)tn);
+			put_child(t, tp, cindex, (struct rt_trie_node *)tn);
 		} else {
 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 			tp = tn;
@@ -1620,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)

 	if (tp) {
 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
-		put_child(t, (struct tnode *)tp, cindex, NULL);
+		put_child(t, tp, cindex, NULL);
 		trie_rebalance(t, tp);
 	} else
 		RCU_INIT_POINTER(t->trie, NULL);
@@ -1844,6 +1843,8 @@ int fib_table_flush(struct fib_table *tb)
 	if (ll && hlist_empty(&ll->list))
 		trie_leaf_remove(t, ll);

+	inetpeer_invalidate_tree(&tb->tb_peers);
+
 	pr_debug("trie_flush found=%d\n", found);
 	return found;
 }
@@ -1992,6 +1993,7 @@ struct fib_table *fib_trie_table(u32 id)
 	tb->tb_id = id;
 	tb->tb_default = -1;
 	tb->tb_num_default = 0;
+	inet_peer_base_init(&tb->tb_peers);

 	t = (struct trie *) tb->tb_data;
 	memset(t, 0, sizeof(*t));
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c75efbdc71cb..4a049449305f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
 #include <net/checksum.h>
 #include <net/xfrm.h>
 #include <net/inet_common.h>
+#include <net/ip_fib.h>

 /*
  *	Build xmit assembly blocks
@@ -253,10 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,

 	/* Limit if icmp type is enabled in ratemask. */
 	if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
-		if (!rt->peer)
-			rt_bind_peer(rt, fl4->daddr, 1);
-		rc = inet_peer_xrlim_allow(rt->peer,
+		struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+		rc = inet_peer_xrlim_allow(peer,
 					   net->ipv4.sysctl_icmp_ratelimit);
+		inet_putpeer(peer);
 	}
 out:
 	return rc;
@@ -334,7 +335,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct flowi4 fl4;
 	struct sock *sk;
 	struct inet_sock *inet;
-	__be32 daddr;
+	__be32 daddr, saddr;

 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -348,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)

 	inet->tos = ip_hdr(skb)->tos;
 	daddr = ipc.addr = ip_hdr(skb)->saddr;
+	saddr = fib_compute_spec_dst(skb);
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
 	if (icmp_param->replyopts.opt.opt.optlen) {
@@ -357,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	}
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = daddr;
-	fl4.saddr = rt->rt_spec_dst;
+	fl4.saddr = saddr;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -638,12 +640,12 @@ EXPORT_SYMBOL(icmp_send);

 static void icmp_unreach(struct sk_buff *skb)
 {
+	const struct net_protocol *ipprot;
 	const struct iphdr *iph;
 	struct icmphdr *icmph;
-	int hash, protocol;
-	const struct net_protocol *ipprot;
-	u32 info = 0;
 	struct net *net;
+	u32 info = 0;
+	int protocol;

 	net = dev_net(skb_dst(skb)->dev);

@@ -674,9 +676,7 @@ static void icmp_unreach(struct sk_buff *skb)
 			LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
 				       &iph->daddr);
 			} else {
-				info = ip_rt_frag_needed(net, iph,
-							 ntohs(icmph->un.frag.mtu),
-							 skb->dev);
+				info = ntohs(icmph->un.frag.mtu);
 				if (!info)
 					goto out;
 			}
@@ -734,9 +734,8 @@ static void icmp_unreach(struct sk_buff *skb)
 	 */
 	raw_icmp_error(skb, protocol, info);

-	hash = protocol & (MAX_INET_PROTOS - 1);
 	rcu_read_lock();
-	ipprot = rcu_dereference(inet_protos[hash]);
+	ipprot = rcu_dereference(inet_protos[protocol]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, info);
 	rcu_read_unlock();
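The rate-limiter hunk drops the peer pointer that used to be cached on the rtable (rt->peer, populated by rt_bind_peer()) in favor of a lookup-use-release sequence against the per-netns peer base. The shape of that sequence, mirroring the hunk above; inet_getpeer_v4() with create=1 is assumed to return a referenced peer, released right away by inet_putpeer():

    /* Sketch of the lookup/allow/release pattern in icmpv4_xrlim_allow(). */
    static bool icmp_rate_allow_sketch(struct net *net, __be32 daddr)
    {
            struct inet_peer *peer;
            bool rc;

            peer = inet_getpeer_v4(net->ipv4.peers, daddr, 1);
            rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
            inet_putpeer(peer);     /* no reference parked in the route */
            return rc;
    }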
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f9ee7417f6a0..76825be3b643 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,17 +368,21 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

 struct dst_entry *inet_csk_route_req(struct sock *sk,
 				     struct flowi4 *fl4,
-				     const struct request_sock *req)
+				     const struct request_sock *req,
+				     bool nocache)
 {
 	struct rtable *rt;
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct ip_options_rcu *opt = inet_rsk(req)->opt;
 	struct net *net = sock_net(sk);
+	int flags = inet_sk_flowi_flags(sk);

+	if (nocache)
+		flags |= FLOWI_FLAG_RT_NOCACHE;
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol,
-			   inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS,
+			   flags,
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
 			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 46d1e7199a8c..38064a285cca 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -46,9 +46,6 @@ struct inet_diag_entry {
 	u16 userlocks;
 };

-#define INET_DIAG_PUT(skb, attrtype, attrlen) \
-	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
-
 static DEFINE_MUTEX(inet_diag_table_mutex);

 static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
@@ -78,24 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	const struct inet_sock *inet = inet_sk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
+	struct nlattr *attr;
 	void *info = NULL;
-	struct inet_diag_meminfo *minfo = NULL;
-	unsigned char *b = skb_tail_pointer(skb);
 	const struct inet_diag_handler *handler;
 	int ext = req->idiag_ext;

 	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);

-	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
-	nlh->nlmsg_flags = nlmsg_flags;
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+			nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;

-	r = NLMSG_DATA(nlh);
+	r = nlmsg_data(nlh);
 	BUG_ON(sk->sk_state == TCP_TIME_WAIT);

-	if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
-		minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
-
 	r->idiag_family = sk->sk_family;
 	r->idiag_state = sk->sk_state;
 	r->idiag_timer = 0;
@@ -113,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	 * hence this needs to be included regardless of socket family.
 	 */
 	if (ext & (1 << (INET_DIAG_TOS - 1)))
-		RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos);
+		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+			goto errout;

 #if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
@@ -121,24 +117,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,

 		*(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
 		*(struct in6_addr *)r->id.idiag_dst = np->daddr;
+
 		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
-			RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass);
+			if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
+				goto errout;
 	}
 #endif

 	r->idiag_uid = sock_i_uid(sk);
 	r->idiag_inode = sock_i_ino(sk);

-	if (minfo) {
-		minfo->idiag_rmem = sk_rmem_alloc_get(sk);
-		minfo->idiag_wmem = sk->sk_wmem_queued;
-		minfo->idiag_fmem = sk->sk_forward_alloc;
-		minfo->idiag_tmem = sk_wmem_alloc_get(sk);
+	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+		struct inet_diag_meminfo minfo = {
+			.idiag_rmem = sk_rmem_alloc_get(sk),
+			.idiag_wmem = sk->sk_wmem_queued,
+			.idiag_fmem = sk->sk_forward_alloc,
+			.idiag_tmem = sk_wmem_alloc_get(sk),
+		};
+
+		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+			goto errout;
 	}

 	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
 		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
-			goto rtattr_failure;
+			goto errout;

 	if (icsk == NULL) {
 		handler->idiag_get_info(sk, r, NULL);
@@ -165,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 #undef EXPIRES_IN_MS

-	if (ext & (1 << (INET_DIAG_INFO - 1)))
-		info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info));
+	if (ext & (1 << (INET_DIAG_INFO - 1))) {
+		attr = nla_reserve(skb, INET_DIAG_INFO,
+				   sizeof(struct tcp_info));
+		if (!attr)
+			goto errout;

-	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
-		const size_t len = strlen(icsk->icsk_ca_ops->name);
-
-		strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
-		       icsk->icsk_ca_ops->name);
+		info = nla_data(attr);
 	}

+	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
+		if (nla_put_string(skb, INET_DIAG_CONG,
+				   icsk->icsk_ca_ops->name) < 0)
+			goto errout;
+
 	handler->idiag_get_info(sk, r, info);

 	if (sk->sk_state < TCP_TIME_WAIT &&
@@ -182,12 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		icsk->icsk_ca_ops->get_info(sk, ext, skb);

 out:
-	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-	return skb->len;
+	return nlmsg_end(skb, nlh);

-rtattr_failure:
-nlmsg_failure:
-	nlmsg_trim(skb, b);
+errout:
+	nlmsg_cancel(skb, nlh);
 	return -EMSGSIZE;
 }
 EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
@@ -208,14 +213,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 {
 	long tmo;
 	struct inet_diag_msg *r;
-	const unsigned char *previous_tail = skb_tail_pointer(skb);
-	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
-					 unlh->nlmsg_type, sizeof(*r));
+	struct nlmsghdr *nlh;

-	r = NLMSG_DATA(nlh);
-	BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+			nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;

-	nlh->nlmsg_flags = nlmsg_flags;
+	r = nlmsg_data(nlh);
+	BUG_ON(tw->tw_state != TCP_TIME_WAIT);

 	tmo = tw->tw_ttd - jiffies;
 	if (tmo < 0)
@@ -245,11 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 		*(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
 	}
 #endif
-	nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
-	return skb->len;
-nlmsg_failure:
-	nlmsg_trim(skb, previous_tail);
-	return -EMSGSIZE;
+
+	return nlmsg_end(skb, nlh);
 }

 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -298,20 +301,20 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	if (err)
 		goto out;

-	err = -ENOMEM;
-	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
-				     sizeof(struct inet_diag_meminfo) +
-				     sizeof(struct tcp_info) + 64)),
-			GFP_KERNEL);
-	if (!rep)
+	rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+			sizeof(struct inet_diag_meminfo) +
+			sizeof(struct tcp_info) + 64, GFP_KERNEL);
+	if (!rep) {
+		err = -ENOMEM;
 		goto out;
+	}

 	err = sk_diag_fill(sk, rep, req,
 			   NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
-		kfree_skb(rep);
+		nlmsg_free(rep);
 		goto out;
 	}
 	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
@@ -592,15 +595,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct inet_sock *inet = inet_sk(sk);
-	unsigned char *b = skb_tail_pointer(skb);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 	long tmo;

-	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
-	nlh->nlmsg_flags = NLM_F_MULTI;
-	r = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+			NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;

+	r = nlmsg_data(nlh);
 	r->idiag_family = sk->sk_family;
 	r->idiag_state = TCP_SYN_RECV;
 	r->idiag_timer = 1;
@@ -628,13 +632,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 		*(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
 	}
 #endif
-	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-
-	return skb->len;

-nlmsg_failure:
-	nlmsg_trim(skb, b);
-	return -1;
+	return nlmsg_end(skb, nlh);
 }

 static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
@@ -892,7 +891,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	if (nlmsg_attrlen(cb->nlh, hdrlen))
 		bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);

-	return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc);
+	return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
 }

 static inline int inet_diag_type2proto(int type)
@@ -909,7 +908,7 @@ static inline int inet_diag_type2proto(int type)

 static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct inet_diag_req *rc = NLMSG_DATA(cb->nlh);
+	struct inet_diag_req *rc = nlmsg_data(cb->nlh);
 	struct inet_diag_req_v2 req;
 	struct nlattr *bc = NULL;
 	int hdrlen = sizeof(struct inet_diag_req);
@@ -929,7 +928,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
 static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
 				      const struct nlmsghdr *nlh)
 {
-	struct inet_diag_req *rc = NLMSG_DATA(nlh);
+	struct inet_diag_req *rc = nlmsg_data(nlh);
 	struct inet_diag_req_v2 req;

 	req.sdiag_family = rc->idiag_family;
@@ -996,7 +995,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		}
 	}

-	return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h));
+	return inet_diag_get_exact(skb, h, nlmsg_data(h));
 }

 static const struct sock_diag_handler inet_diag_handler = {
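Every hunk in inet_diag.c is the same conversion: the old NLMSG_PUT()/RTA_PUT_U8()/INET_DIAG_PUT() macros, which bail out through implicit nlmsg_failure/rtattr_failure labels, are replaced by the checked nlattr API, where nlmsg_put() starts a message, nla_put*()/nla_reserve() append attributes with explicit error returns, and nlmsg_end()/nlmsg_cancel() commit or roll back. A generic skeleton of the converted style, matching the API as used in this era of the kernel (MY_MSG_TYPE and MY_ATTR are hypothetical, not from this file):

    static int fill_sketch(struct sk_buff *skb, u32 pid, u32 seq)
    {
            struct nlmsghdr *nlh;

            nlh = nlmsg_put(skb, pid, seq, MY_MSG_TYPE, 0, 0);
            if (!nlh)
                    return -EMSGSIZE;       /* explicit, no hidden goto label */

            if (nla_put_u8(skb, MY_ATTR, 1) < 0)
                    goto errout;

            return nlmsg_end(skb, nlh);     /* fixes up nlmsg_len */

    errout:
            nlmsg_cancel(skb, nlh);         /* trims the partial message */
            return -EMSGSIZE;
    }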
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51b6d0c..85190e69297b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	if (q == NULL)
 		return NULL;

+	q->net = nf;
 	f->constructor(q, arg);
 	atomic_add(f->qsize, &nf->mem);
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
 	atomic_set(&q->refcnt, 1);
-	q->net = nf;

 	return q;
 }
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index dfba343b2509..e1e0a4e8fd34 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -82,23 +82,39 @@ static const struct inet_peer peer_fake_node = { | |||
82 | .avl_height = 0 | 82 | .avl_height = 0 |
83 | }; | 83 | }; |
84 | 84 | ||
85 | struct inet_peer_base { | 85 | void inet_peer_base_init(struct inet_peer_base *bp) |
86 | struct inet_peer __rcu *root; | 86 | { |
87 | seqlock_t lock; | 87 | bp->root = peer_avl_empty_rcu; |
88 | int total; | 88 | seqlock_init(&bp->lock); |
89 | }; | 89 | bp->flush_seq = ~0U; |
90 | bp->total = 0; | ||
91 | } | ||
92 | EXPORT_SYMBOL_GPL(inet_peer_base_init); | ||
90 | 93 | ||
91 | static struct inet_peer_base v4_peers = { | 94 | static atomic_t v4_seq = ATOMIC_INIT(0); |
92 | .root = peer_avl_empty_rcu, | 95 | static atomic_t v6_seq = ATOMIC_INIT(0); |
93 | .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), | ||
94 | .total = 0, | ||
95 | }; | ||
96 | 96 | ||
97 | static struct inet_peer_base v6_peers = { | 97 | static atomic_t *inetpeer_seq_ptr(int family) |
98 | .root = peer_avl_empty_rcu, | 98 | { |
99 | .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), | 99 | return (family == AF_INET ? &v4_seq : &v6_seq); |
100 | .total = 0, | 100 | } |
101 | }; | 101 | |
102 | static inline void flush_check(struct inet_peer_base *base, int family) | ||
103 | { | ||
104 | atomic_t *fp = inetpeer_seq_ptr(family); | ||
105 | |||
106 | if (unlikely(base->flush_seq != atomic_read(fp))) { | ||
107 | inetpeer_invalidate_tree(base); | ||
108 | base->flush_seq = atomic_read(fp); | ||
109 | } | ||
110 | } | ||
111 | |||
112 | void inetpeer_invalidate_family(int family) | ||
113 | { | ||
114 | atomic_t *fp = inetpeer_seq_ptr(family); | ||
115 | |||
116 | atomic_inc(fp); | ||
117 | } | ||
102 | 118 | ||
103 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ | 119 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ |
104 | 120 | ||
@@ -110,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min | |||
110 | 126 | ||
111 | static void inetpeer_gc_worker(struct work_struct *work) | 127 | static void inetpeer_gc_worker(struct work_struct *work) |
112 | { | 128 | { |
113 | struct inet_peer *p, *n; | 129 | struct inet_peer *p, *n, *c; |
114 | LIST_HEAD(list); | 130 | LIST_HEAD(list); |
115 | 131 | ||
116 | spin_lock_bh(&gc_lock); | 132 | spin_lock_bh(&gc_lock); |
@@ -122,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work) | |||
122 | 138 | ||
123 | list_for_each_entry_safe(p, n, &list, gc_list) { | 139 | list_for_each_entry_safe(p, n, &list, gc_list) { |
124 | 140 | ||
125 | if(need_resched()) | 141 | if (need_resched()) |
126 | cond_resched(); | 142 | cond_resched(); |
127 | 143 | ||
128 | if (p->avl_left != peer_avl_empty) { | 144 | c = rcu_dereference_protected(p->avl_left, 1); |
129 | list_add_tail(&p->avl_left->gc_list, &list); | 145 | if (c != peer_avl_empty) { |
130 | p->avl_left = peer_avl_empty; | 146 | list_add_tail(&c->gc_list, &list); |
147 | p->avl_left = peer_avl_empty_rcu; | ||
131 | } | 148 | } |
132 | 149 | ||
133 | if (p->avl_right != peer_avl_empty) { | 150 | c = rcu_dereference_protected(p->avl_right, 1); |
134 | list_add_tail(&p->avl_right->gc_list, &list); | 151 | if (c != peer_avl_empty) { |
135 | p->avl_right = peer_avl_empty; | 152 | list_add_tail(&c->gc_list, &list); |
153 | p->avl_right = peer_avl_empty_rcu; | ||
136 | } | 154 | } |
137 | 155 | ||
138 | n = list_entry(p->gc_list.next, struct inet_peer, gc_list); | 156 | n = list_entry(p->gc_list.next, struct inet_peer, gc_list); |
@@ -401,11 +419,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, | |||
401 | call_rcu(&p->rcu, inetpeer_free_rcu); | 419 | call_rcu(&p->rcu, inetpeer_free_rcu); |
402 | } | 420 | } |
403 | 421 | ||
404 | static struct inet_peer_base *family_to_base(int family) | ||
405 | { | ||
406 | return family == AF_INET ? &v4_peers : &v6_peers; | ||
407 | } | ||
408 | |||
409 | /* perform garbage collect on all items stacked during a lookup */ | 422 | /* perform garbage collect on all items stacked during a lookup */ |
410 | static int inet_peer_gc(struct inet_peer_base *base, | 423 | static int inet_peer_gc(struct inet_peer_base *base, |
411 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], | 424 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], |
@@ -443,14 +456,17 @@ static int inet_peer_gc(struct inet_peer_base *base, | |||
443 | return cnt; | 456 | return cnt; |
444 | } | 457 | } |
445 | 458 | ||
446 | struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) | 459 | struct inet_peer *inet_getpeer(struct inet_peer_base *base, |
460 | const struct inetpeer_addr *daddr, | ||
461 | int create) | ||
447 | { | 462 | { |
448 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; | 463 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; |
449 | struct inet_peer_base *base = family_to_base(daddr->family); | ||
450 | struct inet_peer *p; | 464 | struct inet_peer *p; |
451 | unsigned int sequence; | 465 | unsigned int sequence; |
452 | int invalidated, gccnt = 0; | 466 | int invalidated, gccnt = 0; |
453 | 467 | ||
468 | flush_check(base, daddr->family); | ||
469 | |||
454 | /* Attempt a lockless lookup first. | 470 | /* Attempt a lockless lookup first. |
455 | * Because of a concurrent writer, we might not find an existing entry. | 471 | * Because of a concurrent writer, we might not find an existing entry. |
456 | */ | 472 | */ |
@@ -492,13 +508,9 @@ relookup: | |||
492 | (daddr->family == AF_INET) ? | 508 | (daddr->family == AF_INET) ? |
493 | secure_ip_id(daddr->addr.a4) : | 509 | secure_ip_id(daddr->addr.a4) : |
494 | secure_ipv6_id(daddr->addr.a6)); | 510 | secure_ipv6_id(daddr->addr.a6)); |
495 | p->tcp_ts_stamp = 0; | ||
496 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; | 511 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; |
497 | p->rate_tokens = 0; | 512 | p->rate_tokens = 0; |
498 | p->rate_last = 0; | 513 | p->rate_last = 0; |
499 | p->pmtu_expires = 0; | ||
500 | p->pmtu_orig = 0; | ||
501 | memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); | ||
502 | INIT_LIST_HEAD(&p->gc_list); | 514 | INIT_LIST_HEAD(&p->gc_list); |
503 | 515 | ||
504 | /* Link the node. */ | 516 | /* Link the node. */ |
@@ -571,26 +583,19 @@ static void inetpeer_inval_rcu(struct rcu_head *head) | |||
571 | schedule_delayed_work(&gc_work, gc_delay); | 583 | schedule_delayed_work(&gc_work, gc_delay); |
572 | } | 584 | } |
573 | 585 | ||
574 | void inetpeer_invalidate_tree(int family) | 586 | void inetpeer_invalidate_tree(struct inet_peer_base *base) |
575 | { | 587 | { |
576 | struct inet_peer *old, *new, *prev; | 588 | struct inet_peer *root; |
577 | struct inet_peer_base *base = family_to_base(family); | ||
578 | 589 | ||
579 | write_seqlock_bh(&base->lock); | 590 | write_seqlock_bh(&base->lock); |
580 | 591 | ||
581 | old = base->root; | 592 | root = rcu_deref_locked(base->root, base); |
582 | if (old == peer_avl_empty_rcu) | 593 | if (root != peer_avl_empty) { |
583 | goto out; | 594 | base->root = peer_avl_empty_rcu; |
584 | |||
585 | new = peer_avl_empty_rcu; | ||
586 | |||
587 | prev = cmpxchg(&base->root, old, new); | ||
588 | if (prev == old) { | ||
589 | base->total = 0; | 595 | base->total = 0; |
590 | call_rcu(&prev->gc_rcu, inetpeer_inval_rcu); | 596 | call_rcu(&root->gc_rcu, inetpeer_inval_rcu); |
591 | } | 597 | } |
592 | 598 | ||
593 | out: | ||
594 | write_sequnlock_bh(&base->lock); | 599 | write_sequnlock_bh(&base->lock); |
595 | } | 600 | } |
596 | EXPORT_SYMBOL(inetpeer_invalidate_tree); | 601 | EXPORT_SYMBOL(inetpeer_invalidate_tree); |
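
Note: the inetpeer hunks above replace eager per-family tree flushes with a lazy scheme. inetpeer_invalidate_family() only bumps a per-family atomic counter, and flush_check() compares each base's cached flush_seq against that counter on lookup, invalidating the tree the first time they differ. Below is a minimal userspace sketch of the pattern, assuming C11 atomics; the names and the `total` field standing in for the AVL tree are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int family_seq;          /* stands in for v4_seq / v6_seq */

struct peer_base {
        int flush_seq;                 /* last sequence this base honoured */
        int total;                     /* stands in for the whole AVL tree */
};

/* Cheap per-lookup check: flush only when the counter has moved. */
static void flush_check(struct peer_base *base)
{
        int cur = atomic_load(&family_seq);

        if (base->flush_seq != cur) {
                base->total = 0;       /* kernel: inetpeer_invalidate_tree() */
                base->flush_seq = cur;
        }
}

/* O(1) for the writer; each base catches up on its own next lookup. */
static void invalidate_family(void)
{
        atomic_fetch_add(&family_seq, 1);
}

int main(void)
{
        struct peer_base base = { .flush_seq = 0, .total = 42 };

        flush_check(&base);            /* counter unchanged: tree kept */
        invalidate_family();
        flush_check(&base);            /* counter moved: tree dropped */
        printf("total after flush: %d\n", base.total);   /* prints 0 */
        return 0;
}

The design choice: the invalidator never walks or locks any base, so flushing a whole address family stays constant-time regardless of how many peer trees exist.
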
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9dbd3dd6022d..8d07c973409c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
@@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) | |||
171 | static void ip4_frag_init(struct inet_frag_queue *q, void *a) | 171 | static void ip4_frag_init(struct inet_frag_queue *q, void *a) |
172 | { | 172 | { |
173 | struct ipq *qp = container_of(q, struct ipq, q); | 173 | struct ipq *qp = container_of(q, struct ipq, q); |
174 | struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, | ||
175 | frags); | ||
176 | struct net *net = container_of(ipv4, struct net, ipv4); | ||
177 | |||
174 | struct ip4_create_arg *arg = a; | 178 | struct ip4_create_arg *arg = a; |
175 | 179 | ||
176 | qp->protocol = arg->iph->protocol; | 180 | qp->protocol = arg->iph->protocol; |
@@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) | |||
180 | qp->daddr = arg->iph->daddr; | 184 | qp->daddr = arg->iph->daddr; |
181 | qp->user = arg->user; | 185 | qp->user = arg->user; |
182 | qp->peer = sysctl_ipfrag_max_dist ? | 186 | qp->peer = sysctl_ipfrag_max_dist ? |
183 | inet_getpeer_v4(arg->iph->saddr, 1) : NULL; | 187 | inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; |
184 | } | 188 | } |
185 | 189 | ||
186 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) | 190 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) |
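
Note: the ip4_frag_init() hunk recovers the owning struct net from the embedded netns_frags by walking container_of() twice, because inet_getpeer_v4() now needs the per-namespace peer base. A self-contained sketch of that double hop, with toy struct names standing in for netns_frags/netns_ipv4/net:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Toy stand-ins for the nested kernel structures. */
struct frags   { int high_thresh; };
struct ipv4_ns { struct frags frags; };
struct net_ns  { int id; struct ipv4_ns ipv4; };

int main(void)
{
        struct net_ns net = { .id = 7 };
        struct frags *f = &net.ipv4.frags;

        /* The same two hops ip4_frag_init() takes: frags -> ipv4 -> net. */
        struct ipv4_ns *v4 = container_of(f, struct ipv4_ns, frags);
        struct net_ns  *n  = container_of(v4, struct net_ns, ipv4);

        printf("recovered net id: %d\n", n->id);
        return 0;
}
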
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f49047b79609..594cec35ac4d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -516,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
516 | case ICMP_PORT_UNREACH: | 516 | case ICMP_PORT_UNREACH: |
517 | /* Impossible event. */ | 517 | /* Impossible event. */ |
518 | return; | 518 | return; |
519 | case ICMP_FRAG_NEEDED: | ||
520 | /* Soft state for pmtu is maintained by IP core. */ | ||
521 | return; | ||
522 | default: | 519 | default: |
523 | /* All others are translated to HOST_UNREACH. | 520 | /* All others are translated to HOST_UNREACH. |
524 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 521 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
@@ -538,7 +535,16 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
538 | flags & GRE_KEY ? | 535 | flags & GRE_KEY ? |
539 | *(((__be32 *)p) + (grehlen / 4) - 1) : 0, | 536 | *(((__be32 *)p) + (grehlen / 4) - 1) : 0, |
540 | p[1]); | 537 | p[1]); |
541 | if (t == NULL || t->parms.iph.daddr == 0 || | 538 | if (t == NULL) |
539 | goto out; | ||
540 | |||
541 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | ||
542 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | ||
543 | t->parms.link, 0, IPPROTO_GRE, 0); | ||
544 | goto out; | ||
545 | } | ||
546 | |||
547 | if (t->parms.iph.daddr == 0 || | ||
542 | ipv4_is_multicast(t->parms.iph.daddr)) | 548 | ipv4_is_multicast(t->parms.iph.daddr)) |
543 | goto out; | 549 | goto out; |
544 | 550 | ||
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 8590144ca330..b27d4440f523 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb) | |||
198 | rcu_read_lock(); | 198 | rcu_read_lock(); |
199 | { | 199 | { |
200 | int protocol = ip_hdr(skb)->protocol; | 200 | int protocol = ip_hdr(skb)->protocol; |
201 | int hash, raw; | ||
202 | const struct net_protocol *ipprot; | 201 | const struct net_protocol *ipprot; |
202 | int raw; | ||
203 | 203 | ||
204 | resubmit: | 204 | resubmit: |
205 | raw = raw_local_deliver(skb, protocol); | 205 | raw = raw_local_deliver(skb, protocol); |
206 | 206 | ||
207 | hash = protocol & (MAX_INET_PROTOS - 1); | 207 | ipprot = rcu_dereference(inet_protos[protocol]); |
208 | ipprot = rcu_dereference(inet_protos[hash]); | ||
209 | if (ipprot != NULL) { | 208 | if (ipprot != NULL) { |
210 | int ret; | 209 | int ret; |
211 | 210 | ||
@@ -314,26 +313,33 @@ drop: | |||
314 | return true; | 313 | return true; |
315 | } | 314 | } |
316 | 315 | ||
316 | int sysctl_ip_early_demux __read_mostly = 1; | ||
317 | |||
317 | static int ip_rcv_finish(struct sk_buff *skb) | 318 | static int ip_rcv_finish(struct sk_buff *skb) |
318 | { | 319 | { |
319 | const struct iphdr *iph = ip_hdr(skb); | 320 | const struct iphdr *iph = ip_hdr(skb); |
320 | struct rtable *rt; | 321 | struct rtable *rt; |
321 | 322 | ||
323 | if (sysctl_ip_early_demux && !skb_dst(skb)) { | ||
324 | const struct net_protocol *ipprot; | ||
325 | int protocol = iph->protocol; | ||
326 | |||
327 | rcu_read_lock(); | ||
328 | ipprot = rcu_dereference(inet_protos[protocol]); | ||
329 | if (ipprot && ipprot->early_demux) | ||
330 | ipprot->early_demux(skb); | ||
331 | rcu_read_unlock(); | ||
332 | } | ||
333 | |||
322 | /* | 334 | /* |
323 | * Initialise the virtual path cache for the packet. It describes | 335 | * Initialise the virtual path cache for the packet. It describes |
324 | * how the packet travels inside Linux networking. | 336 | * how the packet travels inside Linux networking. |
325 | */ | 337 | */ |
326 | if (skb_dst(skb) == NULL) { | 338 | if (!skb_dst(skb)) { |
327 | int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, | 339 | int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, |
328 | iph->tos, skb->dev); | 340 | iph->tos, skb->dev); |
329 | if (unlikely(err)) { | 341 | if (unlikely(err)) { |
330 | if (err == -EHOSTUNREACH) | 342 | if (err == -EXDEV) |
331 | IP_INC_STATS_BH(dev_net(skb->dev), | ||
332 | IPSTATS_MIB_INADDRERRORS); | ||
333 | else if (err == -ENETUNREACH) | ||
334 | IP_INC_STATS_BH(dev_net(skb->dev), | ||
335 | IPSTATS_MIB_INNOROUTES); | ||
336 | else if (err == -EXDEV) | ||
337 | NET_INC_STATS_BH(dev_net(skb->dev), | 343 | NET_INC_STATS_BH(dev_net(skb->dev), |
338 | LINUX_MIB_IPRPFILTER); | 344 | LINUX_MIB_IPRPFILTER); |
339 | goto drop; | 345 | goto drop; |
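
Note: ip_rcv_finish() now probes an optional per-protocol early_demux hook, gated by the new sysctl_ip_early_demux, before falling back to the routing lookup, so established sockets can attach a cached dst early. A userspace model of the gated indirect call follows; the ops array and protocol numbers are illustrative, and the kernel performs the table read under rcu_read_lock().

#include <stdio.h>

struct proto_ops {
        void (*early_demux)(int pkt);   /* optional fast-path hook */
};

static void tcp_early_demux(int pkt)
{
        printf("early demux hit for packet %d\n", pkt);
}

static struct proto_ops tcp_ops = { .early_demux = tcp_early_demux };
static struct proto_ops *protos[256] = { [6] = &tcp_ops };  /* IPPROTO_TCP */

static int sysctl_early_demux = 1;      /* kernel: sysctl_ip_early_demux */

static void rcv_finish(unsigned char protocol, int pkt, int has_dst)
{
        /* Probe the hook only when enabled and no route is attached yet. */
        if (sysctl_early_demux && !has_dst) {
                struct proto_ops *ops = protos[protocol];

                if (ops && ops->early_demux)
                        ops->early_demux(pkt);
        }
        /* ...the normal route lookup follows here... */
}

int main(void)
{
        rcv_finish(6, 1, 0);            /* TCP: hook fires */
        rcv_finish(17, 2, 0);           /* UDP slot empty: skipped */
        return 0;
}
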
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 708b99494e23..a19d6471a318 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <net/icmp.h> | 27 | #include <net/icmp.h> |
28 | #include <net/route.h> | 28 | #include <net/route.h> |
29 | #include <net/cipso_ipv4.h> | 29 | #include <net/cipso_ipv4.h> |
30 | #include <net/ip_fib.h> | ||
30 | 31 | ||
31 | /* | 32 | /* |
32 | * Write options to IP header, record destination address to | 33 | * Write options to IP header, record destination address to |
@@ -104,7 +105,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
104 | sptr = skb_network_header(skb); | 105 | sptr = skb_network_header(skb); |
105 | dptr = dopt->__data; | 106 | dptr = dopt->__data; |
106 | 107 | ||
107 | daddr = skb_rtable(skb)->rt_spec_dst; | 108 | daddr = fib_compute_spec_dst(skb); |
108 | 109 | ||
109 | if (sopt->rr) { | 110 | if (sopt->rr) { |
110 | optlen = sptr[sopt->rr+1]; | 111 | optlen = sptr[sopt->rr+1]; |
@@ -241,6 +242,15 @@ void ip_options_fragment(struct sk_buff *skb) | |||
241 | opt->ts_needtime = 0; | 242 | opt->ts_needtime = 0; |
242 | } | 243 | } |
243 | 244 | ||
245 | /* helper used by ip_options_compile() to call fib_compute_spec_dst() | ||
246 | * at most one time. | ||
247 | */ | ||
248 | static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) | ||
249 | { | ||
250 | if (*spec_dst == htonl(INADDR_ANY)) | ||
251 | *spec_dst = fib_compute_spec_dst(skb); | ||
252 | } | ||
253 | |||
244 | /* | 254 | /* |
245 | * Verify options and fill pointers in struct options. | 255 | * Verify options and fill pointers in struct options. |
246 | * Caller should clear *opt, and set opt->data. | 256 | * Caller should clear *opt, and set opt->data. |
@@ -250,12 +260,12 @@ void ip_options_fragment(struct sk_buff *skb) | |||
250 | int ip_options_compile(struct net *net, | 260 | int ip_options_compile(struct net *net, |
251 | struct ip_options *opt, struct sk_buff *skb) | 261 | struct ip_options *opt, struct sk_buff *skb) |
252 | { | 262 | { |
253 | int l; | 263 | __be32 spec_dst = htonl(INADDR_ANY); |
254 | unsigned char *iph; | ||
255 | unsigned char *optptr; | ||
256 | int optlen; | ||
257 | unsigned char *pp_ptr = NULL; | 264 | unsigned char *pp_ptr = NULL; |
258 | struct rtable *rt = NULL; | 265 | struct rtable *rt = NULL; |
266 | unsigned char *optptr; | ||
267 | unsigned char *iph; | ||
268 | int optlen, l; | ||
259 | 269 | ||
260 | if (skb != NULL) { | 270 | if (skb != NULL) { |
261 | rt = skb_rtable(skb); | 271 | rt = skb_rtable(skb); |
@@ -331,7 +341,8 @@ int ip_options_compile(struct net *net, | |||
331 | goto error; | 341 | goto error; |
332 | } | 342 | } |
333 | if (rt) { | 343 | if (rt) { |
334 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); | 344 | spec_dst_fill(&spec_dst, skb); |
345 | memcpy(&optptr[optptr[2]-1], &spec_dst, 4); | ||
335 | opt->is_changed = 1; | 346 | opt->is_changed = 1; |
336 | } | 347 | } |
337 | optptr[2] += 4; | 348 | optptr[2] += 4; |
@@ -373,7 +384,8 @@ int ip_options_compile(struct net *net, | |||
373 | } | 384 | } |
374 | opt->ts = optptr - iph; | 385 | opt->ts = optptr - iph; |
375 | if (rt) { | 386 | if (rt) { |
376 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); | 387 | spec_dst_fill(&spec_dst, skb); |
388 | memcpy(&optptr[optptr[2]-1], &spec_dst, 4); | ||
377 | timeptr = &optptr[optptr[2]+3]; | 389 | timeptr = &optptr[optptr[2]+3]; |
378 | } | 390 | } |
379 | opt->ts_needaddr = 1; | 391 | opt->ts_needaddr = 1; |
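
Note: spec_dst_fill() memoizes fib_compute_spec_dst() so a packet carrying both an RR and a TS option still triggers at most one fib lookup, now that rt_spec_dst is no longer cached in the rtable. A sketch of the compute-once helper; the lookup counter and the address constant are illustrative.

#include <stdint.h>
#include <stdio.h>

#define ADDR_ANY 0u                    /* local stand-in for INADDR_ANY */

static unsigned int fib_lookup_count;

/* Stand-in for the relatively costly fib_compute_spec_dst(). */
static uint32_t compute_spec_dst(void)
{
        fib_lookup_count++;
        return 0x0a000001;             /* 10.0.0.1, illustrative */
}

/* Fill the cached value on first use only: at most once per packet. */
static void spec_dst_fill(uint32_t *spec_dst)
{
        if (*spec_dst == ADDR_ANY)
                *spec_dst = compute_spec_dst();
}

int main(void)
{
        uint32_t spec_dst = ADDR_ANY;

        spec_dst_fill(&spec_dst);      /* RR option: does the lookup */
        spec_dst_fill(&spec_dst);      /* TS option: reuses the result */
        printf("lookups: %u\n", fib_lookup_count);   /* prints 1 */
        return 0;
}
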
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 451f97c42eb4..cc52679790b2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb) | |||
113 | } | 113 | } |
114 | EXPORT_SYMBOL_GPL(ip_local_out); | 114 | EXPORT_SYMBOL_GPL(ip_local_out); |
115 | 115 | ||
116 | /* dev_loopback_xmit for use with netfilter. */ | ||
117 | static int ip_dev_loopback_xmit(struct sk_buff *newskb) | ||
118 | { | ||
119 | skb_reset_mac_header(newskb); | ||
120 | __skb_pull(newskb, skb_network_offset(newskb)); | ||
121 | newskb->pkt_type = PACKET_LOOPBACK; | ||
122 | newskb->ip_summed = CHECKSUM_UNNECESSARY; | ||
123 | WARN_ON(!skb_dst(newskb)); | ||
124 | skb_dst_force(newskb); | ||
125 | netif_rx_ni(newskb); | ||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) | 116 | static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) |
130 | { | 117 | { |
131 | int ttl = inet->uc_ttl; | 118 | int ttl = inet->uc_ttl; |
@@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
183 | struct net_device *dev = dst->dev; | 170 | struct net_device *dev = dst->dev; |
184 | unsigned int hh_len = LL_RESERVED_SPACE(dev); | 171 | unsigned int hh_len = LL_RESERVED_SPACE(dev); |
185 | struct neighbour *neigh; | 172 | struct neighbour *neigh; |
173 | u32 nexthop; | ||
186 | 174 | ||
187 | if (rt->rt_type == RTN_MULTICAST) { | 175 | if (rt->rt_type == RTN_MULTICAST) { |
188 | IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); | 176 | IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); |
@@ -200,19 +188,22 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
200 | } | 188 | } |
201 | if (skb->sk) | 189 | if (skb->sk) |
202 | skb_set_owner_w(skb2, skb->sk); | 190 | skb_set_owner_w(skb2, skb->sk); |
203 | kfree_skb(skb); | 191 | consume_skb(skb); |
204 | skb = skb2; | 192 | skb = skb2; |
205 | } | 193 | } |
206 | 194 | ||
207 | rcu_read_lock(); | 195 | rcu_read_lock_bh(); |
208 | neigh = dst_get_neighbour_noref(dst); | 196 | nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; |
197 | neigh = __ipv4_neigh_lookup_noref(dev, nexthop); | ||
198 | if (unlikely(!neigh)) | ||
199 | neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); | ||
209 | if (neigh) { | 200 | if (neigh) { |
210 | int res = neigh_output(neigh, skb); | 201 | int res = dst_neigh_output(dst, neigh, skb); |
211 | 202 | ||
212 | rcu_read_unlock(); | 203 | rcu_read_unlock_bh(); |
213 | return res; | 204 | return res; |
214 | } | 205 | } |
215 | rcu_read_unlock(); | 206 | rcu_read_unlock_bh(); |
216 | 207 | ||
217 | net_dbg_ratelimited("%s: No header cache and no neighbour!\n", | 208 | net_dbg_ratelimited("%s: No header cache and no neighbour!\n", |
218 | __func__); | 209 | __func__); |
@@ -281,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb) | |||
281 | if (newskb) | 272 | if (newskb) |
282 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, | 273 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, |
283 | newskb, NULL, newskb->dev, | 274 | newskb, NULL, newskb->dev, |
284 | ip_dev_loopback_xmit); | 275 | dev_loopback_xmit); |
285 | } | 276 | } |
286 | 277 | ||
287 | /* Multicasts with ttl 0 must not go beyond the host */ | 278 | /* Multicasts with ttl 0 must not go beyond the host */ |
@@ -296,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb) | |||
296 | struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); | 287 | struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); |
297 | if (newskb) | 288 | if (newskb) |
298 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, | 289 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, |
299 | NULL, newskb->dev, ip_dev_loopback_xmit); | 290 | NULL, newskb->dev, dev_loopback_xmit); |
300 | } | 291 | } |
301 | 292 | ||
302 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, | 293 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, |
@@ -709,7 +700,7 @@ slow_path: | |||
709 | 700 | ||
710 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); | 701 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); |
711 | } | 702 | } |
712 | kfree_skb(skb); | 703 | consume_skb(skb); |
713 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); | 704 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); |
714 | return err; | 705 | return err; |
715 | 706 | ||
@@ -1472,13 +1463,14 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, | |||
1472 | 1463 | ||
1473 | /* | 1464 | /* |
1474 | * Generic function to send a packet as reply to another packet. | 1465 | * Generic function to send a packet as reply to another packet. |
1475 | * Used to send TCP resets so far. ICMP should use this function too. | 1466 | * Used to send TCP resets so far. |
1476 | * | 1467 | * |
1477 | * Should run single threaded per socket because it uses the sock | 1468 | * Should run single threaded per socket because it uses the sock |
1478 | * structure to pass arguments. | 1469 | * structure to pass arguments. |
1479 | */ | 1470 | */ |
1480 | void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, | 1471 | void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, |
1481 | const struct ip_reply_arg *arg, unsigned int len) | 1472 | __be32 saddr, const struct ip_reply_arg *arg, |
1473 | unsigned int len) | ||
1482 | { | 1474 | { |
1483 | struct inet_sock *inet = inet_sk(sk); | 1475 | struct inet_sock *inet = inet_sk(sk); |
1484 | struct ip_options_data replyopts; | 1476 | struct ip_options_data replyopts; |
@@ -1504,7 +1496,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, | |||
1504 | RT_TOS(arg->tos), | 1496 | RT_TOS(arg->tos), |
1505 | RT_SCOPE_UNIVERSE, sk->sk_protocol, | 1497 | RT_SCOPE_UNIVERSE, sk->sk_protocol, |
1506 | ip_reply_arg_flowi_flags(arg), | 1498 | ip_reply_arg_flowi_flags(arg), |
1507 | daddr, rt->rt_spec_dst, | 1499 | daddr, saddr, |
1508 | tcp_hdr(skb)->source, tcp_hdr(skb)->dest); | 1500 | tcp_hdr(skb)->source, tcp_hdr(skb)->dest); |
1509 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); | 1501 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
1510 | rt = ip_route_output_key(sock_net(sk), &fl4); | 1502 | rt = ip_route_output_key(sock_net(sk), &fl4); |
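
Note: ip_finish_output2() now derives the neighbour key itself - the route's gateway when one is set, otherwise the packet's destination - and creates the entry on demand when the lookup misses. A toy model of that fallback-then-create flow; the one-entry cache stands in for the kernel's arp_tbl, and the real code runs under rcu_read_lock_bh().

#include <stdio.h>
#include <stdlib.h>

struct neigh { unsigned int addr; };

/* One-entry "neighbour table": lookups may miss, creates replace. */
static struct neigh *cache;

static struct neigh *neigh_lookup(unsigned int addr)
{
        return (cache && cache->addr == addr) ? cache : NULL;
}

static struct neigh *neigh_create(unsigned int addr)
{
        free(cache);                    /* keep the toy table leak-free */
        cache = malloc(sizeof(*cache));
        if (cache)
                cache->addr = addr;
        return cache;
}

static int output(unsigned int gateway, unsigned int daddr)
{
        /* Gateway when the route has one, else the on-link destination. */
        unsigned int nexthop = gateway ? gateway : daddr;
        struct neigh *n = neigh_lookup(nexthop);

        if (!n)                         /* kernel: __neigh_create() */
                n = neigh_create(nexthop);
        if (!n)
                return -1;
        printf("xmit via neighbour %#x\n", n->addr);
        return 0;
}

int main(void)
{
        output(0, 0x0a000002);          /* on-link: resolve daddr itself */
        output(0x0a000001, 0x08080808); /* gatewayed: resolve the gateway */
        free(cache);
        return 0;
}
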
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 0d11f234d615..de29f46f68b0 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #if IS_ENABLED(CONFIG_IPV6) | 40 | #if IS_ENABLED(CONFIG_IPV6) |
41 | #include <net/transp_v6.h> | 41 | #include <net/transp_v6.h> |
42 | #endif | 42 | #endif |
43 | #include <net/ip_fib.h> | ||
43 | 44 | ||
44 | #include <linux/errqueue.h> | 45 | #include <linux/errqueue.h> |
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
@@ -1019,8 +1020,8 @@ e_inval: | |||
1019 | * @sk: socket | 1020 | * @sk: socket |
1020 | * @skb: buffer | 1021 | * @skb: buffer |
1021 | * | 1022 | * |
1022 | * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst | 1023 | * To support IP_CMSG_PKTINFO option, we store rt_iif and specific |
1023 | * in skb->cb[] before dst drop. | 1024 | * destination in skb->cb[] before dst drop. |
1024 | * This way, receiver doesnt make cache line misses to read rtable. | 1025 | * This way, receiver doesnt make cache line misses to read rtable. |
1025 | */ | 1026 | */ |
1026 | void ipv4_pktinfo_prepare(struct sk_buff *skb) | 1027 | void ipv4_pktinfo_prepare(struct sk_buff *skb) |
@@ -1030,7 +1031,7 @@ void ipv4_pktinfo_prepare(struct sk_buff *skb) | |||
1030 | 1031 | ||
1031 | if (rt) { | 1032 | if (rt) { |
1032 | pktinfo->ipi_ifindex = rt->rt_iif; | 1033 | pktinfo->ipi_ifindex = rt->rt_iif; |
1033 | pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; | 1034 | pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); |
1034 | } else { | 1035 | } else { |
1035 | pktinfo->ipi_ifindex = 0; | 1036 | pktinfo->ipi_ifindex = 0; |
1036 | pktinfo->ipi_spec_dst.s_addr = 0; | 1037 | pktinfo->ipi_spec_dst.s_addr = 0; |
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 63b64c45a826..b91375482d84 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -42,6 +42,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
42 | return; | 42 | return; |
43 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", | 43 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", |
44 | spi, &iph->daddr); | 44 | spi, &iph->daddr); |
45 | ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); | ||
45 | xfrm_state_put(x); | 46 | xfrm_state_put(x); |
46 | } | 47 | } |
47 | 48 | ||
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2d0f99bf61b3..715338a1b205 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -348,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
348 | case ICMP_PORT_UNREACH: | 348 | case ICMP_PORT_UNREACH: |
349 | /* Impossible event. */ | 349 | /* Impossible event. */ |
350 | return 0; | 350 | return 0; |
351 | case ICMP_FRAG_NEEDED: | ||
352 | /* Soft state for pmtu is maintained by IP core. */ | ||
353 | return 0; | ||
354 | default: | 351 | default: |
355 | /* All others are translated to HOST_UNREACH. | 352 | /* All others are translated to HOST_UNREACH. |
356 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 353 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
@@ -369,7 +366,17 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
369 | 366 | ||
370 | rcu_read_lock(); | 367 | rcu_read_lock(); |
371 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | 368 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); |
372 | if (t == NULL || t->parms.iph.daddr == 0) | 369 | if (t == NULL) |
370 | goto out; | ||
371 | |||
372 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | ||
373 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | ||
374 | t->dev->ifindex, 0, IPPROTO_IPIP, 0); | ||
375 | err = 0; | ||
376 | goto out; | ||
377 | } | ||
378 | |||
379 | if (t->parms.iph.daddr == 0) | ||
373 | goto out; | 380 | goto out; |
374 | 381 | ||
375 | err = 0; | 382 | err = 0; |
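
Note: the GRE, IPComp and IPIP error handlers above stop swallowing ICMP_FRAG_NEEDED ("soft state maintained by IP core") and instead feed the reported MTU to ipv4_update_pmtu() against the tunnel's own routes. A bare sketch of the new dispatch; the type/code values match ICMP, everything else is illustrative.

#include <stdio.h>

enum { ICMP_DEST_UNREACH = 3, ICMP_FRAG_NEEDED = 4 };

static void update_pmtu(unsigned int mtu)
{
        printf("cached route mtu lowered to %u\n", mtu);
}

/* Tunnel ICMP error handler: PMTU events now act instead of returning. */
static int tunnel_err(int type, int code, unsigned int info)
{
        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
                update_pmtu(info);      /* kernel: ipv4_update_pmtu() */
                return 0;
        }
        /* everything else falls through to host-unreachable handling */
        return -1;
}

int main(void)
{
        tunnel_err(ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 1400);
        return 0;
}
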
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c94bbc6f2ba3..5716c6b808d6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -524,8 +524,8 @@ failure: | |||
524 | } | 524 | } |
525 | #endif | 525 | #endif |
526 | 526 | ||
527 | /* | 527 | /** |
528 | * Delete a VIF entry | 528 | * vif_delete - Delete a VIF entry |
529 | * @notify: Set to 1, if the caller is a notifier_call | 529 | * @notify: Set to 1, if the caller is a notifier_call |
530 | */ | 530 | */ |
531 | 531 | ||
@@ -2006,37 +2006,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | |||
2006 | { | 2006 | { |
2007 | int ct; | 2007 | int ct; |
2008 | struct rtnexthop *nhp; | 2008 | struct rtnexthop *nhp; |
2009 | u8 *b = skb_tail_pointer(skb); | 2009 | struct nlattr *mp_attr; |
2010 | struct rtattr *mp_head; | ||
2011 | 2010 | ||
2012 | /* If cache is unresolved, don't try to parse IIF and OIF */ | 2011 | /* If cache is unresolved, don't try to parse IIF and OIF */ |
2013 | if (c->mfc_parent >= MAXVIFS) | 2012 | if (c->mfc_parent >= MAXVIFS) |
2014 | return -ENOENT; | 2013 | return -ENOENT; |
2015 | 2014 | ||
2016 | if (VIF_EXISTS(mrt, c->mfc_parent)) | 2015 | if (VIF_EXISTS(mrt, c->mfc_parent) && |
2017 | RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); | 2016 | nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) |
2017 | return -EMSGSIZE; | ||
2018 | 2018 | ||
2019 | mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); | 2019 | if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) |
2020 | return -EMSGSIZE; | ||
2020 | 2021 | ||
2021 | for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { | 2022 | for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { |
2022 | if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { | 2023 | if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { |
2023 | if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) | 2024 | if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { |
2024 | goto rtattr_failure; | 2025 | nla_nest_cancel(skb, mp_attr); |
2025 | nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); | 2026 | return -EMSGSIZE; |
2027 | } | ||
2028 | |||
2026 | nhp->rtnh_flags = 0; | 2029 | nhp->rtnh_flags = 0; |
2027 | nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; | 2030 | nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; |
2028 | nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; | 2031 | nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; |
2029 | nhp->rtnh_len = sizeof(*nhp); | 2032 | nhp->rtnh_len = sizeof(*nhp); |
2030 | } | 2033 | } |
2031 | } | 2034 | } |
2032 | mp_head->rta_type = RTA_MULTIPATH; | 2035 | |
2033 | mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; | 2036 | nla_nest_end(skb, mp_attr); |
2037 | |||
2034 | rtm->rtm_type = RTN_MULTICAST; | 2038 | rtm->rtm_type = RTN_MULTICAST; |
2035 | return 1; | 2039 | return 1; |
2036 | |||
2037 | rtattr_failure: | ||
2038 | nlmsg_trim(skb, b); | ||
2039 | return -EMSGSIZE; | ||
2040 | } | 2040 | } |
2041 | 2041 | ||
2042 | int ipmr_get_route(struct net *net, struct sk_buff *skb, | 2042 | int ipmr_get_route(struct net *net, struct sk_buff *skb, |
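
Note: __ipmr_fill_mroute() drops the open-coded RTA_PUT/rtattr_failure dance for the nla_nest_start()/nla_nest_end() pair, which reserves the RTA_MULTIPATH header first and patches its length once the nexthops are in place. A userspace model of that reserve-then-patch shape; the buffer, the header layout and the type value are simplified stand-ins for the skb and struct nlattr.

#include <stdio.h>
#include <string.h>

/* Toy message buffer with the same shape as netlink attribute nesting:
 * nest_start reserves a header, nest_end patches its length field.
 */
struct msg {
        unsigned char buf[256];
        int len;
};

struct attr_hdr { unsigned short len, type; };

static struct attr_hdr *nest_start(struct msg *m, int type)
{
        struct attr_hdr *h = (struct attr_hdr *)(m->buf + m->len);

        h->type = (unsigned short)type;
        h->len = 0;                    /* unknown until nest_end() */
        m->len += (int)sizeof(*h);
        return h;
}

static void nest_end(struct msg *m, struct attr_hdr *h)
{
        h->len = (unsigned short)((m->buf + m->len) - (unsigned char *)h);
}

int main(void)
{
        struct msg m = { .len = 0 };
        struct attr_hdr *nest = nest_start(&m, 9 /* RTA_MULTIPATH */);
        int hop = 42;

        memcpy(m.buf + m.len, &hop, sizeof(hop));   /* one "nexthop" */
        m.len += (int)sizeof(hop);
        nest_end(&m, nest);
        printf("nested attr type %u, len %u\n",
               (unsigned)nest->type, (unsigned)nest->len);
        return 0;
}

A failed fill maps to nla_nest_cancel(), which simply rewinds the message to the reserved header, as the new -EMSGSIZE path above shows.
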
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index ba5756d20165..1109f7f6c254 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c | |||
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
196 | 196 | ||
197 | pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); | 197 | pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); |
198 | 198 | ||
199 | /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ | 199 | nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, |
200 | nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, | 200 | sizeof(*pm)+copy_len, 0); |
201 | sizeof(*pm)+copy_len); | 201 | if (!nlh) { |
202 | pr_debug("error during nlmsg_put\n"); | ||
203 | goto out_unlock; | ||
204 | } | ||
202 | ub->qlen++; | 205 | ub->qlen++; |
203 | 206 | ||
204 | pm = NLMSG_DATA(nlh); | 207 | pm = nlmsg_data(nlh); |
205 | 208 | ||
206 | /* We might not have a timestamp, get one */ | 209 | /* We might not have a timestamp, get one */ |
207 | if (skb->tstamp.tv64 == 0) | 210 | if (skb->tstamp.tv64 == 0) |
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
261 | nlh->nlmsg_type = NLMSG_DONE; | 264 | nlh->nlmsg_type = NLMSG_DONE; |
262 | ulog_send(groupnum); | 265 | ulog_send(groupnum); |
263 | } | 266 | } |
264 | 267 | out_unlock: | |
265 | spin_unlock_bh(&ulog_lock); | 268 | spin_unlock_bh(&ulog_lock); |
266 | 269 | ||
267 | return; | 270 | return; |
268 | 271 | ||
269 | nlmsg_failure: | ||
270 | pr_debug("error during NLMSG_PUT\n"); | ||
271 | alloc_failure: | 272 | alloc_failure: |
272 | pr_debug("Error building netlink message\n"); | 273 | pr_debug("Error building netlink message\n"); |
273 | spin_unlock_bh(&ulog_lock); | 274 | spin_unlock_bh(&ulog_lock); |
@@ -380,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { | |||
380 | static int __init ulog_tg_init(void) | 381 | static int __init ulog_tg_init(void) |
381 | { | 382 | { |
382 | int ret, i; | 383 | int ret, i; |
384 | struct netlink_kernel_cfg cfg = { | ||
385 | .groups = ULOG_MAXNLGROUPS, | ||
386 | }; | ||
383 | 387 | ||
384 | pr_debug("init module\n"); | 388 | pr_debug("init module\n"); |
385 | 389 | ||
@@ -392,9 +396,8 @@ static int __init ulog_tg_init(void) | |||
392 | for (i = 0; i < ULOG_MAXNLGROUPS; i++) | 396 | for (i = 0; i < ULOG_MAXNLGROUPS; i++) |
393 | setup_timer(&ulog_buffers[i].timer, ulog_timer, i); | 397 | setup_timer(&ulog_buffers[i].timer, ulog_timer, i); |
394 | 398 | ||
395 | nflognl = netlink_kernel_create(&init_net, | 399 | nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, |
396 | NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, | 400 | THIS_MODULE, &cfg); |
397 | NULL, THIS_MODULE); | ||
398 | if (!nflognl) | 401 | if (!nflognl) |
399 | return -ENOMEM; | 402 | return -ENOMEM; |
400 | 403 | ||
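
Note: ipt_ulog_packet() swaps the NLMSG_PUT macro, which hid a goto nlmsg_failure inside its expansion, for nlmsg_put(), which returns NULL on overflow so the error path is explicit at the call site (the new out_unlock label). A toy model of the function-style contract; the fixed-size buffer is illustrative.

#include <stdio.h>
#include <string.h>

struct buf { char data[32]; int len; };

/* Function style: return NULL on overflow, the caller owns the branch. */
static char *msg_put(struct buf *b, int n)
{
        if (b->len + n > (int)sizeof(b->data))
                return NULL;
        b->len += n;
        return b->data + b->len - n;
}

int main(void)
{
        struct buf b = { .len = 0 };
        char *p = msg_put(&b, 24);

        if (!p) {                      /* explicit, visible error path */
                fprintf(stderr, "error during msg_put\n");
                return 1;
        }
        memset(p, 0, 24);

        if (!msg_put(&b, 24))          /* second put overflows */
                fprintf(stderr, "overflow handled by caller\n");
        return 0;
}
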
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 91747d4ebc26..e7ff2dcab6ce 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -95,11 +95,11 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, | |||
95 | return NF_ACCEPT; | 95 | return NF_ACCEPT; |
96 | } | 96 | } |
97 | 97 | ||
98 | static unsigned int ipv4_confirm(unsigned int hooknum, | 98 | static unsigned int ipv4_helper(unsigned int hooknum, |
99 | struct sk_buff *skb, | 99 | struct sk_buff *skb, |
100 | const struct net_device *in, | 100 | const struct net_device *in, |
101 | const struct net_device *out, | 101 | const struct net_device *out, |
102 | int (*okfn)(struct sk_buff *)) | 102 | int (*okfn)(struct sk_buff *)) |
103 | { | 103 | { |
104 | struct nf_conn *ct; | 104 | struct nf_conn *ct; |
105 | enum ip_conntrack_info ctinfo; | 105 | enum ip_conntrack_info ctinfo; |
@@ -110,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum, | |||
110 | /* This is where we call the helper: as the packet goes out. */ | 110 | /* This is where we call the helper: as the packet goes out. */ |
111 | ct = nf_ct_get(skb, &ctinfo); | 111 | ct = nf_ct_get(skb, &ctinfo); |
112 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) | 112 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) |
113 | goto out; | 113 | return NF_ACCEPT; |
114 | 114 | ||
115 | help = nfct_help(ct); | 115 | help = nfct_help(ct); |
116 | if (!help) | 116 | if (!help) |
117 | goto out; | 117 | return NF_ACCEPT; |
118 | 118 | ||
119 | /* rcu_read_lock()ed by nf_hook_slow */ | 119 | /* rcu_read_lock()ed by nf_hook_slow */ |
120 | helper = rcu_dereference(help->helper); | 120 | helper = rcu_dereference(help->helper); |
121 | if (!helper) | 121 | if (!helper) |
122 | goto out; | 122 | return NF_ACCEPT; |
123 | 123 | ||
124 | ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), | 124 | ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), |
125 | ct, ctinfo); | 125 | ct, ctinfo); |
126 | if (ret != NF_ACCEPT) { | 126 | if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { |
127 | nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, | 127 | nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, |
128 | "nf_ct_%s: dropping packet", helper->name); | 128 | "nf_ct_%s: dropping packet", helper->name); |
129 | return ret; | ||
130 | } | 129 | } |
130 | return ret; | ||
131 | } | ||
132 | |||
133 | static unsigned int ipv4_confirm(unsigned int hooknum, | ||
134 | struct sk_buff *skb, | ||
135 | const struct net_device *in, | ||
136 | const struct net_device *out, | ||
137 | int (*okfn)(struct sk_buff *)) | ||
138 | { | ||
139 | struct nf_conn *ct; | ||
140 | enum ip_conntrack_info ctinfo; | ||
141 | |||
142 | ct = nf_ct_get(skb, &ctinfo); | ||
143 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) | ||
144 | goto out; | ||
131 | 145 | ||
132 | /* adjust seqs for loopback traffic only in outgoing direction */ | 146 | /* adjust seqs for loopback traffic only in outgoing direction */ |
133 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && | 147 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && |
@@ -185,6 +199,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | |||
185 | .priority = NF_IP_PRI_CONNTRACK, | 199 | .priority = NF_IP_PRI_CONNTRACK, |
186 | }, | 200 | }, |
187 | { | 201 | { |
202 | .hook = ipv4_helper, | ||
203 | .owner = THIS_MODULE, | ||
204 | .pf = NFPROTO_IPV4, | ||
205 | .hooknum = NF_INET_POST_ROUTING, | ||
206 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
207 | }, | ||
208 | { | ||
188 | .hook = ipv4_confirm, | 209 | .hook = ipv4_confirm, |
189 | .owner = THIS_MODULE, | 210 | .owner = THIS_MODULE, |
190 | .pf = NFPROTO_IPV4, | 211 | .pf = NFPROTO_IPV4, |
@@ -192,6 +213,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | |||
192 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, | 213 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, |
193 | }, | 214 | }, |
194 | { | 215 | { |
216 | .hook = ipv4_helper, | ||
217 | .owner = THIS_MODULE, | ||
218 | .pf = NFPROTO_IPV4, | ||
219 | .hooknum = NF_INET_LOCAL_IN, | ||
220 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
221 | }, | ||
222 | { | ||
195 | .hook = ipv4_confirm, | 223 | .hook = ipv4_confirm, |
196 | .owner = THIS_MODULE, | 224 | .owner = THIS_MODULE, |
197 | .pf = NFPROTO_IPV4, | 225 | .pf = NFPROTO_IPV4, |
@@ -207,35 +235,30 @@ static int log_invalid_proto_max = 255; | |||
207 | static ctl_table ip_ct_sysctl_table[] = { | 235 | static ctl_table ip_ct_sysctl_table[] = { |
208 | { | 236 | { |
209 | .procname = "ip_conntrack_max", | 237 | .procname = "ip_conntrack_max", |
210 | .data = &nf_conntrack_max, | ||
211 | .maxlen = sizeof(int), | 238 | .maxlen = sizeof(int), |
212 | .mode = 0644, | 239 | .mode = 0644, |
213 | .proc_handler = proc_dointvec, | 240 | .proc_handler = proc_dointvec, |
214 | }, | 241 | }, |
215 | { | 242 | { |
216 | .procname = "ip_conntrack_count", | 243 | .procname = "ip_conntrack_count", |
217 | .data = &init_net.ct.count, | ||
218 | .maxlen = sizeof(int), | 244 | .maxlen = sizeof(int), |
219 | .mode = 0444, | 245 | .mode = 0444, |
220 | .proc_handler = proc_dointvec, | 246 | .proc_handler = proc_dointvec, |
221 | }, | 247 | }, |
222 | { | 248 | { |
223 | .procname = "ip_conntrack_buckets", | 249 | .procname = "ip_conntrack_buckets", |
224 | .data = &init_net.ct.htable_size, | ||
225 | .maxlen = sizeof(unsigned int), | 250 | .maxlen = sizeof(unsigned int), |
226 | .mode = 0444, | 251 | .mode = 0444, |
227 | .proc_handler = proc_dointvec, | 252 | .proc_handler = proc_dointvec, |
228 | }, | 253 | }, |
229 | { | 254 | { |
230 | .procname = "ip_conntrack_checksum", | 255 | .procname = "ip_conntrack_checksum", |
231 | .data = &init_net.ct.sysctl_checksum, | ||
232 | .maxlen = sizeof(int), | 256 | .maxlen = sizeof(int), |
233 | .mode = 0644, | 257 | .mode = 0644, |
234 | .proc_handler = proc_dointvec, | 258 | .proc_handler = proc_dointvec, |
235 | }, | 259 | }, |
236 | { | 260 | { |
237 | .procname = "ip_conntrack_log_invalid", | 261 | .procname = "ip_conntrack_log_invalid", |
238 | .data = &init_net.ct.sysctl_log_invalid, | ||
239 | .maxlen = sizeof(unsigned int), | 262 | .maxlen = sizeof(unsigned int), |
240 | .mode = 0644, | 263 | .mode = 0644, |
241 | .proc_handler = proc_dointvec_minmax, | 264 | .proc_handler = proc_dointvec_minmax, |
@@ -351,6 +374,25 @@ static struct nf_sockopt_ops so_getorigdst = { | |||
351 | .owner = THIS_MODULE, | 374 | .owner = THIS_MODULE, |
352 | }; | 375 | }; |
353 | 376 | ||
377 | static int ipv4_init_net(struct net *net) | ||
378 | { | ||
379 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | ||
380 | struct nf_ip_net *in = &net->ct.nf_ct_proto; | ||
381 | in->ctl_table = kmemdup(ip_ct_sysctl_table, | ||
382 | sizeof(ip_ct_sysctl_table), | ||
383 | GFP_KERNEL); | ||
384 | if (!in->ctl_table) | ||
385 | return -ENOMEM; | ||
386 | |||
387 | in->ctl_table[0].data = &nf_conntrack_max; | ||
388 | in->ctl_table[1].data = &net->ct.count; | ||
389 | in->ctl_table[2].data = &net->ct.htable_size; | ||
390 | in->ctl_table[3].data = &net->ct.sysctl_checksum; | ||
391 | in->ctl_table[4].data = &net->ct.sysctl_log_invalid; | ||
392 | #endif | ||
393 | return 0; | ||
394 | } | ||
395 | |||
354 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { | 396 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { |
355 | .l3proto = PF_INET, | 397 | .l3proto = PF_INET, |
356 | .name = "ipv4", | 398 | .name = "ipv4", |
@@ -366,8 +408,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { | |||
366 | #endif | 408 | #endif |
367 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | 409 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) |
368 | .ctl_table_path = "net/ipv4/netfilter", | 410 | .ctl_table_path = "net/ipv4/netfilter", |
369 | .ctl_table = ip_ct_sysctl_table, | ||
370 | #endif | 411 | #endif |
412 | .init_net = ipv4_init_net, | ||
371 | .me = THIS_MODULE, | 413 | .me = THIS_MODULE, |
372 | }; | 414 | }; |
373 | 415 | ||
@@ -378,6 +420,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); | |||
378 | MODULE_ALIAS("ip_conntrack"); | 420 | MODULE_ALIAS("ip_conntrack"); |
379 | MODULE_LICENSE("GPL"); | 421 | MODULE_LICENSE("GPL"); |
380 | 422 | ||
423 | static int ipv4_net_init(struct net *net) | ||
424 | { | ||
425 | int ret = 0; | ||
426 | |||
427 | ret = nf_conntrack_l4proto_register(net, | ||
428 | &nf_conntrack_l4proto_tcp4); | ||
429 | if (ret < 0) { | ||
430 | pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n"); | ||
431 | goto out_tcp; | ||
432 | } | ||
433 | ret = nf_conntrack_l4proto_register(net, | ||
434 | &nf_conntrack_l4proto_udp4); | ||
435 | if (ret < 0) { | ||
436 | pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n"); | ||
437 | goto out_udp; | ||
438 | } | ||
439 | ret = nf_conntrack_l4proto_register(net, | ||
440 | &nf_conntrack_l4proto_icmp); | ||
441 | if (ret < 0) { | ||
442 | pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n"); | ||
443 | goto out_icmp; | ||
444 | } | ||
445 | ret = nf_conntrack_l3proto_register(net, | ||
446 | &nf_conntrack_l3proto_ipv4); | ||
447 | if (ret < 0) { | ||
448 | pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n"); | ||
449 | goto out_ipv4; | ||
450 | } | ||
451 | return 0; | ||
452 | out_ipv4: | ||
453 | nf_conntrack_l4proto_unregister(net, | ||
454 | &nf_conntrack_l4proto_icmp); | ||
455 | out_icmp: | ||
456 | nf_conntrack_l4proto_unregister(net, | ||
457 | &nf_conntrack_l4proto_udp4); | ||
458 | out_udp: | ||
459 | nf_conntrack_l4proto_unregister(net, | ||
460 | &nf_conntrack_l4proto_tcp4); | ||
461 | out_tcp: | ||
462 | return ret; | ||
463 | } | ||
464 | |||
465 | static void ipv4_net_exit(struct net *net) | ||
466 | { | ||
467 | nf_conntrack_l3proto_unregister(net, | ||
468 | &nf_conntrack_l3proto_ipv4); | ||
469 | nf_conntrack_l4proto_unregister(net, | ||
470 | &nf_conntrack_l4proto_icmp); | ||
471 | nf_conntrack_l4proto_unregister(net, | ||
472 | &nf_conntrack_l4proto_udp4); | ||
473 | nf_conntrack_l4proto_unregister(net, | ||
474 | &nf_conntrack_l4proto_tcp4); | ||
475 | } | ||
476 | |||
477 | static struct pernet_operations ipv4_net_ops = { | ||
478 | .init = ipv4_net_init, | ||
479 | .exit = ipv4_net_exit, | ||
480 | }; | ||
481 | |||
381 | static int __init nf_conntrack_l3proto_ipv4_init(void) | 482 | static int __init nf_conntrack_l3proto_ipv4_init(void) |
382 | { | 483 | { |
383 | int ret = 0; | 484 | int ret = 0; |
@@ -391,35 +492,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) | |||
391 | return ret; | 492 | return ret; |
392 | } | 493 | } |
393 | 494 | ||
394 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); | 495 | ret = register_pernet_subsys(&ipv4_net_ops); |
395 | if (ret < 0) { | 496 | if (ret < 0) { |
396 | pr_err("nf_conntrack_ipv4: can't register tcp.\n"); | 497 | pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); |
397 | goto cleanup_sockopt; | 498 | goto cleanup_sockopt; |
398 | } | 499 | } |
399 | 500 | ||
400 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); | ||
401 | if (ret < 0) { | ||
402 | pr_err("nf_conntrack_ipv4: can't register udp.\n"); | ||
403 | goto cleanup_tcp; | ||
404 | } | ||
405 | |||
406 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); | ||
407 | if (ret < 0) { | ||
408 | pr_err("nf_conntrack_ipv4: can't register icmp.\n"); | ||
409 | goto cleanup_udp; | ||
410 | } | ||
411 | |||
412 | ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); | ||
413 | if (ret < 0) { | ||
414 | pr_err("nf_conntrack_ipv4: can't register ipv4\n"); | ||
415 | goto cleanup_icmp; | ||
416 | } | ||
417 | |||
418 | ret = nf_register_hooks(ipv4_conntrack_ops, | 501 | ret = nf_register_hooks(ipv4_conntrack_ops, |
419 | ARRAY_SIZE(ipv4_conntrack_ops)); | 502 | ARRAY_SIZE(ipv4_conntrack_ops)); |
420 | if (ret < 0) { | 503 | if (ret < 0) { |
421 | pr_err("nf_conntrack_ipv4: can't register hooks.\n"); | 504 | pr_err("nf_conntrack_ipv4: can't register hooks.\n"); |
422 | goto cleanup_ipv4; | 505 | goto cleanup_pernet; |
423 | } | 506 | } |
424 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | 507 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) |
425 | ret = nf_conntrack_ipv4_compat_init(); | 508 | ret = nf_conntrack_ipv4_compat_init(); |
@@ -431,14 +514,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) | |||
431 | cleanup_hooks: | 514 | cleanup_hooks: |
432 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); | 515 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); |
433 | #endif | 516 | #endif |
434 | cleanup_ipv4: | 517 | cleanup_pernet: |
435 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); | 518 | unregister_pernet_subsys(&ipv4_net_ops); |
436 | cleanup_icmp: | ||
437 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); | ||
438 | cleanup_udp: | ||
439 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); | ||
440 | cleanup_tcp: | ||
441 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); | ||
442 | cleanup_sockopt: | 519 | cleanup_sockopt: |
443 | nf_unregister_sockopt(&so_getorigdst); | 520 | nf_unregister_sockopt(&so_getorigdst); |
444 | return ret; | 521 | return ret; |
@@ -451,10 +528,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) | |||
451 | nf_conntrack_ipv4_compat_fini(); | 528 | nf_conntrack_ipv4_compat_fini(); |
452 | #endif | 529 | #endif |
453 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); | 530 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); |
454 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); | 531 | unregister_pernet_subsys(&ipv4_net_ops); |
455 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); | ||
456 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); | ||
457 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); | ||
458 | nf_unregister_sockopt(&so_getorigdst); | 532 | nf_unregister_sockopt(&so_getorigdst); |
459 | } | 533 | } |
460 | 534 | ||
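
Note: ipv4_net_init()/ipv4_net_exit() move the l3/l4 protocol registrations behind register_pernet_subsys(), so every network namespace runs them, and keep the usual goto ladder that unwinds in reverse on partial failure. A compact model of that ladder; the three names and the injected failure point are illustrative.

#include <stdio.h>

static int reg(const char *name, int fail)
{
        if (fail) {
                fprintf(stderr, "%s: register failed\n", name);
                return -1;
        }
        printf("%s registered\n", name);
        return 0;
}

static void unreg(const char *name)
{
        printf("%s unregistered\n", name);
}

/* Register in order; on failure, unwind what succeeded, in reverse. */
static int net_init(int fail_at)
{
        if (reg("tcp", fail_at == 1) < 0)
                goto out;
        if (reg("udp", fail_at == 2) < 0)
                goto out_tcp;
        if (reg("icmp", fail_at == 3) < 0)
                goto out_udp;
        return 0;
out_udp:
        unreg("udp");
out_tcp:
        unreg("tcp");
out:
        return -1;
}

int main(void)
{
        net_init(3);                   /* icmp fails: udp and tcp unwound */
        return 0;
}

The matching exit path (ipv4_net_exit() above) is simply the full unwind run unconditionally, in the same reverse order.
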
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 0847e373d33c..5241d997ab75 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
@@ -23,6 +23,11 @@ | |||
23 | 23 | ||
24 | static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; | 24 | static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; |
25 | 25 | ||
26 | static inline struct nf_icmp_net *icmp_pernet(struct net *net) | ||
27 | { | ||
28 | return &net->ct.nf_ct_proto.icmp; | ||
29 | } | ||
30 | |||
26 | static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, | 31 | static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, |
27 | struct nf_conntrack_tuple *tuple) | 32 | struct nf_conntrack_tuple *tuple) |
28 | { | 33 | { |
@@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s, | |||
77 | 82 | ||
78 | static unsigned int *icmp_get_timeouts(struct net *net) | 83 | static unsigned int *icmp_get_timeouts(struct net *net) |
79 | { | 84 | { |
80 | return &nf_ct_icmp_timeout; | 85 | return &icmp_pernet(net)->timeout; |
81 | } | 86 | } |
82 | 87 | ||
83 | /* Returns verdict for packet, or -1 for invalid. */ | 88 | /* Returns verdict for packet, or -1 for invalid. */ |
@@ -274,16 +279,18 @@ static int icmp_nlattr_tuple_size(void) | |||
274 | #include <linux/netfilter/nfnetlink.h> | 279 | #include <linux/netfilter/nfnetlink.h> |
275 | #include <linux/netfilter/nfnetlink_cttimeout.h> | 280 | #include <linux/netfilter/nfnetlink_cttimeout.h> |
276 | 281 | ||
277 | static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) | 282 | static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], |
283 | struct net *net, void *data) | ||
278 | { | 284 | { |
279 | unsigned int *timeout = data; | 285 | unsigned int *timeout = data; |
286 | struct nf_icmp_net *in = icmp_pernet(net); | ||
280 | 287 | ||
281 | if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { | 288 | if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { |
282 | *timeout = | 289 | *timeout = |
283 | ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; | 290 | ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; |
284 | } else { | 291 | } else { |
285 | /* Set default ICMP timeout. */ | 292 | /* Set default ICMP timeout. */ |
286 | *timeout = nf_ct_icmp_timeout; | 293 | *timeout = in->timeout; |
287 | } | 294 | } |
288 | return 0; | 295 | return 0; |
289 | } | 296 | } |
@@ -308,11 +315,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { | |||
308 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ | 315 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ |
309 | 316 | ||
310 | #ifdef CONFIG_SYSCTL | 317 | #ifdef CONFIG_SYSCTL |
311 | static struct ctl_table_header *icmp_sysctl_header; | ||
312 | static struct ctl_table icmp_sysctl_table[] = { | 318 | static struct ctl_table icmp_sysctl_table[] = { |
313 | { | 319 | { |
314 | .procname = "nf_conntrack_icmp_timeout", | 320 | .procname = "nf_conntrack_icmp_timeout", |
315 | .data = &nf_ct_icmp_timeout, | ||
316 | .maxlen = sizeof(unsigned int), | 321 | .maxlen = sizeof(unsigned int), |
317 | .mode = 0644, | 322 | .mode = 0644, |
318 | .proc_handler = proc_dointvec_jiffies, | 323 | .proc_handler = proc_dointvec_jiffies, |
@@ -323,7 +328,6 @@ static struct ctl_table icmp_sysctl_table[] = { | |||
323 | static struct ctl_table icmp_compat_sysctl_table[] = { | 328 | static struct ctl_table icmp_compat_sysctl_table[] = { |
324 | { | 329 | { |
325 | .procname = "ip_conntrack_icmp_timeout", | 330 | .procname = "ip_conntrack_icmp_timeout", |
326 | .data = &nf_ct_icmp_timeout, | ||
327 | .maxlen = sizeof(unsigned int), | 331 | .maxlen = sizeof(unsigned int), |
328 | .mode = 0644, | 332 | .mode = 0644, |
329 | .proc_handler = proc_dointvec_jiffies, | 333 | .proc_handler = proc_dointvec_jiffies, |
@@ -333,6 +337,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = { | |||
333 | #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ | 337 | #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ |
334 | #endif /* CONFIG_SYSCTL */ | 338 | #endif /* CONFIG_SYSCTL */ |
335 | 339 | ||
340 | static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, | ||
341 | struct nf_icmp_net *in) | ||
342 | { | ||
343 | #ifdef CONFIG_SYSCTL | ||
344 | pn->ctl_table = kmemdup(icmp_sysctl_table, | ||
345 | sizeof(icmp_sysctl_table), | ||
346 | GFP_KERNEL); | ||
347 | if (!pn->ctl_table) | ||
348 | return -ENOMEM; | ||
349 | |||
350 | pn->ctl_table[0].data = &in->timeout; | ||
351 | #endif | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, | ||
356 | struct nf_icmp_net *in) | ||
357 | { | ||
358 | #ifdef CONFIG_SYSCTL | ||
359 | #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT | ||
360 | pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, | ||
361 | sizeof(icmp_compat_sysctl_table), | ||
362 | GFP_KERNEL); | ||
363 | if (!pn->ctl_compat_table) | ||
364 | return -ENOMEM; | ||
365 | |||
366 | pn->ctl_compat_table[0].data = &in->timeout; | ||
367 | #endif | ||
368 | #endif | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | static int icmp_init_net(struct net *net, u_int16_t proto) | ||
373 | { | ||
374 | int ret; | ||
375 | struct nf_icmp_net *in = icmp_pernet(net); | ||
376 | struct nf_proto_net *pn = &in->pn; | ||
377 | |||
378 | in->timeout = nf_ct_icmp_timeout; | ||
379 | |||
380 | ret = icmp_kmemdup_compat_sysctl_table(pn, in); | ||
381 | if (ret < 0) | ||
382 | return ret; | ||
383 | |||
384 | ret = icmp_kmemdup_sysctl_table(pn, in); | ||
385 | if (ret < 0) | ||
386 | nf_ct_kfree_compat_sysctl_table(pn); | ||
387 | |||
388 | return ret; | ||
389 | } | ||
390 | |||
391 | static struct nf_proto_net *icmp_get_net_proto(struct net *net) | ||
392 | { | ||
393 | return &net->ct.nf_ct_proto.icmp.pn; | ||
394 | } | ||
395 | |||
336 | struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = | 396 | struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = |
337 | { | 397 | { |
338 | .l3proto = PF_INET, | 398 | .l3proto = PF_INET, |
@@ -362,11 +422,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = | |||
362 | .nla_policy = icmp_timeout_nla_policy, | 422 | .nla_policy = icmp_timeout_nla_policy, |
363 | }, | 423 | }, |
364 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ | 424 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ |
365 | #ifdef CONFIG_SYSCTL | 425 | .init_net = icmp_init_net, |
366 | .ctl_table_header = &icmp_sysctl_header, | 426 | .get_net_proto = icmp_get_net_proto, |
367 | .ctl_table = icmp_sysctl_table, | ||
368 | #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT | ||
369 | .ctl_compat_table = icmp_compat_sysctl_table, | ||
370 | #endif | ||
371 | #endif | ||
372 | }; | 427 | }; |
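
Note: icmp_init_net() kmemdup()s the sysctl template for each namespace and points the entry's .data at that namespace's timeout, which is why the shared tables above lose their .data initializers. A userspace sketch of the duplicate-and-rebind step; struct ctl and the default values are stand-ins for ctl_table and nf_ct_icmp_timeout.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctl { const char *name; void *data; };

/* Shared template: .data left NULL, bound per "namespace" below. */
static const struct ctl icmp_template[] = {
        { .name = "nf_conntrack_icmp_timeout", .data = NULL },
};

struct icmp_net { unsigned int timeout; struct ctl *table; };

static int icmp_init_net(struct icmp_net *in, unsigned int dflt)
{
        in->timeout = dflt;
        in->table = malloc(sizeof(icmp_template));   /* kernel: kmemdup() */
        if (!in->table)
                return -1;
        memcpy(in->table, icmp_template, sizeof(icmp_template));
        in->table[0].data = &in->timeout;            /* per-net binding */
        return 0;
}

int main(void)
{
        struct icmp_net a, b;

        if (icmp_init_net(&a, 30) || icmp_init_net(&b, 60))
                return 1;
        /* Each namespace's sysctl entry points at its own value. */
        printf("a=%u b=%u\n", *(unsigned int *)a.table[0].data,
               *(unsigned int *)b.table[0].data);
        free(a.table);
        free(b.table);
        return 0;
}
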
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 9bb1b8a37a22..742815518b0f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c | |||
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { | |||
94 | { | 94 | { |
95 | .hook = ipv4_conntrack_defrag, | 95 | .hook = ipv4_conntrack_defrag, |
96 | .owner = THIS_MODULE, | 96 | .owner = THIS_MODULE, |
97 | .pf = PF_INET, | 97 | .pf = NFPROTO_IPV4, |
98 | .hooknum = NF_INET_PRE_ROUTING, | 98 | .hooknum = NF_INET_PRE_ROUTING, |
99 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | 99 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, |
100 | }, | 100 | }, |
101 | { | 101 | { |
102 | .hook = ipv4_conntrack_defrag, | 102 | .hook = ipv4_conntrack_defrag, |
103 | .owner = THIS_MODULE, | 103 | .owner = THIS_MODULE, |
104 | .pf = PF_INET, | 104 | .pf = NFPROTO_IPV4, |
105 | .hooknum = NF_INET_LOCAL_OUT, | 105 | .hooknum = NF_INET_LOCAL_OUT, |
106 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | 106 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, |
107 | }, | 107 | }, |
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 7b22382ff0e9..3c04d24e2976 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c | |||
@@ -13,10 +13,10 @@ | |||
13 | #include <linux/skbuff.h> | 13 | #include <linux/skbuff.h> |
14 | #include <linux/udp.h> | 14 | #include <linux/udp.h> |
15 | 15 | ||
16 | #include <net/netfilter/nf_nat_helper.h> | ||
17 | #include <net/netfilter/nf_nat_rule.h> | ||
18 | #include <net/netfilter/nf_conntrack_helper.h> | 16 | #include <net/netfilter/nf_conntrack_helper.h> |
19 | #include <net/netfilter/nf_conntrack_expect.h> | 17 | #include <net/netfilter/nf_conntrack_expect.h> |
18 | #include <net/netfilter/nf_nat_helper.h> | ||
19 | #include <net/netfilter/nf_nat_rule.h> | ||
20 | #include <linux/netfilter/nf_conntrack_amanda.h> | 20 | #include <linux/netfilter/nf_conntrack_amanda.h> |
21 | 21 | ||
22 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); | 22 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); |
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index abb52adf5acd..44b082fd48ab 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c | |||
@@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = { | |||
691 | .expectfn = nf_nat_follow_master, | 691 | .expectfn = nf_nat_follow_master, |
692 | }; | 692 | }; |
693 | 693 | ||
694 | static struct nfq_ct_nat_hook nfq_ct_nat = { | ||
695 | .seq_adjust = nf_nat_tcp_seq_adjust, | ||
696 | }; | ||
697 | |||
694 | static int __init nf_nat_init(void) | 698 | static int __init nf_nat_init(void) |
695 | { | 699 | { |
696 | size_t i; | 700 | size_t i; |
@@ -731,6 +735,7 @@ static int __init nf_nat_init(void) | |||
731 | nfnetlink_parse_nat_setup); | 735 | nfnetlink_parse_nat_setup); |
732 | BUG_ON(nf_ct_nat_offset != NULL); | 736 | BUG_ON(nf_ct_nat_offset != NULL); |
733 | RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); | 737 | RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); |
738 | RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat); | ||
734 | return 0; | 739 | return 0; |
735 | 740 | ||
736 | cleanup_extend: | 741 | cleanup_extend: |
@@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void) | |||
747 | RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); | 752 | RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); |
748 | RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); | 753 | RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); |
749 | RCU_INIT_POINTER(nf_ct_nat_offset, NULL); | 754 | RCU_INIT_POINTER(nf_ct_nat_offset, NULL); |
755 | RCU_INIT_POINTER(nfq_ct_nat_hook, NULL); | ||
750 | synchronize_net(); | 756 | synchronize_net(); |
751 | } | 757 | } |
752 | 758 | ||
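
Note: nf_nat_init() publishes a seq_adjust callback through the new global nfq_ct_nat_hook pointer and nf_nat_cleanup() clears it, the usual pattern for an optional cross-module hook consumed under RCU. A plain-pointer model; the RCU publication and the hook's real signature are simplified here.

#include <stddef.h>
#include <stdio.h>

struct nat_hook { void (*seq_adjust)(int off); };

/* Global hook slot: NULL whenever the provider module is absent. */
static struct nat_hook *nat_hook;

static void seq_adjust(int off) { printf("adjusting seq by %d\n", off); }
static struct nat_hook my_hook = { .seq_adjust = seq_adjust };

static void consumer(int off)
{
        struct nat_hook *h = nat_hook;  /* kernel: rcu_dereference() */

        if (h && h->seq_adjust)
                h->seq_adjust(off);
}

int main(void)
{
        consumer(4);                    /* hook absent: no-op */
        nat_hook = &my_hook;            /* kernel: RCU_INIT_POINTER(...) */
        consumer(4);                    /* hook present: fires */
        nat_hook = NULL;                /* module exit clears the slot */
        return 0;
}
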
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index cad29c121318..c6784a18c1c4 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c | |||
@@ -95,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, | |||
95 | unsigned char **data, | 95 | unsigned char **data, |
96 | TransportAddress *taddr, int count) | 96 | TransportAddress *taddr, int count) |
97 | { | 97 | { |
98 | const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 98 | const struct nf_ct_h323_master *info = nfct_help_data(ct); |
99 | int dir = CTINFO2DIR(ctinfo); | 99 | int dir = CTINFO2DIR(ctinfo); |
100 | int i; | 100 | int i; |
101 | __be16 port; | 101 | __be16 port; |
@@ -178,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, | |||
178 | struct nf_conntrack_expect *rtp_exp, | 178 | struct nf_conntrack_expect *rtp_exp, |
179 | struct nf_conntrack_expect *rtcp_exp) | 179 | struct nf_conntrack_expect *rtcp_exp) |
180 | { | 180 | { |
181 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 181 | struct nf_ct_h323_master *info = nfct_help_data(ct); |
182 | int dir = CTINFO2DIR(ctinfo); | 182 | int dir = CTINFO2DIR(ctinfo); |
183 | int i; | 183 | int i; |
184 | u_int16_t nated_port; | 184 | u_int16_t nated_port; |
@@ -330,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, | |||
330 | TransportAddress *taddr, __be16 port, | 330 | TransportAddress *taddr, __be16 port, |
331 | struct nf_conntrack_expect *exp) | 331 | struct nf_conntrack_expect *exp) |
332 | { | 332 | { |
333 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 333 | struct nf_ct_h323_master *info = nfct_help_data(ct); |
334 | int dir = CTINFO2DIR(ctinfo); | 334 | int dir = CTINFO2DIR(ctinfo); |
335 | u_int16_t nated_port = ntohs(port); | 335 | u_int16_t nated_port = ntohs(port); |
336 | 336 | ||
@@ -419,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, | |||
419 | unsigned char **data, TransportAddress *taddr, int idx, | 419 | unsigned char **data, TransportAddress *taddr, int idx, |
420 | __be16 port, struct nf_conntrack_expect *exp) | 420 | __be16 port, struct nf_conntrack_expect *exp) |
421 | { | 421 | { |
422 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; | 422 | struct nf_ct_h323_master *info = nfct_help_data(ct); |
423 | int dir = CTINFO2DIR(ctinfo); | 423 | int dir = CTINFO2DIR(ctinfo); |
424 | u_int16_t nated_port = ntohs(port); | 424 | u_int16_t nated_port = ntohs(port); |
425 | union nf_inet_addr addr; | 425 | union nf_inet_addr addr; |
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index af65958f6308..2e59ad0b90ca 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c | |||
@@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, | |||
153 | } | 153 | } |
154 | EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); | 154 | EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); |
155 | 155 | ||
156 | void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, | ||
157 | u32 ctinfo, int off) | ||
158 | { | ||
159 | const struct tcphdr *th; | ||
160 | |||
161 | if (nf_ct_protonum(ct) != IPPROTO_TCP) | ||
162 | return; | ||
163 | |||
164 | th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); | ||
165 | nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); | ||
166 | } | ||
167 | EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust); | ||
168 | |||
156 | static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, | 169 | static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, |
157 | int datalen, __sum16 *check, int oldlen) | 170 | int datalen, __sum16 *check, int oldlen) |
158 | { | 171 | { |
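The new nf_nat_tcp_seq_adjust() is a thin exported wrapper: it bails out for non-TCP conntracks, locates the TCP header via the IP header length, and feeds th->seq to nf_nat_set_seq_adjust(). A caller that has just changed the payload length would use it roughly like this (sketch; skb, ct, ctinfo and the length bookkeeping are assumed to be in scope):

    /* Sketch: after a helper rewrites payload and changes its length,
     * record the offset so later sequence numbers get fixed up.
     */
    int off = new_len - old_len;            /* assumed bookkeeping */

    if (off != 0)
            nf_nat_tcp_seq_adjust(skb, ct, ctinfo, off);
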
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index c273d58980ae..388140881ebe 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c | |||
@@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct, | |||
49 | const struct nf_nat_pptp *nat_pptp_info; | 49 | const struct nf_nat_pptp *nat_pptp_info; |
50 | struct nf_nat_ipv4_range range; | 50 | struct nf_nat_ipv4_range range; |
51 | 51 | ||
52 | ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; | 52 | ct_pptp_info = nfct_help_data(master); |
53 | nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; | 53 | nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; |
54 | 54 | ||
55 | /* And here goes the grand finale of corrosion... */ | 55 | /* And here goes the grand finale of corrosion... */ |
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb, | |||
123 | __be16 new_callid; | 123 | __be16 new_callid; |
124 | unsigned int cid_off; | 124 | unsigned int cid_off; |
125 | 125 | ||
126 | ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; | 126 | ct_pptp_info = nfct_help_data(ct); |
127 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; | 127 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; |
128 | 128 | ||
129 | new_callid = ct_pptp_info->pns_call_id; | 129 | new_callid = ct_pptp_info->pns_call_id; |
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig, | |||
192 | struct nf_ct_pptp_master *ct_pptp_info; | 192 | struct nf_ct_pptp_master *ct_pptp_info; |
193 | struct nf_nat_pptp *nat_pptp_info; | 193 | struct nf_nat_pptp *nat_pptp_info; |
194 | 194 | ||
195 | ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; | 195 | ct_pptp_info = nfct_help_data(ct); |
196 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; | 196 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; |
197 | 197 | ||
198 | /* save original PAC call ID in nat_info */ | 198 | /* save original PAC call ID in nat_info */ |
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 746edec8b86e..bac712293fd6 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c | |||
@@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, | |||
405 | 405 | ||
406 | ptr = *octets; | 406 | ptr = *octets; |
407 | while (ctx->pointer < eoc) { | 407 | while (ctx->pointer < eoc) { |
408 | if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { | 408 | if (!asn1_octet_decode(ctx, ptr++)) { |
409 | kfree(*octets); | 409 | kfree(*octets); |
410 | *octets = NULL; | 410 | *octets = NULL; |
411 | return 0; | 411 | return 0; |
@@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
759 | } | 759 | } |
760 | break; | 760 | break; |
761 | case SNMP_OBJECTID: | 761 | case SNMP_OBJECTID: |
762 | if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { | 762 | if (!asn1_oid_decode(ctx, end, &lp, &len)) { |
763 | kfree(id); | 763 | kfree(id); |
764 | return 0; | 764 | return 0; |
765 | } | 765 | } |
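Both SNMP hunks only drop casts; this compiles cleanly either because the casts were redundant all along or because the local declarations were adjusted in parts of the patch not shown here. Sketch of the idea, under that assumption:

    /* Sketch: with ptr declared as unsigned char * (assumed), the call
     * needs no cast at all.
     */
    unsigned char *ptr = *octets;

    if (!asn1_octet_decode(ctx, ptr++))
            return 0;
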
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c index a2901bf829c0..9dbb8d284f99 100644 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ b/net/ipv4/netfilter/nf_nat_tftp.c | |||
@@ -8,10 +8,10 @@ | |||
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/udp.h> | 9 | #include <linux/udp.h> |
10 | 10 | ||
11 | #include <net/netfilter/nf_nat_helper.h> | ||
12 | #include <net/netfilter/nf_nat_rule.h> | ||
13 | #include <net/netfilter/nf_conntrack_helper.h> | 11 | #include <net/netfilter/nf_conntrack_helper.h> |
14 | #include <net/netfilter/nf_conntrack_expect.h> | 12 | #include <net/netfilter/nf_conntrack_expect.h> |
13 | #include <net/netfilter/nf_nat_helper.h> | ||
14 | #include <net/netfilter/nf_nat_rule.h> | ||
15 | #include <linux/netfilter/nf_conntrack_tftp.h> | 15 | #include <linux/netfilter/nf_conntrack_tftp.h> |
16 | 16 | ||
17 | MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); | 17 | MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); |
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2c00e8bf684d..340fcf29a966 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c | |||
@@ -371,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info) | |||
371 | break; | 371 | break; |
372 | case ICMP_DEST_UNREACH: | 372 | case ICMP_DEST_UNREACH: |
373 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ | 373 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ |
374 | ipv4_sk_update_pmtu(skb, sk, info); | ||
374 | if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { | 375 | if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { |
375 | err = EMSGSIZE; | 376 | err = EMSGSIZE; |
376 | harderr = 1; | 377 | harderr = 1; |
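The ping error handler now reports ICMP_FRAG_NEEDED to the routing layer before its own EMSGSIZE handling; raw.c below gains the identical call, and the helper itself is introduced in the route.c portion of this patch. The shared pattern in a transport's error handler:

    /* Forward the MTU quoted in the ICMP error ('info') to the route
     * layer first, then do the socket-level error reporting.
     */
    if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
            ipv4_sk_update_pmtu(skb, sk, info);
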
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9ae5c01cd0b2..8918eff1426d 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; | |||
36 | 36 | ||
37 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) | 37 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) |
38 | { | 38 | { |
39 | int hash = protocol & (MAX_INET_PROTOS - 1); | 39 | return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], |
40 | |||
41 | return !cmpxchg((const struct net_protocol **)&inet_protos[hash], | ||
42 | NULL, prot) ? 0 : -1; | 40 | NULL, prot) ? 0 : -1; |
43 | } | 41 | } |
44 | EXPORT_SYMBOL(inet_add_protocol); | 42 | EXPORT_SYMBOL(inet_add_protocol); |
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol); | |||
49 | 47 | ||
50 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) | 48 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) |
51 | { | 49 | { |
52 | int ret, hash = protocol & (MAX_INET_PROTOS - 1); | 50 | int ret; |
53 | 51 | ||
54 | ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], | 52 | ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], |
55 | prot, NULL) == prot) ? 0 : -1; | 53 | prot, NULL) == prot) ? 0 : -1; |
56 | 54 | ||
57 | synchronize_net(); | 55 | synchronize_net(); |
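inet_add_protocol()/inet_del_protocol() stop masking with MAX_INET_PROTOS - 1 and index inet_protos[] directly; together with the __u8 protocol parameter in inet_netns_ok() earlier in the patch, this stays in bounds only if MAX_INET_PROTOS covers the full 8-bit protocol space (256). A compile-time guard one could add under that assumption:

    /* Sketch: guard the no-masking assumption at build time, e.g. from
     * an init path.  A u8 protocol number then cannot index out of
     * bounds.
     */
    static int __init proto_array_check(void)
    {
            BUILD_BUG_ON(MAX_INET_PROTOS != 256);
            return 0;
    }
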
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4032b818f3e4..659ddfb10947 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -216,6 +216,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) | |||
216 | int err = 0; | 216 | int err = 0; |
217 | int harderr = 0; | 217 | int harderr = 0; |
218 | 218 | ||
219 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) | ||
220 | ipv4_sk_update_pmtu(skb, sk, info); | ||
221 | |||
219 | /* Report error on raw socket, if: | 222 | /* Report error on raw socket, if: |
220 | 1. User requested ip_recverr. | 223 | 1. User requested ip_recverr. |
221 | 2. Socket is connected (otherwise the error indication | 224 | 2. Socket is connected (otherwise the error indication |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98b30d08efe9..95bfa1ba5b28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -158,40 +158,13 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | |||
158 | 158 | ||
159 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) | 159 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) |
160 | { | 160 | { |
161 | struct rtable *rt = (struct rtable *) dst; | 161 | WARN_ON(1); |
162 | struct inet_peer *peer; | 162 | return NULL; |
163 | u32 *p = NULL; | ||
164 | |||
165 | if (!rt->peer) | ||
166 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
167 | |||
168 | peer = rt->peer; | ||
169 | if (peer) { | ||
170 | u32 *old_p = __DST_METRICS_PTR(old); | ||
171 | unsigned long prev, new; | ||
172 | |||
173 | p = peer->metrics; | ||
174 | if (inet_metrics_new(peer)) | ||
175 | memcpy(p, old_p, sizeof(u32) * RTAX_MAX); | ||
176 | |||
177 | new = (unsigned long) p; | ||
178 | prev = cmpxchg(&dst->_metrics, old, new); | ||
179 | |||
180 | if (prev != old) { | ||
181 | p = __DST_METRICS_PTR(prev); | ||
182 | if (prev & DST_METRICS_READ_ONLY) | ||
183 | p = NULL; | ||
184 | } else { | ||
185 | if (rt->fi) { | ||
186 | fib_info_put(rt->fi); | ||
187 | rt->fi = NULL; | ||
188 | } | ||
189 | } | ||
190 | } | ||
191 | return p; | ||
192 | } | 163 | } |
193 | 164 | ||
194 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); | 165 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, |
166 | struct sk_buff *skb, | ||
167 | const void *daddr); | ||
195 | 168 | ||
196 | static struct dst_ops ipv4_dst_ops = { | 169 | static struct dst_ops ipv4_dst_ops = { |
197 | .family = AF_INET, | 170 | .family = AF_INET, |
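ipv4_cow_metrics() collapses to WARN_ON(1)/return NULL: with inetpeer no longer backing route metrics, IPv4 routes never copy-on-write their metrics array, so any remaining caller asking for writable metrics is a bug worth a backtrace. Readers are unaffected, since metric reads never COW:

    /* Sketch: dst_metric() and friends only ever read the metrics
     * array, so the WARN_ON stub never fires for them.
     */
    static u32 route_window(const struct dst_entry *dst)
    {
            return dst_metric(dst, RTAX_WINDOW);
    }

The forward declaration of ipv4_neigh_lookup() also grows an skb parameter here; the new lookup logic appears further down.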
@@ -421,29 +394,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) | |||
421 | "HHUptod\tSpecDst"); | 394 | "HHUptod\tSpecDst"); |
422 | else { | 395 | else { |
423 | struct rtable *r = v; | 396 | struct rtable *r = v; |
424 | struct neighbour *n; | 397 | int len; |
425 | int len, HHUptod; | ||
426 | |||
427 | rcu_read_lock(); | ||
428 | n = dst_get_neighbour_noref(&r->dst); | ||
429 | HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; | ||
430 | rcu_read_unlock(); | ||
431 | 398 | ||
432 | seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" | 399 | seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" |
433 | "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", | 400 | "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", |
434 | r->dst.dev ? r->dst.dev->name : "*", | 401 | r->dst.dev ? r->dst.dev->name : "*", |
435 | (__force u32)r->rt_dst, | 402 | (__force u32)r->rt_dst, |
436 | (__force u32)r->rt_gateway, | 403 | (__force u32)r->rt_gateway, |
437 | r->rt_flags, atomic_read(&r->dst.__refcnt), | 404 | r->rt_flags, atomic_read(&r->dst.__refcnt), |
438 | r->dst.__use, 0, (__force u32)r->rt_src, | 405 | r->dst.__use, 0, (__force u32)r->rt_src, |
439 | dst_metric_advmss(&r->dst) + 40, | 406 | dst_metric_advmss(&r->dst) + 40, |
440 | dst_metric(&r->dst, RTAX_WINDOW), | 407 | dst_metric(&r->dst, RTAX_WINDOW), 0, |
441 | (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + | 408 | r->rt_key_tos, |
442 | dst_metric(&r->dst, RTAX_RTTVAR)), | 409 | -1, 0, 0, &len); |
443 | r->rt_key_tos, | ||
444 | -1, | ||
445 | HHUptod, | ||
446 | r->rt_spec_dst, &len); | ||
447 | 410 | ||
448 | seq_printf(seq, "%*s\n", 127 - len, ""); | 411 | seq_printf(seq, "%*s\n", 127 - len, ""); |
449 | } | 412 | } |
@@ -680,7 +643,7 @@ static inline int rt_fast_clean(struct rtable *rth) | |||
680 | static inline int rt_valuable(struct rtable *rth) | 643 | static inline int rt_valuable(struct rtable *rth) |
681 | { | 644 | { |
682 | return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || | 645 | return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || |
683 | (rth->peer && rth->peer->pmtu_expires); | 646 | rth->dst.expires; |
684 | } | 647 | } |
685 | 648 | ||
686 | static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) | 649 | static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) |
@@ -873,34 +836,22 @@ static void rt_check_expire(void) | |||
873 | while ((rth = rcu_dereference_protected(*rthp, | 836 | while ((rth = rcu_dereference_protected(*rthp, |
874 | lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { | 837 | lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { |
875 | prefetch(rth->dst.rt_next); | 838 | prefetch(rth->dst.rt_next); |
876 | if (rt_is_expired(rth)) { | 839 | if (rt_is_expired(rth) || |
840 | rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { | ||
877 | *rthp = rth->dst.rt_next; | 841 | *rthp = rth->dst.rt_next; |
878 | rt_free(rth); | 842 | rt_free(rth); |
879 | continue; | 843 | continue; |
880 | } | 844 | } |
881 | if (rth->dst.expires) { | ||
882 | /* Entry is expired even if it is in use */ | ||
883 | if (time_before_eq(jiffies, rth->dst.expires)) { | ||
884 | nofree: | ||
885 | tmo >>= 1; | ||
886 | rthp = &rth->dst.rt_next; | ||
887 | /* | ||
888 | * We only count entries on | ||
889 | * a chain with equal hash inputs once | ||
890 | * so that entries for different QOS | ||
891 | * levels, and other non-hash input | ||
892 | * attributes don't unfairly skew | ||
893 | * the length computation | ||
894 | */ | ||
895 | length += has_noalias(rt_hash_table[i].chain, rth); | ||
896 | continue; | ||
897 | } | ||
898 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) | ||
899 | goto nofree; | ||
900 | 845 | ||
901 | /* Cleanup aged off entries. */ | 846 | /* We only count entries on a chain with equal |
902 | *rthp = rth->dst.rt_next; | 847 | * hash inputs once so that entries for |
903 | rt_free(rth); | 848 | * different QOS levels, and other non-hash |
849 | * input attributes don't unfairly skew the | ||
850 | * length computation | ||
851 | */ | ||
852 | tmo >>= 1; | ||
853 | rthp = &rth->dst.rt_next; | ||
854 | length += has_noalias(rt_hash_table[i].chain, rth); | ||
904 | } | 855 | } |
905 | spin_unlock_bh(rt_hash_lock_addr(i)); | 856 | spin_unlock_bh(rt_hash_lock_addr(i)); |
906 | sum += length; | 857 | sum += length; |
@@ -938,7 +889,6 @@ static void rt_cache_invalidate(struct net *net) | |||
938 | 889 | ||
939 | get_random_bytes(&shuffle, sizeof(shuffle)); | 890 | get_random_bytes(&shuffle, sizeof(shuffle)); |
940 | atomic_add(shuffle + 1U, &net->ipv4.rt_genid); | 891 | atomic_add(shuffle + 1U, &net->ipv4.rt_genid); |
941 | inetpeer_invalidate_tree(AF_INET); | ||
942 | } | 892 | } |
943 | 893 | ||
944 | /* | 894 | /* |
@@ -1111,20 +1061,20 @@ static int slow_chain_length(const struct rtable *head) | |||
1111 | return length >> FRACT_BITS; | 1061 | return length >> FRACT_BITS; |
1112 | } | 1062 | } |
1113 | 1063 | ||
1114 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) | 1064 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, |
1065 | struct sk_buff *skb, | ||
1066 | const void *daddr) | ||
1115 | { | 1067 | { |
1116 | static const __be32 inaddr_any = 0; | ||
1117 | struct net_device *dev = dst->dev; | 1068 | struct net_device *dev = dst->dev; |
1118 | const __be32 *pkey = daddr; | 1069 | const __be32 *pkey = daddr; |
1119 | const struct rtable *rt; | 1070 | const struct rtable *rt; |
1120 | struct neighbour *n; | 1071 | struct neighbour *n; |
1121 | 1072 | ||
1122 | rt = (const struct rtable *) dst; | 1073 | rt = (const struct rtable *) dst; |
1123 | 1074 | if (rt->rt_gateway) | |
1124 | if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) | ||
1125 | pkey = &inaddr_any; | ||
1126 | else if (rt->rt_gateway) | ||
1127 | pkey = (const __be32 *) &rt->rt_gateway; | 1075 | pkey = (const __be32 *) &rt->rt_gateway; |
1076 | else if (skb) | ||
1077 | pkey = &ip_hdr(skb)->daddr; | ||
1128 | 1078 | ||
1129 | n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); | 1079 | n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); |
1130 | if (n) | 1080 | if (n) |
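ipv4_neigh_lookup() drops the old inaddr_any special case for loopback and point-to-point devices and instead prefers, in order: the route's gateway, the packet's own daddr when an skb is available, then the caller-supplied daddr. The new skb-less form is what ip_rt_redirect() below relies on:

    /* Sketch: a caller holding only an address (no skb) resolves it
     * directly, as ip_rt_redirect() does for a proposed new gateway.
     * The IS_ERR check is defensive; neigh_create() can return ERR_PTR.
     */
    struct neighbour *n;

    n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
    if (n && !IS_ERR(n)) {
            /* ... inspect n->nud_state ... */
            neigh_release(n);
    }
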
@@ -1132,16 +1082,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo | |||
1132 | return neigh_create(&arp_tbl, pkey, dev); | 1082 | return neigh_create(&arp_tbl, pkey, dev); |
1133 | } | 1083 | } |
1134 | 1084 | ||
1135 | static int rt_bind_neighbour(struct rtable *rt) | ||
1136 | { | ||
1137 | struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); | ||
1138 | if (IS_ERR(n)) | ||
1139 | return PTR_ERR(n); | ||
1140 | dst_set_neighbour(&rt->dst, n); | ||
1141 | |||
1142 | return 0; | ||
1143 | } | ||
1144 | |||
1145 | static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, | 1085 | static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, |
1146 | struct sk_buff *skb, int ifindex) | 1086 | struct sk_buff *skb, int ifindex) |
1147 | { | 1087 | { |
@@ -1150,7 +1090,6 @@ static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, | |||
1150 | unsigned long now; | 1090 | unsigned long now; |
1151 | u32 min_score; | 1091 | u32 min_score; |
1152 | int chain_length; | 1092 | int chain_length; |
1153 | int attempts = !in_softirq(); | ||
1154 | 1093 | ||
1155 | restart: | 1094 | restart: |
1156 | chain_length = 0; | 1095 | chain_length = 0; |
@@ -1159,7 +1098,7 @@ restart: | |||
1159 | candp = NULL; | 1098 | candp = NULL; |
1160 | now = jiffies; | 1099 | now = jiffies; |
1161 | 1100 | ||
1162 | if (!rt_caching(dev_net(rt->dst.dev))) { | 1101 | if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) { |
1163 | /* | 1102 | /* |
1164 | * If we're not caching, just tell the caller we | 1103 | * If we're not caching, just tell the caller we |
1165 | * were successful and don't touch the route. The | 1104 | * were successful and don't touch the route. The |
@@ -1177,15 +1116,6 @@ restart: | |||
1177 | */ | 1116 | */ |
1178 | 1117 | ||
1179 | rt->dst.flags |= DST_NOCACHE; | 1118 | rt->dst.flags |= DST_NOCACHE; |
1180 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { | ||
1181 | int err = rt_bind_neighbour(rt); | ||
1182 | if (err) { | ||
1183 | net_warn_ratelimited("Neighbour table failure & not caching routes\n"); | ||
1184 | ip_rt_put(rt); | ||
1185 | return ERR_PTR(err); | ||
1186 | } | ||
1187 | } | ||
1188 | |||
1189 | goto skip_hashing; | 1119 | goto skip_hashing; |
1190 | } | 1120 | } |
1191 | 1121 | ||
@@ -1268,40 +1198,6 @@ restart: | |||
1268 | } | 1198 | } |
1269 | } | 1199 | } |
1270 | 1200 | ||
1271 | /* Try to bind route to arp only if it is output | ||
1272 | route or unicast forwarding path. | ||
1273 | */ | ||
1274 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { | ||
1275 | int err = rt_bind_neighbour(rt); | ||
1276 | if (err) { | ||
1277 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
1278 | |||
1279 | if (err != -ENOBUFS) { | ||
1280 | rt_drop(rt); | ||
1281 | return ERR_PTR(err); | ||
1282 | } | ||
1283 | |||
1284 | /* Neighbour tables are full and nothing | ||
1285 | can be released. Try to shrink route cache, | ||
1286 | it is most likely it holds some neighbour records. | ||
1287 | */ | ||
1288 | if (attempts-- > 0) { | ||
1289 | int saved_elasticity = ip_rt_gc_elasticity; | ||
1290 | int saved_int = ip_rt_gc_min_interval; | ||
1291 | ip_rt_gc_elasticity = 1; | ||
1292 | ip_rt_gc_min_interval = 0; | ||
1293 | rt_garbage_collect(&ipv4_dst_ops); | ||
1294 | ip_rt_gc_min_interval = saved_int; | ||
1295 | ip_rt_gc_elasticity = saved_elasticity; | ||
1296 | goto restart; | ||
1297 | } | ||
1298 | |||
1299 | net_warn_ratelimited("Neighbour table overflow\n"); | ||
1300 | rt_drop(rt); | ||
1301 | return ERR_PTR(-ENOBUFS); | ||
1302 | } | ||
1303 | } | ||
1304 | |||
1305 | rt->dst.rt_next = rt_hash_table[hash].chain; | 1201 | rt->dst.rt_next = rt_hash_table[hash].chain; |
1306 | 1202 | ||
1307 | /* | 1203 | /* |
@@ -1319,25 +1215,6 @@ skip_hashing: | |||
1319 | return rt; | 1215 | return rt; |
1320 | } | 1216 | } |
1321 | 1217 | ||
1322 | static atomic_t __rt_peer_genid = ATOMIC_INIT(0); | ||
1323 | |||
1324 | static u32 rt_peer_genid(void) | ||
1325 | { | ||
1326 | return atomic_read(&__rt_peer_genid); | ||
1327 | } | ||
1328 | |||
1329 | void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) | ||
1330 | { | ||
1331 | struct inet_peer *peer; | ||
1332 | |||
1333 | peer = inet_getpeer_v4(daddr, create); | ||
1334 | |||
1335 | if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) | ||
1336 | inet_putpeer(peer); | ||
1337 | else | ||
1338 | rt->rt_peer_genid = rt_peer_genid(); | ||
1339 | } | ||
1340 | |||
1341 | /* | 1218 | /* |
1342 | * Peer allocation may fail only in serious out-of-memory conditions. However | 1219 | * Peer allocation may fail only in serious out-of-memory conditions. However |
1343 | * we still can generate some output. | 1220 | * we still can generate some output. |
@@ -1360,21 +1237,15 @@ static void ip_select_fb_ident(struct iphdr *iph) | |||
1360 | 1237 | ||
1361 | void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) | 1238 | void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) |
1362 | { | 1239 | { |
1363 | struct rtable *rt = (struct rtable *) dst; | 1240 | struct net *net = dev_net(dst->dev); |
1364 | 1241 | struct inet_peer *peer; | |
1365 | if (rt && !(rt->dst.flags & DST_NOPEER)) { | ||
1366 | if (rt->peer == NULL) | ||
1367 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1368 | 1242 | ||
1369 | /* If peer is attached to destination, it is never detached, | 1243 | peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); |
1370 | so that we need not to grab a lock to dereference it. | 1244 | if (peer) { |
1371 | */ | 1245 | iph->id = htons(inet_getid(peer, more)); |
1372 | if (rt->peer) { | 1246 | inet_putpeer(peer); |
1373 | iph->id = htons(inet_getid(rt->peer, more)); | 1247 | return; |
1374 | return; | 1248 | } |
1375 | } | ||
1376 | } else if (!rt) | ||
1377 | pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0)); | ||
1378 | 1249 | ||
1379 | ip_select_fb_ident(iph); | 1250 | ip_select_fb_ident(iph); |
1380 | } | 1251 | } |
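IP ID generation stops going through rt->peer and does an on-demand lookup in the per-namespace peer base (net->ipv4.peers), with an explicit inet_putpeer() when done; only if no peer can be allocated does it fall back to ip_select_fb_ident(). The get/use/put discipline, annotated:

    /* Sketch of the lookup discipline now used for IP ID selection */
    struct inet_peer *peer;

    peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); /* 1 = may create */
    if (peer) {
            iph->id = htons(inet_getid(peer, more));
            inet_putpeer(peer);     /* the lookup took a reference; drop it */
            return;
    }
    ip_select_fb_ident(iph);        /* out-of-memory fallback */
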
@@ -1400,32 +1271,6 @@ static void rt_del(unsigned int hash, struct rtable *rt) | |||
1400 | spin_unlock_bh(rt_hash_lock_addr(hash)); | 1271 | spin_unlock_bh(rt_hash_lock_addr(hash)); |
1401 | } | 1272 | } |
1402 | 1273 | ||
1403 | static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) | ||
1404 | { | ||
1405 | struct rtable *rt = (struct rtable *) dst; | ||
1406 | __be32 orig_gw = rt->rt_gateway; | ||
1407 | struct neighbour *n, *old_n; | ||
1408 | |||
1409 | dst_confirm(&rt->dst); | ||
1410 | |||
1411 | rt->rt_gateway = peer->redirect_learned.a4; | ||
1412 | |||
1413 | n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); | ||
1414 | if (IS_ERR(n)) { | ||
1415 | rt->rt_gateway = orig_gw; | ||
1416 | return; | ||
1417 | } | ||
1418 | old_n = xchg(&rt->dst._neighbour, n); | ||
1419 | if (old_n) | ||
1420 | neigh_release(old_n); | ||
1421 | if (!(n->nud_state & NUD_VALID)) { | ||
1422 | neigh_event_send(n, NULL); | ||
1423 | } else { | ||
1424 | rt->rt_flags |= RTCF_REDIRECTED; | ||
1425 | call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); | ||
1426 | } | ||
1427 | } | ||
1428 | |||
1429 | /* called in rcu_read_lock() section */ | 1274 | /* called in rcu_read_lock() section */ |
1430 | void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | 1275 | void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, |
1431 | __be32 saddr, struct net_device *dev) | 1276 | __be32 saddr, struct net_device *dev) |
@@ -1434,7 +1279,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1434 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 1279 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
1435 | __be32 skeys[2] = { saddr, 0 }; | 1280 | __be32 skeys[2] = { saddr, 0 }; |
1436 | int ikeys[2] = { dev->ifindex, 0 }; | 1281 | int ikeys[2] = { dev->ifindex, 0 }; |
1437 | struct inet_peer *peer; | ||
1438 | struct net *net; | 1282 | struct net *net; |
1439 | 1283 | ||
1440 | if (!in_dev) | 1284 | if (!in_dev) |
@@ -1467,6 +1311,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1467 | rthp = &rt_hash_table[hash].chain; | 1311 | rthp = &rt_hash_table[hash].chain; |
1468 | 1312 | ||
1469 | while ((rt = rcu_dereference(*rthp)) != NULL) { | 1313 | while ((rt = rcu_dereference(*rthp)) != NULL) { |
1314 | struct neighbour *n; | ||
1315 | |||
1470 | rthp = &rt->dst.rt_next; | 1316 | rthp = &rt->dst.rt_next; |
1471 | 1317 | ||
1472 | if (rt->rt_key_dst != daddr || | 1318 | if (rt->rt_key_dst != daddr || |
@@ -1480,16 +1326,16 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | |||
1480 | rt->rt_gateway != old_gw) | 1326 | rt->rt_gateway != old_gw) |
1481 | continue; | 1327 | continue; |
1482 | 1328 | ||
1483 | if (!rt->peer) | 1329 | n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); |
1484 | rt_bind_peer(rt, rt->rt_dst, 1); | 1330 | if (n) { |
1485 | 1331 | if (!(n->nud_state & NUD_VALID)) { | |
1486 | peer = rt->peer; | 1332 | neigh_event_send(n, NULL); |
1487 | if (peer) { | 1333 | } else { |
1488 | if (peer->redirect_learned.a4 != new_gw) { | 1334 | rt->rt_gateway = new_gw; |
1489 | peer->redirect_learned.a4 = new_gw; | 1335 | rt->rt_flags |= RTCF_REDIRECTED; |
1490 | atomic_inc(&__rt_peer_genid); | 1336 | call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); |
1491 | } | 1337 | } |
1492 | check_peer_redir(&rt->dst, peer); | 1338 | neigh_release(n); |
1493 | } | 1339 | } |
1494 | } | 1340 | } |
1495 | } | 1341 | } |
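ip_rt_redirect() no longer funnels the learned gateway through peer->redirect_learned plus a genid bump; it resolves the new gateway as a neighbour on the spot, probes it if not yet NUD_VALID, and otherwise rewrites rt->rt_gateway directly and fires the netevent. The check_peer_redir() machinery it replaces is deleted in the next hunk. Gist of the accept path:

    /* Sketch: accept a redirect only once the new gateway resolves */
    if (n->nud_state & NUD_VALID) {
            rt->rt_gateway = new_gw;
            rt->rt_flags  |= RTCF_REDIRECTED;
            call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
    } else {
            neigh_event_send(n, NULL);  /* kick off resolution; retry later */
    }
    neigh_release(n);
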
@@ -1507,23 +1353,6 @@ reject_redirect: | |||
1507 | ; | 1353 | ; |
1508 | } | 1354 | } |
1509 | 1355 | ||
1510 | static bool peer_pmtu_expired(struct inet_peer *peer) | ||
1511 | { | ||
1512 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); | ||
1513 | |||
1514 | return orig && | ||
1515 | time_after_eq(jiffies, orig) && | ||
1516 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; | ||
1517 | } | ||
1518 | |||
1519 | static bool peer_pmtu_cleaned(struct inet_peer *peer) | ||
1520 | { | ||
1521 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); | ||
1522 | |||
1523 | return orig && | ||
1524 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; | ||
1525 | } | ||
1526 | |||
1527 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | 1356 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) |
1528 | { | 1357 | { |
1529 | struct rtable *rt = (struct rtable *)dst; | 1358 | struct rtable *rt = (struct rtable *)dst; |
@@ -1533,14 +1362,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | |||
1533 | if (dst->obsolete > 0) { | 1362 | if (dst->obsolete > 0) { |
1534 | ip_rt_put(rt); | 1363 | ip_rt_put(rt); |
1535 | ret = NULL; | 1364 | ret = NULL; |
1536 | } else if (rt->rt_flags & RTCF_REDIRECTED) { | 1365 | } else if ((rt->rt_flags & RTCF_REDIRECTED) || |
1366 | rt->dst.expires) { | ||
1537 | unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, | 1367 | unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, |
1538 | rt->rt_oif, | 1368 | rt->rt_oif, |
1539 | rt_genid(dev_net(dst->dev))); | 1369 | rt_genid(dev_net(dst->dev))); |
1540 | rt_del(hash, rt); | 1370 | rt_del(hash, rt); |
1541 | ret = NULL; | 1371 | ret = NULL; |
1542 | } else if (rt->peer && peer_pmtu_expired(rt->peer)) { | ||
1543 | dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); | ||
1544 | } | 1372 | } |
1545 | } | 1373 | } |
1546 | return ret; | 1374 | return ret; |
@@ -1567,6 +1395,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1567 | struct rtable *rt = skb_rtable(skb); | 1395 | struct rtable *rt = skb_rtable(skb); |
1568 | struct in_device *in_dev; | 1396 | struct in_device *in_dev; |
1569 | struct inet_peer *peer; | 1397 | struct inet_peer *peer; |
1398 | struct net *net; | ||
1570 | int log_martians; | 1399 | int log_martians; |
1571 | 1400 | ||
1572 | rcu_read_lock(); | 1401 | rcu_read_lock(); |
@@ -1578,9 +1407,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1578 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); | 1407 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); |
1579 | rcu_read_unlock(); | 1408 | rcu_read_unlock(); |
1580 | 1409 | ||
1581 | if (!rt->peer) | 1410 | net = dev_net(rt->dst.dev); |
1582 | rt_bind_peer(rt, rt->rt_dst, 1); | 1411 | peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); |
1583 | peer = rt->peer; | ||
1584 | if (!peer) { | 1412 | if (!peer) { |
1585 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); | 1413 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); |
1586 | return; | 1414 | return; |
@@ -1597,7 +1425,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1597 | */ | 1425 | */ |
1598 | if (peer->rate_tokens >= ip_rt_redirect_number) { | 1426 | if (peer->rate_tokens >= ip_rt_redirect_number) { |
1599 | peer->rate_last = jiffies; | 1427 | peer->rate_last = jiffies; |
1600 | return; | 1428 | goto out_put_peer; |
1601 | } | 1429 | } |
1602 | 1430 | ||
1603 | /* Check for load limit; set rate_last to the latest sent | 1431 | /* Check for load limit; set rate_last to the latest sent |
@@ -1618,16 +1446,34 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
1618 | &rt->rt_dst, &rt->rt_gateway); | 1446 | &rt->rt_dst, &rt->rt_gateway); |
1619 | #endif | 1447 | #endif |
1620 | } | 1448 | } |
1449 | out_put_peer: | ||
1450 | inet_putpeer(peer); | ||
1621 | } | 1451 | } |
1622 | 1452 | ||
1623 | static int ip_error(struct sk_buff *skb) | 1453 | static int ip_error(struct sk_buff *skb) |
1624 | { | 1454 | { |
1455 | struct in_device *in_dev = __in_dev_get_rcu(skb->dev); | ||
1625 | struct rtable *rt = skb_rtable(skb); | 1456 | struct rtable *rt = skb_rtable(skb); |
1626 | struct inet_peer *peer; | 1457 | struct inet_peer *peer; |
1627 | unsigned long now; | 1458 | unsigned long now; |
1459 | struct net *net; | ||
1628 | bool send; | 1460 | bool send; |
1629 | int code; | 1461 | int code; |
1630 | 1462 | ||
1463 | net = dev_net(rt->dst.dev); | ||
1464 | if (!IN_DEV_FORWARD(in_dev)) { | ||
1465 | switch (rt->dst.error) { | ||
1466 | case EHOSTUNREACH: | ||
1467 | IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); | ||
1468 | break; | ||
1469 | |||
1470 | case ENETUNREACH: | ||
1471 | IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); | ||
1472 | break; | ||
1473 | } | ||
1474 | goto out; | ||
1475 | } | ||
1476 | |||
1631 | switch (rt->dst.error) { | 1477 | switch (rt->dst.error) { |
1632 | case EINVAL: | 1478 | case EINVAL: |
1633 | default: | 1479 | default: |
@@ -1637,17 +1483,14 @@ static int ip_error(struct sk_buff *skb) | |||
1637 | break; | 1483 | break; |
1638 | case ENETUNREACH: | 1484 | case ENETUNREACH: |
1639 | code = ICMP_NET_UNREACH; | 1485 | code = ICMP_NET_UNREACH; |
1640 | IP_INC_STATS_BH(dev_net(rt->dst.dev), | 1486 | IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); |
1641 | IPSTATS_MIB_INNOROUTES); | ||
1642 | break; | 1487 | break; |
1643 | case EACCES: | 1488 | case EACCES: |
1644 | code = ICMP_PKT_FILTERED; | 1489 | code = ICMP_PKT_FILTERED; |
1645 | break; | 1490 | break; |
1646 | } | 1491 | } |
1647 | 1492 | ||
1648 | if (!rt->peer) | 1493 | peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); |
1649 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
1650 | peer = rt->peer; | ||
1651 | 1494 | ||
1652 | send = true; | 1495 | send = true; |
1653 | if (peer) { | 1496 | if (peer) { |
@@ -1660,6 +1503,7 @@ static int ip_error(struct sk_buff *skb) | |||
1660 | peer->rate_tokens -= ip_rt_error_cost; | 1503 | peer->rate_tokens -= ip_rt_error_cost; |
1661 | else | 1504 | else |
1662 | send = false; | 1505 | send = false; |
1506 | inet_putpeer(peer); | ||
1663 | } | 1507 | } |
1664 | if (send) | 1508 | if (send) |
1665 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); | 1509 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); |
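Both ip_rt_send_redirect() and ip_error() now take a fresh reference via inet_getpeer_v4(net->ipv4.peers, ...) instead of borrowing rt->peer, so every exit path past the lookup must drop it: the rate-limited bail-out becomes goto out_put_peer, and ip_error() adds inet_putpeer() after consuming rate tokens. Missing a put here would leak the peer. The discipline, with a hypothetical predicate:

    /* Sketch of the per-call refcount discipline */
    peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
    if (!peer) {
            /* no peer: act without rate limiting, nothing to put */
            return;
    }
    if (too_many_recent_redirects(peer))    /* hypothetical predicate */
            goto out_put_peer;
    /* ... send the redirect, update peer->rate_last ... */
    out_put_peer:
    inet_putpeer(peer);                     /* drop the lookup's reference */

ip_error() additionally learns to account non-forwarding drops (INADDRERRORS/INNOROUTES) up front, matching the no_route rework later in this file.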
@@ -1668,136 +1512,47 @@ out: kfree_skb(skb); | |||
1668 | return 0; | 1512 | return 0; |
1669 | } | 1513 | } |
1670 | 1514 | ||
1671 | /* | ||
1672 | * The last two values are not from the RFC but | ||
1673 | * are needed for AMPRnet AX.25 paths. | ||
1674 | */ | ||
1675 | |||
1676 | static const unsigned short mtu_plateau[] = | ||
1677 | {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; | ||
1678 | |||
1679 | static inline unsigned short guess_mtu(unsigned short old_mtu) | ||
1680 | { | ||
1681 | int i; | ||
1682 | |||
1683 | for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) | ||
1684 | if (old_mtu > mtu_plateau[i]) | ||
1685 | return mtu_plateau[i]; | ||
1686 | return 68; | ||
1687 | } | ||
1688 | |||
1689 | unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, | ||
1690 | unsigned short new_mtu, | ||
1691 | struct net_device *dev) | ||
1692 | { | ||
1693 | unsigned short old_mtu = ntohs(iph->tot_len); | ||
1694 | unsigned short est_mtu = 0; | ||
1695 | struct inet_peer *peer; | ||
1696 | |||
1697 | peer = inet_getpeer_v4(iph->daddr, 1); | ||
1698 | if (peer) { | ||
1699 | unsigned short mtu = new_mtu; | ||
1700 | |||
1701 | if (new_mtu < 68 || new_mtu >= old_mtu) { | ||
1702 | /* BSD 4.2 derived systems incorrectly adjust | ||
1703 | * tot_len by the IP header length, and report | ||
1704 | * a zero MTU in the ICMP message. | ||
1705 | */ | ||
1706 | if (mtu == 0 && | ||
1707 | old_mtu >= 68 + (iph->ihl << 2)) | ||
1708 | old_mtu -= iph->ihl << 2; | ||
1709 | mtu = guess_mtu(old_mtu); | ||
1710 | } | ||
1711 | |||
1712 | if (mtu < ip_rt_min_pmtu) | ||
1713 | mtu = ip_rt_min_pmtu; | ||
1714 | if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { | ||
1715 | unsigned long pmtu_expires; | ||
1716 | |||
1717 | pmtu_expires = jiffies + ip_rt_mtu_expires; | ||
1718 | if (!pmtu_expires) | ||
1719 | pmtu_expires = 1UL; | ||
1720 | |||
1721 | est_mtu = mtu; | ||
1722 | peer->pmtu_learned = mtu; | ||
1723 | peer->pmtu_expires = pmtu_expires; | ||
1724 | atomic_inc(&__rt_peer_genid); | ||
1725 | } | ||
1726 | |||
1727 | inet_putpeer(peer); | ||
1728 | } | ||
1729 | return est_mtu ? : new_mtu; | ||
1730 | } | ||
1731 | |||
1732 | static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) | ||
1733 | { | ||
1734 | unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); | ||
1735 | |||
1736 | if (!expires) | ||
1737 | return; | ||
1738 | if (time_before(jiffies, expires)) { | ||
1739 | u32 orig_dst_mtu = dst_mtu(dst); | ||
1740 | if (peer->pmtu_learned < orig_dst_mtu) { | ||
1741 | if (!peer->pmtu_orig) | ||
1742 | peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); | ||
1743 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); | ||
1744 | } | ||
1745 | } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) | ||
1746 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); | ||
1747 | } | ||
1748 | |||
1749 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) | 1515 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) |
1750 | { | 1516 | { |
1751 | struct rtable *rt = (struct rtable *) dst; | 1517 | struct rtable *rt = (struct rtable *) dst; |
1752 | struct inet_peer *peer; | ||
1753 | 1518 | ||
1754 | dst_confirm(dst); | 1519 | dst_confirm(dst); |
1755 | 1520 | ||
1756 | if (!rt->peer) | 1521 | if (mtu < ip_rt_min_pmtu) |
1757 | rt_bind_peer(rt, rt->rt_dst, 1); | 1522 | mtu = ip_rt_min_pmtu; |
1758 | peer = rt->peer; | ||
1759 | if (peer) { | ||
1760 | unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); | ||
1761 | |||
1762 | if (mtu < ip_rt_min_pmtu) | ||
1763 | mtu = ip_rt_min_pmtu; | ||
1764 | if (!pmtu_expires || mtu < peer->pmtu_learned) { | ||
1765 | 1523 | ||
1766 | pmtu_expires = jiffies + ip_rt_mtu_expires; | 1524 | rt->rt_pmtu = mtu; |
1767 | if (!pmtu_expires) | 1525 | dst_set_expires(&rt->dst, ip_rt_mtu_expires); |
1768 | pmtu_expires = 1UL; | 1526 | } |
1769 | 1527 | ||
1770 | peer->pmtu_learned = mtu; | 1528 | void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, |
1771 | peer->pmtu_expires = pmtu_expires; | 1529 | int oif, u32 mark, u8 protocol, int flow_flags) |
1530 | { | ||
1531 | const struct iphdr *iph = (const struct iphdr *)skb->data; | ||
1532 | struct flowi4 fl4; | ||
1533 | struct rtable *rt; | ||
1772 | 1534 | ||
1773 | atomic_inc(&__rt_peer_genid); | 1535 | flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, |
1774 | rt->rt_peer_genid = rt_peer_genid(); | 1536 | protocol, flow_flags, |
1775 | } | 1537 | iph->daddr, iph->saddr, 0, 0); |
1776 | check_peer_pmtu(dst, peer); | 1538 | rt = __ip_route_output_key(net, &fl4); |
1539 | if (!IS_ERR(rt)) { | ||
1540 | ip_rt_update_pmtu(&rt->dst, mtu); | ||
1541 | ip_rt_put(rt); | ||
1777 | } | 1542 | } |
1778 | } | 1543 | } |
1544 | EXPORT_SYMBOL_GPL(ipv4_update_pmtu); | ||
1779 | 1545 | ||
1780 | 1546 | void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) | |
1781 | static void ipv4_validate_peer(struct rtable *rt) | ||
1782 | { | 1547 | { |
1783 | if (rt->rt_peer_genid != rt_peer_genid()) { | 1548 | const struct inet_sock *inet = inet_sk(sk); |
1784 | struct inet_peer *peer; | ||
1785 | |||
1786 | if (!rt->peer) | ||
1787 | rt_bind_peer(rt, rt->rt_dst, 0); | ||
1788 | |||
1789 | peer = rt->peer; | ||
1790 | if (peer) { | ||
1791 | check_peer_pmtu(&rt->dst, peer); | ||
1792 | 1549 | ||
1793 | if (peer->redirect_learned.a4 && | 1550 | return ipv4_update_pmtu(skb, sock_net(sk), mtu, |
1794 | peer->redirect_learned.a4 != rt->rt_gateway) | 1551 | sk->sk_bound_dev_if, sk->sk_mark, |
1795 | check_peer_redir(&rt->dst, peer); | 1552 | inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, |
1796 | } | 1553 | inet_sk_flowi_flags(sk)); |
1797 | |||
1798 | rt->rt_peer_genid = rt_peer_genid(); | ||
1799 | } | ||
1800 | } | 1554 | } |
1555 | EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); | ||
1801 | 1556 | ||
1802 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) | 1557 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) |
1803 | { | 1558 | { |
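This hunk is the core of the PMTU rework: ip_rt_frag_needed() with its RFC 1191 plateau table and the inetpeer pmtu_learned/pmtu_expires bookkeeping all disappear. ip_rt_update_pmtu() now clamps to ip_rt_min_pmtu, stores the value in the route itself (rt->rt_pmtu, initialized to 0 in every route constructor below) and arms dst expiry for ip_rt_mtu_expires. The two new exports let ICMP handlers update PMTU by routing the quoted header: ipv4_update_pmtu() from explicit flow parameters, ipv4_sk_update_pmtu() from a socket (as the ping.c and raw.c hunks earlier use). A hedged caller-side sketch:

    /* Sketch: a handler with no socket at hand can still propagate an
     * ICMP_FRAG_NEEDED MTU.  'info' is the MTU quoted by the ICMP
     * message; oif/mark of 0 and flow_flags of 0 are assumed acceptable
     * for the caller in question.
     */
    static void frag_needed_seen(struct sk_buff *skb, struct net *net, u32 info)
    {
            ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_UDP, 0);
    }

ipv4_mtu() in the next hunk is the read side: rt_pmtu wins until rt->dst.expires passes, after which the route falls back to the metric-supplied MTU.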
@@ -1805,23 +1560,17 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) | |||
1805 | 1560 | ||
1806 | if (rt_is_expired(rt)) | 1561 | if (rt_is_expired(rt)) |
1807 | return NULL; | 1562 | return NULL; |
1808 | ipv4_validate_peer(rt); | ||
1809 | return dst; | 1563 | return dst; |
1810 | } | 1564 | } |
1811 | 1565 | ||
1812 | static void ipv4_dst_destroy(struct dst_entry *dst) | 1566 | static void ipv4_dst_destroy(struct dst_entry *dst) |
1813 | { | 1567 | { |
1814 | struct rtable *rt = (struct rtable *) dst; | 1568 | struct rtable *rt = (struct rtable *) dst; |
1815 | struct inet_peer *peer = rt->peer; | ||
1816 | 1569 | ||
1817 | if (rt->fi) { | 1570 | if (rt->fi) { |
1818 | fib_info_put(rt->fi); | 1571 | fib_info_put(rt->fi); |
1819 | rt->fi = NULL; | 1572 | rt->fi = NULL; |
1820 | } | 1573 | } |
1821 | if (peer) { | ||
1822 | rt->peer = NULL; | ||
1823 | inet_putpeer(peer); | ||
1824 | } | ||
1825 | } | 1574 | } |
1826 | 1575 | ||
1827 | 1576 | ||
@@ -1832,8 +1581,8 @@ static void ipv4_link_failure(struct sk_buff *skb) | |||
1832 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); | 1581 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); |
1833 | 1582 | ||
1834 | rt = skb_rtable(skb); | 1583 | rt = skb_rtable(skb); |
1835 | if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) | 1584 | if (rt) |
1836 | dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); | 1585 | dst_set_expires(&rt->dst, 0); |
1837 | } | 1586 | } |
1838 | 1587 | ||
1839 | static int ip_rt_bug(struct sk_buff *skb) | 1588 | static int ip_rt_bug(struct sk_buff *skb) |
@@ -1913,7 +1662,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) | |||
1913 | static unsigned int ipv4_mtu(const struct dst_entry *dst) | 1662 | static unsigned int ipv4_mtu(const struct dst_entry *dst) |
1914 | { | 1663 | { |
1915 | const struct rtable *rt = (const struct rtable *) dst; | 1664 | const struct rtable *rt = (const struct rtable *) dst; |
1916 | unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); | 1665 | unsigned int mtu = rt->rt_pmtu; |
1666 | |||
1667 | if (mtu && time_after_eq(jiffies, rt->dst.expires)) | ||
1668 | mtu = 0; | ||
1669 | |||
1670 | if (!mtu) | ||
1671 | mtu = dst_metric_raw(dst, RTAX_MTU); | ||
1917 | 1672 | ||
1918 | if (mtu && rt_is_output_route(rt)) | 1673 | if (mtu && rt_is_output_route(rt)) |
1919 | return mtu; | 1674 | return mtu; |
@@ -1935,60 +1690,27 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) | |||
1935 | static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, | 1690 | static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, |
1936 | struct fib_info *fi) | 1691 | struct fib_info *fi) |
1937 | { | 1692 | { |
1938 | struct inet_peer *peer; | 1693 | if (fi->fib_metrics != (u32 *) dst_default_metrics) { |
1939 | int create = 0; | 1694 | rt->fi = fi; |
1940 | 1695 | atomic_inc(&fi->fib_clntref); | |
1941 | /* If a peer entry exists for this destination, we must hook | ||
1942 | * it up in order to get at cached metrics. | ||
1943 | */ | ||
1944 | if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) | ||
1945 | create = 1; | ||
1946 | |||
1947 | rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); | ||
1948 | if (peer) { | ||
1949 | rt->rt_peer_genid = rt_peer_genid(); | ||
1950 | if (inet_metrics_new(peer)) | ||
1951 | memcpy(peer->metrics, fi->fib_metrics, | ||
1952 | sizeof(u32) * RTAX_MAX); | ||
1953 | dst_init_metrics(&rt->dst, peer->metrics, false); | ||
1954 | |||
1955 | check_peer_pmtu(&rt->dst, peer); | ||
1956 | |||
1957 | if (peer->redirect_learned.a4 && | ||
1958 | peer->redirect_learned.a4 != rt->rt_gateway) { | ||
1959 | rt->rt_gateway = peer->redirect_learned.a4; | ||
1960 | rt->rt_flags |= RTCF_REDIRECTED; | ||
1961 | } | ||
1962 | } else { | ||
1963 | if (fi->fib_metrics != (u32 *) dst_default_metrics) { | ||
1964 | rt->fi = fi; | ||
1965 | atomic_inc(&fi->fib_clntref); | ||
1966 | } | ||
1967 | dst_init_metrics(&rt->dst, fi->fib_metrics, true); | ||
1968 | } | 1696 | } |
1697 | dst_init_metrics(&rt->dst, fi->fib_metrics, true); | ||
1969 | } | 1698 | } |
1970 | 1699 | ||
1971 | static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, | 1700 | static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, |
1972 | const struct fib_result *res, | 1701 | const struct fib_result *res, |
1973 | struct fib_info *fi, u16 type, u32 itag) | 1702 | struct fib_info *fi, u16 type, u32 itag) |
1974 | { | 1703 | { |
1975 | struct dst_entry *dst = &rt->dst; | ||
1976 | |||
1977 | if (fi) { | 1704 | if (fi) { |
1978 | if (FIB_RES_GW(*res) && | 1705 | if (FIB_RES_GW(*res) && |
1979 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) | 1706 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) |
1980 | rt->rt_gateway = FIB_RES_GW(*res); | 1707 | rt->rt_gateway = FIB_RES_GW(*res); |
1981 | rt_init_metrics(rt, fl4, fi); | 1708 | rt_init_metrics(rt, fl4, fi); |
1982 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1709 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1983 | dst->tclassid = FIB_RES_NH(*res).nh_tclassid; | 1710 | rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; |
1984 | #endif | 1711 | #endif |
1985 | } | 1712 | } |
1986 | 1713 | ||
1987 | if (dst_mtu(dst) > IP_MAX_MTU) | ||
1988 | dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); | ||
1989 | if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) | ||
1990 | dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); | ||
1991 | |||
1992 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1714 | #ifdef CONFIG_IP_ROUTE_CLASSID |
1993 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 1715 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
1994 | set_class_tag(rt, fib_rules_tclass(res)); | 1716 | set_class_tag(rt, fib_rules_tclass(res)); |
@@ -2012,7 +1734,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2012 | { | 1734 | { |
2013 | unsigned int hash; | 1735 | unsigned int hash; |
2014 | struct rtable *rth; | 1736 | struct rtable *rth; |
2015 | __be32 spec_dst; | ||
2016 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 1737 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
2017 | u32 itag = 0; | 1738 | u32 itag = 0; |
2018 | int err; | 1739 | int err; |
@@ -2023,16 +1744,19 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2023 | return -EINVAL; | 1744 | return -EINVAL; |
2024 | 1745 | ||
2025 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || | 1746 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || |
2026 | ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) | 1747 | skb->protocol != htons(ETH_P_IP)) |
2027 | goto e_inval; | 1748 | goto e_inval; |
2028 | 1749 | ||
1750 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) | ||
1751 | if (ipv4_is_loopback(saddr)) | ||
1752 | goto e_inval; | ||
1753 | |||
2029 | if (ipv4_is_zeronet(saddr)) { | 1754 | if (ipv4_is_zeronet(saddr)) { |
2030 | if (!ipv4_is_local_multicast(daddr)) | 1755 | if (!ipv4_is_local_multicast(daddr)) |
2031 | goto e_inval; | 1756 | goto e_inval; |
2032 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); | ||
2033 | } else { | 1757 | } else { |
2034 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, | 1758 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, |
2035 | &itag); | 1759 | in_dev, &itag); |
2036 | if (err < 0) | 1760 | if (err < 0) |
2037 | goto e_err; | 1761 | goto e_err; |
2038 | } | 1762 | } |
@@ -2058,10 +1782,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2058 | rth->rt_iif = dev->ifindex; | 1782 | rth->rt_iif = dev->ifindex; |
2059 | rth->rt_oif = 0; | 1783 | rth->rt_oif = 0; |
2060 | rth->rt_mark = skb->mark; | 1784 | rth->rt_mark = skb->mark; |
1785 | rth->rt_pmtu = 0; | ||
2061 | rth->rt_gateway = daddr; | 1786 | rth->rt_gateway = daddr; |
2062 | rth->rt_spec_dst= spec_dst; | ||
2063 | rth->rt_peer_genid = 0; | ||
2064 | rth->peer = NULL; | ||
2065 | rth->fi = NULL; | 1787 | rth->fi = NULL; |
2066 | if (our) { | 1788 | if (our) { |
2067 | rth->dst.input= ip_local_deliver; | 1789 | rth->dst.input= ip_local_deliver; |
@@ -2123,7 +1845,6 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2123 | int err; | 1845 | int err; |
2124 | struct in_device *out_dev; | 1846 | struct in_device *out_dev; |
2125 | unsigned int flags = 0; | 1847 | unsigned int flags = 0; |
2126 | __be32 spec_dst; | ||
2127 | u32 itag; | 1848 | u32 itag; |
2128 | 1849 | ||
2129 | /* get a working reference to the output device */ | 1850 | /* get a working reference to the output device */ |
@@ -2135,7 +1856,7 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2135 | 1856 | ||
2136 | 1857 | ||
2137 | err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), | 1858 | err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), |
2138 | in_dev->dev, &spec_dst, &itag); | 1859 | in_dev->dev, in_dev, &itag); |
2139 | if (err < 0) { | 1860 | if (err < 0) { |
2140 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, | 1861 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, |
2141 | saddr); | 1862 | saddr); |
@@ -2186,10 +1907,8 @@ static int __mkroute_input(struct sk_buff *skb, | |||
2186 | rth->rt_iif = in_dev->dev->ifindex; | 1907 | rth->rt_iif = in_dev->dev->ifindex; |
2187 | rth->rt_oif = 0; | 1908 | rth->rt_oif = 0; |
2188 | rth->rt_mark = skb->mark; | 1909 | rth->rt_mark = skb->mark; |
1910 | rth->rt_pmtu = 0; | ||
2189 | rth->rt_gateway = daddr; | 1911 | rth->rt_gateway = daddr; |
2190 | rth->rt_spec_dst= spec_dst; | ||
2191 | rth->rt_peer_genid = 0; | ||
2192 | rth->peer = NULL; | ||
2193 | rth->fi = NULL; | 1912 | rth->fi = NULL; |
2194 | 1913 | ||
2195 | rth->dst.input = ip_forward; | 1914 | rth->dst.input = ip_forward; |
@@ -2253,7 +1972,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2253 | u32 itag = 0; | 1972 | u32 itag = 0; |
2254 | struct rtable *rth; | 1973 | struct rtable *rth; |
2255 | unsigned int hash; | 1974 | unsigned int hash; |
2256 | __be32 spec_dst; | ||
2257 | int err = -EINVAL; | 1975 | int err = -EINVAL; |
2258 | struct net *net = dev_net(dev); | 1976 | struct net *net = dev_net(dev); |
2259 | 1977 | ||
@@ -2266,8 +1984,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2266 | by fib_lookup. | 1984 | by fib_lookup. |
2267 | */ | 1985 | */ |
2268 | 1986 | ||
2269 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || | 1987 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) |
2270 | ipv4_is_loopback(saddr)) | ||
2271 | goto martian_source; | 1988 | goto martian_source; |
2272 | 1989 | ||
2273 | if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) | 1990 | if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) |
@@ -2279,9 +1996,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2279 | if (ipv4_is_zeronet(saddr)) | 1996 | if (ipv4_is_zeronet(saddr)) |
2280 | goto martian_source; | 1997 | goto martian_source; |
2281 | 1998 | ||
2282 | if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) | 1999 | if (ipv4_is_zeronet(daddr)) |
2283 | goto martian_destination; | 2000 | goto martian_destination; |
2284 | 2001 | ||
2002 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) { | ||
2003 | if (ipv4_is_loopback(daddr)) | ||
2004 | goto martian_destination; | ||
2005 | |||
2006 | if (ipv4_is_loopback(saddr)) | ||
2007 | goto martian_source; | ||
2008 | } | ||
2009 | |||
2285 | /* | 2010 | /* |
2286 | * Now we are ready to route packet. | 2011 | * Now we are ready to route packet. |
2287 | */ | 2012 | */ |
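Loopback addresses stop being unconditionally martian on input here (and stop being rejected as an output source in the __mkroute_output hunk further down): each check is now gated on IN_DEV_ROUTE_LOCALNET(in_dev), presumably exposed as a per-interface route_localnet sysctl, so 127.0.0.0/8 can be routed deliberately. Condensed:

    /* Sketch: 127/8 is martian only when the interface does not opt in */
    if (!IN_DEV_ROUTE_LOCALNET(in_dev)) {
            if (ipv4_is_loopback(daddr))
                    goto martian_destination;
            if (ipv4_is_loopback(saddr))
                    goto martian_source;
    }
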
@@ -2293,11 +2018,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2293 | fl4.daddr = daddr; | 2018 | fl4.daddr = daddr; |
2294 | fl4.saddr = saddr; | 2019 | fl4.saddr = saddr; |
2295 | err = fib_lookup(net, &fl4, &res); | 2020 | err = fib_lookup(net, &fl4, &res); |
2296 | if (err != 0) { | 2021 | if (err != 0) |
2297 | if (!IN_DEV_FORWARD(in_dev)) | ||
2298 | goto e_hostunreach; | ||
2299 | goto no_route; | 2022 | goto no_route; |
2300 | } | ||
2301 | 2023 | ||
2302 | RT_CACHE_STAT_INC(in_slow_tot); | 2024 | RT_CACHE_STAT_INC(in_slow_tot); |
2303 | 2025 | ||
@@ -2307,17 +2029,16 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2307 | if (res.type == RTN_LOCAL) { | 2029 | if (res.type == RTN_LOCAL) { |
2308 | err = fib_validate_source(skb, saddr, daddr, tos, | 2030 | err = fib_validate_source(skb, saddr, daddr, tos, |
2309 | net->loopback_dev->ifindex, | 2031 | net->loopback_dev->ifindex, |
2310 | dev, &spec_dst, &itag); | 2032 | dev, in_dev, &itag); |
2311 | if (err < 0) | 2033 | if (err < 0) |
2312 | goto martian_source_keep_err; | 2034 | goto martian_source_keep_err; |
2313 | if (err) | 2035 | if (err) |
2314 | flags |= RTCF_DIRECTSRC; | 2036 | flags |= RTCF_DIRECTSRC; |
2315 | spec_dst = daddr; | ||
2316 | goto local_input; | 2037 | goto local_input; |
2317 | } | 2038 | } |
2318 | 2039 | ||
2319 | if (!IN_DEV_FORWARD(in_dev)) | 2040 | if (!IN_DEV_FORWARD(in_dev)) |
2320 | goto e_hostunreach; | 2041 | goto no_route; |
2321 | if (res.type != RTN_UNICAST) | 2042 | if (res.type != RTN_UNICAST) |
2322 | goto martian_destination; | 2043 | goto martian_destination; |
2323 | 2044 | ||
@@ -2328,11 +2049,9 @@ brd_input: | |||
2328 | if (skb->protocol != htons(ETH_P_IP)) | 2049 | if (skb->protocol != htons(ETH_P_IP)) |
2329 | goto e_inval; | 2050 | goto e_inval; |
2330 | 2051 | ||
2331 | if (ipv4_is_zeronet(saddr)) | 2052 | if (!ipv4_is_zeronet(saddr)) { |
2332 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); | 2053 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, |
2333 | else { | 2054 | in_dev, &itag); |
2334 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, | ||
2335 | &itag); | ||
2336 | if (err < 0) | 2055 | if (err < 0) |
2337 | goto martian_source_keep_err; | 2056 | goto martian_source_keep_err; |
2338 | if (err) | 2057 | if (err) |
@@ -2362,17 +2081,12 @@ local_input: | |||
2362 | rth->rt_key_tos = tos; | 2081 | rth->rt_key_tos = tos; |
2363 | rth->rt_dst = daddr; | 2082 | rth->rt_dst = daddr; |
2364 | rth->rt_src = saddr; | 2083 | rth->rt_src = saddr; |
2365 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
2366 | rth->dst.tclassid = itag; | ||
2367 | #endif | ||
2368 | rth->rt_route_iif = dev->ifindex; | 2084 | rth->rt_route_iif = dev->ifindex; |
2369 | rth->rt_iif = dev->ifindex; | 2085 | rth->rt_iif = dev->ifindex; |
2370 | rth->rt_oif = 0; | 2086 | rth->rt_oif = 0; |
2371 | rth->rt_mark = skb->mark; | 2087 | rth->rt_mark = skb->mark; |
2088 | rth->rt_pmtu = 0; | ||
2372 | rth->rt_gateway = daddr; | 2089 | rth->rt_gateway = daddr; |
2373 | rth->rt_spec_dst= spec_dst; | ||
2374 | rth->rt_peer_genid = 0; | ||
2375 | rth->peer = NULL; | ||
2376 | rth->fi = NULL; | 2090 | rth->fi = NULL; |
2377 | if (res.type == RTN_UNREACHABLE) { | 2091 | if (res.type == RTN_UNREACHABLE) { |
2378 | rth->dst.input= ip_error; | 2092 | rth->dst.input= ip_error; |
@@ -2388,7 +2102,6 @@ local_input: | |||
2388 | 2102 | ||
2389 | no_route: | 2103 | no_route: |
2390 | RT_CACHE_STAT_INC(in_no_route); | 2104 | RT_CACHE_STAT_INC(in_no_route); |
2391 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
2392 | res.type = RTN_UNREACHABLE; | 2105 | res.type = RTN_UNREACHABLE; |
2393 | if (err == -ESRCH) | 2106 | if (err == -ESRCH) |
2394 | err = -ENETUNREACH; | 2107 | err = -ENETUNREACH; |
@@ -2405,10 +2118,6 @@ martian_destination: | |||
2405 | &daddr, &saddr, dev->name); | 2118 | &daddr, &saddr, dev->name); |
2406 | #endif | 2119 | #endif |
2407 | 2120 | ||
2408 | e_hostunreach: | ||
2409 | err = -EHOSTUNREACH; | ||
2410 | goto out; | ||
2411 | |||
2412 | e_inval: | 2121 | e_inval: |
2413 | err = -EINVAL; | 2122 | err = -EINVAL; |
2414 | goto out; | 2123 | goto out; |
@@ -2452,7 +2161,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
2452 | rth->rt_mark == skb->mark && | 2161 | rth->rt_mark == skb->mark && |
2453 | net_eq(dev_net(rth->dst.dev), net) && | 2162 | net_eq(dev_net(rth->dst.dev), net) && |
2454 | !rt_is_expired(rth)) { | 2163 | !rt_is_expired(rth)) { |
2455 | ipv4_validate_peer(rth); | ||
2456 | if (noref) { | 2164 | if (noref) { |
2457 | dst_use_noref(&rth->dst, jiffies); | 2165 | dst_use_noref(&rth->dst, jiffies); |
2458 | skb_dst_set_noref(skb, &rth->dst); | 2166 | skb_dst_set_noref(skb, &rth->dst); |
@@ -2520,9 +2228,14 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
2520 | u16 type = res->type; | 2228 | u16 type = res->type; |
2521 | struct rtable *rth; | 2229 | struct rtable *rth; |
2522 | 2230 | ||
2523 | if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) | 2231 | in_dev = __in_dev_get_rcu(dev_out); |
2232 | if (!in_dev) | ||
2524 | return ERR_PTR(-EINVAL); | 2233 | return ERR_PTR(-EINVAL); |
2525 | 2234 | ||
2235 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) | ||
2236 | if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) | ||
2237 | return ERR_PTR(-EINVAL); | ||
2238 | |||
2526 | if (ipv4_is_lbcast(fl4->daddr)) | 2239 | if (ipv4_is_lbcast(fl4->daddr)) |
2527 | type = RTN_BROADCAST; | 2240 | type = RTN_BROADCAST; |
2528 | else if (ipv4_is_multicast(fl4->daddr)) | 2241 | else if (ipv4_is_multicast(fl4->daddr)) |
@@ -2533,10 +2246,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
2533 | if (dev_out->flags & IFF_LOOPBACK) | 2246 | if (dev_out->flags & IFF_LOOPBACK) |
2534 | flags |= RTCF_LOCAL; | 2247 | flags |= RTCF_LOCAL; |
2535 | 2248 | ||
2536 | in_dev = __in_dev_get_rcu(dev_out); | ||
2537 | if (!in_dev) | ||
2538 | return ERR_PTR(-EINVAL); | ||
2539 | |||
2540 | if (type == RTN_BROADCAST) { | 2249 | if (type == RTN_BROADCAST) { |
2541 | flags |= RTCF_BROADCAST | RTCF_LOCAL; | 2250 | flags |= RTCF_BROADCAST | RTCF_LOCAL; |
2542 | fi = NULL; | 2251 | fi = NULL; |
@@ -2573,20 +2282,15 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
2573 | rth->rt_iif = orig_oif ? : dev_out->ifindex; | 2282 | rth->rt_iif = orig_oif ? : dev_out->ifindex; |
2574 | rth->rt_oif = orig_oif; | 2283 | rth->rt_oif = orig_oif; |
2575 | rth->rt_mark = fl4->flowi4_mark; | 2284 | rth->rt_mark = fl4->flowi4_mark; |
2285 | rth->rt_pmtu = 0; | ||
2576 | rth->rt_gateway = fl4->daddr; | 2286 | rth->rt_gateway = fl4->daddr; |
2577 | rth->rt_spec_dst= fl4->saddr; | ||
2578 | rth->rt_peer_genid = 0; | ||
2579 | rth->peer = NULL; | ||
2580 | rth->fi = NULL; | 2287 | rth->fi = NULL; |
2581 | 2288 | ||
2582 | RT_CACHE_STAT_INC(out_slow_tot); | 2289 | RT_CACHE_STAT_INC(out_slow_tot); |
2583 | 2290 | ||
2584 | if (flags & RTCF_LOCAL) { | 2291 | if (flags & RTCF_LOCAL) |
2585 | rth->dst.input = ip_local_deliver; | 2292 | rth->dst.input = ip_local_deliver; |
2586 | rth->rt_spec_dst = fl4->daddr; | ||
2587 | } | ||
2588 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { | 2293 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { |
2589 | rth->rt_spec_dst = fl4->saddr; | ||
2590 | if (flags & RTCF_LOCAL && | 2294 | if (flags & RTCF_LOCAL && |
2591 | !(dev_out->flags & IFF_LOOPBACK)) { | 2295 | !(dev_out->flags & IFF_LOOPBACK)) { |
2592 | rth->dst.output = ip_mc_output; | 2296 | rth->dst.output = ip_mc_output; |
@@ -2605,6 +2309,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
2605 | 2309 | ||
2606 | rt_set_nexthop(rth, fl4, res, fi, type, 0); | 2310 | rt_set_nexthop(rth, fl4, res, fi, type, 0); |
2607 | 2311 | ||
2312 | if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE) | ||
2313 | rth->dst.flags |= DST_NOCACHE; | ||
2314 | |||
2608 | return rth; | 2315 | return rth; |
2609 | } | 2316 | } |
2610 | 2317 | ||
@@ -2625,6 +2332,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) | |||
2625 | int orig_oif; | 2332 | int orig_oif; |
2626 | 2333 | ||
2627 | res.fi = NULL; | 2334 | res.fi = NULL; |
2335 | res.table = NULL; | ||
2628 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 2336 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
2629 | res.r = NULL; | 2337 | res.r = NULL; |
2630 | #endif | 2338 | #endif |
@@ -2730,6 +2438,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) | |||
2730 | 2438 | ||
2731 | if (fib_lookup(net, fl4, &res)) { | 2439 | if (fib_lookup(net, fl4, &res)) { |
2732 | res.fi = NULL; | 2440 | res.fi = NULL; |
2441 | res.table = NULL; | ||
2733 | if (fl4->flowi4_oif) { | 2442 | if (fl4->flowi4_oif) { |
2734 | /* Apparently, routing tables are wrong. Assume, | 2443 | /* Apparently, routing tables are wrong. Assume, |
2735 | that the destination is on link. | 2444 | that the destination is on link. |
@@ -2828,7 +2537,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) | |||
2828 | (IPTOS_RT_MASK | RTO_ONLINK)) && | 2537 | (IPTOS_RT_MASK | RTO_ONLINK)) && |
2829 | net_eq(dev_net(rth->dst.dev), net) && | 2538 | net_eq(dev_net(rth->dst.dev), net) && |
2830 | !rt_is_expired(rth)) { | 2539 | !rt_is_expired(rth)) { |
2831 | ipv4_validate_peer(rth); | ||
2832 | dst_use(&rth->dst, jiffies); | 2540 | dst_use(&rth->dst, jiffies); |
2833 | RT_CACHE_STAT_INC(out_hit); | 2541 | RT_CACHE_STAT_INC(out_hit); |
2834 | rcu_read_unlock_bh(); | 2542 | rcu_read_unlock_bh(); |
@@ -2892,7 +2600,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or | |||
2892 | new->__use = 1; | 2600 | new->__use = 1; |
2893 | new->input = dst_discard; | 2601 | new->input = dst_discard; |
2894 | new->output = dst_discard; | 2602 | new->output = dst_discard; |
2895 | dst_copy_metrics(new, &ort->dst); | ||
2896 | 2603 | ||
2897 | new->dev = ort->dst.dev; | 2604 | new->dev = ort->dst.dev; |
2898 | if (new->dev) | 2605 | if (new->dev) |
@@ -2905,6 +2612,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or | |||
2905 | rt->rt_iif = ort->rt_iif; | 2612 | rt->rt_iif = ort->rt_iif; |
2906 | rt->rt_oif = ort->rt_oif; | 2613 | rt->rt_oif = ort->rt_oif; |
2907 | rt->rt_mark = ort->rt_mark; | 2614 | rt->rt_mark = ort->rt_mark; |
2615 | rt->rt_pmtu = ort->rt_pmtu; | ||
2908 | 2616 | ||
2909 | rt->rt_genid = rt_genid(net); | 2617 | rt->rt_genid = rt_genid(net); |
2910 | rt->rt_flags = ort->rt_flags; | 2618 | rt->rt_flags = ort->rt_flags; |
@@ -2912,10 +2620,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or | |||
2912 | rt->rt_dst = ort->rt_dst; | 2620 | rt->rt_dst = ort->rt_dst; |
2913 | rt->rt_src = ort->rt_src; | 2621 | rt->rt_src = ort->rt_src; |
2914 | rt->rt_gateway = ort->rt_gateway; | 2622 | rt->rt_gateway = ort->rt_gateway; |
2915 | rt->rt_spec_dst = ort->rt_spec_dst; | ||
2916 | rt->peer = ort->peer; | ||
2917 | if (rt->peer) | ||
2918 | atomic_inc(&rt->peer->refcnt); | ||
2919 | rt->fi = ort->fi; | 2623 | rt->fi = ort->fi; |
2920 | if (rt->fi) | 2624 | if (rt->fi) |
2921 | atomic_inc(&rt->fi->fib_clntref); | 2625 | atomic_inc(&rt->fi->fib_clntref); |
@@ -2953,8 +2657,7 @@ static int rt_fill_info(struct net *net, | |||
2953 | struct rtmsg *r; | 2657 | struct rtmsg *r; |
2954 | struct nlmsghdr *nlh; | 2658 | struct nlmsghdr *nlh; |
2955 | unsigned long expires = 0; | 2659 | unsigned long expires = 0; |
2956 | const struct inet_peer *peer = rt->peer; | 2660 | u32 error; |
2957 | u32 id = 0, ts = 0, tsage = 0, error; | ||
2958 | 2661 | ||
2959 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); | 2662 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); |
2960 | if (nlh == NULL) | 2663 | if (nlh == NULL) |
@@ -2990,10 +2693,8 @@ static int rt_fill_info(struct net *net, | |||
2990 | nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) | 2693 | nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) |
2991 | goto nla_put_failure; | 2694 | goto nla_put_failure; |
2992 | #endif | 2695 | #endif |
2993 | if (rt_is_input_route(rt)) { | 2696 | if (!rt_is_input_route(rt) && |
2994 | if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) | 2697 | rt->rt_src != rt->rt_key_src) { |
2995 | goto nla_put_failure; | ||
2996 | } else if (rt->rt_src != rt->rt_key_src) { | ||
2997 | if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) | 2698 | if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) |
2998 | goto nla_put_failure; | 2699 | goto nla_put_failure; |
2999 | } | 2700 | } |
@@ -3009,20 +2710,12 @@ static int rt_fill_info(struct net *net, | |||
3009 | goto nla_put_failure; | 2710 | goto nla_put_failure; |
3010 | 2711 | ||
3011 | error = rt->dst.error; | 2712 | error = rt->dst.error; |
3012 | if (peer) { | 2713 | expires = rt->dst.expires; |
3013 | inet_peer_refcheck(rt->peer); | 2714 | if (expires) { |
3014 | id = atomic_read(&peer->ip_id_count) & 0xffff; | 2715 | if (time_before(jiffies, expires)) |
3015 | if (peer->tcp_ts_stamp) { | 2716 | expires -= jiffies; |
3016 | ts = peer->tcp_ts; | 2717 | else |
3017 | tsage = get_seconds() - peer->tcp_ts_stamp; | 2718 | expires = 0; |
3018 | } | ||
3019 | expires = ACCESS_ONCE(peer->pmtu_expires); | ||
3020 | if (expires) { | ||
3021 | if (time_before(jiffies, expires)) | ||
3022 | expires -= jiffies; | ||
3023 | else | ||
3024 | expires = 0; | ||
3025 | } | ||
3026 | } | 2719 | } |
3027 | 2720 | ||
3028 | if (rt_is_input_route(rt)) { | 2721 | if (rt_is_input_route(rt)) { |
@@ -3051,8 +2744,7 @@ static int rt_fill_info(struct net *net, | |||
3051 | goto nla_put_failure; | 2744 | goto nla_put_failure; |
3052 | } | 2745 | } |
3053 | 2746 | ||
3054 | if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, | 2747 | if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) |
3055 | expires, error) < 0) | ||
3056 | goto nla_put_failure; | 2748 | goto nla_put_failure; |
3057 | 2749 | ||
3058 | return nlmsg_end(skb, nlh); | 2750 | return nlmsg_end(skb, nlh); |
@@ -3400,6 +3092,30 @@ static __net_initdata struct pernet_operations rt_genid_ops = { | |||
3400 | .init = rt_genid_init, | 3092 | .init = rt_genid_init, |
3401 | }; | 3093 | }; |
3402 | 3094 | ||
3095 | static int __net_init ipv4_inetpeer_init(struct net *net) | ||
3096 | { | ||
3097 | struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); | ||
3098 | |||
3099 | if (!bp) | ||
3100 | return -ENOMEM; | ||
3101 | inet_peer_base_init(bp); | ||
3102 | net->ipv4.peers = bp; | ||
3103 | return 0; | ||
3104 | } | ||
3105 | |||
3106 | static void __net_exit ipv4_inetpeer_exit(struct net *net) | ||
3107 | { | ||
3108 | struct inet_peer_base *bp = net->ipv4.peers; | ||
3109 | |||
3110 | net->ipv4.peers = NULL; | ||
3111 | inetpeer_invalidate_tree(bp); | ||
3112 | kfree(bp); | ||
3113 | } | ||
3114 | |||
3115 | static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { | ||
3116 | .init = ipv4_inetpeer_init, | ||
3117 | .exit = ipv4_inetpeer_exit, | ||
3118 | }; | ||
3403 | 3119 | ||
3404 | #ifdef CONFIG_IP_ROUTE_CLASSID | 3120 | #ifdef CONFIG_IP_ROUTE_CLASSID |
3405 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; | 3121 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; |
@@ -3480,6 +3196,7 @@ int __init ip_rt_init(void) | |||
3480 | register_pernet_subsys(&sysctl_route_ops); | 3196 | register_pernet_subsys(&sysctl_route_ops); |
3481 | #endif | 3197 | #endif |
3482 | register_pernet_subsys(&rt_genid_ops); | 3198 | register_pernet_subsys(&rt_genid_ops); |
3199 | register_pernet_subsys(&ipv4_inetpeer_ops); | ||
3483 | return rc; | 3200 | return rc; |
3484 | } | 3201 | } |
3485 | 3202 | ||
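
Note: the rt_fill_info() hunk above replaces the inet_peer-derived id/ts/tsage fields with a single expires value taken from dst.expires. That field holds an absolute jiffies deadline, while rtnl_put_cacheinfo() expects the time remaining, hence the time_before() test. A standalone sketch of the conversion (function name hypothetical):

static unsigned long demo_expires_to_remaining(unsigned long expires)
{
	if (!expires)
		return 0;			/* no expiry recorded */
	if (time_before(jiffies, expires))
		return expires - jiffies;	/* deadline still in the future */
	return 0;				/* deadline already passed */
}
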
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef32956ed655..12aa0c5867c4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -301,6 +301,13 @@ static struct ctl_table ipv4_table[] = { | |||
301 | .proc_handler = proc_dointvec | 301 | .proc_handler = proc_dointvec |
302 | }, | 302 | }, |
303 | { | 303 | { |
304 | .procname = "ip_early_demux", | ||
305 | .data = &sysctl_ip_early_demux, | ||
306 | .maxlen = sizeof(int), | ||
307 | .mode = 0644, | ||
308 | .proc_handler = proc_dointvec | ||
309 | }, | ||
310 | { | ||
304 | .procname = "ip_dynaddr", | 311 | .procname = "ip_dynaddr", |
305 | .data = &sysctl_ip_dynaddr, | 312 | .data = &sysctl_ip_dynaddr, |
306 | .maxlen = sizeof(int), | 313 | .maxlen = sizeof(int), |
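
Note: the new table entry exposes sysctl_ip_early_demux as /proc/sys/net/ipv4/ip_early_demux, so the optimization can be toggled at runtime (e.g. sysctl -w net.ipv4.ip_early_demux=0). A hedged sketch of the consumer side — the real gate sits in ip_rcv_finish() elsewhere in this series, and the helper name here is illustrative:

/* Caller is the IP input path and already holds rcu_read_lock().
 * Early demux is skipped when the skb already carries a dst or the
 * administrator has disabled the knob. */
static void demo_maybe_early_demux(struct sk_buff *skb, u8 protocol)
{
	const struct net_protocol *ipprot;

	if (!sysctl_ip_early_demux || skb_dst(skb))
		return;
	ipprot = rcu_dereference(inet_protos[protocol]);
	if (ipprot && ipprot->early_demux)
		ipprot->early_demux(skb);
}
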
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3ba605f60e4e..d902da96d154 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -3310,8 +3310,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key); | |||
3310 | 3310 | ||
3311 | #endif | 3311 | #endif |
3312 | 3312 | ||
3313 | /** | 3313 | /* Each Responder maintains up to two secret values concurrently for |
3314 | * Each Responder maintains up to two secret values concurrently for | ||
3315 | * efficient secret rollover. Each secret value has 4 states: | 3314 | * efficient secret rollover. Each secret value has 4 states: |
3316 | * | 3315 | * |
3317 | * Generating. (tcp_secret_generating != tcp_secret_primary) | 3316 | * Generating. (tcp_secret_generating != tcp_secret_primary) |
@@ -3563,6 +3562,8 @@ void __init tcp_init(void) | |||
3563 | pr_info("Hash tables configured (established %u bind %u)\n", | 3562 | pr_info("Hash tables configured (established %u bind %u)\n", |
3564 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); | 3563 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); |
3565 | 3564 | ||
3565 | tcp_metrics_init(); | ||
3566 | |||
3566 | tcp_register_congestion_control(&tcp_reno); | 3567 | tcp_register_congestion_control(&tcp_reno); |
3567 | 3568 | ||
3568 | memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); | 3569 | memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); |
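
Note: the first tcp.c hunk demotes the comment opener from /** to /*; in kernel style /** introduces kernel-doc, and this block documents the cookie-secret state machine rather than a function. The second hunk wires the new metrics cache into boot. A compressed sketch of the resulting init order (names are the real ones from the hunks; unrelated steps elided):

void __init tcp_init(void)
{
	/* ... hash tables sized and reported ... */
	tcp_metrics_init();	/* registers tcp_net_metrics_ops, see tcp_metrics.c below */
	tcp_register_congestion_control(&tcp_reno);
	/* ... syncookie secret state zeroed ... */
}
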
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8b..055ac49b8b40 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly; | |||
93 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; | 93 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
94 | int sysctl_tcp_frto __read_mostly = 2; | 94 | int sysctl_tcp_frto __read_mostly = 2; |
95 | int sysctl_tcp_frto_response __read_mostly; | 95 | int sysctl_tcp_frto_response __read_mostly; |
96 | int sysctl_tcp_nometrics_save __read_mostly; | ||
97 | 96 | ||
98 | int sysctl_tcp_thin_dupack __read_mostly; | 97 | int sysctl_tcp_thin_dupack __read_mostly; |
99 | 98 | ||
@@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
701 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 700 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
702 | * routine referred to above. | 701 | * routine referred to above. |
703 | */ | 702 | */ |
704 | static inline void tcp_set_rto(struct sock *sk) | 703 | void tcp_set_rto(struct sock *sk) |
705 | { | 704 | { |
706 | const struct tcp_sock *tp = tcp_sk(sk); | 705 | const struct tcp_sock *tp = tcp_sk(sk); |
707 | /* Old crap is replaced with new one. 8) | 706 | /* Old crap is replaced with new one. 8) |
@@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk) | |||
728 | tcp_bound_rto(sk); | 727 | tcp_bound_rto(sk); |
729 | } | 728 | } |
730 | 729 | ||
731 | /* Save metrics learned by this TCP session. | ||
732 | This function is called only, when TCP finishes successfully | ||
733 | i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. | ||
734 | */ | ||
735 | void tcp_update_metrics(struct sock *sk) | ||
736 | { | ||
737 | struct tcp_sock *tp = tcp_sk(sk); | ||
738 | struct dst_entry *dst = __sk_dst_get(sk); | ||
739 | |||
740 | if (sysctl_tcp_nometrics_save) | ||
741 | return; | ||
742 | |||
743 | dst_confirm(dst); | ||
744 | |||
745 | if (dst && (dst->flags & DST_HOST)) { | ||
746 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
747 | int m; | ||
748 | unsigned long rtt; | ||
749 | |||
750 | if (icsk->icsk_backoff || !tp->srtt) { | ||
751 | /* This session failed to estimate rtt. Why? | ||
752 | * Probably, no packets returned in time. | ||
753 | * Reset our results. | ||
754 | */ | ||
755 | if (!(dst_metric_locked(dst, RTAX_RTT))) | ||
756 | dst_metric_set(dst, RTAX_RTT, 0); | ||
757 | return; | ||
758 | } | ||
759 | |||
760 | rtt = dst_metric_rtt(dst, RTAX_RTT); | ||
761 | m = rtt - tp->srtt; | ||
762 | |||
763 | /* If newly calculated rtt larger than stored one, | ||
764 | * store new one. Otherwise, use EWMA. Remember, | ||
765 | * rtt overestimation is always better than underestimation. | ||
766 | */ | ||
767 | if (!(dst_metric_locked(dst, RTAX_RTT))) { | ||
768 | if (m <= 0) | ||
769 | set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); | ||
770 | else | ||
771 | set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); | ||
772 | } | ||
773 | |||
774 | if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { | ||
775 | unsigned long var; | ||
776 | if (m < 0) | ||
777 | m = -m; | ||
778 | |||
779 | /* Scale deviation to rttvar fixed point */ | ||
780 | m >>= 1; | ||
781 | if (m < tp->mdev) | ||
782 | m = tp->mdev; | ||
783 | |||
784 | var = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
785 | if (m >= var) | ||
786 | var = m; | ||
787 | else | ||
788 | var -= (var - m) >> 2; | ||
789 | |||
790 | set_dst_metric_rtt(dst, RTAX_RTTVAR, var); | ||
791 | } | ||
792 | |||
793 | if (tcp_in_initial_slowstart(tp)) { | ||
794 | /* Slow start still did not finish. */ | ||
795 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
796 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
797 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) | ||
798 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); | ||
799 | if (!dst_metric_locked(dst, RTAX_CWND) && | ||
800 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | ||
801 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); | ||
802 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | ||
803 | icsk->icsk_ca_state == TCP_CA_Open) { | ||
804 | /* Cong. avoidance phase, cwnd is reliable. */ | ||
805 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | ||
806 | dst_metric_set(dst, RTAX_SSTHRESH, | ||
807 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | ||
808 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
809 | dst_metric_set(dst, RTAX_CWND, | ||
810 | (dst_metric(dst, RTAX_CWND) + | ||
811 | tp->snd_cwnd) >> 1); | ||
812 | } else { | ||
813 | /* Else slow start did not finish, cwnd is non-sense, | ||
814 | ssthresh may be also invalid. | ||
815 | */ | ||
816 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
817 | dst_metric_set(dst, RTAX_CWND, | ||
818 | (dst_metric(dst, RTAX_CWND) + | ||
819 | tp->snd_ssthresh) >> 1); | ||
820 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
821 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
822 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) | ||
823 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); | ||
824 | } | ||
825 | |||
826 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { | ||
827 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && | ||
828 | tp->reordering != sysctl_tcp_reordering) | ||
829 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); | ||
830 | } | ||
831 | } | ||
832 | } | ||
833 | |||
834 | __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) | 730 | __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) |
835 | { | 731 | { |
836 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); | 732 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); |
@@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | |||
867 | * Packet counting of FACK is based on in-order assumptions, therefore TCP | 763 | * Packet counting of FACK is based on in-order assumptions, therefore TCP |
868 | * disables it when reordering is detected | 764 | * disables it when reordering is detected |
869 | */ | 765 | */ |
870 | static void tcp_disable_fack(struct tcp_sock *tp) | 766 | void tcp_disable_fack(struct tcp_sock *tp) |
871 | { | 767 | { |
872 | /* RFC3517 uses different metric in lost marker => reset on change */ | 768 | /* RFC3517 uses different metric in lost marker => reset on change */ |
873 | if (tcp_is_fack(tp)) | 769 | if (tcp_is_fack(tp)) |
@@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp) | |||
881 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; | 777 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; |
882 | } | 778 | } |
883 | 779 | ||
884 | /* Initialize metrics on socket. */ | ||
885 | |||
886 | static void tcp_init_metrics(struct sock *sk) | ||
887 | { | ||
888 | struct tcp_sock *tp = tcp_sk(sk); | ||
889 | struct dst_entry *dst = __sk_dst_get(sk); | ||
890 | |||
891 | if (dst == NULL) | ||
892 | goto reset; | ||
893 | |||
894 | dst_confirm(dst); | ||
895 | |||
896 | if (dst_metric_locked(dst, RTAX_CWND)) | ||
897 | tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); | ||
898 | if (dst_metric(dst, RTAX_SSTHRESH)) { | ||
899 | tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); | ||
900 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | ||
901 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | ||
902 | } else { | ||
903 | /* ssthresh may have been reduced unnecessarily during. | ||
904 | * 3WHS. Restore it back to its initial default. | ||
905 | */ | ||
906 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
907 | } | ||
908 | if (dst_metric(dst, RTAX_REORDERING) && | ||
909 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { | ||
910 | tcp_disable_fack(tp); | ||
911 | tcp_disable_early_retrans(tp); | ||
912 | tp->reordering = dst_metric(dst, RTAX_REORDERING); | ||
913 | } | ||
914 | |||
915 | if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) | ||
916 | goto reset; | ||
917 | |||
918 | /* Initial rtt is determined from SYN,SYN-ACK. | ||
919 | * The segment is small and rtt may appear much | ||
920 | * less than real one. Use per-dst memory | ||
921 | * to make it more realistic. | ||
922 | * | ||
923 | * A bit of theory. RTT is time passed after "normal" sized packet | ||
924 | * is sent until it is ACKed. In normal circumstances sending small | ||
925 | * packets force peer to delay ACKs and calculation is correct too. | ||
926 | * The algorithm is adaptive and, provided we follow specs, it | ||
927 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | ||
928 | * tricks sort of "quick acks" for time long enough to decrease RTT | ||
929 | * to low value, and then abruptly stops to do it and starts to delay | ||
930 | * ACKs, wait for troubles. | ||
931 | */ | ||
932 | if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { | ||
933 | tp->srtt = dst_metric_rtt(dst, RTAX_RTT); | ||
934 | tp->rtt_seq = tp->snd_nxt; | ||
935 | } | ||
936 | if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { | ||
937 | tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
938 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
939 | } | ||
940 | tcp_set_rto(sk); | ||
941 | reset: | ||
942 | if (tp->srtt == 0) { | ||
943 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | ||
944 | * 3WHS. This is most likely due to retransmission, | ||
945 | * including spurious one. Reset the RTO back to 3secs | ||
946 | * from the more aggressive 1sec to avoid more spurious | ||
947 | * retransmission. | ||
948 | */ | ||
949 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | ||
950 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | ||
951 | } | ||
952 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | ||
953 | * retransmitted. In light of RFC6298 more aggressive 1sec | ||
954 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | ||
955 | * retransmission has occurred. | ||
956 | */ | ||
957 | if (tp->total_retrans > 1) | ||
958 | tp->snd_cwnd = 1; | ||
959 | else | ||
960 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
961 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
962 | } | ||
963 | |||
964 | static void tcp_update_reordering(struct sock *sk, const int metric, | 780 | static void tcp_update_reordering(struct sock *sk, const int metric, |
965 | const int ts) | 781 | const int ts) |
966 | { | 782 | { |
@@ -3869,9 +3685,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3869 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3685 | tcp_cong_avoid(sk, ack, prior_in_flight); |
3870 | } | 3686 | } |
3871 | 3687 | ||
3872 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) | 3688 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { |
3873 | dst_confirm(__sk_dst_get(sk)); | 3689 | struct dst_entry *dst = __sk_dst_get(sk); |
3874 | 3690 | if (dst) | |
3691 | dst_confirm(dst); | ||
3692 | } | ||
3875 | return 1; | 3693 | return 1; |
3876 | 3694 | ||
3877 | no_queue: | 3695 | no_queue: |
@@ -5518,6 +5336,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5518 | struct tcp_sock *tp = tcp_sk(sk); | 5336 | struct tcp_sock *tp = tcp_sk(sk); |
5519 | int res; | 5337 | int res; |
5520 | 5338 | ||
5339 | if (sk->sk_rx_dst) { | ||
5340 | struct dst_entry *dst = sk->sk_rx_dst; | ||
5341 | if (unlikely(dst->obsolete)) { | ||
5342 | if (dst->ops->check(dst, 0) == NULL) { | ||
5343 | dst_release(dst); | ||
5344 | sk->sk_rx_dst = NULL; | ||
5345 | } | ||
5346 | } | ||
5347 | } | ||
5348 | if (unlikely(sk->sk_rx_dst == NULL)) | ||
5349 | sk->sk_rx_dst = dst_clone(skb_dst(skb)); | ||
5350 | |||
5521 | /* | 5351 | /* |
5522 | * Header prediction. | 5352 | * Header prediction. |
5523 | * The code loosely follows the one in the famous | 5353 | * The code loosely follows the one in the famous |
@@ -5729,8 +5559,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) | |||
5729 | 5559 | ||
5730 | tcp_set_state(sk, TCP_ESTABLISHED); | 5560 | tcp_set_state(sk, TCP_ESTABLISHED); |
5731 | 5561 | ||
5732 | if (skb != NULL) | 5562 | if (skb != NULL) { |
5563 | sk->sk_rx_dst = dst_clone(skb_dst(skb)); | ||
5733 | security_inet_conn_established(sk, skb); | 5564 | security_inet_conn_established(sk, skb); |
5565 | } | ||
5734 | 5566 | ||
5735 | /* Make sure socket is routed, for correct metrics. */ | 5567 | /* Make sure socket is routed, for correct metrics. */ |
5736 | icsk->icsk_af_ops->rebuild_header(sk); | 5568 | icsk->icsk_af_ops->rebuild_header(sk); |
@@ -6126,9 +5958,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6126 | 5958 | ||
6127 | case TCP_FIN_WAIT1: | 5959 | case TCP_FIN_WAIT1: |
6128 | if (tp->snd_una == tp->write_seq) { | 5960 | if (tp->snd_una == tp->write_seq) { |
5961 | struct dst_entry *dst; | ||
5962 | |||
6129 | tcp_set_state(sk, TCP_FIN_WAIT2); | 5963 | tcp_set_state(sk, TCP_FIN_WAIT2); |
6130 | sk->sk_shutdown |= SEND_SHUTDOWN; | 5964 | sk->sk_shutdown |= SEND_SHUTDOWN; |
6131 | dst_confirm(__sk_dst_get(sk)); | 5965 | |
5966 | dst = __sk_dst_get(sk); | ||
5967 | if (dst) | ||
5968 | dst_confirm(dst); | ||
6132 | 5969 | ||
6133 | if (!sock_flag(sk, SOCK_DEAD)) | 5970 | if (!sock_flag(sk, SOCK_DEAD)) |
6134 | /* Wake up lingering close() */ | 5971 | /* Wake up lingering close() */ |
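
Note: tcp_update_metrics() and tcp_init_metrics() are not deleted outright; they move, nearly verbatim, into the new net/ipv4/tcp_metrics.c below (which is also why tcp_set_rto() and tcp_disable_fack() lose their static). The RTT smoothing they carry is a 1/8-gain EWMA with an asymmetric twist; a minimal standalone sketch (hypothetical name, plain C):

/* Keep 7/8 of the cached RTT and move 1/8 of the way toward the
 * session's srtt -- unless the session measured a *larger* RTT,
 * which is taken wholesale, since overestimating RTT is safer than
 * underestimating it. E.g. cached 80 ms, srtt 40 ms: m = 40,
 * new value 80 - 40/8 = 75 ms. */
static unsigned long demo_rtt_ewma(unsigned long cached_rtt,
				   unsigned long session_srtt)
{
	long m = cached_rtt - session_srtt;

	if (m <= 0)
		return session_srtt;	/* larger sample wins outright */
	return cached_rtt - (m >> 3);	/* EWMA, gain 1/8 */
}
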
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c8d28c433b2b..ddefd39ac0cf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
209 | } | 209 | } |
210 | 210 | ||
211 | if (tcp_death_row.sysctl_tw_recycle && | 211 | if (tcp_death_row.sysctl_tw_recycle && |
212 | !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { | 212 | !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) |
213 | struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); | 213 | tcp_fetch_timewait_stamp(sk, &rt->dst); |
214 | /* | ||
215 | * VJ's idea. We save last timestamp seen from | ||
216 | * the destination in peer table, when entering state | ||
217 | * TIME-WAIT * and initialize rx_opt.ts_recent from it, | ||
218 | * when trying new connection. | ||
219 | */ | ||
220 | if (peer) { | ||
221 | inet_peer_refcheck(peer); | ||
222 | if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { | ||
223 | tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; | ||
224 | tp->rx_opt.ts_recent = peer->tcp_ts; | ||
225 | } | ||
226 | } | ||
227 | } | ||
228 | 214 | ||
229 | inet->inet_dport = usin->sin_port; | 215 | inet->inet_dport = usin->sin_port; |
230 | inet->inet_daddr = daddr; | 216 | inet->inet_daddr = daddr; |
@@ -698,8 +684,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
698 | 684 | ||
699 | net = dev_net(skb_dst(skb)->dev); | 685 | net = dev_net(skb_dst(skb)->dev); |
700 | arg.tos = ip_hdr(skb)->tos; | 686 | arg.tos = ip_hdr(skb)->tos; |
701 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, | 687 | ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, |
702 | &arg, arg.iov[0].iov_len); | 688 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); |
703 | 689 | ||
704 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 690 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
705 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); | 691 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); |
@@ -781,8 +767,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
781 | if (oif) | 767 | if (oif) |
782 | arg.bound_dev_if = oif; | 768 | arg.bound_dev_if = oif; |
783 | arg.tos = tos; | 769 | arg.tos = tos; |
784 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, | 770 | ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, |
785 | &arg, arg.iov[0].iov_len); | 771 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); |
786 | 772 | ||
787 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 773 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
788 | } | 774 | } |
@@ -825,7 +811,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
825 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | 811 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, |
826 | struct request_sock *req, | 812 | struct request_sock *req, |
827 | struct request_values *rvp, | 813 | struct request_values *rvp, |
828 | u16 queue_mapping) | 814 | u16 queue_mapping, |
815 | bool nocache) | ||
829 | { | 816 | { |
830 | const struct inet_request_sock *ireq = inet_rsk(req); | 817 | const struct inet_request_sock *ireq = inet_rsk(req); |
831 | struct flowi4 fl4; | 818 | struct flowi4 fl4; |
@@ -833,7 +820,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
833 | struct sk_buff * skb; | 820 | struct sk_buff * skb; |
834 | 821 | ||
835 | /* First, grab a route. */ | 822 | /* First, grab a route. */ |
836 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) | 823 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL) |
837 | return -1; | 824 | return -1; |
838 | 825 | ||
839 | skb = tcp_make_synack(sk, dst, req, rvp); | 826 | skb = tcp_make_synack(sk, dst, req, rvp); |
@@ -848,7 +835,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
848 | err = net_xmit_eval(err); | 835 | err = net_xmit_eval(err); |
849 | } | 836 | } |
850 | 837 | ||
851 | dst_release(dst); | ||
852 | return err; | 838 | return err; |
853 | } | 839 | } |
854 | 840 | ||
@@ -856,7 +842,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, | |||
856 | struct request_values *rvp) | 842 | struct request_values *rvp) |
857 | { | 843 | { |
858 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); | 844 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
859 | return tcp_v4_send_synack(sk, NULL, req, rvp, 0); | 845 | return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); |
860 | } | 846 | } |
861 | 847 | ||
862 | /* | 848 | /* |
@@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1375 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); | 1361 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); |
1376 | req->cookie_ts = tmp_opt.tstamp_ok; | 1362 | req->cookie_ts = tmp_opt.tstamp_ok; |
1377 | } else if (!isn) { | 1363 | } else if (!isn) { |
1378 | struct inet_peer *peer = NULL; | ||
1379 | struct flowi4 fl4; | 1364 | struct flowi4 fl4; |
1380 | 1365 | ||
1381 | /* VJ's idea. We save last timestamp seen | 1366 | /* VJ's idea. We save last timestamp seen |
@@ -1389,13 +1374,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1389 | */ | 1374 | */ |
1390 | if (tmp_opt.saw_tstamp && | 1375 | if (tmp_opt.saw_tstamp && |
1391 | tcp_death_row.sysctl_tw_recycle && | 1376 | tcp_death_row.sysctl_tw_recycle && |
1392 | (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && | 1377 | (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && |
1393 | fl4.daddr == saddr && | 1378 | fl4.daddr == saddr) { |
1394 | (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { | 1379 | if (!tcp_peer_is_proven(req, dst, true)) { |
1395 | inet_peer_refcheck(peer); | ||
1396 | if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && | ||
1397 | (s32)(peer->tcp_ts - req->ts_recent) > | ||
1398 | TCP_PAWS_WINDOW) { | ||
1399 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); | 1380 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); |
1400 | goto drop_and_release; | 1381 | goto drop_and_release; |
1401 | } | 1382 | } |
@@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1404 | else if (!sysctl_tcp_syncookies && | 1385 | else if (!sysctl_tcp_syncookies && |
1405 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < | 1386 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
1406 | (sysctl_max_syn_backlog >> 2)) && | 1387 | (sysctl_max_syn_backlog >> 2)) && |
1407 | (!peer || !peer->tcp_ts_stamp) && | 1388 | !tcp_peer_is_proven(req, dst, false)) { |
1408 | (!dst || !dst_metric(dst, RTAX_RTT))) { | ||
1409 | /* Without syncookies last quarter of | 1389 | /* Without syncookies last quarter of |
1410 | * backlog is filled with destinations, | 1390 | * backlog is filled with destinations, |
1411 | * proven to be alive. | 1391 | * proven to be alive. |
@@ -1425,7 +1405,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1425 | 1405 | ||
1426 | if (tcp_v4_send_synack(sk, dst, req, | 1406 | if (tcp_v4_send_synack(sk, dst, req, |
1427 | (struct request_values *)&tmp_ext, | 1407 | (struct request_values *)&tmp_ext, |
1428 | skb_get_queue_mapping(skb)) || | 1408 | skb_get_queue_mapping(skb), |
1409 | want_cookie) || | ||
1429 | want_cookie) | 1410 | want_cookie) |
1430 | goto drop_and_free; | 1411 | goto drop_and_free; |
1431 | 1412 | ||
@@ -1672,6 +1653,51 @@ csum_err: | |||
1672 | } | 1653 | } |
1673 | EXPORT_SYMBOL(tcp_v4_do_rcv); | 1654 | EXPORT_SYMBOL(tcp_v4_do_rcv); |
1674 | 1655 | ||
1656 | void tcp_v4_early_demux(struct sk_buff *skb) | ||
1657 | { | ||
1658 | struct net *net = dev_net(skb->dev); | ||
1659 | const struct iphdr *iph; | ||
1660 | const struct tcphdr *th; | ||
1661 | struct net_device *dev; | ||
1662 | struct sock *sk; | ||
1663 | |||
1664 | if (skb->pkt_type != PACKET_HOST) | ||
1665 | return; | ||
1666 | |||
1667 | if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) | ||
1668 | return; | ||
1669 | |||
1670 | iph = ip_hdr(skb); | ||
1671 | th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); | ||
1672 | |||
1673 | if (th->doff < sizeof(struct tcphdr) / 4) | ||
1674 | return; | ||
1675 | |||
1676 | if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) | ||
1677 | return; | ||
1678 | |||
1679 | dev = skb->dev; | ||
1680 | sk = __inet_lookup_established(net, &tcp_hashinfo, | ||
1681 | iph->saddr, th->source, | ||
1682 | iph->daddr, ntohs(th->dest), | ||
1683 | dev->ifindex); | ||
1684 | if (sk) { | ||
1685 | skb->sk = sk; | ||
1686 | skb->destructor = sock_edemux; | ||
1687 | if (sk->sk_state != TCP_TIME_WAIT) { | ||
1688 | struct dst_entry *dst = sk->sk_rx_dst; | ||
1689 | if (dst) | ||
1690 | dst = dst_check(dst, 0); | ||
1691 | if (dst) { | ||
1692 | struct rtable *rt = (struct rtable *) dst; | ||
1693 | |||
1694 | if (rt->rt_iif == dev->ifindex) | ||
1695 | skb_dst_set_noref(skb, dst); | ||
1696 | } | ||
1697 | } | ||
1698 | } | ||
1699 | } | ||
1700 | |||
1675 | /* | 1701 | /* |
1676 | * From tcp_input.c | 1702 | * From tcp_input.c |
1677 | */ | 1703 | */ |
@@ -1821,40 +1847,10 @@ do_time_wait: | |||
1821 | goto discard_it; | 1847 | goto discard_it; |
1822 | } | 1848 | } |
1823 | 1849 | ||
1824 | struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) | ||
1825 | { | ||
1826 | struct rtable *rt = (struct rtable *) __sk_dst_get(sk); | ||
1827 | struct inet_sock *inet = inet_sk(sk); | ||
1828 | struct inet_peer *peer; | ||
1829 | |||
1830 | if (!rt || | ||
1831 | inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { | ||
1832 | peer = inet_getpeer_v4(inet->inet_daddr, 1); | ||
1833 | *release_it = true; | ||
1834 | } else { | ||
1835 | if (!rt->peer) | ||
1836 | rt_bind_peer(rt, inet->inet_daddr, 1); | ||
1837 | peer = rt->peer; | ||
1838 | *release_it = false; | ||
1839 | } | ||
1840 | |||
1841 | return peer; | ||
1842 | } | ||
1843 | EXPORT_SYMBOL(tcp_v4_get_peer); | ||
1844 | |||
1845 | void *tcp_v4_tw_get_peer(struct sock *sk) | ||
1846 | { | ||
1847 | const struct inet_timewait_sock *tw = inet_twsk(sk); | ||
1848 | |||
1849 | return inet_getpeer_v4(tw->tw_daddr, 1); | ||
1850 | } | ||
1851 | EXPORT_SYMBOL(tcp_v4_tw_get_peer); | ||
1852 | |||
1853 | static struct timewait_sock_ops tcp_timewait_sock_ops = { | 1850 | static struct timewait_sock_ops tcp_timewait_sock_ops = { |
1854 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | 1851 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), |
1855 | .twsk_unique = tcp_twsk_unique, | 1852 | .twsk_unique = tcp_twsk_unique, |
1856 | .twsk_destructor= tcp_twsk_destructor, | 1853 | .twsk_destructor= tcp_twsk_destructor, |
1857 | .twsk_getpeer = tcp_v4_tw_get_peer, | ||
1858 | }; | 1854 | }; |
1859 | 1855 | ||
1860 | const struct inet_connection_sock_af_ops ipv4_specific = { | 1856 | const struct inet_connection_sock_af_ops ipv4_specific = { |
@@ -1863,7 +1859,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { | |||
1863 | .rebuild_header = inet_sk_rebuild_header, | 1859 | .rebuild_header = inet_sk_rebuild_header, |
1864 | .conn_request = tcp_v4_conn_request, | 1860 | .conn_request = tcp_v4_conn_request, |
1865 | .syn_recv_sock = tcp_v4_syn_recv_sock, | 1861 | .syn_recv_sock = tcp_v4_syn_recv_sock, |
1866 | .get_peer = tcp_v4_get_peer, | ||
1867 | .net_header_len = sizeof(struct iphdr), | 1862 | .net_header_len = sizeof(struct iphdr), |
1868 | .setsockopt = ip_setsockopt, | 1863 | .setsockopt = ip_setsockopt, |
1869 | .getsockopt = ip_getsockopt, | 1864 | .getsockopt = ip_getsockopt, |
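
Note: with tcp_v4_get_peer() and tcp_v4_tw_get_peer() gone, this file now calls into the tcp_metrics layer added below. A reader's crib of the declarations those cross-file calls imply — the real ones live in include/net/tcp.h; this block is inferred from the definitions in this diff, not copied from the header:

void tcp_metrics_init(void);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
			bool paws_check);
bool tcp_remember_stamp(struct sock *sk);
bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
void tcp_v4_early_demux(struct sk_buff *skb);
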
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c new file mode 100644 index 000000000000..1fd83d3118fe --- /dev/null +++ b/net/ipv4/tcp_metrics.c | |||
@@ -0,0 +1,697 @@ | |||
1 | #include <linux/rcupdate.h> | ||
2 | #include <linux/spinlock.h> | ||
3 | #include <linux/jiffies.h> | ||
4 | #include <linux/bootmem.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/cache.h> | ||
7 | #include <linux/slab.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/tcp.h> | ||
10 | |||
11 | #include <net/inet_connection_sock.h> | ||
12 | #include <net/net_namespace.h> | ||
13 | #include <net/request_sock.h> | ||
14 | #include <net/inetpeer.h> | ||
15 | #include <net/sock.h> | ||
16 | #include <net/ipv6.h> | ||
17 | #include <net/dst.h> | ||
18 | #include <net/tcp.h> | ||
19 | |||
20 | int sysctl_tcp_nometrics_save __read_mostly; | ||
21 | |||
22 | enum tcp_metric_index { | ||
23 | TCP_METRIC_RTT, | ||
24 | TCP_METRIC_RTTVAR, | ||
25 | TCP_METRIC_SSTHRESH, | ||
26 | TCP_METRIC_CWND, | ||
27 | TCP_METRIC_REORDERING, | ||
28 | |||
29 | /* Always last. */ | ||
30 | TCP_METRIC_MAX, | ||
31 | }; | ||
32 | |||
33 | struct tcp_metrics_block { | ||
34 | struct tcp_metrics_block __rcu *tcpm_next; | ||
35 | struct inetpeer_addr tcpm_addr; | ||
36 | unsigned long tcpm_stamp; | ||
37 | u32 tcpm_ts; | ||
38 | u32 tcpm_ts_stamp; | ||
39 | u32 tcpm_lock; | ||
40 | u32 tcpm_vals[TCP_METRIC_MAX]; | ||
41 | }; | ||
42 | |||
43 | static bool tcp_metric_locked(struct tcp_metrics_block *tm, | ||
44 | enum tcp_metric_index idx) | ||
45 | { | ||
46 | return tm->tcpm_lock & (1 << idx); | ||
47 | } | ||
48 | |||
49 | static u32 tcp_metric_get(struct tcp_metrics_block *tm, | ||
50 | enum tcp_metric_index idx) | ||
51 | { | ||
52 | return tm->tcpm_vals[idx]; | ||
53 | } | ||
54 | |||
55 | static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, | ||
56 | enum tcp_metric_index idx) | ||
57 | { | ||
58 | return msecs_to_jiffies(tm->tcpm_vals[idx]); | ||
59 | } | ||
60 | |||
61 | static void tcp_metric_set(struct tcp_metrics_block *tm, | ||
62 | enum tcp_metric_index idx, | ||
63 | u32 val) | ||
64 | { | ||
65 | tm->tcpm_vals[idx] = val; | ||
66 | } | ||
67 | |||
68 | static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, | ||
69 | enum tcp_metric_index idx, | ||
70 | u32 val) | ||
71 | { | ||
72 | tm->tcpm_vals[idx] = jiffies_to_msecs(val); | ||
73 | } | ||
74 | |||
75 | static bool addr_same(const struct inetpeer_addr *a, | ||
76 | const struct inetpeer_addr *b) | ||
77 | { | ||
78 | const struct in6_addr *a6, *b6; | ||
79 | |||
80 | if (a->family != b->family) | ||
81 | return false; | ||
82 | if (a->family == AF_INET) | ||
83 | return a->addr.a4 == b->addr.a4; | ||
84 | |||
85 | a6 = (const struct in6_addr *) &a->addr.a6[0]; | ||
86 | b6 = (const struct in6_addr *) &b->addr.a6[0]; | ||
87 | |||
88 | return ipv6_addr_equal(a6, b6); | ||
89 | } | ||
90 | |||
91 | struct tcpm_hash_bucket { | ||
92 | struct tcp_metrics_block __rcu *chain; | ||
93 | }; | ||
94 | |||
95 | static DEFINE_SPINLOCK(tcp_metrics_lock); | ||
96 | |||
97 | static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) | ||
98 | { | ||
99 | u32 val; | ||
100 | |||
101 | val = 0; | ||
102 | if (dst_metric_locked(dst, RTAX_RTT)) | ||
103 | val |= 1 << TCP_METRIC_RTT; | ||
104 | if (dst_metric_locked(dst, RTAX_RTTVAR)) | ||
105 | val |= 1 << TCP_METRIC_RTTVAR; | ||
106 | if (dst_metric_locked(dst, RTAX_SSTHRESH)) | ||
107 | val |= 1 << TCP_METRIC_SSTHRESH; | ||
108 | if (dst_metric_locked(dst, RTAX_CWND)) | ||
109 | val |= 1 << TCP_METRIC_CWND; | ||
110 | if (dst_metric_locked(dst, RTAX_REORDERING)) | ||
111 | val |= 1 << TCP_METRIC_REORDERING; | ||
112 | tm->tcpm_lock = val; | ||
113 | |||
114 | tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); | ||
115 | tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); | ||
116 | tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); | ||
117 | tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); | ||
118 | tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); | ||
119 | tm->tcpm_ts = 0; | ||
120 | tm->tcpm_ts_stamp = 0; | ||
121 | } | ||
122 | |||
123 | static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, | ||
124 | struct inetpeer_addr *addr, | ||
125 | unsigned int hash, | ||
126 | bool reclaim) | ||
127 | { | ||
128 | struct tcp_metrics_block *tm; | ||
129 | struct net *net; | ||
130 | |||
131 | spin_lock_bh(&tcp_metrics_lock); | ||
132 | net = dev_net(dst->dev); | ||
133 | if (unlikely(reclaim)) { | ||
134 | struct tcp_metrics_block *oldest; | ||
135 | |||
136 | oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); | ||
137 | for (tm = rcu_dereference(oldest->tcpm_next); tm; | ||
138 | tm = rcu_dereference(tm->tcpm_next)) { | ||
139 | if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) | ||
140 | oldest = tm; | ||
141 | } | ||
142 | tm = oldest; | ||
143 | } else { | ||
144 | tm = kmalloc(sizeof(*tm), GFP_ATOMIC); | ||
145 | if (!tm) | ||
146 | goto out_unlock; | ||
147 | } | ||
148 | tm->tcpm_addr = *addr; | ||
149 | tm->tcpm_stamp = jiffies; | ||
150 | |||
151 | tcpm_suck_dst(tm, dst); | ||
152 | |||
153 | if (likely(!reclaim)) { | ||
154 | tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; | ||
155 | rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); | ||
156 | } | ||
157 | |||
158 | out_unlock: | ||
159 | spin_unlock_bh(&tcp_metrics_lock); | ||
160 | return tm; | ||
161 | } | ||
162 | |||
163 | #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) | ||
164 | |||
165 | static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) | ||
166 | { | ||
167 | if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) | ||
168 | tcpm_suck_dst(tm, dst); | ||
169 | } | ||
170 | |||
171 | #define TCP_METRICS_RECLAIM_DEPTH 5 | ||
172 | #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL | ||
173 | |||
174 | static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) | ||
175 | { | ||
176 | if (tm) | ||
177 | return tm; | ||
178 | if (depth > TCP_METRICS_RECLAIM_DEPTH) | ||
179 | return TCP_METRICS_RECLAIM_PTR; | ||
180 | return NULL; | ||
181 | } | ||
182 | |||
183 | static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, | ||
184 | struct net *net, unsigned int hash) | ||
185 | { | ||
186 | struct tcp_metrics_block *tm; | ||
187 | int depth = 0; | ||
188 | |||
189 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
190 | tm = rcu_dereference(tm->tcpm_next)) { | ||
191 | if (addr_same(&tm->tcpm_addr, addr)) | ||
192 | break; | ||
193 | depth++; | ||
194 | } | ||
195 | return tcp_get_encode(tm, depth); | ||
196 | } | ||
197 | |||
198 | static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, | ||
199 | struct dst_entry *dst) | ||
200 | { | ||
201 | struct tcp_metrics_block *tm; | ||
202 | struct inetpeer_addr addr; | ||
203 | unsigned int hash; | ||
204 | struct net *net; | ||
205 | |||
206 | addr.family = req->rsk_ops->family; | ||
207 | switch (addr.family) { | ||
208 | case AF_INET: | ||
209 | addr.addr.a4 = inet_rsk(req)->rmt_addr; | ||
210 | hash = (__force unsigned int) addr.addr.a4; | ||
211 | break; | ||
212 | case AF_INET6: | ||
213 | *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; | ||
214 | hash = ((__force unsigned int) addr.addr.a6[0] ^ | ||
215 | (__force unsigned int) addr.addr.a6[1] ^ | ||
216 | (__force unsigned int) addr.addr.a6[2] ^ | ||
217 | (__force unsigned int) addr.addr.a6[3]); | ||
218 | break; | ||
219 | default: | ||
220 | return NULL; | ||
221 | } | ||
222 | |||
223 | hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); | ||
224 | |||
225 | net = dev_net(dst->dev); | ||
226 | hash &= net->ipv4.tcp_metrics_hash_mask; | ||
227 | |||
228 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
229 | tm = rcu_dereference(tm->tcpm_next)) { | ||
230 | if (addr_same(&tm->tcpm_addr, &addr)) | ||
231 | break; | ||
232 | } | ||
233 | tcpm_check_stamp(tm, dst); | ||
234 | return tm; | ||
235 | } | ||
236 | |||
237 | static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) | ||
238 | { | ||
239 | struct inet6_timewait_sock *tw6; | ||
240 | struct tcp_metrics_block *tm; | ||
241 | struct inetpeer_addr addr; | ||
242 | unsigned int hash; | ||
243 | struct net *net; | ||
244 | |||
245 | addr.family = tw->tw_family; | ||
246 | switch (addr.family) { | ||
247 | case AF_INET: | ||
248 | addr.addr.a4 = tw->tw_daddr; | ||
249 | hash = (__force unsigned int) addr.addr.a4; | ||
250 | break; | ||
251 | case AF_INET6: | ||
252 | tw6 = inet6_twsk((struct sock *)tw); | ||
253 | *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; | ||
254 | hash = ((__force unsigned int) addr.addr.a6[0] ^ | ||
255 | (__force unsigned int) addr.addr.a6[1] ^ | ||
256 | (__force unsigned int) addr.addr.a6[2] ^ | ||
257 | (__force unsigned int) addr.addr.a6[3]); | ||
258 | break; | ||
259 | default: | ||
260 | return NULL; | ||
261 | } | ||
262 | |||
263 | hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); | ||
264 | |||
265 | net = twsk_net(tw); | ||
266 | hash &= net->ipv4.tcp_metrics_hash_mask; | ||
267 | |||
268 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
269 | tm = rcu_dereference(tm->tcpm_next)) { | ||
270 | if (addr_same(&tm->tcpm_addr, &addr)) | ||
271 | break; | ||
272 | } | ||
273 | return tm; | ||
274 | } | ||
275 | |||
276 | static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, | ||
277 | struct dst_entry *dst, | ||
278 | bool create) | ||
279 | { | ||
280 | struct tcp_metrics_block *tm; | ||
281 | struct inetpeer_addr addr; | ||
282 | unsigned int hash; | ||
283 | struct net *net; | ||
284 | bool reclaim; | ||
285 | |||
286 | addr.family = sk->sk_family; | ||
287 | switch (addr.family) { | ||
288 | case AF_INET: | ||
289 | addr.addr.a4 = inet_sk(sk)->inet_daddr; | ||
290 | hash = (__force unsigned int) addr.addr.a4; | ||
291 | break; | ||
292 | case AF_INET6: | ||
293 | *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; | ||
294 | hash = ((__force unsigned int) addr.addr.a6[0] ^ | ||
295 | (__force unsigned int) addr.addr.a6[1] ^ | ||
296 | (__force unsigned int) addr.addr.a6[2] ^ | ||
297 | (__force unsigned int) addr.addr.a6[3]); | ||
298 | break; | ||
299 | default: | ||
300 | return NULL; | ||
301 | } | ||
302 | |||
303 | hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); | ||
304 | |||
305 | net = dev_net(dst->dev); | ||
306 | hash &= net->ipv4.tcp_metrics_hash_mask; | ||
307 | |||
308 | tm = __tcp_get_metrics(&addr, net, hash); | ||
309 | reclaim = false; | ||
310 | if (tm == TCP_METRICS_RECLAIM_PTR) { | ||
311 | reclaim = true; | ||
312 | tm = NULL; | ||
313 | } | ||
314 | if (!tm && create) | ||
315 | tm = tcpm_new(dst, &addr, hash, reclaim); | ||
316 | else | ||
317 | tcpm_check_stamp(tm, dst); | ||
318 | |||
319 | return tm; | ||
320 | } | ||
321 | |||
322 | /* Save metrics learned by this TCP session. This function is called | ||
323 | * only when TCP finishes successfully, i.e. when it enters TIME-WAIT | ||
324 | * or goes from LAST-ACK to CLOSE. | ||
325 | */ | ||
326 | void tcp_update_metrics(struct sock *sk) | ||
327 | { | ||
328 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
329 | struct dst_entry *dst = __sk_dst_get(sk); | ||
330 | struct tcp_sock *tp = tcp_sk(sk); | ||
331 | struct tcp_metrics_block *tm; | ||
332 | unsigned long rtt; | ||
333 | u32 val; | ||
334 | int m; | ||
335 | |||
336 | if (sysctl_tcp_nometrics_save || !dst) | ||
337 | return; | ||
338 | |||
339 | if (dst->flags & DST_HOST) | ||
340 | dst_confirm(dst); | ||
341 | |||
342 | rcu_read_lock(); | ||
343 | if (icsk->icsk_backoff || !tp->srtt) { | ||
344 | /* This session failed to estimate rtt. Why? | ||
345 | * Probably, no packets returned in time. Reset our | ||
346 | * results. | ||
347 | */ | ||
348 | tm = tcp_get_metrics(sk, dst, false); | ||
349 | if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) | ||
350 | tcp_metric_set(tm, TCP_METRIC_RTT, 0); | ||
351 | goto out_unlock; | ||
352 | } else | ||
353 | tm = tcp_get_metrics(sk, dst, true); | ||
354 | |||
355 | if (!tm) | ||
356 | goto out_unlock; | ||
357 | |||
358 | rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); | ||
359 | m = rtt - tp->srtt; | ||
360 | |||
361 | /* If newly calculated rtt larger than stored one, store new | ||
362 | * one. Otherwise, use EWMA. Remember, rtt overestimation is | ||
363 | * always better than underestimation. | ||
364 | */ | ||
365 | if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { | ||
366 | if (m <= 0) | ||
367 | rtt = tp->srtt; | ||
368 | else | ||
369 | rtt -= (m >> 3); | ||
370 | tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); | ||
371 | } | ||
372 | |||
373 | if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { | ||
374 | unsigned long var; | ||
375 | |||
376 | if (m < 0) | ||
377 | m = -m; | ||
378 | |||
379 | /* Scale deviation to rttvar fixed point */ | ||
380 | m >>= 1; | ||
381 | if (m < tp->mdev) | ||
382 | m = tp->mdev; | ||
383 | |||
384 | var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
385 | if (m >= var) | ||
386 | var = m; | ||
387 | else | ||
388 | var -= (var - m) >> 2; | ||
389 | |||
390 | tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); | ||
391 | } | ||
392 | |||
393 | if (tcp_in_initial_slowstart(tp)) { | ||
394 | /* Slow start still did not finish. */ | ||
395 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { | ||
396 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
397 | if (val && (tp->snd_cwnd >> 1) > val) | ||
398 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
399 | tp->snd_cwnd >> 1); | ||
400 | } | ||
401 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
402 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
403 | if (tp->snd_cwnd > val) | ||
404 | tcp_metric_set(tm, TCP_METRIC_CWND, | ||
405 | tp->snd_cwnd); | ||
406 | } | ||
407 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | ||
408 | icsk->icsk_ca_state == TCP_CA_Open) { | ||
409 | /* Cong. avoidance phase, cwnd is reliable. */ | ||
410 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) | ||
411 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
412 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | ||
413 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
414 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
415 | tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); | ||
416 | } | ||
417 | } else { | ||
418 | /* Else slow start did not finish, cwnd is non-sense, | ||
419 | * ssthresh may be also invalid. | ||
420 | */ | ||
421 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
422 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
423 | tcp_metric_set(tm, TCP_METRIC_CWND, | ||
424 | (val + tp->snd_ssthresh) >> 1); | ||
425 | } | ||
426 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { | ||
427 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
428 | if (val && tp->snd_ssthresh > val) | ||
429 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
430 | tp->snd_ssthresh); | ||
431 | } | ||
432 | if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { | ||
433 | val = tcp_metric_get(tm, TCP_METRIC_REORDERING); | ||
434 | if (val < tp->reordering && | ||
435 | tp->reordering != sysctl_tcp_reordering) | ||
436 | tcp_metric_set(tm, TCP_METRIC_REORDERING, | ||
437 | tp->reordering); | ||
438 | } | ||
439 | } | ||
440 | tm->tcpm_stamp = jiffies; | ||
441 | out_unlock: | ||
442 | rcu_read_unlock(); | ||
443 | } | ||
444 | |||
445 | /* Initialize metrics on socket. */ | ||
446 | |||
447 | void tcp_init_metrics(struct sock *sk) | ||
448 | { | ||
449 | struct dst_entry *dst = __sk_dst_get(sk); | ||
450 | struct tcp_sock *tp = tcp_sk(sk); | ||
451 | struct tcp_metrics_block *tm; | ||
452 | u32 val; | ||
453 | |||
454 | if (dst == NULL) | ||
455 | goto reset; | ||
456 | |||
457 | dst_confirm(dst); | ||
458 | |||
459 | rcu_read_lock(); | ||
460 | tm = tcp_get_metrics(sk, dst, true); | ||
461 | if (!tm) { | ||
462 | rcu_read_unlock(); | ||
463 | goto reset; | ||
464 | } | ||
465 | |||
466 | if (tcp_metric_locked(tm, TCP_METRIC_CWND)) | ||
467 | tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
468 | |||
469 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
470 | if (val) { | ||
471 | tp->snd_ssthresh = val; | ||
472 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | ||
473 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | ||
474 | } else { | ||
475 | /* ssthresh may have been reduced unnecessarily during | ||
476 | * 3WHS. Restore it back to its initial default. | ||
477 | */ | ||
478 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
479 | } | ||
480 | val = tcp_metric_get(tm, TCP_METRIC_REORDERING); | ||
481 | if (val && tp->reordering != val) { | ||
482 | tcp_disable_fack(tp); | ||
483 | tcp_disable_early_retrans(tp); | ||
484 | tp->reordering = val; | ||
485 | } | ||
486 | |||
487 | val = tcp_metric_get(tm, TCP_METRIC_RTT); | ||
488 | if (val == 0 || tp->srtt == 0) { | ||
489 | rcu_read_unlock(); | ||
490 | goto reset; | ||
491 | } | ||
492 | /* Initial rtt is determined from SYN,SYN-ACK. | ||
493 | * The segment is small and rtt may appear much | ||
494 | * less than real one. Use per-dst memory | ||
495 | * to make it more realistic. | ||
496 | * | ||
497 | * A bit of theory. RTT is time passed after "normal" sized packet | ||
498 | * is sent until it is ACKed. In normal circumstances sending small | ||
499 | * packets force peer to delay ACKs and calculation is correct too. | ||
500 | * The algorithm is adaptive and, provided we follow specs, it | ||
501 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | ||
502 | * tricks sort of "quick acks" for time long enough to decrease RTT | ||
503 | * to low value, and then abruptly stops to do it and starts to delay | ||
504 | * ACKs, wait for troubles. | ||
505 | */ | ||
506 | val = msecs_to_jiffies(val); | ||
507 | if (val > tp->srtt) { | ||
508 | tp->srtt = val; | ||
509 | tp->rtt_seq = tp->snd_nxt; | ||
510 | } | ||
511 | val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
512 | if (val > tp->mdev) { | ||
513 | tp->mdev = val; | ||
514 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
515 | } | ||
516 | rcu_read_unlock(); | ||
517 | |||
518 | tcp_set_rto(sk); | ||
519 | reset: | ||
520 | if (tp->srtt == 0) { | ||
521 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | ||
522 | * 3WHS. This is most likely due to retransmission, | ||
523 | * including spurious one. Reset the RTO back to 3secs | ||
524 | * from the more aggressive 1sec to avoid more spurious | ||
525 | * retransmission. | ||
526 | */ | ||
527 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | ||
528 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | ||
529 | } | ||
530 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | ||
531 | * retransmitted. In light of RFC6298 more aggressive 1sec | ||
532 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | ||
533 | * retransmission has occurred. | ||
534 | */ | ||
535 | if (tp->total_retrans > 1) | ||
536 | tp->snd_cwnd = 1; | ||
537 | else | ||
538 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
539 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
540 | } | ||
541 | |||
542 | bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) | ||
543 | { | ||
544 | struct tcp_metrics_block *tm; | ||
545 | bool ret; | ||
546 | |||
547 | if (!dst) | ||
548 | return false; | ||
549 | |||
550 | rcu_read_lock(); | ||
551 | tm = __tcp_get_metrics_req(req, dst); | ||
552 | if (paws_check) { | ||
553 | if (tm && | ||
554 | (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && | ||
555 | (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) | ||
556 | ret = false; | ||
557 | else | ||
558 | ret = true; | ||
559 | } else { | ||
560 | if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) | ||
561 | ret = true; | ||
562 | else | ||
563 | ret = false; | ||
564 | } | ||
565 | rcu_read_unlock(); | ||
566 | |||
567 | return ret; | ||
568 | } | ||
569 | EXPORT_SYMBOL_GPL(tcp_peer_is_proven); | ||
570 | |||
571 | void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) | ||
572 | { | ||
573 | struct tcp_metrics_block *tm; | ||
574 | |||
575 | rcu_read_lock(); | ||
576 | tm = tcp_get_metrics(sk, dst, true); | ||
577 | if (tm) { | ||
578 | struct tcp_sock *tp = tcp_sk(sk); | ||
579 | |||
580 | if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { | ||
581 | tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; | ||
582 | tp->rx_opt.ts_recent = tm->tcpm_ts; | ||
583 | } | ||
584 | } | ||
585 | rcu_read_unlock(); | ||
586 | } | ||
587 | EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); | ||
588 | |||
589 | /* VJ's idea. Save last timestamp seen from this destination and hold | ||
590 | * it at least for normal timewait interval to use for duplicate | ||
591 | * segment detection in subsequent connections, before they enter | ||
592 | * synchronized state. | ||
593 | */ | ||
594 | bool tcp_remember_stamp(struct sock *sk) | ||
595 | { | ||
596 | struct dst_entry *dst = __sk_dst_get(sk); | ||
597 | bool ret = false; | ||
598 | |||
599 | if (dst) { | ||
600 | struct tcp_metrics_block *tm; | ||
601 | |||
602 | rcu_read_lock(); | ||
603 | tm = tcp_get_metrics(sk, dst, true); | ||
604 | if (tm) { | ||
605 | struct tcp_sock *tp = tcp_sk(sk); | ||
606 | |||
607 | if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || | ||
608 | ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && | ||
609 | tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
610 | tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
611 | tm->tcpm_ts = tp->rx_opt.ts_recent; | ||
612 | } | ||
613 | ret = true; | ||
614 | } | ||
615 | rcu_read_unlock(); | ||
616 | } | ||
617 | return ret; | ||
618 | } | ||
619 | |||
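The update rule in tcp_remember_stamp() (and its timewait twin below) overwrites the cached per-destination timestamp when the connection saw an equal-or-newer value, or when the cached entry has aged past the PAWS window and is no fresher than the connection's. A compilable restatement; should_update() is a hypothetical name and the constant is illustrative:

#include <stdint.h>

#define PAWS_MSL 60	/* seconds; stand-in for TCP_PAWS_MSL */

static int should_update(uint32_t cached_ts, uint32_t cached_stamp,
			 uint32_t seen_ts, uint32_t seen_stamp, uint32_t now)
{
	return (int32_t)(cached_ts - seen_ts) <= 0 ||	/* not newer than ours */
	       (now - cached_stamp > PAWS_MSL &&	/* or stale... */
		cached_stamp <= seen_stamp);		/* ...and not fresher */
}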
620 | bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) | ||
621 | { | ||
622 | struct tcp_metrics_block *tm; | ||
623 | bool ret = false; | ||
624 | |||
625 | rcu_read_lock(); | ||
626 | tm = __tcp_get_metrics_tw(tw); | ||
627 | if (tm) { | ||
628 | const struct tcp_timewait_sock *tcptw; | ||
629 | struct sock *sk = (struct sock *) tw; | ||
630 | |||
631 | tcptw = tcp_twsk(sk); | ||
632 | if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || | ||
633 | ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && | ||
634 | tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
635 | tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
636 | tm->tcpm_ts = tcptw->tw_ts_recent; | ||
637 | } | ||
638 | ret = true; | ||
639 | } | ||
640 | rcu_read_unlock(); | ||
641 | |||
642 | return ret; | ||
643 | } | ||
644 | |||
645 | static unsigned long tcpmhash_entries; | ||
646 | static int __init set_tcpmhash_entries(char *str) | ||
647 | { | ||
648 | int ret; | ||
649 | |||
650 | if (!str) | ||
651 | return 0; | ||
652 | |||
653 | ret = kstrtoul(str, 0, &tcpmhash_entries); | ||
654 | if (ret) | ||
655 | return 0; | ||
656 | |||
657 | return 1; | ||
658 | } | ||
659 | __setup("tcpmhash_entries=", set_tcpmhash_entries); | ||
660 | |||
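The __setup() hook lets the hash size be overridden from the kernel command line (e.g. tcpmhash_entries=16384); the handler returns 1 on success and 0 to keep the default. A userspace stand-in for the same parse, with strtoul replacing kstrtoul (base 0 accepts decimal, hex and octal) and a hypothetical name:

#include <stdlib.h>

static unsigned long tcpmhash_entries_demo;

/* Returns 1 on success, 0 to keep the default -- mirroring the
 * __setup() handler convention above. */
static int parse_tcpmhash(const char *str)
{
	char *end;
	unsigned long v;

	if (!str)
		return 0;
	v = strtoul(str, &end, 0);	/* base 0: decimal, hex or octal */
	if (end == str || *end != '\0')
		return 0;		/* reject garbage, keep the default */
	tcpmhash_entries_demo = v;
	return 1;
}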
661 | static int __net_init tcp_net_metrics_init(struct net *net) | ||
662 | { | ||
663 | int slots, size; | ||
664 | |||
665 | slots = tcpmhash_entries; | ||
666 | if (!slots) { | ||
667 | if (totalram_pages >= 128 * 1024) | ||
668 | slots = 16 * 1024; | ||
669 | else | ||
670 | slots = 8 * 1024; | ||
671 | } | ||
672 | |||
673 | size = slots * sizeof(struct tcpm_hash_bucket); | ||
674 | |||
675 | net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL); | ||
676 | if (!net->ipv4.tcp_metrics_hash) | ||
677 | return -ENOMEM; | ||
678 | |||
679 | net->ipv4.tcp_metrics_hash_mask = (slots - 1); | ||
680 | |||
681 | return 0; | ||
682 | } | ||
683 | |||
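tcp_net_metrics_init() derives a mask from the slot count, and hash & (slots - 1) only behaves like a modulo when slots is a power of two. The built-in defaults (8k/16k) satisfy that, but a boot-time tcpmhash_entries override need not. A sketch of one defensive option, rounding up first; this is an assumption for illustration, not something the hunk above does:

#include <stdint.h>

/* Round n up to the next power of two (sketch; ignores overflow). */
static unsigned long round_up_pow2(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static unsigned int bucket_of(uint32_t hash, unsigned long slots)
{
	slots = round_up_pow2(slots);	/* guarantee the mask invariant */
	return hash & (slots - 1);	/* now equivalent to hash % slots */
}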
684 | static void __net_exit tcp_net_metrics_exit(struct net *net) | ||
685 | { | ||
686 | kfree(net->ipv4.tcp_metrics_hash); | ||
687 | } | ||
688 | |||
689 | static __net_initdata struct pernet_operations tcp_net_metrics_ops = { | ||
690 | .init = tcp_net_metrics_init, | ||
691 | .exit = tcp_net_metrics_exit, | ||
692 | }; | ||
693 | |||
694 | void __init tcp_metrics_init(void) | ||
695 | { | ||
696 | register_pernet_subsys(&tcp_net_metrics_ops); | ||
697 | } | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b85d9fe7d663..65608863fdee 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -49,56 +49,6 @@ struct inet_timewait_death_row tcp_death_row = { | |||
49 | }; | 49 | }; |
50 | EXPORT_SYMBOL_GPL(tcp_death_row); | 50 | EXPORT_SYMBOL_GPL(tcp_death_row); |
51 | 51 | ||
52 | /* VJ's idea. Save last timestamp seen from this destination | ||
53 | * and hold it at least for normal timewait interval to use for duplicate | ||
54 | * segment detection in subsequent connections, before they enter synchronized | ||
55 | * state. | ||
56 | */ | ||
57 | |||
58 | static bool tcp_remember_stamp(struct sock *sk) | ||
59 | { | ||
60 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
61 | struct tcp_sock *tp = tcp_sk(sk); | ||
62 | struct inet_peer *peer; | ||
63 | bool release_it; | ||
64 | |||
65 | peer = icsk->icsk_af_ops->get_peer(sk, &release_it); | ||
66 | if (peer) { | ||
67 | if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || | ||
68 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
69 | peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
70 | peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
71 | peer->tcp_ts = tp->rx_opt.ts_recent; | ||
72 | } | ||
73 | if (release_it) | ||
74 | inet_putpeer(peer); | ||
75 | return true; | ||
76 | } | ||
77 | |||
78 | return false; | ||
79 | } | ||
80 | |||
81 | static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) | ||
82 | { | ||
83 | struct sock *sk = (struct sock *) tw; | ||
84 | struct inet_peer *peer; | ||
85 | |||
86 | peer = twsk_getpeer(sk); | ||
87 | if (peer) { | ||
88 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); | ||
89 | |||
90 | if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || | ||
91 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
92 | peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
93 | peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
94 | peer->tcp_ts = tcptw->tw_ts_recent; | ||
95 | } | ||
96 | inet_putpeer(peer); | ||
97 | return true; | ||
98 | } | ||
99 | return false; | ||
100 | } | ||
101 | |||
102 | static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 52 | static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) |
103 | { | 53 | { |
104 | if (seq == s_win) | 54 | if (seq == s_win) |
@@ -327,8 +277,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
327 | if (tw != NULL) { | 277 | if (tw != NULL) { |
328 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 278 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
329 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); | 279 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
280 | struct inet_sock *inet = inet_sk(sk); | ||
330 | 281 | ||
331 | tw->tw_transparent = inet_sk(sk)->transparent; | 282 | tw->tw_transparent = inet->transparent; |
332 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | 283 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
333 | tcptw->tw_rcv_nxt = tp->rcv_nxt; | 284 | tcptw->tw_rcv_nxt = tp->rcv_nxt; |
334 | tcptw->tw_snd_nxt = tp->snd_nxt; | 285 | tcptw->tw_snd_nxt = tp->snd_nxt; |
@@ -403,6 +354,7 @@ void tcp_twsk_destructor(struct sock *sk) | |||
403 | { | 354 | { |
404 | #ifdef CONFIG_TCP_MD5SIG | 355 | #ifdef CONFIG_TCP_MD5SIG |
405 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); | 356 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); |
357 | |||
406 | if (twsk->tw_md5_key) { | 358 | if (twsk->tw_md5_key) { |
407 | tcp_free_md5sig_pool(); | 359 | tcp_free_md5sig_pool(); |
408 | kfree_rcu(twsk->tw_md5_key, rcu); | 360 | kfree_rcu(twsk->tw_md5_key, rcu); |
@@ -435,6 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
435 | struct tcp_sock *oldtp = tcp_sk(sk); | 387 | struct tcp_sock *oldtp = tcp_sk(sk); |
436 | struct tcp_cookie_values *oldcvp = oldtp->cookie_values; | 388 | struct tcp_cookie_values *oldcvp = oldtp->cookie_values; |
437 | 389 | ||
390 | newsk->sk_rx_dst = dst_clone(skb_dst(skb)); | ||
391 | |||
438 | /* TCP Cookie Transactions require space for the cookie pair, | 392 | /* TCP Cookie Transactions require space for the cookie pair, |
439 | * as it differs for each connection. There is no need to | 393 | * as it differs for each connection. There is no need to |
440 | * copy any s_data_payload stored at the original socket. | 394 | * copy any s_data_payload stored at the original socket. |
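The new line in tcp_create_openreq_child() caches the inbound route on the child socket via dst_clone(skb_dst(skb)), i.e. it takes its own reference, to be dropped with a matching dst_release() when the cached pointer goes away. The take/put discipline, modeled with a plain refcount so it compiles outside the kernel; the _demo names and types are stand-ins, not the dst API:

#include <stdlib.h>

struct dst_entry { int refcnt; };

/* Caching a route pointer means taking a reference... */
static struct dst_entry *dst_clone_demo(struct dst_entry *d)
{
	if (d)
		d->refcnt++;
	return d;
}

/* ...and every cached pointer is paired with exactly one release. */
static void dst_release_demo(struct dst_entry *d)
{
	if (d && --d->refcnt == 0)
		free(d);	/* last reference frees the route */
}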
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 803cbfe82fbc..c465d3e51e28 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -2442,7 +2442,16 @@ int tcp_send_synack(struct sock *sk) | |||
2442 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2442 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2443 | } | 2443 | } |
2444 | 2444 | ||
2445 | /* Prepare a SYN-ACK. */ | 2445 | /** |
2446 | * tcp_make_synack - Prepare a SYN-ACK. | ||
2447 | * @sk: listener socket | ||
2448 | * @dst: dst entry attached to the SYNACK | ||
2449 | * @req: request_sock pointer | ||
2450 | * @rvp: request_values pointer | ||
2451 | * | ||
2452 | * Allocate one skb and build a SYNACK packet. | ||
2453 | * @dst is consumed: caller should not use it again. | ||
2454 | */ | ||
2446 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | 2455 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
2447 | struct request_sock *req, | 2456 | struct request_sock *req, |
2448 | struct request_values *rvp) | 2457 | struct request_values *rvp) |
@@ -2461,14 +2470,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2461 | 2470 | ||
2462 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) | 2471 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) |
2463 | s_data_desired = cvp->s_data_desired; | 2472 | s_data_desired = cvp->s_data_desired; |
2464 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); | 2473 | skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); |
2465 | if (skb == NULL) | 2474 | if (unlikely(!skb)) { |
2475 | dst_release(dst); | ||
2466 | return NULL; | 2476 | return NULL; |
2467 | 2477 | } | |
2468 | /* Reserve space for headers. */ | 2478 | /* Reserve space for headers. */ |
2469 | skb_reserve(skb, MAX_TCP_HEADER); | 2479 | skb_reserve(skb, MAX_TCP_HEADER); |
2470 | 2480 | ||
2471 | skb_dst_set(skb, dst_clone(dst)); | 2481 | skb_dst_set(skb, dst); |
2472 | 2482 | ||
2473 | mss = dst_metric_advmss(dst); | 2483 | mss = dst_metric_advmss(dst); |
2474 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) | 2484 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) |
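The reworked tcp_make_synack() owns @dst on every path: on allocation failure it releases it, and on success it transfers it into the skb via skb_dst_set() without cloning, so the caller must not touch dst afterwards. A minimal model of those consume semantics in plain C; all names here are stand-ins, not the kernel API:

#include <stdlib.h>

struct dst_entry { int refcnt; };
struct sk_buff  { struct dst_entry *dst; };

static void dst_release_demo(struct dst_entry *d)
{
	if (d && --d->refcnt == 0)
		free(d);
}

/* Consumes @dst on every path: stored in the skb on success,
 * released on failure. The caller must not use @dst afterwards. */
static struct sk_buff *make_synack_demo(struct dst_entry *dst)
{
	struct sk_buff *skb = malloc(sizeof(*skb));

	if (!skb) {
		dst_release_demo(dst);	/* failure path still honours ownership */
		return NULL;
	}
	skb->dst = dst;			/* transfer: no extra reference taken */
	return skb;
}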
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eaca73644e79..ee37d47d472e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -108,6 +108,7 @@ | |||
108 | #include <net/xfrm.h> | 108 | #include <net/xfrm.h> |
109 | #include <trace/events/udp.h> | 109 | #include <trace/events/udp.h> |
110 | #include <linux/static_key.h> | 110 | #include <linux/static_key.h> |
111 | #include <trace/events/skb.h> | ||
111 | #include "udp_impl.h" | 112 | #include "udp_impl.h" |
112 | 113 | ||
113 | struct udp_table udp_table __read_mostly; | 114 | struct udp_table udp_table __read_mostly; |
@@ -615,6 +616,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) | |||
615 | break; | 616 | break; |
616 | case ICMP_DEST_UNREACH: | 617 | case ICMP_DEST_UNREACH: |
617 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ | 618 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ |
619 | ipv4_sk_update_pmtu(skb, sk, info); | ||
618 | if (inet->pmtudisc != IP_PMTUDISC_DONT) { | 620 | if (inet->pmtudisc != IP_PMTUDISC_DONT) { |
619 | err = EMSGSIZE; | 621 | err = EMSGSIZE; |
620 | harderr = 1; | 622 | harderr = 1; |
@@ -1219,8 +1221,10 @@ try_again: | |||
1219 | goto csum_copy_err; | 1221 | goto csum_copy_err; |
1220 | } | 1222 | } |
1221 | 1223 | ||
1222 | if (err) | 1224 | if (unlikely(err)) { |
1225 | trace_kfree_skb(skb, udp_recvmsg); | ||
1223 | goto out_free; | 1226 | goto out_free; |
1227 | } | ||
1224 | 1228 | ||
1225 | if (!peeked) | 1229 | if (!peeked) |
1226 | UDP_INC_STATS_USER(sock_net(sk), | 1230 | UDP_INC_STATS_USER(sock_net(sk), |
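The udp.c hunks propagate ICMP frag-needed notifications into the socket's path-MTU state and tag receive-path drops with their call site so tracing can attribute them. A sketch of the drop-site idea; the real tracepoint records a code address, whereas this stand-in passes __func__ as a string:

#include <stdio.h>

struct sk_buff { int len; };

/* Stand-in for the tracepoint: record *where* the drop happened. */
static void trace_kfree_skb_demo(struct sk_buff *skb, const char *location)
{
	printf("drop: skb len=%d at %s\n", skb->len, location);
}

static void recvmsg_demo(struct sk_buff *skb, int copy_err)
{
	if (copy_err)
		trace_kfree_skb_demo(skb, __func__);	/* attribute the drop */
}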
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 0d3426cb5c4f..87d3fcc302d4 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
@@ -90,10 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, | |||
90 | xdst->u.dst.dev = dev; | 90 | xdst->u.dst.dev = dev; |
91 | dev_hold(dev); | 91 | dev_hold(dev); |
92 | 92 | ||
93 | xdst->u.rt.peer = rt->peer; | ||
94 | if (rt->peer) | ||
95 | atomic_inc(&rt->peer->refcnt); | ||
96 | |||
97 | /* Sheit... I remember I did this right. Apparently, | 93 | /* Sheit... I remember I did this right. Apparently, |
98 | * it was magically lost, so this code needs audit */ | 94 | * it was magically lost, so this code needs audit */ |
99 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | | 95 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | |
@@ -102,7 +98,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, | |||
102 | xdst->u.rt.rt_src = rt->rt_src; | 98 | xdst->u.rt.rt_src = rt->rt_src; |
103 | xdst->u.rt.rt_dst = rt->rt_dst; | 99 | xdst->u.rt.rt_dst = rt->rt_dst; |
104 | xdst->u.rt.rt_gateway = rt->rt_gateway; | 100 | xdst->u.rt.rt_gateway = rt->rt_gateway; |
105 | xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; | 101 | xdst->u.rt.rt_pmtu = rt->rt_pmtu; |
106 | 102 | ||
107 | return 0; | 103 | return 0; |
108 | } | 104 | } |
@@ -212,9 +208,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) | |||
212 | 208 | ||
213 | dst_destroy_metrics_generic(dst); | 209 | dst_destroy_metrics_generic(dst); |
214 | 210 | ||
215 | if (likely(xdst->u.rt.peer)) | ||
216 | inet_putpeer(xdst->u.rt.peer); | ||
217 | |||
218 | xfrm_dst_destroy(xdst); | 211 | xfrm_dst_destroy(xdst); |
219 | } | 212 | } |
220 | 213 | ||