Diffstat (limited to 'net/ipv4')
 net/ipv4/Makefile                               |    2
 net/ipv4/af_inet.c                              |   46
 net/ipv4/ah4.c                                  |    1
 net/ipv4/arp.c                                  |    3
 net/ipv4/devinet.c                              |    5
 net/ipv4/esp4.c                                 |    1
 net/ipv4/fib_frontend.c                         |  117
 net/ipv4/fib_rules.c                            |   24
 net/ipv4/fib_semantics.c                        |   19
 net/ipv4/fib_trie.c                             |   16
 net/ipv4/icmp.c                                 |   25
 net/ipv4/inet_connection_sock.c                 |    8
 net/ipv4/inet_diag.c                            |  125
 net/ipv4/inet_fragment.c                        |    2
 net/ipv4/inetpeer.c                             |   99
 net/ipv4/ip_fragment.c                          |    6
 net/ipv4/ip_gre.c                               |   14
 net/ipv4/ip_input.c                             |   28
 net/ipv4/ip_options.c                           |   26
 net/ipv4/ip_output.c                            |   44
 net/ipv4/ip_sockglue.c                          |    7
 net/ipv4/ipcomp.c                               |    1
 net/ipv4/ipip.c                                 |   15
 net/ipv4/ipmr.c                                 |   32
 net/ipv4/netfilter/ipt_ULOG.c                   |   23
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c  |  172
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c    |   81
 net/ipv4/netfilter/nf_defrag_ipv4.c             |    4
 net/ipv4/netfilter/nf_nat_amanda.c              |    4
 net/ipv4/netfilter/nf_nat_core.c                |    6
 net/ipv4/netfilter/nf_nat_h323.c                |    8
 net/ipv4/netfilter/nf_nat_helper.c              |   13
 net/ipv4/netfilter/nf_nat_pptp.c                |    6
 net/ipv4/netfilter/nf_nat_snmp_basic.c          |    4
 net/ipv4/netfilter/nf_nat_tftp.c                |    4
 net/ipv4/ping.c                                 |    1
 net/ipv4/protocol.c                             |    8
 net/ipv4/raw.c                                  |    3
 net/ipv4/route.c                                |  677
 net/ipv4/sysctl_net_ipv4.c                      |    7
 net/ipv4/tcp.c                                  |    5
 net/ipv4/tcp_input.c                            |  219
 net/ipv4/tcp_ipv4.c                             |  127
 net/ipv4/tcp_metrics.c                          |  697
 net/ipv4/tcp_minisocks.c                        |   56
 net/ipv4/tcp_output.c                           |   20
 net/ipv4/udp.c                                  |    6
 net/ipv4/xfrm4_policy.c                         |    9
 48 files changed, 1667 insertions(+), 1159 deletions(-)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..5a23e8b37106 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
         ip_output.o ip_sockglue.o inet_hashtables.o \
         inet_timewait_sock.o inet_connection_sock.o \
         tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-        tcp_minisocks.o tcp_cong.o \
+        tcp_minisocks.o tcp_cong.o tcp_metrics.o \
         datagram.o raw.o udp.o udplite.o \
         arp.o icmp.o devinet.o af_inet.o igmp.o \
         fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c8f7aee587d1..07a02f6e9696 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
         kfree(rcu_dereference_protected(inet->inet_opt, 1));
         dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+        dst_release(sk->sk_rx_dst);
         sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -242,20 +243,18 @@ void build_ehash_secret(void)
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
-static inline int inet_netns_ok(struct net *net, int protocol)
+static inline int inet_netns_ok(struct net *net, __u8 protocol)
 {
-        int hash;
         const struct net_protocol *ipprot;
 
         if (net_eq(net, &init_net))
                 return 1;
 
-        hash = protocol & (MAX_INET_PROTOS - 1);
-        ipprot = rcu_dereference(inet_protos[hash]);
-
-        if (ipprot == NULL)
+        ipprot = rcu_dereference(inet_protos[protocol]);
+        if (ipprot == NULL) {
                 /* raw IP is OK */
                 return 1;
+        }
         return ipprot->netns_ok;
 }
 
@@ -553,7 +552,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 
         if (!inet_sk(sk)->inet_num && inet_autobind(sk))
                 return -EAGAIN;
-        return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+        return sk->sk_prot->connect(sk, uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_dgram_connect);
 
@@ -1216,8 +1215,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 
 static int inet_gso_send_check(struct sk_buff *skb)
 {
-        const struct iphdr *iph;
         const struct net_protocol *ops;
+        const struct iphdr *iph;
         int proto;
         int ihl;
         int err = -EINVAL;
@@ -1236,7 +1235,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
         __skb_pull(skb, ihl);
         skb_reset_transport_header(skb);
         iph = ip_hdr(skb);
-        proto = iph->protocol & (MAX_INET_PROTOS - 1);
+        proto = iph->protocol;
         err = -EPROTONOSUPPORT;
 
         rcu_read_lock();
@@ -1253,8 +1252,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                                         netdev_features_t features)
 {
         struct sk_buff *segs = ERR_PTR(-EINVAL);
-        struct iphdr *iph;
         const struct net_protocol *ops;
+        struct iphdr *iph;
         int proto;
         int ihl;
         int id;
@@ -1286,7 +1285,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
         skb_reset_transport_header(skb);
         iph = ip_hdr(skb);
         id = ntohs(iph->id);
-        proto = iph->protocol & (MAX_INET_PROTOS - 1);
+        proto = iph->protocol;
         segs = ERR_PTR(-EPROTONOSUPPORT);
 
         rcu_read_lock();
@@ -1340,7 +1339,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
                 goto out;
         }
 
-        proto = iph->protocol & (MAX_INET_PROTOS - 1);
+        proto = iph->protocol;
 
         rcu_read_lock();
         ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1397,11 @@ out:
 
 static int inet_gro_complete(struct sk_buff *skb)
 {
-        const struct net_protocol *ops;
+        __be16 newlen = htons(skb->len - skb_network_offset(skb));
         struct iphdr *iph = ip_hdr(skb);
-        int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+        const struct net_protocol *ops;
+        int proto = iph->protocol;
         int err = -ENOSYS;
-        __be16 newlen = htons(skb->len - skb_network_offset(skb));
 
         csum_replace2(&iph->check, iph->tot_len, newlen);
         iph->tot_len = newlen;
@@ -1520,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-        .handler        = tcp_v4_rcv,
-        .err_handler    = tcp_v4_err,
-        .gso_send_check = tcp_v4_gso_send_check,
-        .gso_segment    = tcp_tso_segment,
-        .gro_receive    = tcp4_gro_receive,
-        .gro_complete   = tcp4_gro_complete,
-        .no_policy      = 1,
-        .netns_ok       = 1,
+        .early_demux    = tcp_v4_early_demux,
+        .handler        = tcp_v4_rcv,
+        .err_handler    = tcp_v4_err,
+        .gso_send_check = tcp_v4_gso_send_check,
+        .gso_segment    = tcp_tso_segment,
+        .gro_receive    = tcp4_gro_receive,
+        .gro_complete   = tcp4_gro_complete,
+        .no_policy      = 1,
+        .netns_ok       = 1,
 };
 
 static const struct net_protocol udp_protocol = {
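[Illustration, not part of the patch.] The af_inet.c hunks above drop every "proto & (MAX_INET_PROTOS - 1)" mask. This works because MAX_INET_PROTOS now covers the full 8-bit protocol space (256 slots), so a u8 protocol number can index inet_protos[] directly. A minimal standalone sketch of that idea; the names below are local to the sketch, not kernel symbols:

#include <stdio.h>

#define MAX_INET_PROTOS 256            /* one slot per possible u8 protocol */

struct proto_handler {
        const char *name;
};

static const struct proto_handler *protos[MAX_INET_PROTOS];

static const struct proto_handler *proto_lookup(unsigned char protocol)
{
        return protos[protocol];       /* no masking: a u8 cannot exceed 255 */
}

int main(void)
{
        static const struct proto_handler tcp = { "tcp" };

        protos[6] = &tcp;              /* IPPROTO_TCP == 6 */
        printf("%s\n", proto_lookup(6)->name);
        return 0;
}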
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e8f2617ecd47..916d5ecaf6c6 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -408,6 +408,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
                 return;
         pr_debug("pmtu discovery on SA AH/%08x/%08x\n",
                  ntohl(ah->spi), ntohl(iph->daddr));
+        ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
         xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index cda37be02f8d..2e560f0c757d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -790,7 +790,8 @@ static int arp_process(struct sk_buff *skb)
          * Check for bad requests for 127.x.x.x and requests for multicast
          * addresses. If this is one such, delete it.
          */
-        if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+        if (ipv4_is_multicast(tip) ||
+            (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
                 goto out;
 
 /*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 10e15a144e95..44bf82e3aef7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1500,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 
                 if (cnf == net->ipv4.devconf_dflt)
                         devinet_copy_dflt_conf(net, i);
-                if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1)
+                if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+                    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
                         if ((new_value == 0) && (old_value != 0))
                                 rt_cache_flush(net, 0);
         }
@@ -1617,6 +1618,8 @@ static struct devinet_sysctl_table {
1617 "force_igmp_version"), 1618 "force_igmp_version"),
1618 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, 1619 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
1619 "promote_secondaries"), 1620 "promote_secondaries"),
1621 DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
1622 "route_localnet"),
1620 }, 1623 },
1621}; 1624};
1622 1625
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index cb982a61536f..7b95b49a36ce 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -494,6 +494,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
                 return;
         NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
                  ntohl(esph->spi), ntohl(iph->daddr));
+        ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
         xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411fa37c..81f85716a894 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -31,6 +31,7 @@
 #include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
+#include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
         tb = fib_trie_table(id);
         if (!tb)
                 return NULL;
+
+        switch (id) {
+        case RT_TABLE_LOCAL:
+                net->ipv4.fib_local = tb;
+                break;
+
+        case RT_TABLE_MAIN:
+                net->ipv4.fib_main = tb;
+                break;
+
+        case RT_TABLE_DEFAULT:
+                net->ipv4.fib_default = tb;
+                break;
+
+        default:
+                break;
+        }
+
         h = id & (FIB_TABLE_HASHSZ - 1);
         hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
         return tb;
@@ -180,6 +199,43 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+        struct net_device *dev = skb->dev;
+        struct in_device *in_dev;
+        struct fib_result res;
+        struct rtable *rt;
+        struct flowi4 fl4;
+        struct net *net;
+        int scope;
+
+        rt = skb_rtable(skb);
+        if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
+                return ip_hdr(skb)->daddr;
+
+        in_dev = __in_dev_get_rcu(dev);
+        BUG_ON(!in_dev);
+
+        net = dev_net(dev);
+
+        scope = RT_SCOPE_UNIVERSE;
+        if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+                fl4.flowi4_oif = 0;
+                fl4.flowi4_iif = net->loopback_dev->ifindex;
+                fl4.daddr = ip_hdr(skb)->saddr;
+                fl4.saddr = 0;
+                fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+                fl4.flowi4_scope = scope;
+                fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+                if (!fib_lookup(net, &fl4, &res))
+                        return FIB_RES_PREFSRC(net, res);
+        } else {
+                scope = RT_SCOPE_LINK;
+        }
+
+        return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
@@ -188,17 +244,15 @@ EXPORT_SYMBOL(inet_dev_addr_type);
  * - check, that packet arrived from expected physical interface.
  * called with rcu_read_lock()
  */
-int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
-                        int oif, struct net_device *dev, __be32 *spec_dst,
-                        u32 *itag)
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+                                 u8 tos, int oif, struct net_device *dev,
+                                 int rpf, struct in_device *idev, u32 *itag)
 {
-        struct in_device *in_dev;
-        struct flowi4 fl4;
+        int ret, no_addr, accept_local;
         struct fib_result res;
-        int no_addr, rpf, accept_local;
-        bool dev_match;
-        int ret;
+        struct flowi4 fl4;
         struct net *net;
+        bool dev_match;
 
         fl4.flowi4_oif = 0;
         fl4.flowi4_iif = oif;
@@ -207,20 +261,11 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
         fl4.flowi4_tos = tos;
         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 
-        no_addr = rpf = accept_local = 0;
-        in_dev = __in_dev_get_rcu(dev);
-        if (in_dev) {
-                no_addr = in_dev->ifa_list == NULL;
-
-                /* Ignore rp_filter for packets protected by IPsec. */
-                rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
+        no_addr = accept_local = 0;
+        no_addr = idev->ifa_list == NULL;
 
-                accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
-                fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
-        }
-
-        if (in_dev == NULL)
-                goto e_inval;
+        accept_local = IN_DEV_ACCEPT_LOCAL(idev);
+        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 
         net = dev_net(dev);
         if (fib_lookup(net, &fl4, &res))
@@ -229,7 +274,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
                 if (res.type != RTN_LOCAL || !accept_local)
                         goto e_inval;
         }
-        *spec_dst = FIB_RES_PREFSRC(net, res);
         fib_combine_itag(itag, &res);
         dev_match = false;
 
@@ -258,17 +302,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
 
         ret = 0;
         if (fib_lookup(net, &fl4, &res) == 0) {
-                if (res.type == RTN_UNICAST) {
-                        *spec_dst = FIB_RES_PREFSRC(net, res);
+                if (res.type == RTN_UNICAST)
                         ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
-                }
         }
         return ret;
 
 last_resort:
         if (rpf)
                 goto e_rpf;
-        *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
         *itag = 0;
         return 0;
 
@@ -278,6 +319,20 @@ e_rpf:
         return -EXDEV;
 }
 
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+                        u8 tos, int oif, struct net_device *dev,
+                        struct in_device *idev, u32 *itag)
+{
+        int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+        if (!r && !fib_num_tclassid_users(dev_net(dev))) {
+                *itag = 0;
+                return 0;
+        }
+        return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+}
+
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
 {
         return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -935,8 +990,11 @@ static void nl_fib_input(struct sk_buff *skb)
 static int __net_init nl_fib_lookup_init(struct net *net)
 {
         struct sock *sk;
-        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
-                                   nl_fib_input, NULL, THIS_MODULE);
+        struct netlink_kernel_cfg cfg = {
+                .input  = nl_fib_input,
+        };
+
+        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
         if (sk == NULL)
                 return -EAFNOSUPPORT;
         net->ipv4.fibnl = sk;
@@ -1090,6 +1148,9 @@ static int __net_init fib_net_init(struct net *net)
 {
         int error;
 
+#ifdef CONFIG_IP_ROUTE_CLASSID
+        net->ipv4.fib_num_tclassid_users = 0;
+#endif
         error = ip_fib_net_init(net);
         if (error < 0)
                 goto out;
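[Illustration, not part of the patch.] The fib_num_tclassid_users counter initialized above pairs with the new fib_validate_source() wrapper: when reverse-path filtering is off and no rule or nexthop uses a tclassid, the expensive __fib_validate_source() lookup is skipped entirely. A rough standalone sketch of that fast path, with local names rather than kernel symbols:

#include <stdio.h>

struct netns {
        int fib_num_tclassid_users;    /* per-namespace count of tclassid users */
};

static int full_validate(unsigned int *itag)
{
        *itag = 7;                     /* stands in for the full FIB lookup */
        return 0;
}

static int validate_source(const struct netns *net, int rpf, unsigned int *itag)
{
        if (!rpf && net->fib_num_tclassid_users == 0) {
                *itag = 0;             /* nothing to compute: take the fast path */
                return 0;
        }
        return full_validate(itag);
}

int main(void)
{
        struct netns net = { 0 };
        unsigned int itag = 99;

        validate_source(&net, 0, &itag);
        printf("itag=%u\n", itag);     /* 0: fast path taken */
        return 0;
}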
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2d043f71ef70..c06da93b0b70 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -54,7 +54,7 @@ u32 fib_rules_tclass(const struct fib_result *res)
 }
 #endif
 
-int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 {
         struct fib_lookup_arg arg = {
                 .result = res,
@@ -67,7 +67,7 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 
         return err;
 }
-EXPORT_SYMBOL_GPL(fib_lookup);
+EXPORT_SYMBOL_GPL(__fib_lookup);
 
 static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
                             int flags, struct fib_lookup_arg *arg)
@@ -169,8 +169,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                 rule4->dst = nla_get_be32(tb[FRA_DST]);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-        if (tb[FRA_FLOW])
+        if (tb[FRA_FLOW]) {
                 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+                if (rule4->tclassid)
+                        net->ipv4.fib_num_tclassid_users++;
+        }
 #endif
 
         rule4->src_len = frh->src_len;
@@ -179,11 +182,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
         rule4->dstmask = inet_make_mask(rule4->dst_len);
         rule4->tos = frh->tos;
 
+        net->ipv4.fib_has_custom_rules = true;
         err = 0;
 errout:
         return err;
 }
 
+static void fib4_rule_delete(struct fib_rule *rule)
+{
+        struct net *net = rule->fr_net;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+        struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+        if (rule4->tclassid)
+                net->ipv4.fib_num_tclassid_users--;
+#endif
+        net->ipv4.fib_has_custom_rules = true;
+}
+
 static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                              struct nlattr **tb)
 {
@@ -256,6 +272,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
         .action         = fib4_rule_action,
         .match          = fib4_rule_match,
         .configure      = fib4_rule_configure,
+        .delete         = fib4_rule_delete,
         .compare        = fib4_rule_compare,
         .fill           = fib4_rule_fill,
         .default_pref   = fib_default_rule_pref,
@@ -295,6 +312,7 @@ int __net_init fib4_rules_init(struct net *net)
         if (err < 0)
                 goto fail;
         net->ipv4.rules_ops = ops;
+        net->ipv4.fib_has_custom_rules = false;
         return 0;
 
 fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e5b7182fa099..d71bfbdc0bf4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -163,6 +163,12 @@ void free_fib_info(struct fib_info *fi)
                 return;
         }
         fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+        change_nexthops(fi) {
+                if (nexthop_nh->nh_tclassid)
+                        fi->fib_net->ipv4.fib_num_tclassid_users--;
+        } endfor_nexthops(fi);
+#endif
         call_rcu(&fi->rcu, free_fib_info_rcu);
 }
 
@@ -421,6 +427,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 #ifdef CONFIG_IP_ROUTE_CLASSID
                         nla = nla_find(attrs, attrlen, RTA_FLOW);
                         nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+                        if (nexthop_nh->nh_tclassid)
+                                fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
                 }
 
@@ -779,9 +787,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
                         int type = nla_type(nla);
 
                         if (type) {
+                                u32 val;
+
                                 if (type > RTAX_MAX)
                                         goto err_inval;
-                                fi->fib_metrics[type - 1] = nla_get_u32(nla);
+                                val = nla_get_u32(nla);
+                                if (type == RTAX_ADVMSS && val > 65535 - 40)
+                                        val = 65535 - 40;
+                                if (type == RTAX_MTU && val > 65535 - 15)
+                                        val = 65535 - 15;
+                                fi->fib_metrics[type - 1] = val;
                         }
                 }
         }
@@ -810,6 +825,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
                 nh->nh_flags = cfg->fc_flags;
 #ifdef CONFIG_IP_ROUTE_CLASSID
                 nh->nh_tclassid = cfg->fc_flow;
+                if (nh->nh_tclassid)
+                        fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
                 nh->nh_weight = 1;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 30b88d7b4bd6..9b0f25930fbc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
         while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
                 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
-                tn = (struct tnode *) resize(t, (struct tnode *)tn);
+                tn = (struct tnode *)resize(t, tn);
 
-                tnode_put_child_reorg((struct tnode *)tp, cindex,
+                tnode_put_child_reorg(tp, cindex,
                                       (struct rt_trie_node *)tn, wasfull);
 
                 tp = node_parent((struct rt_trie_node *) tn);
@@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
         /* Handle last (top) tnode */
         if (IS_TNODE(tn))
-                tn = (struct tnode *)resize(t, (struct tnode *)tn);
+                tn = (struct tnode *)resize(t, tn);
 
         rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
         tnode_free_flush();
@@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
                 node_set_parent((struct rt_trie_node *)l, tp);
 
                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-                put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
+                put_child(t, tp, cindex, (struct rt_trie_node *)l);
         } else {
                 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
                 /*
@@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 
                 if (tp) {
                         cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-                        put_child(t, (struct tnode *)tp, cindex,
-                                  (struct rt_trie_node *)tn);
+                        put_child(t, tp, cindex, (struct rt_trie_node *)tn);
                 } else {
                         rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
                         tp = tn;
@@ -1620,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
 
         if (tp) {
                 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
-                put_child(t, (struct tnode *)tp, cindex, NULL);
+                put_child(t, tp, cindex, NULL);
                 trie_rebalance(t, tp);
         } else
                 RCU_INIT_POINTER(t->trie, NULL);
@@ -1844,6 +1843,8 @@ int fib_table_flush(struct fib_table *tb)
         if (ll && hlist_empty(&ll->list))
                 trie_leaf_remove(t, ll);
 
+        inetpeer_invalidate_tree(&tb->tb_peers);
+
         pr_debug("trie_flush found=%d\n", found);
         return found;
 }
@@ -1992,6 +1993,7 @@ struct fib_table *fib_trie_table(u32 id)
         tb->tb_id = id;
         tb->tb_default = -1;
         tb->tb_num_default = 0;
+        inet_peer_base_init(&tb->tb_peers);
 
         t = (struct trie *) tb->tb_data;
         memset(t, 0, sizeof(*t));
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c75efbdc71cb..4a049449305f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
 #include <net/checksum.h>
 #include <net/xfrm.h>
 #include <net/inet_common.h>
+#include <net/ip_fib.h>
 
 /*
  *      Build xmit assembly blocks
@@ -253,10 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 
         /* Limit if icmp type is enabled in ratemask. */
         if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
-                if (!rt->peer)
-                        rt_bind_peer(rt, fl4->daddr, 1);
-                rc = inet_peer_xrlim_allow(rt->peer,
+                struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+                rc = inet_peer_xrlim_allow(peer,
                                            net->ipv4.sysctl_icmp_ratelimit);
+                inet_putpeer(peer);
         }
 out:
         return rc;
@@ -334,7 +335,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
         struct flowi4 fl4;
         struct sock *sk;
         struct inet_sock *inet;
-        __be32 daddr;
+        __be32 daddr, saddr;
 
         if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
                 return;
@@ -348,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 
         inet->tos = ip_hdr(skb)->tos;
         daddr = ipc.addr = ip_hdr(skb)->saddr;
+        saddr = fib_compute_spec_dst(skb);
         ipc.opt = NULL;
         ipc.tx_flags = 0;
         if (icmp_param->replyopts.opt.opt.optlen) {
@@ -357,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
         }
         memset(&fl4, 0, sizeof(fl4));
         fl4.daddr = daddr;
-        fl4.saddr = rt->rt_spec_dst;
+        fl4.saddr = saddr;
         fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
         fl4.flowi4_proto = IPPROTO_ICMP;
         security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -638,12 +640,12 @@ EXPORT_SYMBOL(icmp_send);
 
 static void icmp_unreach(struct sk_buff *skb)
 {
+        const struct net_protocol *ipprot;
         const struct iphdr *iph;
         struct icmphdr *icmph;
-        int hash, protocol;
-        const struct net_protocol *ipprot;
-        u32 info = 0;
         struct net *net;
+        u32 info = 0;
+        int protocol;
 
         net = dev_net(skb_dst(skb)->dev);
 
@@ -674,9 +676,7 @@ static void icmp_unreach(struct sk_buff *skb)
                         LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
                                        &iph->daddr);
                 } else {
-                        info = ip_rt_frag_needed(net, iph,
-                                                 ntohs(icmph->un.frag.mtu),
-                                                 skb->dev);
+                        info = ntohs(icmph->un.frag.mtu);
                         if (!info)
                                 goto out;
                 }
@@ -734,9 +734,8 @@ static void icmp_unreach(struct sk_buff *skb)
          */
         raw_icmp_error(skb, protocol, info);
 
-        hash = protocol & (MAX_INET_PROTOS - 1);
         rcu_read_lock();
-        ipprot = rcu_dereference(inet_protos[hash]);
+        ipprot = rcu_dereference(inet_protos[protocol]);
         if (ipprot && ipprot->err_handler)
                 ipprot->err_handler(skb, info);
         rcu_read_unlock();
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f9ee7417f6a0..76825be3b643 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,17 +368,21 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 
 struct dst_entry *inet_csk_route_req(struct sock *sk,
                                      struct flowi4 *fl4,
-                                     const struct request_sock *req)
+                                     const struct request_sock *req,
+                                     bool nocache)
 {
         struct rtable *rt;
         const struct inet_request_sock *ireq = inet_rsk(req);
         struct ip_options_rcu *opt = inet_rsk(req)->opt;
         struct net *net = sock_net(sk);
+        int flags = inet_sk_flowi_flags(sk);
 
+        if (nocache)
+                flags |= FLOWI_FLAG_RT_NOCACHE;
         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                            sk->sk_protocol,
-                           inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS,
+                           flags,
                            (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
                            ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
         security_req_classify_flow(req, flowi4_to_flowi(fl4));
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 46d1e7199a8c..38064a285cca 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -46,9 +46,6 @@ struct inet_diag_entry {
         u16 userlocks;
 };
 
-#define INET_DIAG_PUT(skb, attrtype, attrlen) \
-        RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
-
 static DEFINE_MUTEX(inet_diag_table_mutex);
 
 static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
@@ -78,24 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
         const struct inet_sock *inet = inet_sk(sk);
         struct inet_diag_msg *r;
         struct nlmsghdr *nlh;
+        struct nlattr *attr;
         void *info = NULL;
-        struct inet_diag_meminfo *minfo = NULL;
-        unsigned char *b = skb_tail_pointer(skb);
         const struct inet_diag_handler *handler;
         int ext = req->idiag_ext;
 
         handler = inet_diag_table[req->sdiag_protocol];
         BUG_ON(handler == NULL);
 
-        nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
-        nlh->nlmsg_flags = nlmsg_flags;
+        nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+                        nlmsg_flags);
+        if (!nlh)
+                return -EMSGSIZE;
 
-        r = NLMSG_DATA(nlh);
+        r = nlmsg_data(nlh);
         BUG_ON(sk->sk_state == TCP_TIME_WAIT);
 
-        if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
-                minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
-
         r->idiag_family = sk->sk_family;
         r->idiag_state = sk->sk_state;
         r->idiag_timer = 0;
@@ -113,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
          * hence this needs to be included regardless of socket family.
          */
         if (ext & (1 << (INET_DIAG_TOS - 1)))
-                RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos);
+                if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+                        goto errout;
 
 #if IS_ENABLED(CONFIG_IPV6)
         if (r->idiag_family == AF_INET6) {
@@ -121,24 +117,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
                 *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
                 *(struct in6_addr *)r->id.idiag_dst = np->daddr;
+
                 if (ext & (1 << (INET_DIAG_TCLASS - 1)))
-                        RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass);
+                        if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
+                                goto errout;
         }
 #endif
 
         r->idiag_uid = sock_i_uid(sk);
         r->idiag_inode = sock_i_ino(sk);
 
-        if (minfo) {
-                minfo->idiag_rmem = sk_rmem_alloc_get(sk);
-                minfo->idiag_wmem = sk->sk_wmem_queued;
-                minfo->idiag_fmem = sk->sk_forward_alloc;
-                minfo->idiag_tmem = sk_wmem_alloc_get(sk);
+        if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+                struct inet_diag_meminfo minfo = {
+                        .idiag_rmem = sk_rmem_alloc_get(sk),
+                        .idiag_wmem = sk->sk_wmem_queued,
+                        .idiag_fmem = sk->sk_forward_alloc,
+                        .idiag_tmem = sk_wmem_alloc_get(sk),
+                };
+
+                if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+                        goto errout;
         }
 
         if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
                 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
-                        goto rtattr_failure;
+                        goto errout;
 
         if (icsk == NULL) {
                 handler->idiag_get_info(sk, r, NULL);
@@ -165,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
         }
 #undef EXPIRES_IN_MS
 
-        if (ext & (1 << (INET_DIAG_INFO - 1)))
-                info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info));
+        if (ext & (1 << (INET_DIAG_INFO - 1))) {
+                attr = nla_reserve(skb, INET_DIAG_INFO,
+                                   sizeof(struct tcp_info));
+                if (!attr)
+                        goto errout;
 
-        if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
-                const size_t len = strlen(icsk->icsk_ca_ops->name);
-
-                strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
-                       icsk->icsk_ca_ops->name);
+                info = nla_data(attr);
         }
 
+        if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
+                if (nla_put_string(skb, INET_DIAG_CONG,
+                                   icsk->icsk_ca_ops->name) < 0)
+                        goto errout;
+
         handler->idiag_get_info(sk, r, info);
 
         if (sk->sk_state < TCP_TIME_WAIT &&
@@ -182,12 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
                 icsk->icsk_ca_ops->get_info(sk, ext, skb);
 
 out:
-        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-        return skb->len;
+        return nlmsg_end(skb, nlh);
 
-rtattr_failure:
-nlmsg_failure:
-        nlmsg_trim(skb, b);
+errout:
+        nlmsg_cancel(skb, nlh);
         return -EMSGSIZE;
 }
 EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
@@ -208,14 +213,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 {
         long tmo;
         struct inet_diag_msg *r;
-        const unsigned char *previous_tail = skb_tail_pointer(skb);
-        struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
-                                         unlh->nlmsg_type, sizeof(*r));
+        struct nlmsghdr *nlh;
 
-        r = NLMSG_DATA(nlh);
-        BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+        nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+                        nlmsg_flags);
+        if (!nlh)
+                return -EMSGSIZE;
 
-        nlh->nlmsg_flags = nlmsg_flags;
+        r = nlmsg_data(nlh);
+        BUG_ON(tw->tw_state != TCP_TIME_WAIT);
 
         tmo = tw->tw_ttd - jiffies;
         if (tmo < 0)
@@ -245,11 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
                 *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
         }
 #endif
-        nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
-        return skb->len;
-nlmsg_failure:
-        nlmsg_trim(skb, previous_tail);
-        return -EMSGSIZE;
+
+        return nlmsg_end(skb, nlh);
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -298,20 +301,20 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
         if (err)
                 goto out;
 
-        err = -ENOMEM;
-        rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
-                                     sizeof(struct inet_diag_meminfo) +
-                                     sizeof(struct tcp_info) + 64)),
-                        GFP_KERNEL);
-        if (!rep)
+        rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+                        sizeof(struct inet_diag_meminfo) +
+                        sizeof(struct tcp_info) + 64, GFP_KERNEL);
+        if (!rep) {
+                err = -ENOMEM;
                 goto out;
+        }
 
         err = sk_diag_fill(sk, rep, req,
                            NETLINK_CB(in_skb).pid,
                            nlh->nlmsg_seq, 0, nlh);
         if (err < 0) {
                 WARN_ON(err == -EMSGSIZE);
-                kfree_skb(rep);
+                nlmsg_free(rep);
                 goto out;
         }
         err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
@@ -592,15 +595,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 {
         const struct inet_request_sock *ireq = inet_rsk(req);
         struct inet_sock *inet = inet_sk(sk);
-        unsigned char *b = skb_tail_pointer(skb);
         struct inet_diag_msg *r;
         struct nlmsghdr *nlh;
         long tmo;
 
-        nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
-        nlh->nlmsg_flags = NLM_F_MULTI;
-        r = NLMSG_DATA(nlh);
+        nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+                        NLM_F_MULTI);
+        if (!nlh)
+                return -EMSGSIZE;
 
+        r = nlmsg_data(nlh);
         r->idiag_family = sk->sk_family;
         r->idiag_state = TCP_SYN_RECV;
         r->idiag_timer = 1;
@@ -628,13 +632,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
                 *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
         }
 #endif
-        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-
-        return skb->len;
 
-nlmsg_failure:
-        nlmsg_trim(skb, b);
-        return -1;
+        return nlmsg_end(skb, nlh);
 }
 
 static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
@@ -892,7 +891,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
         if (nlmsg_attrlen(cb->nlh, hdrlen))
                 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
 
-        return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc);
+        return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
 }
 
 static inline int inet_diag_type2proto(int type)
@@ -909,7 +908,7 @@ static inline int inet_diag_type2proto(int type)
 
 static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
 {
-        struct inet_diag_req *rc = NLMSG_DATA(cb->nlh);
+        struct inet_diag_req *rc = nlmsg_data(cb->nlh);
         struct inet_diag_req_v2 req;
         struct nlattr *bc = NULL;
         int hdrlen = sizeof(struct inet_diag_req);
@@ -929,7 +928,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
 static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
                                       const struct nlmsghdr *nlh)
 {
-        struct inet_diag_req *rc = NLMSG_DATA(nlh);
+        struct inet_diag_req *rc = nlmsg_data(nlh);
         struct inet_diag_req_v2 req;
 
         req.sdiag_family = rc->idiag_family;
@@ -996,7 +995,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
                 }
         }
 
-        return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h));
+        return inet_diag_get_exact(skb, h, nlmsg_data(h));
 }
 
 static const struct sock_diag_handler inet_diag_handler = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51b6d0c..85190e69297b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
         if (q == NULL)
                 return NULL;
 
+        q->net = nf;
         f->constructor(q, arg);
         atomic_add(f->qsize, &nf->mem);
         setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
         spin_lock_init(&q->lock);
         atomic_set(&q->refcnt, 1);
-        q->net = nf;
 
         return q;
 }
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index dfba343b2509..e1e0a4e8fd34 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -82,23 +82,39 @@ static const struct inet_peer peer_fake_node = {
         .avl_height = 0
 };
 
-struct inet_peer_base {
-        struct inet_peer __rcu *root;
-        seqlock_t lock;
-        int total;
-};
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+        bp->root = peer_avl_empty_rcu;
+        seqlock_init(&bp->lock);
+        bp->flush_seq = ~0U;
+        bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
 
-static struct inet_peer_base v4_peers = {
-        .root = peer_avl_empty_rcu,
-        .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
-        .total = 0,
-};
+static atomic_t v4_seq = ATOMIC_INIT(0);
+static atomic_t v6_seq = ATOMIC_INIT(0);
 
-static struct inet_peer_base v6_peers = {
-        .root = peer_avl_empty_rcu,
-        .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
-        .total = 0,
-};
+static atomic_t *inetpeer_seq_ptr(int family)
+{
+        return (family == AF_INET ? &v4_seq : &v6_seq);
+}
+
+static inline void flush_check(struct inet_peer_base *base, int family)
+{
+        atomic_t *fp = inetpeer_seq_ptr(family);
+
+        if (unlikely(base->flush_seq != atomic_read(fp))) {
+                inetpeer_invalidate_tree(base);
+                base->flush_seq = atomic_read(fp);
+        }
+}
+
+void inetpeer_invalidate_family(int family)
+{
+        atomic_t *fp = inetpeer_seq_ptr(family);
+
+        atomic_inc(fp);
+}
 
 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
 
@@ -110,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
 
 static void inetpeer_gc_worker(struct work_struct *work)
 {
-        struct inet_peer *p, *n;
+        struct inet_peer *p, *n, *c;
         LIST_HEAD(list);
 
         spin_lock_bh(&gc_lock);
@@ -122,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work)
 
         list_for_each_entry_safe(p, n, &list, gc_list) {
 
-                if(need_resched())
+                if (need_resched())
                         cond_resched();
 
-                if (p->avl_left != peer_avl_empty) {
-                        list_add_tail(&p->avl_left->gc_list, &list);
-                        p->avl_left = peer_avl_empty;
+                c = rcu_dereference_protected(p->avl_left, 1);
+                if (c != peer_avl_empty) {
+                        list_add_tail(&c->gc_list, &list);
+                        p->avl_left = peer_avl_empty_rcu;
                 }
 
-                if (p->avl_right != peer_avl_empty) {
-                        list_add_tail(&p->avl_right->gc_list, &list);
-                        p->avl_right = peer_avl_empty;
+                c = rcu_dereference_protected(p->avl_right, 1);
+                if (c != peer_avl_empty) {
+                        list_add_tail(&c->gc_list, &list);
+                        p->avl_right = peer_avl_empty_rcu;
                 }
 
                 n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
@@ -401,11 +419,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
         call_rcu(&p->rcu, inetpeer_free_rcu);
 }
 
-static struct inet_peer_base *family_to_base(int family)
-{
-        return family == AF_INET ? &v4_peers : &v6_peers;
-}
-
 /* perform garbage collect on all items stacked during a lookup */
 static int inet_peer_gc(struct inet_peer_base *base,
                         struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -443,14 +456,17 @@ static int inet_peer_gc(struct inet_peer_base *base,
         return cnt;
 }
 
-struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+                               const struct inetpeer_addr *daddr,
+                               int create)
 {
         struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
-        struct inet_peer_base *base = family_to_base(daddr->family);
         struct inet_peer *p;
         unsigned int sequence;
         int invalidated, gccnt = 0;
 
+        flush_check(base, daddr->family);
+
         /* Attempt a lockless lookup first.
          * Because of a concurrent writer, we might not find an existing entry.
          */
@@ -492,13 +508,9 @@ relookup:
                         (daddr->family == AF_INET) ?
                                 secure_ip_id(daddr->addr.a4) :
                                 secure_ipv6_id(daddr->addr.a6));
-                p->tcp_ts_stamp = 0;
                 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
                 p->rate_tokens = 0;
                 p->rate_last = 0;
-                p->pmtu_expires = 0;
-                p->pmtu_orig = 0;
-                memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
                 INIT_LIST_HEAD(&p->gc_list);
 
                 /* Link the node. */
@@ -571,26 +583,19 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
                 schedule_delayed_work(&gc_work, gc_delay);
 }
 
-void inetpeer_invalidate_tree(int family)
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
 {
-        struct inet_peer *old, *new, *prev;
-        struct inet_peer_base *base = family_to_base(family);
+        struct inet_peer *root;
 
         write_seqlock_bh(&base->lock);
 
-        old = base->root;
-        if (old == peer_avl_empty_rcu)
-                goto out;
-
-        new = peer_avl_empty_rcu;
-
-        prev = cmpxchg(&base->root, old, new);
-        if (prev == old) {
+        root = rcu_deref_locked(base->root, base);
+        if (root != peer_avl_empty) {
+                base->root = peer_avl_empty_rcu;
                 base->total = 0;
-                call_rcu(&prev->gc_rcu, inetpeer_inval_rcu);
+                call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
         }
 
-out:
         write_sequnlock_bh(&base->lock);
 }
 EXPORT_SYMBOL(inetpeer_invalidate_tree);
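[Illustration, not part of the patch.] The flush_seq/v4_seq/v6_seq machinery added above is a generation-counter scheme: inetpeer_invalidate_family() just bumps an atomic counter, and each inet_peer_base lazily flushes itself the next time flush_check() sees a mismatch. A compact standalone sketch of the same pattern (C11 atomics; the names are local to the sketch):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint family_seq;          /* bumped once to invalidate every base */

struct peer_base {
        unsigned int flush_seq;         /* generation this base last synced to */
        int total;                      /* stand-in for the cached peer tree */
};

static void flush_check(struct peer_base *base)
{
        unsigned int cur = atomic_load(&family_seq);

        if (base->flush_seq != cur) {   /* stale: drop cached state lazily */
                base->total = 0;
                base->flush_seq = cur;
        }
}

int main(void)
{
        struct peer_base base = { .flush_seq = ~0U, .total = 42 };

        atomic_fetch_add(&family_seq, 1);   /* like inetpeer_invalidate_family() */
        flush_check(&base);                 /* next lookup notices the bump */
        printf("total=%d\n", base.total);   /* 0 */
        return 0;
}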
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9dbd3dd6022d..8d07c973409c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
 static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 {
         struct ipq *qp = container_of(q, struct ipq, q);
+        struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+                                               frags);
+        struct net *net = container_of(ipv4, struct net, ipv4);
+
         struct ip4_create_arg *arg = a;
 
         qp->protocol = arg->iph->protocol;
@@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
         qp->daddr = arg->iph->daddr;
         qp->user = arg->user;
         qp->peer = sysctl_ipfrag_max_dist ?
-                inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
+                inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
 }
 
 static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f49047b79609..594cec35ac4d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -516,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
         case ICMP_PORT_UNREACH:
                 /* Impossible event. */
                 return;
-        case ICMP_FRAG_NEEDED:
-                /* Soft state for pmtu is maintained by IP core. */
-                return;
         default:
                 /* All others are translated to HOST_UNREACH.
                    rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -538,7 +535,16 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
                                 flags & GRE_KEY ?
                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
                                 p[1]);
-        if (t == NULL || t->parms.iph.daddr == 0 ||
+        if (t == NULL)
+                goto out;
+
+        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+                                 t->parms.link, 0, IPPROTO_GRE, 0);
+                goto out;
+        }
+
+        if (t->parms.iph.daddr == 0 ||
             ipv4_is_multicast(t->parms.iph.daddr))
                 goto out;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144ca330..b27d4440f523 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
         rcu_read_lock();
         {
                 int protocol = ip_hdr(skb)->protocol;
-                int hash, raw;
                 const struct net_protocol *ipprot;
+                int raw;
 
         resubmit:
                 raw = raw_local_deliver(skb, protocol);
 
-                hash = protocol & (MAX_INET_PROTOS - 1);
-                ipprot = rcu_dereference(inet_protos[hash]);
+                ipprot = rcu_dereference(inet_protos[protocol]);
                 if (ipprot != NULL) {
                         int ret;
 
@@ -314,26 +313,33 @@ drop:
314 return true; 313 return true;
315} 314}
316 315
316int sysctl_ip_early_demux __read_mostly = 1;
317
317static int ip_rcv_finish(struct sk_buff *skb) 318static int ip_rcv_finish(struct sk_buff *skb)
318{ 319{
319 const struct iphdr *iph = ip_hdr(skb); 320 const struct iphdr *iph = ip_hdr(skb);
320 struct rtable *rt; 321 struct rtable *rt;
321 322
323 if (sysctl_ip_early_demux && !skb_dst(skb)) {
324 const struct net_protocol *ipprot;
325 int protocol = iph->protocol;
326
327 rcu_read_lock();
328 ipprot = rcu_dereference(inet_protos[protocol]);
329 if (ipprot && ipprot->early_demux)
330 ipprot->early_demux(skb);
331 rcu_read_unlock();
332 }
333
322 /* 334 /*
323 * Initialise the virtual path cache for the packet. It describes 335 * Initialise the virtual path cache for the packet. It describes
324 * how the packet travels inside Linux networking. 336 * how the packet travels inside Linux networking.
325 */ 337 */
326 if (skb_dst(skb) == NULL) { 338 if (!skb_dst(skb)) {
327 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 339 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
328 iph->tos, skb->dev); 340 iph->tos, skb->dev);
329 if (unlikely(err)) { 341 if (unlikely(err)) {
330 if (err == -EHOSTUNREACH) 342 if (err == -EXDEV)
331 IP_INC_STATS_BH(dev_net(skb->dev),
332 IPSTATS_MIB_INADDRERRORS);
333 else if (err == -ENETUNREACH)
334 IP_INC_STATS_BH(dev_net(skb->dev),
335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
337 NET_INC_STATS_BH(dev_net(skb->dev), 343 NET_INC_STATS_BH(dev_net(skb->dev),
338 LINUX_MIB_IPRPFILTER); 344 LINUX_MIB_IPRPFILTER);
339 goto drop; 345 goto drop;
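
Two independent changes land in ip_input.c: inet_protos[] is now indexed by the full 8-bit protocol number instead of a masked hash, and an optional early_demux hook runs before the route lookup so an established socket can supply its cached dst. A self-contained userspace sketch of that dispatch shape (all names hypothetical):

#include <stdio.h>

struct proto_ops {
        void (*early_demux)(int pkt);   /* optional fast-path hook */
        int  (*handler)(int pkt);       /* normal delivery */
};

static const struct proto_ops *protos[256];     /* one slot per u8 proto */

static void tcp_early_demux(int pkt) { printf("early demux %d\n", pkt); }
static int  tcp_deliver(int pkt)     { printf("deliver %d\n", pkt); return 0; }

static const struct proto_ops tcp_ops = { tcp_early_demux, tcp_deliver };

int main(void)
{
        int protocol = 6, pkt = 42;     /* IPPROTO_TCP */
        const struct proto_ops *ops;

        protos[6] = &tcp_ops;
        ops = protos[protocol];         /* direct index, no hash mask */
        if (ops && ops->early_demux)    /* tried before the route lookup */
                ops->early_demux(pkt);
        if (ops)
                ops->handler(pkt);
        return 0;
}
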
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 708b99494e23..a19d6471a318 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -27,6 +27,7 @@
27#include <net/icmp.h> 27#include <net/icmp.h>
28#include <net/route.h> 28#include <net/route.h>
29#include <net/cipso_ipv4.h> 29#include <net/cipso_ipv4.h>
30#include <net/ip_fib.h>
30 31
31/* 32/*
32 * Write options to IP header, record destination address to 33 * Write options to IP header, record destination address to
@@ -104,7 +105,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
104 sptr = skb_network_header(skb); 105 sptr = skb_network_header(skb);
105 dptr = dopt->__data; 106 dptr = dopt->__data;
106 107
107 daddr = skb_rtable(skb)->rt_spec_dst; 108 daddr = fib_compute_spec_dst(skb);
108 109
109 if (sopt->rr) { 110 if (sopt->rr) {
110 optlen = sptr[sopt->rr+1]; 111 optlen = sptr[sopt->rr+1];
@@ -241,6 +242,15 @@ void ip_options_fragment(struct sk_buff *skb)
241 opt->ts_needtime = 0; 242 opt->ts_needtime = 0;
242} 243}
243 244
245/* helper used by ip_options_compile() to call fib_compute_spec_dst()
246 * at most once.
247 */
248static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
249{
250 if (*spec_dst == htonl(INADDR_ANY))
251 *spec_dst = fib_compute_spec_dst(skb);
252}
253
244/* 254/*
245 * Verify options and fill pointers in struct options. 255 * Verify options and fill pointers in struct options.
246 * Caller should clear *opt, and set opt->data. 256 * Caller should clear *opt, and set opt->data.
@@ -250,12 +260,12 @@ void ip_options_fragment(struct sk_buff *skb)
250int ip_options_compile(struct net *net, 260int ip_options_compile(struct net *net,
251 struct ip_options *opt, struct sk_buff *skb) 261 struct ip_options *opt, struct sk_buff *skb)
252{ 262{
253 int l; 263 __be32 spec_dst = htonl(INADDR_ANY);
254 unsigned char *iph;
255 unsigned char *optptr;
256 int optlen;
257 unsigned char *pp_ptr = NULL; 264 unsigned char *pp_ptr = NULL;
258 struct rtable *rt = NULL; 265 struct rtable *rt = NULL;
266 unsigned char *optptr;
267 unsigned char *iph;
268 int optlen, l;
259 269
260 if (skb != NULL) { 270 if (skb != NULL) {
261 rt = skb_rtable(skb); 271 rt = skb_rtable(skb);
@@ -331,7 +341,8 @@ int ip_options_compile(struct net *net,
331 goto error; 341 goto error;
332 } 342 }
333 if (rt) { 343 if (rt) {
334 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 344 spec_dst_fill(&spec_dst, skb);
345 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
335 opt->is_changed = 1; 346 opt->is_changed = 1;
336 } 347 }
337 optptr[2] += 4; 348 optptr[2] += 4;
@@ -373,7 +384,8 @@ int ip_options_compile(struct net *net,
373 } 384 }
374 opt->ts = optptr - iph; 385 opt->ts = optptr - iph;
375 if (rt) { 386 if (rt) {
376 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 387 spec_dst_fill(&spec_dst, skb);
388 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
377 timeptr = &optptr[optptr[2]+3]; 389 timeptr = &optptr[optptr[2]+3];
378 } 390 }
379 opt->ts_needaddr = 1; 391 opt->ts_needaddr = 1;
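
spec_dst_fill() replaces the per-packet rt_spec_dst field with a lazily computed, memoized value: the fib lookup runs only when an RR or timestamp option actually needs the address, and at most once per packet. A self-contained userspace model of that memoization (the lookup body is a stand-in):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>

/* stand-in for fib_compute_spec_dst(): pretend it is costly */
static uint32_t expensive_fib_lookup(void)
{
        puts("fib lookup runs");
        return htonl(0x7f000001);
}

static void spec_dst_fill(uint32_t *spec_dst)
{
        if (*spec_dst == htonl(INADDR_ANY))     /* first caller only */
                *spec_dst = expensive_fib_lookup();
}

int main(void)
{
        uint32_t spec_dst = htonl(INADDR_ANY);

        spec_dst_fill(&spec_dst);       /* computes */
        spec_dst_fill(&spec_dst);       /* reuses the cached value */
        printf("%08x\n", spec_dst);
        return 0;
}
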
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 451f97c42eb4..cc52679790b2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb)
113} 113}
114EXPORT_SYMBOL_GPL(ip_local_out); 114EXPORT_SYMBOL_GPL(ip_local_out);
115 115
116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
119 skb_reset_mac_header(newskb);
120 __skb_pull(newskb, skb_network_offset(newskb));
121 newskb->pkt_type = PACKET_LOOPBACK;
122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
123 WARN_ON(!skb_dst(newskb));
124 skb_dst_force(newskb);
125 netif_rx_ni(newskb);
126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 116static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{ 117{
131 int ttl = inet->uc_ttl; 118 int ttl = inet->uc_ttl;
@@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
183 struct net_device *dev = dst->dev; 170 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev); 171 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh; 172 struct neighbour *neigh;
173 u32 nexthop;
186 174
187 if (rt->rt_type == RTN_MULTICAST) { 175 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -200,19 +188,22 @@ static inline int ip_finish_output2(struct sk_buff *skb)
200 } 188 }
201 if (skb->sk) 189 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk); 190 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb); 191 consume_skb(skb);
204 skb = skb2; 192 skb = skb2;
205 } 193 }
206 194
207 rcu_read_lock(); 195 rcu_read_lock_bh();
208 neigh = dst_get_neighbour_noref(dst); 196 nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
198 if (unlikely(!neigh))
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
209 if (neigh) { 200 if (neigh) {
210 int res = neigh_output(neigh, skb); 201 int res = dst_neigh_output(dst, neigh, skb);
211 202
212 rcu_read_unlock(); 203 rcu_read_unlock_bh();
213 return res; 204 return res;
214 } 205 }
215 rcu_read_unlock(); 206 rcu_read_unlock_bh();
216 207
217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n", 208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 __func__); 209 __func__);
@@ -281,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb)
281 if (newskb) 272 if (newskb)
282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 273 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev, 274 newskb, NULL, newskb->dev,
284 ip_dev_loopback_xmit); 275 dev_loopback_xmit);
285 } 276 }
286 277
287 /* Multicasts with ttl 0 must not go beyond the host */ 278 /* Multicasts with ttl 0 must not go beyond the host */
@@ -296,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb)
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 287 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb) 288 if (newskb)
298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 289 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit); 290 NULL, newskb->dev, dev_loopback_xmit);
300 } 291 }
301 292
302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 293 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -709,7 +700,7 @@ slow_path:
709 700
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 701 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
711 } 702 }
712 kfree_skb(skb); 703 consume_skb(skb);
713 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 704 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
714 return err; 705 return err;
715 706
@@ -1472,13 +1463,14 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1472 1463
1473/* 1464/*
1474 * Generic function to send a packet as reply to another packet. 1465 * Generic function to send a packet as reply to another packet.
1475 * Used to send TCP resets so far. ICMP should use this function too. 1466 * Used to send TCP resets so far.
1476 * 1467 *
1477 * Should run single threaded per socket because it uses the sock 1468 * Should run single threaded per socket because it uses the sock
1478 * structure to pass arguments. 1469 * structure to pass arguments.
1479 */ 1470 */
1480void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 1471void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1481 const struct ip_reply_arg *arg, unsigned int len) 1472 __be32 saddr, const struct ip_reply_arg *arg,
1473 unsigned int len)
1482{ 1474{
1483 struct inet_sock *inet = inet_sk(sk); 1475 struct inet_sock *inet = inet_sk(sk);
1484 struct ip_options_data replyopts; 1476 struct ip_options_data replyopts;
@@ -1504,7 +1496,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1504 RT_TOS(arg->tos), 1496 RT_TOS(arg->tos),
1505 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1497 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1506 ip_reply_arg_flowi_flags(arg), 1498 ip_reply_arg_flowi_flags(arg),
1507 daddr, rt->rt_spec_dst, 1499 daddr, saddr,
1508 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1500 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1501 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 rt = ip_route_output_key(sock_net(sk), &fl4); 1502 rt = ip_route_output_key(sock_net(sk), &fl4);
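
The rewritten ip_finish_output2() drops the neighbour pointer cached in the dst: the lookup key is derived on the spot, the lookup itself is lockless, and a missing entry is created on demand, all inside rcu_read_lock_bh(). A condensed sketch of that resolution step (kernel API, not standalone; the function name is hypothetical and error paths are trimmed):

static int resolve_and_output(struct rtable *rt, struct sk_buff *skb,
                              struct net_device *dev)
{
        struct neighbour *neigh;
        u32 nexthop;
        int res = -EINVAL;

        rcu_read_lock_bh();
        /* gateway if the route has one, else the final destination */
        nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
        neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
        if (!IS_ERR(neigh))
                res = dst_neigh_output(&rt->dst, neigh, skb);
        rcu_read_unlock_bh();
        return res;
}
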
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0d11f234d615..de29f46f68b0 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -40,6 +40,7 @@
40#if IS_ENABLED(CONFIG_IPV6) 40#if IS_ENABLED(CONFIG_IPV6)
41#include <net/transp_v6.h> 41#include <net/transp_v6.h>
42#endif 42#endif
43#include <net/ip_fib.h>
43 44
44#include <linux/errqueue.h> 45#include <linux/errqueue.h>
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
@@ -1019,8 +1020,8 @@ e_inval:
1019 * @sk: socket 1020 * @sk: socket
1020 * @skb: buffer 1021 * @skb: buffer
1021 * 1022 *
1022 * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst 1023 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
1023 * in skb->cb[] before dst drop. 1024 * destination in skb->cb[] before dst drop.
1024 * This way, receiver doesnt make cache line misses to read rtable. 1025 * This way, receiver doesnt make cache line misses to read rtable.
1025 */ 1026 */
1026void ipv4_pktinfo_prepare(struct sk_buff *skb) 1027void ipv4_pktinfo_prepare(struct sk_buff *skb)
@@ -1030,7 +1031,7 @@ void ipv4_pktinfo_prepare(struct sk_buff *skb)
1030 1031
1031 if (rt) { 1032 if (rt) {
1032 pktinfo->ipi_ifindex = rt->rt_iif; 1033 pktinfo->ipi_ifindex = rt->rt_iif;
1033 pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; 1034 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1034 } else { 1035 } else {
1035 pktinfo->ipi_ifindex = 0; 1036 pktinfo->ipi_ifindex = 0;
1036 pktinfo->ipi_spec_dst.s_addr = 0; 1037 pktinfo->ipi_spec_dst.s_addr = 0;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 63b64c45a826..b91375482d84 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -42,6 +42,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
42 return; 42 return;
43 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", 43 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
44 spi, &iph->daddr); 44 spi, &iph->daddr);
45 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
45 xfrm_state_put(x); 46 xfrm_state_put(x);
46} 47}
47 48
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2d0f99bf61b3..715338a1b205 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -348,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
348 case ICMP_PORT_UNREACH: 348 case ICMP_PORT_UNREACH:
349 /* Impossible event. */ 349 /* Impossible event. */
350 return 0; 350 return 0;
351 case ICMP_FRAG_NEEDED:
352 /* Soft state for pmtu is maintained by IP core. */
353 return 0;
354 default: 351 default:
355 /* All others are translated to HOST_UNREACH. 352 /* All others are translated to HOST_UNREACH.
356 rfc2003 contains "deep thoughts" about NET_UNREACH, 353 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -369,7 +366,17 @@ static int ipip_err(struct sk_buff *skb, u32 info)
369 366
370 rcu_read_lock(); 367 rcu_read_lock();
371 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 368 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
372 if (t == NULL || t->parms.iph.daddr == 0) 369 if (t == NULL)
370 goto out;
371
372 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
373 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
374 t->dev->ifindex, 0, IPPROTO_IPIP, 0);
375 err = 0;
376 goto out;
377 }
378
379 if (t->parms.iph.daddr == 0)
373 goto out; 380 goto out;
374 381
375 err = 0; 382 err = 0;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c94bbc6f2ba3..5716c6b808d6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -524,8 +524,8 @@ failure:
524} 524}
525#endif 525#endif
526 526
527/* 527/**
528 * Delete a VIF entry 528 * vif_delete - Delete a VIF entry
529 * @notify: Set to 1 if the caller is a notifier_call 529 * @notify: Set to 1 if the caller is a notifier_call
530 */ 530 */
531 531
@@ -2006,37 +2006,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2006{ 2006{
2007 int ct; 2007 int ct;
2008 struct rtnexthop *nhp; 2008 struct rtnexthop *nhp;
2009 u8 *b = skb_tail_pointer(skb); 2009 struct nlattr *mp_attr;
2010 struct rtattr *mp_head;
2011 2010
2012 /* If cache is unresolved, don't try to parse IIF and OIF */ 2011 /* If cache is unresolved, don't try to parse IIF and OIF */
2013 if (c->mfc_parent >= MAXVIFS) 2012 if (c->mfc_parent >= MAXVIFS)
2014 return -ENOENT; 2013 return -ENOENT;
2015 2014
2016 if (VIF_EXISTS(mrt, c->mfc_parent)) 2015 if (VIF_EXISTS(mrt, c->mfc_parent) &&
2017 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); 2016 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
2017 return -EMSGSIZE;
2018 2018
2019 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); 2019 if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
2020 return -EMSGSIZE;
2020 2021
2021 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 2022 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
2022 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { 2023 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
2023 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 2024 if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
2024 goto rtattr_failure; 2025 nla_nest_cancel(skb, mp_attr);
2025 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 2026 return -EMSGSIZE;
2027 }
2028
2026 nhp->rtnh_flags = 0; 2029 nhp->rtnh_flags = 0;
2027 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 2030 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
2028 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; 2031 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
2029 nhp->rtnh_len = sizeof(*nhp); 2032 nhp->rtnh_len = sizeof(*nhp);
2030 } 2033 }
2031 } 2034 }
2032 mp_head->rta_type = RTA_MULTIPATH; 2035
2033 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; 2036 nla_nest_end(skb, mp_attr);
2037
2034 rtm->rtm_type = RTN_MULTICAST; 2038 rtm->rtm_type = RTN_MULTICAST;
2035 return 1; 2039 return 1;
2036
2037rtattr_failure:
2038 nlmsg_trim(skb, b);
2039 return -EMSGSIZE;
2040} 2040}
2041 2041
2042int ipmr_get_route(struct net *net, struct sk_buff *skb, 2042int ipmr_get_route(struct net *net, struct sk_buff *skb,
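
The RTA_PUT/rtattr open-coding gives way to the nla_nest_* helpers, whose cancel path rewinds the whole nest instead of a hand-trimmed tail. A condensed in-tree-style sketch of the pattern (kernel API, not standalone; the fill function is hypothetical):

static int fill_multipath(struct sk_buff *skb, int ifindex, u8 ttl)
{
        struct nlattr *nest;
        struct rtnexthop *nhp;

        nest = nla_nest_start(skb, RTA_MULTIPATH);
        if (!nest)
                return -EMSGSIZE;
        nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
        if (!nhp) {
                nla_nest_cancel(skb, nest);     /* rewinds everything */
                return -EMSGSIZE;
        }
        nhp->rtnh_flags = 0;
        nhp->rtnh_hops = ttl;
        nhp->rtnh_ifindex = ifindex;
        nhp->rtnh_len = sizeof(*nhp);
        nla_nest_end(skb, nest);                /* patches the nest length */
        return 0;
}
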
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index ba5756d20165..1109f7f6c254 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum,
196 196
197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); 197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
198 198
199 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ 199 nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
200 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 200 sizeof(*pm)+copy_len, 0);
201 sizeof(*pm)+copy_len); 201 if (!nlh) {
202 pr_debug("error during nlmsg_put\n");
203 goto out_unlock;
204 }
202 ub->qlen++; 205 ub->qlen++;
203 206
204 pm = NLMSG_DATA(nlh); 207 pm = nlmsg_data(nlh);
205 208
206 /* We might not have a timestamp, get one */ 209 /* We might not have a timestamp, get one */
207 if (skb->tstamp.tv64 == 0) 210 if (skb->tstamp.tv64 == 0)
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
261 nlh->nlmsg_type = NLMSG_DONE; 264 nlh->nlmsg_type = NLMSG_DONE;
262 ulog_send(groupnum); 265 ulog_send(groupnum);
263 } 266 }
264 267out_unlock:
265 spin_unlock_bh(&ulog_lock); 268 spin_unlock_bh(&ulog_lock);
266 269
267 return; 270 return;
268 271
269nlmsg_failure:
270 pr_debug("error during NLMSG_PUT\n");
271alloc_failure: 272alloc_failure:
272 pr_debug("Error building netlink message\n"); 273 pr_debug("Error building netlink message\n");
273 spin_unlock_bh(&ulog_lock); 274 spin_unlock_bh(&ulog_lock);
@@ -380,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
380static int __init ulog_tg_init(void) 381static int __init ulog_tg_init(void)
381{ 382{
382 int ret, i; 383 int ret, i;
384 struct netlink_kernel_cfg cfg = {
385 .groups = ULOG_MAXNLGROUPS,
386 };
383 387
384 pr_debug("init module\n"); 388 pr_debug("init module\n");
385 389
@@ -392,9 +396,8 @@ static int __init ulog_tg_init(void)
392 for (i = 0; i < ULOG_MAXNLGROUPS; i++) 396 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
393 setup_timer(&ulog_buffers[i].timer, ulog_timer, i); 397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
394 398
395 nflognl = netlink_kernel_create(&init_net, 399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
396 NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, 400 THIS_MODULE, &cfg);
397 NULL, THIS_MODULE);
398 if (!nflognl) 401 if (!nflognl)
399 return -ENOMEM; 402 return -ENOMEM;
400 403
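
Both ULOG hunks are API migrations: NLMSG_PUT()'s hidden "goto nlmsg_failure" becomes an explicit NULL test, and netlink_kernel_create() now takes its options bundled in a struct netlink_kernel_cfg rather than a growing positional list. The new calling forms in brief (kernel API, fragment only):

        struct netlink_kernel_cfg cfg = {
                .groups = ULOG_MAXNLGROUPS,     /* set only what you need */
        };

        nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
                                        THIS_MODULE, &cfg);
        if (!nflognl)
                return -ENOMEM;

        nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
                        sizeof(*pm) + copy_len, 0);
        if (!nlh)                               /* no hidden goto anymore */
                goto out_unlock;
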
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 91747d4ebc26..e7ff2dcab6ce 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -95,11 +95,11 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
95 return NF_ACCEPT; 95 return NF_ACCEPT;
96} 96}
97 97
98static unsigned int ipv4_confirm(unsigned int hooknum, 98static unsigned int ipv4_helper(unsigned int hooknum,
99 struct sk_buff *skb, 99 struct sk_buff *skb,
100 const struct net_device *in, 100 const struct net_device *in,
101 const struct net_device *out, 101 const struct net_device *out,
102 int (*okfn)(struct sk_buff *)) 102 int (*okfn)(struct sk_buff *))
103{ 103{
104 struct nf_conn *ct; 104 struct nf_conn *ct;
105 enum ip_conntrack_info ctinfo; 105 enum ip_conntrack_info ctinfo;
@@ -110,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
110 /* This is where we call the helper: as the packet goes out. */ 110 /* This is where we call the helper: as the packet goes out. */
111 ct = nf_ct_get(skb, &ctinfo); 111 ct = nf_ct_get(skb, &ctinfo);
112 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 112 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
113 goto out; 113 return NF_ACCEPT;
114 114
115 help = nfct_help(ct); 115 help = nfct_help(ct);
116 if (!help) 116 if (!help)
117 goto out; 117 return NF_ACCEPT;
118 118
119 /* rcu_read_lock()ed by nf_hook_slow */ 119 /* rcu_read_lock()ed by nf_hook_slow */
120 helper = rcu_dereference(help->helper); 120 helper = rcu_dereference(help->helper);
121 if (!helper) 121 if (!helper)
122 goto out; 122 return NF_ACCEPT;
123 123
124 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), 124 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
125 ct, ctinfo); 125 ct, ctinfo);
126 if (ret != NF_ACCEPT) { 126 if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
127 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, 127 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
128 "nf_ct_%s: dropping packet", helper->name); 128 "nf_ct_%s: dropping packet", helper->name);
129 return ret;
130 } 129 }
130 return ret;
131}
132
133static unsigned int ipv4_confirm(unsigned int hooknum,
134 struct sk_buff *skb,
135 const struct net_device *in,
136 const struct net_device *out,
137 int (*okfn)(struct sk_buff *))
138{
139 struct nf_conn *ct;
140 enum ip_conntrack_info ctinfo;
141
142 ct = nf_ct_get(skb, &ctinfo);
143 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
144 goto out;
131 145
132 /* adjust seqs for loopback traffic only in outgoing direction */ 146 /* adjust seqs for loopback traffic only in outgoing direction */
133 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 147 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
@@ -185,6 +199,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
185 .priority = NF_IP_PRI_CONNTRACK, 199 .priority = NF_IP_PRI_CONNTRACK,
186 }, 200 },
187 { 201 {
202 .hook = ipv4_helper,
203 .owner = THIS_MODULE,
204 .pf = NFPROTO_IPV4,
205 .hooknum = NF_INET_POST_ROUTING,
206 .priority = NF_IP_PRI_CONNTRACK_HELPER,
207 },
208 {
188 .hook = ipv4_confirm, 209 .hook = ipv4_confirm,
189 .owner = THIS_MODULE, 210 .owner = THIS_MODULE,
190 .pf = NFPROTO_IPV4, 211 .pf = NFPROTO_IPV4,
@@ -192,6 +213,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
192 .priority = NF_IP_PRI_CONNTRACK_CONFIRM, 213 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
193 }, 214 },
194 { 215 {
216 .hook = ipv4_helper,
217 .owner = THIS_MODULE,
218 .pf = NFPROTO_IPV4,
219 .hooknum = NF_INET_LOCAL_IN,
220 .priority = NF_IP_PRI_CONNTRACK_HELPER,
221 },
222 {
195 .hook = ipv4_confirm, 223 .hook = ipv4_confirm,
196 .owner = THIS_MODULE, 224 .owner = THIS_MODULE,
197 .pf = NFPROTO_IPV4, 225 .pf = NFPROTO_IPV4,
@@ -207,35 +235,30 @@ static int log_invalid_proto_max = 255;
207static ctl_table ip_ct_sysctl_table[] = { 235static ctl_table ip_ct_sysctl_table[] = {
208 { 236 {
209 .procname = "ip_conntrack_max", 237 .procname = "ip_conntrack_max",
210 .data = &nf_conntrack_max,
211 .maxlen = sizeof(int), 238 .maxlen = sizeof(int),
212 .mode = 0644, 239 .mode = 0644,
213 .proc_handler = proc_dointvec, 240 .proc_handler = proc_dointvec,
214 }, 241 },
215 { 242 {
216 .procname = "ip_conntrack_count", 243 .procname = "ip_conntrack_count",
217 .data = &init_net.ct.count,
218 .maxlen = sizeof(int), 244 .maxlen = sizeof(int),
219 .mode = 0444, 245 .mode = 0444,
220 .proc_handler = proc_dointvec, 246 .proc_handler = proc_dointvec,
221 }, 247 },
222 { 248 {
223 .procname = "ip_conntrack_buckets", 249 .procname = "ip_conntrack_buckets",
224 .data = &init_net.ct.htable_size,
225 .maxlen = sizeof(unsigned int), 250 .maxlen = sizeof(unsigned int),
226 .mode = 0444, 251 .mode = 0444,
227 .proc_handler = proc_dointvec, 252 .proc_handler = proc_dointvec,
228 }, 253 },
229 { 254 {
230 .procname = "ip_conntrack_checksum", 255 .procname = "ip_conntrack_checksum",
231 .data = &init_net.ct.sysctl_checksum,
232 .maxlen = sizeof(int), 256 .maxlen = sizeof(int),
233 .mode = 0644, 257 .mode = 0644,
234 .proc_handler = proc_dointvec, 258 .proc_handler = proc_dointvec,
235 }, 259 },
236 { 260 {
237 .procname = "ip_conntrack_log_invalid", 261 .procname = "ip_conntrack_log_invalid",
238 .data = &init_net.ct.sysctl_log_invalid,
239 .maxlen = sizeof(unsigned int), 262 .maxlen = sizeof(unsigned int),
240 .mode = 0644, 263 .mode = 0644,
241 .proc_handler = proc_dointvec_minmax, 264 .proc_handler = proc_dointvec_minmax,
@@ -351,6 +374,25 @@ static struct nf_sockopt_ops so_getorigdst = {
351 .owner = THIS_MODULE, 374 .owner = THIS_MODULE,
352}; 375};
353 376
377static int ipv4_init_net(struct net *net)
378{
379#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
380 struct nf_ip_net *in = &net->ct.nf_ct_proto;
381 in->ctl_table = kmemdup(ip_ct_sysctl_table,
382 sizeof(ip_ct_sysctl_table),
383 GFP_KERNEL);
384 if (!in->ctl_table)
385 return -ENOMEM;
386
387 in->ctl_table[0].data = &nf_conntrack_max;
388 in->ctl_table[1].data = &net->ct.count;
389 in->ctl_table[2].data = &net->ct.htable_size;
390 in->ctl_table[3].data = &net->ct.sysctl_checksum;
391 in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
392#endif
393 return 0;
394}
395
354struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { 396struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
355 .l3proto = PF_INET, 397 .l3proto = PF_INET,
356 .name = "ipv4", 398 .name = "ipv4",
@@ -366,8 +408,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
366#endif 408#endif
367#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 409#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
368 .ctl_table_path = "net/ipv4/netfilter", 410 .ctl_table_path = "net/ipv4/netfilter",
369 .ctl_table = ip_ct_sysctl_table,
370#endif 411#endif
412 .init_net = ipv4_init_net,
371 .me = THIS_MODULE, 413 .me = THIS_MODULE,
372}; 414};
373 415
@@ -378,6 +420,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
378MODULE_ALIAS("ip_conntrack"); 420MODULE_ALIAS("ip_conntrack");
379MODULE_LICENSE("GPL"); 421MODULE_LICENSE("GPL");
380 422
423static int ipv4_net_init(struct net *net)
424{
425 int ret = 0;
426
427 ret = nf_conntrack_l4proto_register(net,
428 &nf_conntrack_l4proto_tcp4);
429 if (ret < 0) {
430 pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
431 goto out_tcp;
432 }
433 ret = nf_conntrack_l4proto_register(net,
434 &nf_conntrack_l4proto_udp4);
435 if (ret < 0) {
436 pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
437 goto out_udp;
438 }
439 ret = nf_conntrack_l4proto_register(net,
440 &nf_conntrack_l4proto_icmp);
441 if (ret < 0) {
442 pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
443 goto out_icmp;
444 }
445 ret = nf_conntrack_l3proto_register(net,
446 &nf_conntrack_l3proto_ipv4);
447 if (ret < 0) {
448 pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
449 goto out_ipv4;
450 }
451 return 0;
452out_ipv4:
453 nf_conntrack_l4proto_unregister(net,
454 &nf_conntrack_l4proto_icmp);
455out_icmp:
456 nf_conntrack_l4proto_unregister(net,
457 &nf_conntrack_l4proto_udp4);
458out_udp:
459 nf_conntrack_l4proto_unregister(net,
460 &nf_conntrack_l4proto_tcp4);
461out_tcp:
462 return ret;
463}
464
465static void ipv4_net_exit(struct net *net)
466{
467 nf_conntrack_l3proto_unregister(net,
468 &nf_conntrack_l3proto_ipv4);
469 nf_conntrack_l4proto_unregister(net,
470 &nf_conntrack_l4proto_icmp);
471 nf_conntrack_l4proto_unregister(net,
472 &nf_conntrack_l4proto_udp4);
473 nf_conntrack_l4proto_unregister(net,
474 &nf_conntrack_l4proto_tcp4);
475}
476
477static struct pernet_operations ipv4_net_ops = {
478 .init = ipv4_net_init,
479 .exit = ipv4_net_exit,
480};
481
381static int __init nf_conntrack_l3proto_ipv4_init(void) 482static int __init nf_conntrack_l3proto_ipv4_init(void)
382{ 483{
383 int ret = 0; 484 int ret = 0;
@@ -391,35 +492,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
391 return ret; 492 return ret;
392 } 493 }
393 494
394 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); 495 ret = register_pernet_subsys(&ipv4_net_ops);
395 if (ret < 0) { 496 if (ret < 0) {
396 pr_err("nf_conntrack_ipv4: can't register tcp.\n"); 497 pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
397 goto cleanup_sockopt; 498 goto cleanup_sockopt;
398 } 499 }
399 500
400 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
401 if (ret < 0) {
402 pr_err("nf_conntrack_ipv4: can't register udp.\n");
403 goto cleanup_tcp;
404 }
405
406 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
407 if (ret < 0) {
408 pr_err("nf_conntrack_ipv4: can't register icmp.\n");
409 goto cleanup_udp;
410 }
411
412 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
413 if (ret < 0) {
414 pr_err("nf_conntrack_ipv4: can't register ipv4\n");
415 goto cleanup_icmp;
416 }
417
418 ret = nf_register_hooks(ipv4_conntrack_ops, 501 ret = nf_register_hooks(ipv4_conntrack_ops,
419 ARRAY_SIZE(ipv4_conntrack_ops)); 502 ARRAY_SIZE(ipv4_conntrack_ops));
420 if (ret < 0) { 503 if (ret < 0) {
421 pr_err("nf_conntrack_ipv4: can't register hooks.\n"); 504 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
422 goto cleanup_ipv4; 505 goto cleanup_pernet;
423 } 506 }
424#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 507#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
425 ret = nf_conntrack_ipv4_compat_init(); 508 ret = nf_conntrack_ipv4_compat_init();
@@ -431,14 +514,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
431 cleanup_hooks: 514 cleanup_hooks:
432 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 515 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
433#endif 516#endif
434 cleanup_ipv4: 517 cleanup_pernet:
435 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 518 unregister_pernet_subsys(&ipv4_net_ops);
436 cleanup_icmp:
437 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
438 cleanup_udp:
439 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
440 cleanup_tcp:
441 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
442 cleanup_sockopt: 519 cleanup_sockopt:
443 nf_unregister_sockopt(&so_getorigdst); 520 nf_unregister_sockopt(&so_getorigdst);
444 return ret; 521 return ret;
@@ -451,10 +528,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
451 nf_conntrack_ipv4_compat_fini(); 528 nf_conntrack_ipv4_compat_fini();
452#endif 529#endif
453 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 530 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
454 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 531 unregister_pernet_subsys(&ipv4_net_ops);
455 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
456 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
457 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
458 nf_unregister_sockopt(&so_getorigdst); 532 nf_unregister_sockopt(&so_getorigdst);
459} 533}
460 534
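
The whole conversion of this file boils down to one pattern: protocol registration moves from a single global action at module load to pernet_operations, so every network namespace gets its own init/exit pass over the conntrack protocols. A minimal skeleton of that pattern (kernel API, not standalone; names hypothetical):

static int __net_init foo_net_init(struct net *net)
{
        /* register per-namespace protocol state here */
        return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
        /* undo foo_net_init() for this namespace */
}

static struct pernet_operations foo_net_ops = {
        .init = foo_net_init,
        .exit = foo_net_exit,
};

/* module init: err = register_pernet_subsys(&foo_net_ops); */
/* module exit: unregister_pernet_subsys(&foo_net_ops);     */
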
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 0847e373d33c..5241d997ab75 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -23,6 +23,11 @@
23 23
24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
25 25
26static inline struct nf_icmp_net *icmp_pernet(struct net *net)
27{
28 return &net->ct.nf_ct_proto.icmp;
29}
30
26static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 31static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
27 struct nf_conntrack_tuple *tuple) 32 struct nf_conntrack_tuple *tuple)
28{ 33{
@@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s,
77 82
78static unsigned int *icmp_get_timeouts(struct net *net) 83static unsigned int *icmp_get_timeouts(struct net *net)
79{ 84{
80 return &nf_ct_icmp_timeout; 85 return &icmp_pernet(net)->timeout;
81} 86}
82 87
83/* Returns verdict for packet, or -1 for invalid. */ 88/* Returns verdict for packet, or -1 for invalid. */
@@ -274,16 +279,18 @@ static int icmp_nlattr_tuple_size(void)
274#include <linux/netfilter/nfnetlink.h> 279#include <linux/netfilter/nfnetlink.h>
275#include <linux/netfilter/nfnetlink_cttimeout.h> 280#include <linux/netfilter/nfnetlink_cttimeout.h>
276 281
277static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) 282static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
283 struct net *net, void *data)
278{ 284{
279 unsigned int *timeout = data; 285 unsigned int *timeout = data;
286 struct nf_icmp_net *in = icmp_pernet(net);
280 287
281 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { 288 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
282 *timeout = 289 *timeout =
283 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; 290 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
284 } else { 291 } else {
285 /* Set default ICMP timeout. */ 292 /* Set default ICMP timeout. */
286 *timeout = nf_ct_icmp_timeout; 293 *timeout = in->timeout;
287 } 294 }
288 return 0; 295 return 0;
289} 296}
@@ -308,11 +315,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
308#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 315#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
309 316
310#ifdef CONFIG_SYSCTL 317#ifdef CONFIG_SYSCTL
311static struct ctl_table_header *icmp_sysctl_header;
312static struct ctl_table icmp_sysctl_table[] = { 318static struct ctl_table icmp_sysctl_table[] = {
313 { 319 {
314 .procname = "nf_conntrack_icmp_timeout", 320 .procname = "nf_conntrack_icmp_timeout",
315 .data = &nf_ct_icmp_timeout,
316 .maxlen = sizeof(unsigned int), 321 .maxlen = sizeof(unsigned int),
317 .mode = 0644, 322 .mode = 0644,
318 .proc_handler = proc_dointvec_jiffies, 323 .proc_handler = proc_dointvec_jiffies,
@@ -323,7 +328,6 @@ static struct ctl_table icmp_sysctl_table[] = {
323static struct ctl_table icmp_compat_sysctl_table[] = { 328static struct ctl_table icmp_compat_sysctl_table[] = {
324 { 329 {
325 .procname = "ip_conntrack_icmp_timeout", 330 .procname = "ip_conntrack_icmp_timeout",
326 .data = &nf_ct_icmp_timeout,
327 .maxlen = sizeof(unsigned int), 331 .maxlen = sizeof(unsigned int),
328 .mode = 0644, 332 .mode = 0644,
329 .proc_handler = proc_dointvec_jiffies, 333 .proc_handler = proc_dointvec_jiffies,
@@ -333,6 +337,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
333#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ 337#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
334#endif /* CONFIG_SYSCTL */ 338#endif /* CONFIG_SYSCTL */
335 339
340static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
341 struct nf_icmp_net *in)
342{
343#ifdef CONFIG_SYSCTL
344 pn->ctl_table = kmemdup(icmp_sysctl_table,
345 sizeof(icmp_sysctl_table),
346 GFP_KERNEL);
347 if (!pn->ctl_table)
348 return -ENOMEM;
349
350 pn->ctl_table[0].data = &in->timeout;
351#endif
352 return 0;
353}
354
355static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
356 struct nf_icmp_net *in)
357{
358#ifdef CONFIG_SYSCTL
359#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
360 pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
361 sizeof(icmp_compat_sysctl_table),
362 GFP_KERNEL);
363 if (!pn->ctl_compat_table)
364 return -ENOMEM;
365
366 pn->ctl_compat_table[0].data = &in->timeout;
367#endif
368#endif
369 return 0;
370}
371
372static int icmp_init_net(struct net *net, u_int16_t proto)
373{
374 int ret;
375 struct nf_icmp_net *in = icmp_pernet(net);
376 struct nf_proto_net *pn = &in->pn;
377
378 in->timeout = nf_ct_icmp_timeout;
379
380 ret = icmp_kmemdup_compat_sysctl_table(pn, in);
381 if (ret < 0)
382 return ret;
383
384 ret = icmp_kmemdup_sysctl_table(pn, in);
385 if (ret < 0)
386 nf_ct_kfree_compat_sysctl_table(pn);
387
388 return ret;
389}
390
391static struct nf_proto_net *icmp_get_net_proto(struct net *net)
392{
393 return &net->ct.nf_ct_proto.icmp.pn;
394}
395
336struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = 396struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
337{ 397{
338 .l3proto = PF_INET, 398 .l3proto = PF_INET,
@@ -362,11 +422,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
362 .nla_policy = icmp_timeout_nla_policy, 422 .nla_policy = icmp_timeout_nla_policy,
363 }, 423 },
364#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 424#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
365#ifdef CONFIG_SYSCTL 425 .init_net = icmp_init_net,
366 .ctl_table_header = &icmp_sysctl_header, 426 .get_net_proto = icmp_get_net_proto,
367 .ctl_table = icmp_sysctl_table,
368#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
369 .ctl_compat_table = icmp_compat_sysctl_table,
370#endif
371#endif
372}; 427};
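
The static .data pointers vanish from the sysctl tables above because a single template now serves all namespaces: each net kmemdup()s the template and repoints every row's .data at its own copy of the variable. The idiom in brief (kernel API, fragment only):

        pn->ctl_table = kmemdup(icmp_sysctl_table,
                                sizeof(icmp_sysctl_table), GFP_KERNEL);
        if (!pn->ctl_table)
                return -ENOMEM;

        /* row indices must track the template's layout */
        pn->ctl_table[0].data = &in->timeout;
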
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 9bb1b8a37a22..742815518b0f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
94 { 94 {
95 .hook = ipv4_conntrack_defrag, 95 .hook = ipv4_conntrack_defrag,
96 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
97 .pf = PF_INET, 97 .pf = NFPROTO_IPV4,
98 .hooknum = NF_INET_PRE_ROUTING, 98 .hooknum = NF_INET_PRE_ROUTING,
99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
100 }, 100 },
101 { 101 {
102 .hook = ipv4_conntrack_defrag, 102 .hook = ipv4_conntrack_defrag,
103 .owner = THIS_MODULE, 103 .owner = THIS_MODULE,
104 .pf = PF_INET, 104 .pf = NFPROTO_IPV4,
105 .hooknum = NF_INET_LOCAL_OUT, 105 .hooknum = NF_INET_LOCAL_OUT,
106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
107 }, 107 },
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 7b22382ff0e9..3c04d24e2976 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -13,10 +13,10 @@
13#include <linux/skbuff.h> 13#include <linux/skbuff.h>
14#include <linux/udp.h> 14#include <linux/udp.h>
15 15
16#include <net/netfilter/nf_nat_helper.h>
17#include <net/netfilter/nf_nat_rule.h>
18#include <net/netfilter/nf_conntrack_helper.h> 16#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_expect.h> 17#include <net/netfilter/nf_conntrack_expect.h>
18#include <net/netfilter/nf_nat_helper.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <linux/netfilter/nf_conntrack_amanda.h> 20#include <linux/netfilter/nf_conntrack_amanda.h>
21 21
22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); 22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index abb52adf5acd..44b082fd48ab 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
691 .expectfn = nf_nat_follow_master, 691 .expectfn = nf_nat_follow_master,
692}; 692};
693 693
694static struct nfq_ct_nat_hook nfq_ct_nat = {
695 .seq_adjust = nf_nat_tcp_seq_adjust,
696};
697
694static int __init nf_nat_init(void) 698static int __init nf_nat_init(void)
695{ 699{
696 size_t i; 700 size_t i;
@@ -731,6 +735,7 @@ static int __init nf_nat_init(void)
731 nfnetlink_parse_nat_setup); 735 nfnetlink_parse_nat_setup);
732 BUG_ON(nf_ct_nat_offset != NULL); 736 BUG_ON(nf_ct_nat_offset != NULL);
733 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); 737 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
738 RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
734 return 0; 739 return 0;
735 740
736 cleanup_extend: 741 cleanup_extend:
@@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void)
747 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); 752 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
748 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); 753 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
749 RCU_INIT_POINTER(nf_ct_nat_offset, NULL); 754 RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
755 RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
750 synchronize_net(); 756 synchronize_net();
751} 757}
752 758
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index cad29c121318..c6784a18c1c4 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -95,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
95 unsigned char **data, 95 unsigned char **data,
96 TransportAddress *taddr, int count) 96 TransportAddress *taddr, int count)
97{ 97{
98 const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 98 const struct nf_ct_h323_master *info = nfct_help_data(ct);
99 int dir = CTINFO2DIR(ctinfo); 99 int dir = CTINFO2DIR(ctinfo);
100 int i; 100 int i;
101 __be16 port; 101 __be16 port;
@@ -178,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
178 struct nf_conntrack_expect *rtp_exp, 178 struct nf_conntrack_expect *rtp_exp,
179 struct nf_conntrack_expect *rtcp_exp) 179 struct nf_conntrack_expect *rtcp_exp)
180{ 180{
181 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 181 struct nf_ct_h323_master *info = nfct_help_data(ct);
182 int dir = CTINFO2DIR(ctinfo); 182 int dir = CTINFO2DIR(ctinfo);
183 int i; 183 int i;
184 u_int16_t nated_port; 184 u_int16_t nated_port;
@@ -330,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
330 TransportAddress *taddr, __be16 port, 330 TransportAddress *taddr, __be16 port,
331 struct nf_conntrack_expect *exp) 331 struct nf_conntrack_expect *exp)
332{ 332{
333 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 333 struct nf_ct_h323_master *info = nfct_help_data(ct);
334 int dir = CTINFO2DIR(ctinfo); 334 int dir = CTINFO2DIR(ctinfo);
335 u_int16_t nated_port = ntohs(port); 335 u_int16_t nated_port = ntohs(port);
336 336
@@ -419,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
419 unsigned char **data, TransportAddress *taddr, int idx, 419 unsigned char **data, TransportAddress *taddr, int idx,
420 __be16 port, struct nf_conntrack_expect *exp) 420 __be16 port, struct nf_conntrack_expect *exp)
421{ 421{
422 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 422 struct nf_ct_h323_master *info = nfct_help_data(ct);
423 int dir = CTINFO2DIR(ctinfo); 423 int dir = CTINFO2DIR(ctinfo);
424 u_int16_t nated_port = ntohs(port); 424 u_int16_t nated_port = ntohs(port);
425 union nf_inet_addr addr; 425 union nf_inet_addr addr;
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index af65958f6308..2e59ad0b90ca 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
153} 153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); 154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155 155
156void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
157 u32 ctinfo, int off)
158{
159 const struct tcphdr *th;
160
161 if (nf_ct_protonum(ct) != IPPROTO_TCP)
162 return;
163
164 th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
165 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
166}
167EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
168
156static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, 169static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen) 170 int datalen, __sum16 *check, int oldlen)
158{ 171{
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index c273d58980ae..388140881ebe 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
49 const struct nf_nat_pptp *nat_pptp_info; 49 const struct nf_nat_pptp *nat_pptp_info;
50 struct nf_nat_ipv4_range range; 50 struct nf_nat_ipv4_range range;
51 51
52 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; 52 ct_pptp_info = nfct_help_data(master);
53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
54 54
55 /* And here goes the grand finale of corrosion... */ 55 /* And here goes the grand finale of corrosion... */
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
123 __be16 new_callid; 123 __be16 new_callid;
124 unsigned int cid_off; 124 unsigned int cid_off;
125 125
126 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; 126 ct_pptp_info = nfct_help_data(ct);
127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
128 128
129 new_callid = ct_pptp_info->pns_call_id; 129 new_callid = ct_pptp_info->pns_call_id;
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
192 struct nf_ct_pptp_master *ct_pptp_info; 192 struct nf_ct_pptp_master *ct_pptp_info;
193 struct nf_nat_pptp *nat_pptp_info; 193 struct nf_nat_pptp *nat_pptp_info;
194 194
195 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; 195 ct_pptp_info = nfct_help_data(ct);
196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
197 197
198 /* save original PAC call ID in nat_info */ 198 /* save original PAC call ID in nat_info */
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 746edec8b86e..bac712293fd6 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
405 405
406 ptr = *octets; 406 ptr = *octets;
407 while (ctx->pointer < eoc) { 407 while (ctx->pointer < eoc) {
408 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { 408 if (!asn1_octet_decode(ctx, ptr++)) {
409 kfree(*octets); 409 kfree(*octets);
410 *octets = NULL; 410 *octets = NULL;
411 return 0; 411 return 0;
@@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
759 } 759 }
760 break; 760 break;
761 case SNMP_OBJECTID: 761 case SNMP_OBJECTID:
762 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { 762 if (!asn1_oid_decode(ctx, end, &lp, &len)) {
763 kfree(id); 763 kfree(id);
764 return 0; 764 return 0;
765 } 765 }
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index a2901bf829c0..9dbb8d284f99 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -8,10 +8,10 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/udp.h> 9#include <linux/udp.h>
10 10
11#include <net/netfilter/nf_nat_helper.h>
12#include <net/netfilter/nf_nat_rule.h>
13#include <net/netfilter/nf_conntrack_helper.h> 11#include <net/netfilter/nf_conntrack_helper.h>
14#include <net/netfilter/nf_conntrack_expect.h> 12#include <net/netfilter/nf_conntrack_expect.h>
13#include <net/netfilter/nf_nat_helper.h>
14#include <net/netfilter/nf_nat_rule.h>
15#include <linux/netfilter/nf_conntrack_tftp.h> 15#include <linux/netfilter/nf_conntrack_tftp.h>
16 16
17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); 17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2c00e8bf684d..340fcf29a966 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -371,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info)
371 break; 371 break;
372 case ICMP_DEST_UNREACH: 372 case ICMP_DEST_UNREACH:
373 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 373 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
374 ipv4_sk_update_pmtu(skb, sk, info);
374 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { 375 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
375 err = EMSGSIZE; 376 err = EMSGSIZE;
376 harderr = 1; 377 harderr = 1;
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01cd0b2..8918eff1426d 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
36 36
37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
38{ 38{
39 int hash = protocol & (MAX_INET_PROTOS - 1); 39 return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
40
41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
42 NULL, prot) ? 0 : -1; 40 NULL, prot) ? 0 : -1;
43} 41}
44EXPORT_SYMBOL(inet_add_protocol); 42EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
49 47
50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 48int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
51{ 49{
52 int ret, hash = protocol & (MAX_INET_PROTOS - 1); 50 int ret;
53 51
54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], 52 ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
55 prot, NULL) == prot) ? 0 : -1; 53 prot, NULL) == prot) ? 0 : -1;
56 54
57 synchronize_net(); 55 synchronize_net();
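
With the protocol number typed as an 8-bit value, inet_protos[] can hold all 256 slots and the old hash mask disappears; registration stays a lock-free one-shot compare-and-swap from NULL. A self-contained userspace analogue using C11 atomics (the handler type is hypothetical):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

typedef int (*handler_t)(int pkt);

static _Atomic(handler_t) protos[256];  /* one slot per u8 protocol */

static int add_protocol(handler_t h, unsigned char protocol)
{
        handler_t expected = NULL;

        /* claims the slot only if empty, like cmpxchg(..., NULL, prot) */
        return atomic_compare_exchange_strong(&protos[protocol],
                                              &expected, h) ? 0 : -1;
}

static int tcp_rcv(int pkt) { return pkt; }

int main(void)
{
        printf("%d\n", add_protocol(tcp_rcv, 6));   /* 0: slot claimed */
        printf("%d\n", add_protocol(tcp_rcv, 6));   /* -1: already taken */
        return 0;
}
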
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4032b818f3e4..659ddfb10947 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -216,6 +216,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
216 int err = 0; 216 int err = 0;
217 int harderr = 0; 217 int harderr = 0;
218 218
219 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
220 ipv4_sk_update_pmtu(skb, sk, info);
221
219 /* Report error on raw socket, if: 222 /* Report error on raw socket, if:
220 1. User requested ip_recverr. 223 1. User requested ip_recverr.
221 2. Socket is connected (otherwise the error indication 224 2. Socket is connected (otherwise the error indication
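
ping_err() above and raw_err() here gain the same call for the same reason: as PMTU state migrates out of the shared routing cache, each socket-level error handler must hand ICMP_FRAG_NEEDED to its own cached route. The shared shape (kernel API, fragment only):

        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
                ipv4_sk_update_pmtu(skb, sk, info);     /* info = new MTU */
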
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 98b30d08efe9..95bfa1ba5b28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -158,40 +158,13 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158 158
159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160{ 160{
161 struct rtable *rt = (struct rtable *) dst; 161 WARN_ON(1);
162 struct inet_peer *peer; 162 return NULL;
163 u32 *p = NULL;
164
165 if (!rt->peer)
166 rt_bind_peer(rt, rt->rt_dst, 1);
167
168 peer = rt->peer;
169 if (peer) {
170 u32 *old_p = __DST_METRICS_PTR(old);
171 unsigned long prev, new;
172
173 p = peer->metrics;
174 if (inet_metrics_new(peer))
175 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177 new = (unsigned long) p;
178 prev = cmpxchg(&dst->_metrics, old, new);
179
180 if (prev != old) {
181 p = __DST_METRICS_PTR(prev);
182 if (prev & DST_METRICS_READ_ONLY)
183 p = NULL;
184 } else {
185 if (rt->fi) {
186 fib_info_put(rt->fi);
187 rt->fi = NULL;
188 }
189 }
190 }
191 return p;
192} 163}
193 164
194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); 165static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
166 struct sk_buff *skb,
167 const void *daddr);
195 168
196static struct dst_ops ipv4_dst_ops = { 169static struct dst_ops ipv4_dst_ops = {
197 .family = AF_INET, 170 .family = AF_INET,
@@ -421,29 +394,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
421 "HHUptod\tSpecDst"); 394 "HHUptod\tSpecDst");
422 else { 395 else {
423 struct rtable *r = v; 396 struct rtable *r = v;
424 struct neighbour *n; 397 int len;
425 int len, HHUptod;
426
427 rcu_read_lock();
428 n = dst_get_neighbour_noref(&r->dst);
429 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 rcu_read_unlock();
431 398
432 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" 399 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 400 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434 r->dst.dev ? r->dst.dev->name : "*", 401 r->dst.dev ? r->dst.dev->name : "*",
435 (__force u32)r->rt_dst, 402 (__force u32)r->rt_dst,
436 (__force u32)r->rt_gateway, 403 (__force u32)r->rt_gateway,
437 r->rt_flags, atomic_read(&r->dst.__refcnt), 404 r->rt_flags, atomic_read(&r->dst.__refcnt),
438 r->dst.__use, 0, (__force u32)r->rt_src, 405 r->dst.__use, 0, (__force u32)r->rt_src,
439 dst_metric_advmss(&r->dst) + 40, 406 dst_metric_advmss(&r->dst) + 40,
440 dst_metric(&r->dst, RTAX_WINDOW), 407 dst_metric(&r->dst, RTAX_WINDOW), 0,
441 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 408 r->rt_key_tos,
442 dst_metric(&r->dst, RTAX_RTTVAR)), 409 -1, 0, 0, &len);
443 r->rt_key_tos,
444 -1,
445 HHUptod,
446 r->rt_spec_dst, &len);
447 410
448 seq_printf(seq, "%*s\n", 127 - len, ""); 411 seq_printf(seq, "%*s\n", 127 - len, "");
449 } 412 }
@@ -680,7 +643,7 @@ static inline int rt_fast_clean(struct rtable *rth)
680static inline int rt_valuable(struct rtable *rth) 643static inline int rt_valuable(struct rtable *rth)
681{ 644{
682 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 645 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683 (rth->peer && rth->peer->pmtu_expires); 646 rth->dst.expires;
684} 647}
685 648
686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 649static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -873,34 +836,22 @@ static void rt_check_expire(void)
873 while ((rth = rcu_dereference_protected(*rthp, 836 while ((rth = rcu_dereference_protected(*rthp,
874 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { 837 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875 prefetch(rth->dst.rt_next); 838 prefetch(rth->dst.rt_next);
876 if (rt_is_expired(rth)) { 839 if (rt_is_expired(rth) ||
840 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
877 *rthp = rth->dst.rt_next; 841 *rthp = rth->dst.rt_next;
878 rt_free(rth); 842 rt_free(rth);
879 continue; 843 continue;
880 } 844 }
881 if (rth->dst.expires) {
882 /* Entry is expired even if it is in use */
883 if (time_before_eq(jiffies, rth->dst.expires)) {
884nofree:
885 tmo >>= 1;
886 rthp = &rth->dst.rt_next;
887 /*
888 * We only count entries on
889 * a chain with equal hash inputs once
890 * so that entries for different QOS
891 * levels, and other non-hash input
892 * attributes don't unfairly skew
893 * the length computation
894 */
895 length += has_noalias(rt_hash_table[i].chain, rth);
896 continue;
897 }
898 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899 goto nofree;
900 845
901 /* Cleanup aged off entries. */ 846 /* We only count entries on a chain with equal
902 *rthp = rth->dst.rt_next; 847 * hash inputs once so that entries for
903 rt_free(rth); 848 * different QOS levels, and other non-hash
849 * input attributes don't unfairly skew the
850 * length computation
851 */
852 tmo >>= 1;
853 rthp = &rth->dst.rt_next;
854 length += has_noalias(rt_hash_table[i].chain, rth);
904 } 855 }
905 spin_unlock_bh(rt_hash_lock_addr(i)); 856 spin_unlock_bh(rt_hash_lock_addr(i));
906 sum += length; 857 sum += length;
@@ -938,7 +889,6 @@ static void rt_cache_invalidate(struct net *net)
938 889
939 get_random_bytes(&shuffle, sizeof(shuffle)); 890 get_random_bytes(&shuffle, sizeof(shuffle));
940 atomic_add(shuffle + 1U, &net->ipv4.rt_genid); 891 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941 inetpeer_invalidate_tree(AF_INET);
942} 892}
943 893
944/* 894/*
@@ -1111,20 +1061,20 @@ static int slow_chain_length(const struct rtable *head)
1111 return length >> FRACT_BITS; 1061 return length >> FRACT_BITS;
1112} 1062}
1113 1063
1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) 1064static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1065 struct sk_buff *skb,
1066 const void *daddr)
1115{ 1067{
1116 static const __be32 inaddr_any = 0;
1117 struct net_device *dev = dst->dev; 1068 struct net_device *dev = dst->dev;
1118 const __be32 *pkey = daddr; 1069 const __be32 *pkey = daddr;
1119 const struct rtable *rt; 1070 const struct rtable *rt;
1120 struct neighbour *n; 1071 struct neighbour *n;
1121 1072
1122 rt = (const struct rtable *) dst; 1073 rt = (const struct rtable *) dst;
1123 1074 if (rt->rt_gateway)
1124 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125 pkey = &inaddr_any;
1126 else if (rt->rt_gateway)
1127 pkey = (const __be32 *) &rt->rt_gateway; 1075 pkey = (const __be32 *) &rt->rt_gateway;
1076 else if (skb)
1077 pkey = &ip_hdr(skb)->daddr;
1128 1078
1129 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); 1079 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130 if (n) 1080 if (n)
@@ -1132,16 +1082,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
1132 return neigh_create(&arp_tbl, pkey, dev); 1082 return neigh_create(&arp_tbl, pkey, dev);
1133} 1083}
1134 1084
1135static int rt_bind_neighbour(struct rtable *rt)
1136{
1137 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138 if (IS_ERR(n))
1139 return PTR_ERR(n);
1140 dst_set_neighbour(&rt->dst, n);
1141
1142 return 0;
1143}
1144
1145static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, 1085static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1146 struct sk_buff *skb, int ifindex) 1086 struct sk_buff *skb, int ifindex)
1147{ 1087{
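
Two related changes land here: ipv4_neigh_lookup() grows an skb argument so gatewayless routes can key the neighbour on the packet's own daddr (the INADDR_ANY aliasing for loopback and point-to-point devices is dropped), and rt_bind_neighbour() disappears because neighbours are no longer pinned to cached routes at insertion time but resolved on demand at transmit. A hypothetical transmit-side caller, assuming the dst_neigh_lookup_skb() helper from this same series:

/* Hypothetical sketch: resolve the next hop on demand through the
 * dst operation, then drop the lookup reference once the skb is
 * queued. */
struct neighbour *n = dst_neigh_lookup_skb(dst, skb);

if (!IS_ERR(n)) {
	/* hand the skb to n here */
	neigh_release(n);
}
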
@@ -1150,7 +1090,6 @@ static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1150 unsigned long now; 1090 unsigned long now;
1151 u32 min_score; 1091 u32 min_score;
1152 int chain_length; 1092 int chain_length;
1153 int attempts = !in_softirq();
1154 1093
1155restart: 1094restart:
1156 chain_length = 0; 1095 chain_length = 0;
@@ -1159,7 +1098,7 @@ restart:
1159 candp = NULL; 1098 candp = NULL;
1160 now = jiffies; 1099 now = jiffies;
1161 1100
1162 if (!rt_caching(dev_net(rt->dst.dev))) { 1101 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1163 /* 1102 /*
1164 * If we're not caching, just tell the caller we 1103 * If we're not caching, just tell the caller we
1165 * were successful and don't touch the route. The 1104 * were successful and don't touch the route. The
@@ -1177,15 +1116,6 @@ restart:
1177 */ 1116 */
1178 1117
1179 rt->dst.flags |= DST_NOCACHE; 1118 rt->dst.flags |= DST_NOCACHE;
1180 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181 int err = rt_bind_neighbour(rt);
1182 if (err) {
1183 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1184 ip_rt_put(rt);
1185 return ERR_PTR(err);
1186 }
1187 }
1188
1189 goto skip_hashing; 1119 goto skip_hashing;
1190 } 1120 }
1191 1121
@@ -1268,40 +1198,6 @@ restart:
1268 } 1198 }
1269 } 1199 }
1270 1200
1271 /* Try to bind route to arp only if it is output
1272 route or unicast forwarding path.
1273 */
1274 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1275 int err = rt_bind_neighbour(rt);
1276 if (err) {
1277 spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279 if (err != -ENOBUFS) {
1280 rt_drop(rt);
1281 return ERR_PTR(err);
1282 }
1283
1284 /* Neighbour tables are full and nothing
1285 can be released. Try to shrink route cache,
1286 it is most likely it holds some neighbour records.
1287 */
1288 if (attempts-- > 0) {
1289 int saved_elasticity = ip_rt_gc_elasticity;
1290 int saved_int = ip_rt_gc_min_interval;
1291 ip_rt_gc_elasticity = 1;
1292 ip_rt_gc_min_interval = 0;
1293 rt_garbage_collect(&ipv4_dst_ops);
1294 ip_rt_gc_min_interval = saved_int;
1295 ip_rt_gc_elasticity = saved_elasticity;
1296 goto restart;
1297 }
1298
1299 net_warn_ratelimited("Neighbour table overflow\n");
1300 rt_drop(rt);
1301 return ERR_PTR(-ENOBUFS);
1302 }
1303 }
1304
1305 rt->dst.rt_next = rt_hash_table[hash].chain; 1201 rt->dst.rt_next = rt_hash_table[hash].chain;
1306 1202
1307 /* 1203 /*
@@ -1319,25 +1215,6 @@ skip_hashing:
1319 return rt; 1215 return rt;
1320} 1216}
1321 1217
1322static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
1324static u32 rt_peer_genid(void)
1325{
1326 return atomic_read(&__rt_peer_genid);
1327}
1328
1329void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330{
1331 struct inet_peer *peer;
1332
1333 peer = inet_getpeer_v4(daddr, create);
1334
1335 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1336 inet_putpeer(peer);
1337 else
1338 rt->rt_peer_genid = rt_peer_genid();
1339}
1340
1341/* 1218/*
1342 * Peer allocation may fail only in serious out-of-memory conditions. However 1219 * Peer allocation may fail only in serious out-of-memory conditions. However
1343 * we still can generate some output. 1220 * we still can generate some output.
@@ -1360,21 +1237,15 @@ static void ip_select_fb_ident(struct iphdr *iph)
1360 1237
1361void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 1238void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1362{ 1239{
1363 struct rtable *rt = (struct rtable *) dst; 1240 struct net *net = dev_net(dst->dev);
1364 1241 struct inet_peer *peer;
1365 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1366 if (rt->peer == NULL)
1367 rt_bind_peer(rt, rt->rt_dst, 1);
1368 1242
1369 /* If peer is attached to destination, it is never detached, 1243 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1370 so that we need not to grab a lock to dereference it. 1244 if (peer) {
1371 */ 1245 iph->id = htons(inet_getid(peer, more));
1372 if (rt->peer) { 1246 inet_putpeer(peer);
1373 iph->id = htons(inet_getid(rt->peer, more)); 1247 return;
1374 return; 1248 }
1375 }
1376 } else if (!rt)
1377 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1378 1249
1379 ip_select_fb_ident(iph); 1250 ip_select_fb_ident(iph);
1380} 1251}
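
IP ID selection stops depending on a peer bound to the route (and on the DST_NOPEER special case): every call now performs a plain lookup in the namespace's peer base keyed by the destination address, falling back to the legacy generator when no peer can be allocated. The reference discipline, restated as a sketch:

/* Sketch of the calling discipline above: create the peer on demand,
 * consume its id counter, and always drop the lookup reference; only
 * an allocation failure reaches the shared fallback generator. */
peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1 /* create */);
if (peer) {
	iph->id = htons(inet_getid(peer, more));
	inet_putpeer(peer);		/* lookup took a reference */
} else {
	ip_select_fb_ident(iph);	/* no memory: degrade gracefully */
}
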
@@ -1400,32 +1271,6 @@ static void rt_del(unsigned int hash, struct rtable *rt)
1400 spin_unlock_bh(rt_hash_lock_addr(hash)); 1271 spin_unlock_bh(rt_hash_lock_addr(hash));
1401} 1272}
1402 1273
1403static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1404{
1405 struct rtable *rt = (struct rtable *) dst;
1406 __be32 orig_gw = rt->rt_gateway;
1407 struct neighbour *n, *old_n;
1408
1409 dst_confirm(&rt->dst);
1410
1411 rt->rt_gateway = peer->redirect_learned.a4;
1412
1413 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1414 if (IS_ERR(n)) {
1415 rt->rt_gateway = orig_gw;
1416 return;
1417 }
1418 old_n = xchg(&rt->dst._neighbour, n);
1419 if (old_n)
1420 neigh_release(old_n);
1421 if (!(n->nud_state & NUD_VALID)) {
1422 neigh_event_send(n, NULL);
1423 } else {
1424 rt->rt_flags |= RTCF_REDIRECTED;
1425 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426 }
1427}
1428
1429/* called in rcu_read_lock() section */ 1274/* called in rcu_read_lock() section */
1430void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1275void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431 __be32 saddr, struct net_device *dev) 1276 __be32 saddr, struct net_device *dev)
@@ -1434,7 +1279,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1434 struct in_device *in_dev = __in_dev_get_rcu(dev); 1279 struct in_device *in_dev = __in_dev_get_rcu(dev);
1435 __be32 skeys[2] = { saddr, 0 }; 1280 __be32 skeys[2] = { saddr, 0 };
1436 int ikeys[2] = { dev->ifindex, 0 }; 1281 int ikeys[2] = { dev->ifindex, 0 };
1437 struct inet_peer *peer;
1438 struct net *net; 1282 struct net *net;
1439 1283
1440 if (!in_dev) 1284 if (!in_dev)
@@ -1467,6 +1311,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1467 rthp = &rt_hash_table[hash].chain; 1311 rthp = &rt_hash_table[hash].chain;
1468 1312
1469 while ((rt = rcu_dereference(*rthp)) != NULL) { 1313 while ((rt = rcu_dereference(*rthp)) != NULL) {
1314 struct neighbour *n;
1315
1470 rthp = &rt->dst.rt_next; 1316 rthp = &rt->dst.rt_next;
1471 1317
1472 if (rt->rt_key_dst != daddr || 1318 if (rt->rt_key_dst != daddr ||
@@ -1480,16 +1326,16 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1480 rt->rt_gateway != old_gw) 1326 rt->rt_gateway != old_gw)
1481 continue; 1327 continue;
1482 1328
1483 if (!rt->peer) 1329 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1484 rt_bind_peer(rt, rt->rt_dst, 1); 1330 if (n) {
1485 1331 if (!(n->nud_state & NUD_VALID)) {
1486 peer = rt->peer; 1332 neigh_event_send(n, NULL);
1487 if (peer) { 1333 } else {
1488 if (peer->redirect_learned.a4 != new_gw) { 1334 rt->rt_gateway = new_gw;
1489 peer->redirect_learned.a4 = new_gw; 1335 rt->rt_flags |= RTCF_REDIRECTED;
1490 atomic_inc(&__rt_peer_genid); 1336 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1491 } 1337 }
1492 check_peer_redir(&rt->dst, peer); 1338 neigh_release(n);
1493 } 1339 }
1494 } 1340 }
1495 } 1341 }
@@ -1507,23 +1353,6 @@ reject_redirect:
1507 ; 1353 ;
1508} 1354}
1509 1355
1510static bool peer_pmtu_expired(struct inet_peer *peer)
1511{
1512 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1513
1514 return orig &&
1515 time_after_eq(jiffies, orig) &&
1516 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1517}
1518
1519static bool peer_pmtu_cleaned(struct inet_peer *peer)
1520{
1521 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522
1523 return orig &&
1524 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525}
1526
1527static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1356static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1528{ 1357{
1529 struct rtable *rt = (struct rtable *)dst; 1358 struct rtable *rt = (struct rtable *)dst;
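
peer_pmtu_expired() and peer_pmtu_cleaned() go away with the peer-held PMTU state, but the idiom they used is worth keeping in mind: a cmpxchg() against the observed timestamp lets exactly one CPU claim an expiry event even when several notice it concurrently. As a standalone sketch:

/* One-shot claim pattern of the removed helpers (sketch): returns
 * true for exactly one caller per armed timestamp, because only the
 * winning cmpxchg() sees the unmodified value. */
static bool claim_event_once(unsigned long *stamp)
{
	unsigned long orig = ACCESS_ONCE(*stamp);

	return orig && cmpxchg(stamp, orig, 0) == orig;
}
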
@@ -1533,14 +1362,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1533 if (dst->obsolete > 0) { 1362 if (dst->obsolete > 0) {
1534 ip_rt_put(rt); 1363 ip_rt_put(rt);
1535 ret = NULL; 1364 ret = NULL;
1536 } else if (rt->rt_flags & RTCF_REDIRECTED) { 1365 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1366 rt->dst.expires) {
1537 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 1367 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1538 rt->rt_oif, 1368 rt->rt_oif,
1539 rt_genid(dev_net(dst->dev))); 1369 rt_genid(dev_net(dst->dev)));
1540 rt_del(hash, rt); 1370 rt_del(hash, rt);
1541 ret = NULL; 1371 ret = NULL;
1542 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1543 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1544 } 1372 }
1545 } 1373 }
1546 return ret; 1374 return ret;
@@ -1567,6 +1395,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1567 struct rtable *rt = skb_rtable(skb); 1395 struct rtable *rt = skb_rtable(skb);
1568 struct in_device *in_dev; 1396 struct in_device *in_dev;
1569 struct inet_peer *peer; 1397 struct inet_peer *peer;
1398 struct net *net;
1570 int log_martians; 1399 int log_martians;
1571 1400
1572 rcu_read_lock(); 1401 rcu_read_lock();
@@ -1578,9 +1407,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1578 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1407 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579 rcu_read_unlock(); 1408 rcu_read_unlock();
1580 1409
1581 if (!rt->peer) 1410 net = dev_net(rt->dst.dev);
1582 rt_bind_peer(rt, rt->rt_dst, 1); 1411 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1583 peer = rt->peer;
1584 if (!peer) { 1412 if (!peer) {
1585 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1413 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1586 return; 1414 return;
@@ -1597,7 +1425,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1597 */ 1425 */
1598 if (peer->rate_tokens >= ip_rt_redirect_number) { 1426 if (peer->rate_tokens >= ip_rt_redirect_number) {
1599 peer->rate_last = jiffies; 1427 peer->rate_last = jiffies;
1600 return; 1428 goto out_put_peer;
1601 } 1429 }
1602 1430
1603 /* Check for load limit; set rate_last to the latest sent 1431 /* Check for load limit; set rate_last to the latest sent
@@ -1618,16 +1446,34 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1618 &rt->rt_dst, &rt->rt_gateway); 1446 &rt->rt_dst, &rt->rt_gateway);
1619#endif 1447#endif
1620 } 1448 }
1449out_put_peer:
1450 inet_putpeer(peer);
1621} 1451}
1622 1452
1623static int ip_error(struct sk_buff *skb) 1453static int ip_error(struct sk_buff *skb)
1624{ 1454{
1455 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1625 struct rtable *rt = skb_rtable(skb); 1456 struct rtable *rt = skb_rtable(skb);
1626 struct inet_peer *peer; 1457 struct inet_peer *peer;
1627 unsigned long now; 1458 unsigned long now;
1459 struct net *net;
1628 bool send; 1460 bool send;
1629 int code; 1461 int code;
1630 1462
1463 net = dev_net(rt->dst.dev);
1464 if (!IN_DEV_FORWARD(in_dev)) {
1465 switch (rt->dst.error) {
1466 case EHOSTUNREACH:
1467 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1468 break;
1469
1470 case ENETUNREACH:
1471 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1472 break;
1473 }
1474 goto out;
1475 }
1476
1631 switch (rt->dst.error) { 1477 switch (rt->dst.error) {
1632 case EINVAL: 1478 case EINVAL:
1633 default: 1479 default:
@@ -1637,17 +1483,14 @@ static int ip_error(struct sk_buff *skb)
1637 break; 1483 break;
1638 case ENETUNREACH: 1484 case ENETUNREACH:
1639 code = ICMP_NET_UNREACH; 1485 code = ICMP_NET_UNREACH;
1640 IP_INC_STATS_BH(dev_net(rt->dst.dev), 1486 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1641 IPSTATS_MIB_INNOROUTES);
1642 break; 1487 break;
1643 case EACCES: 1488 case EACCES:
1644 code = ICMP_PKT_FILTERED; 1489 code = ICMP_PKT_FILTERED;
1645 break; 1490 break;
1646 } 1491 }
1647 1492
1648 if (!rt->peer) 1493 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1649 rt_bind_peer(rt, rt->rt_dst, 1);
1650 peer = rt->peer;
1651 1494
1652 send = true; 1495 send = true;
1653 if (peer) { 1496 if (peer) {
@@ -1660,6 +1503,7 @@ static int ip_error(struct sk_buff *skb)
1660 peer->rate_tokens -= ip_rt_error_cost; 1503 peer->rate_tokens -= ip_rt_error_cost;
1661 else 1504 else
1662 send = false; 1505 send = false;
1506 inet_putpeer(peer);
1663 } 1507 }
1664 if (send) 1508 if (send)
1665 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1509 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
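
ip_error() now holds a real reference on a per-netns peer, hence the inet_putpeer() added above; the limiter itself is an ordinary token bucket, restated here for clarity:

/* Token-bucket restatement of the logic around this hunk (sketch;
 * ip_rt_error_burst and ip_rt_error_cost are the existing sysctls):
 * tokens accrue with jiffies, are capped at the burst size, and each
 * ICMP error spends a fixed cost. */
now = jiffies;
peer->rate_tokens += now - peer->rate_last;
if (peer->rate_tokens > ip_rt_error_burst)
	peer->rate_tokens = ip_rt_error_burst;
peer->rate_last = now;
if (peer->rate_tokens >= ip_rt_error_cost)
	peer->rate_tokens -= ip_rt_error_cost;	/* allowed to send */
else
	send = false;				/* suppressed */
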
@@ -1668,136 +1512,47 @@ out: kfree_skb(skb);
1668 return 0; 1512 return 0;
1669} 1513}
1670 1514
1671/*
1672 * The last two values are not from the RFC but
1673 * are needed for AMPRnet AX.25 paths.
1674 */
1675
1676static const unsigned short mtu_plateau[] =
1677{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1678
1679static inline unsigned short guess_mtu(unsigned short old_mtu)
1680{
1681 int i;
1682
1683 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1684 if (old_mtu > mtu_plateau[i])
1685 return mtu_plateau[i];
1686 return 68;
1687}
1688
1689unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690 unsigned short new_mtu,
1691 struct net_device *dev)
1692{
1693 unsigned short old_mtu = ntohs(iph->tot_len);
1694 unsigned short est_mtu = 0;
1695 struct inet_peer *peer;
1696
1697 peer = inet_getpeer_v4(iph->daddr, 1);
1698 if (peer) {
1699 unsigned short mtu = new_mtu;
1700
1701 if (new_mtu < 68 || new_mtu >= old_mtu) {
1702 /* BSD 4.2 derived systems incorrectly adjust
1703 * tot_len by the IP header length, and report
1704 * a zero MTU in the ICMP message.
1705 */
1706 if (mtu == 0 &&
1707 old_mtu >= 68 + (iph->ihl << 2))
1708 old_mtu -= iph->ihl << 2;
1709 mtu = guess_mtu(old_mtu);
1710 }
1711
1712 if (mtu < ip_rt_min_pmtu)
1713 mtu = ip_rt_min_pmtu;
1714 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715 unsigned long pmtu_expires;
1716
1717 pmtu_expires = jiffies + ip_rt_mtu_expires;
1718 if (!pmtu_expires)
1719 pmtu_expires = 1UL;
1720
1721 est_mtu = mtu;
1722 peer->pmtu_learned = mtu;
1723 peer->pmtu_expires = pmtu_expires;
1724 atomic_inc(&__rt_peer_genid);
1725 }
1726
1727 inet_putpeer(peer);
1728 }
1729 return est_mtu ? : new_mtu;
1730}
1731
1732static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733{
1734 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1735
1736 if (!expires)
1737 return;
1738 if (time_before(jiffies, expires)) {
1739 u32 orig_dst_mtu = dst_mtu(dst);
1740 if (peer->pmtu_learned < orig_dst_mtu) {
1741 if (!peer->pmtu_orig)
1742 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744 }
1745 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747}
1748
1749static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1515static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750{ 1516{
1751 struct rtable *rt = (struct rtable *) dst; 1517 struct rtable *rt = (struct rtable *) dst;
1752 struct inet_peer *peer;
1753 1518
1754 dst_confirm(dst); 1519 dst_confirm(dst);
1755 1520
1756 if (!rt->peer) 1521 if (mtu < ip_rt_min_pmtu)
1757 rt_bind_peer(rt, rt->rt_dst, 1); 1522 mtu = ip_rt_min_pmtu;
1758 peer = rt->peer;
1759 if (peer) {
1760 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762 if (mtu < ip_rt_min_pmtu)
1763 mtu = ip_rt_min_pmtu;
1764 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765 1523
1766 pmtu_expires = jiffies + ip_rt_mtu_expires; 1524 rt->rt_pmtu = mtu;
1767 if (!pmtu_expires) 1525 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1768 pmtu_expires = 1UL; 1526}
1769 1527
1770 peer->pmtu_learned = mtu; 1528void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1771 peer->pmtu_expires = pmtu_expires; 1529 int oif, u32 mark, u8 protocol, int flow_flags)
1530{
1531 const struct iphdr *iph = (const struct iphdr *)skb->data;
1532 struct flowi4 fl4;
1533 struct rtable *rt;
1772 1534
1773 atomic_inc(&__rt_peer_genid); 1535 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1774 rt->rt_peer_genid = rt_peer_genid(); 1536 protocol, flow_flags,
1775 } 1537 iph->daddr, iph->saddr, 0, 0);
1776 check_peer_pmtu(dst, peer); 1538 rt = __ip_route_output_key(net, &fl4);
1539 if (!IS_ERR(rt)) {
1540 ip_rt_update_pmtu(&rt->dst, mtu);
1541 ip_rt_put(rt);
1777 } 1542 }
1778} 1543}
1544EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1779 1545
1780 1546void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1781static void ipv4_validate_peer(struct rtable *rt)
1782{ 1547{
1783 if (rt->rt_peer_genid != rt_peer_genid()) { 1548 const struct inet_sock *inet = inet_sk(sk);
1784 struct inet_peer *peer;
1785
1786 if (!rt->peer)
1787 rt_bind_peer(rt, rt->rt_dst, 0);
1788
1789 peer = rt->peer;
1790 if (peer) {
1791 check_peer_pmtu(&rt->dst, peer);
1792 1549
1793 if (peer->redirect_learned.a4 && 1550 return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1794 peer->redirect_learned.a4 != rt->rt_gateway) 1551 sk->sk_bound_dev_if, sk->sk_mark,
1795 check_peer_redir(&rt->dst, peer); 1552 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1796 } 1553 inet_sk_flowi_flags(sk));
1797
1798 rt->rt_peer_genid = rt_peer_genid();
1799 }
1800} 1554}
1555EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1801 1556
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1557static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{ 1558{
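
This hunk is the heart of the PMTU rework: guess_mtu(), ip_rt_frag_needed() and the peer-side learned-MTU bookkeeping collapse into rt->rt_pmtu plus a dst expiry, and ipv4_update_pmtu()/ipv4_sk_update_pmtu() become the entry points protocol handlers call when an ICMP fragmentation-needed message arrives. A hypothetical caller:

/* Hypothetical handler sketch for the new entry point; skb->data is
 * assumed to point at the embedded IP header of the affected flow,
 * and IPPROTO_TCP stands in for whatever protocol the handler
 * serves. */
static void frag_needed_seen(struct sk_buff *skb, u32 mtu)
{
	ipv4_update_pmtu(skb, dev_net(skb->dev), mtu,
			 0 /* oif: any */, skb->mark,
			 IPPROTO_TCP, 0 /* flow_flags */);
}
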
@@ -1805,23 +1560,17 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1805 1560
1806 if (rt_is_expired(rt)) 1561 if (rt_is_expired(rt))
1807 return NULL; 1562 return NULL;
1808 ipv4_validate_peer(rt);
1809 return dst; 1563 return dst;
1810} 1564}
1811 1565
1812static void ipv4_dst_destroy(struct dst_entry *dst) 1566static void ipv4_dst_destroy(struct dst_entry *dst)
1813{ 1567{
1814 struct rtable *rt = (struct rtable *) dst; 1568 struct rtable *rt = (struct rtable *) dst;
1815 struct inet_peer *peer = rt->peer;
1816 1569
1817 if (rt->fi) { 1570 if (rt->fi) {
1818 fib_info_put(rt->fi); 1571 fib_info_put(rt->fi);
1819 rt->fi = NULL; 1572 rt->fi = NULL;
1820 } 1573 }
1821 if (peer) {
1822 rt->peer = NULL;
1823 inet_putpeer(peer);
1824 }
1825} 1574}
1826 1575
1827 1576
@@ -1832,8 +1581,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
1832 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1581 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833 1582
1834 rt = skb_rtable(skb); 1583 rt = skb_rtable(skb);
1835 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) 1584 if (rt)
1836 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); 1585 dst_set_expires(&rt->dst, 0);
1837} 1586}
1838 1587
1839static int ip_rt_bug(struct sk_buff *skb) 1588static int ip_rt_bug(struct sk_buff *skb)
@@ -1913,7 +1662,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1913static unsigned int ipv4_mtu(const struct dst_entry *dst) 1662static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914{ 1663{
1915 const struct rtable *rt = (const struct rtable *) dst; 1664 const struct rtable *rt = (const struct rtable *) dst;
1916 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 1665 unsigned int mtu = rt->rt_pmtu;
1666
1667 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1668 mtu = 0;
1669
1670 if (!mtu)
1671 mtu = dst_metric_raw(dst, RTAX_MTU);
1917 1672
1918 if (mtu && rt_is_output_route(rt)) 1673 if (mtu && rt_is_output_route(rt))
1919 return mtu; 1674 return mtu;
@@ -1935,60 +1690,27 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1935static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1690static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936 struct fib_info *fi) 1691 struct fib_info *fi)
1937{ 1692{
1938 struct inet_peer *peer; 1693 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1939 int create = 0; 1694 rt->fi = fi;
1940 1695 atomic_inc(&fi->fib_clntref);
1941 /* If a peer entry exists for this destination, we must hook
1942 * it up in order to get at cached metrics.
1943 */
1944 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945 create = 1;
1946
1947 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948 if (peer) {
1949 rt->rt_peer_genid = rt_peer_genid();
1950 if (inet_metrics_new(peer))
1951 memcpy(peer->metrics, fi->fib_metrics,
1952 sizeof(u32) * RTAX_MAX);
1953 dst_init_metrics(&rt->dst, peer->metrics, false);
1954
1955 check_peer_pmtu(&rt->dst, peer);
1956
1957 if (peer->redirect_learned.a4 &&
1958 peer->redirect_learned.a4 != rt->rt_gateway) {
1959 rt->rt_gateway = peer->redirect_learned.a4;
1960 rt->rt_flags |= RTCF_REDIRECTED;
1961 }
1962 } else {
1963 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964 rt->fi = fi;
1965 atomic_inc(&fi->fib_clntref);
1966 }
1967 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968 } 1696 }
1697 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1969} 1698}
1970 1699
1971static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1700static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972 const struct fib_result *res, 1701 const struct fib_result *res,
1973 struct fib_info *fi, u16 type, u32 itag) 1702 struct fib_info *fi, u16 type, u32 itag)
1974{ 1703{
1975 struct dst_entry *dst = &rt->dst;
1976
1977 if (fi) { 1704 if (fi) {
1978 if (FIB_RES_GW(*res) && 1705 if (FIB_RES_GW(*res) &&
1979 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1706 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980 rt->rt_gateway = FIB_RES_GW(*res); 1707 rt->rt_gateway = FIB_RES_GW(*res);
1981 rt_init_metrics(rt, fl4, fi); 1708 rt_init_metrics(rt, fl4, fi);
1982#ifdef CONFIG_IP_ROUTE_CLASSID 1709#ifdef CONFIG_IP_ROUTE_CLASSID
1983 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1710 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1984#endif 1711#endif
1985 } 1712 }
1986 1713
1987 if (dst_mtu(dst) > IP_MAX_MTU)
1988 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992#ifdef CONFIG_IP_ROUTE_CLASSID 1714#ifdef CONFIG_IP_ROUTE_CLASSID
1993#ifdef CONFIG_IP_MULTIPLE_TABLES 1715#ifdef CONFIG_IP_MULTIPLE_TABLES
1994 set_class_tag(rt, fib_rules_tclass(res)); 1716 set_class_tag(rt, fib_rules_tclass(res));
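
Metric initialization no longer detours through a copy-on-write peer: when the fib_info carries non-default metrics, the route pins the fib_info and reads the metrics read-only, and the MTU/advmss clamping that used to live in rt_set_nexthop() moves into the ipv4_mtu()/advmss dst_ops. The lifetime pairing with ipv4_dst_destroy() above, as a sketch:

/* Lifetime sketch: the fib_clntref taken in rt_init_metrics() is what
 * ipv4_dst_destroy() drops, so fib_metrics stay valid while any
 * cached route points at them. */
static void rt_hold_metrics(struct rtable *rt, struct fib_info *fi)
{
	atomic_inc(&fi->fib_clntref);
	rt->fi = fi;
}

static void rt_drop_metrics(struct rtable *rt)
{
	if (rt->fi) {
		fib_info_put(rt->fi);	/* may free fi and its metrics */
		rt->fi = NULL;
	}
}
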
@@ -2012,7 +1734,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2012{ 1734{
2013 unsigned int hash; 1735 unsigned int hash;
2014 struct rtable *rth; 1736 struct rtable *rth;
2015 __be32 spec_dst;
2016 struct in_device *in_dev = __in_dev_get_rcu(dev); 1737 struct in_device *in_dev = __in_dev_get_rcu(dev);
2017 u32 itag = 0; 1738 u32 itag = 0;
2018 int err; 1739 int err;
@@ -2023,16 +1744,19 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2023 return -EINVAL; 1744 return -EINVAL;
2024 1745
2025 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1746 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) 1747 skb->protocol != htons(ETH_P_IP))
2027 goto e_inval; 1748 goto e_inval;
2028 1749
1750 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1751 if (ipv4_is_loopback(saddr))
1752 goto e_inval;
1753
2029 if (ipv4_is_zeronet(saddr)) { 1754 if (ipv4_is_zeronet(saddr)) {
2030 if (!ipv4_is_local_multicast(daddr)) 1755 if (!ipv4_is_local_multicast(daddr))
2031 goto e_inval; 1756 goto e_inval;
2032 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033 } else { 1757 } else {
2034 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, 1758 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2035 &itag); 1759 in_dev, &itag);
2036 if (err < 0) 1760 if (err < 0)
2037 goto e_err; 1761 goto e_err;
2038 } 1762 }
@@ -2058,10 +1782,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2058 rth->rt_iif = dev->ifindex; 1782 rth->rt_iif = dev->ifindex;
2059 rth->rt_oif = 0; 1783 rth->rt_oif = 0;
2060 rth->rt_mark = skb->mark; 1784 rth->rt_mark = skb->mark;
1785 rth->rt_pmtu = 0;
2061 rth->rt_gateway = daddr; 1786 rth->rt_gateway = daddr;
2062 rth->rt_spec_dst= spec_dst;
2063 rth->rt_peer_genid = 0;
2064 rth->peer = NULL;
2065 rth->fi = NULL; 1787 rth->fi = NULL;
2066 if (our) { 1788 if (our) {
2067 rth->dst.input= ip_local_deliver; 1789 rth->dst.input= ip_local_deliver;
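
The loopback-source rejection becomes conditional on IN_DEV_ROUTE_LOCALNET(), a per-device override (paired with the devinet changes in this series, presumably a route_localnet knob) that allows 127/8 to appear on the wire, e.g. for local load balancing; spec_dst disappears here because fib_validate_source() now derives it from in_dev internally. The gate, isolated:

/* Gate sketch: loopback sources stay martian unless the receiving
 * device explicitly opted in via the (assumed) route_localnet
 * setting. */
static bool mc_saddr_allowed(struct in_device *in_dev, __be32 saddr)
{
	return !ipv4_is_loopback(saddr) || IN_DEV_ROUTE_LOCALNET(in_dev);
}
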
@@ -2123,7 +1845,6 @@ static int __mkroute_input(struct sk_buff *skb,
2123 int err; 1845 int err;
2124 struct in_device *out_dev; 1846 struct in_device *out_dev;
2125 unsigned int flags = 0; 1847 unsigned int flags = 0;
2126 __be32 spec_dst;
2127 u32 itag; 1848 u32 itag;
2128 1849
2129 /* get a working reference to the output device */ 1850 /* get a working reference to the output device */
@@ -2135,7 +1856,7 @@ static int __mkroute_input(struct sk_buff *skb,
2135 1856
2136 1857
2137 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 1858 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2138 in_dev->dev, &spec_dst, &itag); 1859 in_dev->dev, in_dev, &itag);
2139 if (err < 0) { 1860 if (err < 0) {
2140 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1861 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2141 saddr); 1862 saddr);
@@ -2186,10 +1907,8 @@ static int __mkroute_input(struct sk_buff *skb,
2186 rth->rt_iif = in_dev->dev->ifindex; 1907 rth->rt_iif = in_dev->dev->ifindex;
2187 rth->rt_oif = 0; 1908 rth->rt_oif = 0;
2188 rth->rt_mark = skb->mark; 1909 rth->rt_mark = skb->mark;
1910 rth->rt_pmtu = 0;
2189 rth->rt_gateway = daddr; 1911 rth->rt_gateway = daddr;
2190 rth->rt_spec_dst= spec_dst;
2191 rth->rt_peer_genid = 0;
2192 rth->peer = NULL;
2193 rth->fi = NULL; 1912 rth->fi = NULL;
2194 1913
2195 rth->dst.input = ip_forward; 1914 rth->dst.input = ip_forward;
@@ -2253,7 +1972,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2253 u32 itag = 0; 1972 u32 itag = 0;
2254 struct rtable *rth; 1973 struct rtable *rth;
2255 unsigned int hash; 1974 unsigned int hash;
2256 __be32 spec_dst;
2257 int err = -EINVAL; 1975 int err = -EINVAL;
2258 struct net *net = dev_net(dev); 1976 struct net *net = dev_net(dev);
2259 1977
@@ -2266,8 +1984,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2266 by fib_lookup. 1984 by fib_lookup.
2267 */ 1985 */
2268 1986
2269 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1987 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2270 ipv4_is_loopback(saddr))
2271 goto martian_source; 1988 goto martian_source;
2272 1989
2273 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 1990 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
@@ -2279,9 +1996,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2279 if (ipv4_is_zeronet(saddr)) 1996 if (ipv4_is_zeronet(saddr))
2280 goto martian_source; 1997 goto martian_source;
2281 1998
2282 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) 1999 if (ipv4_is_zeronet(daddr))
2283 goto martian_destination; 2000 goto martian_destination;
2284 2001
2002 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2003 if (ipv4_is_loopback(daddr))
2004 goto martian_destination;
2005
2006 if (ipv4_is_loopback(saddr))
2007 goto martian_source;
2008 }
2009
2285 /* 2010 /*
2286 * Now we are ready to route packet. 2011 * Now we are ready to route packet.
2287 */ 2012 */
@@ -2293,11 +2018,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2293 fl4.daddr = daddr; 2018 fl4.daddr = daddr;
2294 fl4.saddr = saddr; 2019 fl4.saddr = saddr;
2295 err = fib_lookup(net, &fl4, &res); 2020 err = fib_lookup(net, &fl4, &res);
2296 if (err != 0) { 2021 if (err != 0)
2297 if (!IN_DEV_FORWARD(in_dev))
2298 goto e_hostunreach;
2299 goto no_route; 2022 goto no_route;
2300 }
2301 2023
2302 RT_CACHE_STAT_INC(in_slow_tot); 2024 RT_CACHE_STAT_INC(in_slow_tot);
2303 2025
@@ -2307,17 +2029,16 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2307 if (res.type == RTN_LOCAL) { 2029 if (res.type == RTN_LOCAL) {
2308 err = fib_validate_source(skb, saddr, daddr, tos, 2030 err = fib_validate_source(skb, saddr, daddr, tos,
2309 net->loopback_dev->ifindex, 2031 net->loopback_dev->ifindex,
2310 dev, &spec_dst, &itag); 2032 dev, in_dev, &itag);
2311 if (err < 0) 2033 if (err < 0)
2312 goto martian_source_keep_err; 2034 goto martian_source_keep_err;
2313 if (err) 2035 if (err)
2314 flags |= RTCF_DIRECTSRC; 2036 flags |= RTCF_DIRECTSRC;
2315 spec_dst = daddr;
2316 goto local_input; 2037 goto local_input;
2317 } 2038 }
2318 2039
2319 if (!IN_DEV_FORWARD(in_dev)) 2040 if (!IN_DEV_FORWARD(in_dev))
2320 goto e_hostunreach; 2041 goto no_route;
2321 if (res.type != RTN_UNICAST) 2042 if (res.type != RTN_UNICAST)
2322 goto martian_destination; 2043 goto martian_destination;
2323 2044
@@ -2328,11 +2049,9 @@ brd_input:
2328 if (skb->protocol != htons(ETH_P_IP)) 2049 if (skb->protocol != htons(ETH_P_IP))
2329 goto e_inval; 2050 goto e_inval;
2330 2051
2331 if (ipv4_is_zeronet(saddr)) 2052 if (!ipv4_is_zeronet(saddr)) {
2332 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2053 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2333 else { 2054 in_dev, &itag);
2334 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2335 &itag);
2336 if (err < 0) 2055 if (err < 0)
2337 goto martian_source_keep_err; 2056 goto martian_source_keep_err;
2338 if (err) 2057 if (err)
@@ -2362,17 +2081,12 @@ local_input:
2362 rth->rt_key_tos = tos; 2081 rth->rt_key_tos = tos;
2363 rth->rt_dst = daddr; 2082 rth->rt_dst = daddr;
2364 rth->rt_src = saddr; 2083 rth->rt_src = saddr;
2365#ifdef CONFIG_IP_ROUTE_CLASSID
2366 rth->dst.tclassid = itag;
2367#endif
2368 rth->rt_route_iif = dev->ifindex; 2084 rth->rt_route_iif = dev->ifindex;
2369 rth->rt_iif = dev->ifindex; 2085 rth->rt_iif = dev->ifindex;
2370 rth->rt_oif = 0; 2086 rth->rt_oif = 0;
2371 rth->rt_mark = skb->mark; 2087 rth->rt_mark = skb->mark;
2088 rth->rt_pmtu = 0;
2372 rth->rt_gateway = daddr; 2089 rth->rt_gateway = daddr;
2373 rth->rt_spec_dst= spec_dst;
2374 rth->rt_peer_genid = 0;
2375 rth->peer = NULL;
2376 rth->fi = NULL; 2090 rth->fi = NULL;
2377 if (res.type == RTN_UNREACHABLE) { 2091 if (res.type == RTN_UNREACHABLE) {
2378 rth->dst.input= ip_error; 2092 rth->dst.input= ip_error;
@@ -2388,7 +2102,6 @@ local_input:
2388 2102
2389no_route: 2103no_route:
2390 RT_CACHE_STAT_INC(in_no_route); 2104 RT_CACHE_STAT_INC(in_no_route);
2391 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2392 res.type = RTN_UNREACHABLE; 2105 res.type = RTN_UNREACHABLE;
2393 if (err == -ESRCH) 2106 if (err == -ESRCH)
2394 err = -ENETUNREACH; 2107 err = -ENETUNREACH;
@@ -2405,10 +2118,6 @@ martian_destination:
2405 &daddr, &saddr, dev->name); 2118 &daddr, &saddr, dev->name);
2406#endif 2119#endif
2407 2120
2408e_hostunreach:
2409 err = -EHOSTUNREACH;
2410 goto out;
2411
2412e_inval: 2121e_inval:
2413 err = -EINVAL; 2122 err = -EINVAL;
2414 goto out; 2123 goto out;
@@ -2452,7 +2161,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2452 rth->rt_mark == skb->mark && 2161 rth->rt_mark == skb->mark &&
2453 net_eq(dev_net(rth->dst.dev), net) && 2162 net_eq(dev_net(rth->dst.dev), net) &&
2454 !rt_is_expired(rth)) { 2163 !rt_is_expired(rth)) {
2455 ipv4_validate_peer(rth);
2456 if (noref) { 2164 if (noref) {
2457 dst_use_noref(&rth->dst, jiffies); 2165 dst_use_noref(&rth->dst, jiffies);
2458 skb_dst_set_noref(skb, &rth->dst); 2166 skb_dst_set_noref(skb, &rth->dst);
@@ -2520,9 +2228,14 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2520 u16 type = res->type; 2228 u16 type = res->type;
2521 struct rtable *rth; 2229 struct rtable *rth;
2522 2230
2523 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) 2231 in_dev = __in_dev_get_rcu(dev_out);
2232 if (!in_dev)
2524 return ERR_PTR(-EINVAL); 2233 return ERR_PTR(-EINVAL);
2525 2234
2235 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2236 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2237 return ERR_PTR(-EINVAL);
2238
2526 if (ipv4_is_lbcast(fl4->daddr)) 2239 if (ipv4_is_lbcast(fl4->daddr))
2527 type = RTN_BROADCAST; 2240 type = RTN_BROADCAST;
2528 else if (ipv4_is_multicast(fl4->daddr)) 2241 else if (ipv4_is_multicast(fl4->daddr))
@@ -2533,10 +2246,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2533 if (dev_out->flags & IFF_LOOPBACK) 2246 if (dev_out->flags & IFF_LOOPBACK)
2534 flags |= RTCF_LOCAL; 2247 flags |= RTCF_LOCAL;
2535 2248
2536 in_dev = __in_dev_get_rcu(dev_out);
2537 if (!in_dev)
2538 return ERR_PTR(-EINVAL);
2539
2540 if (type == RTN_BROADCAST) { 2249 if (type == RTN_BROADCAST) {
2541 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2250 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2542 fi = NULL; 2251 fi = NULL;
@@ -2573,20 +2282,15 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2573 rth->rt_iif = orig_oif ? : dev_out->ifindex; 2282 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2574 rth->rt_oif = orig_oif; 2283 rth->rt_oif = orig_oif;
2575 rth->rt_mark = fl4->flowi4_mark; 2284 rth->rt_mark = fl4->flowi4_mark;
2285 rth->rt_pmtu = 0;
2576 rth->rt_gateway = fl4->daddr; 2286 rth->rt_gateway = fl4->daddr;
2577 rth->rt_spec_dst= fl4->saddr;
2578 rth->rt_peer_genid = 0;
2579 rth->peer = NULL;
2580 rth->fi = NULL; 2287 rth->fi = NULL;
2581 2288
2582 RT_CACHE_STAT_INC(out_slow_tot); 2289 RT_CACHE_STAT_INC(out_slow_tot);
2583 2290
2584 if (flags & RTCF_LOCAL) { 2291 if (flags & RTCF_LOCAL)
2585 rth->dst.input = ip_local_deliver; 2292 rth->dst.input = ip_local_deliver;
2586 rth->rt_spec_dst = fl4->daddr;
2587 }
2588 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2293 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2589 rth->rt_spec_dst = fl4->saddr;
2590 if (flags & RTCF_LOCAL && 2294 if (flags & RTCF_LOCAL &&
2591 !(dev_out->flags & IFF_LOOPBACK)) { 2295 !(dev_out->flags & IFF_LOOPBACK)) {
2592 rth->dst.output = ip_mc_output; 2296 rth->dst.output = ip_mc_output;
@@ -2605,6 +2309,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2605 2309
2606 rt_set_nexthop(rth, fl4, res, fi, type, 0); 2310 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2607 2311
2312 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2313 rth->dst.flags |= DST_NOCACHE;
2314
2608 return rth; 2315 return rth;
2609} 2316}
2610 2317
@@ -2625,6 +2332,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2625 int orig_oif; 2332 int orig_oif;
2626 2333
2627 res.fi = NULL; 2334 res.fi = NULL;
2335 res.table = NULL;
2628#ifdef CONFIG_IP_MULTIPLE_TABLES 2336#ifdef CONFIG_IP_MULTIPLE_TABLES
2629 res.r = NULL; 2337 res.r = NULL;
2630#endif 2338#endif
@@ -2730,6 +2438,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2730 2438
2731 if (fib_lookup(net, fl4, &res)) { 2439 if (fib_lookup(net, fl4, &res)) {
2732 res.fi = NULL; 2440 res.fi = NULL;
2441 res.table = NULL;
2733 if (fl4->flowi4_oif) { 2442 if (fl4->flowi4_oif) {
2734 /* Apparently, routing tables are wrong. Assume, 2443 /* Apparently, routing tables are wrong. Assume,
2735 that the destination is on link. 2444 that the destination is on link.
@@ -2828,7 +2537,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2828 (IPTOS_RT_MASK | RTO_ONLINK)) && 2537 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2829 net_eq(dev_net(rth->dst.dev), net) && 2538 net_eq(dev_net(rth->dst.dev), net) &&
2830 !rt_is_expired(rth)) { 2539 !rt_is_expired(rth)) {
2831 ipv4_validate_peer(rth);
2832 dst_use(&rth->dst, jiffies); 2540 dst_use(&rth->dst, jiffies);
2833 RT_CACHE_STAT_INC(out_hit); 2541 RT_CACHE_STAT_INC(out_hit);
2834 rcu_read_unlock_bh(); 2542 rcu_read_unlock_bh();
@@ -2892,7 +2600,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2892 new->__use = 1; 2600 new->__use = 1;
2893 new->input = dst_discard; 2601 new->input = dst_discard;
2894 new->output = dst_discard; 2602 new->output = dst_discard;
2895 dst_copy_metrics(new, &ort->dst);
2896 2603
2897 new->dev = ort->dst.dev; 2604 new->dev = ort->dst.dev;
2898 if (new->dev) 2605 if (new->dev)
@@ -2905,6 +2612,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2905 rt->rt_iif = ort->rt_iif; 2612 rt->rt_iif = ort->rt_iif;
2906 rt->rt_oif = ort->rt_oif; 2613 rt->rt_oif = ort->rt_oif;
2907 rt->rt_mark = ort->rt_mark; 2614 rt->rt_mark = ort->rt_mark;
2615 rt->rt_pmtu = ort->rt_pmtu;
2908 2616
2909 rt->rt_genid = rt_genid(net); 2617 rt->rt_genid = rt_genid(net);
2910 rt->rt_flags = ort->rt_flags; 2618 rt->rt_flags = ort->rt_flags;
@@ -2912,10 +2620,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2912 rt->rt_dst = ort->rt_dst; 2620 rt->rt_dst = ort->rt_dst;
2913 rt->rt_src = ort->rt_src; 2621 rt->rt_src = ort->rt_src;
2914 rt->rt_gateway = ort->rt_gateway; 2622 rt->rt_gateway = ort->rt_gateway;
2915 rt->rt_spec_dst = ort->rt_spec_dst;
2916 rt->peer = ort->peer;
2917 if (rt->peer)
2918 atomic_inc(&rt->peer->refcnt);
2919 rt->fi = ort->fi; 2623 rt->fi = ort->fi;
2920 if (rt->fi) 2624 if (rt->fi)
2921 atomic_inc(&rt->fi->fib_clntref); 2625 atomic_inc(&rt->fi->fib_clntref);
@@ -2953,8 +2657,7 @@ static int rt_fill_info(struct net *net,
2953 struct rtmsg *r; 2657 struct rtmsg *r;
2954 struct nlmsghdr *nlh; 2658 struct nlmsghdr *nlh;
2955 unsigned long expires = 0; 2659 unsigned long expires = 0;
2956 const struct inet_peer *peer = rt->peer; 2660 u32 error;
2957 u32 id = 0, ts = 0, tsage = 0, error;
2958 2661
2959 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2662 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2960 if (nlh == NULL) 2663 if (nlh == NULL)
@@ -2990,10 +2693,8 @@ static int rt_fill_info(struct net *net,
2990 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2693 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991 goto nla_put_failure; 2694 goto nla_put_failure;
2992#endif 2695#endif
2993 if (rt_is_input_route(rt)) { 2696 if (!rt_is_input_route(rt) &&
2994 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) 2697 rt->rt_src != rt->rt_key_src) {
2995 goto nla_put_failure;
2996 } else if (rt->rt_src != rt->rt_key_src) {
2997 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) 2698 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2998 goto nla_put_failure; 2699 goto nla_put_failure;
2999 } 2700 }
@@ -3009,20 +2710,12 @@ static int rt_fill_info(struct net *net,
3009 goto nla_put_failure; 2710 goto nla_put_failure;
3010 2711
3011 error = rt->dst.error; 2712 error = rt->dst.error;
3012 if (peer) { 2713 expires = rt->dst.expires;
3013 inet_peer_refcheck(rt->peer); 2714 if (expires) {
3014 id = atomic_read(&peer->ip_id_count) & 0xffff; 2715 if (time_before(jiffies, expires))
3015 if (peer->tcp_ts_stamp) { 2716 expires -= jiffies;
3016 ts = peer->tcp_ts; 2717 else
3017 tsage = get_seconds() - peer->tcp_ts_stamp; 2718 expires = 0;
3018 }
3019 expires = ACCESS_ONCE(peer->pmtu_expires);
3020 if (expires) {
3021 if (time_before(jiffies, expires))
3022 expires -= jiffies;
3023 else
3024 expires = 0;
3025 }
3026 } 2719 }
3027 2720
3028 if (rt_is_input_route(rt)) { 2721 if (rt_is_input_route(rt)) {
@@ -3051,8 +2744,7 @@ static int rt_fill_info(struct net *net,
3051 goto nla_put_failure; 2744 goto nla_put_failure;
3052 } 2745 }
3053 2746
3054 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2747 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3055 expires, error) < 0)
3056 goto nla_put_failure; 2748 goto nla_put_failure;
3057 2749
3058 return nlmsg_end(skb, nlh); 2750 return nlmsg_end(skb, nlh);
@@ -3400,6 +3092,30 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3400 .init = rt_genid_init, 3092 .init = rt_genid_init,
3401}; 3093};
3402 3094
3095static int __net_init ipv4_inetpeer_init(struct net *net)
3096{
3097 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3098
3099 if (!bp)
3100 return -ENOMEM;
3101 inet_peer_base_init(bp);
3102 net->ipv4.peers = bp;
3103 return 0;
3104}
3105
3106static void __net_exit ipv4_inetpeer_exit(struct net *net)
3107{
3108 struct inet_peer_base *bp = net->ipv4.peers;
3109
3110 net->ipv4.peers = NULL;
3111 inetpeer_invalidate_tree(bp);
3112 kfree(bp);
3113}
3114
3115static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3116 .init = ipv4_inetpeer_init,
3117 .exit = ipv4_inetpeer_exit,
3118};
3403 3119
3404#ifdef CONFIG_IP_ROUTE_CLASSID 3120#ifdef CONFIG_IP_ROUTE_CLASSID
3405struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3121struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
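
The per-namespace peer base added here is what made the inetpeer_invalidate_tree(AF_INET) call in rt_cache_invalidate() removable: each netns allocates its own base at init, and teardown invalidates and frees exactly that tree. Every lookup site in this patch therefore threads net->ipv4.peers through explicitly:

/* Usage sketch: lookups now name the base they hit, so peers (and
 * their IP ID and rate-limit state) never leak across namespaces. */
struct inet_peer *peer;

peer = inet_getpeer_v4(net->ipv4.peers, daddr, 1 /* create */);
if (peer) {
	/* consume peer state: ip_id_count, rate_tokens, ... */
	inet_putpeer(peer);
}
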
@@ -3480,6 +3196,7 @@ int __init ip_rt_init(void)
3480 register_pernet_subsys(&sysctl_route_ops); 3196 register_pernet_subsys(&sysctl_route_ops);
3481#endif 3197#endif
3482 register_pernet_subsys(&rt_genid_ops); 3198 register_pernet_subsys(&rt_genid_ops);
3199 register_pernet_subsys(&ipv4_inetpeer_ops);
3483 return rc; 3200 return rc;
3484} 3201}
3485 3202
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ef32956ed655..12aa0c5867c4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -301,6 +301,13 @@ static struct ctl_table ipv4_table[] = {
301 .proc_handler = proc_dointvec 301 .proc_handler = proc_dointvec
302 }, 302 },
303 { 303 {
304 .procname = "ip_early_demux",
305 .data = &sysctl_ip_early_demux,
306 .maxlen = sizeof(int),
307 .mode = 0644,
308 .proc_handler = proc_dointvec
309 },
310 {
304 .procname = "ip_dynaddr", 311 .procname = "ip_dynaddr",
305 .data = &sysctl_ip_dynaddr, 312 .data = &sysctl_ip_dynaddr,
306 .maxlen = sizeof(int), 313 .maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f60e4e..d902da96d154 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3310,8 +3310,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
3310 3310
3311#endif 3311#endif
3312 3312
3313/** 3313/* Each Responder maintains up to two secret values concurrently for
3314 * Each Responder maintains up to two secret values concurrently for
3315 * efficient secret rollover. Each secret value has 4 states: 3314 * efficient secret rollover. Each secret value has 4 states:
3316 * 3315 *
3317 * Generating. (tcp_secret_generating != tcp_secret_primary) 3316 * Generating. (tcp_secret_generating != tcp_secret_primary)
@@ -3563,6 +3562,8 @@ void __init tcp_init(void)
3563 pr_info("Hash tables configured (established %u bind %u)\n", 3562 pr_info("Hash tables configured (established %u bind %u)\n",
3564 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3563 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3565 3564
3565 tcp_metrics_init();
3566
3566 tcp_register_congestion_control(&tcp_reno); 3567 tcp_register_congestion_control(&tcp_reno);
3567 3568
3568 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); 3569 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8bce8b..055ac49b8b40 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
93int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 93int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
94int sysctl_tcp_frto __read_mostly = 2; 94int sysctl_tcp_frto __read_mostly = 2;
95int sysctl_tcp_frto_response __read_mostly; 95int sysctl_tcp_frto_response __read_mostly;
96int sysctl_tcp_nometrics_save __read_mostly;
97 96
98int sysctl_tcp_thin_dupack __read_mostly; 97int sysctl_tcp_thin_dupack __read_mostly;
99 98
@@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
701/* Calculate rto without backoff. This is the second half of Van Jacobson's 700/* Calculate rto without backoff. This is the second half of Van Jacobson's
702 * routine referred to above. 701 * routine referred to above.
703 */ 702 */
704static inline void tcp_set_rto(struct sock *sk) 703void tcp_set_rto(struct sock *sk)
705{ 704{
706 const struct tcp_sock *tp = tcp_sk(sk); 705 const struct tcp_sock *tp = tcp_sk(sk);
707 /* Old crap is replaced with new one. 8) 706 /* Old crap is replaced with new one. 8)
@@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk)
728 tcp_bound_rto(sk); 727 tcp_bound_rto(sk);
729} 728}
730 729
731/* Save metrics learned by this TCP session.
732 This function is called only, when TCP finishes successfully
733 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
734 */
735void tcp_update_metrics(struct sock *sk)
736{
737 struct tcp_sock *tp = tcp_sk(sk);
738 struct dst_entry *dst = __sk_dst_get(sk);
739
740 if (sysctl_tcp_nometrics_save)
741 return;
742
743 dst_confirm(dst);
744
745 if (dst && (dst->flags & DST_HOST)) {
746 const struct inet_connection_sock *icsk = inet_csk(sk);
747 int m;
748 unsigned long rtt;
749
750 if (icsk->icsk_backoff || !tp->srtt) {
751 /* This session failed to estimate rtt. Why?
752 * Probably, no packets returned in time.
753 * Reset our results.
754 */
755 if (!(dst_metric_locked(dst, RTAX_RTT)))
756 dst_metric_set(dst, RTAX_RTT, 0);
757 return;
758 }
759
760 rtt = dst_metric_rtt(dst, RTAX_RTT);
761 m = rtt - tp->srtt;
762
763 /* If newly calculated rtt larger than stored one,
764 * store new one. Otherwise, use EWMA. Remember,
765 * rtt overestimation is always better than underestimation.
766 */
767 if (!(dst_metric_locked(dst, RTAX_RTT))) {
768 if (m <= 0)
769 set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
770 else
771 set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
772 }
773
774 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
775 unsigned long var;
776 if (m < 0)
777 m = -m;
778
779 /* Scale deviation to rttvar fixed point */
780 m >>= 1;
781 if (m < tp->mdev)
782 m = tp->mdev;
783
784 var = dst_metric_rtt(dst, RTAX_RTTVAR);
785 if (m >= var)
786 var = m;
787 else
788 var -= (var - m) >> 2;
789
790 set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
791 }
792
793 if (tcp_in_initial_slowstart(tp)) {
794 /* Slow start still did not finish. */
795 if (dst_metric(dst, RTAX_SSTHRESH) &&
796 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
797 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
798 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
799 if (!dst_metric_locked(dst, RTAX_CWND) &&
800 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
801 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
802 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
803 icsk->icsk_ca_state == TCP_CA_Open) {
804 /* Cong. avoidance phase, cwnd is reliable. */
805 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
806 dst_metric_set(dst, RTAX_SSTHRESH,
807 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
808 if (!dst_metric_locked(dst, RTAX_CWND))
809 dst_metric_set(dst, RTAX_CWND,
810 (dst_metric(dst, RTAX_CWND) +
811 tp->snd_cwnd) >> 1);
812 } else {
813 /* Else slow start did not finish, cwnd is non-sense,
814 ssthresh may be also invalid.
815 */
816 if (!dst_metric_locked(dst, RTAX_CWND))
817 dst_metric_set(dst, RTAX_CWND,
818 (dst_metric(dst, RTAX_CWND) +
819 tp->snd_ssthresh) >> 1);
820 if (dst_metric(dst, RTAX_SSTHRESH) &&
821 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
822 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
823 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
824 }
825
826 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
827 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
828 tp->reordering != sysctl_tcp_reordering)
829 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
830 }
831 }
832}
833
834__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) 730__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
835{ 731{
836 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 732 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
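
tcp_update_metrics() moves wholesale into the new tcp_metrics.c (see the Makefile and tcp_metrics_init() hunks above), taking the sysctl_tcp_nometrics_save knob with it. Its central rule survives the move and is worth restating: RTT overestimation beats underestimation, so a sample larger than the stored value replaces it outright, while a smaller one only pulls it down by an eighth of the gap:

/* EWMA restatement of the moved code (sketch); stored and srtt are in
 * the same units the dst RTT metrics use. */
static unsigned long blend_rtt(unsigned long stored, unsigned long srtt)
{
	long m = (long)(stored - srtt);

	return m <= 0 ? srtt : stored - (m >> 3);
}
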
@@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
867 * Packet counting of FACK is based on in-order assumptions, therefore TCP 763 * Packet counting of FACK is based on in-order assumptions, therefore TCP
868 * disables it when reordering is detected 764 * disables it when reordering is detected
869 */ 765 */
870static void tcp_disable_fack(struct tcp_sock *tp) 766void tcp_disable_fack(struct tcp_sock *tp)
871{ 767{
872 /* RFC3517 uses different metric in lost marker => reset on change */ 768 /* RFC3517 uses different metric in lost marker => reset on change */
873 if (tcp_is_fack(tp)) 769 if (tcp_is_fack(tp))
@@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
881 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 777 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
882} 778}
883 779
884/* Initialize metrics on socket. */
885
886static void tcp_init_metrics(struct sock *sk)
887{
888 struct tcp_sock *tp = tcp_sk(sk);
889 struct dst_entry *dst = __sk_dst_get(sk);
890
891 if (dst == NULL)
892 goto reset;
893
894 dst_confirm(dst);
895
896 if (dst_metric_locked(dst, RTAX_CWND))
897 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
898 if (dst_metric(dst, RTAX_SSTHRESH)) {
899 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
900 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
901 tp->snd_ssthresh = tp->snd_cwnd_clamp;
902 } else {
903 /* ssthresh may have been reduced unnecessarily during.
904 * 3WHS. Restore it back to its initial default.
905 */
906 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
907 }
908 if (dst_metric(dst, RTAX_REORDERING) &&
909 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
910 tcp_disable_fack(tp);
911 tcp_disable_early_retrans(tp);
912 tp->reordering = dst_metric(dst, RTAX_REORDERING);
913 }
914
915 if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
916 goto reset;
917
918 /* Initial rtt is determined from SYN,SYN-ACK.
919 * The segment is small and rtt may appear much
920 * less than real one. Use per-dst memory
921 * to make it more realistic.
922 *
923 * A bit of theory. RTT is time passed after "normal" sized packet
924 * is sent until it is ACKed. In normal circumstances sending small
925 * packets force peer to delay ACKs and calculation is correct too.
926 * The algorithm is adaptive and, provided we follow specs, it
927 * NEVER underestimate RTT. BUT! If peer tries to make some clever
928 * tricks sort of "quick acks" for time long enough to decrease RTT
929 * to low value, and then abruptly stops to do it and starts to delay
930 * ACKs, wait for troubles.
931 */
932 if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
933 tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
934 tp->rtt_seq = tp->snd_nxt;
935 }
936 if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
937 tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
938 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
939 }
940 tcp_set_rto(sk);
941reset:
942 if (tp->srtt == 0) {
943 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
944 * 3WHS. This is most likely due to retransmission,
945 * including spurious one. Reset the RTO back to 3secs
946 * from the more aggressive 1sec to avoid more spurious
947 * retransmission.
948 */
949 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
950 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
951 }
952 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
953 * retransmitted. In light of RFC6298 more aggressive 1sec
954 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
955 * retransmission has occurred.
956 */
957 if (tp->total_retrans > 1)
958 tp->snd_cwnd = 1;
959 else
960 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
961 tp->snd_cwnd_stamp = tcp_time_stamp;
962}
963
964static void tcp_update_reordering(struct sock *sk, const int metric, 780static void tcp_update_reordering(struct sock *sk, const int metric,
965 const int ts) 781 const int ts)
966{ 782{
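
tcp_init_metrics() likewise moves to tcp_metrics.c, which is why tcp_set_rto() and tcp_disable_fack() lose their static qualifiers above. Its closing fallback encodes two RFC-driven policies, restated:

/* Fallback sketch of the moved tail: with no usable RTT sample after
 * the 3WHS, back the RTO off to 3 seconds (RFC 6298 section 5.7), and
 * only collapse the initial cwnd to 1 (RFC 5681) when more than one
 * SYN or SYN-ACK retransmission occurred. */
if (tp->srtt == 0) {
	tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
}
tp->snd_cwnd = tp->total_retrans > 1 ? 1 : tcp_init_cwnd(tp, dst);
tp->snd_cwnd_stamp = tcp_time_stamp;
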
@@ -3869,9 +3685,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3869 tcp_cong_avoid(sk, ack, prior_in_flight); 3685 tcp_cong_avoid(sk, ack, prior_in_flight);
3870 } 3686 }
3871 3687
3872 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3688 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3873 dst_confirm(__sk_dst_get(sk)); 3689 struct dst_entry *dst = __sk_dst_get(sk);
3874 3690 if (dst)
3691 dst_confirm(dst);
3692 }
3875 return 1; 3693 return 1;
3876 3694
3877no_queue: 3695no_queue:
@@ -5518,6 +5336,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5518 struct tcp_sock *tp = tcp_sk(sk); 5336 struct tcp_sock *tp = tcp_sk(sk);
5519 int res; 5337 int res;
5520 5338
5339 if (sk->sk_rx_dst) {
5340 struct dst_entry *dst = sk->sk_rx_dst;
5341 if (unlikely(dst->obsolete)) {
5342 if (dst->ops->check(dst, 0) == NULL) {
5343 dst_release(dst);
5344 sk->sk_rx_dst = NULL;
5345 }
5346 }
5347 }
5348 if (unlikely(sk->sk_rx_dst == NULL))
5349 sk->sk_rx_dst = dst_clone(skb_dst(skb));
5350
5521 /* 5351 /*
5522 * Header prediction. 5352 * Header prediction.
5523 * The code loosely follows the one in the famous 5353 * The code loosely follows the one in the famous
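
This is the consumer side of the new sk->sk_rx_dst cache (released in inet_sock_destruct() and primed again in tcp_finish_connect() below): the established fast path keeps the input route on the socket so early demux can skip a route lookup, revalidating it through dst->ops->check() whenever it has been obsoleted. The pattern, isolated:

/* Validate-or-refresh sketch of the hunk above. */
static void rx_dst_refresh(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = sk->sk_rx_dst;

	if (dst && unlikely(dst->obsolete) &&
	    dst->ops->check(dst, 0) == NULL) {
		dst_release(dst);	/* stale: drop our reference */
		sk->sk_rx_dst = NULL;
	}
	if (unlikely(sk->sk_rx_dst == NULL))
		sk->sk_rx_dst = dst_clone(skb_dst(skb));
}
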
@@ -5729,8 +5559,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5729 5559
5730 tcp_set_state(sk, TCP_ESTABLISHED); 5560 tcp_set_state(sk, TCP_ESTABLISHED);
5731 5561
5732 if (skb != NULL) 5562 if (skb != NULL) {
5563 sk->sk_rx_dst = dst_clone(skb_dst(skb));
5733 security_inet_conn_established(sk, skb); 5564 security_inet_conn_established(sk, skb);
5565 }
5734 5566
5735 /* Make sure socket is routed, for correct metrics. */ 5567 /* Make sure socket is routed, for correct metrics. */
5736 icsk->icsk_af_ops->rebuild_header(sk); 5568 icsk->icsk_af_ops->rebuild_header(sk);
@@ -6126,9 +5958,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6126 5958
6127 case TCP_FIN_WAIT1: 5959 case TCP_FIN_WAIT1:
6128 if (tp->snd_una == tp->write_seq) { 5960 if (tp->snd_una == tp->write_seq) {
5961 struct dst_entry *dst;
5962
6129 tcp_set_state(sk, TCP_FIN_WAIT2); 5963 tcp_set_state(sk, TCP_FIN_WAIT2);
6130 sk->sk_shutdown |= SEND_SHUTDOWN; 5964 sk->sk_shutdown |= SEND_SHUTDOWN;
6131 dst_confirm(__sk_dst_get(sk)); 5965
5966 dst = __sk_dst_get(sk);
5967 if (dst)
5968 dst_confirm(dst);
6132 5969
6133 if (!sock_flag(sk, SOCK_DEAD)) 5970 if (!sock_flag(sk, SOCK_DEAD))
6134 /* Wake up lingering close() */ 5971 /* Wake up lingering close() */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c8d28c433b2b..ddefd39ac0cf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
209 } 209 }
210 210
211 if (tcp_death_row.sysctl_tw_recycle && 211 if (tcp_death_row.sysctl_tw_recycle &&
212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { 212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); 213 tcp_fetch_timewait_stamp(sk, &rt->dst);
214 /*
215 * VJ's idea. We save last timestamp seen from
216 * the destination in peer table, when entering state
217 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
218 * when trying new connection.
219 */
220 if (peer) {
221 inet_peer_refcheck(peer);
222 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
223 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
224 tp->rx_opt.ts_recent = peer->tcp_ts;
225 }
226 }
227 }
228 214
229 inet->inet_dport = usin->sin_port; 215 inet->inet_dport = usin->sin_port;
230 inet->inet_daddr = daddr; 216 inet->inet_daddr = daddr;
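
The open-coded VJ timestamp recovery collapses into tcp_fetch_timewait_stamp(), which lives in the new tcp_metrics.c and reads the remembered TIME-WAIT timestamp from the metrics store rather than from an inetpeer. In outline:

/* Outline sketch of what the helper is expected to do; tm and its
 * fields are stand-ins for the tcp_metrics storage, not real names
 * from this patch. */
if (tm && (u32)get_seconds() - tm->ts_stamp <= TCP_PAWS_MSL) {
	tp->rx_opt.ts_recent_stamp = tm->ts_stamp;
	tp->rx_opt.ts_recent = tm->ts;
}
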
@@ -698,8 +684,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
698 684
699 net = dev_net(skb_dst(skb)->dev); 685 net = dev_net(skb_dst(skb)->dev);
700 arg.tos = ip_hdr(skb)->tos; 686 arg.tos = ip_hdr(skb)->tos;
701 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 687 ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
702 &arg, arg.iov[0].iov_len); 688 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
703 689
704 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 690 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
705 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); 691 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -781,8 +767,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
781 if (oif) 767 if (oif)
782 arg.bound_dev_if = oif; 768 arg.bound_dev_if = oif;
783 arg.tos = tos; 769 arg.tos = tos;
784 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 770 ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
785 &arg, arg.iov[0].iov_len); 771 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
786 772
787 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 773 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
788} 774}
@@ -825,7 +811,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
825static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 811static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
826 struct request_sock *req, 812 struct request_sock *req,
827 struct request_values *rvp, 813 struct request_values *rvp,
828 u16 queue_mapping) 814 u16 queue_mapping,
815 bool nocache)
829{ 816{
830 const struct inet_request_sock *ireq = inet_rsk(req); 817 const struct inet_request_sock *ireq = inet_rsk(req);
831 struct flowi4 fl4; 818 struct flowi4 fl4;
@@ -833,7 +820,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
833 struct sk_buff * skb; 820 struct sk_buff * skb;
834 821
835 /* First, grab a route. */ 822 /* First, grab a route. */
836 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 823 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
837 return -1; 824 return -1;
838 825
839 skb = tcp_make_synack(sk, dst, req, rvp); 826 skb = tcp_make_synack(sk, dst, req, rvp);
@@ -848,7 +835,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
848 err = net_xmit_eval(err); 835 err = net_xmit_eval(err);
849 } 836 }
850 837
851 dst_release(dst);
852 return err; 838 return err;
853} 839}
854 840
@@ -856,7 +842,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
856 struct request_values *rvp) 842 struct request_values *rvp)
857{ 843{
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 844 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
859 return tcp_v4_send_synack(sk, NULL, req, rvp, 0); 845 return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
860} 846}
861 847
862/* 848/*
@@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1361 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376 req->cookie_ts = tmp_opt.tstamp_ok; 1362 req->cookie_ts = tmp_opt.tstamp_ok;
1377 } else if (!isn) { 1363 } else if (!isn) {
1378 struct inet_peer *peer = NULL;
1379 struct flowi4 fl4; 1364 struct flowi4 fl4;
1380 1365
1381 /* VJ's idea. We save last timestamp seen 1366 /* VJ's idea. We save last timestamp seen
@@ -1389,13 +1374,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1389 */ 1374 */
1390 if (tmp_opt.saw_tstamp && 1375 if (tmp_opt.saw_tstamp &&
1391 tcp_death_row.sysctl_tw_recycle && 1376 tcp_death_row.sysctl_tw_recycle &&
1392 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && 1377 (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
1393 fl4.daddr == saddr && 1378 fl4.daddr == saddr) {
1394 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { 1379 if (!tcp_peer_is_proven(req, dst, true)) {
1395 inet_peer_refcheck(peer);
1396 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1397 (s32)(peer->tcp_ts - req->ts_recent) >
1398 TCP_PAWS_WINDOW) {
1399 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1380 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1400 goto drop_and_release; 1381 goto drop_and_release;
1401 } 1382 }
@@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1404 else if (!sysctl_tcp_syncookies && 1385 else if (!sysctl_tcp_syncookies &&
1405 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1386 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1406 (sysctl_max_syn_backlog >> 2)) && 1387 (sysctl_max_syn_backlog >> 2)) &&
1407 (!peer || !peer->tcp_ts_stamp) && 1388 !tcp_peer_is_proven(req, dst, false)) {
1408 (!dst || !dst_metric(dst, RTAX_RTT))) {
1409 /* Without syncookies last quarter of 1389 /* Without syncookies last quarter of
1410 * backlog is filled with destinations, 1390 * backlog is filled with destinations,
1411 * proven to be alive. 1391 * proven to be alive.
@@ -1425,7 +1405,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1425 1405
1426 if (tcp_v4_send_synack(sk, dst, req, 1406 if (tcp_v4_send_synack(sk, dst, req,
1427 (struct request_values *)&tmp_ext, 1407 (struct request_values *)&tmp_ext,
1428 skb_get_queue_mapping(skb)) || 1408 skb_get_queue_mapping(skb),
1409 want_cookie) ||
1429 want_cookie) 1410 want_cookie)
1430 goto drop_and_free; 1411 goto drop_and_free;
1431 1412
@@ -1672,6 +1653,51 @@ csum_err:
1672} 1653}
1673EXPORT_SYMBOL(tcp_v4_do_rcv); 1654EXPORT_SYMBOL(tcp_v4_do_rcv);
1674 1655
1656void tcp_v4_early_demux(struct sk_buff *skb)
1657{
1658 struct net *net = dev_net(skb->dev);
1659 const struct iphdr *iph;
1660 const struct tcphdr *th;
1661 struct net_device *dev;
1662 struct sock *sk;
1663
1664 if (skb->pkt_type != PACKET_HOST)
1665 return;
1666
1667 if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1668 return;
1669
1670 iph = ip_hdr(skb);
1671 th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1672
1673 if (th->doff < sizeof(struct tcphdr) / 4)
1674 return;
1675
1676 if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1677 return;
1678
1679 dev = skb->dev;
1680 sk = __inet_lookup_established(net, &tcp_hashinfo,
1681 iph->saddr, th->source,
1682 iph->daddr, ntohs(th->dest),
1683 dev->ifindex);
1684 if (sk) {
1685 skb->sk = sk;
1686 skb->destructor = sock_edemux;
1687 if (sk->sk_state != TCP_TIME_WAIT) {
1688 struct dst_entry *dst = sk->sk_rx_dst;
1689 if (dst)
1690 dst = dst_check(dst, 0);
1691 if (dst) {
1692 struct rtable *rt = (struct rtable *) dst;
1693
1694 if (rt->rt_iif == dev->ifindex)
1695 skb_dst_set_noref(skb, dst);
1696 }
1697 }
1698 }
1699}
1700
1675/* 1701/*
1676 * From tcp_input.c 1702 * From tcp_input.c
1677 */ 1703 */
@@ -1821,40 +1847,10 @@ do_time_wait:
1821 goto discard_it; 1847 goto discard_it;
1822} 1848}
1823 1849
1824struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1825{
1826 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1827 struct inet_sock *inet = inet_sk(sk);
1828 struct inet_peer *peer;
1829
1830 if (!rt ||
1831 inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1832 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1833 *release_it = true;
1834 } else {
1835 if (!rt->peer)
1836 rt_bind_peer(rt, inet->inet_daddr, 1);
1837 peer = rt->peer;
1838 *release_it = false;
1839 }
1840
1841 return peer;
1842}
1843EXPORT_SYMBOL(tcp_v4_get_peer);
1844
1845void *tcp_v4_tw_get_peer(struct sock *sk)
1846{
1847 const struct inet_timewait_sock *tw = inet_twsk(sk);
1848
1849 return inet_getpeer_v4(tw->tw_daddr, 1);
1850}
1851EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1852
1853static struct timewait_sock_ops tcp_timewait_sock_ops = { 1850static struct timewait_sock_ops tcp_timewait_sock_ops = {
1854 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1851 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1855 .twsk_unique = tcp_twsk_unique, 1852 .twsk_unique = tcp_twsk_unique,
1856 .twsk_destructor= tcp_twsk_destructor, 1853 .twsk_destructor= tcp_twsk_destructor,
1857 .twsk_getpeer = tcp_v4_tw_get_peer,
1858}; 1854};
1859 1855
1860const struct inet_connection_sock_af_ops ipv4_specific = { 1856const struct inet_connection_sock_af_ops ipv4_specific = {
@@ -1863,7 +1859,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1863 .rebuild_header = inet_sk_rebuild_header, 1859 .rebuild_header = inet_sk_rebuild_header,
1864 .conn_request = tcp_v4_conn_request, 1860 .conn_request = tcp_v4_conn_request,
1865 .syn_recv_sock = tcp_v4_syn_recv_sock, 1861 .syn_recv_sock = tcp_v4_syn_recv_sock,
1866 .get_peer = tcp_v4_get_peer,
1867 .net_header_len = sizeof(struct iphdr), 1862 .net_header_len = sizeof(struct iphdr),
1868 .setsockopt = ip_setsockopt, 1863 .setsockopt = ip_setsockopt,
1869 .getsockopt = ip_getsockopt, 1864 .getsockopt = ip_getsockopt,
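
tcp_v4_early_demux() in the hunk above refuses to look at a packet until two pskb_may_pull() checks prove that the base TCP header, and then all doff * 4 header bytes, are actually present. A compile-and-run sketch of just that length arithmetic (tcp_header_ok() is an illustrative helper, not a kernel function):

    #include <stdint.h>
    #include <stdio.h>

    #define TCP_BASE_HDRLEN 20  /* sizeof(struct tcphdr) */

    /* doff is the TCP header length in 32-bit words; it must cover the
     * base header, and the buffer must contain all doff * 4 bytes. */
    static int tcp_header_ok(uint8_t doff, size_t bytes_after_ip)
    {
        if (doff < TCP_BASE_HDRLEN / 4)
            return 0;                       /* malformed: too short */
        if (bytes_after_ip < (size_t)doff * 4)
            return 0;                       /* options not pulled in */
        return 1;
    }

    int main(void)
    {
        printf("%d\n", tcp_header_ok(5, 20));   /* 1: no options */
        printf("%d\n", tcp_header_ok(4, 20));   /* 0: doff below minimum */
        printf("%d\n", tcp_header_ok(8, 24));   /* 0: truncated options */
        return 0;
    }
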
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..1fd83d3118fe
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,697 @@
1#include <linux/rcupdate.h>
2#include <linux/spinlock.h>
3#include <linux/jiffies.h>
4#include <linux/bootmem.h>
5#include <linux/module.h>
6#include <linux/cache.h>
7#include <linux/slab.h>
8#include <linux/init.h>
9#include <linux/tcp.h>
10
11#include <net/inet_connection_sock.h>
12#include <net/net_namespace.h>
13#include <net/request_sock.h>
14#include <net/inetpeer.h>
15#include <net/sock.h>
16#include <net/ipv6.h>
17#include <net/dst.h>
18#include <net/tcp.h>
19
20int sysctl_tcp_nometrics_save __read_mostly;
21
22enum tcp_metric_index {
23 TCP_METRIC_RTT,
24 TCP_METRIC_RTTVAR,
25 TCP_METRIC_SSTHRESH,
26 TCP_METRIC_CWND,
27 TCP_METRIC_REORDERING,
28
29 /* Always last. */
30 TCP_METRIC_MAX,
31};
32
33struct tcp_metrics_block {
34 struct tcp_metrics_block __rcu *tcpm_next;
35 struct inetpeer_addr tcpm_addr;
36 unsigned long tcpm_stamp;
37 u32 tcpm_ts;
38 u32 tcpm_ts_stamp;
39 u32 tcpm_lock;
40 u32 tcpm_vals[TCP_METRIC_MAX];
41};
42
43static bool tcp_metric_locked(struct tcp_metrics_block *tm,
44 enum tcp_metric_index idx)
45{
46 return tm->tcpm_lock & (1 << idx);
47}
48
49static u32 tcp_metric_get(struct tcp_metrics_block *tm,
50 enum tcp_metric_index idx)
51{
52 return tm->tcpm_vals[idx];
53}
54
55static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
56 enum tcp_metric_index idx)
57{
58 return msecs_to_jiffies(tm->tcpm_vals[idx]);
59}
60
61static void tcp_metric_set(struct tcp_metrics_block *tm,
62 enum tcp_metric_index idx,
63 u32 val)
64{
65 tm->tcpm_vals[idx] = val;
66}
67
68static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
69 enum tcp_metric_index idx,
70 u32 val)
71{
72 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
73}
74
75static bool addr_same(const struct inetpeer_addr *a,
76 const struct inetpeer_addr *b)
77{
78 const struct in6_addr *a6, *b6;
79
80 if (a->family != b->family)
81 return false;
82 if (a->family == AF_INET)
83 return a->addr.a4 == b->addr.a4;
84
85 a6 = (const struct in6_addr *) &a->addr.a6[0];
86 b6 = (const struct in6_addr *) &b->addr.a6[0];
87
88 return ipv6_addr_equal(a6, b6);
89}
90
91struct tcpm_hash_bucket {
92 struct tcp_metrics_block __rcu *chain;
93};
94
95static DEFINE_SPINLOCK(tcp_metrics_lock);
96
97static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
98{
99 u32 val;
100
101 val = 0;
102 if (dst_metric_locked(dst, RTAX_RTT))
103 val |= 1 << TCP_METRIC_RTT;
104 if (dst_metric_locked(dst, RTAX_RTTVAR))
105 val |= 1 << TCP_METRIC_RTTVAR;
106 if (dst_metric_locked(dst, RTAX_SSTHRESH))
107 val |= 1 << TCP_METRIC_SSTHRESH;
108 if (dst_metric_locked(dst, RTAX_CWND))
109 val |= 1 << TCP_METRIC_CWND;
110 if (dst_metric_locked(dst, RTAX_REORDERING))
111 val |= 1 << TCP_METRIC_REORDERING;
112 tm->tcpm_lock = val;
113
114 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
115 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
116 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
117 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
118 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
119 tm->tcpm_ts = 0;
120 tm->tcpm_ts_stamp = 0;
121}
122
123static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
124 struct inetpeer_addr *addr,
125 unsigned int hash,
126 bool reclaim)
127{
128 struct tcp_metrics_block *tm;
129 struct net *net;
130
131 spin_lock_bh(&tcp_metrics_lock);
132 net = dev_net(dst->dev);
133 if (unlikely(reclaim)) {
134 struct tcp_metrics_block *oldest;
135
136 oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
137 for (tm = rcu_dereference(oldest->tcpm_next); tm;
138 tm = rcu_dereference(tm->tcpm_next)) {
139 if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
140 oldest = tm;
141 }
142 tm = oldest;
143 } else {
144 tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
145 if (!tm)
146 goto out_unlock;
147 }
148 tm->tcpm_addr = *addr;
149 tm->tcpm_stamp = jiffies;
150
151 tcpm_suck_dst(tm, dst);
152
153 if (likely(!reclaim)) {
154 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
155 rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
156 }
157
158out_unlock:
159 spin_unlock_bh(&tcp_metrics_lock);
160 return tm;
161}
162
163#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
164
165static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
166{
167 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
168 tcpm_suck_dst(tm, dst);
169}
170
171#define TCP_METRICS_RECLAIM_DEPTH 5
172#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
173
174static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
175{
176 if (tm)
177 return tm;
178 if (depth > TCP_METRICS_RECLAIM_DEPTH)
179 return TCP_METRICS_RECLAIM_PTR;
180 return NULL;
181}
182
183static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
184 struct net *net, unsigned int hash)
185{
186 struct tcp_metrics_block *tm;
187 int depth = 0;
188
189 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
190 tm = rcu_dereference(tm->tcpm_next)) {
191 if (addr_same(&tm->tcpm_addr, addr))
192 break;
193 depth++;
194 }
195 return tcp_get_encode(tm, depth);
196}
197
198static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
199 struct dst_entry *dst)
200{
201 struct tcp_metrics_block *tm;
202 struct inetpeer_addr addr;
203 unsigned int hash;
204 struct net *net;
205
206 addr.family = req->rsk_ops->family;
207 switch (addr.family) {
208 case AF_INET:
209 addr.addr.a4 = inet_rsk(req)->rmt_addr;
210 hash = (__force unsigned int) addr.addr.a4;
211 break;
212 case AF_INET6:
213 *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
214 hash = ((__force unsigned int) addr.addr.a6[0] ^
215 (__force unsigned int) addr.addr.a6[1] ^
216 (__force unsigned int) addr.addr.a6[2] ^
217 (__force unsigned int) addr.addr.a6[3]);
218 break;
219 default:
220 return NULL;
221 }
222
223 hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
224
225 net = dev_net(dst->dev);
226 hash &= net->ipv4.tcp_metrics_hash_mask;
227
228 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
229 tm = rcu_dereference(tm->tcpm_next)) {
230 if (addr_same(&tm->tcpm_addr, &addr))
231 break;
232 }
233 tcpm_check_stamp(tm, dst);
234 return tm;
235}
236
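
Each lookup here derives its bucket the same way: XOR the (up to four) address words together, fold the high bytes down with three shifted XORs so every input bit influences the low bits, then mask with the power-of-two table size. A standalone sketch of that computation (tcpm_hash() and the 16-slot table are illustrative choices, not taken from the patch; for IPv4 the upper three words are simply zero):

    #include <stdint.h>
    #include <stdio.h>

    static unsigned int tcpm_hash(const uint32_t addr_words[4],
                                  unsigned int mask)
    {
        unsigned int hash = addr_words[0] ^ addr_words[1] ^
                            addr_words[2] ^ addr_words[3];

        /* Fold the upper bytes into the lower ones. */
        hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
        return hash & mask;
    }

    int main(void)
    {
        uint32_t v4[4] = { 0xc0a80001, 0, 0, 0 };   /* 192.168.0.1 */

        printf("bucket %u\n", tcpm_hash(v4, 16 - 1));
        return 0;
    }
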
237static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
238{
239 struct inet6_timewait_sock *tw6;
240 struct tcp_metrics_block *tm;
241 struct inetpeer_addr addr;
242 unsigned int hash;
243 struct net *net;
244
245 addr.family = tw->tw_family;
246 switch (addr.family) {
247 case AF_INET:
248 addr.addr.a4 = tw->tw_daddr;
249 hash = (__force unsigned int) addr.addr.a4;
250 break;
251 case AF_INET6:
252 tw6 = inet6_twsk((struct sock *)tw);
253 *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
254 hash = ((__force unsigned int) addr.addr.a6[0] ^
255 (__force unsigned int) addr.addr.a6[1] ^
256 (__force unsigned int) addr.addr.a6[2] ^
257 (__force unsigned int) addr.addr.a6[3]);
258 break;
259 default:
260 return NULL;
261 }
262
263 hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
264
265 net = twsk_net(tw);
266 hash &= net->ipv4.tcp_metrics_hash_mask;
267
268 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
269 tm = rcu_dereference(tm->tcpm_next)) {
270 if (addr_same(&tm->tcpm_addr, &addr))
271 break;
272 }
273 return tm;
274}
275
276static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
277 struct dst_entry *dst,
278 bool create)
279{
280 struct tcp_metrics_block *tm;
281 struct inetpeer_addr addr;
282 unsigned int hash;
283 struct net *net;
284 bool reclaim;
285
286 addr.family = sk->sk_family;
287 switch (addr.family) {
288 case AF_INET:
289 addr.addr.a4 = inet_sk(sk)->inet_daddr;
290 hash = (__force unsigned int) addr.addr.a4;
291 break;
292 case AF_INET6:
293 *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
294 hash = ((__force unsigned int) addr.addr.a6[0] ^
295 (__force unsigned int) addr.addr.a6[1] ^
296 (__force unsigned int) addr.addr.a6[2] ^
297 (__force unsigned int) addr.addr.a6[3]);
298 break;
299 default:
300 return NULL;
301 }
302
303 hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
304
305 net = dev_net(dst->dev);
306 hash &= net->ipv4.tcp_metrics_hash_mask;
307
308 tm = __tcp_get_metrics(&addr, net, hash);
309 reclaim = false;
310 if (tm == TCP_METRICS_RECLAIM_PTR) {
311 reclaim = true;
312 tm = NULL;
313 }
314 if (!tm && create)
315 tm = tcpm_new(dst, &addr, hash, reclaim);
316 else
317 tcpm_check_stamp(tm, dst);
318
319 return tm;
320}
321
322/* Save metrics learned by this TCP session. This function is called
 323 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
324 * or goes from LAST-ACK to CLOSE.
325 */
326void tcp_update_metrics(struct sock *sk)
327{
328 const struct inet_connection_sock *icsk = inet_csk(sk);
329 struct dst_entry *dst = __sk_dst_get(sk);
330 struct tcp_sock *tp = tcp_sk(sk);
331 struct tcp_metrics_block *tm;
332 unsigned long rtt;
333 u32 val;
334 int m;
335
336 if (sysctl_tcp_nometrics_save || !dst)
337 return;
338
339 if (dst->flags & DST_HOST)
340 dst_confirm(dst);
341
342 rcu_read_lock();
343 if (icsk->icsk_backoff || !tp->srtt) {
344 /* This session failed to estimate rtt. Why?
345 * Probably, no packets returned in time. Reset our
346 * results.
347 */
348 tm = tcp_get_metrics(sk, dst, false);
349 if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
350 tcp_metric_set(tm, TCP_METRIC_RTT, 0);
351 goto out_unlock;
352 } else
353 tm = tcp_get_metrics(sk, dst, true);
354
355 if (!tm)
356 goto out_unlock;
357
358 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
359 m = rtt - tp->srtt;
360
 361 /* If the newly calculated rtt is larger than the stored one, store
 362 * the new one. Otherwise, use EWMA. Remember, rtt overestimation is
 363 * always better than underestimation.
364 */
365 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
366 if (m <= 0)
367 rtt = tp->srtt;
368 else
369 rtt -= (m >> 3);
370 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
371 }
372
373 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
374 unsigned long var;
375
376 if (m < 0)
377 m = -m;
378
379 /* Scale deviation to rttvar fixed point */
380 m >>= 1;
381 if (m < tp->mdev)
382 m = tp->mdev;
383
384 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
385 if (m >= var)
386 var = m;
387 else
388 var -= (var - m) >> 2;
389
390 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
391 }
392
393 if (tcp_in_initial_slowstart(tp)) {
 394 /* Slow start has not finished yet. */
395 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
396 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
397 if (val && (tp->snd_cwnd >> 1) > val)
398 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
399 tp->snd_cwnd >> 1);
400 }
401 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
402 val = tcp_metric_get(tm, TCP_METRIC_CWND);
403 if (tp->snd_cwnd > val)
404 tcp_metric_set(tm, TCP_METRIC_CWND,
405 tp->snd_cwnd);
406 }
407 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
408 icsk->icsk_ca_state == TCP_CA_Open) {
409 /* Cong. avoidance phase, cwnd is reliable. */
410 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
411 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
412 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
413 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
414 val = tcp_metric_get(tm, TCP_METRIC_CWND);
 415 tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
416 }
417 } else {
 418 /* Else slow start did not finish, cwnd is not reliable
 419 * and ssthresh may also be invalid.
420 */
421 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
422 val = tcp_metric_get(tm, TCP_METRIC_CWND);
423 tcp_metric_set(tm, TCP_METRIC_CWND,
424 (val + tp->snd_ssthresh) >> 1);
425 }
426 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
427 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
428 if (val && tp->snd_ssthresh > val)
429 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
430 tp->snd_ssthresh);
431 }
432 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
433 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
434 if (val < tp->reordering &&
435 tp->reordering != sysctl_tcp_reordering)
436 tcp_metric_set(tm, TCP_METRIC_REORDERING,
437 tp->reordering);
438 }
439 }
440 tm->tcpm_stamp = jiffies;
441out_unlock:
442 rcu_read_unlock();
443}
444
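
The RTT merge in tcp_update_metrics() above takes a new sample verbatim when it exceeds the stored value, and otherwise moves the stored value only 1/8 of the way down (rtt -= m >> 3), on the theory that overestimating RTT is safer than underestimating it. The same rule in isolation (merge_rtt() is a hypothetical name; values are in whatever fixed-point unit the caller keeps):

    #include <stdio.h>

    static unsigned long merge_rtt(unsigned long stored, unsigned long srtt)
    {
        long m = (long)(stored - srtt);     /* m = rtt - tp->srtt */

        if (m <= 0)
            return srtt;                    /* larger sample: take it whole */
        return stored - (m >> 3);           /* smaller: EWMA with gain 1/8 */
    }

    int main(void)
    {
        printf("%lu\n", merge_rtt(100, 200));   /* 200: jumps up fast */
        printf("%lu\n", merge_rtt(200, 100));   /* 188: decays slowly */
        return 0;
    }
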
445/* Initialize metrics on socket. */
446
447void tcp_init_metrics(struct sock *sk)
448{
449 struct dst_entry *dst = __sk_dst_get(sk);
450 struct tcp_sock *tp = tcp_sk(sk);
451 struct tcp_metrics_block *tm;
452 u32 val;
453
454 if (dst == NULL)
455 goto reset;
456
457 dst_confirm(dst);
458
459 rcu_read_lock();
460 tm = tcp_get_metrics(sk, dst, true);
461 if (!tm) {
462 rcu_read_unlock();
463 goto reset;
464 }
465
466 if (tcp_metric_locked(tm, TCP_METRIC_CWND))
467 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
468
469 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
470 if (val) {
471 tp->snd_ssthresh = val;
472 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
473 tp->snd_ssthresh = tp->snd_cwnd_clamp;
474 } else {
 475 /* ssthresh may have been reduced unnecessarily during
 476 * 3WHS. Restore it to its initial default.
477 */
478 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
479 }
480 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
481 if (val && tp->reordering != val) {
482 tcp_disable_fack(tp);
483 tcp_disable_early_retrans(tp);
484 tp->reordering = val;
485 }
486
487 val = tcp_metric_get(tm, TCP_METRIC_RTT);
488 if (val == 0 || tp->srtt == 0) {
489 rcu_read_unlock();
490 goto reset;
491 }
 492 /* The initial rtt is determined from the SYN,SYN-ACK exchange.
 493 * Those segments are small and the rtt may appear much
 494 * lower than the real one. Use the per-destination memory
 495 * to make it more realistic.
 496 *
 497 * A bit of theory. RTT is the time that passes after a "normal" sized
 498 * packet is sent until it is ACKed. In normal circumstances, sending
 499 * small packets forces the peer to delay ACKs, so the calculation is
 500 * correct there too. The algorithm is adaptive and, provided we follow
 501 * the specs, it NEVER underestimates RTT. BUT! If the peer plays tricks
 502 * such as "quick acks" for long enough to drive the RTT down to a low
 503 * value, and then abruptly stops doing so and starts delaying ACKs,
 504 * expect trouble.
 505 */
506 val = msecs_to_jiffies(val);
507 if (val > tp->srtt) {
508 tp->srtt = val;
509 tp->rtt_seq = tp->snd_nxt;
510 }
511 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
512 if (val > tp->mdev) {
513 tp->mdev = val;
514 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
515 }
516 rcu_read_unlock();
517
518 tcp_set_rto(sk);
519reset:
520 if (tp->srtt == 0) {
521 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
522 * 3WHS. This is most likely due to retransmission,
523 * including spurious one. Reset the RTO back to 3secs
524 * from the more aggressive 1sec to avoid more spurious
525 * retransmission.
526 */
527 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
528 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
529 }
530 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
531 * retransmitted. In light of RFC6298 more aggressive 1sec
532 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
533 * retransmission has occurred.
534 */
535 if (tp->total_retrans > 1)
536 tp->snd_cwnd = 1;
537 else
538 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
539 tp->snd_cwnd_stamp = tcp_time_stamp;
540}
541
542bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
543{
544 struct tcp_metrics_block *tm;
545 bool ret;
546
547 if (!dst)
548 return false;
549
550 rcu_read_lock();
551 tm = __tcp_get_metrics_req(req, dst);
552 if (paws_check) {
553 if (tm &&
554 (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
555 (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
556 ret = false;
557 else
558 ret = true;
559 } else {
560 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
561 ret = true;
562 else
563 ret = false;
564 }
565 rcu_read_unlock();
566
567 return ret;
568}
569EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
570
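
tcp_peer_is_proven() keeps the old rt_get_peer() semantics: in paws_check mode a connection is rejected when the remembered timestamp is fresh (within TCP_PAWS_MSL seconds) yet ahead of the incoming ts_recent by more than TCP_PAWS_WINDOW. The (s32) cast is what makes the comparison valid across u32 timestamp wraparound. A runnable sketch of just that arithmetic (paws_reject() is an illustrative helper):

    #include <stdint.h>
    #include <stdio.h>

    #define TCP_PAWS_MSL    60      /* seconds, as in the kernel */
    #define TCP_PAWS_WINDOW 1

    /* Reject when the stored stamp is recent enough to matter and the
     * stored timestamp is ahead of ts_recent by more than the window.
     * The (int32_t) cast keeps the comparison correct across wrap. */
    static int paws_reject(uint32_t now, uint32_t ts_stamp,
                           uint32_t stored_ts, uint32_t ts_recent)
    {
        return (now - ts_stamp) < TCP_PAWS_MSL &&
               (int32_t)(stored_ts - ts_recent) > TCP_PAWS_WINDOW;
    }

    int main(void)
    {
        /* Wrapped timestamps still compare correctly: 0x5 is 21 ticks
         * "after" 0xfffffff0 in serial arithmetic, so this rejects. */
        printf("%d\n", paws_reject(100, 70, 0x00000005, 0xfffffff0));
        return 0;
    }
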
571void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
572{
573 struct tcp_metrics_block *tm;
574
575 rcu_read_lock();
576 tm = tcp_get_metrics(sk, dst, true);
577 if (tm) {
578 struct tcp_sock *tp = tcp_sk(sk);
579
580 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
581 tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
582 tp->rx_opt.ts_recent = tm->tcpm_ts;
583 }
584 }
585 rcu_read_unlock();
586}
587EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
588
589/* VJ's idea. Save last timestamp seen from this destination and hold
590 * it at least for normal timewait interval to use for duplicate
591 * segment detection in subsequent connections, before they enter
592 * synchronized state.
593 */
594bool tcp_remember_stamp(struct sock *sk)
595{
596 struct dst_entry *dst = __sk_dst_get(sk);
597 bool ret = false;
598
599 if (dst) {
600 struct tcp_metrics_block *tm;
601
602 rcu_read_lock();
603 tm = tcp_get_metrics(sk, dst, true);
604 if (tm) {
605 struct tcp_sock *tp = tcp_sk(sk);
606
607 if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
608 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
609 tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
610 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
611 tm->tcpm_ts = tp->rx_opt.ts_recent;
612 }
613 ret = true;
614 }
615 rcu_read_unlock();
616 }
617 return ret;
618}
619
620bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
621{
622 struct tcp_metrics_block *tm;
623 bool ret = false;
624
625 rcu_read_lock();
626 tm = __tcp_get_metrics_tw(tw);
 627 if (tm) {
628 const struct tcp_timewait_sock *tcptw;
629 struct sock *sk = (struct sock *) tw;
630
631 tcptw = tcp_twsk(sk);
632 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
633 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
634 tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
635 tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
636 tm->tcpm_ts = tcptw->tw_ts_recent;
637 }
638 ret = true;
639 }
640 rcu_read_unlock();
641
642 return ret;
643}
644
645static unsigned long tcpmhash_entries;
646static int __init set_tcpmhash_entries(char *str)
647{
 648 int ret;
649
650 if (!str)
651 return 0;
652
653 ret = kstrtoul(str, 0, &tcpmhash_entries);
654 if (ret)
655 return 0;
656
657 return 1;
658}
659__setup("tcpmhash_entries=", set_tcpmhash_entries);
660
661static int __net_init tcp_net_metrics_init(struct net *net)
662{
663 int slots, size;
664
665 slots = tcpmhash_entries;
666 if (!slots) {
667 if (totalram_pages >= 128 * 1024)
668 slots = 16 * 1024;
669 else
670 slots = 8 * 1024;
671 }
672
673 size = slots * sizeof(struct tcpm_hash_bucket);
674
675 net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
676 if (!net->ipv4.tcp_metrics_hash)
677 return -ENOMEM;
678
679 net->ipv4.tcp_metrics_hash_mask = (slots - 1);
680
681 return 0;
682}
683
684static void __net_exit tcp_net_metrics_exit(struct net *net)
685{
686 kfree(net->ipv4.tcp_metrics_hash);
687}
688
689static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
690 .init = tcp_net_metrics_init,
691 .exit = tcp_net_metrics_exit,
692};
693
694void __init tcp_metrics_init(void)
695{
696 register_pernet_subsys(&tcp_net_metrics_ops);
697}
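
One thing worth noting about tcp_net_metrics_init() above: the bucket mask is computed as slots - 1, which only selects buckets correctly when slots is a power of two. The built-in defaults (8K/16K) are, but a boot-supplied tcpmhash_entries= value need not be. A guard along these lines would keep the mask valid; roundup_pow2() is a hypothetical helper sketched here for illustration, not part of the patch:

    #include <stdio.h>

    /* Round up to the next power of two so that (slots - 1) is a
     * valid all-ones bucket mask. Hypothetical helper. */
    static unsigned int roundup_pow2(unsigned int slots)
    {
        unsigned int n = 1;

        while (n < slots)
            n <<= 1;
        return n;
    }

    int main(void)
    {
        printf("%u\n", roundup_pow2(12000));    /* 16384 */
        printf("%u\n", roundup_pow2(8192));     /* 8192: already a power */
        return 0;
    }
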
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b85d9fe7d663..65608863fdee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,56 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static bool tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return true;
76 }
77
78 return false;
79}
80
81static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return true;
98 }
99 return false;
100}
101
102static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 52static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
103{ 53{
104 if (seq == s_win) 54 if (seq == s_win)
@@ -327,8 +277,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
327 if (tw != NULL) { 277 if (tw != NULL) {
328 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 278 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
329 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 279 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
280 struct inet_sock *inet = inet_sk(sk);
330 281
331 tw->tw_transparent = inet_sk(sk)->transparent; 282 tw->tw_transparent = inet->transparent;
332 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 283 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
333 tcptw->tw_rcv_nxt = tp->rcv_nxt; 284 tcptw->tw_rcv_nxt = tp->rcv_nxt;
334 tcptw->tw_snd_nxt = tp->snd_nxt; 285 tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -403,6 +354,7 @@ void tcp_twsk_destructor(struct sock *sk)
403{ 354{
404#ifdef CONFIG_TCP_MD5SIG 355#ifdef CONFIG_TCP_MD5SIG
405 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 356 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
357
406 if (twsk->tw_md5_key) { 358 if (twsk->tw_md5_key) {
407 tcp_free_md5sig_pool(); 359 tcp_free_md5sig_pool();
408 kfree_rcu(twsk->tw_md5_key, rcu); 360 kfree_rcu(twsk->tw_md5_key, rcu);
@@ -435,6 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
435 struct tcp_sock *oldtp = tcp_sk(sk); 387 struct tcp_sock *oldtp = tcp_sk(sk);
436 struct tcp_cookie_values *oldcvp = oldtp->cookie_values; 388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
437 389
390 newsk->sk_rx_dst = dst_clone(skb_dst(skb));
391
438 /* TCP Cookie Transactions require space for the cookie pair, 392 /* TCP Cookie Transactions require space for the cookie pair,
439 * as it differs for each connection. There is no need to 393 * as it differs for each connection. There is no need to
440 * copy any s_data_payload stored at the original socket. 394 * copy any s_data_payload stored at the original socket.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 803cbfe82fbc..c465d3e51e28 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2442,7 +2442,16 @@ int tcp_send_synack(struct sock *sk)
2442 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2442 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2443} 2443}
2444 2444
2445/* Prepare a SYN-ACK. */ 2445/**
2446 * tcp_make_synack - Prepare a SYN-ACK.
 2447 * @sk: listener socket
 2448 * @dst: dst entry attached to the SYNACK
 2449 * @req: request_sock pointer
 2450 * @rvp: request_values pointer
2451 *
2452 * Allocate one skb and build a SYNACK packet.
 2453 * @dst is consumed: caller should not use it again.
2454 */
2446struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2455struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2447 struct request_sock *req, 2456 struct request_sock *req,
2448 struct request_values *rvp) 2457 struct request_values *rvp)
@@ -2461,14 +2470,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2461 2470
2462 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2471 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2463 s_data_desired = cvp->s_data_desired; 2472 s_data_desired = cvp->s_data_desired;
2464 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); 2473 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
2465 if (skb == NULL) 2474 if (unlikely(!skb)) {
2475 dst_release(dst);
2466 return NULL; 2476 return NULL;
2467 2477 }
2468 /* Reserve space for headers. */ 2478 /* Reserve space for headers. */
2469 skb_reserve(skb, MAX_TCP_HEADER); 2479 skb_reserve(skb, MAX_TCP_HEADER);
2470 2480
2471 skb_dst_set(skb, dst_clone(dst)); 2481 skb_dst_set(skb, dst);
2472 2482
2473 mss = dst_metric_advmss(dst); 2483 mss = dst_metric_advmss(dst);
2474 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2484 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
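
The tcp_make_synack() change above also changes reference ownership: the function now consumes @dst on every path, which is why the new alloc_skb() failure branch calls dst_release() itself and skb_dst_set() takes the reference without a dst_clone(). The pattern, modeled with a toy refcount (consume_dst() and toy_dst are illustrative stand-ins, not kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_dst { int refcnt; };

    static void toy_dst_release(struct toy_dst *d)
    {
        if (--d->refcnt == 0)
            free(d);
    }

    /* Consumes the caller's reference on every path, success or failure,
     * mirroring the dst_release() added on the alloc_skb() error branch. */
    static int consume_dst(struct toy_dst *dst, int alloc_ok)
    {
        if (!alloc_ok) {
            toy_dst_release(dst);   /* failure still eats the ref */
            return -1;
        }
        /* success: the reference now belongs to the skb (skb_dst_set) */
        return 0;
    }

    int main(void)
    {
        struct toy_dst *d = malloc(sizeof(*d));

        d->refcnt = 1;
        printf("%d\n", consume_dst(d, 0));  /* -1, and d is freed */
        return 0;
    }
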
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eaca73644e79..ee37d47d472e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -108,6 +108,7 @@
108#include <net/xfrm.h> 108#include <net/xfrm.h>
109#include <trace/events/udp.h> 109#include <trace/events/udp.h>
110#include <linux/static_key.h> 110#include <linux/static_key.h>
111#include <trace/events/skb.h>
111#include "udp_impl.h" 112#include "udp_impl.h"
112 113
113struct udp_table udp_table __read_mostly; 114struct udp_table udp_table __read_mostly;
@@ -615,6 +616,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
615 break; 616 break;
616 case ICMP_DEST_UNREACH: 617 case ICMP_DEST_UNREACH:
617 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 618 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
619 ipv4_sk_update_pmtu(skb, sk, info);
618 if (inet->pmtudisc != IP_PMTUDISC_DONT) { 620 if (inet->pmtudisc != IP_PMTUDISC_DONT) {
619 err = EMSGSIZE; 621 err = EMSGSIZE;
620 harderr = 1; 622 harderr = 1;
@@ -1219,8 +1221,10 @@ try_again:
1219 goto csum_copy_err; 1221 goto csum_copy_err;
1220 } 1222 }
1221 1223
1222 if (err) 1224 if (unlikely(err)) {
1225 trace_kfree_skb(skb, udp_recvmsg);
1223 goto out_free; 1226 goto out_free;
1227 }
1224 1228
1225 if (!peeked) 1229 if (!peeked)
1226 UDP_INC_STATS_USER(sock_net(sk), 1230 UDP_INC_STATS_USER(sock_net(sk),
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 0d3426cb5c4f..87d3fcc302d4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -90,10 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
90 xdst->u.dst.dev = dev; 90 xdst->u.dst.dev = dev;
91 dev_hold(dev); 91 dev_hold(dev);
92 92
93 xdst->u.rt.peer = rt->peer;
94 if (rt->peer)
95 atomic_inc(&rt->peer->refcnt);
96
97 /* Sheit... I remember I did this right. Apparently, 93 /* Sheit... I remember I did this right. Apparently,
98 * it was magically lost, so this code needs audit */ 94 * it was magically lost, so this code needs audit */
99 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 95 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
@@ -102,7 +98,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
102 xdst->u.rt.rt_src = rt->rt_src; 98 xdst->u.rt.rt_src = rt->rt_src;
103 xdst->u.rt.rt_dst = rt->rt_dst; 99 xdst->u.rt.rt_dst = rt->rt_dst;
104 xdst->u.rt.rt_gateway = rt->rt_gateway; 100 xdst->u.rt.rt_gateway = rt->rt_gateway;
105 xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; 101 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
106 102
107 return 0; 103 return 0;
108} 104}
@@ -212,9 +208,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
212 208
213 dst_destroy_metrics_generic(dst); 209 dst_destroy_metrics_generic(dst);
214 210
215 if (likely(xdst->u.rt.peer))
216 inet_putpeer(xdst->u.rt.peer);
217
218 xfrm_dst_destroy(xdst); 211 xfrm_dst_destroy(xdst);
219} 212}
220 213