author		David S. Miller <davem@davemloft.net>	2012-07-11 01:53:57 -0400
committer	David S. Miller <davem@davemloft.net>	2012-07-11 01:53:57 -0400
commit		fdd28d7328f4c3ddf77ba163e257d7dc48392767 (patch)
tree		f8d4221951e6da429f7d252bc167e8d64a83d1f1
parent		ad7eee98bef92481581060801bdfd1b25a6106c0 (diff)
parent		f185071ddf799e194ba015d040d3d49cdbfa7e48 (diff)
Merge branch 'metrics_restructure'
This patch series works towards the goal of minimizing the amount of
things that can change in an ipv4 route.  In a regime where the
routing cache is removed, route changes will lead to cloning in the
FIB tables or similar.

The largest trigger of route metrics writes, TCP, now has its own
cache of dynamic metric state.  The timewait timestamps are stored
there now as well.  As a result of that, pre-cowing metrics is no
longer necessary, and therefore FLOWI_FLAG_PRECOW_METRICS is removed.

Redirect and PMTU handling is moved back into the ipv4 routes.  I'm
sorry for all the headaches that trying to do this in the inetpeer
has caused; it was the wrong approach for sure.

Since metrics become read-only for ipv4 we no longer need the
inetpeer hung off of the ipv4 routes either.  So those disappear too.
Also, timewait sockets no longer need to hold onto an inetpeer either.

After this series, we still have some details to resolve wrt. PMTU
and redirects for a route-cache-less system:

1) With just the plain route cache removal, PMTU will continue to
   work mostly fine.  This is because of how the local route users
   call down into the PMTU update code with the route they already
   hold.

   However, if we wish to cache pre-computed routes in fib_info
   nexthops (which we want for performance), then we need to add
   route cloning for PMTU events.

2) Redirects require more work.  First, redirects must be changed to
   be handled like PMTU.  Wherein we call down into the sockets and
   other entities, and then they call back into the routing code with
   the route they were using.

   So we'll be adding an ->update_nexthop() method alongside
   ->update_pmtu().

   And then, like for PMTU, we'll need cloning support once we start
   caching routes in the fib_info nexthops.

But that's it, we can completely pull the trigger and remove the
routing cache with minimal disruptions.

As it is, this patch series alone helps a lot of things.  For one,
routing cache entry creation should be a lot faster, because we no
longer do inetpeer lookups (even to check if an entry exists).

This patch series also opens the door for non-DST_HOST ipv4 routes,
because nothing fundamentally cares about rt->rt_dst any more.  It
can be removed with the base routing cache removal patch.  In fact,
that was the primary goal of this patch series.

Signed-off-by: David S. Miller <davem@davemloft.net>
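For orientation, here is a rough sketch of the shape of the new
per-destination TCP metrics cache described above.  This is
illustrative C only, not the actual net/ipv4/tcp_metrics.c added by
this series; the entry layout and the names tcpm_entry/tcpm_lookup
are assumptions, with only the netns hash-plus-mask arrangement taken
from the diff below.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical entry layout: per-destination dynamic metric state
 * (RTT, congestion state, timewait timestamps), kept by TCP itself
 * instead of being written into route metrics or inetpeer entries.
 */
struct tcpm_entry {
	struct tcpm_entry *next;	/* hash chain */
	uint32_t daddr;			/* destination address (key) */
	uint32_t srtt, rttvar;		/* cached RTT state */
	uint32_t ssthresh, cwnd;	/* cached congestion state */
	uint32_t ts_recent;		/* timewait timestamp */
	long ts_recent_stamp;
};

/* Hash on the destination and walk the chain; NULL means a miss,
 * in which case a new connection just starts from default metrics.
 */
static struct tcpm_entry *tcpm_lookup(struct tcpm_entry **hash,
				      unsigned int mask, uint32_t daddr)
{
	struct tcpm_entry *tm;

	for (tm = hash[daddr & mask]; tm != NULL; tm = tm->next)
		if (tm->daddr == daddr)
			return tm;
	return NULL;
}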
-rw-r--r--	include/linux/rtnetlink.h	3
-rw-r--r--	include/linux/tcp.h	1
-rw-r--r--	include/net/dst.h	6
-rw-r--r--	include/net/flow.h	5
-rw-r--r--	include/net/inet_connection_sock.h	1
-rw-r--r--	include/net/inet_sock.h	2
-rw-r--r--	include/net/inetpeer.h	8
-rw-r--r--	include/net/netns/ipv4.h	3
-rw-r--r--	include/net/route.h	61
-rw-r--r--	include/net/tcp.h	9
-rw-r--r--	net/core/rtnetlink.c	4
-rw-r--r--	net/decnet/dn_route.c	13
-rw-r--r--	net/ipv4/Makefile	2
-rw-r--r--	net/ipv4/fib_semantics.c	2
-rw-r--r--	net/ipv4/icmp.c	3
-rw-r--r--	net/ipv4/inet_connection_sock.c	2
-rw-r--r--	net/ipv4/inetpeer.c	4
-rw-r--r--	net/ipv4/route.c	349
-rw-r--r--	net/ipv4/tcp.c	2
-rw-r--r--	net/ipv4/tcp_input.c	188
-rw-r--r--	net/ipv4/tcp_ipv4.c	46
-rw-r--r--	net/ipv4/tcp_metrics.c	697
-rw-r--r--	net/ipv4/tcp_minisocks.c	62
-rw-r--r--	net/ipv4/xfrm4_policy.c	8
-rw-r--r--	net/ipv6/icmp.c	4
-rw-r--r--	net/ipv6/ip6_output.c	10
-rw-r--r--	net/ipv6/ndisc.c	8
-rw-r--r--	net/ipv6/route.c	16
-rw-r--r--	net/ipv6/tcp_ipv6.c	49
29 files changed, 837 insertions, 731 deletions
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index ea60b0854109..db71c4ad8624 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -619,8 +619,7 @@ extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
 extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
-			      u32 id, u32 ts, u32 tsage, long expires,
-			      u32 error);
+			      u32 id, long expires, u32 error);
 
 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7d3bcedc062a..2de9cf46f9fc 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -506,7 +506,6 @@ struct tcp_timewait_sock {
 	u32			  tw_rcv_wnd;
 	u32			  tw_ts_recent;
 	long			  tw_ts_recent_stamp;
-	struct inet_peer	  *tw_peer;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
diff --git a/include/net/dst.h b/include/net/dst.h
index b2634e446613..51610468c63d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -209,12 +209,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr
 	return msecs_to_jiffies(dst_metric(dst, metric));
 }
 
-static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
-				      unsigned long rtt)
-{
-	dst_metric_set(dst, metric, jiffies_to_msecs(rtt));
-}
-
 static inline u32
 dst_allfrag(const struct dst_entry *dst)
 {
diff --git a/include/net/flow.h b/include/net/flow.h
index bd524f598561..ce9cb7656b47 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -20,9 +20,8 @@ struct flowi_common {
 	__u8	flowic_proto;
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
-#define FLOWI_FLAG_PRECOW_METRICS	0x02
-#define FLOWI_FLAG_CAN_SLEEP		0x04
-#define FLOWI_FLAG_RT_NOCACHE		0x08
+#define FLOWI_FLAG_CAN_SLEEP		0x02
+#define FLOWI_FLAG_RT_NOCACHE		0x04
 	__u32	flowic_secid;
 };
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index af3c743a40e4..291e7cee14e7 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -43,7 +43,6 @@ struct inet_connection_sock_af_ops {
 	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
 				      struct request_sock *req,
 				      struct dst_entry *dst);
-	struct inet_peer *(*get_peer)(struct sock *sk);
 	u16	    net_header_len;
 	u16	    net_frag_header_len;
 	u16	    sockaddr_len;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ae17e1352d7e..924d7b98ab60 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -245,8 +245,6 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 
 	if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
 		flags |= FLOWI_FLAG_ANYSRC;
-	if (sk->sk_protocol == IPPROTO_TCP)
-		flags |= FLOWI_FLAG_PRECOW_METRICS;
 	return flags;
 }
 
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index c27c8f10ebdc..53f464d7cddc 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -36,25 +36,19 @@ struct inet_peer {
 	u32			metrics[RTAX_MAX];
 	u32			rate_tokens;	/* rate limiting for ICMP */
 	unsigned long		rate_last;
-	unsigned long		pmtu_expires;
-	u32			pmtu_orig;
-	u32			pmtu_learned;
-	struct inetpeer_addr_base redirect_learned;
 	union {
 		struct list_head	gc_list;
 		struct rcu_head		gc_rcu;
 	};
 	/*
 	 * Once inet_peer is queued for deletion (refcnt == -1), following fields
-	 * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp
+	 * are not available: rid, ip_id_count
 	 * We can share memory with rcu_head to help keep inet_peer small.
 	 */
 	union {
 		struct {
 			atomic_t	rid;		/* Frag reception counter */
 			atomic_t	ip_id_count;	/* IP ID for the next packet */
-			__u32		tcp_ts;
-			__u32		tcp_ts_stamp;
 		};
 		struct rcu_head		rcu;
 		struct inet_peer	*gc_next;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 599e48fa97cb..2e089a99d603 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -7,6 +7,7 @@
 
 #include <net/inet_frag.h>
 
+struct tcpm_hash_bucket;
 struct ctl_table_header;
 struct ipv4_devconf;
 struct fib_rules_ops;
@@ -39,6 +40,8 @@ struct netns_ipv4 {
 	struct sock		**icmp_sk;
 	struct sock		*tcp_sock;
 	struct inet_peer_base	*peers;
+	struct tcpm_hash_bucket	*tcp_metrics_hash;
+	unsigned int		tcp_metrics_hash_mask;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
diff --git a/include/net/route.h b/include/net/route.h
index 211e2665139b..52362368af09 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -40,7 +40,6 @@
 #define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
 struct fib_nh;
-struct inet_peer;
 struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
@@ -65,45 +64,10 @@ struct rtable {
 	__be32		rt_gateway;
 
 	/* Miscellaneous cached information */
-	u32		rt_peer_genid;
-	unsigned long	_peer; /* long-living peer info */
+	u32		rt_pmtu;
 	struct fib_info	*fi; /* for client ref to shared metrics */
 };
 
-static inline struct inet_peer *rt_peer_ptr(struct rtable *rt)
-{
-	return inetpeer_ptr(rt->_peer);
-}
-
-static inline bool rt_has_peer(struct rtable *rt)
-{
-	return inetpeer_ptr_is_peer(rt->_peer);
-}
-
-static inline void __rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
-	__inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline bool rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
-	return inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline void rt_init_peer(struct rtable *rt, struct inet_peer_base *base)
-{
-	inetpeer_init_ptr(&rt->_peer, base);
-}
-
-static inline void rt_transfer_peer(struct rtable *rt, struct rtable *ort)
-{
-	rt->_peer = ort->_peer;
-	if (rt_has_peer(ort)) {
-		struct inet_peer *peer = rt_peer_ptr(ort);
-		atomic_inc(&peer->refcnt);
-	}
-}
-
 static inline bool rt_is_input_route(const struct rtable *rt)
 {
 	return rt->rt_route_iif != 0;
@@ -278,8 +242,6 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
 
 	if (inet_sk(sk)->transparent)
 		flow_flags |= FLOWI_FLAG_ANYSRC;
-	if (protocol == IPPROTO_TCP)
-		flow_flags |= FLOWI_FLAG_PRECOW_METRICS;
 	if (can_sleep)
 		flow_flags |= FLOWI_FLAG_CAN_SLEEP;
 
@@ -328,27 +290,6 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
 	return rt;
 }
 
-extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create);
-
-static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create)
-{
-	if (rt_has_peer(rt))
-		return rt_peer_ptr(rt);
-
-	rt_bind_peer(rt, daddr, create);
-	return (rt_has_peer(rt) ? rt_peer_ptr(rt) : NULL);
-}
-
-static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
-{
-	return __rt_get_peer(rt, daddr, 0);
-}
-
-static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr)
-{
-	return __rt_get_peer(rt, daddr, 1);
-}
-
 static inline int inet_iif(const struct sk_buff *skb)
 {
 	return skb_rtable(skb)->rt_iif;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 53fb7d814170..3618fefae049 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,6 +388,13 @@ extern void tcp_enter_frto(struct sock *sk);
 extern void tcp_enter_loss(struct sock *sk, int how);
 extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
+extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_metrics_init(void);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
+extern bool tcp_remember_stamp(struct sock *sk);
+extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
+extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
 extern void tcp_init_sock(struct sock *sk);
 extern unsigned int tcp_poll(struct file * file, struct socket *sock,
@@ -556,6 +563,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 	return (tp->srtt >> 3) + tp->rttvar;
 }
 
+extern void tcp_set_rto(struct sock *sk);
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2b325c340b44..64127eee786d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -615,7 +615,7 @@ nla_put_failure:
 EXPORT_SYMBOL(rtnetlink_put_metrics);
 
 int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
-		       u32 ts, u32 tsage, long expires, u32 error)
+		       long expires, u32 error)
 {
 	struct rta_cacheinfo ci = {
 		.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
@@ -623,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		.rta_clntref = atomic_read(&(dst->__refcnt)),
 		.rta_error = error,
 		.rta_id = id,
-		.rta_ts = ts,
-		.rta_tsage = tsage,
 	};
 
 	if (expires)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6e74b3f110bc..b5594cc73ee1 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1590,7 +1590,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 		goto errout;
 
 	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
-	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires,
 			       rt->dst.error) < 0)
 		goto errout;
 
@@ -1812,12 +1812,11 @@ static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
 	char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
 
 	seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
 		   rt->dst.dev ? rt->dst.dev->name : "*",
 		   dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
 		   dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
 		   atomic_read(&rt->dst.__refcnt),
-		   rt->dst.__use,
-		   (int) dst_metric(&rt->dst, RTAX_RTT));
+		   rt->dst.__use, 0);
 	return 0;
 }
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..5a23e8b37106 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	     tcp_minisocks.o tcp_cong.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ae301c897a19..d71bfbdc0bf4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -794,6 +794,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			val = nla_get_u32(nla);
 			if (type == RTAX_ADVMSS && val > 65535 - 40)
 				val = 65535 - 40;
+			if (type == RTAX_MTU && val > 65535 - 15)
+				val = 65535 - 15;
 			fi->fib_metrics[type - 1] = val;
 		}
 	}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4bce5a2830aa..4a049449305f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -254,9 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 
 	/* Limit if icmp type is enabled in ratemask. */
 	if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
-		struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr);
+		struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
 		rc = inet_peer_xrlim_allow(peer,
 					   net->ipv4.sysctl_icmp_ratelimit);
+		inet_putpeer(peer);
 	}
 out:
 	return rc;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 034ddbe42adf..76825be3b643 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -375,7 +375,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct ip_options_rcu *opt = inet_rsk(req)->opt;
 	struct net *net = sock_net(sk);
-	int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS;
+	int flags = inet_sk_flowi_flags(sk);
 
 	if (nocache)
 		flags |= FLOWI_FLAG_RT_NOCACHE;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index da90a8cab614..e1e0a4e8fd34 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -508,13 +508,9 @@ relookup:
 			       (daddr->family == AF_INET) ?
 			       secure_ip_id(daddr->addr.a4) :
 			       secure_ipv6_id(daddr->addr.a6));
-		p->tcp_ts_stamp = 0;
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
 		p->rate_last = 0;
-		p->pmtu_expires = 0;
-		p->pmtu_orig = 0;
-		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
 		INIT_LIST_HEAD(&p->gc_list);
 
 		/* Link the node. */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 72e88c208025..95bfa1ba5b28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -158,34 +158,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 
 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
-	struct rtable *rt = (struct rtable *) dst;
-	struct inet_peer *peer;
-	u32 *p = NULL;
-
-	peer = rt_get_peer_create(rt, rt->rt_dst);
-	if (peer) {
-		u32 *old_p = __DST_METRICS_PTR(old);
-		unsigned long prev, new;
-
-		p = peer->metrics;
-		if (inet_metrics_new(peer))
-			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
-
-		new = (unsigned long) p;
-		prev = cmpxchg(&dst->_metrics, old, new);
-
-		if (prev != old) {
-			p = __DST_METRICS_PTR(prev);
-			if (prev & DST_METRICS_READ_ONLY)
-				p = NULL;
-		} else {
-			if (rt->fi) {
-				fib_info_put(rt->fi);
-				rt->fi = NULL;
-			}
-		}
-	}
-	return p;
+	WARN_ON(1);
+	return NULL;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -423,18 +397,16 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 		int len;
 
 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 			   r->dst.dev ? r->dst.dev->name : "*",
 			   (__force u32)r->rt_dst,
 			   (__force u32)r->rt_gateway,
 			   r->rt_flags, atomic_read(&r->dst.__refcnt),
 			   r->dst.__use, 0, (__force u32)r->rt_src,
 			   dst_metric_advmss(&r->dst) + 40,
-			   dst_metric(&r->dst, RTAX_WINDOW),
-			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
-				 dst_metric(&r->dst, RTAX_RTTVAR)),
+			   dst_metric(&r->dst, RTAX_WINDOW), 0,
 			   r->rt_key_tos,
 			   -1, 0, 0, &len);
 
 		seq_printf(seq, "%*s\n", 127 - len, "");
 	}
@@ -671,7 +643,7 @@ static inline int rt_fast_clean(struct rtable *rth)
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
+		rth->dst.expires;
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -917,7 +889,6 @@ static void rt_cache_invalidate(struct net *net)
 
 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-	inetpeer_invalidate_family(AF_INET);
 }
 
 /*
@@ -1244,31 +1215,6 @@ skip_hashing:
 	return rt;
 }
 
-static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt_peer_genid(void)
-{
-	return atomic_read(&__rt_peer_genid);
-}
-
-void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
-{
-	struct inet_peer_base *base;
-	struct inet_peer *peer;
-
-	base = inetpeer_base_ptr(rt->_peer);
-	if (!base)
-		return;
-
-	peer = inet_getpeer_v4(base, daddr, create);
-	if (peer) {
-		if (!rt_set_peer(rt, peer))
-			inet_putpeer(peer);
-		else
-			rt->rt_peer_genid = rt_peer_genid();
-	}
-}
-
 /*
  * Peer allocation may fail only in serious out-of-memory conditions. However
  * we still can generate some output.
@@ -1291,20 +1237,15 @@ static void ip_select_fb_ident(struct iphdr *iph)
 
 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 {
-	struct rtable *rt = (struct rtable *) dst;
+	struct net *net = dev_net(dst->dev);
+	struct inet_peer *peer;
 
-	if (rt && !(rt->dst.flags & DST_NOPEER)) {
-		struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
-
-		/* If peer is attached to destination, it is never detached,
-		   so that we need not to grab a lock to dereference it.
-		 */
-		if (peer) {
-			iph->id = htons(inet_getid(peer, more));
-			return;
-		}
-	} else if (!rt)
-		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
+	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
+	if (peer) {
+		iph->id = htons(inet_getid(peer, more));
+		inet_putpeer(peer);
+		return;
+	}
 
 	ip_select_fb_ident(iph);
 }
@@ -1330,30 +1271,6 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
-{
-	struct rtable *rt = (struct rtable *) dst;
-	__be32 orig_gw = rt->rt_gateway;
-	struct neighbour *n;
-
-	dst_confirm(&rt->dst);
-
-	rt->rt_gateway = peer->redirect_learned.a4;
-
-	n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
-	if (!n) {
-		rt->rt_gateway = orig_gw;
-		return;
-	}
-	if (!(n->nud_state & NUD_VALID)) {
-		neigh_event_send(n, NULL);
-	} else {
-		rt->rt_flags |= RTCF_REDIRECTED;
-		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
-	}
-	neigh_release(n);
-}
-
 /* called in rcu_read_lock() section */
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
@@ -1362,7 +1279,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	__be32 skeys[2] = { saddr, 0 };
 	int ikeys[2] = { dev->ifindex, 0 };
-	struct inet_peer *peer;
 	struct net *net;
 
 	if (!in_dev)
@@ -1395,6 +1311,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			rthp = &rt_hash_table[hash].chain;
 
 			while ((rt = rcu_dereference(*rthp)) != NULL) {
+				struct neighbour *n;
+
 				rthp = &rt->dst.rt_next;
 
 				if (rt->rt_key_dst != daddr ||
@@ -1408,13 +1326,16 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				    rt->rt_gateway != old_gw)
 					continue;
 
-				peer = rt_get_peer_create(rt, rt->rt_dst);
-				if (peer) {
-					if (peer->redirect_learned.a4 != new_gw) {
-						peer->redirect_learned.a4 = new_gw;
-						atomic_inc(&__rt_peer_genid);
+				n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+				if (n) {
+					if (!(n->nud_state & NUD_VALID)) {
+						neigh_event_send(n, NULL);
+					} else {
+						rt->rt_gateway = new_gw;
+						rt->rt_flags |= RTCF_REDIRECTED;
+						call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 					}
-					check_peer_redir(&rt->dst, peer);
+					neigh_release(n);
 				}
 			}
 		}
@@ -1432,23 +1353,6 @@ reject_redirect:
 		;
 }
 
-static bool peer_pmtu_expired(struct inet_peer *peer)
-{
-	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
-	return orig &&
-	       time_after_eq(jiffies, orig) &&
-	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
-static bool peer_pmtu_cleaned(struct inet_peer *peer)
-{
-	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
-	return orig &&
-	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *)dst;
@@ -1458,16 +1362,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
-		} else if (rt->rt_flags & RTCF_REDIRECTED) {
+		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+			   rt->dst.expires) {
 			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
 						rt->rt_oif,
 						rt_genid(dev_net(dst->dev)));
 			rt_del(hash, rt);
 			ret = NULL;
-		} else if (rt_has_peer(rt)) {
-			struct inet_peer *peer = rt_peer_ptr(rt);
-			if (peer_pmtu_expired(peer))
-				dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
 		}
 	}
 	return ret;
@@ -1494,6 +1395,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	struct rtable *rt = skb_rtable(skb);
 	struct in_device *in_dev;
 	struct inet_peer *peer;
+	struct net *net;
 	int log_martians;
 
 	rcu_read_lock();
@@ -1505,7 +1407,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 	rcu_read_unlock();
 
-	peer = rt_get_peer_create(rt, rt->rt_dst);
+	net = dev_net(rt->dst.dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 	if (!peer) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 		return;
@@ -1522,7 +1425,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	 */
 	if (peer->rate_tokens >= ip_rt_redirect_number) {
 		peer->rate_last = jiffies;
-		return;
+		goto out_put_peer;
 	}
 
 	/* Check for load limit; set rate_last to the latest sent
@@ -1543,6 +1446,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 			     &rt->rt_dst, &rt->rt_gateway);
 #endif
 	}
+out_put_peer:
+	inet_putpeer(peer);
 }
 
 static int ip_error(struct sk_buff *skb)
@@ -1585,7 +1490,7 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	}
 
-	peer = rt_get_peer_create(rt, rt->rt_dst);
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 
 	send = true;
 	if (peer) {
@@ -1598,6 +1503,7 @@ static int ip_error(struct sk_buff *skb)
 			peer->rate_tokens -= ip_rt_error_cost;
 		else
 			send = false;
+		inet_putpeer(peer);
 	}
 	if (send)
 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -1606,50 +1512,17 @@ out: kfree_skb(skb);
 	return 0;
 }
 
-static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
-{
-	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
-
-	if (!expires)
-		return;
-	if (time_before(jiffies, expires)) {
-		u32 orig_dst_mtu = dst_mtu(dst);
-		if (peer->pmtu_learned < orig_dst_mtu) {
-			if (!peer->pmtu_orig)
-				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
-			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
-		}
-	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
-		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
-}
-
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 	struct rtable *rt = (struct rtable *) dst;
-	struct inet_peer *peer;
 
 	dst_confirm(dst);
 
-	peer = rt_get_peer_create(rt, rt->rt_dst);
-	if (peer) {
-		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
-
-		if (mtu < ip_rt_min_pmtu)
-			mtu = ip_rt_min_pmtu;
-		if (!pmtu_expires || mtu < peer->pmtu_learned) {
-
-			pmtu_expires = jiffies + ip_rt_mtu_expires;
-			if (!pmtu_expires)
-				pmtu_expires = 1UL;
-
-			peer->pmtu_learned = mtu;
-			peer->pmtu_expires = pmtu_expires;
+	if (mtu < ip_rt_min_pmtu)
+		mtu = ip_rt_min_pmtu;
 
-			atomic_inc(&__rt_peer_genid);
-			rt->rt_peer_genid = rt_peer_genid();
-		}
-		check_peer_pmtu(dst, peer);
-	}
+	rt->rt_pmtu = mtu;
+	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
 }
 
1655void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 1528void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -1660,7 +1533,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1660 struct rtable *rt; 1533 struct rtable *rt;
1661 1534
1662 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, 1535 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1663 protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS, 1536 protocol, flow_flags,
1664 iph->daddr, iph->saddr, 0, 0); 1537 iph->daddr, iph->saddr, 0, 0);
1665 rt = __ip_route_output_key(net, &fl4); 1538 rt = __ip_route_output_key(net, &fl4);
1666 if (!IS_ERR(rt)) { 1539 if (!IS_ERR(rt)) {
@@ -1681,30 +1554,12 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
-static void ipv4_validate_peer(struct rtable *rt)
-{
-	if (rt->rt_peer_genid != rt_peer_genid()) {
-		struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
-
-		if (peer) {
-			check_peer_pmtu(&rt->dst, peer);
-
-			if (peer->redirect_learned.a4 &&
-			    peer->redirect_learned.a4 != rt->rt_gateway)
-				check_peer_redir(&rt->dst, peer);
-		}
-
-		rt->rt_peer_genid = rt_peer_genid();
-	}
-}
-
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
 	if (rt_is_expired(rt))
 		return NULL;
-	ipv4_validate_peer(rt);
 	return dst;
 }
 
@@ -1716,10 +1571,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 		fib_info_put(rt->fi);
 		rt->fi = NULL;
 	}
-	if (rt_has_peer(rt)) {
-		struct inet_peer *peer = rt_peer_ptr(rt);
-		inet_putpeer(peer);
-	}
 }
 
 
@@ -1730,11 +1581,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
 	rt = skb_rtable(skb);
-	if (rt && rt_has_peer(rt)) {
-		struct inet_peer *peer = rt_peer_ptr(rt);
-		if (peer_pmtu_cleaned(peer))
-			dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
-	}
+	if (rt)
+		dst_set_expires(&rt->dst, 0);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1814,7 +1662,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
 static unsigned int ipv4_mtu(const struct dst_entry *dst)
 {
 	const struct rtable *rt = (const struct rtable *) dst;
-	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+	unsigned int mtu = rt->rt_pmtu;
+
+	if (mtu && time_after_eq(jiffies, rt->dst.expires))
+		mtu = 0;
+
+	if (!mtu)
+		mtu = dst_metric_raw(dst, RTAX_MTU);
 
 	if (mtu && rt_is_output_route(rt))
 		return mtu;
@@ -1836,63 +1690,27 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 			    struct fib_info *fi)
 {
-	struct inet_peer_base *base;
-	struct inet_peer *peer;
-	int create = 0;
-
-	/* If a peer entry exists for this destination, we must hook
-	 * it up in order to get at cached metrics.
-	 */
-	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
-		create = 1;
-
-	base = inetpeer_base_ptr(rt->_peer);
-	BUG_ON(!base);
-
-	peer = inet_getpeer_v4(base, rt->rt_dst, create);
-	if (peer) {
-		__rt_set_peer(rt, peer);
-		rt->rt_peer_genid = rt_peer_genid();
-		if (inet_metrics_new(peer))
-			memcpy(peer->metrics, fi->fib_metrics,
-			       sizeof(u32) * RTAX_MAX);
-		dst_init_metrics(&rt->dst, peer->metrics, false);
-
-		check_peer_pmtu(&rt->dst, peer);
-
-		if (peer->redirect_learned.a4 &&
-		    peer->redirect_learned.a4 != rt->rt_gateway) {
-			rt->rt_gateway = peer->redirect_learned.a4;
-			rt->rt_flags |= RTCF_REDIRECTED;
-		}
-	} else {
-		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
-			rt->fi = fi;
-			atomic_inc(&fi->fib_clntref);
-		}
-		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+	if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+		rt->fi = fi;
+		atomic_inc(&fi->fib_clntref);
 	}
+	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 }
 
 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 			   const struct fib_result *res,
 			   struct fib_info *fi, u16 type, u32 itag)
 {
-	struct dst_entry *dst = &rt->dst;
-
 	if (fi) {
 		if (FIB_RES_GW(*res) &&
 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = FIB_RES_GW(*res);
 		rt_init_metrics(rt, fl4, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
+		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
 	}
 
-	if (dst_mtu(dst) > IP_MAX_MTU)
-		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
-
 #ifdef CONFIG_IP_ROUTE_CLASSID
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	set_class_tag(rt, fib_rules_tclass(res));
@@ -1964,9 +1782,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_mark	= skb->mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rth->rt_peer_genid = 0;
-	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
 	rth->fi = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
@@ -2090,9 +1907,8 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_iif	= in_dev->dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_mark	= skb->mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rth->rt_peer_genid = 0;
-	rt_init_peer(rth, &res->table->tb_peers);
 	rth->fi = NULL;
 
 	rth->dst.input = ip_forward;
@@ -2269,9 +2085,8 @@ local_input:
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_mark	= skb->mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rth->rt_peer_genid = 0;
-	rt_init_peer(rth, net->ipv4.peers);
 	rth->fi = NULL;
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -2346,7 +2161,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		    rth->rt_mark == skb->mark &&
 		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			ipv4_validate_peer(rth);
 			if (noref) {
 				dst_use_noref(&rth->dst, jiffies);
 				skb_dst_set_noref(skb, &rth->dst);
@@ -2468,11 +2282,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
 	rth->rt_mark	= fl4->flowi4_mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway = fl4->daddr;
-	rth->rt_peer_genid = 0;
-	rt_init_peer(rth, (res->table ?
-			   &res->table->tb_peers :
-			   dev_net(dev_out)->ipv4.peers));
 	rth->fi = NULL;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
@@ -2726,7 +2537,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
 		     (IPTOS_RT_MASK | RTO_ONLINK)) &&
 		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			ipv4_validate_peer(rth);
 			dst_use(&rth->dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
@@ -2790,7 +2600,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		new->__use = 1;
 		new->input = dst_discard;
 		new->output = dst_discard;
-		dst_copy_metrics(new, &ort->dst);
 
 		new->dev = ort->dst.dev;
 		if (new->dev)
@@ -2803,6 +2612,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
 		rt->rt_mark = ort->rt_mark;
+		rt->rt_pmtu = ort->rt_pmtu;
 
 		rt->rt_genid = rt_genid(net);
 		rt->rt_flags = ort->rt_flags;
@@ -2810,7 +2620,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_dst = ort->rt_dst;
 		rt->rt_src = ort->rt_src;
 		rt->rt_gateway = ort->rt_gateway;
-		rt_transfer_peer(rt, ort);
 		rt->fi = ort->fi;
 		if (rt->fi)
 			atomic_inc(&rt->fi->fib_clntref);
@@ -2848,7 +2657,7 @@ static int rt_fill_info(struct net *net,
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
-	u32 id = 0, ts = 0, tsage = 0, error;
+	u32 error;
 
 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
 	if (nlh == NULL)
@@ -2901,21 +2710,12 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 
 	error = rt->dst.error;
-	if (rt_has_peer(rt)) {
-		const struct inet_peer *peer = rt_peer_ptr(rt);
-		inet_peer_refcheck(peer);
-		id = atomic_read(&peer->ip_id_count) & 0xffff;
-		if (peer->tcp_ts_stamp) {
-			ts = peer->tcp_ts;
-			tsage = get_seconds() - peer->tcp_ts_stamp;
-		}
-		expires = ACCESS_ONCE(peer->pmtu_expires);
-		if (expires) {
-			if (time_before(jiffies, expires))
-				expires -= jiffies;
-			else
-				expires = 0;
-		}
+	expires = rt->dst.expires;
+	if (expires) {
+		if (time_before(jiffies, expires))
+			expires -= jiffies;
+		else
+			expires = 0;
 	}
 
 	if (rt_is_input_route(rt)) {
@@ -2944,8 +2744,7 @@ static int rt_fill_info(struct net *net,
 			goto nla_put_failure;
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
-			       expires, error) < 0)
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
 		goto nla_put_failure;
 
 	return nlmsg_end(skb, nlh);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f60e4e..29aa0c800cd0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3563,6 +3563,8 @@ void __init tcp_init(void)
 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
+	tcp_metrics_init();
+
 	tcp_register_congestion_control(&tcp_reno);
 
 	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ca0d0e7c9778..055ac49b8b40 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
93int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 93int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
94int sysctl_tcp_frto __read_mostly = 2; 94int sysctl_tcp_frto __read_mostly = 2;
95int sysctl_tcp_frto_response __read_mostly; 95int sysctl_tcp_frto_response __read_mostly;
96int sysctl_tcp_nometrics_save __read_mostly;
97 96
98int sysctl_tcp_thin_dupack __read_mostly; 97int sysctl_tcp_thin_dupack __read_mostly;
99 98
@@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
701/* Calculate rto without backoff. This is the second half of Van Jacobson's 700/* Calculate rto without backoff. This is the second half of Van Jacobson's
702 * routine referred to above. 701 * routine referred to above.
703 */ 702 */
704static inline void tcp_set_rto(struct sock *sk) 703void tcp_set_rto(struct sock *sk)
705{ 704{
706 const struct tcp_sock *tp = tcp_sk(sk); 705 const struct tcp_sock *tp = tcp_sk(sk);
707 /* Old crap is replaced with new one. 8) 706 /* Old crap is replaced with new one. 8)
@@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk)
728 tcp_bound_rto(sk); 727 tcp_bound_rto(sk);
729} 728}
730 729
731/* Save metrics learned by this TCP session.
732 This function is called only, when TCP finishes successfully
733 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
734 */
735void tcp_update_metrics(struct sock *sk)
736{
737 struct tcp_sock *tp = tcp_sk(sk);
738 struct dst_entry *dst = __sk_dst_get(sk);
739
740 if (sysctl_tcp_nometrics_save)
741 return;
742
743 if (dst && (dst->flags & DST_HOST)) {
744 const struct inet_connection_sock *icsk = inet_csk(sk);
745 int m;
746 unsigned long rtt;
747
748 dst_confirm(dst);
749
750 if (icsk->icsk_backoff || !tp->srtt) {
751 /* This session failed to estimate rtt. Why?
752 * Probably, no packets returned in time.
753 * Reset our results.
754 */
755 if (!(dst_metric_locked(dst, RTAX_RTT)))
756 dst_metric_set(dst, RTAX_RTT, 0);
757 return;
758 }
759
760 rtt = dst_metric_rtt(dst, RTAX_RTT);
761 m = rtt - tp->srtt;
762
763 /* If newly calculated rtt larger than stored one,
764 * store new one. Otherwise, use EWMA. Remember,
765 * rtt overestimation is always better than underestimation.
766 */
767 if (!(dst_metric_locked(dst, RTAX_RTT))) {
768 if (m <= 0)
769 set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
770 else
771 set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
772 }
773
774 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
775 unsigned long var;
776 if (m < 0)
777 m = -m;
778
779 /* Scale deviation to rttvar fixed point */
780 m >>= 1;
781 if (m < tp->mdev)
782 m = tp->mdev;
783
784 var = dst_metric_rtt(dst, RTAX_RTTVAR);
785 if (m >= var)
786 var = m;
787 else
788 var -= (var - m) >> 2;
789
790 set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
791 }
792
793 if (tcp_in_initial_slowstart(tp)) {
794 /* Slow start still did not finish. */
795 if (dst_metric(dst, RTAX_SSTHRESH) &&
796 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
797 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
798 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
799 if (!dst_metric_locked(dst, RTAX_CWND) &&
800 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
801 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
802 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
803 icsk->icsk_ca_state == TCP_CA_Open) {
804 /* Cong. avoidance phase, cwnd is reliable. */
805 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
806 dst_metric_set(dst, RTAX_SSTHRESH,
807 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
808 if (!dst_metric_locked(dst, RTAX_CWND))
809 dst_metric_set(dst, RTAX_CWND,
810 (dst_metric(dst, RTAX_CWND) +
811 tp->snd_cwnd) >> 1);
812 } else {
813 /* Else slow start did not finish, cwnd is non-sense,
814 ssthresh may be also invalid.
815 */
816 if (!dst_metric_locked(dst, RTAX_CWND))
817 dst_metric_set(dst, RTAX_CWND,
818 (dst_metric(dst, RTAX_CWND) +
819 tp->snd_ssthresh) >> 1);
820 if (dst_metric(dst, RTAX_SSTHRESH) &&
821 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
822 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
823 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
824 }
825
826 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
827 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
828 tp->reordering != sysctl_tcp_reordering)
829 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
830 }
831 }
832}
833
834__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) 730__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
835{ 731{
836 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 732 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
867 * Packet counting of FACK is based on in-order assumptions, therefore TCP 763 * Packet counting of FACK is based on in-order assumptions, therefore TCP
868 * disables it when reordering is detected 764 * disables it when reordering is detected
869 */ 765 */
870static void tcp_disable_fack(struct tcp_sock *tp) 766void tcp_disable_fack(struct tcp_sock *tp)
871{ 767{
872 /* RFC3517 uses different metric in lost marker => reset on change */ 768 /* RFC3517 uses different metric in lost marker => reset on change */
873 if (tcp_is_fack(tp)) 769 if (tcp_is_fack(tp))
@@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
881 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 777 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
882} 778}
883 779
884/* Initialize metrics on socket. */
885
886static void tcp_init_metrics(struct sock *sk)
887{
888 struct tcp_sock *tp = tcp_sk(sk);
889 struct dst_entry *dst = __sk_dst_get(sk);
890
891 if (dst == NULL)
892 goto reset;
893
894 dst_confirm(dst);
895
896 if (dst_metric_locked(dst, RTAX_CWND))
897 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
898 if (dst_metric(dst, RTAX_SSTHRESH)) {
899 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
900 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
901 tp->snd_ssthresh = tp->snd_cwnd_clamp;
902 } else {
903 /* ssthresh may have been reduced unnecessarily during.
904 * 3WHS. Restore it back to its initial default.
905 */
906 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
907 }
908 if (dst_metric(dst, RTAX_REORDERING) &&
909 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
910 tcp_disable_fack(tp);
911 tcp_disable_early_retrans(tp);
912 tp->reordering = dst_metric(dst, RTAX_REORDERING);
913 }
914
915 if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
916 goto reset;
917
918 /* Initial rtt is determined from SYN,SYN-ACK.
919 * The segment is small and rtt may appear much
920 * less than real one. Use per-dst memory
921 * to make it more realistic.
922 *
923 * A bit of theory. RTT is time passed after "normal" sized packet
924 * is sent until it is ACKed. In normal circumstances sending small
925 * packets force peer to delay ACKs and calculation is correct too.
926 * The algorithm is adaptive and, provided we follow specs, it
927 * NEVER underestimate RTT. BUT! If peer tries to make some clever
928 * tricks sort of "quick acks" for time long enough to decrease RTT
929 * to low value, and then abruptly stops to do it and starts to delay
930 * ACKs, wait for troubles.
931 */
932 if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
933 tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
934 tp->rtt_seq = tp->snd_nxt;
935 }
936 if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
937 tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
938 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
939 }
940 tcp_set_rto(sk);
941reset:
942 if (tp->srtt == 0) {
943 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
944 * 3WHS. This is most likely due to retransmission,
945		 * including a spurious one. Reset the RTO back to 3secs
946 * from the more aggressive 1sec to avoid more spurious
947 * retransmission.
948 */
949 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
950 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
951 }
952 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
953 * retransmitted. In light of RFC6298 more aggressive 1sec
954 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
955 * retransmission has occurred.
956 */
957 if (tp->total_retrans > 1)
958 tp->snd_cwnd = 1;
959 else
960 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
961 tp->snd_cwnd_stamp = tcp_time_stamp;
962}
963
964static void tcp_update_reordering(struct sock *sk, const int metric, 780static void tcp_update_reordering(struct sock *sk, const int metric,
965 const int ts) 781 const int ts)
966{ 782{
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 64568fa21d05..ddefd39ac0cf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
209 } 209 }
210 210
211 if (tcp_death_row.sysctl_tw_recycle && 211 if (tcp_death_row.sysctl_tw_recycle &&
212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { 212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); 213 tcp_fetch_timewait_stamp(sk, &rt->dst);
214 /*
215 * VJ's idea. We save last timestamp seen from
216 * the destination in peer table, when entering state
217 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
218 * when trying new connection.
219 */
220 if (peer) {
221 inet_peer_refcheck(peer);
222 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
223 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
224 tp->rx_opt.ts_recent = peer->tcp_ts;
225 }
226 }
227 }
228 214
229 inet->inet_dport = usin->sin_port; 215 inet->inet_dport = usin->sin_port;
230 inet->inet_daddr = daddr; 216 inet->inet_daddr = daddr;
@@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1361 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376 req->cookie_ts = tmp_opt.tstamp_ok; 1362 req->cookie_ts = tmp_opt.tstamp_ok;
1377 } else if (!isn) { 1363 } else if (!isn) {
1378 struct inet_peer *peer = NULL;
1379 struct flowi4 fl4; 1364 struct flowi4 fl4;
1380 1365
1381 /* VJ's idea. We save last timestamp seen 1366 /* VJ's idea. We save last timestamp seen
@@ -1390,12 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1390 if (tmp_opt.saw_tstamp && 1375 if (tmp_opt.saw_tstamp &&
1391 tcp_death_row.sysctl_tw_recycle && 1376 tcp_death_row.sysctl_tw_recycle &&
1392 (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && 1377 (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
1393 fl4.daddr == saddr && 1378 fl4.daddr == saddr) {
1394 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { 1379 if (!tcp_peer_is_proven(req, dst, true)) {
1395 inet_peer_refcheck(peer);
1396 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1397 (s32)(peer->tcp_ts - req->ts_recent) >
1398 TCP_PAWS_WINDOW) {
1399 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1380 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1400 goto drop_and_release; 1381 goto drop_and_release;
1401 } 1382 }
@@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1404 else if (!sysctl_tcp_syncookies && 1385 else if (!sysctl_tcp_syncookies &&
1405 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1386 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1406 (sysctl_max_syn_backlog >> 2)) && 1387 (sysctl_max_syn_backlog >> 2)) &&
1407 (!peer || !peer->tcp_ts_stamp) && 1388 !tcp_peer_is_proven(req, dst, false)) {
1408 (!dst || !dst_metric(dst, RTAX_RTT))) {
1409 /* Without syncookies last quarter of 1389 /* Without syncookies last quarter of
1410 * backlog is filled with destinations, 1390 * backlog is filled with destinations,
1411 * proven to be alive. 1391 * proven to be alive.
@@ -1867,21 +1847,6 @@ do_time_wait:
1867 goto discard_it; 1847 goto discard_it;
1868} 1848}
1869 1849
1870struct inet_peer *tcp_v4_get_peer(struct sock *sk)
1871{
1872 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1873 struct inet_sock *inet = inet_sk(sk);
1874
1875 /* If we don't have a valid cached route, or we're doing IP
1876 * options which make the IPv4 header destination address
1877 * different from our peer's, do not bother with this.
1878 */
1879 if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr)
1880 return NULL;
1881 return rt_get_peer_create(rt, inet->inet_daddr);
1882}
1883EXPORT_SYMBOL(tcp_v4_get_peer);
1884
1885static struct timewait_sock_ops tcp_timewait_sock_ops = { 1850static struct timewait_sock_ops tcp_timewait_sock_ops = {
1886 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1851 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1887 .twsk_unique = tcp_twsk_unique, 1852 .twsk_unique = tcp_twsk_unique,
@@ -1894,7 +1859,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1894 .rebuild_header = inet_sk_rebuild_header, 1859 .rebuild_header = inet_sk_rebuild_header,
1895 .conn_request = tcp_v4_conn_request, 1860 .conn_request = tcp_v4_conn_request,
1896 .syn_recv_sock = tcp_v4_syn_recv_sock, 1861 .syn_recv_sock = tcp_v4_syn_recv_sock,
1897 .get_peer = tcp_v4_get_peer,
1898 .net_header_len = sizeof(struct iphdr), 1862 .net_header_len = sizeof(struct iphdr),
1899 .setsockopt = ip_setsockopt, 1863 .setsockopt = ip_setsockopt,
1900 .getsockopt = ip_getsockopt, 1864 .getsockopt = ip_getsockopt,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..1fd83d3118fe
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,697 @@
1#include <linux/rcupdate.h>
2#include <linux/spinlock.h>
3#include <linux/jiffies.h>
4#include <linux/bootmem.h>
5#include <linux/module.h>
6#include <linux/cache.h>
7#include <linux/slab.h>
8#include <linux/init.h>
9#include <linux/tcp.h>
10
11#include <net/inet_connection_sock.h>
12#include <net/net_namespace.h>
13#include <net/request_sock.h>
14#include <net/inetpeer.h>
15#include <net/sock.h>
16#include <net/ipv6.h>
17#include <net/dst.h>
18#include <net/tcp.h>
19
20int sysctl_tcp_nometrics_save __read_mostly;
21
22enum tcp_metric_index {
23 TCP_METRIC_RTT,
24 TCP_METRIC_RTTVAR,
25 TCP_METRIC_SSTHRESH,
26 TCP_METRIC_CWND,
27 TCP_METRIC_REORDERING,
28
29 /* Always last. */
30 TCP_METRIC_MAX,
31};
32
33struct tcp_metrics_block {
34 struct tcp_metrics_block __rcu *tcpm_next;
35 struct inetpeer_addr tcpm_addr;
36 unsigned long tcpm_stamp;
37 u32 tcpm_ts;
38 u32 tcpm_ts_stamp;
39 u32 tcpm_lock;
40 u32 tcpm_vals[TCP_METRIC_MAX];
41};
42
43static bool tcp_metric_locked(struct tcp_metrics_block *tm,
44 enum tcp_metric_index idx)
45{
46 return tm->tcpm_lock & (1 << idx);
47}
48
49static u32 tcp_metric_get(struct tcp_metrics_block *tm,
50 enum tcp_metric_index idx)
51{
52 return tm->tcpm_vals[idx];
53}
54
55static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
56 enum tcp_metric_index idx)
57{
58 return msecs_to_jiffies(tm->tcpm_vals[idx]);
59}
60
61static void tcp_metric_set(struct tcp_metrics_block *tm,
62 enum tcp_metric_index idx,
63 u32 val)
64{
65 tm->tcpm_vals[idx] = val;
66}
67
68static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
69 enum tcp_metric_index idx,
70 u32 val)
71{
72 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
73}
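
The two helpers above fix the storage unit: RTT and RTTVAR live in tcpm_vals[] as milliseconds and are converted to and from jiffies only at the boundary, so a cached value does not depend on the kernel's HZ. A minimal userspace sketch of that round trip (HZ and both helpers here are illustrative stand-ins for the kernel's):

    #include <stdio.h>

    #define HZ 250	/* hypothetical tick rate */

    static unsigned long msecs_to_jiffies(unsigned int ms)
    {
    	return (unsigned long)ms * HZ / 1000;
    }

    static unsigned int jiffies_to_msecs(unsigned long j)
    {
    	return (unsigned int)(j * 1000 / HZ);
    }

    int main(void)
    {
    	unsigned long srtt = msecs_to_jiffies(200);	/* 50 jiffies at HZ=250 */

    	printf("stored %u ms\n", jiffies_to_msecs(srtt));	/* 200 again */
    	return 0;
    }
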
74
75static bool addr_same(const struct inetpeer_addr *a,
76 const struct inetpeer_addr *b)
77{
78 const struct in6_addr *a6, *b6;
79
80 if (a->family != b->family)
81 return false;
82 if (a->family == AF_INET)
83 return a->addr.a4 == b->addr.a4;
84
85 a6 = (const struct in6_addr *) &a->addr.a6[0];
86 b6 = (const struct in6_addr *) &b->addr.a6[0];
87
88 return ipv6_addr_equal(a6, b6);
89}
90
91struct tcpm_hash_bucket {
92 struct tcp_metrics_block __rcu *chain;
93};
94
95static DEFINE_SPINLOCK(tcp_metrics_lock);
96
97static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
98{
99 u32 val;
100
101 val = 0;
102 if (dst_metric_locked(dst, RTAX_RTT))
103 val |= 1 << TCP_METRIC_RTT;
104 if (dst_metric_locked(dst, RTAX_RTTVAR))
105 val |= 1 << TCP_METRIC_RTTVAR;
106 if (dst_metric_locked(dst, RTAX_SSTHRESH))
107 val |= 1 << TCP_METRIC_SSTHRESH;
108 if (dst_metric_locked(dst, RTAX_CWND))
109 val |= 1 << TCP_METRIC_CWND;
110 if (dst_metric_locked(dst, RTAX_REORDERING))
111 val |= 1 << TCP_METRIC_REORDERING;
112 tm->tcpm_lock = val;
113
114 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
115 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
116 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
117 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
118 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
119 tm->tcpm_ts = 0;
120 tm->tcpm_ts_stamp = 0;
121}
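
tcpm_suck_dst() seeds a cache block from the route it was learned on: it snapshots the five RTAX metrics and records in tcpm_lock which of them the administrator has locked; tcp_update_metrics() below then refuses to touch a locked slot. Such locks normally come from route configuration, e.g. (illustrative syntax, assuming a recent iproute2; see ip-route(8)):

    ip route replace 192.0.2.0/24 via 198.51.100.1 rtt lock 200
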
122
123static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
124 struct inetpeer_addr *addr,
125 unsigned int hash,
126 bool reclaim)
127{
128 struct tcp_metrics_block *tm;
129 struct net *net;
130
131 spin_lock_bh(&tcp_metrics_lock);
132 net = dev_net(dst->dev);
133 if (unlikely(reclaim)) {
134 struct tcp_metrics_block *oldest;
135
136 oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
137 for (tm = rcu_dereference(oldest->tcpm_next); tm;
138 tm = rcu_dereference(tm->tcpm_next)) {
139 if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
140 oldest = tm;
141 }
142 tm = oldest;
143 } else {
144 tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
145 if (!tm)
146 goto out_unlock;
147 }
148 tm->tcpm_addr = *addr;
149 tm->tcpm_stamp = jiffies;
150
151 tcpm_suck_dst(tm, dst);
152
153 if (likely(!reclaim)) {
154 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
155 rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
156 }
157
158out_unlock:
159 spin_unlock_bh(&tcp_metrics_lock);
160 return tm;
161}
162
163#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
164
165static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
166{
167 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
168 tcpm_suck_dst(tm, dst);
169}
170
171#define TCP_METRICS_RECLAIM_DEPTH 5
172#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
173
174static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
175{
176 if (tm)
177 return tm;
178 if (depth > TCP_METRICS_RECLAIM_DEPTH)
179 return TCP_METRICS_RECLAIM_PTR;
180 return NULL;
181}
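
tcp_get_encode() folds three lookup outcomes into a single return value: a real pointer means a hit, NULL means a miss on a short chain (allocate a new block), and the 0x1 sentinel means a miss after walking more than TCP_METRICS_RECLAIM_DEPTH entries, which tcp_get_metrics() below turns into reclaiming the oldest block rather than letting a chain grow without bound. A self-contained userspace sketch of the idiom (all names illustrative):

    #include <stdio.h>

    #define RECLAIM_DEPTH 5
    #define RECLAIM_PTR   ((struct entry *) 0x1UL)

    struct entry { int key; struct entry *next; };

    static struct entry *encode(struct entry *e, int depth)
    {
    	if (e)
    		return e;		/* hit */
    	if (depth > RECLAIM_DEPTH)
    		return RECLAIM_PTR;	/* miss on an overlong chain */
    	return NULL;			/* plain miss: allocate */
    }

    int main(void)
    {
    	struct entry *e = encode(NULL, 7);

    	printf("%s\n", e == RECLAIM_PTR ? "reclaim" : e ? "hit" : "alloc");
    	return 0;
    }
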
182
183static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
184 struct net *net, unsigned int hash)
185{
186 struct tcp_metrics_block *tm;
187 int depth = 0;
188
189 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
190 tm = rcu_dereference(tm->tcpm_next)) {
191 if (addr_same(&tm->tcpm_addr, addr))
192 break;
193 depth++;
194 }
195 return tcp_get_encode(tm, depth);
196}
197
198static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
199 struct dst_entry *dst)
200{
201 struct tcp_metrics_block *tm;
202 struct inetpeer_addr addr;
203 unsigned int hash;
204 struct net *net;
205
206 addr.family = req->rsk_ops->family;
207 switch (addr.family) {
208 case AF_INET:
209 addr.addr.a4 = inet_rsk(req)->rmt_addr;
210 hash = (__force unsigned int) addr.addr.a4;
211 break;
212 case AF_INET6:
213 *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
214 hash = ((__force unsigned int) addr.addr.a6[0] ^
215 (__force unsigned int) addr.addr.a6[1] ^
216 (__force unsigned int) addr.addr.a6[2] ^
217 (__force unsigned int) addr.addr.a6[3]);
218 break;
219 default:
220 return NULL;
221 }
222
223 hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
224
225 net = dev_net(dst->dev);
226 hash &= net->ipv4.tcp_metrics_hash_mask;
227
228 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
229 tm = rcu_dereference(tm->tcpm_next)) {
230 if (addr_same(&tm->tcpm_addr, &addr))
231 break;
232 }
233 tcpm_check_stamp(tm, dst);
234 return tm;
235}
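
The hash is deliberately cheap: take the raw address word(s), XOR the upper bytes down into the low byte, and mask with tcp_metrics_hash_mask, which is valid only because the table size is a power of two. A standalone sketch with concrete numbers (the raw word is illustrative and byte-order dependent):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint32_t hash = 0x0a000001;	/* 10.0.0.1 as one raw word */
    	uint32_t mask = 8 * 1024 - 1;	/* 8K-slot table */

    	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
    	printf("bucket %u\n", hash & mask);	/* prints "bucket 2571" */
    	return 0;
    }
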
236
237static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
238{
239 struct inet6_timewait_sock *tw6;
240 struct tcp_metrics_block *tm;
241 struct inetpeer_addr addr;
242 unsigned int hash;
243 struct net *net;
244
245 addr.family = tw->tw_family;
246 switch (addr.family) {
247 case AF_INET:
248 addr.addr.a4 = tw->tw_daddr;
249 hash = (__force unsigned int) addr.addr.a4;
250 break;
251 case AF_INET6:
252 tw6 = inet6_twsk((struct sock *)tw);
253 *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
254 hash = ((__force unsigned int) addr.addr.a6[0] ^
255 (__force unsigned int) addr.addr.a6[1] ^
256 (__force unsigned int) addr.addr.a6[2] ^
257 (__force unsigned int) addr.addr.a6[3]);
258 break;
259 default:
260 return NULL;
261 }
262
263 hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
264
265 net = twsk_net(tw);
266 hash &= net->ipv4.tcp_metrics_hash_mask;
267
268 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
269 tm = rcu_dereference(tm->tcpm_next)) {
270 if (addr_same(&tm->tcpm_addr, &addr))
271 break;
272 }
273 return tm;
274}
275
276static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
277 struct dst_entry *dst,
278 bool create)
279{
280 struct tcp_metrics_block *tm;
281 struct inetpeer_addr addr;
282 unsigned int hash;
283 struct net *net;
284 bool reclaim;
285
286 addr.family = sk->sk_family;
287 switch (addr.family) {
288 case AF_INET:
289 addr.addr.a4 = inet_sk(sk)->inet_daddr;
290 hash = (__force unsigned int) addr.addr.a4;
291 break;
292 case AF_INET6:
293 *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
294 hash = ((__force unsigned int) addr.addr.a6[0] ^
295 (__force unsigned int) addr.addr.a6[1] ^
296 (__force unsigned int) addr.addr.a6[2] ^
297 (__force unsigned int) addr.addr.a6[3]);
298 break;
299 default:
300 return NULL;
301 }
302
303 hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
304
305 net = dev_net(dst->dev);
306 hash &= net->ipv4.tcp_metrics_hash_mask;
307
308 tm = __tcp_get_metrics(&addr, net, hash);
309 reclaim = false;
310 if (tm == TCP_METRICS_RECLAIM_PTR) {
311 reclaim = true;
312 tm = NULL;
313 }
314 if (!tm && create)
315 tm = tcpm_new(dst, &addr, hash, reclaim);
316 else
317 tcpm_check_stamp(tm, dst);
318
319 return tm;
320}
321
322/* Save metrics learned by this TCP session. This function is called
323 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
324 * or goes from LAST-ACK to CLOSE.
325 */
326void tcp_update_metrics(struct sock *sk)
327{
328 const struct inet_connection_sock *icsk = inet_csk(sk);
329 struct dst_entry *dst = __sk_dst_get(sk);
330 struct tcp_sock *tp = tcp_sk(sk);
331 struct tcp_metrics_block *tm;
332 unsigned long rtt;
333 u32 val;
334 int m;
335
336 if (sysctl_tcp_nometrics_save || !dst)
337 return;
338
339 if (dst->flags & DST_HOST)
340 dst_confirm(dst);
341
342 rcu_read_lock();
343 if (icsk->icsk_backoff || !tp->srtt) {
344 /* This session failed to estimate rtt. Why?
345 * Probably, no packets returned in time. Reset our
346 * results.
347 */
348 tm = tcp_get_metrics(sk, dst, false);
349 if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
350 tcp_metric_set(tm, TCP_METRIC_RTT, 0);
351 goto out_unlock;
352 } else
353 tm = tcp_get_metrics(sk, dst, true);
354
355 if (!tm)
356 goto out_unlock;
357
358 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
359 m = rtt - tp->srtt;
360
361 /* If newly calculated rtt larger than stored one, store new
362 * one. Otherwise, use EWMA. Remember, rtt overestimation is
363 * always better than underestimation.
364 */
365 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
366 if (m <= 0)
367 rtt = tp->srtt;
368 else
369 rtt -= (m >> 3);
370 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
371 }
372
373 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
374 unsigned long var;
375
376 if (m < 0)
377 m = -m;
378
379 /* Scale deviation to rttvar fixed point */
380 m >>= 1;
381 if (m < tp->mdev)
382 m = tp->mdev;
383
384 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
385 if (m >= var)
386 var = m;
387 else
388 var -= (var - m) >> 2;
389
390 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
391 }
392
393 if (tcp_in_initial_slowstart(tp)) {
394 /* Slow start still did not finish. */
395 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
396 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
397 if (val && (tp->snd_cwnd >> 1) > val)
398 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
399 tp->snd_cwnd >> 1);
400 }
401 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
402 val = tcp_metric_get(tm, TCP_METRIC_CWND);
403 if (tp->snd_cwnd > val)
404 tcp_metric_set(tm, TCP_METRIC_CWND,
405 tp->snd_cwnd);
406 }
407 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
408 icsk->icsk_ca_state == TCP_CA_Open) {
409 /* Cong. avoidance phase, cwnd is reliable. */
410 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
411 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
412 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
413 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
414 val = tcp_metric_get(tm, TCP_METRIC_CWND);
415			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
416 }
417 } else {
418		/* Else slow start did not finish, cwnd is nonsense,
419		 * ssthresh may also be invalid.
420 */
421 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
422 val = tcp_metric_get(tm, TCP_METRIC_CWND);
423 tcp_metric_set(tm, TCP_METRIC_CWND,
424 (val + tp->snd_ssthresh) >> 1);
425 }
426 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
427 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
428 if (val && tp->snd_ssthresh > val)
429 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
430 tp->snd_ssthresh);
431 }
432 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
433 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
434 if (val < tp->reordering &&
435 tp->reordering != sysctl_tcp_reordering)
436 tcp_metric_set(tm, TCP_METRIC_REORDERING,
437 tp->reordering);
438 }
439 }
440 tm->tcpm_stamp = jiffies;
441out_unlock:
442 rcu_read_unlock();
443}
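
The RTT write-back above is an asymmetric EWMA: when the session's srtt exceeds the cached value the cache jumps to it immediately, otherwise it decays toward srtt by one eighth of the gap per session, on the theory (spelled out again in tcp_init_metrics() below) that overestimating RTT is always safer than underestimating it. A minimal sketch with concrete values:

    #include <stdio.h>

    /* Mirrors the m <= 0 / rtt -= (m >> 3) logic above. */
    static long update_rtt(long cached, long srtt)
    {
    	long m = cached - srtt;

    	return m <= 0 ? srtt : cached - (m >> 3);
    }

    int main(void)
    {
    	printf("%ld\n", update_rtt(200, 120));	/* 190: decay by 1/8 of gap */
    	printf("%ld\n", update_rtt(200, 260));	/* 260: jump up at once */
    	return 0;
    }
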
444
445/* Initialize metrics on socket. */
446
447void tcp_init_metrics(struct sock *sk)
448{
449 struct dst_entry *dst = __sk_dst_get(sk);
450 struct tcp_sock *tp = tcp_sk(sk);
451 struct tcp_metrics_block *tm;
452 u32 val;
453
454 if (dst == NULL)
455 goto reset;
456
457 dst_confirm(dst);
458
459 rcu_read_lock();
460 tm = tcp_get_metrics(sk, dst, true);
461 if (!tm) {
462 rcu_read_unlock();
463 goto reset;
464 }
465
466 if (tcp_metric_locked(tm, TCP_METRIC_CWND))
467 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
468
469 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
470 if (val) {
471 tp->snd_ssthresh = val;
472 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
473 tp->snd_ssthresh = tp->snd_cwnd_clamp;
474 } else {
475		/* ssthresh may have been reduced unnecessarily during
476		 * 3WHS. Restore it to its initial default.
477 */
478 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
479 }
480 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
481 if (val && tp->reordering != val) {
482 tcp_disable_fack(tp);
483 tcp_disable_early_retrans(tp);
484 tp->reordering = val;
485 }
486
487 val = tcp_metric_get(tm, TCP_METRIC_RTT);
488 if (val == 0 || tp->srtt == 0) {
489 rcu_read_unlock();
490 goto reset;
491 }
492 /* Initial rtt is determined from SYN,SYN-ACK.
493 * The segment is small and rtt may appear much
494 * less than real one. Use per-dst memory
495 * to make it more realistic.
496 *
497 * A bit of theory. RTT is time passed after "normal" sized packet
498 * is sent until it is ACKed. In normal circumstances sending small
499 * packets force peer to delay ACKs and calculation is correct too.
500 * The algorithm is adaptive and, provided we follow specs, it
501 * NEVER underestimate RTT. BUT! If peer tries to make some clever
502 * tricks sort of "quick acks" for time long enough to decrease RTT
503 * to low value, and then abruptly stops to do it and starts to delay
504 * ACKs, wait for troubles.
505 */
506 val = msecs_to_jiffies(val);
507 if (val > tp->srtt) {
508 tp->srtt = val;
509 tp->rtt_seq = tp->snd_nxt;
510 }
511 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
512 if (val > tp->mdev) {
513 tp->mdev = val;
514 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
515 }
516 rcu_read_unlock();
517
518 tcp_set_rto(sk);
519reset:
520 if (tp->srtt == 0) {
521 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
522 * 3WHS. This is most likely due to retransmission,
523		 * including a spurious one. Reset the RTO back to 3secs
524 * from the more aggressive 1sec to avoid more spurious
525 * retransmission.
526 */
527 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
528 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
529 }
530 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
531 * retransmitted. In light of RFC6298 more aggressive 1sec
532 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
533 * retransmission has occurred.
534 */
535 if (tp->total_retrans > 1)
536 tp->snd_cwnd = 1;
537 else
538 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
539 tp->snd_cwnd_stamp = tcp_time_stamp;
540}
541
542bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
543{
544 struct tcp_metrics_block *tm;
545 bool ret;
546
547 if (!dst)
548 return false;
549
550 rcu_read_lock();
551 tm = __tcp_get_metrics_req(req, dst);
552 if (paws_check) {
553 if (tm &&
554 (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
555 (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
556 ret = false;
557 else
558 ret = true;
559 } else {
560 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
561 ret = true;
562 else
563 ret = false;
564 }
565 rcu_read_unlock();
566
567 return ret;
568}
569EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
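
tcp_peer_is_proven() folds the two open-coded inetpeer tests deleted from the conn_request paths above into one predicate: with paws_check true it rejects a SYN whose timestamp runs backwards relative to what this destination last sent within TCP_PAWS_MSL; with paws_check false it reports whether the destination is proven alive (an RTT sample plus a recorded timestamp). A tiny sketch of the wraparound-safe comparison at the heart of the PAWS branch:

    #include <stdio.h>
    #include <stdint.h>

    #define TCP_PAWS_WINDOW 1

    int main(void)
    {
    	uint32_t cached_ts = 4000;	/* last timestamp seen in TIME-WAIT */
    	uint32_t req_ts    = 3000;	/* timestamp on the incoming SYN */

    	/* The s32 cast keeps the test correct across u32 wraparound. */
    	if ((int32_t)(cached_ts - req_ts) > TCP_PAWS_WINDOW)
    		printf("PAWS reject\n");
    	else
    		printf("accept\n");
    	return 0;
    }
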
570
571void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
572{
573 struct tcp_metrics_block *tm;
574
575 rcu_read_lock();
576 tm = tcp_get_metrics(sk, dst, true);
577 if (tm) {
578 struct tcp_sock *tp = tcp_sk(sk);
579
580 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
581 tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
582 tp->rx_opt.ts_recent = tm->tcpm_ts;
583 }
584 }
585 rcu_read_unlock();
586}
587EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
588
589/* VJ's idea. Save last timestamp seen from this destination and hold
590 * it at least for normal timewait interval to use for duplicate
591 * segment detection in subsequent connections, before they enter
592 * synchronized state.
593 */
594bool tcp_remember_stamp(struct sock *sk)
595{
596 struct dst_entry *dst = __sk_dst_get(sk);
597 bool ret = false;
598
599 if (dst) {
600 struct tcp_metrics_block *tm;
601
602 rcu_read_lock();
603 tm = tcp_get_metrics(sk, dst, true);
604 if (tm) {
605 struct tcp_sock *tp = tcp_sk(sk);
606
607 if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
608 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
609 tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
610 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
611 tm->tcpm_ts = tp->rx_opt.ts_recent;
612 }
613 ret = true;
614 }
615 rcu_read_unlock();
616 }
617 return ret;
618}
619
620bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
621{
622 struct tcp_metrics_block *tm;
623 bool ret = false;
624
625 rcu_read_lock();
626 tm = __tcp_get_metrics_tw(tw);
627	if (tm) {
628 const struct tcp_timewait_sock *tcptw;
629 struct sock *sk = (struct sock *) tw;
630
631 tcptw = tcp_twsk(sk);
632 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
633 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
634 tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
635 tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
636 tm->tcpm_ts = tcptw->tw_ts_recent;
637 }
638 ret = true;
639 }
640 rcu_read_unlock();
641
642 return ret;
643}
644
645static unsigned long tcpmhash_entries;
646static int __init set_tcpmhash_entries(char *str)
647{
648 ssize_t ret;
649
650 if (!str)
651 return 0;
652
653 ret = kstrtoul(str, 0, &tcpmhash_entries);
654 if (ret)
655 return 0;
656
657 return 1;
658}
659__setup("tcpmhash_entries=", set_tcpmhash_entries);
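
__setup() registers a kernel command-line hook, so the table can be sized at boot, for example by booting with:

    tcpmhash_entries=16384

On a missing or unparsable value the handler leaves tcpmhash_entries at zero and the built-in defaults in tcp_net_metrics_init() below take over. The value is used as the slot count directly, so it should be a power of two (see the note after the next function).
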
660
661static int __net_init tcp_net_metrics_init(struct net *net)
662{
663 int slots, size;
664
665 slots = tcpmhash_entries;
666 if (!slots) {
667 if (totalram_pages >= 128 * 1024)
668 slots = 16 * 1024;
669 else
670 slots = 8 * 1024;
671 }
672
673 size = slots * sizeof(struct tcpm_hash_bucket);
674
675 net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
676 if (!net->ipv4.tcp_metrics_hash)
677 return -ENOMEM;
678
679 net->ipv4.tcp_metrics_hash_mask = (slots - 1);
680
681 return 0;
682}
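
tcp_metrics_hash_mask is slots - 1, which only addresses every bucket when slots is a power of two; the built-in 8K/16K defaults are, but a boot-time tcpmhash_entries override need not be. A hypothetical hardening, not part of this patch, would round the count up first, as the kernel's roundup_pow_of_two() does; a userspace sketch:

    #include <stdio.h>

    /* Illustrative stand-in for the kernel's roundup_pow_of_two(). */
    static unsigned long roundup_pow_of_two(unsigned long n)
    {
    	unsigned long p = 1;

    	while (p < n)
    		p <<= 1;
    	return p;
    }

    int main(void)
    {
    	unsigned long slots = roundup_pow_of_two(3000);

    	printf("slots=%lu mask=%#lx\n", slots, slots - 1);	/* 4096, 0xfff */
    	return 0;
    }
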
683
684static void __net_exit tcp_net_metrics_exit(struct net *net)
685{
686 kfree(net->ipv4.tcp_metrics_hash);
687}
688
689static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
690 .init = tcp_net_metrics_init,
691 .exit = tcp_net_metrics_exit,
692};
693
694void __init tcp_metrics_init(void)
695{
696 register_pernet_subsys(&tcp_net_metrics_ops);
697}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 72b7c63b1a39..65608863fdee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,52 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static bool tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63
64 peer = icsk->icsk_af_ops->get_peer(sk);
65 if (peer) {
66 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
67 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
68 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
69 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
70 peer->tcp_ts = tp->rx_opt.ts_recent;
71 }
72 return true;
73 }
74
75 return false;
76}
77
78static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
79{
80 const struct tcp_timewait_sock *tcptw;
81 struct sock *sk = (struct sock *) tw;
82 struct inet_peer *peer;
83
84 tcptw = tcp_twsk(sk);
85 peer = tcptw->tw_peer;
86 if (peer) {
87 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
88 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
89 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
90 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
91 peer->tcp_ts = tcptw->tw_ts_recent;
92 }
93 return true;
94 }
95 return false;
96}
97
98static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 52static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
99{ 53{
100 if (seq == s_win) 54 if (seq == s_win)
@@ -313,12 +267,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
313 const struct inet_connection_sock *icsk = inet_csk(sk); 267 const struct inet_connection_sock *icsk = inet_csk(sk);
314 const struct tcp_sock *tp = tcp_sk(sk); 268 const struct tcp_sock *tp = tcp_sk(sk);
315 bool recycle_ok = false; 269 bool recycle_ok = false;
316 bool recycle_on = false;
317 270
318 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) { 271 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
319 recycle_ok = tcp_remember_stamp(sk); 272 recycle_ok = tcp_remember_stamp(sk);
320 recycle_on = true;
321 }
322 273
323 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 274 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
324 tw = inet_twsk_alloc(sk, state); 275 tw = inet_twsk_alloc(sk, state);
@@ -327,7 +278,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
327 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 278 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
328 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 279 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
329 struct inet_sock *inet = inet_sk(sk); 280 struct inet_sock *inet = inet_sk(sk);
330 struct inet_peer *peer = NULL;
331 281
332 tw->tw_transparent = inet->transparent; 282 tw->tw_transparent = inet->transparent;
333 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 283 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -351,12 +301,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
351 } 301 }
352#endif 302#endif
353 303
354 if (recycle_on)
355 peer = icsk->icsk_af_ops->get_peer(sk);
356 tcptw->tw_peer = peer;
357 if (peer)
358 atomic_inc(&peer->refcnt);
359
360#ifdef CONFIG_TCP_MD5SIG 304#ifdef CONFIG_TCP_MD5SIG
361 /* 305 /*
362 * The timewait bucket does not have the key DB from the 306 * The timewait bucket does not have the key DB from the
@@ -408,11 +352,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
408 352
409void tcp_twsk_destructor(struct sock *sk) 353void tcp_twsk_destructor(struct sock *sk)
410{ 354{
355#ifdef CONFIG_TCP_MD5SIG
411 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 356 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
412 357
413 if (twsk->tw_peer)
414 inet_putpeer(twsk->tw_peer);
415#ifdef CONFIG_TCP_MD5SIG
416 if (twsk->tw_md5_key) { 358 if (twsk->tw_md5_key) {
417 tcp_free_md5sig_pool(); 359 tcp_free_md5sig_pool();
418 kfree_rcu(twsk->tw_md5_key, rcu); 360 kfree_rcu(twsk->tw_md5_key, rcu);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9815ea0bca7f..87d3fcc302d4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -90,8 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
90 xdst->u.dst.dev = dev; 90 xdst->u.dst.dev = dev;
91 dev_hold(dev); 91 dev_hold(dev);
92 92
93 rt_transfer_peer(&xdst->u.rt, rt);
94
95 /* Sheit... I remember I did this right. Apparently, 93 /* Sheit... I remember I did this right. Apparently,
96 * it was magically lost, so this code needs audit */ 94 * it was magically lost, so this code needs audit */
97 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 95 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
@@ -100,6 +98,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
100 xdst->u.rt.rt_src = rt->rt_src; 98 xdst->u.rt.rt_src = rt->rt_src;
101 xdst->u.rt.rt_dst = rt->rt_dst; 99 xdst->u.rt.rt_dst = rt->rt_dst;
102 xdst->u.rt.rt_gateway = rt->rt_gateway; 100 xdst->u.rt.rt_gateway = rt->rt_gateway;
101 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
103 102
104 return 0; 103 return 0;
105} 104}
@@ -209,11 +208,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
209 208
210 dst_destroy_metrics_generic(dst); 209 dst_destroy_metrics_generic(dst);
211 210
212 if (rt_has_peer(&xdst->u.rt)) {
213 struct inet_peer *peer = rt_peer_ptr(&xdst->u.rt);
214 inet_putpeer(peer);
215 }
216
217 xfrm_dst_destroy(xdst); 211 xfrm_dst_destroy(xdst);
218} 212}
219 213
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c7da1422cbde..a113f7d7e938 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -194,8 +194,10 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
194 if (rt->rt6i_dst.plen < 128) 194 if (rt->rt6i_dst.plen < 128)
195 tmo >>= ((128 - rt->rt6i_dst.plen)>>5); 195 tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
196 196
197 peer = rt6_get_peer_create(rt); 197 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
198 res = inet_peer_xrlim_allow(peer, tmo); 198 res = inet_peer_xrlim_allow(peer, tmo);
199 if (peer)
200 inet_putpeer(peer);
199 } 201 }
200 dst_release(dst); 202 dst_release(dst);
201 return res; 203 return res;
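
With peers no longer hung off the route, this hunk and the ip6_output.c and ndisc.c hunks that follow converge on one pattern: look the peer up on demand (the final argument 1 asks inet_getpeer_v6() to create a missing entry), run the rate-limit check, which tolerates a NULL peer, and drop the reference straight away. Condensed from the hunks for orientation, not additional patch content:

    peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
    res = inet_peer_xrlim_allow(peer, tmo);	/* handles a NULL peer */
    if (peer)
    	inet_putpeer(peer);
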
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c6af5963a202..5b2d63ed793e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -466,13 +466,15 @@ int ip6_forward(struct sk_buff *skb)
466 else 466 else
467 target = &hdr->daddr; 467 target = &hdr->daddr;
468 468
469 peer = rt6_get_peer_create(rt); 469 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
470 470
471 /* Limit redirects both by destination (here) 471 /* Limit redirects both by destination (here)
472 and by source (inside ndisc_send_redirect) 472 and by source (inside ndisc_send_redirect)
473 */ 473 */
474 if (inet_peer_xrlim_allow(peer, 1*HZ)) 474 if (inet_peer_xrlim_allow(peer, 1*HZ))
475 ndisc_send_redirect(skb, target); 475 ndisc_send_redirect(skb, target);
476 if (peer)
477 inet_putpeer(peer);
476 } else { 478 } else {
477 int addrtype = ipv6_addr_type(&hdr->saddr); 479 int addrtype = ipv6_addr_type(&hdr->saddr);
478 480
@@ -592,10 +594,14 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
592 int old, new; 594 int old, new;
593 595
594 if (rt && !(rt->dst.flags & DST_NOPEER)) { 596 if (rt && !(rt->dst.flags & DST_NOPEER)) {
595 struct inet_peer *peer = rt6_get_peer_create(rt); 597 struct inet_peer *peer;
598 struct net *net;
596 599
600 net = dev_net(rt->dst.dev);
601 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
597 if (peer) { 602 if (peer) {
598 fhdr->identification = htonl(inet_getid(peer, 0)); 603 fhdr->identification = htonl(inet_getid(peer, 0));
604 inet_putpeer(peer);
599 return; 605 return;
600 } 606 }
601 } 607 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 69a6330dea91..0fddd571400d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1486,6 +1486,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
1486 int rd_len; 1486 int rd_len;
1487 int err; 1487 int err;
1488 u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; 1488 u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
1489 bool ret;
1489 1490
1490 if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { 1491 if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
1491 ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", 1492 ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
@@ -1519,8 +1520,11 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
1519 "Redirect: destination is not a neighbour\n"); 1520 "Redirect: destination is not a neighbour\n");
1520 goto release; 1521 goto release;
1521 } 1522 }
1522 peer = rt6_get_peer_create(rt); 1523 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
1523 if (!inet_peer_xrlim_allow(peer, 1*HZ)) 1524 ret = inet_peer_xrlim_allow(peer, 1*HZ);
1525 if (peer)
1526 inet_putpeer(peer);
1527 if (!ret)
1524 goto release; 1528 goto release;
1525 1529
1526 if (dev->addr_len) { 1530 if (dev->addr_len) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6cc6c881f54f..563f12c1c99c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1093,7 +1093,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1093 memset(&fl6, 0, sizeof(fl6)); 1093 memset(&fl6, 0, sizeof(fl6));
1094 fl6.flowi6_oif = oif; 1094 fl6.flowi6_oif = oif;
1095 fl6.flowi6_mark = mark; 1095 fl6.flowi6_mark = mark;
1096 fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS; 1096 fl6.flowi6_flags = 0;
1097 fl6.daddr = iph->daddr; 1097 fl6.daddr = iph->daddr;
1098 fl6.saddr = iph->saddr; 1098 fl6.saddr = iph->saddr;
1099 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; 1099 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
@@ -2348,13 +2348,11 @@ static int rt6_fill_node(struct net *net,
2348 int iif, int type, u32 pid, u32 seq, 2348 int iif, int type, u32 pid, u32 seq,
2349 int prefix, int nowait, unsigned int flags) 2349 int prefix, int nowait, unsigned int flags)
2350{ 2350{
2351 const struct inet_peer *peer;
2352 struct rtmsg *rtm; 2351 struct rtmsg *rtm;
2353 struct nlmsghdr *nlh; 2352 struct nlmsghdr *nlh;
2354 long expires; 2353 long expires;
2355 u32 table; 2354 u32 table;
2356 struct neighbour *n; 2355 struct neighbour *n;
2357 u32 ts, tsage;
2358 2356
2359 if (prefix) { /* user wants prefix routes only */ 2357 if (prefix) { /* user wants prefix routes only */
2360 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { 2358 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
@@ -2473,17 +2471,7 @@ static int rt6_fill_node(struct net *net,
2473 else 2471 else
2474 expires = INT_MAX; 2472 expires = INT_MAX;
2475 2473
2476 peer = NULL; 2474 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2477 if (rt6_has_peer(rt))
2478 peer = rt6_peer_ptr(rt);
2479 ts = tsage = 0;
2480 if (peer && peer->tcp_ts_stamp) {
2481 ts = peer->tcp_ts;
2482 tsage = get_seconds() - peer->tcp_ts_stamp;
2483 }
2484
2485 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2486 expires, rt->dst.error) < 0)
2487 goto nla_put_failure; 2475 goto nla_put_failure;
2488 2476
2489 return nlmsg_end(skb, nlh); 2477 return nlmsg_end(skb, nlh);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6cc67ed6c2e6..61175cb2478f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -277,22 +277,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
277 rt = (struct rt6_info *) dst; 277 rt = (struct rt6_info *) dst;
278 if (tcp_death_row.sysctl_tw_recycle && 278 if (tcp_death_row.sysctl_tw_recycle &&
279 !tp->rx_opt.ts_recent_stamp && 279 !tp->rx_opt.ts_recent_stamp &&
280 ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) { 280 ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr))
281 struct inet_peer *peer = rt6_get_peer(rt); 281 tcp_fetch_timewait_stamp(sk, dst);
282 /*
283 * VJ's idea. We save last timestamp seen from
284 * the destination in peer table, when entering state
285 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
286 * when trying new connection.
287 */
288 if (peer) {
289 inet_peer_refcheck(peer);
290 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
291 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
292 tp->rx_opt.ts_recent = peer->tcp_ts;
293 }
294 }
295 }
296 282
297 icsk->icsk_ext_hdr_len = 0; 283 icsk->icsk_ext_hdr_len = 0;
298 if (np->opt) 284 if (np->opt)
@@ -1134,8 +1120,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1134 treq->iif = inet6_iif(skb); 1120 treq->iif = inet6_iif(skb);
1135 1121
1136 if (!isn) { 1122 if (!isn) {
1137 struct inet_peer *peer = NULL;
1138
1139 if (ipv6_opt_accepted(sk, skb) || 1123 if (ipv6_opt_accepted(sk, skb) ||
1140 np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || 1124 np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
1141 np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { 1125 np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
@@ -1160,14 +1144,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1160 */ 1144 */
1161 if (tmp_opt.saw_tstamp && 1145 if (tmp_opt.saw_tstamp &&
1162 tcp_death_row.sysctl_tw_recycle && 1146 tcp_death_row.sysctl_tw_recycle &&
1163 (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL && 1147 (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
1164 (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL && 1148 if (!tcp_peer_is_proven(req, dst, true)) {
1165 ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
1166 &treq->rmt_addr)) {
1167 inet_peer_refcheck(peer);
1168 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1169 (s32)(peer->tcp_ts - req->ts_recent) >
1170 TCP_PAWS_WINDOW) {
1171 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1149 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1172 goto drop_and_release; 1150 goto drop_and_release;
1173 } 1151 }
@@ -1176,8 +1154,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1176 else if (!sysctl_tcp_syncookies && 1154 else if (!sysctl_tcp_syncookies &&
1177 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1155 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1178 (sysctl_max_syn_backlog >> 2)) && 1156 (sysctl_max_syn_backlog >> 2)) &&
1179 (!peer || !peer->tcp_ts_stamp) && 1157 !tcp_peer_is_proven(req, dst, false)) {
1180 (!dst || !dst_metric(dst, RTAX_RTT))) {
1181 /* Without syncookies last quarter of 1158 /* Without syncookies last quarter of
1182 * backlog is filled with destinations, 1159 * backlog is filled with destinations,
1183 * proven to be alive. 1160 * proven to be alive.
@@ -1712,20 +1689,6 @@ do_time_wait:
1712 goto discard_it; 1689 goto discard_it;
1713} 1690}
1714 1691
1715static struct inet_peer *tcp_v6_get_peer(struct sock *sk)
1716{
1717 struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk);
1718 struct ipv6_pinfo *np = inet6_sk(sk);
1719
1720 /* If we don't have a valid cached route, or we're doing IP
1721 * options which make the IPv6 header destination address
1722 * different from our peer's, do not bother with this.
1723 */
1724 if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr))
1725 return NULL;
1726 return rt6_get_peer_create(rt);
1727}
1728
1729static struct timewait_sock_ops tcp6_timewait_sock_ops = { 1692static struct timewait_sock_ops tcp6_timewait_sock_ops = {
1730 .twsk_obj_size = sizeof(struct tcp6_timewait_sock), 1693 .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
1731 .twsk_unique = tcp_twsk_unique, 1694 .twsk_unique = tcp_twsk_unique,
@@ -1738,7 +1701,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
1738 .rebuild_header = inet6_sk_rebuild_header, 1701 .rebuild_header = inet6_sk_rebuild_header,
1739 .conn_request = tcp_v6_conn_request, 1702 .conn_request = tcp_v6_conn_request,
1740 .syn_recv_sock = tcp_v6_syn_recv_sock, 1703 .syn_recv_sock = tcp_v6_syn_recv_sock,
1741 .get_peer = tcp_v6_get_peer,
1742 .net_header_len = sizeof(struct ipv6hdr), 1704 .net_header_len = sizeof(struct ipv6hdr),
1743 .net_frag_header_len = sizeof(struct frag_hdr), 1705 .net_frag_header_len = sizeof(struct frag_hdr),
1744 .setsockopt = ipv6_setsockopt, 1706 .setsockopt = ipv6_setsockopt,
@@ -1770,7 +1732,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
1770 .rebuild_header = inet_sk_rebuild_header, 1732 .rebuild_header = inet_sk_rebuild_header,
1771 .conn_request = tcp_v6_conn_request, 1733 .conn_request = tcp_v6_conn_request,
1772 .syn_recv_sock = tcp_v6_syn_recv_sock, 1734 .syn_recv_sock = tcp_v6_syn_recv_sock,
1773 .get_peer = tcp_v4_get_peer,
1774 .net_header_len = sizeof(struct iphdr), 1735 .net_header_len = sizeof(struct iphdr),
1775 .setsockopt = ipv6_setsockopt, 1736 .setsockopt = ipv6_setsockopt,
1776 .getsockopt = ipv6_getsockopt, 1737 .getsockopt = ipv6_getsockopt,