author     David S. Miller <davem@davemloft.net>  2012-07-10 03:49:14 -0400
committer  David S. Miller <davem@davemloft.net>  2012-07-11 01:39:57 -0400
commit     51c5d0c4b169bf762f09e0d5b283a7f0b2a45739
tree       9dd99b27be4dc469954a2d67515593c9f71cbcd0
parent     ab92bb2f679d66c7e12a6b1c0cdd76fe308f6546
tcp: Maintain dynamic metrics in local cache.
Maintain a local hash table of TCP dynamic metrics blobs.  Computed
TCP metrics are no longer maintained in the route metrics.

The table uses RCU and an extremely simple hash so that it has low
latency and low overhead.  A simple hash is legitimate because we only
make metrics blobs for fully established connections.

The default hash table sizes, metric timeouts, and the hash chain
length limit certainly could use some tweaking, but the basic design
seems sound.

With help from Eric Dumazet and Joe Perches.

Signed-off-by: David S. Miller <davem@davemloft.net>
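
The patch's central structure is an RCU-protected chained hash table:
lookups walk a bucket's chain under rcu_read_lock(), while insertions
serialize on a spinlock and publish new nodes with
rcu_assign_pointer().  A minimal sketch of that pattern, with
illustrative names rather than the patch's own code:

    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    #define NR_BUCKETS 1024            /* power of two, like the real table */

    struct node {
        struct node __rcu *next;
        u32 key;
    };

    static struct node __rcu *bucket[NR_BUCKETS];
    static DEFINE_SPINLOCK(table_lock);

    /* Reader side: caller holds rcu_read_lock(); safe against
     * concurrent inserts, no other locking needed.
     */
    static struct node *lookup(u32 key)
    {
        struct node *n;

        for (n = rcu_dereference(bucket[key & (NR_BUCKETS - 1)]); n;
             n = rcu_dereference(n->next))
            if (n->key == key)
                return n;
        return NULL;
    }

    /* Writer side: the spinlock serializes inserts, and
     * rcu_assign_pointer() orders the node's initialization before
     * the node becomes visible to readers.
     */
    static void insert(struct node *n, u32 key)
    {
        unsigned int h = key & (NR_BUCKETS - 1);

        spin_lock_bh(&table_lock);
        n->key = key;
        n->next = bucket[h];
        rcu_assign_pointer(bucket[h], n);
        spin_unlock_bh(&table_lock);
    }

Note that the patch never deletes entries: when a chain grows too
long, tcpm_new() reuses the oldest block in that chain in place, so
readers need no deletion handling either.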
Diffstat (limited to 'net/ipv4/tcp_metrics.c')
-rw-r--r--  net/ipv4/tcp_metrics.c | 555
1 file changed, 462 insertions(+), 93 deletions(-)
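
The "extremely simple hash" the log message mentions is a byte fold of
the peer address followed by a mask, used in __tcp_get_metrics_req()
and tcp_get_metrics() below.  A worked example of the arithmetic
(illustrative values; assumes a little-endian host and the default
16K-slot table):

    u32 h = 0x0100000a;     /* peer 10.0.0.1, read back from __be32 storage */

    h ^= (h >> 24) ^ (h >> 16) ^ (h >> 8);  /* h == 0x0101010b */
    h &= (16 * 1024) - 1;                   /* bucket 0x010b == 267 */

Every byte of the address influences the low bits before the mask,
which is all the distribution the table needs given that entries are
only created for fully established peers.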
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 9afe703c85cc..56223bab251b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,134 +1,431 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
 #include <linux/tcp.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
 #include <net/request_sock.h>
+#include <net/inetpeer.h>
 #include <net/sock.h>
+#include <net/ipv6.h>
 #include <net/dst.h>
 #include <net/tcp.h>
 
 int sysctl_tcp_nometrics_save __read_mostly;
 
+enum tcp_metric_index {
+    TCP_METRIC_RTT,
+    TCP_METRIC_RTTVAR,
+    TCP_METRIC_SSTHRESH,
+    TCP_METRIC_CWND,
+    TCP_METRIC_REORDERING,
+
+    /* Always last. */
+    TCP_METRIC_MAX,
+};
+
+struct tcp_metrics_block {
+    struct tcp_metrics_block __rcu *tcpm_next;
+    struct inetpeer_addr           tcpm_addr;
+    unsigned long                  tcpm_stamp;
+    u32                            tcpm_lock;
+    u32                            tcpm_vals[TCP_METRIC_MAX];
+};
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+                              enum tcp_metric_index idx)
+{
+    return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+                          enum tcp_metric_index idx)
+{
+    return tm->tcpm_vals[idx];
+}
+
+static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
+                                  enum tcp_metric_index idx)
+{
+    return msecs_to_jiffies(tm->tcpm_vals[idx]);
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+                           enum tcp_metric_index idx,
+                           u32 val)
+{
+    tm->tcpm_vals[idx] = val;
+}
+
+static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
+                                 enum tcp_metric_index idx,
+                                 u32 val)
+{
+    tm->tcpm_vals[idx] = jiffies_to_msecs(val);
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+                      const struct inetpeer_addr *b)
+{
+    const struct in6_addr *a6, *b6;
+
+    if (a->family != b->family)
+        return false;
+    if (a->family == AF_INET)
+        return a->addr.a4 == b->addr.a4;
+
+    a6 = (const struct in6_addr *) &a->addr.a6[0];
+    b6 = (const struct in6_addr *) &b->addr.a6[0];
+
+    return ipv6_addr_equal(a6, b6);
+}
+
+struct tcpm_hash_bucket {
+    struct tcp_metrics_block __rcu *chain;
+};
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+    u32 val;
+
+    val = 0;
+    if (dst_metric_locked(dst, RTAX_RTT))
+        val |= 1 << TCP_METRIC_RTT;
+    if (dst_metric_locked(dst, RTAX_RTTVAR))
+        val |= 1 << TCP_METRIC_RTTVAR;
+    if (dst_metric_locked(dst, RTAX_SSTHRESH))
+        val |= 1 << TCP_METRIC_SSTHRESH;
+    if (dst_metric_locked(dst, RTAX_CWND))
+        val |= 1 << TCP_METRIC_CWND;
+    if (dst_metric_locked(dst, RTAX_REORDERING))
+        val |= 1 << TCP_METRIC_REORDERING;
+    tm->tcpm_lock = val;
+
+    tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
+    tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
+    tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+    tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+    tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+}
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+                                          struct inetpeer_addr *addr,
+                                          unsigned int hash,
+                                          bool reclaim)
+{
+    struct tcp_metrics_block *tm;
+    struct net *net;
+
+    spin_lock_bh(&tcp_metrics_lock);
+    net = dev_net(dst->dev);
+    if (unlikely(reclaim)) {
+        struct tcp_metrics_block *oldest;
+
+        oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
+        for (tm = rcu_dereference(oldest->tcpm_next); tm;
+             tm = rcu_dereference(tm->tcpm_next)) {
+            if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+                oldest = tm;
+        }
+        tm = oldest;
+    } else {
+        tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+        if (!tm)
+            goto out_unlock;
+    }
+    tm->tcpm_addr = *addr;
+    tm->tcpm_stamp = jiffies;
+
+    tcpm_suck_dst(tm, dst);
+
+    if (likely(!reclaim)) {
+        tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
+        rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+    }
+
+out_unlock:
+    spin_unlock_bh(&tcp_metrics_lock);
+    return tm;
+}
+
+#define TCP_METRICS_TIMEOUT        (60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+    if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+        tcpm_suck_dst(tm, dst);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH  5
+#define TCP_METRICS_RECLAIM_PTR    (struct tcp_metrics_block *) 0x1UL
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+    if (tm)
+        return tm;
+    if (depth > TCP_METRICS_RECLAIM_DEPTH)
+        return TCP_METRICS_RECLAIM_PTR;
+    return NULL;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
+                                                   struct net *net, unsigned int hash)
+{
+    struct tcp_metrics_block *tm;
+    int depth = 0;
+
+    for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+         tm = rcu_dereference(tm->tcpm_next)) {
+        if (addr_same(&tm->tcpm_addr, addr))
+            break;
+        depth++;
+    }
+    return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+                                                       struct dst_entry *dst)
+{
+    struct tcp_metrics_block *tm;
+    struct inetpeer_addr addr;
+    unsigned int hash;
+    struct net *net;
+
+    addr.family = req->rsk_ops->family;
+    switch (addr.family) {
+    case AF_INET:
+        addr.addr.a4 = inet_rsk(req)->rmt_addr;
+        hash = (__force unsigned int) addr.addr.a4;
+        break;
+    case AF_INET6:
+        *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
+        hash = ((__force unsigned int) addr.addr.a6[0] ^
+                (__force unsigned int) addr.addr.a6[1] ^
+                (__force unsigned int) addr.addr.a6[2] ^
+                (__force unsigned int) addr.addr.a6[3]);
+        break;
+    default:
+        return NULL;
+    }
+
+    hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+    net = dev_net(dst->dev);
+    hash &= net->ipv4.tcp_metrics_hash_mask;
+
+    for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+         tm = rcu_dereference(tm->tcpm_next)) {
+        if (addr_same(&tm->tcpm_addr, &addr))
+            break;
+    }
+    tcpm_check_stamp(tm, dst);
+    return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+                                                 struct dst_entry *dst,
+                                                 bool create)
+{
+    struct tcp_metrics_block *tm;
+    struct inetpeer_addr addr;
+    unsigned int hash;
+    struct net *net;
+    bool reclaim;
+
+    addr.family = sk->sk_family;
+    switch (addr.family) {
+    case AF_INET:
+        addr.addr.a4 = inet_sk(sk)->inet_daddr;
+        hash = (__force unsigned int) addr.addr.a4;
+        break;
+    case AF_INET6:
+        *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
+        hash = ((__force unsigned int) addr.addr.a6[0] ^
+                (__force unsigned int) addr.addr.a6[1] ^
+                (__force unsigned int) addr.addr.a6[2] ^
+                (__force unsigned int) addr.addr.a6[3]);
+        break;
+    default:
+        return NULL;
+    }
+
+    hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+    net = dev_net(dst->dev);
+    hash &= net->ipv4.tcp_metrics_hash_mask;
+
+    tm = __tcp_get_metrics(&addr, net, hash);
+    reclaim = false;
+    if (tm == TCP_METRICS_RECLAIM_PTR) {
+        reclaim = true;
+        tm = NULL;
+    }
+    if (!tm && create)
+        tm = tcpm_new(dst, &addr, hash, reclaim);
+    else
+        tcpm_check_stamp(tm, dst);
+
+    return tm;
+}
+
 /* Save metrics learned by this TCP session. This function is called
  * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
  * or goes from LAST-ACK to CLOSE.
  */
 void tcp_update_metrics(struct sock *sk)
 {
-    struct tcp_sock *tp = tcp_sk(sk);
+    const struct inet_connection_sock *icsk = inet_csk(sk);
     struct dst_entry *dst = __sk_dst_get(sk);
+    struct tcp_sock *tp = tcp_sk(sk);
+    struct tcp_metrics_block *tm;
+    unsigned long rtt;
+    u32 val;
+    int m;
 
-    if (sysctl_tcp_nometrics_save)
+    if (sysctl_tcp_nometrics_save || !dst)
         return;
 
-    if (dst && (dst->flags & DST_HOST)) {
-        const struct inet_connection_sock *icsk = inet_csk(sk);
-        int m;
-        unsigned long rtt;
-
+    if (dst->flags & DST_HOST)
         dst_confirm(dst);
 
-        if (icsk->icsk_backoff || !tp->srtt) {
-            /* This session failed to estimate rtt. Why?
-             * Probably, no packets returned in time.
-             * Reset our results.
-             */
-            if (!(dst_metric_locked(dst, RTAX_RTT)))
-                dst_metric_set(dst, RTAX_RTT, 0);
-            return;
-        }
+    rcu_read_lock();
+    if (icsk->icsk_backoff || !tp->srtt) {
+        /* This session failed to estimate rtt. Why?
+         * Probably, no packets returned in time. Reset our
+         * results.
+         */
+        tm = tcp_get_metrics(sk, dst, false);
+        if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+            tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+        goto out_unlock;
+    } else
+        tm = tcp_get_metrics(sk, dst, true);
 
-        rtt = dst_metric_rtt(dst, RTAX_RTT);
-        m = rtt - tp->srtt;
+    if (!tm)
+        goto out_unlock;
 
-        /* If newly calculated rtt larger than stored one,
-         * store new one. Otherwise, use EWMA. Remember,
-         * rtt overestimation is always better than underestimation.
-         */
-        if (!(dst_metric_locked(dst, RTAX_RTT))) {
-            if (m <= 0)
-                set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
-            else
-                set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
-        }
+    rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
+    m = rtt - tp->srtt;
 
-        if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
-            unsigned long var;
-            if (m < 0)
-                m = -m;
+    /* If newly calculated rtt larger than stored one, store new
+     * one. Otherwise, use EWMA. Remember, rtt overestimation is
+     * always better than underestimation.
+     */
+    if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+        if (m <= 0)
+            rtt = tp->srtt;
+        else
+            rtt -= (m >> 3);
+        tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
+    }
 
-            /* Scale deviation to rttvar fixed point */
-            m >>= 1;
-            if (m < tp->mdev)
-                m = tp->mdev;
+    if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+        unsigned long var;
 
-            var = dst_metric_rtt(dst, RTAX_RTTVAR);
-            if (m >= var)
-                var = m;
-            else
-                var -= (var - m) >> 2;
+        if (m < 0)
+            m = -m;
 
-            set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
-        }
+        /* Scale deviation to rttvar fixed point */
+        m >>= 1;
+        if (m < tp->mdev)
+            m = tp->mdev;
 
-        if (tcp_in_initial_slowstart(tp)) {
-            /* Slow start still did not finish. */
-            if (dst_metric(dst, RTAX_SSTHRESH) &&
-                !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-                (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
-                dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
-            if (!dst_metric_locked(dst, RTAX_CWND) &&
-                tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
-                dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
-        } else if (tp->snd_cwnd > tp->snd_ssthresh &&
-                   icsk->icsk_ca_state == TCP_CA_Open) {
-            /* Cong. avoidance phase, cwnd is reliable. */
-            if (!dst_metric_locked(dst, RTAX_SSTHRESH))
-                dst_metric_set(dst, RTAX_SSTHRESH,
-                               max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
-            if (!dst_metric_locked(dst, RTAX_CWND))
-                dst_metric_set(dst, RTAX_CWND,
-                               (dst_metric(dst, RTAX_CWND) +
-                                tp->snd_cwnd) >> 1);
-        } else {
-            /* Else slow start did not finish, cwnd is non-sense,
-               ssthresh may be also invalid.
-             */
-            if (!dst_metric_locked(dst, RTAX_CWND))
-                dst_metric_set(dst, RTAX_CWND,
-                               (dst_metric(dst, RTAX_CWND) +
-                                tp->snd_ssthresh) >> 1);
-            if (dst_metric(dst, RTAX_SSTHRESH) &&
-                !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-                tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
-                dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
-        }
+        var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+        if (m >= var)
+            var = m;
+        else
+            var -= (var - m) >> 2;
 
-        if (!dst_metric_locked(dst, RTAX_REORDERING)) {
-            if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+        tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
+    }
+
+    if (tcp_in_initial_slowstart(tp)) {
+        /* Slow start still did not finish. */
+        if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+            val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+            if (val && (tp->snd_cwnd >> 1) > val)
+                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+                               tp->snd_cwnd >> 1);
+        }
+        if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+            val = tcp_metric_get(tm, TCP_METRIC_CWND);
+            if (tp->snd_cwnd > val)
+                tcp_metric_set(tm, TCP_METRIC_CWND,
+                               tp->snd_cwnd);
+        }
+    } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+               icsk->icsk_ca_state == TCP_CA_Open) {
+        /* Cong. avoidance phase, cwnd is reliable. */
+        if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+            tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+                           max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+        if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+            val = tcp_metric_get(tm, TCP_METRIC_CWND);
+            tcp_metric_set(tm, RTAX_CWND, (val + tp->snd_cwnd) >> 1);
+        }
+    } else {
+        /* Else slow start did not finish, cwnd is non-sense,
+         * ssthresh may be also invalid.
+         */
+        if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+            val = tcp_metric_get(tm, TCP_METRIC_CWND);
+            tcp_metric_set(tm, TCP_METRIC_CWND,
+                           (val + tp->snd_ssthresh) >> 1);
+        }
+        if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+            val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+            if (val && tp->snd_ssthresh > val)
+                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+                               tp->snd_ssthresh);
+        }
+        if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+            val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+            if (val < tp->reordering &&
                 tp->reordering != sysctl_tcp_reordering)
-                dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
+                tcp_metric_set(tm, TCP_METRIC_REORDERING,
+                               tp->reordering);
         }
     }
+    tm->tcpm_stamp = jiffies;
+out_unlock:
+    rcu_read_unlock();
 }
 
 /* Initialize metrics on socket. */
 
 void tcp_init_metrics(struct sock *sk)
 {
-    struct tcp_sock *tp = tcp_sk(sk);
     struct dst_entry *dst = __sk_dst_get(sk);
+    struct tcp_sock *tp = tcp_sk(sk);
+    struct tcp_metrics_block *tm;
+    u32 val;
 
     if (dst == NULL)
         goto reset;
 
     dst_confirm(dst);
 
-    if (dst_metric_locked(dst, RTAX_CWND))
-        tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
-    if (dst_metric(dst, RTAX_SSTHRESH)) {
-        tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+    rcu_read_lock();
+    tm = tcp_get_metrics(sk, dst, true);
+    if (!tm) {
+        rcu_read_unlock();
+        goto reset;
+    }
+
+    if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+        tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+    val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+    if (val) {
+        tp->snd_ssthresh = val;
         if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
             tp->snd_ssthresh = tp->snd_cwnd_clamp;
     } else {
@@ -137,16 +434,18 @@ void tcp_init_metrics(struct sock *sk)
      */
         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
     }
-    if (dst_metric(dst, RTAX_REORDERING) &&
-        tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+    val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+    if (val && tp->reordering != val) {
         tcp_disable_fack(tp);
         tcp_disable_early_retrans(tp);
-        tp->reordering = dst_metric(dst, RTAX_REORDERING);
+        tp->reordering = val;
     }
 
-    if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
+    val = tcp_metric_get(tm, TCP_METRIC_RTT);
+    if (val == 0 || tp->srtt == 0) {
+        rcu_read_unlock();
         goto reset;
-
+    }
     /* Initial rtt is determined from SYN,SYN-ACK.
      * The segment is small and rtt may appear much
      * less than real one. Use per-dst memory
@@ -161,14 +460,18 @@ void tcp_init_metrics(struct sock *sk)
      * to low value, and then abruptly stops to do it and starts to delay
      * ACKs, wait for troubles.
      */
-    if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
-        tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
+    val = msecs_to_jiffies(val);
+    if (val > tp->srtt) {
+        tp->srtt = val;
         tp->rtt_seq = tp->snd_nxt;
     }
-    if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
-        tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
+    val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+    if (val > tp->mdev) {
+        tp->mdev = val;
         tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
     }
+    rcu_read_unlock();
+
     tcp_set_rto(sk);
 reset:
     if (tp->srtt == 0) {
@@ -195,8 +498,74 @@ reset:
 
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
 {
+    struct tcp_metrics_block *tm;
+    bool ret;
+
     if (!dst)
         return false;
-    return dst_metric(dst, RTAX_RTT) ? true : false;
+
+    rcu_read_lock();
+    tm = __tcp_get_metrics_req(req, dst);
+    if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
+        ret = true;
+    else
+        ret = false;
+    rcu_read_unlock();
+
+    return ret;
 }
 EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
+static unsigned long tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+    ssize_t ret;
+
+    if (!str)
+        return 0;
+
+    ret = kstrtoul(str, 0, &tcpmhash_entries);
+    if (ret)
+        return 0;
+
+    return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+    int slots, size;
+
+    slots = tcpmhash_entries;
+    if (!slots) {
+        if (totalram_pages >= 128 * 1024)
+            slots = 16 * 1024;
+        else
+            slots = 8 * 1024;
+    }
+
+    size = slots * sizeof(struct tcpm_hash_bucket);
+
+    net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
+    if (!net->ipv4.tcp_metrics_hash)
+        return -ENOMEM;
+
+    net->ipv4.tcp_metrics_hash_mask = (slots - 1);
+
+    return 0;
+}
+
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+    kfree(net->ipv4.tcp_metrics_hash);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+    .init = tcp_net_metrics_init,
+    .exit = tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+    register_pernet_subsys(&tcp_net_metrics_ops);
+}