diff options
author | Eric Dumazet <edumazet@google.com> | 2015-04-12 21:51:09 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-04-13 16:40:05 -0400 |
commit | 789f558cfb3680aeb52de137418637f6b04b7d22 (patch) | |
tree | 0031c54a2fe41480ed509ba140a1c12ecad075a6 /include/net | |
parent | 20a1d16526b79c76cd45e29cb637aec1d43c41de (diff) |
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale; the code is ugly and a source of huge latencies
(typically 30 ms have been seen, with cpus spinning on the death_lock spinlock).
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
In the following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24).
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While the test is running, we can observe latencies of 25 or even 33 ms.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase of throughput :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
Latencies are also kept to minimal values during this load, even
though network utilization is 90% higher:
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net')
-rw-r--r-- | include/net/inet_timewait_sock.h | 107 |
1 files changed, 9 insertions, 98 deletions
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b7ce1003c429..360c4802288d 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h | |||
@@ -31,67 +31,14 @@ | |||
31 | 31 | ||
32 | struct inet_hashinfo; | 32 | struct inet_hashinfo; |
33 | 33 | ||
34 | #define INET_TWDR_RECYCLE_SLOTS_LOG 5 | ||
35 | #define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG) | ||
36 | |||
37 | /* | ||
38 | * If time > 4sec, it is "slow" path, no recycling is required, | ||
39 | * so that we select tick to get range about 4 seconds. | ||
40 | */ | ||
41 | #if HZ <= 16 || HZ > 4096 | ||
42 | # error Unsupported: HZ <= 16 or HZ > 4096 | ||
43 | #elif HZ <= 32 | ||
44 | # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
45 | #elif HZ <= 64 | ||
46 | # define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
47 | #elif HZ <= 128 | ||
48 | # define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
49 | #elif HZ <= 256 | ||
50 | # define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
51 | #elif HZ <= 512 | ||
52 | # define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
53 | #elif HZ <= 1024 | ||
54 | # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
55 | #elif HZ <= 2048 | ||
56 | # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
57 | #else | ||
58 | # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
59 | #endif | ||
60 | |||
61 | static inline u32 inet_tw_time_stamp(void) | ||
62 | { | ||
63 | return jiffies; | ||
64 | } | ||
65 | |||
66 | /* TIME_WAIT reaping mechanism. */ | ||
67 | #define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ | ||
68 | |||
69 | #define INET_TWDR_TWKILL_QUOTA 100 | ||
70 | |||
71 | struct inet_timewait_death_row { | 34 | struct inet_timewait_death_row { |
72 | /* Short-time timewait calendar */ | 35 | atomic_t tw_count; |
73 | int twcal_hand; | 36 | |
74 | unsigned long twcal_jiffie; | 37 | struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; |
75 | struct timer_list twcal_timer; | ||
76 | struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS]; | ||
77 | |||
78 | spinlock_t death_lock; | ||
79 | int tw_count; | ||
80 | int period; | ||
81 | u32 thread_slots; | ||
82 | struct work_struct twkill_work; | ||
83 | struct timer_list tw_timer; | ||
84 | int slot; | ||
85 | struct hlist_head cells[INET_TWDR_TWKILL_SLOTS]; | ||
86 | struct inet_hashinfo *hashinfo; | ||
87 | int sysctl_tw_recycle; | 38 | int sysctl_tw_recycle; |
88 | int sysctl_max_tw_buckets; | 39 | int sysctl_max_tw_buckets; |
89 | }; | 40 | }; |
90 | 41 | ||
91 | void inet_twdr_hangman(unsigned long data); | ||
92 | void inet_twdr_twkill_work(struct work_struct *work); | ||
93 | void inet_twdr_twcal_tick(unsigned long data); | ||
94 | |||
95 | struct inet_bind_bucket; | 42 | struct inet_bind_bucket; |
96 | 43 | ||
97 | /* | 44 | /* |
@@ -133,52 +80,18 @@ struct inet_timewait_sock { | |||
133 | __be16 tw_sport; | 80 | __be16 tw_sport; |
134 | kmemcheck_bitfield_begin(flags); | 81 | kmemcheck_bitfield_begin(flags); |
135 | /* And these are ours. */ | 82 | /* And these are ours. */ |
136 | unsigned int tw_pad0 : 1, /* 1 bit hole */ | 83 | unsigned int tw_kill : 1, |
137 | tw_transparent : 1, | 84 | tw_transparent : 1, |
138 | tw_flowlabel : 20, | 85 | tw_flowlabel : 20, |
139 | tw_pad : 2, /* 2 bits hole */ | 86 | tw_pad : 2, /* 2 bits hole */ |
140 | tw_tos : 8; | 87 | tw_tos : 8; |
141 | kmemcheck_bitfield_end(flags); | 88 | kmemcheck_bitfield_end(flags); |
142 | u32 tw_ttd; | 89 | struct timer_list tw_timer; |
143 | struct inet_bind_bucket *tw_tb; | 90 | struct inet_bind_bucket *tw_tb; |
144 | struct hlist_node tw_death_node; | 91 | struct inet_timewait_death_row *tw_dr; |
145 | }; | 92 | }; |
146 | #define tw_tclass tw_tos | 93 | #define tw_tclass tw_tos |
147 | 94 | ||
148 | static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) | ||
149 | { | ||
150 | return !hlist_unhashed(&tw->tw_death_node); | ||
151 | } | ||
152 | |||
153 | static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw) | ||
154 | { | ||
155 | tw->tw_death_node.pprev = NULL; | ||
156 | } | ||
157 | |||
158 | static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw) | ||
159 | { | ||
160 | __hlist_del(&tw->tw_death_node); | ||
161 | inet_twsk_dead_node_init(tw); | ||
162 | } | ||
163 | |||
164 | static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) | ||
165 | { | ||
166 | if (inet_twsk_dead_hashed(tw)) { | ||
167 | __inet_twsk_del_dead_node(tw); | ||
168 | return 1; | ||
169 | } | ||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | #define inet_twsk_for_each(tw, node, head) \ | ||
174 | hlist_nulls_for_each_entry(tw, node, head, tw_node) | ||
175 | |||
176 | #define inet_twsk_for_each_inmate(tw, jail) \ | ||
177 | hlist_for_each_entry(tw, jail, tw_death_node) | ||
178 | |||
179 | #define inet_twsk_for_each_inmate_safe(tw, safe, jail) \ | ||
180 | hlist_for_each_entry_safe(tw, safe, jail, tw_death_node) | ||
181 | |||
182 | static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) | 95 | static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) |
183 | { | 96 | { |
184 | return (struct inet_timewait_sock *)sk; | 97 | return (struct inet_timewait_sock *)sk; |
@@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, | |||
193 | struct inet_hashinfo *hashinfo); | 106 | struct inet_hashinfo *hashinfo); |
194 | 107 | ||
195 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, | 108 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, |
109 | struct inet_timewait_death_row *dr, | ||
196 | const int state); | 110 | const int state); |
197 | 111 | ||
198 | void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, | 112 | void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, |
199 | struct inet_hashinfo *hashinfo); | 113 | struct inet_hashinfo *hashinfo); |
200 | 114 | ||
201 | void inet_twsk_schedule(struct inet_timewait_sock *tw, | 115 | void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); |
202 | struct inet_timewait_death_row *twdr, | 116 | void inet_twsk_deschedule(struct inet_timewait_sock *tw); |
203 | const int timeo, const int timewait_len); | ||
204 | void inet_twsk_deschedule(struct inet_timewait_sock *tw, | ||
205 | struct inet_timewait_death_row *twdr); | ||
206 | 117 | ||
207 | void inet_twsk_purge(struct inet_hashinfo *hashinfo, | 118 | void inet_twsk_purge(struct inet_hashinfo *hashinfo, |
208 | struct inet_timewait_death_row *twdr, int family); | 119 | struct inet_timewait_death_row *twdr, int family); |