diff options
| author | Eric Dumazet <edumazet@google.com> | 2015-04-12 21:51:09 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2015-04-13 16:40:05 -0400 |
| commit | 789f558cfb3680aeb52de137418637f6b04b7d22 (patch) | |
| tree | 0031c54a2fe41480ed509ba140a1c12ecad075a6 /include/net | |
| parent | 20a1d16526b79c76cd45e29cb637aec1d43c41de (diff) | |
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale; the code is ugly and a source of huge latencies.
(Typically, latencies of 30 ms have been seen, with cpus spinning on the death_lock spinlock.)
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
In the following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24).
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While test is running, we can observe 25 or even 33 ms latencies.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase of throughput :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net')
| -rw-r--r-- | include/net/inet_timewait_sock.h | 107 |
1 files changed, 9 insertions, 98 deletions
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b7ce1003c429..360c4802288d 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h | |||
| @@ -31,67 +31,14 @@ | |||
| 31 | 31 | ||
| 32 | struct inet_hashinfo; | 32 | struct inet_hashinfo; |
| 33 | 33 | ||
| 34 | #define INET_TWDR_RECYCLE_SLOTS_LOG 5 | ||
| 35 | #define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 36 | |||
| 37 | /* | ||
| 38 | * If time > 4sec, it is "slow" path, no recycling is required, | ||
| 39 | * so that we select tick to get range about 4 seconds. | ||
| 40 | */ | ||
| 41 | #if HZ <= 16 || HZ > 4096 | ||
| 42 | # error Unsupported: HZ <= 16 or HZ > 4096 | ||
| 43 | #elif HZ <= 32 | ||
| 44 | # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 45 | #elif HZ <= 64 | ||
| 46 | # define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 47 | #elif HZ <= 128 | ||
| 48 | # define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 49 | #elif HZ <= 256 | ||
| 50 | # define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 51 | #elif HZ <= 512 | ||
| 52 | # define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 53 | #elif HZ <= 1024 | ||
| 54 | # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 55 | #elif HZ <= 2048 | ||
| 56 | # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 57 | #else | ||
| 58 | # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) | ||
| 59 | #endif | ||
| 60 | |||
| 61 | static inline u32 inet_tw_time_stamp(void) | ||
| 62 | { | ||
| 63 | return jiffies; | ||
| 64 | } | ||
| 65 | |||
| 66 | /* TIME_WAIT reaping mechanism. */ | ||
| 67 | #define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ | ||
| 68 | |||
| 69 | #define INET_TWDR_TWKILL_QUOTA 100 | ||
| 70 | |||
| 71 | struct inet_timewait_death_row { | 34 | struct inet_timewait_death_row { |
| 72 | /* Short-time timewait calendar */ | 35 | atomic_t tw_count; |
| 73 | int twcal_hand; | 36 | |
| 74 | unsigned long twcal_jiffie; | 37 | struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; |
| 75 | struct timer_list twcal_timer; | ||
| 76 | struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS]; | ||
| 77 | |||
| 78 | spinlock_t death_lock; | ||
| 79 | int tw_count; | ||
| 80 | int period; | ||
| 81 | u32 thread_slots; | ||
| 82 | struct work_struct twkill_work; | ||
| 83 | struct timer_list tw_timer; | ||
| 84 | int slot; | ||
| 85 | struct hlist_head cells[INET_TWDR_TWKILL_SLOTS]; | ||
| 86 | struct inet_hashinfo *hashinfo; | ||
| 87 | int sysctl_tw_recycle; | 38 | int sysctl_tw_recycle; |
| 88 | int sysctl_max_tw_buckets; | 39 | int sysctl_max_tw_buckets; |
| 89 | }; | 40 | }; |
| 90 | 41 | ||
| 91 | void inet_twdr_hangman(unsigned long data); | ||
| 92 | void inet_twdr_twkill_work(struct work_struct *work); | ||
| 93 | void inet_twdr_twcal_tick(unsigned long data); | ||
| 94 | |||
| 95 | struct inet_bind_bucket; | 42 | struct inet_bind_bucket; |
| 96 | 43 | ||
| 97 | /* | 44 | /* |
| @@ -133,52 +80,18 @@ struct inet_timewait_sock { | |||
| 133 | __be16 tw_sport; | 80 | __be16 tw_sport; |
| 134 | kmemcheck_bitfield_begin(flags); | 81 | kmemcheck_bitfield_begin(flags); |
| 135 | /* And these are ours. */ | 82 | /* And these are ours. */ |
| 136 | unsigned int tw_pad0 : 1, /* 1 bit hole */ | 83 | unsigned int tw_kill : 1, |
| 137 | tw_transparent : 1, | 84 | tw_transparent : 1, |
| 138 | tw_flowlabel : 20, | 85 | tw_flowlabel : 20, |
| 139 | tw_pad : 2, /* 2 bits hole */ | 86 | tw_pad : 2, /* 2 bits hole */ |
| 140 | tw_tos : 8; | 87 | tw_tos : 8; |
| 141 | kmemcheck_bitfield_end(flags); | 88 | kmemcheck_bitfield_end(flags); |
| 142 | u32 tw_ttd; | 89 | struct timer_list tw_timer; |
| 143 | struct inet_bind_bucket *tw_tb; | 90 | struct inet_bind_bucket *tw_tb; |
| 144 | struct hlist_node tw_death_node; | 91 | struct inet_timewait_death_row *tw_dr; |
| 145 | }; | 92 | }; |
| 146 | #define tw_tclass tw_tos | 93 | #define tw_tclass tw_tos |
| 147 | 94 | ||
| 148 | static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) | ||
| 149 | { | ||
| 150 | return !hlist_unhashed(&tw->tw_death_node); | ||
| 151 | } | ||
| 152 | |||
| 153 | static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw) | ||
| 154 | { | ||
| 155 | tw->tw_death_node.pprev = NULL; | ||
| 156 | } | ||
| 157 | |||
| 158 | static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw) | ||
| 159 | { | ||
| 160 | __hlist_del(&tw->tw_death_node); | ||
| 161 | inet_twsk_dead_node_init(tw); | ||
| 162 | } | ||
| 163 | |||
| 164 | static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) | ||
| 165 | { | ||
| 166 | if (inet_twsk_dead_hashed(tw)) { | ||
| 167 | __inet_twsk_del_dead_node(tw); | ||
| 168 | return 1; | ||
| 169 | } | ||
| 170 | return 0; | ||
| 171 | } | ||
| 172 | |||
| 173 | #define inet_twsk_for_each(tw, node, head) \ | ||
| 174 | hlist_nulls_for_each_entry(tw, node, head, tw_node) | ||
| 175 | |||
| 176 | #define inet_twsk_for_each_inmate(tw, jail) \ | ||
| 177 | hlist_for_each_entry(tw, jail, tw_death_node) | ||
| 178 | |||
| 179 | #define inet_twsk_for_each_inmate_safe(tw, safe, jail) \ | ||
| 180 | hlist_for_each_entry_safe(tw, safe, jail, tw_death_node) | ||
| 181 | |||
| 182 | static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) | 95 | static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) |
| 183 | { | 96 | { |
| 184 | return (struct inet_timewait_sock *)sk; | 97 | return (struct inet_timewait_sock *)sk; |
| @@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, | |||
| 193 | struct inet_hashinfo *hashinfo); | 106 | struct inet_hashinfo *hashinfo); |
| 194 | 107 | ||
| 195 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, | 108 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, |
| 109 | struct inet_timewait_death_row *dr, | ||
| 196 | const int state); | 110 | const int state); |
| 197 | 111 | ||
| 198 | void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, | 112 | void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, |
| 199 | struct inet_hashinfo *hashinfo); | 113 | struct inet_hashinfo *hashinfo); |
| 200 | 114 | ||
| 201 | void inet_twsk_schedule(struct inet_timewait_sock *tw, | 115 | void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); |
| 202 | struct inet_timewait_death_row *twdr, | 116 | void inet_twsk_deschedule(struct inet_timewait_sock *tw); |
| 203 | const int timeo, const int timewait_len); | ||
| 204 | void inet_twsk_deschedule(struct inet_timewait_sock *tw, | ||
| 205 | struct inet_timewait_death_row *twdr); | ||
| 206 | 117 | ||
| 207 | void inet_twsk_purge(struct inet_hashinfo *hashinfo, | 118 | void inet_twsk_purge(struct inet_hashinfo *hashinfo, |
| 208 | struct inet_timewait_death_row *twdr, int family); | 119 | struct inet_timewait_death_row *twdr, int family); |
