aboutsummaryrefslogtreecommitdiffstats
path: root/include/net
diff options
context:
space:
mode:
author Eric Dumazet <edumazet@google.com> 2015-04-12 21:51:09 -0400
committer David S. Miller <davem@davemloft.net> 2015-04-13 16:40:05 -0400
commit 789f558cfb3680aeb52de137418637f6b04b7d22 (patch)
tree 0031c54a2fe41480ed509ba140a1c12ecad075a6 /include/net
parent 20a1d16526b79c76cd45e29cb637aec1d43c41de (diff)
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.

This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)

We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.

Tested:

On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)

Before patch :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171

While test is running, we can observe 25 or even 33 ms latencies.

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2

After patch :

About 90% increase of throughput :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992

And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net')
-rw-r--r-- include/net/inet_timewait_sock.h 107
1 files changed, 9 insertions, 98 deletions
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index b7ce1003c429..360c4802288d 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -31,67 +31,14 @@
31 31
32struct inet_hashinfo; 32struct inet_hashinfo;
33 33
34#define INET_TWDR_RECYCLE_SLOTS_LOG 5
35#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
36
37/*
38 * If time > 4sec, it is "slow" path, no recycling is required,
39 * so that we select tick to get range about 4 seconds.
40 */
41#if HZ <= 16 || HZ > 4096
42# error Unsupported: HZ <= 16 or HZ > 4096
43#elif HZ <= 32
44# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
45#elif HZ <= 64
46# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
47#elif HZ <= 128
48# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
49#elif HZ <= 256
50# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
51#elif HZ <= 512
52# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
53#elif HZ <= 1024
54# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
55#elif HZ <= 2048
56# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
57#else
58# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
59#endif
60
61static inline u32 inet_tw_time_stamp(void)
62{
63 return jiffies;
64}
65
66/* TIME_WAIT reaping mechanism. */
67#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
68
69#define INET_TWDR_TWKILL_QUOTA 100
70
71struct inet_timewait_death_row { 34struct inet_timewait_death_row {
72 /* Short-time timewait calendar */ 35 atomic_t tw_count;
73 int twcal_hand; 36
74 unsigned long twcal_jiffie; 37 struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp;
75 struct timer_list twcal_timer;
76 struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];
77
78 spinlock_t death_lock;
79 int tw_count;
80 int period;
81 u32 thread_slots;
82 struct work_struct twkill_work;
83 struct timer_list tw_timer;
84 int slot;
85 struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
86 struct inet_hashinfo *hashinfo;
87 int sysctl_tw_recycle; 38 int sysctl_tw_recycle;
88 int sysctl_max_tw_buckets; 39 int sysctl_max_tw_buckets;
89}; 40};
90 41
91void inet_twdr_hangman(unsigned long data);
92void inet_twdr_twkill_work(struct work_struct *work);
93void inet_twdr_twcal_tick(unsigned long data);
94
95struct inet_bind_bucket; 42struct inet_bind_bucket;
96 43
97/* 44/*
@@ -133,52 +80,18 @@ struct inet_timewait_sock {
133 __be16 tw_sport; 80 __be16 tw_sport;
134 kmemcheck_bitfield_begin(flags); 81 kmemcheck_bitfield_begin(flags);
135 /* And these are ours. */ 82 /* And these are ours. */
136 unsigned int tw_pad0 : 1, /* 1 bit hole */ 83 unsigned int tw_kill : 1,
137 tw_transparent : 1, 84 tw_transparent : 1,
138 tw_flowlabel : 20, 85 tw_flowlabel : 20,
139 tw_pad : 2, /* 2 bits hole */ 86 tw_pad : 2, /* 2 bits hole */
140 tw_tos : 8; 87 tw_tos : 8;
141 kmemcheck_bitfield_end(flags); 88 kmemcheck_bitfield_end(flags);
142 u32 tw_ttd; 89 struct timer_list tw_timer;
143 struct inet_bind_bucket *tw_tb; 90 struct inet_bind_bucket *tw_tb;
144 struct hlist_node tw_death_node; 91 struct inet_timewait_death_row *tw_dr;
145}; 92};
146#define tw_tclass tw_tos 93#define tw_tclass tw_tos
147 94
148static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
149{
150 return !hlist_unhashed(&tw->tw_death_node);
151}
152
153static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
154{
155 tw->tw_death_node.pprev = NULL;
156}
157
158static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
159{
160 __hlist_del(&tw->tw_death_node);
161 inet_twsk_dead_node_init(tw);
162}
163
164static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
165{
166 if (inet_twsk_dead_hashed(tw)) {
167 __inet_twsk_del_dead_node(tw);
168 return 1;
169 }
170 return 0;
171}
172
173#define inet_twsk_for_each(tw, node, head) \
174 hlist_nulls_for_each_entry(tw, node, head, tw_node)
175
176#define inet_twsk_for_each_inmate(tw, jail) \
177 hlist_for_each_entry(tw, jail, tw_death_node)
178
179#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \
180 hlist_for_each_entry_safe(tw, safe, jail, tw_death_node)
181
182static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) 95static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
183{ 96{
184 return (struct inet_timewait_sock *)sk; 97 return (struct inet_timewait_sock *)sk;
@@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
193 struct inet_hashinfo *hashinfo); 106 struct inet_hashinfo *hashinfo);
194 107
195struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, 108struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
109 struct inet_timewait_death_row *dr,
196 const int state); 110 const int state);
197 111
198void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, 112void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
199 struct inet_hashinfo *hashinfo); 113 struct inet_hashinfo *hashinfo);
200 114
201void inet_twsk_schedule(struct inet_timewait_sock *tw, 115void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo);
202 struct inet_timewait_death_row *twdr, 116void inet_twsk_deschedule(struct inet_timewait_sock *tw);
203 const int timeo, const int timewait_len);
204void inet_twsk_deschedule(struct inet_timewait_sock *tw,
205 struct inet_timewait_death_row *twdr);
206 117
207void inet_twsk_purge(struct inet_hashinfo *hashinfo, 118void inet_twsk_purge(struct inet_hashinfo *hashinfo,
208 struct inet_timewait_death_row *twdr, int family); 119 struct inet_timewait_death_row *twdr, int family);