about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- include/net/inet_timewait_sock.h 57
-rw-r--r-- include/net/tcp.h 36
-rw-r--r-- net/ipv4/proc.c 2
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c 4
-rw-r--r-- net/ipv4/tcp.c 4
-rw-r--r-- net/ipv4/tcp_ipv4.c 11
-rw-r--r-- net/ipv4/tcp_minisocks.c 256
-rw-r--r-- net/ipv6/tcp_ipv6.c 9
8 files changed, 207 insertions, 172 deletions
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index e00861b16696..a7e8052e2fbf 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -19,13 +19,69 @@
19 19
20#include <linux/ip.h> 20#include <linux/ip.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/timer.h>
22#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/workqueue.h>
23 25
24#include <net/sock.h> 26#include <net/sock.h>
25#include <net/tcp_states.h> 27#include <net/tcp_states.h>
26 28
27#include <asm/atomic.h> 29#include <asm/atomic.h>
28 30
31struct inet_hashinfo;
32
33#define INET_TWDR_RECYCLE_SLOTS_LOG 5
34#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
35
36/*
37 * If time > 4sec, it is "slow" path, no recycling is required,
38 * so that we select tick to get range about 4 seconds.
39 */
40#if HZ <= 16 || HZ > 4096
41# error Unsupported: HZ <= 16 or HZ > 4096
42#elif HZ <= 32
43# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
44#elif HZ <= 64
45# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
46#elif HZ <= 128
47# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
48#elif HZ <= 256
49# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
50#elif HZ <= 512
51# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
52#elif HZ <= 1024
53# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
54#elif HZ <= 2048
55# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
56#else
57# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
58#endif
59
60/* TIME_WAIT reaping mechanism. */
61#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
62
63#define INET_TWDR_TWKILL_QUOTA 100
64
65struct inet_timewait_death_row {
66 /* Short-time timewait calendar */
67 int twcal_hand;
68 int twcal_jiffie;
69 struct timer_list twcal_timer;
70 struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];
71
72 spinlock_t death_lock;
73 int tw_count;
74 int period;
75 u32 thread_slots;
76 struct work_struct twkill_work;
77 struct timer_list tw_timer;
78 int slot;
79 struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
80 struct inet_hashinfo *hashinfo;
81 int sysctl_tw_recycle;
82 int sysctl_max_tw_buckets;
83};
84
29#if (BITS_PER_LONG == 64) 85#if (BITS_PER_LONG == 64)
30#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8 86#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8
31#else 87#else
@@ -33,7 +89,6 @@
33#endif 89#endif
34 90
35struct inet_bind_bucket; 91struct inet_bind_bucket;
36struct inet_hashinfo;
37 92
38/* 93/*
39 * This is a TIME_WAIT sock. It works around the memory consumption 94 * This is a TIME_WAIT sock. It works around the memory consumption
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 077db859ae0d..4c4cd4fb1ed8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -29,6 +29,7 @@
29#include <linux/percpu.h> 29#include <linux/percpu.h>
30 30
31#include <net/inet_connection_sock.h> 31#include <net/inet_connection_sock.h>
32#include <net/inet_timewait_sock.h>
32#include <net/inet_hashtables.h> 33#include <net/inet_hashtables.h>
33#include <net/checksum.h> 34#include <net/checksum.h>
34#include <net/request_sock.h> 35#include <net/request_sock.h>
@@ -42,9 +43,9 @@
42extern struct inet_hashinfo tcp_hashinfo; 43extern struct inet_hashinfo tcp_hashinfo;
43 44
44extern atomic_t tcp_orphan_count; 45extern atomic_t tcp_orphan_count;
45extern int tcp_tw_count;
46extern void tcp_time_wait(struct sock *sk, int state, int timeo); 46extern void tcp_time_wait(struct sock *sk, int state, int timeo);
47extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); 47extern void inet_twsk_deschedule(struct inet_timewait_sock *tw,
48 struct inet_timewait_death_row *twdr);
48 49
49#define MAX_TCP_HEADER (128 + MAX_HEADER) 50#define MAX_TCP_HEADER (128 + MAX_HEADER)
50 51
@@ -148,33 +149,6 @@ extern void tcp_tw_deschedule(struct inet_timewait_sock *tw);
148 * timestamps. It must be less than 149 * timestamps. It must be less than
149 * minimal timewait lifetime. 150 * minimal timewait lifetime.
150 */ 151 */
151
152#define TCP_TW_RECYCLE_SLOTS_LOG 5
153#define TCP_TW_RECYCLE_SLOTS (1<<TCP_TW_RECYCLE_SLOTS_LOG)
154
155/* If time > 4sec, it is "slow" path, no recycling is required,
156 so that we select tick to get range about 4 seconds.
157 */
158
159#if HZ <= 16 || HZ > 4096
160# error Unsupported: HZ <= 16 or HZ > 4096
161#elif HZ <= 32
162# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
163#elif HZ <= 64
164# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG)
165#elif HZ <= 128
166# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG)
167#elif HZ <= 256
168# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG)
169#elif HZ <= 512
170# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG)
171#elif HZ <= 1024
172# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
173#elif HZ <= 2048
174# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
175#else
176# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
177#endif
178/* 152/*
179 * TCP option 153 * TCP option
180 */ 154 */
@@ -209,12 +183,13 @@ extern void tcp_tw_deschedule(struct inet_timewait_sock *tw);
209#define TCP_NAGLE_CORK 2 /* Socket is corked */ 183#define TCP_NAGLE_CORK 2 /* Socket is corked */
210#define TCP_NAGLE_PUSH 4 /* Cork is overriden for already queued data */ 184#define TCP_NAGLE_PUSH 4 /* Cork is overriden for already queued data */
211 185
186extern struct inet_timewait_death_row tcp_death_row;
187
212/* sysctl variables for tcp */ 188/* sysctl variables for tcp */
213extern int sysctl_tcp_timestamps; 189extern int sysctl_tcp_timestamps;
214extern int sysctl_tcp_window_scaling; 190extern int sysctl_tcp_window_scaling;
215extern int sysctl_tcp_sack; 191extern int sysctl_tcp_sack;
216extern int sysctl_tcp_fin_timeout; 192extern int sysctl_tcp_fin_timeout;
217extern int sysctl_tcp_tw_recycle;
218extern int sysctl_tcp_keepalive_time; 193extern int sysctl_tcp_keepalive_time;
219extern int sysctl_tcp_keepalive_probes; 194extern int sysctl_tcp_keepalive_probes;
220extern int sysctl_tcp_keepalive_intvl; 195extern int sysctl_tcp_keepalive_intvl;
@@ -229,7 +204,6 @@ extern int sysctl_tcp_stdurg;
229extern int sysctl_tcp_rfc1337; 204extern int sysctl_tcp_rfc1337;
230extern int sysctl_tcp_abort_on_overflow; 205extern int sysctl_tcp_abort_on_overflow;
231extern int sysctl_tcp_max_orphans; 206extern int sysctl_tcp_max_orphans;
232extern int sysctl_tcp_max_tw_buckets;
233extern int sysctl_tcp_fack; 207extern int sysctl_tcp_fack;
234extern int sysctl_tcp_reordering; 208extern int sysctl_tcp_reordering;
235extern int sysctl_tcp_ecn; 209extern int sysctl_tcp_ecn;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc7f415..3eadbb271871 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -65,7 +65,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
65 socket_seq_show(seq); 65 socket_seq_show(seq);
66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
67 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), 67 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
68 tcp_tw_count, atomic_read(&tcp_sockets_allocated), 68 tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
69 atomic_read(&tcp_memory_allocated)); 69 atomic_read(&tcp_memory_allocated));
70 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); 70 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
71 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); 71 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e32894532416..ce47a345ecc5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -259,7 +259,7 @@ ctl_table ipv4_table[] = {
259 { 259 {
260 .ctl_name = NET_TCP_MAX_TW_BUCKETS, 260 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
261 .procname = "tcp_max_tw_buckets", 261 .procname = "tcp_max_tw_buckets",
262 .data = &sysctl_tcp_max_tw_buckets, 262 .data = &tcp_death_row.sysctl_max_tw_buckets,
263 .maxlen = sizeof(int), 263 .maxlen = sizeof(int),
264 .mode = 0644, 264 .mode = 0644,
265 .proc_handler = &proc_dointvec 265 .proc_handler = &proc_dointvec
@@ -363,7 +363,7 @@ ctl_table ipv4_table[] = {
363 { 363 {
364 .ctl_name = NET_TCP_TW_RECYCLE, 364 .ctl_name = NET_TCP_TW_RECYCLE,
365 .procname = "tcp_tw_recycle", 365 .procname = "tcp_tw_recycle",
366 .data = &sysctl_tcp_tw_recycle, 366 .data = &tcp_death_row.sysctl_tw_recycle,
367 .maxlen = sizeof(int), 367 .maxlen = sizeof(int),
368 .mode = 0644, 368 .mode = 0644,
369 .proc_handler = &proc_dointvec 369 .proc_handler = &proc_dointvec
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4bda522d25cf..0eed64a1991d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2109,12 +2109,12 @@ void __init tcp_init(void)
2109 if (order >= 4) { 2109 if (order >= 4) {
2110 sysctl_local_port_range[0] = 32768; 2110 sysctl_local_port_range[0] = 32768;
2111 sysctl_local_port_range[1] = 61000; 2111 sysctl_local_port_range[1] = 61000;
2112 sysctl_tcp_max_tw_buckets = 180000; 2112 tcp_death_row.sysctl_max_tw_buckets = 180000;
2113 sysctl_tcp_max_orphans = 4096 << (order - 4); 2113 sysctl_tcp_max_orphans = 4096 << (order - 4);
2114 sysctl_max_syn_backlog = 1024; 2114 sysctl_max_syn_backlog = 1024;
2115 } else if (order < 3) { 2115 } else if (order < 3) {
2116 sysctl_local_port_range[0] = 1024 * (3 - order); 2116 sysctl_local_port_range[0] = 1024 * (3 - order);
2117 sysctl_tcp_max_tw_buckets >>= (3 - order); 2117 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2118 sysctl_tcp_max_orphans >>= (3 - order); 2118 sysctl_tcp_max_orphans >>= (3 - order);
2119 sysctl_max_syn_backlog = 128; 2119 sysctl_max_syn_backlog = 128;
2120 } 2120 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b966102b9f39..83f72346274a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -199,7 +199,7 @@ unique:
199 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 199 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
200 } else if (tw) { 200 } else if (tw) {
201 /* Silly. Should hash-dance instead... */ 201 /* Silly. Should hash-dance instead... */
202 tcp_tw_deschedule(tw); 202 inet_twsk_deschedule(tw, &tcp_death_row);
203 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 203 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
204 204
205 inet_twsk_put(tw); 205 inet_twsk_put(tw);
@@ -291,7 +291,7 @@ ok:
291 spin_unlock(&head->lock); 291 spin_unlock(&head->lock);
292 292
293 if (tw) { 293 if (tw) {
294 tcp_tw_deschedule(tw); 294 inet_twsk_deschedule(tw, &tcp_death_row);;
295 inet_twsk_put(tw); 295 inet_twsk_put(tw);
296 } 296 }
297 297
@@ -366,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
366 tp->write_seq = 0; 366 tp->write_seq = 0;
367 } 367 }
368 368
369 if (sysctl_tcp_tw_recycle && 369 if (tcp_death_row.sysctl_tw_recycle &&
370 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { 370 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
371 struct inet_peer *peer = rt_get_peer(rt); 371 struct inet_peer *peer = rt_get_peer(rt);
372 372
@@ -965,7 +965,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
965 * are made in the function processing timewait state. 965 * are made in the function processing timewait state.
966 */ 966 */
967 if (tmp_opt.saw_tstamp && 967 if (tmp_opt.saw_tstamp &&
968 sysctl_tcp_tw_recycle && 968 tcp_death_row.sysctl_tw_recycle &&
969 (dst = inet_csk_route_req(sk, req)) != NULL && 969 (dst = inet_csk_route_req(sk, req)) != NULL &&
970 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 970 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
971 peer->v4daddr == saddr) { 971 peer->v4daddr == saddr) {
@@ -1305,7 +1305,8 @@ do_time_wait:
1305 ntohs(th->dest), 1305 ntohs(th->dest),
1306 inet_iif(skb)); 1306 inet_iif(skb));
1307 if (sk2) { 1307 if (sk2) {
1308 tcp_tw_deschedule((struct inet_timewait_sock *)sk); 1308 inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1309 &tcp_death_row);
1309 inet_twsk_put((struct inet_timewait_sock *)sk); 1310 inet_twsk_put((struct inet_timewait_sock *)sk);
1310 sk = sk2; 1311 sk = sk2;
1311 goto process; 1312 goto process;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2d95afe5b393..81b9a52c50c6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,37 @@
35#define SYNC_INIT 1 35#define SYNC_INIT 1
36#endif 36#endif
37 37
38int sysctl_tcp_tw_recycle; 38/* New-style handling of TIME_WAIT sockets. */
39int sysctl_tcp_max_tw_buckets = NR_FILE*2; 39
40static void inet_twdr_hangman(unsigned long data);
41static void inet_twdr_twkill_work(void *data);
42static void inet_twdr_twcal_tick(unsigned long data);
40 43
41int sysctl_tcp_syncookies = SYNC_INIT; 44int sysctl_tcp_syncookies = SYNC_INIT;
42int sysctl_tcp_abort_on_overflow; 45int sysctl_tcp_abort_on_overflow;
43 46
44static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo); 47struct inet_timewait_death_row tcp_death_row = {
48 .sysctl_max_tw_buckets = NR_FILE * 2,
49 .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
50 .death_lock = SPIN_LOCK_UNLOCKED,
51 .hashinfo = &tcp_hashinfo,
52 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
53 (unsigned long)&tcp_death_row),
54 .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
55 inet_twdr_twkill_work,
56 &tcp_death_row),
57/* Short-time timewait calendar */
58
59 .twcal_hand = -1,
60 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
61 (unsigned long)&tcp_death_row),
62};
63
64EXPORT_SYMBOL_GPL(tcp_death_row);
65
66static void inet_twsk_schedule(struct inet_timewait_sock *tw,
67 struct inet_timewait_death_row *twdr,
68 const int timeo);
45 69
46static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 70static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
47{ 71{
@@ -52,10 +76,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
52 return (seq == e_win && seq == end_seq); 76 return (seq == e_win && seq == end_seq);
53} 77}
54 78
55/* New-style handling of TIME_WAIT sockets. */
56
57int tcp_tw_count;
58
59/* 79/*
60 * * Main purpose of TIME-WAIT state is to close connection gracefully, 80 * * Main purpose of TIME-WAIT state is to close connection gracefully,
61 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN 81 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -132,7 +152,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
132 if (!th->fin || 152 if (!th->fin ||
133 TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { 153 TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
134kill_with_rst: 154kill_with_rst:
135 tcp_tw_deschedule(tw); 155 inet_twsk_deschedule(tw, &tcp_death_row);
136 inet_twsk_put(tw); 156 inet_twsk_put(tw);
137 return TCP_TW_RST; 157 return TCP_TW_RST;
138 } 158 }
@@ -151,11 +171,11 @@ kill_with_rst:
151 * do not undertsnad recycling in any case, it not 171 * do not undertsnad recycling in any case, it not
152 * a big problem in practice. --ANK */ 172 * a big problem in practice. --ANK */
153 if (tw->tw_family == AF_INET && 173 if (tw->tw_family == AF_INET &&
154 sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp && 174 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
155 tcp_v4_tw_remember_stamp(tw)) 175 tcp_v4_tw_remember_stamp(tw))
156 tcp_tw_schedule(tw, tw->tw_timeout); 176 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout);
157 else 177 else
158 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 178 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN);
159 return TCP_TW_ACK; 179 return TCP_TW_ACK;
160 } 180 }
161 181
@@ -188,12 +208,12 @@ kill_with_rst:
188 */ 208 */
189 if (sysctl_tcp_rfc1337 == 0) { 209 if (sysctl_tcp_rfc1337 == 0) {
190kill: 210kill:
191 tcp_tw_deschedule(tw); 211 inet_twsk_deschedule(tw, &tcp_death_row);
192 inet_twsk_put(tw); 212 inet_twsk_put(tw);
193 return TCP_TW_SUCCESS; 213 return TCP_TW_SUCCESS;
194 } 214 }
195 } 215 }
196 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 216 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN);
197 217
198 if (tmp_opt.saw_tstamp) { 218 if (tmp_opt.saw_tstamp) {
199 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 219 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
@@ -243,7 +263,7 @@ kill:
243 * Do not reschedule in the last case. 263 * Do not reschedule in the last case.
244 */ 264 */
245 if (paws_reject || th->ack) 265 if (paws_reject || th->ack)
246 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 266 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN);
247 267
248 /* Send ACK. Note, we do not put the bucket, 268 /* Send ACK. Note, we do not put the bucket,
249 * it will be released by caller. 269 * it will be released by caller.
@@ -263,10 +283,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
263 const struct tcp_sock *tp = tcp_sk(sk); 283 const struct tcp_sock *tp = tcp_sk(sk);
264 int recycle_ok = 0; 284 int recycle_ok = 0;
265 285
266 if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) 286 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
267 recycle_ok = tp->af_specific->remember_stamp(sk); 287 recycle_ok = tp->af_specific->remember_stamp(sk);
268 288
269 if (tcp_tw_count < sysctl_tcp_max_tw_buckets) 289 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
270 tw = inet_twsk_alloc(sk, state); 290 tw = inet_twsk_alloc(sk, state);
271 291
272 if (tw != NULL) { 292 if (tw != NULL) {
@@ -306,7 +326,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
306 timeo = TCP_TIMEWAIT_LEN; 326 timeo = TCP_TIMEWAIT_LEN;
307 } 327 }
308 328
309 tcp_tw_schedule(tw, timeo); 329 inet_twsk_schedule(tw, &tcp_death_row, timeo);
310 inet_twsk_put(tw); 330 inet_twsk_put(tw);
311 } else { 331 } else {
312 /* Sorry, if we're out of memory, just CLOSE this 332 /* Sorry, if we're out of memory, just CLOSE this
@@ -321,26 +341,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
321 tcp_done(sk); 341 tcp_done(sk);
322} 342}
323 343
324/* Kill off TIME_WAIT sockets once their lifetime has expired. */
325static int tcp_tw_death_row_slot;
326
327static void tcp_twkill(unsigned long);
328
329/* TIME_WAIT reaping mechanism. */
330#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
331#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
332
333#define TCP_TWKILL_QUOTA 100
334
335static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
336static DEFINE_SPINLOCK(tw_death_lock);
337static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
338static void twkill_work(void *);
339static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
340static u32 twkill_thread_slots;
341
342/* Returns non-zero if quota exceeded. */ 344/* Returns non-zero if quota exceeded. */
343static int tcp_do_twkill_work(int slot, unsigned int quota) 345static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
346 const int slot)
344{ 347{
345 struct inet_timewait_sock *tw; 348 struct inet_timewait_sock *tw;
346 struct hlist_node *node; 349 struct hlist_node *node;
@@ -356,19 +359,19 @@ static int tcp_do_twkill_work(int slot, unsigned int quota)
356 killed = 0; 359 killed = 0;
357 ret = 0; 360 ret = 0;
358rescan: 361rescan:
359 inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { 362 inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
360 __inet_twsk_del_dead_node(tw); 363 __inet_twsk_del_dead_node(tw);
361 spin_unlock(&tw_death_lock); 364 spin_unlock(&twdr->death_lock);
362 __inet_twsk_kill(tw, &tcp_hashinfo); 365 __inet_twsk_kill(tw, twdr->hashinfo);
363 inet_twsk_put(tw); 366 inet_twsk_put(tw);
364 killed++; 367 killed++;
365 spin_lock(&tw_death_lock); 368 spin_lock(&twdr->death_lock);
366 if (killed > quota) { 369 if (killed > INET_TWDR_TWKILL_QUOTA) {
367 ret = 1; 370 ret = 1;
368 break; 371 break;
369 } 372 }
370 373
371 /* While we dropped tw_death_lock, another cpu may have 374 /* While we dropped twdr->death_lock, another cpu may have
372 * killed off the next TW bucket in the list, therefore 375 * killed off the next TW bucket in the list, therefore
373 * do a fresh re-read of the hlist head node with the 376 * do a fresh re-read of the hlist head node with the
374 * lock reacquired. We still use the hlist traversal 377 * lock reacquired. We still use the hlist traversal
@@ -377,67 +380,68 @@ rescan:
377 goto rescan; 380 goto rescan;
378 } 381 }
379 382
380 tcp_tw_count -= killed; 383 twdr->tw_count -= killed;
381 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); 384 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
382 385
383 return ret; 386 return ret;
384} 387}
385 388
386static void tcp_twkill(unsigned long dummy) 389static void inet_twdr_hangman(unsigned long data)
387{ 390{
388 int need_timer, ret; 391 struct inet_timewait_death_row *twdr;
392 int unsigned need_timer;
389 393
390 spin_lock(&tw_death_lock); 394 twdr = (struct inet_timewait_death_row *)data;
395 spin_lock(&twdr->death_lock);
391 396
392 if (tcp_tw_count == 0) 397 if (twdr->tw_count == 0)
393 goto out; 398 goto out;
394 399
395 need_timer = 0; 400 need_timer = 0;
396 ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); 401 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
397 if (ret) { 402 twdr->thread_slots |= (1 << twdr->slot);
398 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
399 mb(); 403 mb();
400 schedule_work(&tcp_twkill_work); 404 schedule_work(&twdr->twkill_work);
401 need_timer = 1; 405 need_timer = 1;
402 } else { 406 } else {
403 /* We purged the entire slot, anything left? */ 407 /* We purged the entire slot, anything left? */
404 if (tcp_tw_count) 408 if (twdr->tw_count)
405 need_timer = 1; 409 need_timer = 1;
406 } 410 }
407 tcp_tw_death_row_slot = 411 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
408 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
409 if (need_timer) 412 if (need_timer)
410 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); 413 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
411out: 414out:
412 spin_unlock(&tw_death_lock); 415 spin_unlock(&twdr->death_lock);
413} 416}
414 417
415extern void twkill_slots_invalid(void); 418extern void twkill_slots_invalid(void);
416 419
417static void twkill_work(void *dummy) 420static void inet_twdr_twkill_work(void *data)
418{ 421{
422 struct inet_timewait_death_row *twdr = data;
419 int i; 423 int i;
420 424
421 if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) 425 if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
422 twkill_slots_invalid(); 426 twkill_slots_invalid();
423 427
424 while (twkill_thread_slots) { 428 while (twdr->thread_slots) {
425 spin_lock_bh(&tw_death_lock); 429 spin_lock_bh(&twdr->death_lock);
426 for (i = 0; i < TCP_TWKILL_SLOTS; i++) { 430 for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
427 if (!(twkill_thread_slots & (1 << i))) 431 if (!(twdr->thread_slots & (1 << i)))
428 continue; 432 continue;
429 433
430 while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { 434 while (inet_twdr_do_twkill_work(twdr, i) != 0) {
431 if (need_resched()) { 435 if (need_resched()) {
432 spin_unlock_bh(&tw_death_lock); 436 spin_unlock_bh(&twdr->death_lock);
433 schedule(); 437 schedule();
434 spin_lock_bh(&tw_death_lock); 438 spin_lock_bh(&twdr->death_lock);
435 } 439 }
436 } 440 }
437 441
438 twkill_thread_slots &= ~(1 << i); 442 twdr->thread_slots &= ~(1 << i);
439 } 443 }
440 spin_unlock_bh(&tw_death_lock); 444 spin_unlock_bh(&twdr->death_lock);
441 } 445 }
442} 446}
443 447
@@ -446,28 +450,22 @@ static void twkill_work(void *dummy)
446 */ 450 */
447 451
448/* This is for handling early-kills of TIME_WAIT sockets. */ 452/* This is for handling early-kills of TIME_WAIT sockets. */
449void tcp_tw_deschedule(struct inet_timewait_sock *tw) 453void inet_twsk_deschedule(struct inet_timewait_sock *tw,
454 struct inet_timewait_death_row *twdr)
450{ 455{
451 spin_lock(&tw_death_lock); 456 spin_lock(&twdr->death_lock);
452 if (inet_twsk_del_dead_node(tw)) { 457 if (inet_twsk_del_dead_node(tw)) {
453 inet_twsk_put(tw); 458 inet_twsk_put(tw);
454 if (--tcp_tw_count == 0) 459 if (--twdr->tw_count == 0)
455 del_timer(&tcp_tw_timer); 460 del_timer(&twdr->tw_timer);
456 } 461 }
457 spin_unlock(&tw_death_lock); 462 spin_unlock(&twdr->death_lock);
458 __inet_twsk_kill(tw, &tcp_hashinfo); 463 __inet_twsk_kill(tw, twdr->hashinfo);
459} 464}
460 465
461/* Short-time timewait calendar */ 466static void inet_twsk_schedule(struct inet_timewait_sock *tw,
462 467 struct inet_timewait_death_row *twdr,
463static int tcp_twcal_hand = -1; 468 const int timeo)
464static int tcp_twcal_jiffie;
465static void tcp_twcal_tick(unsigned long);
466static struct timer_list tcp_twcal_timer =
467 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
468static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
469
470static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo)
471{ 469{
472 struct hlist_head *list; 470 struct hlist_head *list;
473 int slot; 471 int slot;
@@ -496,100 +494,106 @@ static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo)
496 * is greater than TS tick!) and detect old duplicates with help 494 * is greater than TS tick!) and detect old duplicates with help
497 * of PAWS. 495 * of PAWS.
498 */ 496 */
499 slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; 497 slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
500 498
501 spin_lock(&tw_death_lock); 499 spin_lock(&twdr->death_lock);
502 500
503 /* Unlink it, if it was scheduled */ 501 /* Unlink it, if it was scheduled */
504 if (inet_twsk_del_dead_node(tw)) 502 if (inet_twsk_del_dead_node(tw))
505 tcp_tw_count--; 503 twdr->tw_count--;
506 else 504 else
507 atomic_inc(&tw->tw_refcnt); 505 atomic_inc(&tw->tw_refcnt);
508 506
509 if (slot >= TCP_TW_RECYCLE_SLOTS) { 507 if (slot >= INET_TWDR_RECYCLE_SLOTS) {
510 /* Schedule to slow timer */ 508 /* Schedule to slow timer */
511 if (timeo >= TCP_TIMEWAIT_LEN) { 509 if (timeo >= TCP_TIMEWAIT_LEN) {
512 slot = TCP_TWKILL_SLOTS-1; 510 slot = INET_TWDR_TWKILL_SLOTS - 1;
513 } else { 511 } else {
514 slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; 512 slot = (timeo + twdr->period - 1) / twdr->period;
515 if (slot >= TCP_TWKILL_SLOTS) 513 if (slot >= INET_TWDR_TWKILL_SLOTS)
516 slot = TCP_TWKILL_SLOTS-1; 514 slot = INET_TWDR_TWKILL_SLOTS - 1;
517 } 515 }
518 tw->tw_ttd = jiffies + timeo; 516 tw->tw_ttd = jiffies + timeo;
519 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); 517 slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
520 list = &tcp_tw_death_row[slot]; 518 list = &twdr->cells[slot];
521 } else { 519 } else {
522 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); 520 tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
523 521
524 if (tcp_twcal_hand < 0) { 522 if (twdr->twcal_hand < 0) {
525 tcp_twcal_hand = 0; 523 twdr->twcal_hand = 0;
526 tcp_twcal_jiffie = jiffies; 524 twdr->twcal_jiffie = jiffies;
527 tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); 525 twdr->twcal_timer.expires = twdr->twcal_jiffie +
528 add_timer(&tcp_twcal_timer); 526 (slot << INET_TWDR_RECYCLE_TICK);
527 add_timer(&twdr->twcal_timer);
529 } else { 528 } else {
530 if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK))) 529 if (time_after(twdr->twcal_timer.expires,
531 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); 530 jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
532 slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); 531 mod_timer(&twdr->twcal_timer,
532 jiffies + (slot << INET_TWDR_RECYCLE_TICK));
533 slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
533 } 534 }
534 list = &tcp_twcal_row[slot]; 535 list = &twdr->twcal_row[slot];
535 } 536 }
536 537
537 hlist_add_head(&tw->tw_death_node, list); 538 hlist_add_head(&tw->tw_death_node, list);
538 539
539 if (tcp_tw_count++ == 0) 540 if (twdr->tw_count++ == 0)
540 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); 541 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
541 spin_unlock(&tw_death_lock); 542 spin_unlock(&twdr->death_lock);
542} 543}
543 544
544void tcp_twcal_tick(unsigned long dummy) 545void inet_twdr_twcal_tick(unsigned long data)
545{ 546{
547 struct inet_timewait_death_row *twdr;
546 int n, slot; 548 int n, slot;
547 unsigned long j; 549 unsigned long j;
548 unsigned long now = jiffies; 550 unsigned long now = jiffies;
549 int killed = 0; 551 int killed = 0;
550 int adv = 0; 552 int adv = 0;
551 553
552 spin_lock(&tw_death_lock); 554 twdr = (struct inet_timewait_death_row *)data;
553 if (tcp_twcal_hand < 0) 555
556 spin_lock(&twdr->death_lock);
557 if (twdr->twcal_hand < 0)
554 goto out; 558 goto out;
555 559
556 slot = tcp_twcal_hand; 560 slot = twdr->twcal_hand;
557 j = tcp_twcal_jiffie; 561 j = twdr->twcal_jiffie;
558 562
559 for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { 563 for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
560 if (time_before_eq(j, now)) { 564 if (time_before_eq(j, now)) {
561 struct hlist_node *node, *safe; 565 struct hlist_node *node, *safe;
562 struct inet_timewait_sock *tw; 566 struct inet_timewait_sock *tw;
563 567
564 inet_twsk_for_each_inmate_safe(tw, node, safe, 568 inet_twsk_for_each_inmate_safe(tw, node, safe,
565 &tcp_twcal_row[slot]) { 569 &twdr->twcal_row[slot]) {
566 __inet_twsk_del_dead_node(tw); 570 __inet_twsk_del_dead_node(tw);
567 __inet_twsk_kill(tw, &tcp_hashinfo); 571 __inet_twsk_kill(tw, twdr->hashinfo);
568 inet_twsk_put(tw); 572 inet_twsk_put(tw);
569 killed++; 573 killed++;
570 } 574 }
571 } else { 575 } else {
572 if (!adv) { 576 if (!adv) {
573 adv = 1; 577 adv = 1;
574 tcp_twcal_jiffie = j; 578 twdr->twcal_jiffie = j;
575 tcp_twcal_hand = slot; 579 twdr->twcal_hand = slot;
576 } 580 }
577 581
578 if (!hlist_empty(&tcp_twcal_row[slot])) { 582 if (!hlist_empty(&twdr->twcal_row[slot])) {
579 mod_timer(&tcp_twcal_timer, j); 583 mod_timer(&twdr->twcal_timer, j);
580 goto out; 584 goto out;
581 } 585 }
582 } 586 }
583 j += (1<<TCP_TW_RECYCLE_TICK); 587 j += 1 << INET_TWDR_RECYCLE_TICK;
584 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); 588 slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
585 } 589 }
586 tcp_twcal_hand = -1; 590 twdr->twcal_hand = -1;
587 591
588out: 592out:
589 if ((tcp_tw_count -= killed) == 0) 593 if ((twdr->tw_count -= killed) == 0)
590 del_timer(&tcp_tw_timer); 594 del_timer(&twdr->tw_timer);
591 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); 595 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
592 spin_unlock(&tw_death_lock); 596 spin_unlock(&twdr->death_lock);
593} 597}
594 598
595/* This is not only more efficient than what we used to do, it eliminates 599/* This is not only more efficient than what we used to do, it eliminates
@@ -929,4 +933,4 @@ EXPORT_SYMBOL(tcp_check_req);
929EXPORT_SYMBOL(tcp_child_process); 933EXPORT_SYMBOL(tcp_child_process);
930EXPORT_SYMBOL(tcp_create_openreq_child); 934EXPORT_SYMBOL(tcp_create_openreq_child);
931EXPORT_SYMBOL(tcp_timewait_state_process); 935EXPORT_SYMBOL(tcp_timewait_state_process);
932EXPORT_SYMBOL(tcp_tw_deschedule); 936EXPORT_SYMBOL(inet_twsk_deschedule);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0b51ec310ebe..1c21ad66cfad 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -521,7 +521,7 @@ unique:
521 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 521 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
522 } else if (tw) { 522 } else if (tw) {
523 /* Silly. Should hash-dance instead... */ 523 /* Silly. Should hash-dance instead... */
524 tcp_tw_deschedule(tw); 524 inet_twsk_deschedule(tw, &tcp_death_row);
525 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 525 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
526 526
527 inet_twsk_put(tw); 527 inet_twsk_put(tw);
@@ -611,7 +611,7 @@ ok:
611 spin_unlock(&head->lock); 611 spin_unlock(&head->lock);
612 612
613 if (tw) { 613 if (tw) {
614 tcp_tw_deschedule(tw); 614 inet_twsk_deschedule(tw, &tcp_death_row);
615 inet_twsk_put(tw); 615 inet_twsk_put(tw);
616 } 616 }
617 617
@@ -1820,8 +1820,9 @@ do_time_wait:
1820 1820
1821 sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); 1821 sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb));
1822 if (sk2 != NULL) { 1822 if (sk2 != NULL) {
1823 tcp_tw_deschedule((struct inet_timewait_sock *)sk); 1823 struct inet_timewait_sock *tw = inet_twsk(sk);
1824 inet_twsk_put((struct inet_timewait_sock *)sk); 1824 inet_twsk_deschedule(tw, &tcp_death_row);
1825 inet_twsk_put(tw);
1825 sk = sk2; 1826 sk = sk2;
1826 goto process; 1827 goto process;
1827 } 1828 }