Diffstat (limited to 'net')
-rw-r--r--  net/ipv4/Makefile                   2
-rw-r--r--  net/ipv4/inet_connection_sock.c   401
-rw-r--r--  net/ipv4/tcp.c                     93
-rw-r--r--  net/ipv4/tcp_input.c               10
-rw-r--r--  net/ipv4/tcp_ipv4.c               210
-rw-r--r--  net/ipv4/tcp_output.c              19
-rw-r--r--  net/ipv4/tcp_timer.c               65
7 files changed, 427 insertions, 373 deletions
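
Nearly every hunk below follows one mechanical pattern: inet_csk_reset_xmit_timer() gains an extra trailing argument, and each TCP call site passes TCP_RTO_MAX for it. A minimal sketch of the call-site change, for orientation only (the new argument is read here as an upper bound on the timeout; this sketch is not itself part of the patch):

	/* old call: timeout only */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto);

	/* new call: same timeout plus an explicit cap; TCP passes TCP_RTO_MAX
	 * at every call site in the hunks below */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);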
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 6650d18e400f..ea0e1d87dc7e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,7 +5,7 @@
 obj-y     := route.o inetpeer.o protocol.o \
	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
	     ip_output.o ip_sockglue.o inet_hashtables.o \
-	     inet_timewait_sock.o \
+	     inet_timewait_sock.o inet_connection_sock.o \
	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
	     tcp_minisocks.o tcp_cong.o \
	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000000..2712400a8bb8
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,401 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Support for INET connection oriented protocols.
7 *
8 * Authors: See the TCP sources
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or(at your option) any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/jhash.h>
19
20#include <net/inet_connection_sock.h>
21#include <net/inet_hashtables.h>
22#include <net/inet_timewait_sock.h>
23#include <net/ip.h>
24#include <net/route.h>
25#include <net/tcp_states.h>
26
27#ifdef INET_CSK_DEBUG
28const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
29EXPORT_SYMBOL(inet_csk_timer_bug_msg);
30#endif
31
32/*
33 * This array holds the first and last local port number.
34 * For high-usage systems, use sysctl to change this to
35 * 32768-61000
36 */
37int sysctl_local_port_range[2] = { 1024, 4999 };
38
39static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
40{
41 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
42 struct sock *sk2;
43 struct hlist_node *node;
44 int reuse = sk->sk_reuse;
45
46 sk_for_each_bound(sk2, node, &tb->owners) {
47 if (sk != sk2 &&
48 !inet_v6_ipv6only(sk2) &&
49 (!sk->sk_bound_dev_if ||
50 !sk2->sk_bound_dev_if ||
51 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
52 if (!reuse || !sk2->sk_reuse ||
53 sk2->sk_state == TCP_LISTEN) {
54 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
55 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
56 sk2_rcv_saddr == sk_rcv_saddr)
57 break;
58 }
59 }
60 }
61 return node != NULL;
62}
63
64/* Obtain a reference to a local port for the given sock,
65 * if snum is zero it means select any available local port.
66 */
67int inet_csk_get_port(struct inet_hashinfo *hashinfo,
68 struct sock *sk, unsigned short snum)
69{
70 struct inet_bind_hashbucket *head;
71 struct hlist_node *node;
72 struct inet_bind_bucket *tb;
73 int ret;
74
75 local_bh_disable();
76 if (!snum) {
77 int low = sysctl_local_port_range[0];
78 int high = sysctl_local_port_range[1];
79 int remaining = (high - low) + 1;
80 int rover;
81
82 spin_lock(&hashinfo->portalloc_lock);
83 if (hashinfo->port_rover < low)
84 rover = low;
85 else
86 rover = hashinfo->port_rover;
87 do {
88 rover++;
89 if (rover > high)
90 rover = low;
91 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
92 spin_lock(&head->lock);
93 inet_bind_bucket_for_each(tb, node, &head->chain)
94 if (tb->port == rover)
95 goto next;
96 break;
97 next:
98 spin_unlock(&head->lock);
99 } while (--remaining > 0);
100 hashinfo->port_rover = rover;
101 spin_unlock(&hashinfo->portalloc_lock);
102
103 /* Exhausted local port range during search? It is not
104 * possible for us to be holding one of the bind hash
105 * locks if this test triggers, because if 'remaining'
106 * drops to zero, we broke out of the do/while loop at
107 * the top level, not from the 'break;' statement.
108 */
109 ret = 1;
110 if (remaining <= 0)
111 goto fail;
112
113 /* OK, here is the one we will use. HEAD is
114 * non-NULL and we hold it's mutex.
115 */
116 snum = rover;
117 } else {
118 head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
119 spin_lock(&head->lock);
120 inet_bind_bucket_for_each(tb, node, &head->chain)
121 if (tb->port == snum)
122 goto tb_found;
123 }
124 tb = NULL;
125 goto tb_not_found;
126tb_found:
127 if (!hlist_empty(&tb->owners)) {
128 if (sk->sk_reuse > 1)
129 goto success;
130 if (tb->fastreuse > 0 &&
131 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
132 goto success;
133 } else {
134 ret = 1;
135 if (inet_csk_bind_conflict(sk, tb))
136 goto fail_unlock;
137 }
138 }
139tb_not_found:
140 ret = 1;
141 if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
142 goto fail_unlock;
143 if (hlist_empty(&tb->owners)) {
144 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
145 tb->fastreuse = 1;
146 else
147 tb->fastreuse = 0;
148 } else if (tb->fastreuse &&
149 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
150 tb->fastreuse = 0;
151success:
152 if (!inet_csk(sk)->icsk_bind_hash)
153 inet_bind_hash(sk, tb, snum);
154 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
155 ret = 0;
156
157fail_unlock:
158 spin_unlock(&head->lock);
159fail:
160 local_bh_enable();
161 return ret;
162}
163
164EXPORT_SYMBOL_GPL(inet_csk_get_port);
165
166/*
167 * Wait for an incoming connection, avoid race conditions. This must be called
168 * with the socket locked.
169 */
170static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
171{
172 struct inet_connection_sock *icsk = inet_csk(sk);
173 DEFINE_WAIT(wait);
174 int err;
175
176 /*
177 * True wake-one mechanism for incoming connections: only
178 * one process gets woken up, not the 'whole herd'.
179 * Since we do not 'race & poll' for established sockets
180 * anymore, the common case will execute the loop only once.
181 *
182 * Subtle issue: "add_wait_queue_exclusive()" will be added
183 * after any current non-exclusive waiters, and we know that
184 * it will always _stay_ after any new non-exclusive waiters
185 * because all non-exclusive waiters are added at the
186 * beginning of the wait-queue. As such, it's ok to "drop"
187 * our exclusiveness temporarily when we get woken up without
188 * having to remove and re-insert us on the wait queue.
189 */
190 for (;;) {
191 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
192 TASK_INTERRUPTIBLE);
193 release_sock(sk);
194 if (reqsk_queue_empty(&icsk->icsk_accept_queue))
195 timeo = schedule_timeout(timeo);
196 lock_sock(sk);
197 err = 0;
198 if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
199 break;
200 err = -EINVAL;
201 if (sk->sk_state != TCP_LISTEN)
202 break;
203 err = sock_intr_errno(timeo);
204 if (signal_pending(current))
205 break;
206 err = -EAGAIN;
207 if (!timeo)
208 break;
209 }
210 finish_wait(sk->sk_sleep, &wait);
211 return err;
212}
213
214/*
215 * This will accept the next outstanding connection.
216 */
217struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
218{
219 struct inet_connection_sock *icsk = inet_csk(sk);
220 struct sock *newsk;
221 int error;
222
223 lock_sock(sk);
224
225 /* We need to make sure that this socket is listening,
226 * and that it has something pending.
227 */
228 error = -EINVAL;
229 if (sk->sk_state != TCP_LISTEN)
230 goto out_err;
231
232 /* Find already established connection */
233 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
234 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
235
236 /* If this is a non blocking socket don't sleep */
237 error = -EAGAIN;
238 if (!timeo)
239 goto out_err;
240
241 error = inet_csk_wait_for_connect(sk, timeo);
242 if (error)
243 goto out_err;
244 }
245
246 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
247 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
248out:
249 release_sock(sk);
250 return newsk;
251out_err:
252 newsk = NULL;
253 *err = error;
254 goto out;
255}
256
257EXPORT_SYMBOL(inet_csk_accept);
258
259/*
260 * Using different timers for retransmit, delayed acks and probes
261 * We may wish use just one timer maintaining a list of expire jiffies
262 * to optimize.
263 */
264void inet_csk_init_xmit_timers(struct sock *sk,
265 void (*retransmit_handler)(unsigned long),
266 void (*delack_handler)(unsigned long),
267 void (*keepalive_handler)(unsigned long))
268{
269 struct inet_connection_sock *icsk = inet_csk(sk);
270
271 init_timer(&icsk->icsk_retransmit_timer);
272 init_timer(&icsk->icsk_delack_timer);
273 init_timer(&sk->sk_timer);
274
275 icsk->icsk_retransmit_timer.function = retransmit_handler;
276 icsk->icsk_delack_timer.function = delack_handler;
277 sk->sk_timer.function = keepalive_handler;
278
279 icsk->icsk_retransmit_timer.data =
280 icsk->icsk_delack_timer.data =
281 sk->sk_timer.data = (unsigned long)sk;
282
283 icsk->icsk_pending = icsk->icsk_ack.pending = 0;
284}
285
286EXPORT_SYMBOL(inet_csk_init_xmit_timers);
287
288void inet_csk_clear_xmit_timers(struct sock *sk)
289{
290 struct inet_connection_sock *icsk = inet_csk(sk);
291
292 icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
293
294 sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
295 sk_stop_timer(sk, &icsk->icsk_delack_timer);
296 sk_stop_timer(sk, &sk->sk_timer);
297}
298
299EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
300
301void inet_csk_delete_keepalive_timer(struct sock *sk)
302{
303 sk_stop_timer(sk, &sk->sk_timer);
304}
305
306EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
307
308void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
309{
310 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
311}
312
313EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
314
315struct dst_entry* inet_csk_route_req(struct sock *sk,
316 const struct request_sock *req)
317{
318 struct rtable *rt;
319 const struct inet_request_sock *ireq = inet_rsk(req);
320 struct ip_options *opt = inet_rsk(req)->opt;
321 struct flowi fl = { .oif = sk->sk_bound_dev_if,
322 .nl_u = { .ip4_u =
323 { .daddr = ((opt && opt->srr) ?
324 opt->faddr :
325 ireq->rmt_addr),
326 .saddr = ireq->loc_addr,
327 .tos = RT_CONN_FLAGS(sk) } },
328 .proto = sk->sk_protocol,
329 .uli_u = { .ports =
330 { .sport = inet_sk(sk)->sport,
331 .dport = ireq->rmt_port } } };
332
333 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
334 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
335 return NULL;
336 }
337 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
338 ip_rt_put(rt);
339 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
340 return NULL;
341 }
342 return &rt->u.dst;
343}
344
345EXPORT_SYMBOL_GPL(inet_csk_route_req);
346
347static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
348 const u32 rnd, const u16 synq_hsize)
349{
350 return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
351}
352
353#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
354#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
355#else
356#define AF_INET_FAMILY(fam) 1
357#endif
358
359struct request_sock *inet_csk_search_req(const struct sock *sk,
360 struct request_sock ***prevp,
361 const __u16 rport, const __u32 raddr,
362 const __u32 laddr)
363{
364 const struct inet_connection_sock *icsk = inet_csk(sk);
365 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
366 struct request_sock *req, **prev;
367
368 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
369 lopt->nr_table_entries)];
370 (req = *prev) != NULL;
371 prev = &req->dl_next) {
372 const struct inet_request_sock *ireq = inet_rsk(req);
373
374 if (ireq->rmt_port == rport &&
375 ireq->rmt_addr == raddr &&
376 ireq->loc_addr == laddr &&
377 AF_INET_FAMILY(req->rsk_ops->family)) {
378 BUG_TRAP(!req->sk);
379 *prevp = prev;
380 break;
381 }
382 }
383
384 return req;
385}
386
387EXPORT_SYMBOL_GPL(inet_csk_search_req);
388
389void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
390 const unsigned timeout)
391{
392 struct inet_connection_sock *icsk = inet_csk(sk);
393 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
394 const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
395 lopt->hash_rnd, lopt->nr_table_entries);
396
397 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
398 inet_csk_reqsk_queue_added(sk, timeout);
399}
400
401EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
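
With the generic helpers above in place, the rest of the patch reduces the protocol-specific entry points to thin wrappers around the inet_csk_*() API. For orientation, a sketch of that pattern as it appears in the tcp_ipv4.c and tcp_timer.c hunks further down:

	static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
	{
		/* bind-bucket and local-port allocation now live in
		 * inet_csk_get_port() above */
		return inet_csk_get_port(&tcp_hashinfo, sk, snum);
	}

	void tcp_init_xmit_timers(struct sock *sk)
	{
		/* retransmit, delayed-ACK and keepalive timers are set up by
		 * the generic helper; TCP only supplies its handlers */
		inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
					  &tcp_keepalive_timer);
	}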
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8177b86570db..581016a6a93f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1805,98 +1805,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 }
 
 /*
- * Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket locked.
- */
-static int wait_for_connect(struct sock *sk, long timeo)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	DEFINE_WAIT(wait);
-	int err;
-
-	/*
-	 * True wake-one mechanism for incoming connections: only
-	 * one process gets woken up, not the 'whole herd'.
-	 * Since we do not 'race & poll' for established sockets
-	 * anymore, the common case will execute the loop only once.
-	 *
-	 * Subtle issue: "add_wait_queue_exclusive()" will be added
-	 * after any current non-exclusive waiters, and we know that
-	 * it will always _stay_ after any new non-exclusive waiters
-	 * because all non-exclusive waiters are added at the
-	 * beginning of the wait-queue. As such, it's ok to "drop"
-	 * our exclusiveness temporarily when we get woken up without
-	 * having to remove and re-insert us on the wait queue.
-	 */
-	for (;;) {
-		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
-					  TASK_INTERRUPTIBLE);
-		release_sock(sk);
-		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
-			timeo = schedule_timeout(timeo);
-		lock_sock(sk);
-		err = 0;
-		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
-			break;
-		err = -EINVAL;
-		if (sk->sk_state != TCP_LISTEN)
-			break;
-		err = sock_intr_errno(timeo);
-		if (signal_pending(current))
-			break;
-		err = -EAGAIN;
-		if (!timeo)
-			break;
-	}
-	finish_wait(sk->sk_sleep, &wait);
-	return err;
-}
-
-/*
- * This will accept the next outstanding connection.
- */
-
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct sock *newsk;
-	int error;
-
-	lock_sock(sk);
-
-	/* We need to make sure that this socket is listening,
-	 * and that it has something pending.
-	 */
-	error = -EINVAL;
-	if (sk->sk_state != TCP_LISTEN)
-		goto out_err;
-
-	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
-		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
-		/* If this is a non blocking socket don't sleep */
-		error = -EAGAIN;
-		if (!timeo)
-			goto out_err;
-
-		error = wait_for_connect(sk, timeo);
-		if (error)
-			goto out_err;
-	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
-out:
-	release_sock(sk);
-	return newsk;
-out_err:
-	newsk = NULL;
-	*err = error;
-	goto out;
-}
-
-/*
  * Socket option code for TCP.
  */
 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
@@ -2344,7 +2252,6 @@ void __init tcp_init(void)
 	tcp_register_congestion_control(&tcp_reno);
 }
 
-EXPORT_SYMBOL(inet_csk_accept);
 EXPORT_SYMBOL(tcp_close);
 EXPORT_SYMBOL(tcp_destroy_sock);
 EXPORT_SYMBOL(tcp_disconnect);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8a8c5c2d90cb..b35badf53aa5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1278,7 +1278,7 @@ static int tcp_check_sack_reneging(struct sock *sk)
 		inet_csk(sk)->icsk_retransmits++;
 		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  inet_csk(sk)->icsk_rto);
+					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 		return 1;
 	}
 	return 0;
@@ -1961,7 +1961,7 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	}
 }
 
@@ -2147,7 +2147,8 @@ static void tcp_ack_probe(struct sock *sk)
 	 */
 	} else {
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX));
+					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
 	}
 }
 
@@ -3968,7 +3969,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
 		tcp_incr_quickack(sk);
 		tcp_enter_quickack_mode(sk);
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					  TCP_DELACK_MAX, TCP_RTO_MAX);
 
 discard:
 	__kfree_skb(skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2cd41265d17f..2f605b9e6b67 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,138 +97,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 	.port_rover	= 1024 - 1,
 };
 
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
- */
-int sysctl_local_port_range[2] = { 1024, 4999 };
-
-static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
-{
-	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
-	struct sock *sk2;
-	struct hlist_node *node;
-	int reuse = sk->sk_reuse;
-
-	sk_for_each_bound(sk2, node, &tb->owners) {
-		if (sk != sk2 &&
-		    !inet_v6_ipv6only(sk2) &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-			if (!reuse || !sk2->sk_reuse ||
-			    sk2->sk_state == TCP_LISTEN) {
-				const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
-				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
-				    sk2_rcv_saddr == sk_rcv_saddr)
-					break;
-			}
-		}
-	}
-	return node != NULL;
-}
-
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
-int inet_csk_get_port(struct inet_hashinfo *hashinfo,
-		      struct sock *sk, unsigned short snum)
-{
-	struct inet_bind_hashbucket *head;
-	struct hlist_node *node;
-	struct inet_bind_bucket *tb;
-	int ret;
-
-	local_bh_disable();
-	if (!snum) {
-		int low = sysctl_local_port_range[0];
-		int high = sysctl_local_port_range[1];
-		int remaining = (high - low) + 1;
-		int rover;
-
-		spin_lock(&hashinfo->portalloc_lock);
-		if (hashinfo->port_rover < low)
-			rover = low;
-		else
-			rover = hashinfo->port_rover;
-		do {
-			rover++;
-			if (rover > high)
-				rover = low;
-			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (tb->port == rover)
-					goto next;
-			break;
-		next:
-			spin_unlock(&head->lock);
-		} while (--remaining > 0);
-		hashinfo->port_rover = rover;
-		spin_unlock(&hashinfo->portalloc_lock);
-
-		/* Exhausted local port range during search? It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (unlikely(remaining <= 0))
-			goto fail;
-
-		/* OK, here is the one we will use. HEAD is
-		 * non-NULL and we hold it's mutex.
-		 */
-		snum = rover;
-	} else {
-		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (tb->port == snum)
-				goto tb_found;
-	}
-	tb = NULL;
-	goto tb_not_found;
-tb_found:
-	if (!hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse > 1)
-			goto success;
-		if (tb->fastreuse > 0 &&
-		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
-			goto success;
-		} else {
-			ret = 1;
-			if (inet_csk_bind_conflict(sk, tb))
-				goto fail_unlock;
-		}
-	}
-tb_not_found:
-	ret = 1;
-	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
-		goto fail_unlock;
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
-			tb->fastreuse = 0;
-	} else if (tb->fastreuse &&
-		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-		tb->fastreuse = 0;
-success:
-	if (!inet_csk(sk)->icsk_bind_hash)
-		inet_bind_hash(sk, tb, snum);
-	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
-	ret = 0;
-
-fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
-	return ret;
-}
-
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 {
 	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
@@ -568,52 +436,6 @@ static inline int inet_iif(const struct sk_buff *skb)
 	return ((struct rtable *)skb->dst)->rt_iif;
 }
 
-static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
-				 const u32 rnd, const u16 synq_hsize)
-{
-	return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
-}
-
-struct request_sock *inet_csk_search_req(const struct sock *sk,
-					 struct request_sock ***prevp,
-					 const __u16 rport, const __u32 raddr,
-					 const __u32 laddr)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	struct request_sock *req, **prev;
-
-	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
-						    lopt->nr_table_entries)];
-	     (req = *prev) != NULL;
-	     prev = &req->dl_next) {
-		const struct inet_request_sock *ireq = inet_rsk(req);
-
-		if (ireq->rmt_port == rport &&
-		    ireq->rmt_addr == raddr &&
-		    ireq->loc_addr == laddr &&
-		    AF_INET_FAMILY(req->rsk_ops->family)) {
-			BUG_TRAP(!req->sk);
-			*prevp = prev;
-			break;
-		}
-	}
-
-	return req;
-}
-
-static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
-				     lopt->hash_rnd, lopt->nr_table_entries);
-
-	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
-	inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
-}
-
-
 /*
  * This routine does path mtu discovery as defined in RFC1191.
  */
@@ -963,36 +785,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 			      req->ts_recent);
 }
 
-struct dst_entry* inet_csk_route_req(struct sock *sk,
-				     const struct request_sock *req)
-{
-	struct rtable *rt;
-	const struct inet_request_sock *ireq = inet_rsk(req);
-	struct ip_options *opt = inet_rsk(req)->opt;
-	struct flowi fl = { .oif = sk->sk_bound_dev_if,
-			    .nl_u = { .ip4_u =
-				      { .daddr = ((opt && opt->srr) ?
-						  opt->faddr :
-						  ireq->rmt_addr),
-					.saddr = ireq->loc_addr,
-					.tos = RT_CONN_FLAGS(sk) } },
-			    .proto = sk->sk_protocol,
-			    .uli_u = { .ports =
-				       { .sport = inet_sk(sk)->sport,
-					 .dport = ireq->rmt_port } } };
-
-	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
-		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-		return NULL;
-	}
-	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
-		ip_rt_put(rt);
-		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-		return NULL;
-	}
-	return &rt->u.dst;
-}
-
 /*
  * Send a SYN-ACK after having received an ACK.
  * This still operates on a request_sock only, not on a big
@@ -1222,7 +1014,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (want_cookie) {
 		reqsk_free(req);
 	} else {
-		tcp_v4_synq_add(sk, req);
+		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
 	return 0;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6f0a7e30ceac..f458eacb5ef2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1493,7 +1493,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 				if (skb ==
 				    skb_peek(&sk->sk_write_queue))
 					inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-								  inet_csk(sk)->icsk_rto);
+								  inet_csk(sk)->icsk_rto,
+								  TCP_RTO_MAX);
 			}
 
 			packet_cnt -= tcp_skb_pcount(skb);
@@ -1546,7 +1547,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 				break;
 
 			if (skb == skb_peek(&sk->sk_write_queue))
-				inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto);
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+							  inet_csk(sk)->icsk_rto,
+							  TCP_RTO_MAX);
 
 			NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
 		}
@@ -1826,7 +1829,8 @@ int tcp_connect(struct sock *sk)
 	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the SYN until an answer. */
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	return 0;
 }
 
@@ -1901,7 +1905,8 @@ void tcp_send_ack(struct sock *sk)
 	if (buff == NULL) {
 		inet_csk_schedule_ack(sk);
 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					  TCP_DELACK_MAX, TCP_RTO_MAX);
 		return;
 	}
 
@@ -2033,7 +2038,8 @@ void tcp_send_probe0(struct sock *sk)
 		icsk->icsk_backoff++;
 		tp->probes_out++;
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX));
+					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
 	} else {
 		/* If packet was not sent due to local congestion,
 		 * do not backoff and do not remember probes_out.
@@ -2045,7 +2051,8 @@ void tcp_send_probe0(struct sock *sk)
 			tp->probes_out=1;
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
 					  min(icsk->icsk_rto << icsk->icsk_backoff,
-					      TCP_RESOURCE_PROBE_INTERVAL));
+					      TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 	}
 }
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0b71380ee42f..c03930c48f42 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,55 +36,14 @@ static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
 
-#ifdef INET_CSK_DEBUG
-const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
-EXPORT_SYMBOL(inet_csk_timer_bug_msg);
-#endif
-
-/*
- * Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies
- * to optimize.
- */
-void inet_csk_init_xmit_timers(struct sock *sk,
-			       void (*retransmit_handler)(unsigned long),
-			       void (*delack_handler)(unsigned long),
-			       void (*keepalive_handler)(unsigned long))
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	init_timer(&icsk->icsk_retransmit_timer);
-	init_timer(&icsk->icsk_delack_timer);
-	init_timer(&sk->sk_timer);
-
-	icsk->icsk_retransmit_timer.function = retransmit_handler;
-	icsk->icsk_delack_timer.function = delack_handler;
-	sk->sk_timer.function = keepalive_handler;
-
-	icsk->icsk_retransmit_timer.data =
-		icsk->icsk_delack_timer.data =
-			sk->sk_timer.data = (unsigned long)sk;
-
-	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
-}
-
-void inet_csk_clear_xmit_timers(struct sock *sk)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
-
-	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
-	sk_stop_timer(sk, &icsk->icsk_delack_timer);
-	sk_stop_timer(sk, &sk->sk_timer);
-}
-
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
 }
 
+EXPORT_SYMBOL(tcp_init_xmit_timers);
+
 static void tcp_write_err(struct sock *sk)
 {
 	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -392,7 +351,8 @@ static void tcp_retransmit_timer(struct sock *sk)
 		if (!icsk->icsk_retransmits)
 			icsk->icsk_retransmits = 1;
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL));
+					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 		goto out;
 	}
 
@@ -416,7 +376,7 @@ static void tcp_retransmit_timer(struct sock *sk)
 
 out_reset_timer:
 	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
 	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
 		__sk_dst_reset(sk);
 
@@ -553,16 +513,6 @@ static void tcp_synack_timer(struct sock *sk)
 	inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
 }
 
-void inet_csk_delete_keepalive_timer(struct sock *sk)
-{
-	sk_stop_timer(sk, &sk->sk_timer);
-}
-
-void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
-{
-	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
-}
-
 void tcp_set_keepalive(struct sock *sk, int val)
 {
 	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
@@ -653,8 +603,3 @@ out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
-
-EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
-EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);