diff options
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 401 |
1 files changed, 401 insertions, 0 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c new file mode 100644 index 000000000000..2712400a8bb8 --- /dev/null +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -0,0 +1,401 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * Support for INET connection oriented protocols. | ||
7 | * | ||
8 | * Authors: See the TCP sources | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public License | ||
12 | * as published by the Free Software Foundation; either version | ||
13 | * 2 of the License, or (at your option) any later version. | |
14 | */ | ||
15 | |||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/jhash.h> | ||
19 | |||
20 | #include <net/inet_connection_sock.h> | ||
21 | #include <net/inet_hashtables.h> | ||
22 | #include <net/inet_timewait_sock.h> | ||
23 | #include <net/ip.h> | ||
24 | #include <net/route.h> | ||
25 | #include <net/tcp_states.h> | ||
26 | |||
#if defined(INET_CSK_DEBUG)
/*
 * Message text for an unknown timer value; only built (and exported)
 * when INET_CSK_DEBUG is defined.
 */
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif /* INET_CSK_DEBUG */
31 | |||
/*
 * First and last local port number used when a socket asks for "any"
 * port.  For high-usage systems, use sysctl to change this to
 * 32768-61000.
 */
int sysctl_local_port_range[2] = {
	[0] = 1024,	/* first port of the range */
	[1] = 4999,	/* last port of the range */
};
38 | |||
39 | static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) | ||
40 | { | ||
41 | const u32 sk_rcv_saddr = inet_rcv_saddr(sk); | ||
42 | struct sock *sk2; | ||
43 | struct hlist_node *node; | ||
44 | int reuse = sk->sk_reuse; | ||
45 | |||
46 | sk_for_each_bound(sk2, node, &tb->owners) { | ||
47 | if (sk != sk2 && | ||
48 | !inet_v6_ipv6only(sk2) && | ||
49 | (!sk->sk_bound_dev_if || | ||
50 | !sk2->sk_bound_dev_if || | ||
51 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { | ||
52 | if (!reuse || !sk2->sk_reuse || | ||
53 | sk2->sk_state == TCP_LISTEN) { | ||
54 | const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); | ||
55 | if (!sk2_rcv_saddr || !sk_rcv_saddr || | ||
56 | sk2_rcv_saddr == sk_rcv_saddr) | ||
57 | break; | ||
58 | } | ||
59 | } | ||
60 | } | ||
61 | return node != NULL; | ||
62 | } | ||
63 | |||
64 | /* Obtain a reference to a local port for the given sock, | ||
65 | * if snum is zero it means select any available local port. | ||
66 | */ | ||
67 | int inet_csk_get_port(struct inet_hashinfo *hashinfo, | ||
68 | struct sock *sk, unsigned short snum) | ||
69 | { | ||
70 | struct inet_bind_hashbucket *head; | ||
71 | struct hlist_node *node; | ||
72 | struct inet_bind_bucket *tb; | ||
73 | int ret; | ||
74 | |||
75 | local_bh_disable(); | ||
76 | if (!snum) { | ||
77 | int low = sysctl_local_port_range[0]; | ||
78 | int high = sysctl_local_port_range[1]; | ||
79 | int remaining = (high - low) + 1; | ||
80 | int rover; | ||
81 | |||
82 | spin_lock(&hashinfo->portalloc_lock); | ||
83 | if (hashinfo->port_rover < low) | ||
84 | rover = low; | ||
85 | else | ||
86 | rover = hashinfo->port_rover; | ||
87 | do { | ||
88 | rover++; | ||
89 | if (rover > high) | ||
90 | rover = low; | ||
91 | head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; | ||
92 | spin_lock(&head->lock); | ||
93 | inet_bind_bucket_for_each(tb, node, &head->chain) | ||
94 | if (tb->port == rover) | ||
95 | goto next; | ||
96 | break; | ||
97 | next: | ||
98 | spin_unlock(&head->lock); | ||
99 | } while (--remaining > 0); | ||
100 | hashinfo->port_rover = rover; | ||
101 | spin_unlock(&hashinfo->portalloc_lock); | ||
102 | |||
103 | /* Exhausted local port range during search? It is not | ||
104 | * possible for us to be holding one of the bind hash | ||
105 | * locks if this test triggers, because if 'remaining' | ||
106 | * drops to zero, we broke out of the do/while loop at | ||
107 | * the top level, not from the 'break;' statement. | ||
108 | */ | ||
109 | ret = 1; | ||
110 | if (remaining <= 0) | ||
111 | goto fail; | ||
112 | |||
113 | /* OK, here is the one we will use. HEAD is | ||
114 | * non-NULL and we hold it's mutex. | ||
115 | */ | ||
116 | snum = rover; | ||
117 | } else { | ||
118 | head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; | ||
119 | spin_lock(&head->lock); | ||
120 | inet_bind_bucket_for_each(tb, node, &head->chain) | ||
121 | if (tb->port == snum) | ||
122 | goto tb_found; | ||
123 | } | ||
124 | tb = NULL; | ||
125 | goto tb_not_found; | ||
126 | tb_found: | ||
127 | if (!hlist_empty(&tb->owners)) { | ||
128 | if (sk->sk_reuse > 1) | ||
129 | goto success; | ||
130 | if (tb->fastreuse > 0 && | ||
131 | sk->sk_reuse && sk->sk_state != TCP_LISTEN) { | ||
132 | goto success; | ||
133 | } else { | ||
134 | ret = 1; | ||
135 | if (inet_csk_bind_conflict(sk, tb)) | ||
136 | goto fail_unlock; | ||
137 | } | ||
138 | } | ||
139 | tb_not_found: | ||
140 | ret = 1; | ||
141 | if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) | ||
142 | goto fail_unlock; | ||
143 | if (hlist_empty(&tb->owners)) { | ||
144 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) | ||
145 | tb->fastreuse = 1; | ||
146 | else | ||
147 | tb->fastreuse = 0; | ||
148 | } else if (tb->fastreuse && | ||
149 | (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) | ||
150 | tb->fastreuse = 0; | ||
151 | success: | ||
152 | if (!inet_csk(sk)->icsk_bind_hash) | ||
153 | inet_bind_hash(sk, tb, snum); | ||
154 | BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); | ||
155 | ret = 0; | ||
156 | |||
157 | fail_unlock: | ||
158 | spin_unlock(&head->lock); | ||
159 | fail: | ||
160 | local_bh_enable(); | ||
161 | return ret; | ||
162 | } | ||
163 | |||
164 | EXPORT_SYMBOL_GPL(inet_csk_get_port); | ||
165 | |||
166 | /* | ||
167 | * Wait for an incoming connection, avoid race conditions. This must be called | ||
168 | * with the socket locked. | ||
169 | */ | ||
170 | static int inet_csk_wait_for_connect(struct sock *sk, long timeo) | ||
171 | { | ||
172 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
173 | DEFINE_WAIT(wait); | ||
174 | int err; | ||
175 | |||
176 | /* | ||
177 | * True wake-one mechanism for incoming connections: only | ||
178 | * one process gets woken up, not the 'whole herd'. | ||
179 | * Since we do not 'race & poll' for established sockets | ||
180 | * anymore, the common case will execute the loop only once. | ||
181 | * | ||
182 | * Subtle issue: "add_wait_queue_exclusive()" will be added | ||
183 | * after any current non-exclusive waiters, and we know that | ||
184 | * it will always _stay_ after any new non-exclusive waiters | ||
185 | * because all non-exclusive waiters are added at the | ||
186 | * beginning of the wait-queue. As such, it's ok to "drop" | ||
187 | * our exclusiveness temporarily when we get woken up without | ||
188 | * having to remove and re-insert us on the wait queue. | ||
189 | */ | ||
190 | for (;;) { | ||
191 | prepare_to_wait_exclusive(sk->sk_sleep, &wait, | ||
192 | TASK_INTERRUPTIBLE); | ||
193 | release_sock(sk); | ||
194 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) | ||
195 | timeo = schedule_timeout(timeo); | ||
196 | lock_sock(sk); | ||
197 | err = 0; | ||
198 | if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) | ||
199 | break; | ||
200 | err = -EINVAL; | ||
201 | if (sk->sk_state != TCP_LISTEN) | ||
202 | break; | ||
203 | err = sock_intr_errno(timeo); | ||
204 | if (signal_pending(current)) | ||
205 | break; | ||
206 | err = -EAGAIN; | ||
207 | if (!timeo) | ||
208 | break; | ||
209 | } | ||
210 | finish_wait(sk->sk_sleep, &wait); | ||
211 | return err; | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * This will accept the next outstanding connection. | ||
216 | */ | ||
217 | struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | ||
218 | { | ||
219 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
220 | struct sock *newsk; | ||
221 | int error; | ||
222 | |||
223 | lock_sock(sk); | ||
224 | |||
225 | /* We need to make sure that this socket is listening, | ||
226 | * and that it has something pending. | ||
227 | */ | ||
228 | error = -EINVAL; | ||
229 | if (sk->sk_state != TCP_LISTEN) | ||
230 | goto out_err; | ||
231 | |||
232 | /* Find already established connection */ | ||
233 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { | ||
234 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | ||
235 | |||
236 | /* If this is a non blocking socket don't sleep */ | ||
237 | error = -EAGAIN; | ||
238 | if (!timeo) | ||
239 | goto out_err; | ||
240 | |||
241 | error = inet_csk_wait_for_connect(sk, timeo); | ||
242 | if (error) | ||
243 | goto out_err; | ||
244 | } | ||
245 | |||
246 | newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); | ||
247 | BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); | ||
248 | out: | ||
249 | release_sock(sk); | ||
250 | return newsk; | ||
251 | out_err: | ||
252 | newsk = NULL; | ||
253 | *err = error; | ||
254 | goto out; | ||
255 | } | ||
256 | |||
257 | EXPORT_SYMBOL(inet_csk_accept); | ||
258 | |||
259 | /* | ||
260 | * Using different timers for retransmit, delayed acks and probes | ||
261 | * We may wish use just one timer maintaining a list of expire jiffies | ||
262 | * to optimize. | ||
263 | */ | ||
264 | void inet_csk_init_xmit_timers(struct sock *sk, | ||
265 | void (*retransmit_handler)(unsigned long), | ||
266 | void (*delack_handler)(unsigned long), | ||
267 | void (*keepalive_handler)(unsigned long)) | ||
268 | { | ||
269 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
270 | |||
271 | init_timer(&icsk->icsk_retransmit_timer); | ||
272 | init_timer(&icsk->icsk_delack_timer); | ||
273 | init_timer(&sk->sk_timer); | ||
274 | |||
275 | icsk->icsk_retransmit_timer.function = retransmit_handler; | ||
276 | icsk->icsk_delack_timer.function = delack_handler; | ||
277 | sk->sk_timer.function = keepalive_handler; | ||
278 | |||
279 | icsk->icsk_retransmit_timer.data = | ||
280 | icsk->icsk_delack_timer.data = | ||
281 | sk->sk_timer.data = (unsigned long)sk; | ||
282 | |||
283 | icsk->icsk_pending = icsk->icsk_ack.pending = 0; | ||
284 | } | ||
285 | |||
286 | EXPORT_SYMBOL(inet_csk_init_xmit_timers); | ||
287 | |||
288 | void inet_csk_clear_xmit_timers(struct sock *sk) | ||
289 | { | ||
290 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
291 | |||
292 | icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; | ||
293 | |||
294 | sk_stop_timer(sk, &icsk->icsk_retransmit_timer); | ||
295 | sk_stop_timer(sk, &icsk->icsk_delack_timer); | ||
296 | sk_stop_timer(sk, &sk->sk_timer); | ||
297 | } | ||
298 | |||
299 | EXPORT_SYMBOL(inet_csk_clear_xmit_timers); | ||
300 | |||
301 | void inet_csk_delete_keepalive_timer(struct sock *sk) | ||
302 | { | ||
303 | sk_stop_timer(sk, &sk->sk_timer); | ||
304 | } | ||
305 | |||
306 | EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); | ||
307 | |||
308 | void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) | ||
309 | { | ||
310 | sk_reset_timer(sk, &sk->sk_timer, jiffies + len); | ||
311 | } | ||
312 | |||
313 | EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); | ||
314 | |||
315 | struct dst_entry* inet_csk_route_req(struct sock *sk, | ||
316 | const struct request_sock *req) | ||
317 | { | ||
318 | struct rtable *rt; | ||
319 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
320 | struct ip_options *opt = inet_rsk(req)->opt; | ||
321 | struct flowi fl = { .oif = sk->sk_bound_dev_if, | ||
322 | .nl_u = { .ip4_u = | ||
323 | { .daddr = ((opt && opt->srr) ? | ||
324 | opt->faddr : | ||
325 | ireq->rmt_addr), | ||
326 | .saddr = ireq->loc_addr, | ||
327 | .tos = RT_CONN_FLAGS(sk) } }, | ||
328 | .proto = sk->sk_protocol, | ||
329 | .uli_u = { .ports = | ||
330 | { .sport = inet_sk(sk)->sport, | ||
331 | .dport = ireq->rmt_port } } }; | ||
332 | |||
333 | if (ip_route_output_flow(&rt, &fl, sk, 0)) { | ||
334 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
335 | return NULL; | ||
336 | } | ||
337 | if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { | ||
338 | ip_rt_put(rt); | ||
339 | IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); | ||
340 | return NULL; | ||
341 | } | ||
342 | return &rt->u.dst; | ||
343 | } | ||
344 | |||
345 | EXPORT_SYMBOL_GPL(inet_csk_route_req); | ||
346 | |||
347 | static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, | ||
348 | const u32 rnd, const u16 synq_hsize) | ||
349 | { | ||
350 | return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); | ||
351 | } | ||
352 | |||
/*
 * AF_INET_FAMILY(fam): true when @fam is AF_INET.  Without IPv6 support
 * (neither built in nor as a module) the check is compiled out and the
 * macro is always 1.
 */
#if !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE)
#define AF_INET_FAMILY(fam) 1
#else
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#endif
358 | |||
359 | struct request_sock *inet_csk_search_req(const struct sock *sk, | ||
360 | struct request_sock ***prevp, | ||
361 | const __u16 rport, const __u32 raddr, | ||
362 | const __u32 laddr) | ||
363 | { | ||
364 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
365 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | ||
366 | struct request_sock *req, **prev; | ||
367 | |||
368 | for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, | ||
369 | lopt->nr_table_entries)]; | ||
370 | (req = *prev) != NULL; | ||
371 | prev = &req->dl_next) { | ||
372 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
373 | |||
374 | if (ireq->rmt_port == rport && | ||
375 | ireq->rmt_addr == raddr && | ||
376 | ireq->loc_addr == laddr && | ||
377 | AF_INET_FAMILY(req->rsk_ops->family)) { | ||
378 | BUG_TRAP(!req->sk); | ||
379 | *prevp = prev; | ||
380 | break; | ||
381 | } | ||
382 | } | ||
383 | |||
384 | return req; | ||
385 | } | ||
386 | |||
387 | EXPORT_SYMBOL_GPL(inet_csk_search_req); | ||
388 | |||
389 | void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, | ||
390 | const unsigned timeout) | ||
391 | { | ||
392 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
393 | struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; | ||
394 | const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, | ||
395 | lopt->hash_rnd, lopt->nr_table_entries); | ||
396 | |||
397 | reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); | ||
398 | inet_csk_reqsk_queue_added(sk, timeout); | ||
399 | } | ||
400 | |||
401 | EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); | ||