Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--   net/ipv4/tcp_ipv4.c   2663
1 files changed, 2663 insertions, 0 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 000000000000..3ac6659869c4
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2663 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * open_request handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/tcp.h>
68#include <net/ipv6.h>
69#include <net/inet_common.h>
70#include <net/xfrm.h>
71
72#include <linux/inet.h>
73#include <linux/ipv6.h>
74#include <linux/stddef.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77
78extern int sysctl_ip_dynaddr;
79int sysctl_tcp_tw_reuse;
80int sysctl_tcp_low_latency;
81
82/* Check TCP sequence numbers in ICMP packets. */
83#define ICMP_MIN_LENGTH 8
84
85/* Socket used for sending RSTs */
86static struct socket *tcp_socket;
87
88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 struct sk_buff *skb);
90
91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
94 .__tcp_lhash_wait
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
97};
98
99/*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
104int sysctl_local_port_range[2] = { 1024, 4999 };
105int tcp_port_rover = 1024 - 1;
106
107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
109{
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
111 h ^= h >> 16;
112 h ^= h >> 8;
113 return h & (tcp_ehash_size - 1);
114}
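
A minimal stand-alone sketch (illustrative only, not part of this file) of how tcp_hashfn() above folds a connection 4-tuple into a bucket index; the 512-entry table size and the sample addresses are assumptions, since the real tcp_ehash_size is sized at boot and is always a power of two.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t laddr = 0xc0a80001;	/* 192.168.0.1, assumed */
	uint16_t lport = 80;
	uint32_t faddr = 0xc0a80002;	/* 192.168.0.2, assumed */
	uint16_t fport = 34567;
	uint32_t ehash_size = 512;	/* assumed power-of-two table size */
	uint32_t h;

	/* Same folding as tcp_hashfn(): XOR the tuple, then mix the
	 * high bits down before masking to the table size. */
	h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	printf("ehash bucket = %u\n", h & (ehash_size - 1));
	return 0;
}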
115
116static __inline__ int tcp_sk_hashfn(struct sock *sk)
117{
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
123
124 return tcp_hashfn(laddr, lport, faddr, fport);
125}
126
127/* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
129 */
130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 unsigned short snum)
132{
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 SLAB_ATOMIC);
135 if (tb) {
136 tb->port = snum;
137 tb->fastreuse = 0;
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
140 }
141 return tb;
142}
143
144/* Caller must hold hashbucket lock for this tb with local BH disabled */
145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146{
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
150 }
151}
152
153/* Caller must disable local BH processing. */
154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155{
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
159
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
165}
166
167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168{
169 local_bh_disable();
170 __tcp_inherit_port(sk, child);
171 local_bh_enable();
172}
173
174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 unsigned short snum)
176{
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
180}
181
182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183{
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 struct sock *sk2;
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
188
189 sk_for_each_bound(sk2, node, &tb->owners) {
190 if (sk != sk2 &&
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
200 break;
201 }
202 }
203 }
204 return node != NULL;
205}
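
As a user-space view of the rule tcp_bind_conflict() enforces, here is a hedged sketch (illustrative only, not kernel code): two sockets may share a local port only if every owner sets SO_REUSEADDR and none of them is listening, or their bound addresses do not overlap. Port 8080 is an arbitrary assumption.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int one = 1;
	int a = socket(AF_INET, SOCK_STREAM, 0);
	int b = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8080);		/* assumed free port */
	sin.sin_addr.s_addr = htonl(INADDR_ANY);

	setsockopt(a, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	setsockopt(b, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	/* Both binds succeed: every owner has SO_REUSEADDR set and
	 * nobody is in TCP_LISTEN, so the conflict check is skipped. */
	printf("first bind:  %d\n", bind(a, (struct sockaddr *)&sin, sizeof(sin)));
	printf("second bind: %d\n", bind(b, (struct sockaddr *)&sin, sizeof(sin)));
	/* Call listen(a, 5) before the second bind() and it fails with
	 * EADDRINUSE instead: a listening owner with an overlapping
	 * address always conflicts, SO_REUSEADDR or not. */
	return 0;
}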
206
207/* Obtain a reference to a local port for the given sock,
208 * if snum is zero it means select any available local port.
209 */
210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211{
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
215 int ret;
216
217 local_bh_disable();
218 if (!snum) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
223
224 spin_lock(&tcp_portalloc_lock);
225 rover = tcp_port_rover;
226 do {
227 rover++;
228 if (rover < low || rover > high)
229 rover = low;
230 head = &tcp_bhash[tcp_bhashfn(rover)];
231 spin_lock(&head->lock);
232 tb_for_each(tb, node, &head->chain)
233 if (tb->port == rover)
234 goto next;
235 break;
236 next:
237 spin_unlock(&head->lock);
238 } while (--remaining > 0);
239 tcp_port_rover = rover;
240 spin_unlock(&tcp_portalloc_lock);
241
242 /* Exhausted local port range during search? */
243 ret = 1;
244 if (remaining <= 0)
245 goto fail;
246
247 /* OK, here is the one we will use. HEAD is
248 * non-NULL and we hold its mutex.
249 */
250 snum = rover;
251 } else {
252 head = &tcp_bhash[tcp_bhashfn(snum)];
253 spin_lock(&head->lock);
254 tb_for_each(tb, node, &head->chain)
255 if (tb->port == snum)
256 goto tb_found;
257 }
258 tb = NULL;
259 goto tb_not_found;
260tb_found:
261 if (!hlist_empty(&tb->owners)) {
262 if (sk->sk_reuse > 1)
263 goto success;
264 if (tb->fastreuse > 0 &&
265 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
266 goto success;
267 } else {
268 ret = 1;
269 if (tcp_bind_conflict(sk, tb))
270 goto fail_unlock;
271 }
272 }
273tb_not_found:
274 ret = 1;
275 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
276 goto fail_unlock;
277 if (hlist_empty(&tb->owners)) {
278 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
279 tb->fastreuse = 1;
280 else
281 tb->fastreuse = 0;
282 } else if (tb->fastreuse &&
283 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
284 tb->fastreuse = 0;
285success:
286 if (!tcp_sk(sk)->bind_hash)
287 tcp_bind_hash(sk, tb, snum);
288 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
289 ret = 0;
290
291fail_unlock:
292 spin_unlock(&head->lock);
293fail:
294 local_bh_enable();
295 return ret;
296}
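
From user space the snum == 0 branch above is reached by binding to port 0; a small sketch (illustrative only, not kernel code) showing how the kernel-chosen ephemeral port can be read back:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;
	socklen_t len = sizeof(sin);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	sin.sin_port = 0;	/* "pick any free local port for me" */

	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) == 0 &&
	    getsockname(fd, (struct sockaddr *)&sin, &len) == 0)
		printf("kernel chose port %u\n", ntohs(sin.sin_port));
	return 0;
}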
297
298/* Get rid of any references to a local port held by the
299 * given sock.
300 */
301static void __tcp_put_port(struct sock *sk)
302{
303 struct inet_sock *inet = inet_sk(sk);
304 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
305 struct tcp_bind_bucket *tb;
306
307 spin_lock(&head->lock);
308 tb = tcp_sk(sk)->bind_hash;
309 __sk_del_bind_node(sk);
310 tcp_sk(sk)->bind_hash = NULL;
311 inet->num = 0;
312 tcp_bucket_destroy(tb);
313 spin_unlock(&head->lock);
314}
315
316void tcp_put_port(struct sock *sk)
317{
318 local_bh_disable();
319 __tcp_put_port(sk);
320 local_bh_enable();
321}
322
323/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
324 * Look, when several writers sleep and reader wakes them up, all but one
325 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
326 * this, _but_ remember, it adds useless work on UP machines (wake up each
327 * exclusive lock release). It should be ifdefed really.
328 */
329
330void tcp_listen_wlock(void)
331{
332 write_lock(&tcp_lhash_lock);
333
334 if (atomic_read(&tcp_lhash_users)) {
335 DEFINE_WAIT(wait);
336
337 for (;;) {
338 prepare_to_wait_exclusive(&tcp_lhash_wait,
339 &wait, TASK_UNINTERRUPTIBLE);
340 if (!atomic_read(&tcp_lhash_users))
341 break;
342 write_unlock_bh(&tcp_lhash_lock);
343 schedule();
344 write_lock_bh(&tcp_lhash_lock);
345 }
346
347 finish_wait(&tcp_lhash_wait, &wait);
348 }
349}
350
351static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
352{
353 struct hlist_head *list;
354 rwlock_t *lock;
355
356 BUG_TRAP(sk_unhashed(sk));
357 if (listen_possible && sk->sk_state == TCP_LISTEN) {
358 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359 lock = &tcp_lhash_lock;
360 tcp_listen_wlock();
361 } else {
362 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
363 lock = &tcp_ehash[sk->sk_hashent].lock;
364 write_lock(lock);
365 }
366 __sk_add_node(sk, list);
367 sock_prot_inc_use(sk->sk_prot);
368 write_unlock(lock);
369 if (listen_possible && sk->sk_state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
371}
372
373static void tcp_v4_hash(struct sock *sk)
374{
375 if (sk->sk_state != TCP_CLOSE) {
376 local_bh_disable();
377 __tcp_v4_hash(sk, 1);
378 local_bh_enable();
379 }
380}
381
382void tcp_unhash(struct sock *sk)
383{
384 rwlock_t *lock;
385
386 if (sk_unhashed(sk))
387 goto ende;
388
389 if (sk->sk_state == TCP_LISTEN) {
390 local_bh_disable();
391 tcp_listen_wlock();
392 lock = &tcp_lhash_lock;
393 } else {
394 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
395 lock = &head->lock;
396 write_lock_bh(&head->lock);
397 }
398
399 if (__sk_del_node_init(sk))
400 sock_prot_dec_use(sk->sk_prot);
401 write_unlock_bh(lock);
402
403 ende:
404 if (sk->sk_state == TCP_LISTEN)
405 wake_up(&tcp_lhash_wait);
406}
407
408/* Don't inline this cruft. There are some nice properties to
409 * exploit here. The BSD API does not allow a listening TCP
410 * to specify the remote port nor the remote address for the
411 * connection. So always assume those are both wildcarded
412 * during the search since they can never be otherwise.
413 */
414static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
415 unsigned short hnum, int dif)
416{
417 struct sock *result = NULL, *sk;
418 struct hlist_node *node;
419 int score, hiscore;
420
421 hiscore=-1;
422 sk_for_each(sk, node, head) {
423 struct inet_sock *inet = inet_sk(sk);
424
425 if (inet->num == hnum && !ipv6_only_sock(sk)) {
426 __u32 rcv_saddr = inet->rcv_saddr;
427
428 score = (sk->sk_family == PF_INET ? 1 : 0);
429 if (rcv_saddr) {
430 if (rcv_saddr != daddr)
431 continue;
432 score+=2;
433 }
434 if (sk->sk_bound_dev_if) {
435 if (sk->sk_bound_dev_if != dif)
436 continue;
437 score+=2;
438 }
439 if (score == 5)
440 return sk;
441 if (score > hiscore) {
442 hiscore = score;
443 result = sk;
444 }
445 }
446 }
447 return result;
448}
449
450/* Optimize the common listener case. */
451static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
452 unsigned short hnum, int dif)
453{
454 struct sock *sk = NULL;
455 struct hlist_head *head;
456
457 read_lock(&tcp_lhash_lock);
458 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459 if (!hlist_empty(head)) {
460 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
461
462 if (inet->num == hnum && !sk->sk_node.next &&
463 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
464 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
465 !sk->sk_bound_dev_if)
466 goto sherry_cache;
467 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
468 }
469 if (sk) {
470sherry_cache:
471 sock_hold(sk);
472 }
473 read_unlock(&tcp_lhash_lock);
474 return sk;
475}
476
477/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
479 *
480 * Local BH must be disabled here.
481 */
482
483static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
484 u32 daddr, u16 hnum,
485 int dif)
486{
487 struct tcp_ehash_bucket *head;
488 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
489 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
490 struct sock *sk;
491 struct hlist_node *node;
492 /* Optimize here for direct hit, only listening connections can
493 * have wildcards anyways.
494 */
495 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
496 head = &tcp_ehash[hash];
497 read_lock(&head->lock);
498 sk_for_each(sk, node, &head->chain) {
499 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
500 goto hit; /* You sunk my battleship! */
501 }
502
503 /* Must check for a TIME_WAIT'er before going to listener hash. */
504 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
505 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
506 goto hit;
507 }
508 sk = NULL;
509out:
510 read_unlock(&head->lock);
511 return sk;
512hit:
513 sock_hold(sk);
514 goto out;
515}
516
517static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
518 u32 daddr, u16 hnum, int dif)
519{
520 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
521 daddr, hnum, dif);
522
523 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
524}
525
526inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
527 u16 dport, int dif)
528{
529 struct sock *sk;
530
531 local_bh_disable();
532 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
533 local_bh_enable();
534
535 return sk;
536}
537
538EXPORT_SYMBOL_GPL(tcp_v4_lookup);
539
540static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
541{
542 return secure_tcp_sequence_number(skb->nh.iph->daddr,
543 skb->nh.iph->saddr,
544 skb->h.th->dest,
545 skb->h.th->source);
546}
547
548/* called with local bh disabled */
549static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
550 struct tcp_tw_bucket **twp)
551{
552 struct inet_sock *inet = inet_sk(sk);
553 u32 daddr = inet->rcv_saddr;
554 u32 saddr = inet->daddr;
555 int dif = sk->sk_bound_dev_if;
556 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
557 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
558 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
559 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
560 struct sock *sk2;
561 struct hlist_node *node;
562 struct tcp_tw_bucket *tw;
563
564 write_lock(&head->lock);
565
566 /* Check TIME-WAIT sockets first. */
567 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
568 tw = (struct tcp_tw_bucket *)sk2;
569
570 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
571 struct tcp_sock *tp = tcp_sk(sk);
572
573 /* With PAWS, it is safe from the viewpoint
574 of data integrity. Even without PAWS it
575 is safe provided sequence spaces do not
576 overlap i.e. at data rates <= 80Mbit/sec.
577
578 Actually, the idea is close to VJ's one,
579 only the timestamp cache is held not per host
580 but per port pair, and the TW bucket is used
581 as the state holder.
582
583 If the TW bucket has already been destroyed we
584 fall back to VJ's scheme and use the initial
585 timestamp retrieved from the peer table.
586 */
587 if (tw->tw_ts_recent_stamp &&
588 (!twp || (sysctl_tcp_tw_reuse &&
589 xtime.tv_sec -
590 tw->tw_ts_recent_stamp > 1))) {
591 if ((tp->write_seq =
592 tw->tw_snd_nxt + 65535 + 2) == 0)
593 tp->write_seq = 1;
594 tp->rx_opt.ts_recent = tw->tw_ts_recent;
595 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
596 sock_hold(sk2);
597 goto unique;
598 } else
599 goto not_unique;
600 }
601 }
602 tw = NULL;
603
604 /* And established part... */
605 sk_for_each(sk2, node, &head->chain) {
606 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
607 goto not_unique;
608 }
609
610unique:
611 /* Must record num and sport now. Otherwise we will see
612 * a socket in the hash table with a funny identity. */
613 inet->num = lport;
614 inet->sport = htons(lport);
615 sk->sk_hashent = hash;
616 BUG_TRAP(sk_unhashed(sk));
617 __sk_add_node(sk, &head->chain);
618 sock_prot_inc_use(sk->sk_prot);
619 write_unlock(&head->lock);
620
621 if (twp) {
622 *twp = tw;
623 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
624 } else if (tw) {
625 /* Silly. Should hash-dance instead... */
626 tcp_tw_deschedule(tw);
627 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
628
629 tcp_tw_put(tw);
630 }
631
632 return 0;
633
634not_unique:
635 write_unlock(&head->lock);
636 return -EADDRNOTAVAIL;
637}
638
639static inline u32 connect_port_offset(const struct sock *sk)
640{
641 const struct inet_sock *inet = inet_sk(sk);
642
643 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
644 inet->dport);
645}
646
647/*
648 * Bind a port for a connect operation and hash it.
649 */
650static inline int tcp_v4_hash_connect(struct sock *sk)
651{
652 unsigned short snum = inet_sk(sk)->num;
653 struct tcp_bind_hashbucket *head;
654 struct tcp_bind_bucket *tb;
655 int ret;
656
657 if (!snum) {
658 int low = sysctl_local_port_range[0];
659 int high = sysctl_local_port_range[1];
660 int range = high - low;
661 int i;
662 int port;
663 static u32 hint;
664 u32 offset = hint + connect_port_offset(sk);
665 struct hlist_node *node;
666 struct tcp_tw_bucket *tw = NULL;
667
668 local_bh_disable();
669 for (i = 1; i <= range; i++) {
670 port = low + (i + offset) % range;
671 head = &tcp_bhash[tcp_bhashfn(port)];
672 spin_lock(&head->lock);
673
674 /* Does not bother with rcv_saddr checks,
675 * because the established check is already
676 * unique enough.
677 */
678 tb_for_each(tb, node, &head->chain) {
679 if (tb->port == port) {
680 BUG_TRAP(!hlist_empty(&tb->owners));
681 if (tb->fastreuse >= 0)
682 goto next_port;
683 if (!__tcp_v4_check_established(sk,
684 port,
685 &tw))
686 goto ok;
687 goto next_port;
688 }
689 }
690
691 tb = tcp_bucket_create(head, port);
692 if (!tb) {
693 spin_unlock(&head->lock);
694 break;
695 }
696 tb->fastreuse = -1;
697 goto ok;
698
699 next_port:
700 spin_unlock(&head->lock);
701 }
702 local_bh_enable();
703
704 return -EADDRNOTAVAIL;
705
706ok:
707 hint += i;
708
709 /* Head lock still held and bh's disabled */
710 tcp_bind_hash(sk, tb, port);
711 if (sk_unhashed(sk)) {
712 inet_sk(sk)->sport = htons(port);
713 __tcp_v4_hash(sk, 0);
714 }
715 spin_unlock(&head->lock);
716
717 if (tw) {
718 tcp_tw_deschedule(tw);
719 tcp_tw_put(tw);
720 }
721
722 ret = 0;
723 goto out;
724 }
725
726 head = &tcp_bhash[tcp_bhashfn(snum)];
727 tb = tcp_sk(sk)->bind_hash;
728 spin_lock_bh(&head->lock);
729 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
730 __tcp_v4_hash(sk, 0);
731 spin_unlock_bh(&head->lock);
732 return 0;
733 } else {
734 spin_unlock(&head->lock);
735 /* No definite answer... Walk to established hash table */
736 ret = __tcp_v4_check_established(sk, snum, NULL);
737out:
738 local_bh_enable();
739 return ret;
740 }
741}
742
743/* This will initiate an outgoing connection. */
744int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
745{
746 struct inet_sock *inet = inet_sk(sk);
747 struct tcp_sock *tp = tcp_sk(sk);
748 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
749 struct rtable *rt;
750 u32 daddr, nexthop;
751 int tmp;
752 int err;
753
754 if (addr_len < sizeof(struct sockaddr_in))
755 return -EINVAL;
756
757 if (usin->sin_family != AF_INET)
758 return -EAFNOSUPPORT;
759
760 nexthop = daddr = usin->sin_addr.s_addr;
761 if (inet->opt && inet->opt->srr) {
762 if (!daddr)
763 return -EINVAL;
764 nexthop = inet->opt->faddr;
765 }
766
767 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
768 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
769 IPPROTO_TCP,
770 inet->sport, usin->sin_port, sk);
771 if (tmp < 0)
772 return tmp;
773
774 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
775 ip_rt_put(rt);
776 return -ENETUNREACH;
777 }
778
779 if (!inet->opt || !inet->opt->srr)
780 daddr = rt->rt_dst;
781
782 if (!inet->saddr)
783 inet->saddr = rt->rt_src;
784 inet->rcv_saddr = inet->saddr;
785
786 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
787 /* Reset inherited state */
788 tp->rx_opt.ts_recent = 0;
789 tp->rx_opt.ts_recent_stamp = 0;
790 tp->write_seq = 0;
791 }
792
793 if (sysctl_tcp_tw_recycle &&
794 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
795 struct inet_peer *peer = rt_get_peer(rt);
796
797 /* VJ's idea. We save the last timestamp seen from
798 * the destination in the peer table when entering TIME-WAIT state,
799 * and initialize rx_opt.ts_recent from it when trying a new connection.
800 */
801
802 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
803 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
804 tp->rx_opt.ts_recent = peer->tcp_ts;
805 }
806 }
807
808 inet->dport = usin->sin_port;
809 inet->daddr = daddr;
810
811 tp->ext_header_len = 0;
812 if (inet->opt)
813 tp->ext_header_len = inet->opt->optlen;
814
815 tp->rx_opt.mss_clamp = 536;
816
817 /* Socket identity is still unknown (sport may be zero).
818 * However we set the state to SYN-SENT and, without releasing the
819 * socket lock, select a source port, enter ourselves into the hash
820 * tables and complete the initialization afterwards.
821 */
822 tcp_set_state(sk, TCP_SYN_SENT);
823 err = tcp_v4_hash_connect(sk);
824 if (err)
825 goto failure;
826
827 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
828 if (err)
829 goto failure;
830
831 /* OK, now commit destination to socket. */
832 __sk_dst_set(sk, &rt->u.dst);
833 tcp_v4_setup_caps(sk, &rt->u.dst);
834
835 if (!tp->write_seq)
836 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
837 inet->daddr,
838 inet->sport,
839 usin->sin_port);
840
841 inet->id = tp->write_seq ^ jiffies;
842
843 err = tcp_connect(sk);
844 rt = NULL;
845 if (err)
846 goto failure;
847
848 return 0;
849
850failure:
851 /* This unhashes the socket and releases the local port, if necessary. */
852 tcp_set_state(sk, TCP_CLOSE);
853 ip_rt_put(rt);
854 sk->sk_route_caps = 0;
855 inet->dport = 0;
856 return err;
857}
858
859static __inline__ int tcp_v4_iif(struct sk_buff *skb)
860{
861 return ((struct rtable *)skb->dst)->rt_iif;
862}
863
864static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
865{
866 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
867}
868
869static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
870 struct open_request ***prevp,
871 __u16 rport,
872 __u32 raddr, __u32 laddr)
873{
874 struct tcp_listen_opt *lopt = tp->listen_opt;
875 struct open_request *req, **prev;
876
877 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
878 (req = *prev) != NULL;
879 prev = &req->dl_next) {
880 if (req->rmt_port == rport &&
881 req->af.v4_req.rmt_addr == raddr &&
882 req->af.v4_req.loc_addr == laddr &&
883 TCP_INET_FAMILY(req->class->family)) {
884 BUG_TRAP(!req->sk);
885 *prevp = prev;
886 break;
887 }
888 }
889
890 return req;
891}
892
893static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
894{
895 struct tcp_sock *tp = tcp_sk(sk);
896 struct tcp_listen_opt *lopt = tp->listen_opt;
897 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
898
899 req->expires = jiffies + TCP_TIMEOUT_INIT;
900 req->retrans = 0;
901 req->sk = NULL;
902 req->dl_next = lopt->syn_table[h];
903
904 write_lock(&tp->syn_wait_lock);
905 lopt->syn_table[h] = req;
906 write_unlock(&tp->syn_wait_lock);
907
908 tcp_synq_added(sk);
909}
910
911
912/*
913 * This routine does path mtu discovery as defined in RFC1191.
914 */
915static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
916 u32 mtu)
917{
918 struct dst_entry *dst;
919 struct inet_sock *inet = inet_sk(sk);
920 struct tcp_sock *tp = tcp_sk(sk);
921
922 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
923 * sent out by Linux are always < 576 bytes, so they should go through
924 * unfragmented).
925 */
926 if (sk->sk_state == TCP_LISTEN)
927 return;
928
929 /* We don't check in the dst entry if pmtu discovery is forbidden
930 * on this route. We just assume that no packet-too-big packets
931 * are sent back when pmtu discovery is not active.
932 * There is a small race when the user changes this flag in the
933 * route, but I think that's acceptable.
934 */
935 if ((dst = __sk_dst_check(sk, 0)) == NULL)
936 return;
937
938 dst->ops->update_pmtu(dst, mtu);
939
940 /* Something is about to go wrong... Remember the soft error
941 * in case this connection will not be able to recover.
942 */
943 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
944 sk->sk_err_soft = EMSGSIZE;
945
946 mtu = dst_mtu(dst);
947
948 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
949 tp->pmtu_cookie > mtu) {
950 tcp_sync_mss(sk, mtu);
951
952 /* Resend the TCP packet because it's
953 * clear that the old packet has been
954 * dropped. This is the new "fast" path mtu
955 * discovery.
956 */
957 tcp_simple_retransmit(sk);
958 } /* else let the usual retransmit timer handle it */
959}
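
The pmtudisc mode tested above is controlled per socket from user space; the sketch below (illustrative only, not kernel code; the 127.0.0.1:80 endpoint is just an assumption) sets IP_PMTUDISC_DO and reads back the path MTU the stack has learned via the IP_MTU socket option.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int val = IP_PMTUDISC_DO;	/* always set DF, never fragment locally */
	int mtu;
	socklen_t len = sizeof(mtu);
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);	/* assumed reachable endpoint */
	inet_pton(AF_INET, "127.0.0.1", &sin.sin_addr);

	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
	if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) == 0 &&
	    getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("path MTU: %d\n", mtu);
	return 0;
}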
960
961/*
962 * This routine is called by the ICMP module when it gets some
963 * sort of error condition. If err < 0 then the socket should
964 * be closed and the error returned to the user. If err > 0
965 * it's just the icmp type << 8 | icmp code. After adjustment
966 * header points to the first 8 bytes of the tcp header. We need
967 * to find the appropriate port.
968 *
969 * The locking strategy used here is very "optimistic". When
970 * someone else accesses the socket the ICMP is just dropped
971 * and for some paths there is no check at all.
972 * A more general error queue to queue errors for later handling
973 * is probably better.
974 *
975 */
976
977void tcp_v4_err(struct sk_buff *skb, u32 info)
978{
979 struct iphdr *iph = (struct iphdr *)skb->data;
980 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
981 struct tcp_sock *tp;
982 struct inet_sock *inet;
983 int type = skb->h.icmph->type;
984 int code = skb->h.icmph->code;
985 struct sock *sk;
986 __u32 seq;
987 int err;
988
989 if (skb->len < (iph->ihl << 2) + 8) {
990 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
991 return;
992 }
993
994 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
995 th->source, tcp_v4_iif(skb));
996 if (!sk) {
997 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
998 return;
999 }
1000 if (sk->sk_state == TCP_TIME_WAIT) {
1001 tcp_tw_put((struct tcp_tw_bucket *)sk);
1002 return;
1003 }
1004
1005 bh_lock_sock(sk);
1006 /* If too many ICMPs get dropped on busy
1007 * servers this needs to be solved differently.
1008 */
1009 if (sock_owned_by_user(sk))
1010 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1011
1012 if (sk->sk_state == TCP_CLOSE)
1013 goto out;
1014
1015 tp = tcp_sk(sk);
1016 seq = ntohl(th->seq);
1017 if (sk->sk_state != TCP_LISTEN &&
1018 !between(seq, tp->snd_una, tp->snd_nxt)) {
1019 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1020 goto out;
1021 }
1022
1023 switch (type) {
1024 case ICMP_SOURCE_QUENCH:
1025 /* Just silently ignore these. */
1026 goto out;
1027 case ICMP_PARAMETERPROB:
1028 err = EPROTO;
1029 break;
1030 case ICMP_DEST_UNREACH:
1031 if (code > NR_ICMP_UNREACH)
1032 goto out;
1033
1034 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1035 if (!sock_owned_by_user(sk))
1036 do_pmtu_discovery(sk, iph, info);
1037 goto out;
1038 }
1039
1040 err = icmp_err_convert[code].errno;
1041 break;
1042 case ICMP_TIME_EXCEEDED:
1043 err = EHOSTUNREACH;
1044 break;
1045 default:
1046 goto out;
1047 }
1048
1049 switch (sk->sk_state) {
1050 struct open_request *req, **prev;
1051 case TCP_LISTEN:
1052 if (sock_owned_by_user(sk))
1053 goto out;
1054
1055 req = tcp_v4_search_req(tp, &prev, th->dest,
1056 iph->daddr, iph->saddr);
1057 if (!req)
1058 goto out;
1059
1060 /* ICMPs are not backlogged, hence we cannot get
1061 an established socket here.
1062 */
1063 BUG_TRAP(!req->sk);
1064
1065 if (seq != req->snt_isn) {
1066 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1067 goto out;
1068 }
1069
1070 /*
1071 * Still in SYN_RECV, just remove it silently.
1072 * There is no good way to pass the error to the newly
1073 * created socket, and POSIX does not want network
1074 * errors returned from accept().
1075 */
1076 tcp_synq_drop(sk, req, prev);
1077 goto out;
1078
1079 case TCP_SYN_SENT:
1080 case TCP_SYN_RECV: /* Cannot happen.
1081 It can, for example, if SYNs crossed.
1082 */
1083 if (!sock_owned_by_user(sk)) {
1084 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1085 sk->sk_err = err;
1086
1087 sk->sk_error_report(sk);
1088
1089 tcp_done(sk);
1090 } else {
1091 sk->sk_err_soft = err;
1092 }
1093 goto out;
1094 }
1095
1096 /* If we've already connected we will keep trying
1097 * until we time out, or the user gives up.
1098 *
1099 * RFC 1122 4.2.3.9 allows us to consider as hard errors
1100 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1101 * but it is obsoleted by pmtu discovery).
1102 *
1103 * Note that in the modern internet, where routing is unreliable
1104 * and broken firewalls sit in every dark corner sending random
1105 * errors ordered by their masters, even these two messages finally lose
1106 * their original sense (even Linux sends invalid PORT_UNREACHs).
1107 *
1108 * Now we are in compliance with RFCs.
1109 * --ANK (980905)
1110 */
1111
1112 inet = inet_sk(sk);
1113 if (!sock_owned_by_user(sk) && inet->recverr) {
1114 sk->sk_err = err;
1115 sk->sk_error_report(sk);
1116 } else { /* Only an error on timeout */
1117 sk->sk_err_soft = err;
1118 }
1119
1120out:
1121 bh_unlock_sock(sk);
1122 sock_put(sk);
1123}
1124
1125/* This routine computes an IPv4 TCP checksum. */
1126void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1127 struct sk_buff *skb)
1128{
1129 struct inet_sock *inet = inet_sk(sk);
1130
1131 if (skb->ip_summed == CHECKSUM_HW) {
1132 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1133 skb->csum = offsetof(struct tcphdr, check);
1134 } else {
1135 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1136 csum_partial((char *)th,
1137 th->doff << 2,
1138 skb->csum));
1139 }
1140}
1141
1142/*
1143 * This routine will send an RST to the other tcp.
1144 *
1145 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1146 * for the reset?
1147 * Answer: if a packet caused the RST, it is not for a socket
1148 * existing in our system; if it is matched to a socket,
1149 * it is just a duplicate segment or a bug in the other side's TCP.
1150 * So we build the reply based only on the parameters that
1151 * arrived with the segment.
1152 * Exception: precedence violation. We do not implement it in any case.
1153 */
1154
1155static void tcp_v4_send_reset(struct sk_buff *skb)
1156{
1157 struct tcphdr *th = skb->h.th;
1158 struct tcphdr rth;
1159 struct ip_reply_arg arg;
1160
1161 /* Never send a reset in response to a reset. */
1162 if (th->rst)
1163 return;
1164
1165 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1166 return;
1167
1168 /* Swap the send and the receive. */
1169 memset(&rth, 0, sizeof(struct tcphdr));
1170 rth.dest = th->source;
1171 rth.source = th->dest;
1172 rth.doff = sizeof(struct tcphdr) / 4;
1173 rth.rst = 1;
1174
1175 if (th->ack) {
1176 rth.seq = th->ack_seq;
1177 } else {
1178 rth.ack = 1;
1179 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1180 skb->len - (th->doff << 2));
1181 }
1182
1183 memset(&arg, 0, sizeof arg);
1184 arg.iov[0].iov_base = (unsigned char *)&rth;
1185 arg.iov[0].iov_len = sizeof rth;
1186 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1187 skb->nh.iph->saddr, /*XXX*/
1188 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1189 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1190
1191 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1192
1193 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1194 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1195}
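
A tiny worked example (illustrative, with made-up numbers; not kernel code) of the ack_seq arithmetic used above when the offending segment carried no ACK: SYN and FIN each occupy one sequence unit, so a bare SYN with sequence 1000 and no payload must be answered with ack_seq 1001.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t seq = 1000;	/* sequence number of the offending segment */
	int syn = 1, fin = 0;	/* each flag consumes one sequence unit */
	uint32_t payload = 0;	/* skb->len - (th->doff << 2) */

	printf("RST ack_seq = %u\n", seq + syn + fin + payload);	/* 1001 */
	return 0;
}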
1196
1197/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1198 outside of socket context, is ugly, certainly. What can I do?
1199 */
1200
1201static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1202 u32 win, u32 ts)
1203{
1204 struct tcphdr *th = skb->h.th;
1205 struct {
1206 struct tcphdr th;
1207 u32 tsopt[3];
1208 } rep;
1209 struct ip_reply_arg arg;
1210
1211 memset(&rep.th, 0, sizeof(struct tcphdr));
1212 memset(&arg, 0, sizeof arg);
1213
1214 arg.iov[0].iov_base = (unsigned char *)&rep;
1215 arg.iov[0].iov_len = sizeof(rep.th);
1216 if (ts) {
1217 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1218 (TCPOPT_TIMESTAMP << 8) |
1219 TCPOLEN_TIMESTAMP);
1220 rep.tsopt[1] = htonl(tcp_time_stamp);
1221 rep.tsopt[2] = htonl(ts);
1222 arg.iov[0].iov_len = sizeof(rep);
1223 }
1224
1225 /* Swap the send and the receive. */
1226 rep.th.dest = th->source;
1227 rep.th.source = th->dest;
1228 rep.th.doff = arg.iov[0].iov_len / 4;
1229 rep.th.seq = htonl(seq);
1230 rep.th.ack_seq = htonl(ack);
1231 rep.th.ack = 1;
1232 rep.th.window = htons(win);
1233
1234 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1235 skb->nh.iph->saddr, /*XXX*/
1236 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1237 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1238
1239 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1240
1241 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1242}
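
For reference, a small sketch (illustrative only, not kernel code) of the 12-byte timestamp block assembled in rep.tsopt[] above: with the standard values TCPOPT_NOP = 1, TCPOPT_TIMESTAMP = 8 and TCPOLEN_TIMESTAMP = 10, the first word is 0x0101080a (two NOP pads, option kind, option length), followed by the two 32-bit timestamp values.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Same packing as rep.tsopt[0], before htonl(). */
	uint32_t word = (1u << 24) | (1u << 16) | (8u << 8) | 10u;

	printf("first timestamp option word = 0x%08x\n", word);	/* 0x0101080a */
	return 0;
}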
1243
1244static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1245{
1246 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1247
1248 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1249 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1250
1251 tcp_tw_put(tw);
1252}
1253
1254static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1255{
1256 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1257 req->ts_recent);
1258}
1259
1260static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1261 struct open_request *req)
1262{
1263 struct rtable *rt;
1264 struct ip_options *opt = req->af.v4_req.opt;
1265 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1266 .nl_u = { .ip4_u =
1267 { .daddr = ((opt && opt->srr) ?
1268 opt->faddr :
1269 req->af.v4_req.rmt_addr),
1270 .saddr = req->af.v4_req.loc_addr,
1271 .tos = RT_CONN_FLAGS(sk) } },
1272 .proto = IPPROTO_TCP,
1273 .uli_u = { .ports =
1274 { .sport = inet_sk(sk)->sport,
1275 .dport = req->rmt_port } } };
1276
1277 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1278 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1279 return NULL;
1280 }
1281 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1282 ip_rt_put(rt);
1283 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1284 return NULL;
1285 }
1286 return &rt->u.dst;
1287}
1288
1289/*
1290 * Send a SYN-ACK after having received an ACK.
1292 * This still operates on an open_request only, not on a big
1292 * socket.
1293 */
1294static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1295 struct dst_entry *dst)
1296{
1297 int err = -1;
1298 struct sk_buff * skb;
1299
1300 /* First, grab a route. */
1301 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1302 goto out;
1303
1304 skb = tcp_make_synack(sk, dst, req);
1305
1306 if (skb) {
1307 struct tcphdr *th = skb->h.th;
1308
1309 th->check = tcp_v4_check(th, skb->len,
1310 req->af.v4_req.loc_addr,
1311 req->af.v4_req.rmt_addr,
1312 csum_partial((char *)th, skb->len,
1313 skb->csum));
1314
1315 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1316 req->af.v4_req.rmt_addr,
1317 req->af.v4_req.opt);
1318 if (err == NET_XMIT_CN)
1319 err = 0;
1320 }
1321
1322out:
1323 dst_release(dst);
1324 return err;
1325}
1326
1327/*
1328 * IPv4 open_request destructor.
1329 */
1330static void tcp_v4_or_free(struct open_request *req)
1331{
1332 if (req->af.v4_req.opt)
1333 kfree(req->af.v4_req.opt);
1334}
1335
1336static inline void syn_flood_warning(struct sk_buff *skb)
1337{
1338 static unsigned long warntime;
1339
1340 if (time_after(jiffies, (warntime + HZ * 60))) {
1341 warntime = jiffies;
1342 printk(KERN_INFO
1343 "possible SYN flooding on port %d. Sending cookies.\n",
1344 ntohs(skb->h.th->dest));
1345 }
1346}
1347
1348/*
1349 * Save and compile IPv4 options into the open_request if needed.
1350 */
1351static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1352 struct sk_buff *skb)
1353{
1354 struct ip_options *opt = &(IPCB(skb)->opt);
1355 struct ip_options *dopt = NULL;
1356
1357 if (opt && opt->optlen) {
1358 int opt_size = optlength(opt);
1359 dopt = kmalloc(opt_size, GFP_ATOMIC);
1360 if (dopt) {
1361 if (ip_options_echo(dopt, skb)) {
1362 kfree(dopt);
1363 dopt = NULL;
1364 }
1365 }
1366 }
1367 return dopt;
1368}
1369
1370/*
1371 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1372 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1373 * It would be better to replace it with a global counter for all sockets
1374 * but then some measure against one socket starving all other sockets
1375 * would be needed.
1376 *
1377 * It was 128 by default. Experiments with real servers show that
1378 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1379 * of the problems. This value is adjusted to 128 for very small machines
1380 * (<= 32MB of memory) and to 1024 on normal or better ones (>= 256MB).
1381 * Increasing it further requires changing the hash table size.
1382 */
1383int sysctl_max_syn_backlog = 256;
1384
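The limit discussed above is exported as the tcp_max_syn_backlog sysctl; a short sketch (illustrative only, not kernel code) of reading the current value from procfs on a running system:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_max_syn_backlog", "r");
	int backlog;

	if (f && fscanf(f, "%d", &backlog) == 1)
		printf("tcp_max_syn_backlog = %d\n", backlog);
	if (f)
		fclose(f);
	return 0;
}
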
1385struct or_calltable or_ipv4 = {
1386 .family = PF_INET,
1387 .rtx_syn_ack = tcp_v4_send_synack,
1388 .send_ack = tcp_v4_or_send_ack,
1389 .destructor = tcp_v4_or_free,
1390 .send_reset = tcp_v4_send_reset,
1391};
1392
1393int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1394{
1395 struct tcp_options_received tmp_opt;
1396 struct open_request *req;
1397 __u32 saddr = skb->nh.iph->saddr;
1398 __u32 daddr = skb->nh.iph->daddr;
1399 __u32 isn = TCP_SKB_CB(skb)->when;
1400 struct dst_entry *dst = NULL;
1401#ifdef CONFIG_SYN_COOKIES
1402 int want_cookie = 0;
1403#else
1404#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1405#endif
1406
1407 /* Never answer to SYNs sent to broadcast or multicast */
1408 if (((struct rtable *)skb->dst)->rt_flags &
1409 (RTCF_BROADCAST | RTCF_MULTICAST))
1410 goto drop;
1411
1412 /* TW buckets are converted to open requests without
1413 * limitation; they conserve resources and the peer is
1414 * evidently a real one.
1415 */
1416 if (tcp_synq_is_full(sk) && !isn) {
1417#ifdef CONFIG_SYN_COOKIES
1418 if (sysctl_tcp_syncookies) {
1419 want_cookie = 1;
1420 } else
1421#endif
1422 goto drop;
1423 }
1424
1425 /* Accept backlog is full. If we have already queued enough
1426 * warm entries in the syn queue, drop the request. It is better than
1427 * clogging the syn queue with openreqs with exponentially increasing
1428 * timeouts.
1429 */
1430 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1431 goto drop;
1432
1433 req = tcp_openreq_alloc();
1434 if (!req)
1435 goto drop;
1436
1437 tcp_clear_options(&tmp_opt);
1438 tmp_opt.mss_clamp = 536;
1439 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1440
1441 tcp_parse_options(skb, &tmp_opt, 0);
1442
1443 if (want_cookie) {
1444 tcp_clear_options(&tmp_opt);
1445 tmp_opt.saw_tstamp = 0;
1446 }
1447
1448 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1449 /* Some OSes (unknown ones, but I see them on a web server, which
1450 * contains information interesting only for Windows
1451 * users) do not send their timestamp in the SYN. It is an easy case.
1452 * We simply do not advertise TS support.
1453 */
1454 tmp_opt.saw_tstamp = 0;
1455 tmp_opt.tstamp_ok = 0;
1456 }
1457 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1458
1459 tcp_openreq_init(req, &tmp_opt, skb);
1460
1461 req->af.v4_req.loc_addr = daddr;
1462 req->af.v4_req.rmt_addr = saddr;
1463 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1464 req->class = &or_ipv4;
1465 if (!want_cookie)
1466 TCP_ECN_create_request(req, skb->h.th);
1467
1468 if (want_cookie) {
1469#ifdef CONFIG_SYN_COOKIES
1470 syn_flood_warning(skb);
1471#endif
1472 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1473 } else if (!isn) {
1474 struct inet_peer *peer = NULL;
1475
1476 /* VJ's idea. We save the last timestamp seen
1477 * from the destination in the peer table when entering
1478 * TIME-WAIT state, and check against it before
1479 * accepting a new connection request.
1480 *
1481 * If "isn" is not zero, this request hit an alive
1482 * timewait bucket, so all the necessary checks
1483 * are made in the function processing the timewait state.
1484 */
1485 if (tmp_opt.saw_tstamp &&
1486 sysctl_tcp_tw_recycle &&
1487 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1488 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1489 peer->v4daddr == saddr) {
1490 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1491 (s32)(peer->tcp_ts - req->ts_recent) >
1492 TCP_PAWS_WINDOW) {
1493 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1494 dst_release(dst);
1495 goto drop_and_free;
1496 }
1497 }
1498 /* Kill the following clause, if you dislike this way. */
1499 else if (!sysctl_tcp_syncookies &&
1500 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1501 (sysctl_max_syn_backlog >> 2)) &&
1502 (!peer || !peer->tcp_ts_stamp) &&
1503 (!dst || !dst_metric(dst, RTAX_RTT))) {
1504 /* Without syncookies the last quarter of the
1505 * backlog is filled with destinations
1506 * proven to be alive.
1507 * It means that we continue to communicate
1508 * with destinations already remembered
1509 * at the moment of the synflood.
1510 */
1511 NETDEBUG(if (net_ratelimit()) \
1512 printk(KERN_DEBUG "TCP: drop open "
1513 "request from %u.%u."
1514 "%u.%u/%u\n", \
1515 NIPQUAD(saddr),
1516 ntohs(skb->h.th->source)));
1517 dst_release(dst);
1518 goto drop_and_free;
1519 }
1520
1521 isn = tcp_v4_init_sequence(sk, skb);
1522 }
1523 req->snt_isn = isn;
1524
1525 if (tcp_v4_send_synack(sk, req, dst))
1526 goto drop_and_free;
1527
1528 if (want_cookie) {
1529 tcp_openreq_free(req);
1530 } else {
1531 tcp_v4_synq_add(sk, req);
1532 }
1533 return 0;
1534
1535drop_and_free:
1536 tcp_openreq_free(req);
1537drop:
1538 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1539 return 0;
1540}
1541
1542
1543/*
1544 * The three way handshake has completed - we got a valid synack -
1545 * now create the new socket.
1546 */
1547struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1548 struct open_request *req,
1549 struct dst_entry *dst)
1550{
1551 struct inet_sock *newinet;
1552 struct tcp_sock *newtp;
1553 struct sock *newsk;
1554
1555 if (sk_acceptq_is_full(sk))
1556 goto exit_overflow;
1557
1558 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1559 goto exit;
1560
1561 newsk = tcp_create_openreq_child(sk, req, skb);
1562 if (!newsk)
1563 goto exit;
1564
1565 newsk->sk_dst_cache = dst;
1566 tcp_v4_setup_caps(newsk, dst);
1567
1568 newtp = tcp_sk(newsk);
1569 newinet = inet_sk(newsk);
1570 newinet->daddr = req->af.v4_req.rmt_addr;
1571 newinet->rcv_saddr = req->af.v4_req.loc_addr;
1572 newinet->saddr = req->af.v4_req.loc_addr;
1573 newinet->opt = req->af.v4_req.opt;
1574 req->af.v4_req.opt = NULL;
1575 newinet->mc_index = tcp_v4_iif(skb);
1576 newinet->mc_ttl = skb->nh.iph->ttl;
1577 newtp->ext_header_len = 0;
1578 if (newinet->opt)
1579 newtp->ext_header_len = newinet->opt->optlen;
1580 newinet->id = newtp->write_seq ^ jiffies;
1581
1582 tcp_sync_mss(newsk, dst_mtu(dst));
1583 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1584 tcp_initialize_rcv_mss(newsk);
1585
1586 __tcp_v4_hash(newsk, 0);
1587 __tcp_inherit_port(sk, newsk);
1588
1589 return newsk;
1590
1591exit_overflow:
1592 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1593exit:
1594 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1595 dst_release(dst);
1596 return NULL;
1597}
1598
1599static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1600{
1601 struct tcphdr *th = skb->h.th;
1602 struct iphdr *iph = skb->nh.iph;
1603 struct tcp_sock *tp = tcp_sk(sk);
1604 struct sock *nsk;
1605 struct open_request **prev;
1606 /* Find possible connection requests. */
1607 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1608 iph->saddr, iph->daddr);
1609 if (req)
1610 return tcp_check_req(sk, skb, req, prev);
1611
1612 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1613 th->source,
1614 skb->nh.iph->daddr,
1615 ntohs(th->dest),
1616 tcp_v4_iif(skb));
1617
1618 if (nsk) {
1619 if (nsk->sk_state != TCP_TIME_WAIT) {
1620 bh_lock_sock(nsk);
1621 return nsk;
1622 }
1623 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1624 return NULL;
1625 }
1626
1627#ifdef CONFIG_SYN_COOKIES
1628 if (!th->rst && !th->syn && th->ack)
1629 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1630#endif
1631 return sk;
1632}
1633
1634static int tcp_v4_checksum_init(struct sk_buff *skb)
1635{
1636 if (skb->ip_summed == CHECKSUM_HW) {
1637 skb->ip_summed = CHECKSUM_UNNECESSARY;
1638 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1639 skb->nh.iph->daddr, skb->csum))
1640 return 0;
1641
1642 NETDEBUG(if (net_ratelimit())
1643 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1644 skb->ip_summed = CHECKSUM_NONE;
1645 }
1646 if (skb->len <= 76) {
1647 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1648 skb->nh.iph->daddr,
1649 skb_checksum(skb, 0, skb->len, 0)))
1650 return -1;
1651 skb->ip_summed = CHECKSUM_UNNECESSARY;
1652 } else {
1653 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1654 skb->nh.iph->saddr,
1655 skb->nh.iph->daddr, 0);
1656 }
1657 return 0;
1658}
1659
1660
1661/* The socket must have its spinlock held when we get
1662 * here.
1663 *
1664 * We have a potential double-lock case here, so even when
1665 * doing backlog processing we use the BH locking scheme.
1666 * This is because we cannot sleep with the original spinlock
1667 * held.
1668 */
1669int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1670{
1671 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1672 TCP_CHECK_TIMER(sk);
1673 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1674 goto reset;
1675 TCP_CHECK_TIMER(sk);
1676 return 0;
1677 }
1678
1679 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1680 goto csum_err;
1681
1682 if (sk->sk_state == TCP_LISTEN) {
1683 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1684 if (!nsk)
1685 goto discard;
1686
1687 if (nsk != sk) {
1688 if (tcp_child_process(sk, nsk, skb))
1689 goto reset;
1690 return 0;
1691 }
1692 }
1693
1694 TCP_CHECK_TIMER(sk);
1695 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1696 goto reset;
1697 TCP_CHECK_TIMER(sk);
1698 return 0;
1699
1700reset:
1701 tcp_v4_send_reset(skb);
1702discard:
1703 kfree_skb(skb);
1704 /* Be careful here. If this function gets more complicated and
1705 * gcc suffers from register pressure on the x86, sk (in %ebx)
1706 * might be destroyed here. This current version compiles correctly,
1707 * but you have been warned.
1708 */
1709 return 0;
1710
1711csum_err:
1712 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1713 goto discard;
1714}
1715
1716/*
1717 * From tcp_input.c
1718 */
1719
1720int tcp_v4_rcv(struct sk_buff *skb)
1721{
1722 struct tcphdr *th;
1723 struct sock *sk;
1724 int ret;
1725
1726 if (skb->pkt_type != PACKET_HOST)
1727 goto discard_it;
1728
1729 /* Count it even if it's bad */
1730 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1731
1732 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1733 goto discard_it;
1734
1735 th = skb->h.th;
1736
1737 if (th->doff < sizeof(struct tcphdr) / 4)
1738 goto bad_packet;
1739 if (!pskb_may_pull(skb, th->doff * 4))
1740 goto discard_it;
1741
1742 /* An explanation is required here, I think.
1743 * Packet length and doff are validated by header prediction,
1744 * provided the case of th->doff == 0 is eliminated.
1745 * So, we defer the checks. */
1746 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1747 tcp_v4_checksum_init(skb) < 0))
1748 goto bad_packet;
1749
1750 th = skb->h.th;
1751 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1752 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1753 skb->len - th->doff * 4);
1754 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1755 TCP_SKB_CB(skb)->when = 0;
1756 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1757 TCP_SKB_CB(skb)->sacked = 0;
1758
1759 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1760 skb->nh.iph->daddr, ntohs(th->dest),
1761 tcp_v4_iif(skb));
1762
1763 if (!sk)
1764 goto no_tcp_socket;
1765
1766process:
1767 if (sk->sk_state == TCP_TIME_WAIT)
1768 goto do_time_wait;
1769
1770 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1771 goto discard_and_relse;
1772
1773 if (sk_filter(sk, skb, 0))
1774 goto discard_and_relse;
1775
1776 skb->dev = NULL;
1777
1778 bh_lock_sock(sk);
1779 ret = 0;
1780 if (!sock_owned_by_user(sk)) {
1781 if (!tcp_prequeue(sk, skb))
1782 ret = tcp_v4_do_rcv(sk, skb);
1783 } else
1784 sk_add_backlog(sk, skb);
1785 bh_unlock_sock(sk);
1786
1787 sock_put(sk);
1788
1789 return ret;
1790
1791no_tcp_socket:
1792 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1793 goto discard_it;
1794
1795 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1796bad_packet:
1797 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1798 } else {
1799 tcp_v4_send_reset(skb);
1800 }
1801
1802discard_it:
1803 /* Discard frame. */
1804 kfree_skb(skb);
1805 return 0;
1806
1807discard_and_relse:
1808 sock_put(sk);
1809 goto discard_it;
1810
1811do_time_wait:
1812 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1813 tcp_tw_put((struct tcp_tw_bucket *) sk);
1814 goto discard_it;
1815 }
1816
1817 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1818 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1819 tcp_tw_put((struct tcp_tw_bucket *) sk);
1820 goto discard_it;
1821 }
1822 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1823 skb, th, skb->len)) {
1824 case TCP_TW_SYN: {
1825 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1826 ntohs(th->dest),
1827 tcp_v4_iif(skb));
1828 if (sk2) {
1829 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1830 tcp_tw_put((struct tcp_tw_bucket *)sk);
1831 sk = sk2;
1832 goto process;
1833 }
1834 /* Fall through to ACK */
1835 }
1836 case TCP_TW_ACK:
1837 tcp_v4_timewait_ack(sk, skb);
1838 break;
1839 case TCP_TW_RST:
1840 goto no_tcp_socket;
1841 case TCP_TW_SUCCESS:;
1842 }
1843 goto discard_it;
1844}
1845
1846/* With per-bucket locks this operation is not atomic, so
1847 * this version is no worse.
1848 */
1849static void __tcp_v4_rehash(struct sock *sk)
1850{
1851 sk->sk_prot->unhash(sk);
1852 sk->sk_prot->hash(sk);
1853}
1854
1855static int tcp_v4_reselect_saddr(struct sock *sk)
1856{
1857 struct inet_sock *inet = inet_sk(sk);
1858 int err;
1859 struct rtable *rt;
1860 __u32 old_saddr = inet->saddr;
1861 __u32 new_saddr;
1862 __u32 daddr = inet->daddr;
1863
1864 if (inet->opt && inet->opt->srr)
1865 daddr = inet->opt->faddr;
1866
1867 /* Query new route. */
1868 err = ip_route_connect(&rt, daddr, 0,
1869 RT_CONN_FLAGS(sk),
1870 sk->sk_bound_dev_if,
1871 IPPROTO_TCP,
1872 inet->sport, inet->dport, sk);
1873 if (err)
1874 return err;
1875
1876 __sk_dst_set(sk, &rt->u.dst);
1877 tcp_v4_setup_caps(sk, &rt->u.dst);
1878
1879 new_saddr = rt->rt_src;
1880
1881 if (new_saddr == old_saddr)
1882 return 0;
1883
1884 if (sysctl_ip_dynaddr > 1) {
1885 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1886 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1887 NIPQUAD(old_saddr),
1888 NIPQUAD(new_saddr));
1889 }
1890
1891 inet->saddr = new_saddr;
1892 inet->rcv_saddr = new_saddr;
1893
1894 /* XXX The only one ugly spot where we need to
1895 * XXX really change the sockets identity after
1896 * XXX it has entered the hashes. -DaveM
1897 *
1898 * Besides that, it does not check for connection
1899 * uniqueness. Wait for troubles.
1900 */
1901 __tcp_v4_rehash(sk);
1902 return 0;
1903}
1904
1905int tcp_v4_rebuild_header(struct sock *sk)
1906{
1907 struct inet_sock *inet = inet_sk(sk);
1908 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1909 u32 daddr;
1910 int err;
1911
1912 /* Route is OK, nothing to do. */
1913 if (rt)
1914 return 0;
1915
1916 /* Reroute. */
1917 daddr = inet->daddr;
1918 if (inet->opt && inet->opt->srr)
1919 daddr = inet->opt->faddr;
1920
1921 {
1922 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1923 .nl_u = { .ip4_u =
1924 { .daddr = daddr,
1925 .saddr = inet->saddr,
1926 .tos = RT_CONN_FLAGS(sk) } },
1927 .proto = IPPROTO_TCP,
1928 .uli_u = { .ports =
1929 { .sport = inet->sport,
1930 .dport = inet->dport } } };
1931
1932 err = ip_route_output_flow(&rt, &fl, sk, 0);
1933 }
1934 if (!err) {
1935 __sk_dst_set(sk, &rt->u.dst);
1936 tcp_v4_setup_caps(sk, &rt->u.dst);
1937 return 0;
1938 }
1939
1940 /* Routing failed... */
1941 sk->sk_route_caps = 0;
1942
1943 if (!sysctl_ip_dynaddr ||
1944 sk->sk_state != TCP_SYN_SENT ||
1945 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1946 (err = tcp_v4_reselect_saddr(sk)) != 0)
1947 sk->sk_err_soft = -err;
1948
1949 return err;
1950}
1951
1952static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1953{
1954 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1955 struct inet_sock *inet = inet_sk(sk);
1956
1957 sin->sin_family = AF_INET;
1958 sin->sin_addr.s_addr = inet->daddr;
1959 sin->sin_port = inet->dport;
1960}
1961
1962/* VJ's idea. Save last timestamp seen from this destination
1963 * and hold it at least for normal timewait interval to use for duplicate
1964 * segment detection in subsequent connections, before they enter synchronized
1965 * state.
1966 */
1967
1968int tcp_v4_remember_stamp(struct sock *sk)
1969{
1970 struct inet_sock *inet = inet_sk(sk);
1971 struct tcp_sock *tp = tcp_sk(sk);
1972 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1973 struct inet_peer *peer = NULL;
1974 int release_it = 0;
1975
1976 if (!rt || rt->rt_dst != inet->daddr) {
1977 peer = inet_getpeer(inet->daddr, 1);
1978 release_it = 1;
1979 } else {
1980 if (!rt->peer)
1981 rt_bind_peer(rt, 1);
1982 peer = rt->peer;
1983 }
1984
1985 if (peer) {
1986 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1987 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1988 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1989 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1990 peer->tcp_ts = tp->rx_opt.ts_recent;
1991 }
1992 if (release_it)
1993 inet_putpeer(peer);
1994 return 1;
1995 }
1996
1997 return 0;
1998}
1999
2000int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2001{
2002 struct inet_peer *peer = NULL;
2003
2004 peer = inet_getpeer(tw->tw_daddr, 1);
2005
2006 if (peer) {
2007 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2008 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2009 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2010 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2011 peer->tcp_ts = tw->tw_ts_recent;
2012 }
2013 inet_putpeer(peer);
2014 return 1;
2015 }
2016
2017 return 0;
2018}
2019
2020struct tcp_func ipv4_specific = {
2021 .queue_xmit = ip_queue_xmit,
2022 .send_check = tcp_v4_send_check,
2023 .rebuild_header = tcp_v4_rebuild_header,
2024 .conn_request = tcp_v4_conn_request,
2025 .syn_recv_sock = tcp_v4_syn_recv_sock,
2026 .remember_stamp = tcp_v4_remember_stamp,
2027 .net_header_len = sizeof(struct iphdr),
2028 .setsockopt = ip_setsockopt,
2029 .getsockopt = ip_getsockopt,
2030 .addr2sockaddr = v4_addr2sockaddr,
2031 .sockaddr_len = sizeof(struct sockaddr_in),
2032};
2033
2034/* NOTE: A lot of things are set to zero explicitly by the call to
2035 * sk_alloc(), so they need not be done here.
2036 */
2037static int tcp_v4_init_sock(struct sock *sk)
2038{
2039 struct tcp_sock *tp = tcp_sk(sk);
2040
2041 skb_queue_head_init(&tp->out_of_order_queue);
2042 tcp_init_xmit_timers(sk);
2043 tcp_prequeue_init(tp);
2044
2045 tp->rto = TCP_TIMEOUT_INIT;
2046 tp->mdev = TCP_TIMEOUT_INIT;
2047
2048 /* So many TCP implementations out there (incorrectly) count the
2049 * initial SYN frame in their delayed-ACK and congestion control
2050 * algorithms that we must have the following bandaid to talk
2051 * efficiently to them. -DaveM
2052 */
2053 tp->snd_cwnd = 2;
2054
2055 /* See draft-stevens-tcpca-spec-01 for discussion of the
2056 * initialization of these values.
2057 */
2058 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2059 tp->snd_cwnd_clamp = ~0;
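	/* 536 is the classic default MSS: the 576-byte minimum IPv4
	 * reassembly buffer (RFC 1122) minus 40 bytes of IP + TCP headers.
	 */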
2060 tp->mss_cache_std = tp->mss_cache = 536;
2061
2062 tp->reordering = sysctl_tcp_reordering;
2063
2064 sk->sk_state = TCP_CLOSE;
2065
2066 sk->sk_write_space = sk_stream_write_space;
2067 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2068
2069 tp->af_specific = &ipv4_specific;
2070
2071 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2072 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2073
2074 atomic_inc(&tcp_sockets_allocated);
2075
2076 return 0;
2077}
2078
2079int tcp_v4_destroy_sock(struct sock *sk)
2080{
2081 struct tcp_sock *tp = tcp_sk(sk);
2082
2083 tcp_clear_xmit_timers(sk);
2084
2085	/* Clean up the write buffer. */
2086 sk_stream_writequeue_purge(sk);
2087
2088 /* Cleans up our, hopefully empty, out_of_order_queue. */
2089 __skb_queue_purge(&tp->out_of_order_queue);
2090
2091	/* Clean up the prequeue; it should already be empty. */
2092 __skb_queue_purge(&tp->ucopy.prequeue);
2093
2094 /* Clean up a referenced TCP bind bucket. */
2095 if (tp->bind_hash)
2096 tcp_put_port(sk);
2097
2098	/*
2099	 * If a cached sendmsg page exists, free it.
2100	 */
2101 if (sk->sk_sndmsg_page) {
2102 __free_page(sk->sk_sndmsg_page);
2103 sk->sk_sndmsg_page = NULL;
2104 }
2105
2106 atomic_dec(&tcp_sockets_allocated);
2107
2108 return 0;
2109}
2110
2111EXPORT_SYMBOL(tcp_v4_destroy_sock);
2112
2113#ifdef CONFIG_PROC_FS
2114/* Proc filesystem TCP sock list dumping. */
2115
2116static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2117{
2118 return hlist_empty(head) ? NULL :
2119 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2120}
2121
2122static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2123{
2124 return tw->tw_node.next ?
2125 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2126}
2127
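/* Iterate the listening hash.  While positioned on a listener that has
 * pending open requests, walk its SYN table (TCP_SEQ_STATE_OPENREQ) under
 * syn_wait_lock before advancing to the next listening socket.
 */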
2128static void *listening_get_next(struct seq_file *seq, void *cur)
2129{
2130 struct tcp_sock *tp;
2131 struct hlist_node *node;
2132 struct sock *sk = cur;
2133 struct tcp_iter_state* st = seq->private;
2134
2135 if (!sk) {
2136 st->bucket = 0;
2137 sk = sk_head(&tcp_listening_hash[0]);
2138 goto get_sk;
2139 }
2140
2141 ++st->num;
2142
2143 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2144 struct open_request *req = cur;
2145
2146 tp = tcp_sk(st->syn_wait_sk);
2147 req = req->dl_next;
2148 while (1) {
2149 while (req) {
2150 if (req->class->family == st->family) {
2151 cur = req;
2152 goto out;
2153 }
2154 req = req->dl_next;
2155 }
2156 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2157 break;
2158get_req:
2159 req = tp->listen_opt->syn_table[st->sbucket];
2160 }
2161 sk = sk_next(st->syn_wait_sk);
2162 st->state = TCP_SEQ_STATE_LISTENING;
2163 read_unlock_bh(&tp->syn_wait_lock);
2164 } else {
2165 tp = tcp_sk(sk);
2166 read_lock_bh(&tp->syn_wait_lock);
2167 if (tp->listen_opt && tp->listen_opt->qlen)
2168 goto start_req;
2169 read_unlock_bh(&tp->syn_wait_lock);
2170 sk = sk_next(sk);
2171 }
2172get_sk:
2173 sk_for_each_from(sk, node) {
2174 if (sk->sk_family == st->family) {
2175 cur = sk;
2176 goto out;
2177 }
2178 tp = tcp_sk(sk);
2179 read_lock_bh(&tp->syn_wait_lock);
2180 if (tp->listen_opt && tp->listen_opt->qlen) {
2181start_req:
2182 st->uid = sock_i_uid(sk);
2183 st->syn_wait_sk = sk;
2184 st->state = TCP_SEQ_STATE_OPENREQ;
2185 st->sbucket = 0;
2186 goto get_req;
2187 }
2188 read_unlock_bh(&tp->syn_wait_lock);
2189 }
2190 if (++st->bucket < TCP_LHTABLE_SIZE) {
2191 sk = sk_head(&tcp_listening_hash[st->bucket]);
2192 goto get_sk;
2193 }
2194 cur = NULL;
2195out:
2196 return cur;
2197}
2198
2199static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2200{
2201 void *rc = listening_get_next(seq, NULL);
2202
2203 while (rc && *pos) {
2204 rc = listening_get_next(seq, rc);
2205 --*pos;
2206 }
2207 return rc;
2208}
2209
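/* Find the first socket of the requested family in the established hash.
 * Note: the bucket read lock is left held when an entry is returned; it is
 * dropped by established_get_next() or tcp_seq_stop().
 */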
2210static void *established_get_first(struct seq_file *seq)
2211{
2212 struct tcp_iter_state* st = seq->private;
2213 void *rc = NULL;
2214
2215 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2216 struct sock *sk;
2217 struct hlist_node *node;
2218 struct tcp_tw_bucket *tw;
2219
2220 /* We can reschedule _before_ having picked the target: */
2221 cond_resched_softirq();
2222
2223 read_lock(&tcp_ehash[st->bucket].lock);
2224 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2225 if (sk->sk_family != st->family) {
2226 continue;
2227 }
2228 rc = sk;
2229 goto out;
2230 }
2231 st->state = TCP_SEQ_STATE_TIME_WAIT;
2232 tw_for_each(tw, node,
2233 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2234 if (tw->tw_family != st->family) {
2235 continue;
2236 }
2237 rc = tw;
2238 goto out;
2239 }
2240 read_unlock(&tcp_ehash[st->bucket].lock);
2241 st->state = TCP_SEQ_STATE_ESTABLISHED;
2242 }
2243out:
2244 return rc;
2245}
2246
2247static void *established_get_next(struct seq_file *seq, void *cur)
2248{
2249 struct sock *sk = cur;
2250 struct tcp_tw_bucket *tw;
2251 struct hlist_node *node;
2252 struct tcp_iter_state* st = seq->private;
2253
2254 ++st->num;
2255
2256 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2257 tw = cur;
2258 tw = tw_next(tw);
2259get_tw:
2260 while (tw && tw->tw_family != st->family) {
2261 tw = tw_next(tw);
2262 }
2263 if (tw) {
2264 cur = tw;
2265 goto out;
2266 }
2267 read_unlock(&tcp_ehash[st->bucket].lock);
2268 st->state = TCP_SEQ_STATE_ESTABLISHED;
2269
2270 /* We can reschedule between buckets: */
2271 cond_resched_softirq();
2272
2273 if (++st->bucket < tcp_ehash_size) {
2274 read_lock(&tcp_ehash[st->bucket].lock);
2275 sk = sk_head(&tcp_ehash[st->bucket].chain);
2276 } else {
2277 cur = NULL;
2278 goto out;
2279 }
2280 } else
2281 sk = sk_next(sk);
2282
2283 sk_for_each_from(sk, node) {
2284 if (sk->sk_family == st->family)
2285 goto found;
2286 }
2287
2288 st->state = TCP_SEQ_STATE_TIME_WAIT;
2289 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2290 goto get_tw;
2291found:
2292 cur = sk;
2293out:
2294 return cur;
2295}
2296
2297static void *established_get_idx(struct seq_file *seq, loff_t pos)
2298{
2299 void *rc = established_get_first(seq);
2300
2301 while (rc && pos) {
2302 rc = established_get_next(seq, rc);
2303 --pos;
2304 }
2305 return rc;
2306}
2307
2308static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2309{
2310 void *rc;
2311 struct tcp_iter_state* st = seq->private;
2312
2313 tcp_listen_lock();
2314 st->state = TCP_SEQ_STATE_LISTENING;
2315 rc = listening_get_idx(seq, &pos);
2316
2317 if (!rc) {
2318 tcp_listen_unlock();
2319 local_bh_disable();
2320 st->state = TCP_SEQ_STATE_ESTABLISHED;
2321 rc = established_get_idx(seq, pos);
2322 }
2323
2324 return rc;
2325}
2326
2327static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2328{
2329 struct tcp_iter_state* st = seq->private;
2330 st->state = TCP_SEQ_STATE_LISTENING;
2331 st->num = 0;
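	/* Position 0 is the header line (SEQ_START_TOKEN), so entry N is
	 * found at offset *pos - 1.
	 */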
2332 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2333}
2334
2335static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2336{
2337 void *rc = NULL;
2338 struct tcp_iter_state* st;
2339
2340 if (v == SEQ_START_TOKEN) {
2341 rc = tcp_get_idx(seq, 0);
2342 goto out;
2343 }
2344 st = seq->private;
2345
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_OPENREQ:
2348 case TCP_SEQ_STATE_LISTENING:
2349 rc = listening_get_next(seq, v);
2350 if (!rc) {
2351 tcp_listen_unlock();
2352 local_bh_disable();
2353 st->state = TCP_SEQ_STATE_ESTABLISHED;
2354 rc = established_get_first(seq);
2355 }
2356 break;
2357 case TCP_SEQ_STATE_ESTABLISHED:
2358 case TCP_SEQ_STATE_TIME_WAIT:
2359 rc = established_get_next(seq, v);
2360 break;
2361 }
2362out:
2363 ++*pos;
2364 return rc;
2365}
2366
2367static void tcp_seq_stop(struct seq_file *seq, void *v)
2368{
2369 struct tcp_iter_state* st = seq->private;
2370
2371 switch (st->state) {
2372 case TCP_SEQ_STATE_OPENREQ:
2373 if (v) {
2374 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2375 read_unlock_bh(&tp->syn_wait_lock);
2376 }
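		/* Fall through: the listening lock is released below. */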
2377 case TCP_SEQ_STATE_LISTENING:
2378 if (v != SEQ_START_TOKEN)
2379 tcp_listen_unlock();
2380 break;
2381 case TCP_SEQ_STATE_TIME_WAIT:
2382 case TCP_SEQ_STATE_ESTABLISHED:
2383 if (v)
2384 read_unlock(&tcp_ehash[st->bucket].lock);
2385 local_bh_enable();
2386 break;
2387 }
2388}
2389
2390static int tcp_seq_open(struct inode *inode, struct file *file)
2391{
2392 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2393 struct seq_file *seq;
2394 struct tcp_iter_state *s;
2395 int rc;
2396
2397 if (unlikely(afinfo == NULL))
2398 return -EINVAL;
2399
2400 s = kmalloc(sizeof(*s), GFP_KERNEL);
2401 if (!s)
2402 return -ENOMEM;
2403 memset(s, 0, sizeof(*s));
2404 s->family = afinfo->family;
2405 s->seq_ops.start = tcp_seq_start;
2406 s->seq_ops.next = tcp_seq_next;
2407 s->seq_ops.show = afinfo->seq_show;
2408 s->seq_ops.stop = tcp_seq_stop;
2409
2410 rc = seq_open(file, &s->seq_ops);
2411 if (rc)
2412 goto out_kfree;
2413 seq = file->private_data;
2414 seq->private = s;
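	/* seq_release_private() frees 's' along with the seq_file on close. */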
2415out:
2416 return rc;
2417out_kfree:
2418 kfree(s);
2419 goto out;
2420}
2421
2422int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2423{
2424 int rc = 0;
2425 struct proc_dir_entry *p;
2426
2427 if (!afinfo)
2428 return -EINVAL;
2429 afinfo->seq_fops->owner = afinfo->owner;
2430 afinfo->seq_fops->open = tcp_seq_open;
2431 afinfo->seq_fops->read = seq_read;
2432 afinfo->seq_fops->llseek = seq_lseek;
2433 afinfo->seq_fops->release = seq_release_private;
2434
2435 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2436 if (p)
2437 p->data = afinfo;
2438 else
2439 rc = -ENOMEM;
2440 return rc;
2441}
2442
2443void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2444{
2445 if (!afinfo)
2446 return;
2447 proc_net_remove(afinfo->name);
2448 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2449}
2450
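/* Format one SYN_RECV open request using the /proc/net/tcp line layout. */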
2451static void get_openreq4(struct sock *sk, struct open_request *req,
2452 char *tmpbuf, int i, int uid)
2453{
2454 int ttd = req->expires - jiffies;
2455
2456 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2457 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2458 i,
2459 req->af.v4_req.loc_addr,
2460 ntohs(inet_sk(sk)->sport),
2461 req->af.v4_req.rmt_addr,
2462 ntohs(req->rmt_port),
2463 TCP_SYN_RECV,
2464 0, 0, /* could print option size, but that is af dependent. */
2465 1, /* timers active (only the expire timer) */
2466 jiffies_to_clock_t(ttd),
2467 req->retrans,
2468 uid,
2469 0, /* non standard timer */
2470 0, /* open_requests have no inode */
2471 atomic_read(&sk->sk_refcnt),
2472 req);
2473}
2474
2475static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2476{
2477 int timer_active;
2478 unsigned long timer_expires;
2479 struct tcp_sock *tp = tcp_sk(sp);
2480 struct inet_sock *inet = inet_sk(sp);
2481 unsigned int dest = inet->daddr;
2482 unsigned int src = inet->rcv_saddr;
2483 __u16 destp = ntohs(inet->dport);
2484 __u16 srcp = ntohs(inet->sport);
2485
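	/* Timer codes reported in /proc/net/tcp: 0 none, 1 retransmit,
	 * 2 another timer (e.g. keepalive), 3 TIME_WAIT, 4 zero-window probe.
	 */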
2486 if (tp->pending == TCP_TIME_RETRANS) {
2487 timer_active = 1;
2488 timer_expires = tp->timeout;
2489 } else if (tp->pending == TCP_TIME_PROBE0) {
2490 timer_active = 4;
2491 timer_expires = tp->timeout;
2492 } else if (timer_pending(&sp->sk_timer)) {
2493 timer_active = 2;
2494 timer_expires = sp->sk_timer.expires;
2495 } else {
2496 timer_active = 0;
2497 timer_expires = jiffies;
2498 }
2499
2500 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2501 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2502 i, src, srcp, dest, destp, sp->sk_state,
2503 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2504 timer_active,
2505 jiffies_to_clock_t(timer_expires - jiffies),
2506 tp->retransmits,
2507 sock_i_uid(sp),
2508 tp->probes_out,
2509 sock_i_ino(sp),
2510 atomic_read(&sp->sk_refcnt), sp,
2511 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2512 tp->snd_cwnd,
2513 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2514}
2515
2516static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2517{
2518 unsigned int dest, src;
2519 __u16 destp, srcp;
2520 int ttd = tw->tw_ttd - jiffies;
2521
2522 if (ttd < 0)
2523 ttd = 0;
2524
2525 dest = tw->tw_daddr;
2526 src = tw->tw_rcv_saddr;
2527 destp = ntohs(tw->tw_dport);
2528 srcp = ntohs(tw->tw_sport);
2529
2530 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2531 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2532 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2533 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2534 atomic_read(&tw->tw_refcnt), tw);
2535}
2536
2537#define TMPSZ 150
2538
2539static int tcp4_seq_show(struct seq_file *seq, void *v)
2540{
2541 struct tcp_iter_state* st;
2542 char tmpbuf[TMPSZ + 1];
2543
2544 if (v == SEQ_START_TOKEN) {
2545 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2546 " sl local_address rem_address st tx_queue "
2547 "rx_queue tr tm->when retrnsmt uid timeout "
2548 "inode");
2549 goto out;
2550 }
2551 st = seq->private;
2552
2553 switch (st->state) {
2554 case TCP_SEQ_STATE_LISTENING:
2555 case TCP_SEQ_STATE_ESTABLISHED:
2556 get_tcp4_sock(v, tmpbuf, st->num);
2557 break;
2558 case TCP_SEQ_STATE_OPENREQ:
2559 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2560 break;
2561 case TCP_SEQ_STATE_TIME_WAIT:
2562 get_timewait4_sock(v, tmpbuf, st->num);
2563 break;
2564 }
2565 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2566out:
2567 return 0;
2568}
2569
2570static struct file_operations tcp4_seq_fops;
2571static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2572 .owner = THIS_MODULE,
2573 .name = "tcp",
2574 .family = AF_INET,
2575 .seq_show = tcp4_seq_show,
2576 .seq_fops = &tcp4_seq_fops,
2577};
2578
2579int __init tcp4_proc_init(void)
2580{
2581 return tcp_proc_register(&tcp4_seq_afinfo);
2582}
2583
2584void tcp4_proc_exit(void)
2585{
2586 tcp_proc_unregister(&tcp4_seq_afinfo);
2587}
2588#endif /* CONFIG_PROC_FS */
2589
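/* Protocol hooks handed to the socket layer for SOCK_STREAM/IPPROTO_TCP
 * sockets; registered from af_inet.c.
 */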
2590struct proto tcp_prot = {
2591 .name = "TCP",
2592 .owner = THIS_MODULE,
2593 .close = tcp_close,
2594 .connect = tcp_v4_connect,
2595 .disconnect = tcp_disconnect,
2596 .accept = tcp_accept,
2597 .ioctl = tcp_ioctl,
2598 .init = tcp_v4_init_sock,
2599 .destroy = tcp_v4_destroy_sock,
2600 .shutdown = tcp_shutdown,
2601 .setsockopt = tcp_setsockopt,
2602 .getsockopt = tcp_getsockopt,
2603 .sendmsg = tcp_sendmsg,
2604 .recvmsg = tcp_recvmsg,
2605 .backlog_rcv = tcp_v4_do_rcv,
2606 .hash = tcp_v4_hash,
2607 .unhash = tcp_unhash,
2608 .get_port = tcp_v4_get_port,
2609 .enter_memory_pressure = tcp_enter_memory_pressure,
2610 .sockets_allocated = &tcp_sockets_allocated,
2611 .memory_allocated = &tcp_memory_allocated,
2612 .memory_pressure = &tcp_memory_pressure,
2613 .sysctl_mem = sysctl_tcp_mem,
2614 .sysctl_wmem = sysctl_tcp_wmem,
2615 .sysctl_rmem = sysctl_tcp_rmem,
2616 .max_header = MAX_TCP_HEADER,
2617 .obj_size = sizeof(struct tcp_sock),
2618};
2619
2620
2621
2622void __init tcp_v4_init(struct net_proto_family *ops)
2623{
2624 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2625 if (err < 0)
2626 panic("Failed to create the TCP control socket.\n");
2627 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
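	/* A uc_ttl of -1 means packets sent from this control socket
	 * (RSTs and pure ACKs) use the default TTL.
	 */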
2628 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2629
2630 /* Unhash it so that IP input processing does not even
2631	 * see it; we do not want this socket to receive incoming
2632	 * packets.
2633 */
2634 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2635}
2636
2637EXPORT_SYMBOL(ipv4_specific);
2638EXPORT_SYMBOL(tcp_bind_hash);
2639EXPORT_SYMBOL(tcp_bucket_create);
2640EXPORT_SYMBOL(tcp_hashinfo);
2641EXPORT_SYMBOL(tcp_inherit_port);
2642EXPORT_SYMBOL(tcp_listen_wlock);
2643EXPORT_SYMBOL(tcp_port_rover);
2644EXPORT_SYMBOL(tcp_prot);
2645EXPORT_SYMBOL(tcp_put_port);
2646EXPORT_SYMBOL(tcp_unhash);
2647EXPORT_SYMBOL(tcp_v4_conn_request);
2648EXPORT_SYMBOL(tcp_v4_connect);
2649EXPORT_SYMBOL(tcp_v4_do_rcv);
2650EXPORT_SYMBOL(tcp_v4_rebuild_header);
2651EXPORT_SYMBOL(tcp_v4_remember_stamp);
2652EXPORT_SYMBOL(tcp_v4_send_check);
2653EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2654
2655#ifdef CONFIG_PROC_FS
2656EXPORT_SYMBOL(tcp_proc_register);
2657EXPORT_SYMBOL(tcp_proc_unregister);
2658#endif
2659EXPORT_SYMBOL(sysctl_local_port_range);
2660EXPORT_SYMBOL(sysctl_max_syn_backlog);
2661EXPORT_SYMBOL(sysctl_tcp_low_latency);
2662EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2663