Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--	net/ipv4/tcp_ipv4.c	269
1 file changed, 51 insertions(+), 218 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e1929b..e9f83e5b28ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
 #include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
+#include <net/timewait_sock.h>
 #include <net/xfrm.h>
 
 #include <linux/inet.h>
@@ -86,8 +87,7 @@ int sysctl_tcp_low_latency;
 /* Socket used for sending RSTs */
 static struct socket *tcp_socket;
 
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
-		       struct sk_buff *skb);
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
 
 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 {
-	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
+	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
+				 inet_csk_bind_conflict);
 }
 
 static void tcp_v4_hash(struct sock *sk)
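
tcp_v4_get_port() now hands a bind-conflict callback to the generic port allocator, keeping address-family-specific conflict rules out of inet_csk_get_port(). A minimal sketch of the shape such a callback takes; the signature is assumed from the call above, and example_bind_conflict is a hypothetical name (the real inet_csk_bind_conflict() lives in net/ipv4/inet_connection_sock.c):

	/* Simplified conflict check: two sockets conflict on a port when
	 * their bound devices overlap.  The real check also honours
	 * SO_REUSEADDR and rcv_saddr.
	 */
	static int example_bind_conflict(const struct sock *sk,
					 const struct inet_bind_bucket *tb)
	{
		const struct sock *sk2;
		const struct hlist_node *node;

		sk_for_each_bound(sk2, node, &tb->owners) {
			if (sk != sk2 &&
			    (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if ||
			     sk->sk_bound_dev_if == sk2->sk_bound_dev_if))
				return 1;	/* conflict */
		}
		return 0;
	}
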
@@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 			   skb->h.th->source);
 }
 
-/* called with local bh disabled */
-static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
-				      struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	u32 daddr = inet->rcv_saddr;
-	u32 saddr = inet->daddr;
-	int dif = sk->sk_bound_dev_if;
-	INET_ADDR_COOKIE(acookie, saddr, daddr)
-	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
-	struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
-		tw = inet_twsk(sk2);
-
-		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
-			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
-			struct tcp_sock *tp = tcp_sk(sk);
-
-			/* With PAWS, it is safe from the viewpoint
-			   of data integrity. Even without PAWS it
-			   is safe provided sequence spaces do not
-			   overlap i.e. at data rates <= 80Mbit/sec.
-
-			   Actually, the idea is close to VJ's one,
-			   only timestamp cache is held not per host,
-			   but per port pair and TW bucket is used
-			   as state holder.
-
-			   If TW bucket has been already destroyed we
-			   fall back to VJ's scheme and use initial
-			   timestamp retrieved from peer table.
-			 */
-			if (tcptw->tw_ts_recent_stamp &&
-			    (!twp || (sysctl_tcp_tw_reuse &&
-				      xtime.tv_sec -
-				      tcptw->tw_ts_recent_stamp > 1))) {
-				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-				if (tp->write_seq == 0)
-					tp->write_seq = 1;
-				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
-				sock_hold(sk2);
-				goto unique;
-			} else
-				goto not_unique;
-		}
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-unique:
-	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
-	sk->sk_hash = hash;
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &tcp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-static inline u32 connect_port_offset(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-
-	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
-					 inet->dport);
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static inline int tcp_v4_hash_connect(struct sock *sk)
-{
-	const unsigned short snum = inet_sk(sk)->num;
-	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret;
-
-	if (!snum) {
-		int low = sysctl_local_port_range[0];
-		int high = sysctl_local_port_range[1];
-		int range = high - low;
-		int i;
-		int port;
-		static u32 hint;
-		u32 offset = hint + connect_port_offset(sk);
-		struct hlist_node *node;
-		struct inet_timewait_sock *tw = NULL;
-
-		local_bh_disable();
-		for (i = 1; i <= range; i++) {
-			port = low + (i + offset) % range;
-			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
-			spin_lock(&head->lock);
-
-			/* Does not bother with rcv_saddr checks,
-			 * because the established check is already
-			 * unique enough.
-			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
-				if (tb->port == port) {
-					BUG_TRAP(!hlist_empty(&tb->owners));
-					if (tb->fastreuse >= 0)
-						goto next_port;
-					if (!__tcp_v4_check_established(sk,
-									port,
-									&tw))
-						goto ok;
-					goto next_port;
-				}
-			}
-
-			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
-			if (!tb) {
-				spin_unlock(&head->lock);
-				break;
-			}
-			tb->fastreuse = -1;
-			goto ok;
-
-		next_port:
-			spin_unlock(&head->lock);
-		}
-		local_bh_enable();
-
-		return -EADDRNOTAVAIL;
-
-ok:
-		hint += i;
-
-		/* Head lock still held and bh's disabled */
-		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
-			inet_sk(sk)->sport = htons(port);
-			__inet_hash(&tcp_hashinfo, sk, 0);
-		}
-		spin_unlock(&head->lock);
-
-		if (tw) {
-			inet_twsk_deschedule(tw, &tcp_death_row);;
-			inet_twsk_put(tw);
-		}
-
-		ret = 0;
-		goto out;
-	}
-
-	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
-	tb  = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet_hash(&tcp_hashinfo, sk, 0);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __tcp_v4_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* With PAWS, it is safe from the viewpoint
+	   of data integrity. Even without PAWS it is safe provided sequence
+	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
+
+	   Actually, the idea is close to VJ's one, only timestamp cache is
+	   held not per host, but per port pair and TW bucket is used as state
+	   holder.
+
+	   If TW bucket has been already destroyed we fall back to VJ's scheme
+	   and use initial timestamp retrieved from peer table.
+	 */
+	if (tcptw->tw_ts_recent_stamp &&
+	    (twp == NULL || (sysctl_tcp_tw_reuse &&
+			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
+		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+		if (tp->write_seq == 0)
+			tp->write_seq = 1;
+		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		sock_hold(sktw);
+		return 1;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 
 /* This will initiate an outgoing connection. */
 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
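
tcp_twsk_unique() keeps only the TIME-WAIT reuse decision from the old __tcp_v4_check_established(); the hash-table walk itself moves into the generic inet_hash_connect(). Generic code is expected to reach it through the twsk_prot operations registered further down in this file, via a dispatcher of roughly this shape (assumed; it is not part of this diff):

	/* Sketch: generic dispatch to the per-protocol uniqueness check.
	 * A nonzero return means the TIME-WAIT socket may be reused for
	 * the new connection.
	 */
	static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
	{
		if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
			return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
		return 0;
	}
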
@@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->dport = usin->sin_port;
 	inet->daddr = daddr;
 
-	tp->ext_header_len = 0;
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet->opt)
-		tp->ext_header_len = inet->opt->optlen;
+		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 
 	tp->rx_opt.mss_clamp = 536;
 
@@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	 * complete initialization after this.
 	 */
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = tcp_v4_hash_connect(sk);
+	err = inet_hash_connect(&tcp_death_row, sk);
 	if (err)
 		goto failure;
 
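
Moving ext_header_len from struct tcp_sock into inet_connection_sock (as icsk_ext_hdr_len) lets code that is not TCP-specific account for IP option space when computing the MSS. Roughly, and this is a simplified sketch with a hypothetical helper name rather than the actual tcp_sync_mss() body:

	/* Hypothetical helper: derive an MSS from a path MTU, charging
	 * the connection's extension-header (IP options) length.
	 */
	static int example_mss_from_pmtu(const struct sock *sk, u32 pmtu)
	{
		int mss = pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

		mss -= inet_csk(sk)->icsk_ext_hdr_len;
		if (mss < 48)	/* arbitrary floor for the sketch */
			mss = 48;
		return mss;
	}
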
@@ -433,12 +270,10 @@ failure:
 /*
  * This routine does path mtu discovery as defined in RFC1191.
  */
-static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
-				     u32 mtu)
+static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 {
 	struct dst_entry *dst;
 	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 	 * send out by Linux are always <576bytes so they should go through
@@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
-	    tp->pmtu_cookie > mtu) {
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		tcp_sync_mss(sk, mtu);
 
 		/* Resend the TCP packet because it's
@@ -644,10 +479,10 @@ out:
 }
 
 /* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
-		       struct sk_buff *skb)
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct tcphdr *th = skb->h.th;
 
 	if (skb->ip_summed == CHECKSUM_HW) {
 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
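
Deriving th from the skb inside tcp_v4_send_check() gives send_check the same payload signature for IPv4 and IPv6, which is what lets it sit in the shared inet_connection_sock_af_ops table below. A call site would now look roughly like this (assumed shape; example_checksum_packet is a hypothetical wrapper, the real caller is in net/ipv4/tcp_output.c):

	/* Sketch: transmit path after the signature change; the TCP
	 * header is recovered inside send_check from skb->h.th.
	 */
	static void example_checksum_packet(struct sock *sk, struct sk_buff *skb)
	{
		inet_csk(sk)->icsk_af_ops->send_check(sk, skb->len, skb);
	}
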
@@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-static inline void syn_flood_warning(struct sk_buff *skb)
+#ifdef CONFIG_SYN_COOKIES
+static void syn_flood_warning(struct sk_buff *skb)
 {
 	static unsigned long warntime;
 
@@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb)
 		       ntohs(skb->h.th->dest));
 	}
 }
+#endif
 
 /*
  * Save and compile IPv4 options into the request_sock if needed.
  */
-static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
-						     struct sk_buff *skb)
+static struct ip_options *tcp_v4_save_options(struct sock *sk,
+					      struct sk_buff *skb)
 {
 	struct ip_options *opt = &(IPCB(skb)->opt);
 	struct ip_options *dopt = NULL;
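
With syn_flood_warning() no longer inline, it would draw a defined-but-unused warning in builds without syncookies, hence the CONFIG_SYN_COOKIES guard. Any caller has to sit under the same guard, along these lines (a simplified sketch of the tcp_v4_conn_request() syncookie path; want_cookie is that function's local flag):

	/* Sketch: call site guarded the same way as the definition. */
	#ifdef CONFIG_SYN_COOKIES
		if (want_cookie)
			syn_flood_warning(skb);
	#endif
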
@@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = {
 	.send_reset	= tcp_v4_send_reset,
 };
 
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+};
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq;
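
tcp_timewait_sock_ops packages what used to be the bare twsk_obj_size field in struct proto (see the tcp_prot change at the end of this diff) together with the new uniqueness hook, so generic timewait code can size allocations and test reuse per protocol from one place. A sketch of a consumer, with example_twsk_size a hypothetical name:

	/* Generic code reads the per-protocol timewait object size
	 * through twsk_prot rather than a field in struct proto.
	 */
	static unsigned int example_twsk_size(const struct sock *sk)
	{
		return sk->sk_prot->twsk_prot->twsk_obj_size;
	}
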
@@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	ireq->opt = NULL;
 	newinet->mc_index = inet_iif(skb);
 	newinet->mc_ttl	  = skb->nh.iph->ttl;
-	newtp->ext_header_len = 0;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newinet->opt)
-		newtp->ext_header_len = newinet->opt->optlen;
+		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
 	newinet->id = newtp->write_seq ^ jiffies;
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1314,16 +1156,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
-	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
-	struct inet_sock *inet = inet_sk(sk);
-
-	sin->sin_family		= AF_INET;
-	sin->sin_addr.s_addr	= inet->daddr;
-	sin->sin_port		= inet->dport;
-}
-
 /* VJ's idea. Save last timestamp seen from this destination
  * and hold it at least for normal timewait interval to use for duplicate
  * segment detection in subsequent connections, before they enter synchronized
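
The removed v4_addr2sockaddr() contained nothing TCP-specific, which is why .addr2sockaddr below can point at the generic inet_csk_addr2sockaddr() instead. Presumably the generic version is the removed body under a new name, i.e. something like:

	/* Assumed to match the removed body; the real function lives
	 * in net/ipv4/inet_connection_sock.c.
	 */
	void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
	{
		struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
		struct inet_sock *inet = inet_sk(sk);

		sin->sin_family	     = AF_INET;
		sin->sin_addr.s_addr = inet->daddr;
		sin->sin_port	     = inet->dport;
	}
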
@@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 	return 0;
 }
 
-struct tcp_func ipv4_specific = {
+struct inet_connection_sock_af_ops ipv4_specific = {
 	.queue_xmit	= ip_queue_xmit,
 	.send_check	= tcp_v4_send_check,
 	.rebuild_header	= inet_sk_rebuild_header,
@@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = {
 	.net_header_len	= sizeof(struct iphdr),
 	.setsockopt	= ip_setsockopt,
 	.getsockopt	= ip_getsockopt,
-	.addr2sockaddr	= v4_addr2sockaddr,
+	.addr2sockaddr	= inet_csk_addr2sockaddr,
 	.sockaddr_len	= sizeof(struct sockaddr_in),
 };
 
@@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
-	tp->af_specific = &ipv4_specific;
+	icsk->icsk_af_ops = &ipv4_specific;
+	icsk->icsk_sync_mss = tcp_sync_mss;
 
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
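
The icsk_sync_mss hook lets connection-level generic code force an MSS resync on path-MTU events without referencing TCP symbols directly. A hypothetical generic caller (example_pmtu_event is an illustrative name, not a function in this tree):

	/* Sketch: generic PMTU handling through the new hook. */
	static void example_pmtu_event(struct sock *sk, u32 new_mtu)
	{
		struct inet_connection_sock *icsk = inet_csk(sk);

		if (icsk->icsk_sync_mss != NULL)
			icsk->icsk_sync_mss(sk, new_mtu);
	}
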
@@ -1989,7 +1822,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
-	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
+	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
 };
 