Diffstat (limited to 'net/ipv4')
 net/ipv4/af_inet.c              |  10
 net/ipv4/ah4.c                  |   6
 net/ipv4/arp.c                  |  22
 net/ipv4/esp4.c                 |   4
 net/ipv4/icmp.c                 |  12
 net/ipv4/inet_connection_sock.c |   8
 net/ipv4/inet_diag.c            |  31
 net/ipv4/inet_hashtables.c      | 277
 net/ipv4/inet_lro.c             |   4
 net/ipv4/inet_timewait_sock.c   |  48
 net/ipv4/ip_gre.c               |  44
 net/ipv4/ip_input.c             |  10
 net/ipv4/ip_output.c            |  18
 net/ipv4/ip_sockglue.c          |  40
 net/ipv4/ipcomp.c               |   6
 net/ipv4/ipip.c                 |  35
 net/ipv4/ipmr.c                 |  40
 net/ipv4/netfilter.c            |   4
 net/ipv4/proc.c                 |  66
 net/ipv4/raw.c                  |   2
 net/ipv4/route.c                |   4
 net/ipv4/tcp.c                  |  27
 net/ipv4/tcp_diag.c             |   2
 net/ipv4/tcp_htcp.c             |  14
 net/ipv4/tcp_input.c            | 388
 net/ipv4/tcp_ipv4.c             |  73
 net/ipv4/tcp_output.c           | 129
 net/ipv4/tcp_timer.c            |   2
 net/ipv4/udp.c                  |  76
 net/ipv4/xfrm4_policy.c         |  13
 net/ipv4/xfrm4_state.c          |   3
 31 files changed, 919 insertions(+), 499 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e3286814c8d9..fe03048c130d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -245,7 +245,7 @@ static inline int inet_netns_ok(struct net *net, int protocol)
 	int hash;
 	struct net_protocol *ipprot;
 
-	if (net == &init_net)
+	if (net_eq(net, &init_net))
 		return 1;
 
 	hash = protocol & (MAX_INET_PROTOS - 1);
@@ -272,10 +272,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol)
 	int try_loading_module = 0;
 	int err;
 
-	if (sock->type != SOCK_RAW &&
-	    sock->type != SOCK_DGRAM &&
-	    !inet_ehash_secret)
-		build_ehash_secret();
+	if (unlikely(!inet_ehash_secret))
+		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+			build_ehash_secret();
 
 	sock->state = SS_UNCONNECTED;
 
@@ -1114,6 +1113,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 			},
 		},
 		.proto = sk->sk_protocol,
+		.flags = inet_sk_flowi_flags(sk),
 		.uli_u = {
 			.ports = {
 				.sport = inet->sport,
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 3f205181712d..e878e494296e 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,6 +201,7 @@ out:
 
 static void ah4_err(struct sk_buff *skb, u32 info)
 {
+	struct net *net = dev_net(skb->dev);
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
@@ -209,7 +210,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
-	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+	x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
 		return;
 	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -293,9 +294,7 @@ static void ah_destroy(struct xfrm_state *x)
 		return;
 
 	kfree(ahp->work_icv);
-	ahp->work_icv = NULL;
 	crypto_free_hash(ahp->tfm);
-	ahp->tfm = NULL;
 	kfree(ahp);
 }
 
@@ -316,6 +315,7 @@ static struct net_protocol ah4_protocol = {
 	.handler	=	xfrm4_rcv,
 	.err_handler	=	ah4_err,
 	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static int __init ah4_init(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 957c87dc8e16..29a74c01d8de 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -818,18 +818,18 @@ static int arp_process(struct sk_buff *skb)
 		addr_type = rt->rt_type;
 
 		if (addr_type == RTN_LOCAL) {
-			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
-			if (n) {
-				int dont_send = 0;
-
-				if (!dont_send)
-					dont_send |= arp_ignore(in_dev, sip, tip);
-				if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-					dont_send |= arp_filter(sip, tip, dev);
-				if (!dont_send)
-					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+			int dont_send = 0;
 
-				neigh_release(n);
+			if (!dont_send)
+				dont_send |= arp_ignore(in_dev,sip,tip);
+			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+				dont_send |= arp_filter(sip,tip,dev);
+			if (!dont_send) {
+				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+				if (n) {
+					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+					neigh_release(n);
+				}
 			}
 			goto out;
 		} else if (IN_DEV_FORWARD(in_dev)) {
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 95a9c65003f8..18bb383ea393 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -413,6 +413,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 
 static void esp4_err(struct sk_buff *skb, u32 info)
 {
+	struct net *net = dev_net(skb->dev);
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
@@ -421,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
-	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+	x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
 		return;
 	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -618,6 +619,7 @@ static struct net_protocol esp4_protocol = {
 	.handler	=	xfrm4_rcv,
 	.err_handler	=	esp4_err,
 	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static int __init esp4_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 21e497efbd7f..705b33b184a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -321,12 +321,12 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
 }
 
 static void icmp_push_reply(struct icmp_bxm *icmp_param,
-			    struct ipcm_cookie *ipc, struct rtable *rt)
+			    struct ipcm_cookie *ipc, struct rtable **rt)
 {
 	struct sock *sk;
 	struct sk_buff *skb;
 
-	sk = icmp_sk(dev_net(rt->u.dst.dev));
+	sk = icmp_sk(dev_net((*rt)->u.dst.dev));
 	if (ip_append_data(sk, icmp_glue_bits, icmp_param,
 			   icmp_param->data_len+icmp_param->head_len,
 			   icmp_param->head_len,
@@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	}
 	if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
 			       icmp_param->data.icmph.code))
-		icmp_push_reply(icmp_param, &ipc, rt);
+		icmp_push_reply(icmp_param, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
@@ -562,7 +562,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	/* No need to clone since we're just using its address. */
 	rt2 = rt;
 
-	err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0);
+	err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
 	switch (err) {
 	case 0:
 		if (rt != rt2)
@@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	if (err)
 		goto relookup_failed;
 
-	err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL,
+	err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
 			  XFRM_LOOKUP_ICMP);
 	switch (err) {
 	case 0:
@@ -635,7 +635,7 @@ route_done:
 	icmp_param.data_len = room;
 	icmp_param.head_len = sizeof(struct icmphdr);
 
-	icmp_push_reply(&icmp_param, &ipc, rt);
+	icmp_push_reply(&icmp_param, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
 out_unlock:
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 36f4cbc7da3a..1ccdbba528be 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -109,7 +109,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (tb->ib_net == net && tb->port == rover)
+				if (ib_net(tb) == net && tb->port == rover)
 					goto next;
 			break;
 		next:
@@ -137,7 +137,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 				hashinfo->bhash_size)];
 		spin_lock(&head->lock);
 		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (tb->ib_net == net && tb->port == snum)
+			if (ib_net(tb) == net && tb->port == snum)
 				goto tb_found;
 	}
 	tb = NULL;
@@ -561,7 +561,7 @@ void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	atomic_dec(sk->sk_prot->orphan_count);
+	percpu_counter_dec(sk->sk_prot->orphan_count);
 	sock_put(sk);
 }
 
@@ -641,7 +641,7 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		sock_orphan(child);
 
-		atomic_inc(sk->sk_prot->orphan_count);
+		percpu_counter_inc(sk->sk_prot->orphan_count);
 
 		inet_csk_destroy_sock(child);
 
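The orphan_count change above swaps a single shared atomic_t for a percpu_counter: the hot inc/dec paths touch only a CPU-local slot, and the full sum is computed on the rare read side. Below is a minimal userspace model of that idea, assuming a fixed CPU count and ignoring the kernel's batching threshold; all names are illustrative stand-ins, not the kernel API.

	/* Minimal userspace sketch of the percpu_counter idea.  The real
	 * kernel version also folds per-CPU deltas into a shared count once
	 * they exceed a batch size; that refinement is omitted here. */
	#include <stdio.h>

	#define NR_CPUS 4

	struct percpu_counter {
		long counters[NR_CPUS];	/* one cache line per CPU in the kernel */
	};

	static void pcpu_inc(struct percpu_counter *c, int cpu)
	{
		c->counters[cpu]++;	/* no shared cache line is bounced */
	}

	static long pcpu_sum(struct percpu_counter *c)
	{
		long sum = 0;
		int cpu;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			sum += c->counters[cpu];	/* slow, rare read path */
		return sum;
	}

	int main(void)
	{
		struct percpu_counter orphans = { { 0 } };

		pcpu_inc(&orphans, 0);
		pcpu_inc(&orphans, 3);
		printf("orphans: %ld\n", pcpu_sum(&orphans));
		return 0;
	}

The trade is deliberate: writes scale with CPU count, while reads become O(NR_CPUS) and only approximate under concurrent updates, which is acceptable for a statistic like the orphan count.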
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 564230dabcb8..588a7796e3e3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
 			goto skip_listen_ht;
 
-		inet_listen_lock(hashinfo);
 		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 			struct sock *sk;
-			struct hlist_node *node;
+			struct hlist_nulls_node *node;
+			struct inet_listen_hashbucket *ilb;
 
 			num = 0;
-			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+			ilb = &hashinfo->listening_hash[i];
+			spin_lock_bh(&ilb->lock);
+			sk_nulls_for_each(sk, node, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
 				if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 					goto syn_recv;
 
 				if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-					inet_listen_unlock(hashinfo);
+					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
 
@@ -751,7 +753,7 @@ syn_recv:
 					goto next_listen;
 
 				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
-					inet_listen_unlock(hashinfo);
+					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
 
@@ -760,12 +762,12 @@ next_listen:
 				cb->args[4] = 0;
 				++num;
 			}
+			spin_unlock_bh(&ilb->lock);
 
 			s_num = 0;
 			cb->args[3] = 0;
 			cb->args[4] = 0;
 		}
-		inet_listen_unlock(hashinfo);
 skip_listen_ht:
 		cb->args[0] = 1;
 		s_i = num = s_num = 0;
@@ -776,20 +778,21 @@ skip_listen_ht:
 
 	for (i = s_i; i < hashinfo->ehash_size; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
-		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct sock *sk;
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
 
 		num = 0;
 
-		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+		if (hlist_nulls_empty(&head->chain) &&
+		    hlist_nulls_empty(&head->twchain))
 			continue;
 
 		if (i > s_i)
 			s_num = 0;
 
-		read_lock_bh(lock);
-		sk_for_each(sk, node, &head->chain) {
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
 			if (num < s_num)
@@ -803,7 +806,7 @@ skip_listen_ht:
 			    r->id.idiag_dport)
 				goto next_normal;
 			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-				read_unlock_bh(lock);
+				spin_unlock_bh(lock);
 				goto done;
 			}
 next_normal:
@@ -825,14 +828,14 @@ next_normal:
 				    r->id.idiag_dport)
 					goto next_dying;
 				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-					read_unlock_bh(lock);
+					spin_unlock_bh(lock);
 					goto done;
 				}
next_dying:
 				++num;
 			}
 		}
-		read_unlock_bh(lock);
+		spin_unlock_bh(lock);
 	}
 
 done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 44981906fb91..6a1045da48d2 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -35,7 +35,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
 	if (tb != NULL) {
-		tb->ib_net = hold_net(net);
+		write_pnet(&tb->ib_net, hold_net(net));
 		tb->port = snum;
 		tb->fastreuse = 0;
 		INIT_HLIST_HEAD(&tb->owners);
@@ -51,7 +51,7 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
 {
 	if (hlist_empty(&tb->owners)) {
 		__hlist_del(&tb->node);
-		release_net(tb->ib_net);
+		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
 	}
 }
@@ -110,33 +110,29 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
 
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
-/*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
-	__acquires(hashinfo->lhash_lock)
-{
-	write_lock(&hashinfo->lhash_lock);
-
-	if (atomic_read(&hashinfo->lhash_users)) {
-		DEFINE_WAIT(wait);
-
-		for (;;) {
-			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
-						  &wait, TASK_UNINTERRUPTIBLE);
-			if (!atomic_read(&hashinfo->lhash_users))
-				break;
-			write_unlock_bh(&hashinfo->lhash_lock);
-			schedule();
-			write_lock_bh(&hashinfo->lhash_lock);
-		}
-
-		finish_wait(&hashinfo->lhash_wait, &wait);
-	}
-}
+static inline int compute_score(struct sock *sk, struct net *net,
+				const unsigned short hnum, const __be32 daddr,
+				const int dif)
+{
+	int score = -1;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+			!ipv6_only_sock(sk)) {
+		__be32 rcv_saddr = inet->rcv_saddr;
+		score = sk->sk_family == PF_INET ? 1 : 0;
+		if (rcv_saddr) {
+			if (rcv_saddr != daddr)
+				return -1;
+			score += 2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score += 2;
+		}
+	}
+	return score;
+}
 
 /*
@@ -145,72 +141,48 @@ void inet_listen_wlock(struct inet_hashinfo *hashinfo)
  * remote address for the connection. So always assume those are both
  * wildcarded during the search since they can never be otherwise.
  */
-static struct sock *inet_lookup_listener_slow(struct net *net,
-					      const struct hlist_head *head,
-					      const __be32 daddr,
-					      const unsigned short hnum,
-					      const int dif)
-{
-	struct sock *result = NULL, *sk;
-	const struct hlist_node *node;
-	int hiscore = -1;
-
-	sk_for_each(sk, node, head) {
-		const struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && inet->num == hnum &&
-				!ipv6_only_sock(sk)) {
-			const __be32 rcv_saddr = inet->rcv_saddr;
-			int score = sk->sk_family == PF_INET ? 1 : 0;
-
-			if (rcv_saddr) {
-				if (rcv_saddr != daddr)
-					continue;
-				score += 2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score += 2;
-			}
-			if (score == 5)
-				return sk;
-			if (score > hiscore) {
-				hiscore = score;
-				result = sk;
-			}
-		}
-	}
-	return result;
-}
 
-/* Optimize the common listener case. */
+
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
 				    const __be32 daddr, const unsigned short hnum,
 				    const int dif)
 {
-	struct sock *sk = NULL;
-	const struct hlist_head *head;
-
-	read_lock(&hashinfo->lhash_lock);
-	head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-	if (!hlist_empty(head)) {
-		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
-		if (inet->num == hnum && !sk->sk_node.next &&
-		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-		    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
-			goto sherry_cache;
-		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned int hash = inet_lhashfn(net, hnum);
+	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+	int score, hiscore;
+
+	rcu_read_lock();
+begin:
+	result = NULL;
+	hiscore = -1;
+	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+		score = compute_score(sk, net, hnum, daddr, dif);
+		if (score > hiscore) {
+			result = sk;
+			hiscore = score;
+		}
 	}
-	if (sk) {
-sherry_cache:
-		sock_hold(sk);
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+		goto begin;
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, daddr,
+				  dif) < hiscore)) {
+			sock_put(result);
+			goto begin;
+		}
 	}
-	read_unlock(&hashinfo->lhash_lock);
-	return sk;
+	rcu_read_unlock();
+	return result;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
 
@@ -223,35 +195,65 @@ struct sock * __inet_lookup_established(struct net *net,
 	INET_ADDR_COOKIE(acookie, saddr, daddr)
 	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
 	struct sock *sk;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	/* Optimize here for direct hit, only listening connections can
 	 * have wildcards anyways.
 	 */
 	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+	unsigned int slot = hash & (hashinfo->ehash_size - 1);
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-	prefetch(head->chain.first);
-	read_lock(lock);
-	sk_for_each(sk, node, &head->chain) {
+	rcu_read_lock();
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 		if (INET_MATCH(sk, net, hash, acookie,
-					saddr, daddr, ports, dif))
-			goto hit; /* You sunk my battleship! */
+					saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+				goto begintw;
+			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
+				saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begin;
+			}
+			goto out;
+		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
 
+begintw:
 	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_for_each(sk, node, &head->twchain) {
+	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
 		if (INET_TW_MATCH(sk, net, hash, acookie,
-					saddr, daddr, ports, dif))
-			goto hit;
+					saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+				sk = NULL;
+				goto out;
+			}
+			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
+				 saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begintw;
+			}
+			goto out;
+		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begintw;
 	sk = NULL;
 out:
-	read_unlock(lock);
+	rcu_read_unlock();
 	return sk;
-hit:
-	sock_hold(sk);
-	goto out;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
@@ -270,16 +272,15 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	struct net *net = sock_net(sk);
 	unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
-	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
+	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	struct inet_timewait_sock *tw;
 
-	prefetch(head->chain.first);
-	write_lock(lock);
+	spin_lock(lock);
 
 	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &head->twchain) {
+	sk_nulls_for_each(sk2, node, &head->twchain) {
 		tw = inet_twsk(sk2);
 
 		if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +294,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	tw = NULL;
 
 	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
+	sk_nulls_for_each(sk2, node, &head->chain) {
 		if (INET_MATCH(sk2, net, hash, acookie,
 					saddr, daddr, ports, dif))
 			goto not_unique;
@@ -306,9 +307,9 @@ unique:
 	inet->sport = htons(lport);
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
+	__sk_nulls_add_node_rcu(sk, &head->chain);
+	spin_unlock(lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
 
 	if (twp) {
 		*twp = tw;
@@ -324,7 +325,7 @@ unique:
 	return 0;
 
 not_unique:
-	write_unlock(lock);
+	spin_unlock(lock);
 	return -EADDRNOTAVAIL;
 }
 
@@ -338,8 +339,8 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 void __inet_hash_nolisten(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct hlist_head *list;
-	rwlock_t *lock;
+	struct hlist_nulls_head *list;
+	spinlock_t *lock;
 	struct inet_ehash_bucket *head;
 
 	WARN_ON(!sk_unhashed(sk));
@@ -349,18 +350,17 @@ void __inet_hash_nolisten(struct sock *sk)
 	list = &head->chain;
 	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
-	write_lock(lock);
-	__sk_add_node(sk, list);
+	spin_lock(lock);
+	__sk_nulls_add_node_rcu(sk, list);
+	spin_unlock(lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
 }
 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
 static void __inet_hash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct hlist_head *list;
-	rwlock_t *lock;
+	struct inet_listen_hashbucket *ilb;
 
 	if (sk->sk_state != TCP_LISTEN) {
 		__inet_hash_nolisten(sk);
@@ -368,14 +368,12 @@ static void __inet_hash(struct sock *sk)
 	}
 
 	WARN_ON(!sk_unhashed(sk));
-	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-	lock = &hashinfo->lhash_lock;
+	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
-	inet_listen_wlock(hashinfo);
-	__sk_add_node(sk, list);
+	spin_lock(&ilb->lock);
+	__sk_nulls_add_node_rcu(sk, &ilb->head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
-	wake_up(&hashinfo->lhash_wait);
+	spin_unlock(&ilb->lock);
 }
 
 void inet_hash(struct sock *sk)
@@ -390,27 +388,23 @@ EXPORT_SYMBOL_GPL(inet_hash);
 
 void inet_unhash(struct sock *sk)
 {
-	rwlock_t *lock;
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	spinlock_t *lock;
+	int done;
 
 	if (sk_unhashed(sk))
-		goto out;
+		return;
 
-	if (sk->sk_state == TCP_LISTEN) {
-		local_bh_disable();
-		inet_listen_wlock(hashinfo);
-		lock = &hashinfo->lhash_lock;
-	} else {
+	if (sk->sk_state == TCP_LISTEN)
+		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+	else
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-		write_lock_bh(lock);
-	}
 
-	if (__sk_del_node_init(sk))
+	spin_lock_bh(lock);
+	done =__sk_nulls_del_node_init_rcu(sk);
+	if (done)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-	write_unlock_bh(lock);
-out:
-	if (sk->sk_state == TCP_LISTEN)
-		wake_up(&hashinfo->lhash_wait);
+	spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
@@ -449,7 +443,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			 * unique enough.
 			 */
 			inet_bind_bucket_for_each(tb, node, &head->chain) {
-				if (tb->ib_net == net && tb->port == port) {
+				if (ib_net(tb) == net && tb->port == port) {
 					WARN_ON(hlist_empty(&tb->owners));
 					if (tb->fastreuse >= 0)
 						goto next_port;
@@ -524,3 +518,16 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
 }
 
 EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+	int i;
+
+	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
+		spin_lock_init(&h->listening_hash[i].lock);
+		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+				      i + LISTENING_NULLS_BASE);
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
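Two patterns in the inet_hashtables.c conversion are worth separating. First, lookups now run under rcu_read_lock() over hlist_nulls chains: if the nulls marker found at the end of a walk is not the one expected for that slot, the chain walk may have wandered onto another bucket because a socket was moved concurrently, so the lookup restarts from `begin:`. Second, listener selection is centralized in compute_score(), which disqualifies mismatches with -1 and rewards specificity (+2 for a matching bound address, +2 for a matching bound device, +1 for a plain IPv4 socket). The sketch below models only the scoring rule in plain userspace C, with hypothetical stand-in types rather than kernel structures:

	/* Simplified userspace model of compute_score() above: pick the most
	 * specific listener for an incoming (daddr, dif) pair.  The types
	 * and test values are illustrative stand-ins, not kernel code. */
	#include <stdio.h>

	struct listener {
		unsigned int bound_addr;   /* 0 = wildcard (INADDR_ANY) */
		int          bound_ifidx;  /* 0 = not bound to a device */
		int          is_inet;      /* 1 = plain IPv4 socket */
	};

	static int compute_score(const struct listener *l,
				 unsigned int daddr, int dif)
	{
		int score = l->is_inet ? 1 : 0;

		if (l->bound_addr) {
			if (l->bound_addr != daddr)
				return -1;	/* wrong address: disqualified */
			score += 2;		/* exact address beats wildcard */
		}
		if (l->bound_ifidx) {
			if (l->bound_ifidx != dif)
				return -1;	/* wrong device: disqualified */
			score += 2;		/* bound device beats unbound */
		}
		return score;
	}

	int main(void)
	{
		struct listener ls[] = {
			{ 0,          0, 1 },	/* wildcard listener  -> score 1 */
			{ 0x7f000001, 0, 1 },	/* bound to 127.0.0.1 -> score 3 */
		};
		int i, best = -1, hiscore = -1;

		for (i = 0; i < 2; i++) {
			int score = compute_score(&ls[i], 0x7f000001, 2);
			if (score > hiscore) {
				hiscore = score;
				best = i;
			}
		}
		printf("winner: listener %d (score %d)\n", best, hiscore);
		return 0;
	}

The removed slow path returned immediately on score 5 (the maximum, an exact address and device match); the RCU version simply keeps the best score seen, then takes a refcount with atomic_inc_not_zero() and re-scores the winner to guard against a concurrent rehash.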
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cfd034a2b96e..6a667dae315e 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -120,7 +120,7 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
 	iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
 
 	tcph->check = 0;
-	tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0);
+	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
 	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
 	tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
 					lro_desc->ip_tot_len -
@@ -135,7 +135,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
 	__wsum tcp_ps_hdr_csum;
 
 	tcp_csum = ~csum_unfold(tcph->check);
-	tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum);
+	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
 
 	tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
 					     len + TCP_HDR_LEN(tcph),
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1c5fd38f8824..8554d0ea1719 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	struct inet_bind_hashbucket *bhead;
 	struct inet_bind_bucket *tb;
 	/* Unlink from established hashes. */
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 
-	write_lock(lock);
-	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(lock);
+	spin_lock(lock);
+	if (hlist_nulls_unhashed(&tw->tw_node)) {
+		spin_unlock(lock);
 		return;
 	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
-	write_unlock(lock);
+	hlist_nulls_del_rcu(&tw->tw_node);
+	sk_nulls_node_init(&tw->tw_node);
+	spin_unlock(lock);
 
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
@@ -76,7 +76,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	const struct inet_sock *inet = inet_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	struct inet_bind_hashbucket *bhead;
 	/* Step 1: Put TW into bind hash. Original socket stays there too.
 	   Note, that any socket with inet->num != 0 MUST be bound in
@@ -90,17 +90,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
 	spin_unlock(&bhead->lock);
 
-	write_lock(lock);
+	spin_lock(lock);
 
-	/* Step 2: Remove SK from established hash. */
-	if (__sk_del_node_init(sk))
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-	/* Step 3: Hash TW into TIMEWAIT chain. */
-	inet_twsk_add_node(tw, &ehead->twchain);
+	/*
+	 * Step 2: Hash TW into TIMEWAIT chain.
+	 * Should be done before removing sk from established chain
+	 * because readers are lockless and search established first.
+	 */
 	atomic_inc(&tw->tw_refcnt);
+	inet_twsk_add_node_rcu(tw, &ehead->twchain);
 
-	write_unlock(lock);
+	/* Step 3: Remove SK from established hash. */
+	if (__sk_nulls_del_node_init_rcu(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+	spin_unlock(lock);
 }
 
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -416,17 +420,17 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
 {
 	struct inet_timewait_sock *tw;
 	struct sock *sk;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	int h;
 
 	local_bh_disable();
 	for (h = 0; h < (hashinfo->ehash_size); h++) {
 		struct inet_ehash_bucket *head =
 			inet_ehash_bucket(hashinfo, h);
-		rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
restart:
-		write_lock(lock);
-		sk_for_each(sk, node, &head->twchain) {
+		spin_lock(lock);
+		sk_nulls_for_each(sk, node, &head->twchain) {
 
 			tw = inet_twsk(sk);
 			if (!net_eq(twsk_net(tw), net) ||
@@ -434,13 +438,13 @@ restart:
 				continue;
 
 			atomic_inc(&tw->tw_refcnt);
-			write_unlock(lock);
+			spin_unlock(lock);
 			inet_twsk_deschedule(tw, twdr);
 			inet_twsk_put(tw);
 
 			goto restart;
 		}
-		write_unlock(lock);
+		spin_unlock(lock);
 	}
 	local_bh_enable();
 }
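The reordered __inet_twsk_hashdance() above inserts the timewait socket before unlinking the established socket, exactly as its new comment says: lookups are now lockless and search the established chain first, so a reader may briefly see both entries, but never neither. A tiny sequential model of that visibility argument follows; the flags stand in for the two hash chains, and this is an illustration of the ordering, not kernel code:

	/* Model of the hashdance ordering: with lockless readers, publish
	 * the replacement (tw) before retiring the original (sk). */
	#include <stdio.h>

	struct state {
		int in_established;	/* sk visible on the established chain */
		int in_twchain;		/* tw visible on the timewait chain */
	};

	static int reader_finds_peer(const struct state *s)
	{
		/* readers search established first, then twchain */
		return s->in_established || s->in_twchain;
	}

	int main(void)
	{
		struct state s = { 1, 0 };

		/* new ordering: step 2 adds tw, step 3 removes sk */
		s.in_twchain = 1;
		printf("after add tw:    reader ok = %d\n", reader_finds_peer(&s));
		s.in_established = 0;
		printf("after remove sk: reader ok = %d\n", reader_finds_peer(&s));

		/* the old remove-then-add order passes through {0, 0}, where
		 * a lockless reader would miss the connection entirely */
		return 0;
	}

Under the old write lock the remove-then-add order was harmless, because no reader could observe the intermediate state; dropping the lock is what makes the ordering matter.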
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 191ef7588134..0101521f366b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -126,8 +126,6 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
 
 /* Fallback tunnel: no source, no destination, no key, no options */
 
-static int ipgre_fb_tunnel_init(struct net_device *dev);
-
 #define HASH_SIZE  16
 
 static int ipgre_net_id;
@@ -1142,6 +1140,7 @@ static int ipgre_open(struct net_device *dev)
 static int ipgre_close(struct net_device *dev)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
+
 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 		struct in_device *in_dev;
 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
@@ -1155,14 +1154,22 @@ static int ipgre_close(struct net_device *dev)
 
 #endif
 
+static const struct net_device_ops ipgre_netdev_ops = {
+	.ndo_init		= ipgre_tunnel_init,
+	.ndo_uninit		= ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+	.ndo_open		= ipgre_open,
+	.ndo_stop		= ipgre_close,
+#endif
+	.ndo_start_xmit		= ipgre_tunnel_xmit,
+	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
+	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+};
+
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
-	dev->init		= ipgre_tunnel_init;
-	dev->uninit		= ipgre_tunnel_uninit;
+	dev->netdev_ops		= &ipgre_netdev_ops;
 	dev->destructor		= free_netdev;
-	dev->hard_start_xmit	= ipgre_tunnel_xmit;
-	dev->do_ioctl		= ipgre_tunnel_ioctl;
-	dev->change_mtu		= ipgre_tunnel_change_mtu;
 
 	dev->type		= ARPHRD_IPGRE;
 	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1194,8 +1201,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
 			return -EINVAL;
 		dev->flags = IFF_BROADCAST;
 		dev->header_ops = &ipgre_header_ops;
-		dev->open = ipgre_open;
-		dev->stop = ipgre_close;
 	}
 #endif
 	} else
@@ -1204,7 +1209,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	return 0;
 }
 
-static int ipgre_fb_tunnel_init(struct net_device *dev)
+static void ipgre_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
@@ -1220,7 +1225,6 @@ static int ipgre_fb_tunnel_init(struct net_device *dev)
 
 	dev_hold(dev);
 	ign->tunnels_wc[0]	= tunnel;
-	return 0;
 }
 
 
@@ -1264,9 +1268,9 @@ static int ipgre_init_net(struct net *net)
 		err = -ENOMEM;
 		goto err_alloc_dev;
 	}
-
-	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
 	dev_net_set(ign->fb_tunnel_dev, net);
+
+	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
 
 	if ((err = register_netdev(ign->fb_tunnel_dev)))
@@ -1397,16 +1401,22 @@ static int ipgre_tap_init(struct net_device *dev)
 	return 0;
 }
 
+static const struct net_device_ops ipgre_tap_netdev_ops = {
+	.ndo_init		= ipgre_tap_init,
+	.ndo_uninit		= ipgre_tunnel_uninit,
+	.ndo_start_xmit		= ipgre_tunnel_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+};
+
 static void ipgre_tap_setup(struct net_device *dev)
 {
 
 	ether_setup(dev);
 
-	dev->init		= ipgre_tap_init;
-	dev->uninit		= ipgre_tunnel_uninit;
+	dev->netdev_ops		= &ipgre_netdev_ops;
 	dev->destructor		= free_netdev;
-	dev->hard_start_xmit	= ipgre_tunnel_xmit;
-	dev->change_mtu		= ipgre_tunnel_change_mtu;
 
 	dev->iflink		= 0;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
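The ip_gre.c changes are part of the net_device_ops conversion: the per-device callback pointers (dev->init, dev->hard_start_xmit, and so on) move into one shared const ops table referenced through dev->netdev_ops. A self-contained sketch of the pattern, with simplified stand-in types rather than the kernel's struct net_device_ops:

	/* Sketch of the ops-table pattern: one const function-pointer table
	 * shared by every device instead of writable pointers per device. */
	#include <stdio.h>

	struct device;

	struct device_ops {
		int (*open)(struct device *dev);
		int (*stop)(struct device *dev);
	};

	struct device {
		const char *name;
		const struct device_ops *ops;	/* one pointer replaces many */
	};

	static int tunnel_open(struct device *dev)
	{
		printf("%s: open\n", dev->name);
		return 0;
	}

	static int tunnel_stop(struct device *dev)
	{
		printf("%s: stop\n", dev->name);
		return 0;
	}

	/* Shared by every tunnel device; can live in read-only memory. */
	static const struct device_ops tunnel_ops = {
		.open = tunnel_open,
		.stop = tunnel_stop,
	};

	int main(void)
	{
		struct device gre0 = { "gre0", &tunnel_ops };

		gre0.ops->open(&gre0);
		gre0.ops->stop(&gre0);
		return 0;
	}

One detail to notice in the hunk above, reproduced verbatim from the patch: ipgre_tap_setup() defines ipgre_tap_netdev_ops but assigns dev->netdev_ops = &ipgre_netdev_ops.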
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 70bedab03b09..1a58a6fa1dc0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -209,9 +209,17 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 
 		hash = protocol & (MAX_INET_PROTOS - 1);
 		ipprot = rcu_dereference(inet_protos[hash]);
-		if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
+		if (ipprot != NULL) {
 			int ret;
 
+			if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
+				if (net_ratelimit())
+					printk("%s: proto %d isn't netns-ready\n",
+						__func__, protocol);
+				kfree_skb(skb);
+				goto out;
+			}
+
 			if (!ipprot->no_policy) {
 				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 					kfree_skb(skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 46d7be233eac..8ebe86dd72af 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -778,7 +778,7 @@ int ip_append_data(struct sock *sk,
 		   int getfrag(void *from, char *to, int offset, int len,
 			       int odd, struct sk_buff *skb),
 		   void *from, int length, int transhdrlen,
-		   struct ipcm_cookie *ipc, struct rtable *rt,
+		   struct ipcm_cookie *ipc, struct rtable **rtp,
 		   unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -793,6 +793,7 @@ int ip_append_data(struct sock *sk,
 	int offset = 0;
 	unsigned int maxfraglen, fragheaderlen;
 	int csummode = CHECKSUM_NONE;
+	struct rtable *rt;
 
 	if (flags&MSG_PROBE)
 		return 0;
@@ -812,7 +813,11 @@ int ip_append_data(struct sock *sk,
 			inet->cork.flags |= IPCORK_OPT;
 			inet->cork.addr = ipc->addr;
 		}
-		dst_hold(&rt->u.dst);
+		rt = *rtp;
+		/*
+		 * We steal reference to this route, caller should not release it
+		 */
+		*rtp = NULL;
 		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 					    rt->u.dst.dev->mtu :
 					    dst_mtu(rt->u.dst.path);
@@ -1279,7 +1284,12 @@ int ip_push_pending_frames(struct sock *sk)
 
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
-	skb->dst = dst_clone(&rt->u.dst);
+	/*
+	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+	 * on dst refcount
+	 */
+	inet->cork.dst = NULL;
+	skb->dst = &rt->u.dst;
 
 	if (iph->protocol == IPPROTO_ICMP)
 		icmp_out_count(net, ((struct icmphdr *)
@@ -1391,7 +1401,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
 		if (arg->csumoffset >= 0)
 			*((__sum16 *)skb_transport_header(skb) +
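ip_append_data() now takes struct rtable **rtp and steals the caller's reference, setting *rtp to NULL, while ip_push_pending_frames() hands cork.dst straight to the skb; together they remove a dst refcount inc/dec pair per packet. A userspace sketch of the ownership-transfer convention follows, using a hypothetical refcounted route type in place of the kernel's dst machinery:

	/* Sketch of the "steal the reference" convention: the callee takes
	 * ownership of the caller's reference and NULLs the caller's
	 * pointer, so no extra hold/put pair is needed. */
	#include <stdio.h>
	#include <stdlib.h>

	struct route {
		int refcnt;
	};

	static void route_put(struct route *rt)
	{
		if (rt && --rt->refcnt == 0) {
			printf("route freed\n");
			free(rt);
		}
	}

	/* Consumes the caller's reference: note the double pointer. */
	static void append_data(struct route **rtp)
	{
		struct route *rt = *rtp;

		*rtp = NULL;		/* caller must not release it anymore */
		printf("using route, refcnt=%d\n", rt->refcnt);
		route_put(rt);		/* ownership ends here; the kernel
					 * instead parks it in cork.dst */
	}

	int main(void)
	{
		struct route *rt = malloc(sizeof(*rt));

		rt->refcnt = 1;		/* the caller's only reference */
		append_data(&rt);
		route_put(rt);		/* safe: rt is NULL, nothing to drop */
		return 0;
	}

The double pointer is the contract: after the call the caller's variable is NULL, so call sites like ip_send_reply() and icmp_push_reply() were updated to pass &rt.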
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e976efeb1456..43c05854d752 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #define IP_CMSG_RECVOPTS 8 | 48 | #define IP_CMSG_RECVOPTS 8 |
49 | #define IP_CMSG_RETOPTS 16 | 49 | #define IP_CMSG_RETOPTS 16 |
50 | #define IP_CMSG_PASSSEC 32 | 50 | #define IP_CMSG_PASSSEC 32 |
51 | #define IP_CMSG_ORIGDSTADDR 64 | ||
51 | 52 | ||
52 | /* | 53 | /* |
53 | * SOL_IP control messages. | 54 | * SOL_IP control messages. |
@@ -126,6 +127,27 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) | |||
126 | security_release_secctx(secdata, seclen); | 127 | security_release_secctx(secdata, seclen); |
127 | } | 128 | } |
128 | 129 | ||
130 | static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) | ||
131 | { | ||
132 | struct sockaddr_in sin; | ||
133 | struct iphdr *iph = ip_hdr(skb); | ||
134 | __be16 *ports = (__be16 *)skb_transport_header(skb); | ||
135 | |||
136 | if (skb_transport_offset(skb) + 4 > skb->len) | ||
137 | return; | ||
138 | |||
139 | /* All current transport protocols have the port numbers in the | ||
140 | * first four bytes of the transport header and this function is | ||
141 | * written with this assumption in mind. | ||
142 | */ | ||
143 | |||
144 | sin.sin_family = AF_INET; | ||
145 | sin.sin_addr.s_addr = iph->daddr; | ||
146 | sin.sin_port = ports[1]; | ||
147 | memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); | ||
148 | |||
149 | put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); | ||
150 | } | ||
129 | 151 | ||
130 | void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) | 152 | void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) |
131 | { | 153 | { |
@@ -160,6 +182,12 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) | |||
160 | 182 | ||
161 | if (flags & 1) | 183 | if (flags & 1) |
162 | ip_cmsg_recv_security(msg, skb); | 184 | ip_cmsg_recv_security(msg, skb); |
185 | |||
186 | if ((flags>>=1) == 0) | ||
187 | return; | ||
188 | if (flags & 1) | ||
189 | ip_cmsg_recv_dstaddr(msg, skb); | ||
190 | |||
163 | } | 191 | } |
164 | 192 | ||
165 | int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) | 193 | int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) |
@@ -421,7 +449,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
421 | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | | 449 | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | |
422 | (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || | 450 | (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || |
423 | optname == IP_MULTICAST_TTL || | 451 | optname == IP_MULTICAST_TTL || |
424 | optname == IP_MULTICAST_LOOP) { | 452 | optname == IP_MULTICAST_LOOP || |
453 | optname == IP_RECVORIGDSTADDR) { | ||
425 | if (optlen >= sizeof(int)) { | 454 | if (optlen >= sizeof(int)) { |
426 | if (get_user(val, (int __user *) optval)) | 455 | if (get_user(val, (int __user *) optval)) |
427 | return -EFAULT; | 456 | return -EFAULT; |
@@ -509,6 +538,12 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
509 | else | 538 | else |
510 | inet->cmsg_flags &= ~IP_CMSG_PASSSEC; | 539 | inet->cmsg_flags &= ~IP_CMSG_PASSSEC; |
511 | break; | 540 | break; |
541 | case IP_RECVORIGDSTADDR: | ||
542 | if (val) | ||
543 | inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR; | ||
544 | else | ||
545 | inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; | ||
546 | break; | ||
512 | case IP_TOS: /* This sets both TOS and Precedence */ | 547 | case IP_TOS: /* This sets both TOS and Precedence */ |
513 | if (sk->sk_type == SOCK_STREAM) { | 548 | if (sk->sk_type == SOCK_STREAM) { |
514 | val &= ~3; | 549 | val &= ~3; |
@@ -1022,6 +1057,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, | |||
1022 | case IP_PASSSEC: | 1057 | case IP_PASSSEC: |
1023 | val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; | 1058 | val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; |
1024 | break; | 1059 | break; |
1060 | case IP_RECVORIGDSTADDR: | ||
1061 | val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; | ||
1062 | break; | ||
1025 | case IP_TOS: | 1063 | case IP_TOS: |
1026 | val = inet->tos; | 1064 | val = inet->tos; |
1027 | break; | 1065 | break; |
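With both the setsockopt case and the cmsg emitter in place, a transparent-proxy style receiver can recover the pre-redirect destination from userspace. A hedged sketch (the constants live in <linux/in.h>; the fallback #defines below are only for libc headers that predate them, and the port and addresses are illustrative):

	#include <netinet/in.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <stdio.h>
	#include <string.h>

	#ifndef IP_RECVORIGDSTADDR
	#define IP_RECVORIGDSTADDR	20	/* assumed to match linux/in.h */
	#endif
	#ifndef IP_ORIGDSTADDR
	#define IP_ORIGDSTADDR		20
	#endif

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_DGRAM, 0);
		int on = 1;
		struct sockaddr_in sin = { .sin_family = AF_INET,
					   .sin_port = htons(5000) };
		char data[1500], ctl[CMSG_SPACE(sizeof(struct sockaddr_in))];
		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
				      .msg_control = ctl,
				      .msg_controllen = sizeof(ctl) };
		struct cmsghdr *c;

		setsockopt(fd, SOL_IP, IP_RECVORIGDSTADDR, &on, sizeof(on));
		bind(fd, (struct sockaddr *)&sin, sizeof(sin));
		if (recvmsg(fd, &msg, 0) < 0)
			return 1;
		for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
			if (c->cmsg_level == SOL_IP &&
			    c->cmsg_type == IP_ORIGDSTADDR) {
				struct sockaddr_in odst;

				memcpy(&odst, CMSG_DATA(c), sizeof(odst));
				printf("original dst %s:%u\n",
				       inet_ntoa(odst.sin_addr),
				       ntohs(odst.sin_port));
			}
		}
		return 0;
	}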
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index ec8264ae45c2..3262ce06294c 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -35,7 +35,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
35 | return; | 35 | return; |
36 | 36 | ||
37 | spi = htonl(ntohs(ipch->cpi)); | 37 | spi = htonl(ntohs(ipch->cpi)); |
38 | x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, | 38 | x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, |
39 | spi, IPPROTO_COMP, AF_INET); | 39 | spi, IPPROTO_COMP, AF_INET); |
40 | if (!x) | 40 | if (!x) |
41 | return; | 41 | return; |
@@ -49,7 +49,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) | |||
49 | { | 49 | { |
50 | struct xfrm_state *t; | 50 | struct xfrm_state *t; |
51 | 51 | ||
52 | t = xfrm_state_alloc(); | 52 | t = xfrm_state_alloc(&init_net); |
53 | if (t == NULL) | 53 | if (t == NULL) |
54 | goto out; | 54 | goto out; |
55 | 55 | ||
@@ -85,7 +85,7 @@ static int ipcomp_tunnel_attach(struct xfrm_state *x) | |||
85 | int err = 0; | 85 | int err = 0; |
86 | struct xfrm_state *t; | 86 | struct xfrm_state *t; |
87 | 87 | ||
88 | t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4, | 88 | t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4, |
89 | x->props.saddr.a4, IPPROTO_IPIP, AF_INET); | 89 | x->props.saddr.a4, IPPROTO_IPIP, AF_INET); |
90 | if (!t) { | 90 | if (!t) { |
91 | t = ipcomp_tunnel_create(x); | 91 | t = ipcomp_tunnel_create(x); |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index b3c3d7b0d116..5079dfbc6f38 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -130,8 +130,8 @@ struct ipip_net { | |||
130 | struct net_device *fb_tunnel_dev; | 130 | struct net_device *fb_tunnel_dev; |
131 | }; | 131 | }; |
132 | 132 | ||
133 | static int ipip_fb_tunnel_init(struct net_device *dev); | 133 | static void ipip_fb_tunnel_init(struct net_device *dev); |
134 | static int ipip_tunnel_init(struct net_device *dev); | 134 | static void ipip_tunnel_init(struct net_device *dev); |
135 | static void ipip_tunnel_setup(struct net_device *dev); | 135 | static void ipip_tunnel_setup(struct net_device *dev); |
136 | 136 | ||
137 | static DEFINE_RWLOCK(ipip_lock); | 137 | static DEFINE_RWLOCK(ipip_lock); |
@@ -245,9 +245,10 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, | |||
245 | } | 245 | } |
246 | 246 | ||
247 | nt = netdev_priv(dev); | 247 | nt = netdev_priv(dev); |
248 | dev->init = ipip_tunnel_init; | ||
249 | nt->parms = *parms; | 248 | nt->parms = *parms; |
250 | 249 | ||
250 | ipip_tunnel_init(dev); | ||
251 | |||
251 | if (register_netdevice(dev) < 0) | 252 | if (register_netdevice(dev) < 0) |
252 | goto failed_free; | 253 | goto failed_free; |
253 | 254 | ||
@@ -691,12 +692,17 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) | |||
691 | return 0; | 692 | return 0; |
692 | } | 693 | } |
693 | 694 | ||
695 | static const struct net_device_ops ipip_netdev_ops = { | ||
696 | .ndo_uninit = ipip_tunnel_uninit, | ||
697 | .ndo_start_xmit = ipip_tunnel_xmit, | ||
698 | .ndo_do_ioctl = ipip_tunnel_ioctl, | ||
699 | .ndo_change_mtu = ipip_tunnel_change_mtu, | ||
700 | |||
701 | }; | ||
702 | |||
694 | static void ipip_tunnel_setup(struct net_device *dev) | 703 | static void ipip_tunnel_setup(struct net_device *dev) |
695 | { | 704 | { |
696 | dev->uninit = ipip_tunnel_uninit; | 705 | dev->netdev_ops = &ipip_netdev_ops; |
697 | dev->hard_start_xmit = ipip_tunnel_xmit; | ||
698 | dev->do_ioctl = ipip_tunnel_ioctl; | ||
699 | dev->change_mtu = ipip_tunnel_change_mtu; | ||
700 | dev->destructor = free_netdev; | 706 | dev->destructor = free_netdev; |
701 | 707 | ||
702 | dev->type = ARPHRD_TUNNEL; | 708 | dev->type = ARPHRD_TUNNEL; |
@@ -708,11 +714,9 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
708 | dev->features |= NETIF_F_NETNS_LOCAL; | 714 | dev->features |= NETIF_F_NETNS_LOCAL; |
709 | } | 715 | } |
710 | 716 | ||
711 | static int ipip_tunnel_init(struct net_device *dev) | 717 | static void ipip_tunnel_init(struct net_device *dev) |
712 | { | 718 | { |
713 | struct ip_tunnel *tunnel; | 719 | struct ip_tunnel *tunnel = netdev_priv(dev); |
714 | |||
715 | tunnel = netdev_priv(dev); | ||
716 | 720 | ||
717 | tunnel->dev = dev; | 721 | tunnel->dev = dev; |
718 | strcpy(tunnel->parms.name, dev->name); | 722 | strcpy(tunnel->parms.name, dev->name); |
@@ -721,11 +725,9 @@ static int ipip_tunnel_init(struct net_device *dev) | |||
721 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | 725 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); |
722 | 726 | ||
723 | ipip_tunnel_bind_dev(dev); | 727 | ipip_tunnel_bind_dev(dev); |
724 | |||
725 | return 0; | ||
726 | } | 728 | } |
727 | 729 | ||
728 | static int ipip_fb_tunnel_init(struct net_device *dev) | 730 | static void ipip_fb_tunnel_init(struct net_device *dev) |
729 | { | 731 | { |
730 | struct ip_tunnel *tunnel = netdev_priv(dev); | 732 | struct ip_tunnel *tunnel = netdev_priv(dev); |
731 | struct iphdr *iph = &tunnel->parms.iph; | 733 | struct iphdr *iph = &tunnel->parms.iph; |
@@ -740,7 +742,6 @@ static int ipip_fb_tunnel_init(struct net_device *dev) | |||
740 | 742 | ||
741 | dev_hold(dev); | 743 | dev_hold(dev); |
742 | ipn->tunnels_wc[0] = tunnel; | 744 | ipn->tunnels_wc[0] = tunnel; |
743 | return 0; | ||
744 | } | 745 | } |
745 | 746 | ||
746 | static struct xfrm_tunnel ipip_handler = { | 747 | static struct xfrm_tunnel ipip_handler = { |
@@ -792,10 +793,10 @@ static int ipip_init_net(struct net *net) | |||
792 | err = -ENOMEM; | 793 | err = -ENOMEM; |
793 | goto err_alloc_dev; | 794 | goto err_alloc_dev; |
794 | } | 795 | } |
795 | |||
796 | ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init; | ||
797 | dev_net_set(ipn->fb_tunnel_dev, net); | 796 | dev_net_set(ipn->fb_tunnel_dev, net); |
798 | 797 | ||
798 | ipip_fb_tunnel_init(ipn->fb_tunnel_dev); | ||
799 | |||
799 | if ((err = register_netdev(ipn->fb_tunnel_dev))) | 800 | if ((err = register_netdev(ipn->fb_tunnel_dev))) |
800 | goto err_reg_dev; | 801 | goto err_reg_dev; |
801 | 802 | ||
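The conversion visible here (and repeated across the tree in this series) replaces the per-netdevice function pointers with a single shared const operations table; note also that dev->init disappears entirely, with callers such as ipip_tunnel_locate() and ipip_init_net() now invoking the init helper directly before register_netdevice(). The shape of the conversion for a hypothetical "foo" driver (names illustrative, 2.6.29-era API assumed):

	#include <linux/netdevice.h>

	static int foo_xmit(struct sk_buff *skb, struct net_device *dev);
	static int foo_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
	static int foo_change_mtu(struct net_device *dev, int new_mtu);
	static void foo_uninit(struct net_device *dev);

	static const struct net_device_ops foo_netdev_ops = {
		.ndo_start_xmit  = foo_xmit,		/* was dev->hard_start_xmit */
		.ndo_do_ioctl    = foo_ioctl,		/* was dev->do_ioctl */
		.ndo_change_mtu  = foo_change_mtu,	/* was dev->change_mtu */
		.ndo_uninit      = foo_uninit,		/* was dev->uninit */
	};

	static void foo_setup(struct net_device *dev)
	{
		/* one const table shared by all instances, instead of four
		 * writable pointers per device */
		dev->netdev_ops = &foo_netdev_ops;
		dev->destructor = free_netdev;
	}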
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 05ed336f798a..77fc4d3fdf61 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -124,8 +124,8 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) | |||
124 | 124 | ||
125 | dev = __dev_get_by_name(&init_net, "tunl0"); | 125 | dev = __dev_get_by_name(&init_net, "tunl0"); |
126 | if (dev) { | 126 | if (dev) { |
127 | const struct net_device_ops *ops = dev->netdev_ops; | ||
127 | struct ifreq ifr; | 128 | struct ifreq ifr; |
128 | mm_segment_t oldfs; | ||
129 | struct ip_tunnel_parm p; | 129 | struct ip_tunnel_parm p; |
130 | 130 | ||
131 | memset(&p, 0, sizeof(p)); | 131 | memset(&p, 0, sizeof(p)); |
@@ -137,9 +137,13 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) | |||
137 | sprintf(p.name, "dvmrp%d", v->vifc_vifi); | 137 | sprintf(p.name, "dvmrp%d", v->vifc_vifi); |
138 | ifr.ifr_ifru.ifru_data = (__force void __user *)&p; | 138 | ifr.ifr_ifru.ifru_data = (__force void __user *)&p; |
139 | 139 | ||
140 | oldfs = get_fs(); set_fs(KERNEL_DS); | 140 | if (ops->ndo_do_ioctl) { |
141 | dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL); | 141 | mm_segment_t oldfs = get_fs(); |
142 | set_fs(oldfs); | 142 | |
143 | set_fs(KERNEL_DS); | ||
144 | ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL); | ||
145 | set_fs(oldfs); | ||
146 | } | ||
143 | } | 147 | } |
144 | } | 148 | } |
145 | 149 | ||
@@ -151,9 +155,9 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) | |||
151 | dev = __dev_get_by_name(&init_net, "tunl0"); | 155 | dev = __dev_get_by_name(&init_net, "tunl0"); |
152 | 156 | ||
153 | if (dev) { | 157 | if (dev) { |
158 | const struct net_device_ops *ops = dev->netdev_ops; | ||
154 | int err; | 159 | int err; |
155 | struct ifreq ifr; | 160 | struct ifreq ifr; |
156 | mm_segment_t oldfs; | ||
157 | struct ip_tunnel_parm p; | 161 | struct ip_tunnel_parm p; |
158 | struct in_device *in_dev; | 162 | struct in_device *in_dev; |
159 | 163 | ||
@@ -166,9 +170,14 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) | |||
166 | sprintf(p.name, "dvmrp%d", v->vifc_vifi); | 170 | sprintf(p.name, "dvmrp%d", v->vifc_vifi); |
167 | ifr.ifr_ifru.ifru_data = (__force void __user *)&p; | 171 | ifr.ifr_ifru.ifru_data = (__force void __user *)&p; |
168 | 172 | ||
169 | oldfs = get_fs(); set_fs(KERNEL_DS); | 173 | if (ops->ndo_do_ioctl) { |
170 | err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); | 174 | mm_segment_t oldfs = get_fs(); |
171 | set_fs(oldfs); | 175 | |
176 | set_fs(KERNEL_DS); | ||
177 | err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); | ||
178 | set_fs(oldfs); | ||
179 | } else | ||
180 | err = -EOPNOTSUPP; | ||
172 | 181 | ||
173 | dev = NULL; | 182 | dev = NULL; |
174 | 183 | ||
@@ -213,12 +222,16 @@ static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) | |||
213 | return 0; | 222 | return 0; |
214 | } | 223 | } |
215 | 224 | ||
225 | static const struct net_device_ops reg_vif_netdev_ops = { | ||
226 | .ndo_start_xmit = reg_vif_xmit, | ||
227 | }; | ||
228 | |||
216 | static void reg_vif_setup(struct net_device *dev) | 229 | static void reg_vif_setup(struct net_device *dev) |
217 | { | 230 | { |
218 | dev->type = ARPHRD_PIMREG; | 231 | dev->type = ARPHRD_PIMREG; |
219 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; | 232 | dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; |
220 | dev->flags = IFF_NOARP; | 233 | dev->flags = IFF_NOARP; |
221 | dev->hard_start_xmit = reg_vif_xmit; | 234 | dev->netdev_ops = &reg_vif_netdev_ops, |
222 | dev->destructor = free_netdev; | 235 | dev->destructor = free_netdev; |
223 | } | 236 | } |
224 | 237 | ||
@@ -1945,13 +1958,14 @@ int __init ip_mr_init(void) | |||
1945 | goto proc_cache_fail; | 1958 | goto proc_cache_fail; |
1946 | #endif | 1959 | #endif |
1947 | return 0; | 1960 | return 0; |
1948 | reg_notif_fail: | ||
1949 | kmem_cache_destroy(mrt_cachep); | ||
1950 | #ifdef CONFIG_PROC_FS | 1961 | #ifdef CONFIG_PROC_FS |
1951 | proc_vif_fail: | ||
1952 | unregister_netdevice_notifier(&ip_mr_notifier); | ||
1953 | proc_cache_fail: | 1962 | proc_cache_fail: |
1954 | proc_net_remove(&init_net, "ip_mr_vif"); | 1963 | proc_net_remove(&init_net, "ip_mr_vif"); |
1964 | proc_vif_fail: | ||
1965 | unregister_netdevice_notifier(&ip_mr_notifier); | ||
1955 | #endif | 1966 | #endif |
1967 | reg_notif_fail: | ||
1968 | del_timer(&ipmr_expire_timer); | ||
1969 | kmem_cache_destroy(mrt_cachep); | ||
1956 | return err; | 1970 | return err; |
1957 | } | 1971 | } |
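The relocated labels restore the usual kernel unwind discipline: tear down in exactly the reverse order of setup, each label undoing only what had succeeded before the jump. The old ordering fell through from reg_notif_fail into teardown of resources that were never created, and never stopped the expire timer at all. Schematically (a simplified sketch, not the full ip_mr_init(); helpers and file_operations are assumed defined elsewhere):

	static struct kmem_cache *cache;
	static struct timer_list expire_timer;
	static struct notifier_block foo_notifier;
	static void foo_expire(unsigned long data);
	static const struct file_operations vif_fops, cache_fops;

	int __init foo_init(void)
	{
		int err;

		cache = kmem_cache_create("foo", 128, 0, SLAB_HWCACHE_ALIGN, NULL);
		if (!cache)
			return -ENOMEM;
		setup_timer(&expire_timer, foo_expire, 0);
		err = register_netdevice_notifier(&foo_notifier);
		if (err)
			goto reg_notif_fail;
	#ifdef CONFIG_PROC_FS
		err = -ENOMEM;
		if (!proc_net_fops_create(&init_net, "foo_vif", 0, &vif_fops))
			goto proc_vif_fail;
		if (!proc_net_fops_create(&init_net, "foo_cache", 0, &cache_fops))
			goto proc_cache_fail;
	#endif
		return 0;

	#ifdef CONFIG_PROC_FS
	proc_cache_fail:	/* the vif file exists: remove it */
		proc_net_remove(&init_net, "foo_vif");
	proc_vif_fail:		/* the notifier is registered: drop it */
		unregister_netdevice_notifier(&foo_notifier);
	#endif
	reg_notif_fail:		/* timer and cache always exist by now */
		del_timer(&expire_timer);
		kmem_cache_destroy(cache);
		return err;
	}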
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 7c145d76384d..fdf6811c31a2 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c | |||
@@ -66,7 +66,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | |||
66 | #ifdef CONFIG_XFRM | 66 | #ifdef CONFIG_XFRM |
67 | if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | 67 | if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && |
68 | xfrm_decode_session(skb, &fl, AF_INET) == 0) | 68 | xfrm_decode_session(skb, &fl, AF_INET) == 0) |
69 | if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) | 69 | if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0)) |
70 | return -1; | 70 | return -1; |
71 | #endif | 71 | #endif |
72 | 72 | ||
@@ -97,7 +97,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb) | |||
97 | dst = ((struct xfrm_dst *)dst)->route; | 97 | dst = ((struct xfrm_dst *)dst)->route; |
98 | dst_hold(dst); | 98 | dst_hold(dst); |
99 | 99 | ||
100 | if (xfrm_lookup(&dst, &fl, skb->sk, 0) < 0) | 100 | if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) |
101 | return -1; | 101 | return -1; |
102 | 102 | ||
103 | dst_release(skb->dst); | 103 | dst_release(skb->dst); |
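Note the namespace plumbing: xfrm_lookup() (and __xfrm_lookup() in route.c below) now takes the struct net explicitly, and each caller derives it from whatever device context it already holds. The recurring call-site pattern, reduced (signature as introduced by this series):

	/* pick the namespace from the context at hand ... */
	struct net *net = dev_net(skb->dev);	/* or dev_net(dst->dev) */

	/* ... and pass it through explicitly */
	if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0) < 0)
		return -1;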
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8f5a403f6f6b..614958b7c276 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -54,8 +54,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) | |||
54 | socket_seq_show(seq); | 54 | socket_seq_show(seq); |
55 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", | 55 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", |
56 | sock_prot_inuse_get(net, &tcp_prot), | 56 | sock_prot_inuse_get(net, &tcp_prot), |
57 | atomic_read(&tcp_orphan_count), | 57 | (int)percpu_counter_sum_positive(&tcp_orphan_count), |
58 | tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), | 58 | tcp_death_row.tw_count, |
59 | (int)percpu_counter_sum_positive(&tcp_sockets_allocated), | ||
59 | atomic_read(&tcp_memory_allocated)); | 60 | atomic_read(&tcp_memory_allocated)); |
60 | seq_printf(seq, "UDP: inuse %d mem %d\n", | 61 | seq_printf(seq, "UDP: inuse %d mem %d\n", |
61 | sock_prot_inuse_get(net, &udp_prot), | 62 | sock_prot_inuse_get(net, &udp_prot), |
@@ -234,46 +235,51 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
234 | SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), | 235 | SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), |
235 | SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), | 236 | SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), |
236 | SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), | 237 | SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), |
238 | SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), | ||
239 | SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), | ||
240 | SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), | ||
237 | SNMP_MIB_SENTINEL | 241 | SNMP_MIB_SENTINEL |
238 | }; | 242 | }; |
239 | 243 | ||
244 | static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, | ||
245 | unsigned short *type, int count) | ||
246 | { | ||
247 | int j; | ||
248 | |||
249 | if (count) { | ||
250 | seq_printf(seq, "\nIcmpMsg:"); | ||
251 | for (j = 0; j < count; ++j) | ||
252 | seq_printf(seq, " %sType%u", | ||
253 | type[j] & 0x100 ? "Out" : "In", | ||
254 | type[j] & 0xff); | ||
255 | seq_printf(seq, "\nIcmpMsg:"); | ||
256 | for (j = 0; j < count; ++j) | ||
257 | seq_printf(seq, " %lu", vals[j]); | ||
258 | } | ||
259 | } | ||
260 | |||
240 | static void icmpmsg_put(struct seq_file *seq) | 261 | static void icmpmsg_put(struct seq_file *seq) |
241 | { | 262 | { |
242 | #define PERLINE 16 | 263 | #define PERLINE 16 |
243 | 264 | ||
244 | int j, i, count; | 265 | int i, count; |
245 | static int out[PERLINE]; | 266 | unsigned short type[PERLINE]; |
267 | unsigned long vals[PERLINE], val; | ||
246 | struct net *net = seq->private; | 268 | struct net *net = seq->private; |
247 | 269 | ||
248 | count = 0; | 270 | count = 0; |
249 | for (i = 0; i < ICMPMSG_MIB_MAX; i++) { | 271 | for (i = 0; i < ICMPMSG_MIB_MAX; i++) { |
250 | 272 | val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i); | |
251 | if (snmp_fold_field((void **) net->mib.icmpmsg_statistics, i)) | 273 | if (val) { |
252 | out[count++] = i; | 274 | type[count] = i; |
253 | if (count < PERLINE) | 275 | vals[count++] = val; |
254 | continue; | 276 | } |
255 | 277 | if (count == PERLINE) { | |
256 | seq_printf(seq, "\nIcmpMsg:"); | 278 | icmpmsg_put_line(seq, vals, type, count); |
257 | for (j = 0; j < PERLINE; ++j) | 279 | count = 0; |
258 | seq_printf(seq, " %sType%u", i & 0x100 ? "Out" : "In", | 280 | } |
259 | i & 0xff); | ||
260 | seq_printf(seq, "\nIcmpMsg: "); | ||
261 | for (j = 0; j < PERLINE; ++j) | ||
262 | seq_printf(seq, " %lu", | ||
263 | snmp_fold_field((void **) net->mib.icmpmsg_statistics, | ||
264 | out[j])); | ||
265 | seq_putc(seq, '\n'); | ||
266 | } | ||
267 | if (count) { | ||
268 | seq_printf(seq, "\nIcmpMsg:"); | ||
269 | for (j = 0; j < count; ++j) | ||
270 | seq_printf(seq, " %sType%u", out[j] & 0x100 ? "Out" : | ||
271 | "In", out[j] & 0xff); | ||
272 | seq_printf(seq, "\nIcmpMsg:"); | ||
273 | for (j = 0; j < count; ++j) | ||
274 | seq_printf(seq, " %lu", snmp_fold_field((void **) | ||
275 | net->mib.icmpmsg_statistics, out[j])); | ||
276 | } | 281 | } |
282 | icmpmsg_put_line(seq, vals, type, count); | ||
277 | 283 | ||
278 | #undef PERLINE | 284 | #undef PERLINE |
279 | } | 285 | } |
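The rewrite routes every flush, full 16-entry lines and the final partial line alike, through icmpmsg_put_line(). That fixes a real bug in the removed full-line path, which printed the header cells from the loop index i instead of out[j] (so all sixteen headers in a full line showed the same type), and it drops the inconsistent "IcmpMsg: " trailing-space prefix. The resulting /proc/net/snmp block pairs headers and values line for line, for example (counts illustrative):

	IcmpMsg: InType3 InType8 OutType0 OutType3
	IcmpMsg: 144 12 12 144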
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 998fcffc9e15..dff8bc4e0fac 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -572,7 +572,7 @@ back_from_confirm: | |||
572 | ipc.addr = rt->rt_dst; | 572 | ipc.addr = rt->rt_dst; |
573 | lock_sock(sk); | 573 | lock_sock(sk); |
574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, | 574 | err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, |
575 | &ipc, rt, msg->msg_flags); | 575 | &ipc, &rt, msg->msg_flags); |
576 | if (err) | 576 | if (err) |
577 | ip_flush_pending_frames(sk); | 577 | ip_flush_pending_frames(sk); |
578 | else if (!(msg->msg_flags & MSG_MORE)) | 578 | else if (!(msg->msg_flags & MSG_MORE)) |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0dc0c3826763..77bfba975959 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -160,7 +160,6 @@ static struct dst_ops ipv4_dst_ops = { | |||
160 | .link_failure = ipv4_link_failure, | 160 | .link_failure = ipv4_link_failure, |
161 | .update_pmtu = ip_rt_update_pmtu, | 161 | .update_pmtu = ip_rt_update_pmtu, |
162 | .local_out = __ip_local_out, | 162 | .local_out = __ip_local_out, |
163 | .entry_size = sizeof(struct rtable), | ||
164 | .entries = ATOMIC_INIT(0), | 163 | .entries = ATOMIC_INIT(0), |
165 | }; | 164 | }; |
166 | 165 | ||
@@ -2701,7 +2700,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = { | |||
2701 | .destroy = ipv4_dst_destroy, | 2700 | .destroy = ipv4_dst_destroy, |
2702 | .check = ipv4_dst_check, | 2701 | .check = ipv4_dst_check, |
2703 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, | 2702 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, |
2704 | .entry_size = sizeof(struct rtable), | ||
2705 | .entries = ATOMIC_INIT(0), | 2703 | .entries = ATOMIC_INIT(0), |
2706 | }; | 2704 | }; |
2707 | 2705 | ||
@@ -2763,7 +2761,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, | |||
2763 | flp->fl4_src = (*rp)->rt_src; | 2761 | flp->fl4_src = (*rp)->rt_src; |
2764 | if (!flp->fl4_dst) | 2762 | if (!flp->fl4_dst) |
2765 | flp->fl4_dst = (*rp)->rt_dst; | 2763 | flp->fl4_dst = (*rp)->rt_dst; |
2766 | err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, | 2764 | err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, |
2767 | flags ? XFRM_LOOKUP_WAIT : 0); | 2765 | flags ? XFRM_LOOKUP_WAIT : 0); |
2768 | if (err == -EREMOTE) | 2766 | if (err == -EREMOTE) |
2769 | err = ipv4_dst_blackhole(net, rp, flp); | 2767 | err = ipv4_dst_blackhole(net, rp, flp); |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 60c28add96b8..019243408623 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -277,8 +277,7 @@ | |||
277 | 277 | ||
278 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | 278 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; |
279 | 279 | ||
280 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); | 280 | struct percpu_counter tcp_orphan_count; |
281 | |||
282 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 281 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
283 | 282 | ||
284 | int sysctl_tcp_mem[3] __read_mostly; | 283 | int sysctl_tcp_mem[3] __read_mostly; |
@@ -290,9 +289,12 @@ EXPORT_SYMBOL(sysctl_tcp_rmem); | |||
290 | EXPORT_SYMBOL(sysctl_tcp_wmem); | 289 | EXPORT_SYMBOL(sysctl_tcp_wmem); |
291 | 290 | ||
292 | atomic_t tcp_memory_allocated; /* Current allocated memory. */ | 291 | atomic_t tcp_memory_allocated; /* Current allocated memory. */ |
293 | atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ | ||
294 | |||
295 | EXPORT_SYMBOL(tcp_memory_allocated); | 292 | EXPORT_SYMBOL(tcp_memory_allocated); |
293 | |||
294 | /* | ||
295 | * Current number of TCP sockets. | ||
296 | */ | ||
297 | struct percpu_counter tcp_sockets_allocated; | ||
296 | EXPORT_SYMBOL(tcp_sockets_allocated); | 298 | EXPORT_SYMBOL(tcp_sockets_allocated); |
297 | 299 | ||
298 | /* | 300 | /* |
@@ -1374,8 +1376,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1374 | sk->sk_state == TCP_CLOSE || | 1376 | sk->sk_state == TCP_CLOSE || |
1375 | (sk->sk_shutdown & RCV_SHUTDOWN) || | 1377 | (sk->sk_shutdown & RCV_SHUTDOWN) || |
1376 | !timeo || | 1378 | !timeo || |
1377 | signal_pending(current) || | 1379 | signal_pending(current)) |
1378 | (flags & MSG_PEEK)) | ||
1379 | break; | 1380 | break; |
1380 | } else { | 1381 | } else { |
1381 | if (sock_flag(sk, SOCK_DONE)) | 1382 | if (sock_flag(sk, SOCK_DONE)) |
@@ -1835,7 +1836,7 @@ adjudge_to_death: | |||
1835 | state = sk->sk_state; | 1836 | state = sk->sk_state; |
1836 | sock_hold(sk); | 1837 | sock_hold(sk); |
1837 | sock_orphan(sk); | 1838 | sock_orphan(sk); |
1838 | atomic_inc(sk->sk_prot->orphan_count); | 1839 | percpu_counter_inc(sk->sk_prot->orphan_count); |
1839 | 1840 | ||
1840 | /* It is the last release_sock in its life. It will remove backlog. */ | 1841 | /* It is the last release_sock in its life. It will remove backlog. */ |
1841 | release_sock(sk); | 1842 | release_sock(sk); |
@@ -1886,9 +1887,11 @@ adjudge_to_death: | |||
1886 | } | 1887 | } |
1887 | } | 1888 | } |
1888 | if (sk->sk_state != TCP_CLOSE) { | 1889 | if (sk->sk_state != TCP_CLOSE) { |
1890 | int orphan_count = percpu_counter_read_positive( | ||
1891 | sk->sk_prot->orphan_count); | ||
1892 | |||
1889 | sk_mem_reclaim(sk); | 1893 | sk_mem_reclaim(sk); |
1890 | if (tcp_too_many_orphans(sk, | 1894 | if (tcp_too_many_orphans(sk, orphan_count)) { |
1891 | atomic_read(sk->sk_prot->orphan_count))) { | ||
1892 | if (net_ratelimit()) | 1895 | if (net_ratelimit()) |
1893 | printk(KERN_INFO "TCP: too many orphaned " | 1896 | printk(KERN_INFO "TCP: too many orphaned " |
1894 | "sockets\n"); | 1897 | "sockets\n"); |
@@ -2686,6 +2689,8 @@ void __init tcp_init(void) | |||
2686 | 2689 | ||
2687 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); | 2690 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); |
2688 | 2691 | ||
2692 | percpu_counter_init(&tcp_sockets_allocated, 0); | ||
2693 | percpu_counter_init(&tcp_orphan_count, 0); | ||
2689 | tcp_hashinfo.bind_bucket_cachep = | 2694 | tcp_hashinfo.bind_bucket_cachep = |
2690 | kmem_cache_create("tcp_bind_bucket", | 2695 | kmem_cache_create("tcp_bind_bucket", |
2691 | sizeof(struct inet_bind_bucket), 0, | 2696 | sizeof(struct inet_bind_bucket), 0, |
@@ -2708,8 +2713,8 @@ void __init tcp_init(void) | |||
2708 | thash_entries ? 0 : 512 * 1024); | 2713 | thash_entries ? 0 : 512 * 1024); |
2709 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; | 2714 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; |
2710 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { | 2715 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { |
2711 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); | 2716 | INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); |
2712 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); | 2717 | INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); |
2713 | } | 2718 | } |
2714 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) | 2719 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) |
2715 | panic("TCP: failed to alloc ehash_locks"); | 2720 | panic("TCP: failed to alloc ehash_locks"); |
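The two counters above move from a single atomic_t to a percpu_counter because they are hammered from every CPU on socket creation and destruction; writers now touch only a per-CPU slot, and readers choose between a cheap approximation and an exact but slower fold. A minimal kernel-style sketch of the API as used in this series (2.6.28-era signatures assumed; later kernels add a GFP argument to the init):

	#include <linux/percpu_counter.h>

	static struct percpu_counter nr_things;

	static int __init things_init(void)
	{
		return percpu_counter_init(&nr_things, 0);
	}

	static void thing_created(void)
	{
		percpu_counter_inc(&nr_things);	/* per-CPU delta, no global lock */
	}

	static void thing_destroyed(void)
	{
		percpu_counter_dec(&nr_things);
	}

	static s64 things_estimate(void)
	{
		/* fast path: may lag by up to batch * number of CPUs */
		return percpu_counter_read_positive(&nr_things);
	}

	static s64 things_exact(void)
	{
		/* slow path: folds all per-CPU deltas, as sockstat_seq_show()
		 * above now does for orphan and allocated-socket counts */
		return percpu_counter_sum_positive(&nr_things);
	}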
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 838d491dfda7..fcbcd4ff6c5f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
@@ -34,7 +34,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, | |||
34 | tcp_get_info(sk, info); | 34 | tcp_get_info(sk, info); |
35 | } | 35 | } |
36 | 36 | ||
37 | static struct inet_diag_handler tcp_diag_handler = { | 37 | static const struct inet_diag_handler tcp_diag_handler = { |
38 | .idiag_hashinfo = &tcp_hashinfo, | 38 | .idiag_hashinfo = &tcp_hashinfo, |
39 | .idiag_get_info = tcp_diag_get_info, | 39 | .idiag_get_info = tcp_diag_get_info, |
40 | .idiag_type = TCPDIAG_GETSOCK, | 40 | .idiag_type = TCPDIAG_GETSOCK, |
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index af99776146ff..937549b8a921 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
@@ -69,9 +69,12 @@ static u32 htcp_cwnd_undo(struct sock *sk) | |||
69 | const struct tcp_sock *tp = tcp_sk(sk); | 69 | const struct tcp_sock *tp = tcp_sk(sk); |
70 | struct htcp *ca = inet_csk_ca(sk); | 70 | struct htcp *ca = inet_csk_ca(sk); |
71 | 71 | ||
72 | ca->last_cong = ca->undo_last_cong; | 72 | if (ca->undo_last_cong) { |
73 | ca->maxRTT = ca->undo_maxRTT; | 73 | ca->last_cong = ca->undo_last_cong; |
74 | ca->old_maxB = ca->undo_old_maxB; | 74 | ca->maxRTT = ca->undo_maxRTT; |
75 | ca->old_maxB = ca->undo_old_maxB; | ||
76 | ca->undo_last_cong = 0; | ||
77 | } | ||
75 | 78 | ||
76 | return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); | 79 | return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); |
77 | } | 80 | } |
@@ -268,7 +271,10 @@ static void htcp_state(struct sock *sk, u8 new_state) | |||
268 | case TCP_CA_Open: | 271 | case TCP_CA_Open: |
269 | { | 272 | { |
270 | struct htcp *ca = inet_csk_ca(sk); | 273 | struct htcp *ca = inet_csk_ca(sk); |
271 | ca->last_cong = jiffies; | 274 | if (ca->undo_last_cong) { |
275 | ca->last_cong = jiffies; | ||
276 | ca->undo_last_cong = 0; | ||
277 | } | ||
272 | } | 278 | } |
273 | break; | 279 | break; |
274 | case TCP_CA_CWR: | 280 | case TCP_CA_CWR: |
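Both hunks add the same guard: ca->undo_last_cong doubles as a "snapshot valid" flag, so an undo, or a return to TCP_CA_Open, that is not paired with a preceding congestion event can no longer clobber live state with stale snapshot values. In htcp_cwnd_undo() the snapshot is restored at most once; in the TCP_CA_Open case last_cong is refreshed only while a snapshot is pending. The shape of the fix, reduced (a sketch; fields as in struct htcp):

	static void htcp_snapshot(struct htcp *ca)
	{
		ca->undo_last_cong = ca->last_cong;	/* marks snapshot live */
		ca->undo_maxRTT    = ca->maxRTT;
		ca->undo_old_maxB  = ca->old_maxB;
	}

	static void htcp_consume_snapshot(struct htcp *ca)
	{
		if (ca->undo_last_cong) {	/* restore only a live snapshot */
			ca->last_cong = ca->undo_last_cong;
			ca->maxRTT    = ca->undo_maxRTT;
			ca->old_maxB  = ca->undo_old_maxB;
			ca->undo_last_cong = 0;	/* ... and only once */
		}
	}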
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 097294b7da3e..d67b6e9cc540 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -1002,7 +1002,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) | |||
1002 | } | 1002 | } |
1003 | } | 1003 | } |
1004 | 1004 | ||
1005 | void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) | 1005 | static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, |
1006 | struct sk_buff *skb) | ||
1006 | { | 1007 | { |
1007 | tcp_verify_retransmit_hint(tp, skb); | 1008 | tcp_verify_retransmit_hint(tp, skb); |
1008 | 1009 | ||
@@ -1241,26 +1242,47 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, | |||
1241 | * aligned portion of it that matches. Therefore we might need to fragment | 1242 | * aligned portion of it that matches. Therefore we might need to fragment |
1242 | * which may fail and create some hassle (caller must handle error case | 1243 | * which may fail and create some hassle (caller must handle error case |
1243 | * returns). | 1244 | * returns). |
1245 | * | ||
1246 | * FIXME: this could be merged into the shift decision code | ||
1244 | */ | 1247 | */ |
1245 | static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | 1248 | static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, |
1246 | u32 start_seq, u32 end_seq) | 1249 | u32 start_seq, u32 end_seq) |
1247 | { | 1250 | { |
1248 | int in_sack, err; | 1251 | int in_sack, err; |
1249 | unsigned int pkt_len; | 1252 | unsigned int pkt_len; |
1253 | unsigned int mss; | ||
1250 | 1254 | ||
1251 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && | 1255 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && |
1252 | !before(end_seq, TCP_SKB_CB(skb)->end_seq); | 1256 | !before(end_seq, TCP_SKB_CB(skb)->end_seq); |
1253 | 1257 | ||
1254 | if (tcp_skb_pcount(skb) > 1 && !in_sack && | 1258 | if (tcp_skb_pcount(skb) > 1 && !in_sack && |
1255 | after(TCP_SKB_CB(skb)->end_seq, start_seq)) { | 1259 | after(TCP_SKB_CB(skb)->end_seq, start_seq)) { |
1256 | 1260 | mss = tcp_skb_mss(skb); | |
1257 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); | 1261 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); |
1258 | 1262 | ||
1259 | if (!in_sack) | 1263 | if (!in_sack) { |
1260 | pkt_len = start_seq - TCP_SKB_CB(skb)->seq; | 1264 | pkt_len = start_seq - TCP_SKB_CB(skb)->seq; |
1261 | else | 1265 | if (pkt_len < mss) |
1266 | pkt_len = mss; | ||
1267 | } else { | ||
1262 | pkt_len = end_seq - TCP_SKB_CB(skb)->seq; | 1268 | pkt_len = end_seq - TCP_SKB_CB(skb)->seq; |
1263 | err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); | 1269 | if (pkt_len < mss) |
1270 | return -EINVAL; | ||
1271 | } | ||
1272 | |||
1273 | /* Round if necessary so that SACKs cover only full MSSes | ||
1274 | * and/or the remaining small portion (if present) | ||
1275 | */ | ||
1276 | if (pkt_len > mss) { | ||
1277 | unsigned int new_len = (pkt_len / mss) * mss; | ||
1278 | if (!in_sack && new_len < pkt_len) { | ||
1279 | new_len += mss; | ||
1280 | if (new_len > skb->len) | ||
1281 | return 0; | ||
1282 | } | ||
1283 | pkt_len = new_len; | ||
1284 | } | ||
1285 | err = tcp_fragment(sk, skb, pkt_len, mss); | ||
1264 | if (err < 0) | 1286 | if (err < 0) |
1265 | return err; | 1287 | return err; |
1266 | } | 1288 | } |
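The new rounding step is easiest to see with concrete numbers (hypothetical values):

	/* Worked example: skb covers seq 0..4500, mss = 1000, and a SACK
	 * block starts at 2500, so only the tail of the skb is SACKed
	 * (in_sack == 0):
	 *
	 *	pkt_len = start_seq - seq      = 2500
	 *	new_len = (2500 / 1000) * 1000 = 2000	(round down)
	 *	!in_sack && new_len < pkt_len  -> new_len = 3000 (up one MSS)
	 *	new_len <= skb->len (4500)     -> fragment at 3000
	 *
	 * tcp_fragment() then splits the skb into 0..3000 and 3000..4500,
	 * so the un-SACKed head stays a whole number of MSSes and pcount
	 * accounting survives the later tagging of the tail. */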
@@ -1269,7 +1291,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | |||
1269 | } | 1291 | } |
1270 | 1292 | ||
1271 | static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, | 1293 | static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, |
1272 | int *reord, int dup_sack, int fack_count) | 1294 | int *reord, int dup_sack, int fack_count, |
1295 | u8 *sackedto, int pcount) | ||
1273 | { | 1296 | { |
1274 | struct tcp_sock *tp = tcp_sk(sk); | 1297 | struct tcp_sock *tp = tcp_sk(sk); |
1275 | u8 sacked = TCP_SKB_CB(skb)->sacked; | 1298 | u8 sacked = TCP_SKB_CB(skb)->sacked; |
@@ -1294,10 +1317,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, | |||
1294 | * that retransmission is still in flight. | 1317 | * that retransmission is still in flight. |
1295 | */ | 1318 | */ |
1296 | if (sacked & TCPCB_LOST) { | 1319 | if (sacked & TCPCB_LOST) { |
1297 | TCP_SKB_CB(skb)->sacked &= | 1320 | *sackedto &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); |
1298 | ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); | 1321 | tp->lost_out -= pcount; |
1299 | tp->lost_out -= tcp_skb_pcount(skb); | 1322 | tp->retrans_out -= pcount; |
1300 | tp->retrans_out -= tcp_skb_pcount(skb); | ||
1301 | } | 1323 | } |
1302 | } else { | 1324 | } else { |
1303 | if (!(sacked & TCPCB_RETRANS)) { | 1325 | if (!(sacked & TCPCB_RETRANS)) { |
@@ -1314,48 +1336,280 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, | |||
1314 | } | 1336 | } |
1315 | 1337 | ||
1316 | if (sacked & TCPCB_LOST) { | 1338 | if (sacked & TCPCB_LOST) { |
1317 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; | 1339 | *sackedto &= ~TCPCB_LOST; |
1318 | tp->lost_out -= tcp_skb_pcount(skb); | 1340 | tp->lost_out -= pcount; |
1319 | } | 1341 | } |
1320 | } | 1342 | } |
1321 | 1343 | ||
1322 | TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; | 1344 | *sackedto |= TCPCB_SACKED_ACKED; |
1323 | flag |= FLAG_DATA_SACKED; | 1345 | flag |= FLAG_DATA_SACKED; |
1324 | tp->sacked_out += tcp_skb_pcount(skb); | 1346 | tp->sacked_out += pcount; |
1325 | 1347 | ||
1326 | fack_count += tcp_skb_pcount(skb); | 1348 | fack_count += pcount; |
1327 | 1349 | ||
1328 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ | 1350 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
1329 | if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && | 1351 | if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && |
1330 | before(TCP_SKB_CB(skb)->seq, | 1352 | before(TCP_SKB_CB(skb)->seq, |
1331 | TCP_SKB_CB(tp->lost_skb_hint)->seq)) | 1353 | TCP_SKB_CB(tp->lost_skb_hint)->seq)) |
1332 | tp->lost_cnt_hint += tcp_skb_pcount(skb); | 1354 | tp->lost_cnt_hint += pcount; |
1333 | 1355 | ||
1334 | if (fack_count > tp->fackets_out) | 1356 | if (fack_count > tp->fackets_out) |
1335 | tp->fackets_out = fack_count; | 1357 | tp->fackets_out = fack_count; |
1336 | |||
1337 | if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) | ||
1338 | tcp_advance_highest_sack(sk, skb); | ||
1339 | } | 1358 | } |
1340 | 1359 | ||
1341 | /* D-SACK. We can detect redundant retransmission in S|R and plain R | 1360 | /* D-SACK. We can detect redundant retransmission in S|R and plain R |
1342 | * frames and clear it. undo_retrans is decreased above, L|R frames | 1361 | * frames and clear it. undo_retrans is decreased above, L|R frames |
1343 | * are accounted above as well. | 1362 | * are accounted above as well. |
1344 | */ | 1363 | */ |
1345 | if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { | 1364 | if (dup_sack && (*sackedto & TCPCB_SACKED_RETRANS)) { |
1346 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | 1365 | *sackedto &= ~TCPCB_SACKED_RETRANS; |
1347 | tp->retrans_out -= tcp_skb_pcount(skb); | 1366 | tp->retrans_out -= pcount; |
1348 | } | 1367 | } |
1349 | 1368 | ||
1350 | return flag; | 1369 | return flag; |
1351 | } | 1370 | } |
1352 | 1371 | ||
1372 | static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, | ||
1373 | struct sk_buff *skb, unsigned int pcount, | ||
1374 | int shifted, int fack_count, int *reord, | ||
1375 | int *flag, int mss) | ||
1376 | { | ||
1377 | struct tcp_sock *tp = tcp_sk(sk); | ||
1378 | u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */ | ||
1379 | |||
1380 | BUG_ON(!pcount); | ||
1381 | |||
1382 | /* Tweak before seqno plays */ | ||
1383 | if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && | ||
1384 | !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) | ||
1385 | tp->lost_cnt_hint += pcount; | ||
1386 | |||
1387 | TCP_SKB_CB(prev)->end_seq += shifted; | ||
1388 | TCP_SKB_CB(skb)->seq += shifted; | ||
1389 | |||
1390 | skb_shinfo(prev)->gso_segs += pcount; | ||
1391 | BUG_ON(skb_shinfo(skb)->gso_segs < pcount); | ||
1392 | skb_shinfo(skb)->gso_segs -= pcount; | ||
1393 | |||
1394 | /* When we're adding to gso_segs == 1, gso_size will be zero; | ||
1395 | * in theory this shouldn't be necessary, but as long as DSACK | ||
1396 | * code can come after this skb later on, it's better to keep | ||
1397 | * setting gso_size to something. | ||
1398 | */ | ||
1399 | if (!skb_shinfo(prev)->gso_size) { | ||
1400 | skb_shinfo(prev)->gso_size = mss; | ||
1401 | skb_shinfo(prev)->gso_type = sk->sk_gso_type; | ||
1402 | } | ||
1403 | |||
1404 | /* CHECKME: To clear or not to clear? Mimics normal skb currently */ | ||
1405 | if (skb_shinfo(skb)->gso_segs <= 1) { | ||
1406 | skb_shinfo(skb)->gso_size = 0; | ||
1407 | skb_shinfo(skb)->gso_type = 0; | ||
1408 | } | ||
1409 | |||
1410 | *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked, | ||
1411 | pcount); | ||
1412 | |||
1413 | /* Difference in this won't matter, both ACKed by the same cumul. ACK */ | ||
1414 | TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); | ||
1415 | |||
1416 | if (skb->len > 0) { | ||
1417 | BUG_ON(!tcp_skb_pcount(skb)); | ||
1418 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); | ||
1419 | return 0; | ||
1420 | } | ||
1421 | |||
1422 | /* Whole SKB was eaten :-) */ | ||
1423 | |||
1424 | if (skb == tp->retransmit_skb_hint) | ||
1425 | tp->retransmit_skb_hint = prev; | ||
1426 | if (skb == tp->scoreboard_skb_hint) | ||
1427 | tp->scoreboard_skb_hint = prev; | ||
1428 | if (skb == tp->lost_skb_hint) { | ||
1429 | tp->lost_skb_hint = prev; | ||
1430 | tp->lost_cnt_hint -= tcp_skb_pcount(prev); | ||
1431 | } | ||
1432 | |||
1433 | TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; | ||
1434 | if (skb == tcp_highest_sack(sk)) | ||
1435 | tcp_advance_highest_sack(sk, skb); | ||
1436 | |||
1437 | tcp_unlink_write_queue(skb, sk); | ||
1438 | sk_wmem_free_skb(sk, skb); | ||
1439 | |||
1440 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); | ||
1441 | |||
1442 | return 1; | ||
1443 | } | ||
1444 | |||
1445 | /* I wish gso_size had a somewhat saner initialization than | ||
1446 | * something-or-zero, which complicates things. | ||
1447 | */ | ||
1448 | static int tcp_shift_mss(struct sk_buff *skb) | ||
1449 | { | ||
1450 | int mss = tcp_skb_mss(skb); | ||
1451 | |||
1452 | if (!mss) | ||
1453 | mss = skb->len; | ||
1454 | |||
1455 | return mss; | ||
1456 | } | ||
1457 | |||
1458 | /* Shifting pages past head area doesn't work */ | ||
1459 | static int skb_can_shift(struct sk_buff *skb) | ||
1460 | { | ||
1461 | return !skb_headlen(skb) && skb_is_nonlinear(skb); | ||
1462 | } | ||
1463 | |||
1464 | /* Try collapsing SACK blocks spanning across multiple skbs to a single | ||
1465 | * skb. | ||
1466 | */ | ||
1467 | static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | ||
1468 | u32 start_seq, u32 end_seq, | ||
1469 | int dup_sack, int *fack_count, | ||
1470 | int *reord, int *flag) | ||
1471 | { | ||
1472 | struct tcp_sock *tp = tcp_sk(sk); | ||
1473 | struct sk_buff *prev; | ||
1474 | int mss; | ||
1475 | int pcount = 0; | ||
1476 | int len; | ||
1477 | int in_sack; | ||
1478 | |||
1479 | if (!sk_can_gso(sk)) | ||
1480 | goto fallback; | ||
1481 | |||
1482 | /* Normally R but no L won't result in plain S */ | ||
1483 | if (!dup_sack && | ||
1484 | (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS) | ||
1485 | goto fallback; | ||
1486 | if (!skb_can_shift(skb)) | ||
1487 | goto fallback; | ||
1488 | /* This frame is about to be dropped (was ACKed). */ | ||
1489 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) | ||
1490 | goto fallback; | ||
1491 | |||
1492 | /* Can only happen with delayed DSACK + discard craziness */ | ||
1493 | if (unlikely(skb == tcp_write_queue_head(sk))) | ||
1494 | goto fallback; | ||
1495 | prev = tcp_write_queue_prev(sk, skb); | ||
1496 | |||
1497 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) | ||
1498 | goto fallback; | ||
1499 | |||
1500 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && | ||
1501 | !before(end_seq, TCP_SKB_CB(skb)->end_seq); | ||
1502 | |||
1503 | if (in_sack) { | ||
1504 | len = skb->len; | ||
1505 | pcount = tcp_skb_pcount(skb); | ||
1506 | mss = tcp_shift_mss(skb); | ||
1507 | |||
1508 | /* TODO: Fix DSACKs to not fragment already SACKed and we can | ||
1509 | * drop this restriction as unnecessary | ||
1510 | */ | ||
1511 | if (mss != tcp_shift_mss(prev)) | ||
1512 | goto fallback; | ||
1513 | } else { | ||
1514 | if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) | ||
1515 | goto noop; | ||
1516 | /* CHECKME: Is this the non-MSS split case only? Note that this | ||
1517 | * will cause skipped skbs due to the advancing loop; btw, the | ||
1518 | * original code has that behaviour too. | ||
1519 | */ | ||
1520 | if (tcp_skb_pcount(skb) <= 1) | ||
1521 | goto noop; | ||
1522 | |||
1523 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); | ||
1524 | if (!in_sack) { | ||
1525 | /* TODO: head merge to next could be attempted here | ||
1526 | * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), | ||
1527 | * though it might not be worth of the additional hassle | ||
1528 | * | ||
1529 | * ...we can probably just fall back to what was done | ||
1530 | * previously. We could try merging non-SACKed ones | ||
1531 | * as well, but it probably isn't going to pay off | ||
1532 | * because later SACKs might split them again, and it | ||
1533 | * would make skb timestamp tracking a considerably | ||
1534 | * harder problem. | ||
1535 | */ | ||
1536 | goto fallback; | ||
1537 | } | ||
1538 | |||
1539 | len = end_seq - TCP_SKB_CB(skb)->seq; | ||
1540 | BUG_ON(len < 0); | ||
1541 | BUG_ON(len > skb->len); | ||
1542 | |||
1543 | /* MSS boundaries should be honoured or else pcount will | ||
1544 | * severely break, even though it makes things a bit trickier. | ||
1545 | * Optimize the common case to avoid most of the divides. | ||
1546 | */ | ||
1547 | mss = tcp_skb_mss(skb); | ||
1548 | |||
1549 | /* TODO: Fix DSACKs to not fragment already SACKed and we can | ||
1550 | * drop this restriction as unnecessary | ||
1551 | */ | ||
1552 | if (mss != tcp_shift_mss(prev)) | ||
1553 | goto fallback; | ||
1554 | |||
1555 | if (len == mss) { | ||
1556 | pcount = 1; | ||
1557 | } else if (len < mss) { | ||
1558 | goto noop; | ||
1559 | } else { | ||
1560 | pcount = len / mss; | ||
1561 | len = pcount * mss; | ||
1562 | } | ||
1563 | } | ||
1564 | |||
1565 | if (!skb_shift(prev, skb, len)) | ||
1566 | goto fallback; | ||
1567 | if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord, | ||
1568 | flag, mss)) | ||
1569 | goto out; | ||
1570 | |||
1571 | /* A filled hole allows collapsing with the next skb as well; this is | ||
1572 | * very useful when a hole-on-every-nth-skb pattern happens | ||
1573 | */ | ||
1574 | if (prev == tcp_write_queue_tail(sk)) | ||
1575 | goto out; | ||
1576 | skb = tcp_write_queue_next(sk, prev); | ||
1577 | |||
1578 | if (!skb_can_shift(skb)) | ||
1579 | goto out; | ||
1580 | if (skb == tcp_send_head(sk)) | ||
1581 | goto out; | ||
1582 | if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) | ||
1583 | goto out; | ||
1584 | |||
1585 | len = skb->len; | ||
1586 | if (skb_shift(prev, skb, len)) { | ||
1587 | pcount += tcp_skb_pcount(skb); | ||
1588 | tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len, | ||
1589 | *fack_count, reord, flag, mss); | ||
1590 | } | ||
1591 | |||
1592 | out: | ||
1593 | *fack_count += pcount; | ||
1594 | return prev; | ||
1595 | |||
1596 | noop: | ||
1597 | return skb; | ||
1598 | |||
1599 | fallback: | ||
1600 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); | ||
1601 | return NULL; | ||
1602 | } | ||
1603 | |||
1353 | static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | 1604 | static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, |
1354 | struct tcp_sack_block *next_dup, | 1605 | struct tcp_sack_block *next_dup, |
1355 | u32 start_seq, u32 end_seq, | 1606 | u32 start_seq, u32 end_seq, |
1356 | int dup_sack_in, int *fack_count, | 1607 | int dup_sack_in, int *fack_count, |
1357 | int *reord, int *flag) | 1608 | int *reord, int *flag) |
1358 | { | 1609 | { |
1610 | struct tcp_sock *tp = tcp_sk(sk); | ||
1611 | struct sk_buff *tmp; | ||
1612 | |||
1359 | tcp_for_write_queue_from(skb, sk) { | 1613 | tcp_for_write_queue_from(skb, sk) { |
1360 | int in_sack = 0; | 1614 | int in_sack = 0; |
1361 | int dup_sack = dup_sack_in; | 1615 | int dup_sack = dup_sack_in; |
@@ -1376,15 +1630,41 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1376 | dup_sack = 1; | 1630 | dup_sack = 1; |
1377 | } | 1631 | } |
1378 | 1632 | ||
1379 | if (in_sack <= 0) | 1633 | /* The skb reference here is a bit tricky to get right, since |
1380 | in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, | 1634 | * shifting can eat and free both this skb and the next, |
1381 | end_seq); | 1635 | * so not even the _safe variant of the loop is enough. |
1636 | */ | ||
1637 | if (in_sack <= 0) { | ||
1638 | tmp = tcp_shift_skb_data(sk, skb, start_seq, | ||
1639 | end_seq, dup_sack, | ||
1640 | fack_count, reord, flag); | ||
1641 | if (tmp != NULL) { | ||
1642 | if (tmp != skb) { | ||
1643 | skb = tmp; | ||
1644 | continue; | ||
1645 | } | ||
1646 | |||
1647 | in_sack = 0; | ||
1648 | } else { | ||
1649 | in_sack = tcp_match_skb_to_sack(sk, skb, | ||
1650 | start_seq, | ||
1651 | end_seq); | ||
1652 | } | ||
1653 | } | ||
1654 | |||
1382 | if (unlikely(in_sack < 0)) | 1655 | if (unlikely(in_sack < 0)) |
1383 | break; | 1656 | break; |
1384 | 1657 | ||
1385 | if (in_sack) | 1658 | if (in_sack) { |
1386 | *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, | 1659 | *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, |
1387 | *fack_count); | 1660 | *fack_count, |
1661 | &(TCP_SKB_CB(skb)->sacked), | ||
1662 | tcp_skb_pcount(skb)); | ||
1663 | |||
1664 | if (!before(TCP_SKB_CB(skb)->seq, | ||
1665 | tcp_highest_sack_seq(tp))) | ||
1666 | tcp_advance_highest_sack(sk, skb); | ||
1667 | } | ||
1388 | 1668 | ||
1389 | *fack_count += tcp_skb_pcount(skb); | 1669 | *fack_count += tcp_skb_pcount(skb); |
1390 | } | 1670 | } |
@@ -1401,7 +1681,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, | |||
1401 | if (skb == tcp_send_head(sk)) | 1681 | if (skb == tcp_send_head(sk)) |
1402 | break; | 1682 | break; |
1403 | 1683 | ||
1404 | if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) | 1684 | if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) |
1405 | break; | 1685 | break; |
1406 | 1686 | ||
1407 | *fack_count += tcp_skb_pcount(skb); | 1687 | *fack_count += tcp_skb_pcount(skb); |
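The predicate change here is a genuine semantic fix, not churn. With the sequence-space helpers as defined in tcp.h:

	static inline int before(__u32 seq1, __u32 seq2)
	{
		return (__s32)(seq1 - seq2) < 0;	/* seq1 < seq2, mod 2^32 */
	}
	#define after(seq2, seq1)	before(seq1, seq2)	/* seq2 > seq1 */

	/* old: !before(end_seq, skip_to_seq) stopped once end_seq >= skip_to_seq,
	 *      i.e. also on an skb that merely ends at the skip point;
	 * new:  after(end_seq, skip_to_seq) stops only when the skb extends
	 *      past it, so the boundary skb is still counted into fack_count. */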
@@ -1660,7 +1940,7 @@ out: | |||
1660 | /* Limits sacked_out so that sum with lost_out isn't ever larger than | 1940 | /* Limits sacked_out so that sum with lost_out isn't ever larger than |
1661 | * packets_out. Returns zero if sacked_out adjustment wasn't necessary. | 1941 | * packets_out. Returns zero if sacked_out adjustment wasn't necessary. |
1662 | */ | 1942 | */ |
1663 | int tcp_limit_reno_sacked(struct tcp_sock *tp) | 1943 | static int tcp_limit_reno_sacked(struct tcp_sock *tp) |
1664 | { | 1944 | { |
1665 | u32 holes; | 1945 | u32 holes; |
1666 | 1946 | ||
@@ -2559,6 +2839,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) | |||
2559 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 2839 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
2560 | } | 2840 | } |
2561 | 2841 | ||
2842 | /* Do a simple retransmit without using the backoff mechanisms in | ||
2843 | * tcp_timer. This is used for path mtu discovery. | ||
2844 | * The socket is already locked here. | ||
2845 | */ | ||
2846 | void tcp_simple_retransmit(struct sock *sk) | ||
2847 | { | ||
2848 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
2849 | struct tcp_sock *tp = tcp_sk(sk); | ||
2850 | struct sk_buff *skb; | ||
2851 | unsigned int mss = tcp_current_mss(sk, 0); | ||
2852 | u32 prior_lost = tp->lost_out; | ||
2853 | |||
2854 | tcp_for_write_queue(skb, sk) { | ||
2855 | if (skb == tcp_send_head(sk)) | ||
2856 | break; | ||
2857 | if (skb->len > mss && | ||
2858 | !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { | ||
2859 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { | ||
2860 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | ||
2861 | tp->retrans_out -= tcp_skb_pcount(skb); | ||
2862 | } | ||
2863 | tcp_skb_mark_lost_uncond_verify(tp, skb); | ||
2864 | } | ||
2865 | } | ||
2866 | |||
2867 | tcp_clear_retrans_hints_partial(tp); | ||
2868 | |||
2869 | if (prior_lost == tp->lost_out) | ||
2870 | return; | ||
2871 | |||
2872 | if (tcp_is_reno(tp)) | ||
2873 | tcp_limit_reno_sacked(tp); | ||
2874 | |||
2875 | tcp_verify_left_out(tp); | ||
2876 | |||
2877 | /* Don't muck with the congestion window here. | ||
2878 | * The reason is that we do not increase the amount of _data_ | ||
2879 | * in the network, but the units changed and the effective | ||
2880 | * cwnd/ssthresh really did shrink. | ||
2881 | */ | ||
2882 | if (icsk->icsk_ca_state != TCP_CA_Loss) { | ||
2883 | tp->high_seq = tp->snd_nxt; | ||
2884 | tp->snd_ssthresh = tcp_current_ssthresh(sk); | ||
2885 | tp->prior_ssthresh = 0; | ||
2886 | tp->undo_marker = 0; | ||
2887 | tcp_set_ca_state(sk, TCP_CA_Loss); | ||
2888 | } | ||
2889 | tcp_xmit_retransmit_queue(sk); | ||
2890 | } | ||
2891 | |||
2562 | /* Process an event, which can update packets-in-flight not trivially. | 2892 | /* Process an event, which can update packets-in-flight not trivially. |
2563 | * Main goal of this function is to calculate new estimate for left_out, | 2893 | * Main goal of this function is to calculate new estimate for left_out, |
2564 | * taking into account both packets sitting in receiver's buffer and | 2894 | * taking into account both packets sitting in receiver's buffer and |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d49233f409b5..26b9030747cc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) | |||
97 | } | 97 | } |
98 | #endif | 98 | #endif |
99 | 99 | ||
100 | struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { | 100 | struct inet_hashinfo tcp_hashinfo; |
101 | .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock), | ||
102 | .lhash_users = ATOMIC_INIT(0), | ||
103 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), | ||
104 | }; | ||
105 | 101 | ||
106 | static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) | 102 | static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) |
107 | { | 103 | { |
@@ -492,7 +488,7 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) | |||
492 | skb->csum_offset = offsetof(struct tcphdr, check); | 488 | skb->csum_offset = offsetof(struct tcphdr, check); |
493 | } else { | 489 | } else { |
494 | th->check = tcp_v4_check(len, inet->saddr, inet->daddr, | 490 | th->check = tcp_v4_check(len, inet->saddr, inet->daddr, |
495 | csum_partial((char *)th, | 491 | csum_partial(th, |
496 | th->doff << 2, | 492 | th->doff << 2, |
497 | skb->csum)); | 493 | skb->csum)); |
498 | } | 494 | } |
@@ -726,7 +722,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, | |||
726 | th->check = tcp_v4_check(skb->len, | 722 | th->check = tcp_v4_check(skb->len, |
727 | ireq->loc_addr, | 723 | ireq->loc_addr, |
728 | ireq->rmt_addr, | 724 | ireq->rmt_addr, |
729 | csum_partial((char *)th, skb->len, | 725 | csum_partial(th, skb->len, |
730 | skb->csum)); | 726 | skb->csum)); |
731 | 727 | ||
732 | err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, | 728 | err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, |
@@ -1801,7 +1797,7 @@ static int tcp_v4_init_sock(struct sock *sk) | |||
1801 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; | 1797 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; |
1802 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; | 1798 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; |
1803 | 1799 | ||
1804 | atomic_inc(&tcp_sockets_allocated); | 1800 | percpu_counter_inc(&tcp_sockets_allocated); |
1805 | 1801 | ||
1806 | return 0; | 1802 | return 0; |
1807 | } | 1803 | } |
@@ -1849,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk) | |||
1849 | sk->sk_sndmsg_page = NULL; | 1845 | sk->sk_sndmsg_page = NULL; |
1850 | } | 1846 | } |
1851 | 1847 | ||
1852 | atomic_dec(&tcp_sockets_allocated); | 1848 | percpu_counter_dec(&tcp_sockets_allocated); |
1853 | } | 1849 | } |
1854 | 1850 | ||
1855 | EXPORT_SYMBOL(tcp_v4_destroy_sock); | 1851 | EXPORT_SYMBOL(tcp_v4_destroy_sock); |
@@ -1857,32 +1853,35 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); | |||
1857 | #ifdef CONFIG_PROC_FS | 1853 | #ifdef CONFIG_PROC_FS |
1858 | /* Proc filesystem TCP sock list dumping. */ | 1854 | /* Proc filesystem TCP sock list dumping. */ |
1859 | 1855 | ||
1860 | static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) | 1856 | static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) |
1861 | { | 1857 | { |
1862 | return hlist_empty(head) ? NULL : | 1858 | return hlist_nulls_empty(head) ? NULL : |
1863 | list_entry(head->first, struct inet_timewait_sock, tw_node); | 1859 | list_entry(head->first, struct inet_timewait_sock, tw_node); |
1864 | } | 1860 | } |
1865 | 1861 | ||
1866 | static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) | 1862 | static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) |
1867 | { | 1863 | { |
1868 | return tw->tw_node.next ? | 1864 | return !is_a_nulls(tw->tw_node.next) ? |
1869 | hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; | 1865 | hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; |
1870 | } | 1866 | } |
1871 | 1867 | ||
1872 | static void *listening_get_next(struct seq_file *seq, void *cur) | 1868 | static void *listening_get_next(struct seq_file *seq, void *cur) |
1873 | { | 1869 | { |
1874 | struct inet_connection_sock *icsk; | 1870 | struct inet_connection_sock *icsk; |
1875 | struct hlist_node *node; | 1871 | struct hlist_nulls_node *node; |
1876 | struct sock *sk = cur; | 1872 | struct sock *sk = cur; |
1873 | struct inet_listen_hashbucket *ilb; | ||
1877 | struct tcp_iter_state *st = seq->private; | 1874 | struct tcp_iter_state *st = seq->private; |
1878 | struct net *net = seq_file_net(seq); | 1875 | struct net *net = seq_file_net(seq); |
1879 | 1876 | ||
1880 | if (!sk) { | 1877 | if (!sk) { |
1881 | st->bucket = 0; | 1878 | st->bucket = 0; |
1882 | sk = sk_head(&tcp_hashinfo.listening_hash[0]); | 1879 | ilb = &tcp_hashinfo.listening_hash[0]; |
1880 | spin_lock_bh(&ilb->lock); | ||
1881 | sk = sk_nulls_head(&ilb->head); | ||
1883 | goto get_sk; | 1882 | goto get_sk; |
1884 | } | 1883 | } |
1885 | 1884 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; | |
1886 | ++st->num; | 1885 | ++st->num; |
1887 | 1886 | ||
1888 | if (st->state == TCP_SEQ_STATE_OPENREQ) { | 1887 | if (st->state == TCP_SEQ_STATE_OPENREQ) { |
@@ -1915,7 +1914,7 @@ get_req: | |||
1915 | sk = sk_next(sk); | 1914 | sk = sk_next(sk); |
1916 | } | 1915 | } |
1917 | get_sk: | 1916 | get_sk: |
1918 | sk_for_each_from(sk, node) { | 1917 | sk_nulls_for_each_from(sk, node) { |
1919 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { | 1918 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { |
1920 | cur = sk; | 1919 | cur = sk; |
1921 | goto out; | 1920 | goto out; |
@@ -1932,8 +1931,11 @@ start_req: | |||
1932 | } | 1931 | } |
1933 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); | 1932 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
1934 | } | 1933 | } |
1934 | spin_unlock_bh(&ilb->lock); | ||
1935 | if (++st->bucket < INET_LHTABLE_SIZE) { | 1935 | if (++st->bucket < INET_LHTABLE_SIZE) { |
1936 | sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); | 1936 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; |
1937 | spin_lock_bh(&ilb->lock); | ||
1938 | sk = sk_nulls_head(&ilb->head); | ||
1937 | goto get_sk; | 1939 | goto get_sk; |
1938 | } | 1940 | } |
1939 | cur = NULL; | 1941 | cur = NULL; |
@@ -1954,8 +1956,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) | |||
1954 | 1956 | ||
1955 | static inline int empty_bucket(struct tcp_iter_state *st) | 1957 | static inline int empty_bucket(struct tcp_iter_state *st) |
1956 | { | 1958 | { |
1957 | return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && | 1959 | return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && |
1958 | hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); | 1960 | hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); |
1959 | } | 1961 | } |
1960 | 1962 | ||
1961 | static void *established_get_first(struct seq_file *seq) | 1963 | static void *established_get_first(struct seq_file *seq) |
@@ -1966,16 +1968,16 @@ static void *established_get_first(struct seq_file *seq) | |||
1966 | 1968 | ||
1967 | for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { | 1969 | for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { |
1968 | struct sock *sk; | 1970 | struct sock *sk; |
1969 | struct hlist_node *node; | 1971 | struct hlist_nulls_node *node; |
1970 | struct inet_timewait_sock *tw; | 1972 | struct inet_timewait_sock *tw; |
1971 | rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); | 1973 | spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); |
1972 | 1974 | ||
1973 | /* Lockless fast path for the common case of empty buckets */ | 1975 | /* Lockless fast path for the common case of empty buckets */ |
1974 | if (empty_bucket(st)) | 1976 | if (empty_bucket(st)) |
1975 | continue; | 1977 | continue; |
1976 | 1978 | ||
1977 | read_lock_bh(lock); | 1979 | spin_lock_bh(lock); |
1978 | sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { | 1980 | sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { |
1979 | if (sk->sk_family != st->family || | 1981 | if (sk->sk_family != st->family || |
1980 | !net_eq(sock_net(sk), net)) { | 1982 | !net_eq(sock_net(sk), net)) { |
1981 | continue; | 1983 | continue; |
@@ -1993,7 +1995,7 @@ static void *established_get_first(struct seq_file *seq) | |||
1993 | rc = tw; | 1995 | rc = tw; |
1994 | goto out; | 1996 | goto out; |
1995 | } | 1997 | } |
1996 | read_unlock_bh(lock); | 1998 | spin_unlock_bh(lock); |
1997 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 1999 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
1998 | } | 2000 | } |
1999 | out: | 2001 | out: |
@@ -2004,7 +2006,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) | |||
2004 | { | 2006 | { |
2005 | struct sock *sk = cur; | 2007 | struct sock *sk = cur; |
2006 | struct inet_timewait_sock *tw; | 2008 | struct inet_timewait_sock *tw; |
2007 | struct hlist_node *node; | 2009 | struct hlist_nulls_node *node; |
2008 | struct tcp_iter_state *st = seq->private; | 2010 | struct tcp_iter_state *st = seq->private; |
2009 | struct net *net = seq_file_net(seq); | 2011 | struct net *net = seq_file_net(seq); |
2010 | 2012 | ||
@@ -2021,7 +2023,7 @@ get_tw: | |||
2021 | cur = tw; | 2023 | cur = tw; |
2022 | goto out; | 2024 | goto out; |
2023 | } | 2025 | } |
2024 | read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); | 2026 | spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); |
2025 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 2027 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2026 | 2028 | ||
2027 | /* Look for next non empty bucket */ | 2029 | /* Look for next non empty bucket */ |
@@ -2031,12 +2033,12 @@ get_tw: | |||
2031 | if (st->bucket >= tcp_hashinfo.ehash_size) | 2033 | if (st->bucket >= tcp_hashinfo.ehash_size) |
2032 | return NULL; | 2034 | return NULL; |
2033 | 2035 | ||
2034 | read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); | 2036 | spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); |
2035 | sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); | 2037 | sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); |
2036 | } else | 2038 | } else |
2037 | sk = sk_next(sk); | 2039 | sk = sk_nulls_next(sk); |
2038 | 2040 | ||
2039 | sk_for_each_from(sk, node) { | 2041 | sk_nulls_for_each_from(sk, node) { |
2040 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) | 2042 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) |
2041 | goto found; | 2043 | goto found; |
2042 | } | 2044 | } |
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) | |||
2066 | void *rc; | 2068 | void *rc; |
2067 | struct tcp_iter_state *st = seq->private; | 2069 | struct tcp_iter_state *st = seq->private; |
2068 | 2070 | ||
2069 | inet_listen_lock(&tcp_hashinfo); | ||
2070 | st->state = TCP_SEQ_STATE_LISTENING; | 2071 | st->state = TCP_SEQ_STATE_LISTENING; |
2071 | rc = listening_get_idx(seq, &pos); | 2072 | rc = listening_get_idx(seq, &pos); |
2072 | 2073 | ||
2073 | if (!rc) { | 2074 | if (!rc) { |
2074 | inet_listen_unlock(&tcp_hashinfo); | ||
2075 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 2075 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2076 | rc = established_get_idx(seq, pos); | 2076 | rc = established_get_idx(seq, pos); |
2077 | } | 2077 | } |
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2103 | case TCP_SEQ_STATE_LISTENING: | 2103 | case TCP_SEQ_STATE_LISTENING: |
2104 | rc = listening_get_next(seq, v); | 2104 | rc = listening_get_next(seq, v); |
2105 | if (!rc) { | 2105 | if (!rc) { |
2106 | inet_listen_unlock(&tcp_hashinfo); | ||
2107 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 2106 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2108 | rc = established_get_first(seq); | 2107 | rc = established_get_first(seq); |
2109 | } | 2108 | } |
@@ -2130,12 +2129,12 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) | |||
2130 | } | 2129 | } |
2131 | case TCP_SEQ_STATE_LISTENING: | 2130 | case TCP_SEQ_STATE_LISTENING: |
2132 | if (v != SEQ_START_TOKEN) | 2131 | if (v != SEQ_START_TOKEN) |
2133 | inet_listen_unlock(&tcp_hashinfo); | 2132 | spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); |
2134 | break; | 2133 | break; |
2135 | case TCP_SEQ_STATE_TIME_WAIT: | 2134 | case TCP_SEQ_STATE_TIME_WAIT: |
2136 | case TCP_SEQ_STATE_ESTABLISHED: | 2135 | case TCP_SEQ_STATE_ESTABLISHED: |
2137 | if (v) | 2136 | if (v) |
2138 | read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); | 2137 | spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); |
2139 | break; | 2138 | break; |
2140 | } | 2139 | } |
2141 | } | 2140 | } |
@@ -2375,6 +2374,7 @@ struct proto tcp_prot = { | |||
2375 | .sysctl_rmem = sysctl_tcp_rmem, | 2374 | .sysctl_rmem = sysctl_tcp_rmem, |
2376 | .max_header = MAX_TCP_HEADER, | 2375 | .max_header = MAX_TCP_HEADER, |
2377 | .obj_size = sizeof(struct tcp_sock), | 2376 | .obj_size = sizeof(struct tcp_sock), |
2377 | .slab_flags = SLAB_DESTROY_BY_RCU, | ||
2378 | .twsk_prot = &tcp_timewait_sock_ops, | 2378 | .twsk_prot = &tcp_timewait_sock_ops, |
2379 | .rsk_prot = &tcp_request_sock_ops, | 2379 | .rsk_prot = &tcp_request_sock_ops, |
2380 | .h.hashinfo = &tcp_hashinfo, | 2380 | .h.hashinfo = &tcp_hashinfo, |
@@ -2404,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { | |||
2404 | 2404 | ||
2405 | void __init tcp_v4_init(void) | 2405 | void __init tcp_v4_init(void) |
2406 | { | 2406 | { |
2407 | inet_hashinfo_init(&tcp_hashinfo); | ||
2407 | if (register_pernet_device(&tcp_sk_ops)) | 2408 | if (register_pernet_device(&tcp_sk_ops)) |
2408 | panic("Failed to create the TCP control socket.\n"); | 2409 | panic("Failed to create the TCP control socket.\n"); |
2409 | } | 2410 | } |
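
[Note] The tcp_ipv4.c changes above replace the single global listener rwlock (inet_listen_lock/inet_listen_unlock) with a per-bucket spinlock over an RCU-friendly nulls list, and convert the ehash per-bucket rwlocks to plain spinlocks: lookups now walk the chains locklessly under RCU, so the remaining lock only has to exclude writers (and the /proc/net/tcp walker). The walk the iterator performs now has roughly this shape (a minimal sketch assuming the inet_listen_hashbucket layout visible in the patch; not the exact tree code):

static struct sock *first_listener(struct inet_hashinfo *hashinfo,
				   unsigned short family, struct net *net)
{
	int bucket;

	for (bucket = 0; bucket < INET_LHTABLE_SIZE; bucket++) {
		struct inet_listen_hashbucket *ilb =
				&hashinfo->listening_hash[bucket];
		struct hlist_nulls_node *node;
		struct sock *sk;

		spin_lock_bh(&ilb->lock);	/* one bucket, not the table */
		sk_nulls_for_each(sk, node, &ilb->head) {
			if (sk->sk_family == family &&
			    net_eq(sock_net(sk), net))
				return sk;	/* ilb->lock stays held;
						 * tcp_seq_stop() drops it */
		}
		spin_unlock_bh(&ilb->lock);
	}
	return NULL;
}
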
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a524627923ae..76f840917bcb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -722,7 +722,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) | |||
722 | static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, | 722 | static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, |
723 | unsigned int mss_now) | 723 | unsigned int mss_now) |
724 | { | 724 | { |
725 | if (skb->len <= mss_now || !sk_can_gso(sk)) { | 725 | if (skb->len <= mss_now || !sk_can_gso(sk) || |
726 | tcp_urg_mode(tcp_sk(sk))) { | ||
726 | /* Avoid the costly divide in the normal | 727 | /* Avoid the costly divide in the normal |
727 | * non-TSO case. | 728 | * non-TSO case. |
728 | */ | 729 | */ |
@@ -1163,7 +1164,9 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, | |||
1163 | { | 1164 | { |
1164 | int tso_segs = tcp_skb_pcount(skb); | 1165 | int tso_segs = tcp_skb_pcount(skb); |
1165 | 1166 | ||
1166 | if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { | 1167 | if (!tso_segs || |
1168 | (tso_segs > 1 && (tcp_skb_mss(skb) != mss_now || | ||
1169 | tcp_urg_mode(tcp_sk(sk))))) { | ||
1167 | tcp_set_skb_tso_segs(sk, skb, mss_now); | 1170 | tcp_set_skb_tso_segs(sk, skb, mss_now); |
1168 | tso_segs = tcp_skb_pcount(skb); | 1171 | tso_segs = tcp_skb_pcount(skb); |
1169 | } | 1172 | } |
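
[Note] Both hunks above defer TSO while the connection is in urgent mode: the 16-bit urgent pointer can only be written into one header, so it would be wrong on all but one of the segments a super-packet is split into. tcp_urg_mode() is introduced elsewhere in this series; from memory it reduces to roughly this (sketch, verify against the full patch):

static inline int tcp_urg_mode(const struct tcp_sock *tp)
{
	/* urgent data has been queued but not yet fully acked */
	return tp->snd_una != tp->snd_up;
}
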
@@ -1766,46 +1769,22 @@ u32 __tcp_select_window(struct sock *sk) | |||
1766 | return window; | 1769 | return window; |
1767 | } | 1770 | } |
1768 | 1771 | ||
1769 | /* Attempt to collapse two adjacent SKB's during retransmission. */ | 1772 | /* Collapses two adjacent SKB's during retransmission. */ |
1770 | static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, | 1773 | static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) |
1771 | int mss_now) | ||
1772 | { | 1774 | { |
1773 | struct tcp_sock *tp = tcp_sk(sk); | 1775 | struct tcp_sock *tp = tcp_sk(sk); |
1774 | struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); | 1776 | struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); |
1775 | int skb_size, next_skb_size; | 1777 | int skb_size, next_skb_size; |
1776 | u16 flags; | 1778 | u16 flags; |
1777 | 1779 | ||
1778 | /* The first test we must make is that neither of these two | ||
1779 | * SKB's are still referenced by someone else. | ||
1780 | */ | ||
1781 | if (skb_cloned(skb) || skb_cloned(next_skb)) | ||
1782 | return; | ||
1783 | |||
1784 | skb_size = skb->len; | 1780 | skb_size = skb->len; |
1785 | next_skb_size = next_skb->len; | 1781 | next_skb_size = next_skb->len; |
1786 | flags = TCP_SKB_CB(skb)->flags; | 1782 | flags = TCP_SKB_CB(skb)->flags; |
1787 | 1783 | ||
1788 | /* Also punt if next skb has been SACK'd. */ | ||
1789 | if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) | ||
1790 | return; | ||
1791 | |||
1792 | /* Next skb is out of window. */ | ||
1793 | if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp))) | ||
1794 | return; | ||
1795 | |||
1796 | /* Punt if not enough space exists in the first SKB for | ||
1797 | * the data in the second, or the total combined payload | ||
1798 | * would exceed the MSS. | ||
1799 | */ | ||
1800 | if ((next_skb_size > skb_tailroom(skb)) || | ||
1801 | ((skb_size + next_skb_size) > mss_now)) | ||
1802 | return; | ||
1803 | |||
1804 | BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); | 1784 | BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); |
1805 | 1785 | ||
1806 | tcp_highest_sack_combine(sk, next_skb, skb); | 1786 | tcp_highest_sack_combine(sk, next_skb, skb); |
1807 | 1787 | ||
1808 | /* Ok. We will be able to collapse the packet. */ | ||
1809 | tcp_unlink_write_queue(next_skb, sk); | 1788 | tcp_unlink_write_queue(next_skb, sk); |
1810 | 1789 | ||
1811 | skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), | 1790 | skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), |
@@ -1847,54 +1826,60 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, | |||
1847 | sk_wmem_free_skb(sk, next_skb); | 1826 | sk_wmem_free_skb(sk, next_skb); |
1848 | } | 1827 | } |
1849 | 1828 | ||
1850 | /* Do a simple retransmit without using the backoff mechanisms in | 1829 | static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) |
1851 | * tcp_timer. This is used for path mtu discovery. | 1830 | { |
1852 | * The socket is already locked here. | 1831 | if (tcp_skb_pcount(skb) > 1) |
1853 | */ | 1832 | return 0; |
1854 | void tcp_simple_retransmit(struct sock *sk) | 1833 | /* TODO: SACK collapsing could be used to remove this condition */ |
1834 | if (skb_shinfo(skb)->nr_frags != 0) | ||
1835 | return 0; | ||
1836 | if (skb_cloned(skb)) | ||
1837 | return 0; | ||
1838 | if (skb == tcp_send_head(sk)) | ||
1839 | return 0; | ||
1840 | /* Some heuristics for collapsing over SACK'd could be invented */ | ||
1841 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) | ||
1842 | return 0; | ||
1843 | |||
1844 | return 1; | ||
1845 | } | ||
1846 | |||
1847 | static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, | ||
1848 | int space) | ||
1855 | { | 1849 | { |
1856 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1857 | struct tcp_sock *tp = tcp_sk(sk); | 1850 | struct tcp_sock *tp = tcp_sk(sk); |
1858 | struct sk_buff *skb; | 1851 | struct sk_buff *skb = to, *tmp; |
1859 | unsigned int mss = tcp_current_mss(sk, 0); | 1852 | int first = 1; |
1860 | u32 prior_lost = tp->lost_out; | ||
1861 | 1853 | ||
1862 | tcp_for_write_queue(skb, sk) { | 1854 | if (!sysctl_tcp_retrans_collapse) |
1863 | if (skb == tcp_send_head(sk)) | 1855 | return; |
1856 | if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) | ||
1857 | return; | ||
1858 | |||
1859 | tcp_for_write_queue_from_safe(skb, tmp, sk) { | ||
1860 | if (!tcp_can_collapse(sk, skb)) | ||
1864 | break; | 1861 | break; |
1865 | if (skb->len > mss && | ||
1866 | !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { | ||
1867 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { | ||
1868 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | ||
1869 | tp->retrans_out -= tcp_skb_pcount(skb); | ||
1870 | } | ||
1871 | tcp_skb_mark_lost_uncond_verify(tp, skb); | ||
1872 | } | ||
1873 | } | ||
1874 | 1862 | ||
1875 | tcp_clear_retrans_hints_partial(tp); | 1863 | space -= skb->len; |
1876 | 1864 | ||
1877 | if (prior_lost == tp->lost_out) | 1865 | if (first) { |
1878 | return; | 1866 | first = 0; |
1867 | continue; | ||
1868 | } | ||
1879 | 1869 | ||
1880 | if (tcp_is_reno(tp)) | 1870 | if (space < 0) |
1881 | tcp_limit_reno_sacked(tp); | 1871 | break; |
1872 | /* Punt if not enough space exists in the first SKB for | ||
1873 | * the data in the second | ||
1874 | */ | ||
1875 | if (skb->len > skb_tailroom(to)) | ||
1876 | break; | ||
1882 | 1877 | ||
1883 | tcp_verify_left_out(tp); | 1878 | if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) |
1879 | break; | ||
1884 | 1880 | ||
1885 | /* Don't muck with the congestion window here. | 1881 | tcp_collapse_retrans(sk, to); |
1886 | * Reason is that we do not increase amount of _data_ | ||
1887 | * in network, but units changed and effective | ||
1888 | * cwnd/ssthresh really reduced now. | ||
1889 | */ | ||
1890 | if (icsk->icsk_ca_state != TCP_CA_Loss) { | ||
1891 | tp->high_seq = tp->snd_nxt; | ||
1892 | tp->snd_ssthresh = tcp_current_ssthresh(sk); | ||
1893 | tp->prior_ssthresh = 0; | ||
1894 | tp->undo_marker = 0; | ||
1895 | tcp_set_ca_state(sk, TCP_CA_Loss); | ||
1896 | } | 1882 | } |
1897 | tcp_xmit_retransmit_queue(sk); | ||
1898 | } | 1883 | } |
1899 | 1884 | ||
1900 | /* This retransmits one SKB. Policy decisions and retransmit queue | 1885 | /* This retransmits one SKB. Policy decisions and retransmit queue |
@@ -1946,17 +1931,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1946 | return -ENOMEM; /* We'll try again later. */ | 1931 | return -ENOMEM; /* We'll try again later. */ |
1947 | } | 1932 | } |
1948 | 1933 | ||
1949 | /* Collapse two adjacent packets if worthwhile and we can. */ | 1934 | tcp_retrans_try_collapse(sk, skb, cur_mss); |
1950 | if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && | ||
1951 | (skb->len < (cur_mss >> 1)) && | ||
1952 | (!tcp_skb_is_last(sk, skb)) && | ||
1953 | (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && | ||
1954 | (skb_shinfo(skb)->nr_frags == 0 && | ||
1955 | skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && | ||
1956 | (tcp_skb_pcount(skb) == 1 && | ||
1957 | tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && | ||
1958 | (sysctl_tcp_retrans_collapse != 0)) | ||
1959 | tcp_retrans_try_collapse(sk, skb, cur_mss); | ||
1960 | 1935 | ||
1961 | /* Some Solaris stacks overoptimize and ignore the FIN on a | 1936 | /* Some Solaris stacks overoptimize and ignore the FIN on a |
1962 | * retransmit when old data is attached. So strip it off | 1937 | * retransmit when old data is attached. So strip it off |
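
[Note] The retransmit-collapse rework above replaces the old one-shot pair merge, guarded by the nine-way condition deleted from tcp_retransmit_skb(), with a reusable gate (tcp_can_collapse()) plus a budgeted walk: followers keep getting folded into the head skb until `space` (cur_mss at the call site) runs out. tcp_simple_retransmit() is removed from this file; in the upstream series it appears to move to tcp_input.c rather than being dropped (an assumption worth verifying against the full patch). A toy model of the budget accounting, with the tailroom and window guards elided (illustrative only; collapse_budget is not a kernel function):

/* With space = 1460 and candidate lengths {700, 400, 500}:
 *   1460 - 700 = 760   head skb: charged, never collapsed into itself
 *    760 - 400 = 360   second skb folded into the head
 *    360 - 500 = -140  budget exhausted, walk stops
 */
static int collapse_budget(int space, const int *len, int n)
{
	int i, collapsed = 0;

	for (i = 0; i < n; i++) {
		space -= len[i];
		if (i == 0)		/* "first" in the kernel loop */
			continue;
		if (space < 0)
			break;
		collapsed++;		/* tcp_collapse_retrans() here */
	}
	return collapsed;
}
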
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3df339e3e363..cc4e6d27dedc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -65,7 +65,7 @@ static void tcp_write_err(struct sock *sk) | |||
65 | static int tcp_out_of_resources(struct sock *sk, int do_reset) | 65 | static int tcp_out_of_resources(struct sock *sk, int do_reset) |
66 | { | 66 | { |
67 | struct tcp_sock *tp = tcp_sk(sk); | 67 | struct tcp_sock *tp = tcp_sk(sk); |
68 | int orphans = atomic_read(&tcp_orphan_count); | 68 | int orphans = percpu_counter_read_positive(&tcp_orphan_count); |
69 | 69 | ||
70 | /* If peer does not open window for long time, or did not transmit | 70 | /* If peer does not open window for long time, or did not transmit |
71 | * anything for long time, penalize it. */ | 71 | * anything for long time, penalize it. */ |
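
[Note] Elsewhere in this series tcp_orphan_count becomes a struct percpu_counter, so the read side above switches to percpu_counter_read_positive(): the unsummed per-cpu deltas can make the raw value transiently negative, and a clamped approximate read is all this heuristic needs. The pattern, as a self-contained sketch against the 2.6.28-era API (percpu_counter_init() takes no GFP argument yet; names here are illustrative):

#include <linux/percpu_counter.h>

static struct percpu_counter orphans;

static int __init orphans_setup(void)
{
	return percpu_counter_init(&orphans, 0);
}

static void sock_becomes_orphan(void)
{
	percpu_counter_inc(&orphans);	/* per-cpu, no shared cacheline */
}

static int too_many_orphans(int limit)
{
	/* approximate, clamped read; an exact sum needs
	 * percpu_counter_sum(), which is far more expensive */
	return percpu_counter_read_positive(&orphans) > limit;
}
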
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7e4d9c871153..cf5ab0581eba 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, | |||
127 | const struct sock *sk2)) | 127 | const struct sock *sk2)) |
128 | { | 128 | { |
129 | struct sock *sk2; | 129 | struct sock *sk2; |
130 | struct hlist_node *node; | 130 | struct hlist_nulls_node *node; |
131 | 131 | ||
132 | sk_for_each(sk2, node, &hslot->head) | 132 | sk_nulls_for_each(sk2, node, &hslot->head) |
133 | if (net_eq(sock_net(sk2), net) && | 133 | if (net_eq(sock_net(sk2), net) && |
134 | sk2 != sk && | 134 | sk2 != sk && |
135 | sk2->sk_hash == num && | 135 | sk2->sk_hash == num && |
@@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
189 | inet_sk(sk)->num = snum; | 189 | inet_sk(sk)->num = snum; |
190 | sk->sk_hash = snum; | 190 | sk->sk_hash = snum; |
191 | if (sk_unhashed(sk)) { | 191 | if (sk_unhashed(sk)) { |
192 | /* | 192 | sk_nulls_add_node_rcu(sk, &hslot->head); |
193 | * We need that previous write to sk->sk_hash committed | ||
194 | * before write to sk->next done in following add_node() variant | ||
195 | */ | ||
196 | smp_wmb(); | ||
197 | sk_add_node_rcu(sk, &hslot->head); | ||
198 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 193 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
199 | } | 194 | } |
200 | error = 0; | 195 | error = 0; |
@@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, | |||
261 | int dif, struct udp_table *udptable) | 256 | int dif, struct udp_table *udptable) |
262 | { | 257 | { |
263 | struct sock *sk, *result; | 258 | struct sock *sk, *result; |
264 | struct hlist_node *node, *next; | 259 | struct hlist_nulls_node *node; |
265 | unsigned short hnum = ntohs(dport); | 260 | unsigned short hnum = ntohs(dport); |
266 | unsigned int hash = udp_hashfn(net, hnum); | 261 | unsigned int hash = udp_hashfn(net, hnum); |
267 | struct udp_hslot *hslot = &udptable->hash[hash]; | 262 | struct udp_hslot *hslot = &udptable->hash[hash]; |
@@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, | |||
271 | begin: | 266 | begin: |
272 | result = NULL; | 267 | result = NULL; |
273 | badness = -1; | 268 | badness = -1; |
274 | sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { | 269 | sk_nulls_for_each_rcu(sk, node, &hslot->head) { |
275 | /* | ||
276 | * lockless reader, and SLAB_DESTROY_BY_RCU items: | ||
277 | * We must check this item was not moved to another chain | ||
278 | */ | ||
279 | if (udp_hashfn(net, sk->sk_hash) != hash) | ||
280 | goto begin; | ||
281 | score = compute_score(sk, net, saddr, hnum, sport, | 270 | score = compute_score(sk, net, saddr, hnum, sport, |
282 | daddr, dport, dif); | 271 | daddr, dport, dif); |
283 | if (score > badness) { | 272 | if (score > badness) { |
@@ -285,6 +274,14 @@ begin: | |||
285 | badness = score; | 274 | badness = score; |
286 | } | 275 | } |
287 | } | 276 | } |
277 | /* | ||
278 | * if the nulls value we got at the end of this lookup is | ||
279 | * not the expected one, we must restart lookup. | ||
280 | * We probably met an item that was moved to another chain. | ||
281 | */ | ||
282 | if (get_nulls_value(node) != hash) | ||
283 | goto begin; | ||
284 | |||
288 | if (result) { | 285 | if (result) { |
289 | if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) | 286 | if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) |
290 | result = NULL; | 287 | result = NULL; |
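
[Note] The restart test moves out of the loop above because, with SLAB_DESTROY_BY_RCU, a socket can be freed and recycled into a different chain while a lockless reader is walking it; the nulls terminator encodes which bucket a chain belongs to, so a single end-of-walk comparison detects that the walk strayed. Reader-side shape of the pattern (illustrative sketch; match() stands in for compute_score(), and this toy keeps the last match rather than the best-scored one):

static struct sock *nulls_lookup(struct udp_hslot *hslot, unsigned int hash,
				 int (*match)(const struct sock *sk))
{
	struct hlist_nulls_node *node;
	struct sock *sk, *result;

	rcu_read_lock();
begin:
	result = NULL;
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		if (match(sk))
			result = sk;
	}
	/* the terminator names the chain we ended in; a recycled item
	 * can route the walk into a foreign bucket, so re-walk */
	if (get_nulls_value(node) != hash)
		goto begin;
	if (result && unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
		result = NULL;	/* object was being freed; treat as miss */
	rcu_read_unlock();
	return result;
}
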
@@ -320,19 +317,20 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, | |||
320 | } | 317 | } |
321 | EXPORT_SYMBOL_GPL(udp4_lib_lookup); | 318 | EXPORT_SYMBOL_GPL(udp4_lib_lookup); |
322 | 319 | ||
323 | static inline struct sock *udp_v4_mcast_next(struct sock *sk, | 320 | static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, |
324 | __be16 loc_port, __be32 loc_addr, | 321 | __be16 loc_port, __be32 loc_addr, |
325 | __be16 rmt_port, __be32 rmt_addr, | 322 | __be16 rmt_port, __be32 rmt_addr, |
326 | int dif) | 323 | int dif) |
327 | { | 324 | { |
328 | struct hlist_node *node; | 325 | struct hlist_nulls_node *node; |
329 | struct sock *s = sk; | 326 | struct sock *s = sk; |
330 | unsigned short hnum = ntohs(loc_port); | 327 | unsigned short hnum = ntohs(loc_port); |
331 | 328 | ||
332 | sk_for_each_from(s, node) { | 329 | sk_nulls_for_each_from(s, node) { |
333 | struct inet_sock *inet = inet_sk(s); | 330 | struct inet_sock *inet = inet_sk(s); |
334 | 331 | ||
335 | if (s->sk_hash != hnum || | 332 | if (!net_eq(sock_net(s), net) || |
333 | s->sk_hash != hnum || | ||
336 | (inet->daddr && inet->daddr != rmt_addr) || | 334 | (inet->daddr && inet->daddr != rmt_addr) || |
337 | (inet->dport != rmt_port && inet->dport) || | 335 | (inet->dport != rmt_port && inet->dport) || |
338 | (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || | 336 | (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || |
@@ -668,6 +666,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
668 | .saddr = saddr, | 666 | .saddr = saddr, |
669 | .tos = tos } }, | 667 | .tos = tos } }, |
670 | .proto = sk->sk_protocol, | 668 | .proto = sk->sk_protocol, |
669 | .flags = inet_sk_flowi_flags(sk), | ||
671 | .uli_u = { .ports = | 670 | .uli_u = { .ports = |
672 | { .sport = inet->sport, | 671 | { .sport = inet->sport, |
673 | .dport = dport } } }; | 672 | .dport = dport } } }; |
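
[Note] The UDP flow key now carries per-socket flow flags, so transparent-proxy sockets keep their non-local source address through the routing lookup. From memory, the helper is essentially the following (sketch; verify against include/net/inet_sock.h of this era):

static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
	/* transparent sockets may send from a foreign source address */
	return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
}
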
@@ -720,7 +719,7 @@ do_append_data: | |||
720 | up->len += ulen; | 719 | up->len += ulen; |
721 | getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; | 720 | getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; |
722 | err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, | 721 | err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, |
723 | sizeof(struct udphdr), &ipc, rt, | 722 | sizeof(struct udphdr), &ipc, &rt, |
724 | corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); | 723 | corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); |
725 | if (err) | 724 | if (err) |
726 | udp_flush_pending_frames(sk); | 725 | udp_flush_pending_frames(sk); |
@@ -971,16 +970,18 @@ int udp_disconnect(struct sock *sk, int flags) | |||
971 | 970 | ||
972 | void udp_lib_unhash(struct sock *sk) | 971 | void udp_lib_unhash(struct sock *sk) |
973 | { | 972 | { |
974 | struct udp_table *udptable = sk->sk_prot->h.udp_table; | 973 | if (sk_hashed(sk)) { |
975 | unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); | 974 | struct udp_table *udptable = sk->sk_prot->h.udp_table; |
976 | struct udp_hslot *hslot = &udptable->hash[hash]; | 975 | unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); |
976 | struct udp_hslot *hslot = &udptable->hash[hash]; | ||
977 | 977 | ||
978 | spin_lock_bh(&hslot->lock); | 978 | spin_lock_bh(&hslot->lock); |
979 | if (sk_del_node_init_rcu(sk)) { | 979 | if (sk_nulls_del_node_init_rcu(sk)) { |
980 | inet_sk(sk)->num = 0; | 980 | inet_sk(sk)->num = 0; |
981 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 981 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
982 | } | ||
983 | spin_unlock_bh(&hslot->lock); | ||
982 | } | 984 | } |
983 | spin_unlock_bh(&hslot->lock); | ||
984 | } | 985 | } |
985 | EXPORT_SYMBOL(udp_lib_unhash); | 986 | EXPORT_SYMBOL(udp_lib_unhash); |
986 | 987 | ||
@@ -1129,17 +1130,18 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, | |||
1129 | int dif; | 1130 | int dif; |
1130 | 1131 | ||
1131 | spin_lock(&hslot->lock); | 1132 | spin_lock(&hslot->lock); |
1132 | sk = sk_head(&hslot->head); | 1133 | sk = sk_nulls_head(&hslot->head); |
1133 | dif = skb->dev->ifindex; | 1134 | dif = skb->dev->ifindex; |
1134 | sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); | 1135 | sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); |
1135 | if (sk) { | 1136 | if (sk) { |
1136 | struct sock *sknext = NULL; | 1137 | struct sock *sknext = NULL; |
1137 | 1138 | ||
1138 | do { | 1139 | do { |
1139 | struct sk_buff *skb1 = skb; | 1140 | struct sk_buff *skb1 = skb; |
1140 | 1141 | ||
1141 | sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, | 1142 | sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, |
1142 | uh->source, saddr, dif); | 1143 | daddr, uh->source, saddr, |
1144 | dif); | ||
1143 | if (sknext) | 1145 | if (sknext) |
1144 | skb1 = skb_clone(skb, GFP_ATOMIC); | 1146 | skb1 = skb_clone(skb, GFP_ATOMIC); |
1145 | 1147 | ||
@@ -1558,10 +1560,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) | |||
1558 | struct net *net = seq_file_net(seq); | 1560 | struct net *net = seq_file_net(seq); |
1559 | 1561 | ||
1560 | for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { | 1562 | for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { |
1561 | struct hlist_node *node; | 1563 | struct hlist_nulls_node *node; |
1562 | struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; | 1564 | struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; |
1563 | spin_lock_bh(&hslot->lock); | 1565 | spin_lock_bh(&hslot->lock); |
1564 | sk_for_each(sk, node, &hslot->head) { | 1566 | sk_nulls_for_each(sk, node, &hslot->head) { |
1565 | if (!net_eq(sock_net(sk), net)) | 1567 | if (!net_eq(sock_net(sk), net)) |
1566 | continue; | 1568 | continue; |
1567 | if (sk->sk_family == state->family) | 1569 | if (sk->sk_family == state->family) |
@@ -1580,7 +1582,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) | |||
1580 | struct net *net = seq_file_net(seq); | 1582 | struct net *net = seq_file_net(seq); |
1581 | 1583 | ||
1582 | do { | 1584 | do { |
1583 | sk = sk_next(sk); | 1585 | sk = sk_nulls_next(sk); |
1584 | } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); | 1586 | } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); |
1585 | 1587 | ||
1586 | if (!sk) { | 1588 | if (!sk) { |
@@ -1751,7 +1753,7 @@ void __init udp_table_init(struct udp_table *table) | |||
1751 | int i; | 1753 | int i; |
1752 | 1754 | ||
1753 | for (i = 0; i < UDP_HTABLE_SIZE; i++) { | 1755 | for (i = 0; i < UDP_HTABLE_SIZE; i++) { |
1754 | INIT_HLIST_HEAD(&table->hash[i].head); | 1756 | INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); |
1755 | spin_lock_init(&table->hash[i].lock); | 1757 | spin_lock_init(&table->hash[i].lock); |
1756 | } | 1758 | } |
1757 | } | 1759 | } |
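
[Note] INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i) is what makes the end-of-chain test in the lookup work: the terminator is not NULL but an odd-tagged word carrying the bucket index. It also explains the smp_wmb() deleted from udp_lib_get_port(): sk_nulls_add_node_rcu() publishes through rcu_assign_pointer(), which already orders the sk_hash store before the list insertion. Sketch of the encoding, per include/linux/list_nulls.h of this era (DEMO_NULLS_MARKER mirrors the header's NULLS_MARKER; an assumption, check the header):

#include <linux/list_nulls.h>

/* a "nulls" terminator: low bit set, payload in the upper bits */
#define DEMO_NULLS_MARKER(value) (1UL | (((long)value) << 1))

static void nulls_head_demo(void)
{
	struct hlist_nulls_head bucket;

	INIT_HLIST_NULLS_HEAD(&bucket, 5);
	/* bucket.first == (void *)DEMO_NULLS_MARKER(5)	*/
	/* is_a_nulls(bucket.first)      -> true	*/
	/* get_nulls_value(bucket.first) -> 5		*/
}
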
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index f9a775b7e796..2ad24ba31f9d 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
@@ -18,7 +18,8 @@ | |||
18 | static struct dst_ops xfrm4_dst_ops; | 18 | static struct dst_ops xfrm4_dst_ops; |
19 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; | 19 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; |
20 | 20 | ||
21 | static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, | 21 | static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, |
22 | xfrm_address_t *saddr, | ||
22 | xfrm_address_t *daddr) | 23 | xfrm_address_t *daddr) |
23 | { | 24 | { |
24 | struct flowi fl = { | 25 | struct flowi fl = { |
@@ -36,19 +37,20 @@ static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, | |||
36 | if (saddr) | 37 | if (saddr) |
37 | fl.fl4_src = saddr->a4; | 38 | fl.fl4_src = saddr->a4; |
38 | 39 | ||
39 | err = __ip_route_output_key(&init_net, &rt, &fl); | 40 | err = __ip_route_output_key(net, &rt, &fl); |
40 | dst = &rt->u.dst; | 41 | dst = &rt->u.dst; |
41 | if (err) | 42 | if (err) |
42 | dst = ERR_PTR(err); | 43 | dst = ERR_PTR(err); |
43 | return dst; | 44 | return dst; |
44 | } | 45 | } |
45 | 46 | ||
46 | static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) | 47 | static int xfrm4_get_saddr(struct net *net, |
48 | xfrm_address_t *saddr, xfrm_address_t *daddr) | ||
47 | { | 49 | { |
48 | struct dst_entry *dst; | 50 | struct dst_entry *dst; |
49 | struct rtable *rt; | 51 | struct rtable *rt; |
50 | 52 | ||
51 | dst = xfrm4_dst_lookup(0, NULL, daddr); | 53 | dst = xfrm4_dst_lookup(net, 0, NULL, daddr); |
52 | if (IS_ERR(dst)) | 54 | if (IS_ERR(dst)) |
53 | return -EHOSTUNREACH; | 55 | return -EHOSTUNREACH; |
54 | 56 | ||
@@ -187,7 +189,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | |||
187 | 189 | ||
188 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) | 190 | static inline int xfrm4_garbage_collect(struct dst_ops *ops) |
189 | { | 191 | { |
190 | xfrm4_policy_afinfo.garbage_collect(); | 192 | xfrm4_policy_afinfo.garbage_collect(&init_net); |
191 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); | 193 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); |
192 | } | 194 | } |
193 | 195 | ||
@@ -246,7 +248,6 @@ static struct dst_ops xfrm4_dst_ops = { | |||
246 | .ifdown = xfrm4_dst_ifdown, | 248 | .ifdown = xfrm4_dst_ifdown, |
247 | .local_out = __ip_local_out, | 249 | .local_out = __ip_local_out, |
248 | .gc_thresh = 1024, | 250 | .gc_thresh = 1024, |
249 | .entry_size = sizeof(struct xfrm_dst), | ||
250 | .entries = ATOMIC_INIT(0), | 251 | .entries = ATOMIC_INIT(0), |
251 | }; | 252 | }; |
252 | 253 | ||
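
[Note] The AF_INET dst_lookup/get_saddr callbacks gain a struct net argument so bundles resolve routes in the caller's namespace; only xfrm4_garbage_collect() still hardcodes &init_net here, presumably because dst_ops carries no netns handle at this point in the series. A per-namespace route lookup in this era's API boils down to the following (sketch, error handling trimmed; route_in_ns is illustrative):

static struct rtable *route_in_ns(struct net *net, __be32 daddr, int tos)
{
	struct rtable *rt;
	struct flowi fl = {
		.nl_u = {
			.ip4_u = {
				.daddr = daddr,
				.tos = tos,
			},
		},
	};

	/* resolves against net's FIB, not init_net's */
	if (__ip_route_output_key(net, &rt, &fl))
		return NULL;
	return rt;
}
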
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 07735ed280d7..1ef1366a0a03 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
@@ -13,8 +13,6 @@ | |||
13 | #include <linux/ipsec.h> | 13 | #include <linux/ipsec.h> |
14 | #include <linux/netfilter_ipv4.h> | 14 | #include <linux/netfilter_ipv4.h> |
15 | 15 | ||
16 | static struct xfrm_state_afinfo xfrm4_state_afinfo; | ||
17 | |||
18 | static int xfrm4_init_flags(struct xfrm_state *x) | 16 | static int xfrm4_init_flags(struct xfrm_state *x) |
19 | { | 17 | { |
20 | if (ipv4_config.no_pmtu_disc) | 18 | if (ipv4_config.no_pmtu_disc) |
@@ -33,6 +31,7 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, | |||
33 | x->sel.dport_mask = htons(0xffff); | 31 | x->sel.dport_mask = htons(0xffff); |
34 | x->sel.sport = xfrm_flowi_sport(fl); | 32 | x->sel.sport = xfrm_flowi_sport(fl); |
35 | x->sel.sport_mask = htons(0xffff); | 33 | x->sel.sport_mask = htons(0xffff); |
34 | x->sel.family = AF_INET; | ||
36 | x->sel.prefixlen_d = 32; | 35 | x->sel.prefixlen_d = 32; |
37 | x->sel.prefixlen_s = 32; | 36 | x->sel.prefixlen_s = 32; |
38 | x->sel.proto = fl->proto; | 37 | x->sel.proto = fl->proto; |
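
[Note] Recording x->sel.family in __xfrm4_init_tempsel makes the temporary selector self-describing: the address and prefix fields in a selector are family-ambiguous on their own, and every consumer (matching, userspace dumps) interprets them through a family. The upstream matcher dispatches roughly like this (paraphrase of xfrm_selector_match(), offered as an assumption rather than verbatim tree code):

int selector_match(struct xfrm_selector *sel, struct flowi *fl,
		   unsigned short family)
{
	switch (family) {
	case AF_INET:
		return __xfrm4_selector_match(sel, fl);	/* v4 compare */
	case AF_INET6:
		return __xfrm6_selector_match(sel, fl);	/* v6 compare */
	}
	return 0;	/* an unknown family never matches */
}
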