Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c               |  10
-rw-r--r--  net/ipv4/ah4.c                   |   6
-rw-r--r--  net/ipv4/arp.c                   |  22
-rw-r--r--  net/ipv4/esp4.c                  |   4
-rw-r--r--  net/ipv4/icmp.c                  |  12
-rw-r--r--  net/ipv4/inet_connection_sock.c  |   8
-rw-r--r--  net/ipv4/inet_diag.c             |  31
-rw-r--r--  net/ipv4/inet_hashtables.c       | 277
-rw-r--r--  net/ipv4/inet_lro.c              |   4
-rw-r--r--  net/ipv4/inet_timewait_sock.c    |  48
-rw-r--r--  net/ipv4/ip_gre.c                |  44
-rw-r--r--  net/ipv4/ip_input.c              |  10
-rw-r--r--  net/ipv4/ip_output.c             |  18
-rw-r--r--  net/ipv4/ip_sockglue.c           |  40
-rw-r--r--  net/ipv4/ipcomp.c                |   6
-rw-r--r--  net/ipv4/ipip.c                  |  35
-rw-r--r--  net/ipv4/ipmr.c                  |  40
-rw-r--r--  net/ipv4/netfilter.c             |   4
-rw-r--r--  net/ipv4/proc.c                  |  66
-rw-r--r--  net/ipv4/raw.c                   |   2
-rw-r--r--  net/ipv4/route.c                 |   4
-rw-r--r--  net/ipv4/tcp.c                   |  27
-rw-r--r--  net/ipv4/tcp_diag.c              |   2
-rw-r--r--  net/ipv4/tcp_htcp.c              |  14
-rw-r--r--  net/ipv4/tcp_input.c             | 388
-rw-r--r--  net/ipv4/tcp_ipv4.c              |  73
-rw-r--r--  net/ipv4/tcp_output.c            | 129
-rw-r--r--  net/ipv4/tcp_timer.c             |   2
-rw-r--r--  net/ipv4/udp.c                   |  76
-rw-r--r--  net/ipv4/xfrm4_policy.c          |  13
-rw-r--r--  net/ipv4/xfrm4_state.c           |   3
31 files changed, 919 insertions(+), 499 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e3286814c8d9..fe03048c130d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -245,7 +245,7 @@ static inline int inet_netns_ok(struct net *net, int protocol)
     int hash;
     struct net_protocol *ipprot;
 
-    if (net == &init_net)
+    if (net_eq(net, &init_net))
         return 1;
 
     hash = protocol & (MAX_INET_PROTOS - 1);
@@ -272,10 +272,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol)
     int try_loading_module = 0;
     int err;
 
-    if (sock->type != SOCK_RAW &&
-        sock->type != SOCK_DGRAM &&
-        !inet_ehash_secret)
-        build_ehash_secret();
+    if (unlikely(!inet_ehash_secret))
+        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+            build_ehash_secret();
 
     sock->state = SS_UNCONNECTED;
 
@@ -1114,6 +1113,7 @@ int inet_sk_rebuild_header(struct sock *sk)
                 },
             },
             .proto = sk->sk_protocol,
+            .flags = inet_sk_flowi_flags(sk),
             .uli_u = {
                 .ports = {
                     .sport = inet->sport,
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 3f205181712d..e878e494296e 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,6 +201,7 @@ out:
 
 static void ah4_err(struct sk_buff *skb, u32 info)
 {
+    struct net *net = dev_net(skb->dev);
     struct iphdr *iph = (struct iphdr *)skb->data;
     struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
     struct xfrm_state *x;
@@ -209,7 +210,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
         icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
         return;
 
-    x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+    x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
     if (!x)
         return;
     printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -293,9 +294,7 @@ static void ah_destroy(struct xfrm_state *x)
         return;
 
     kfree(ahp->work_icv);
-    ahp->work_icv = NULL;
     crypto_free_hash(ahp->tfm);
-    ahp->tfm = NULL;
     kfree(ahp);
 }
 
@@ -316,6 +315,7 @@ static struct net_protocol ah4_protocol = {
     .handler     = xfrm4_rcv,
     .err_handler = ah4_err,
     .no_policy   = 1,
+    .netns_ok    = 1,
 };
 
 static int __init ah4_init(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 957c87dc8e16..29a74c01d8de 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -818,18 +818,18 @@ static int arp_process(struct sk_buff *skb)
         addr_type = rt->rt_type;
 
         if (addr_type == RTN_LOCAL) {
-            n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
-            if (n) {
-                int dont_send = 0;
-
-                if (!dont_send)
-                    dont_send |= arp_ignore(in_dev, sip, tip);
-                if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-                    dont_send |= arp_filter(sip, tip, dev);
-                if (!dont_send)
-                    arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+            int dont_send = 0;
 
-                neigh_release(n);
+            if (!dont_send)
+                dont_send |= arp_ignore(in_dev,sip,tip);
+            if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+                dont_send |= arp_filter(sip,tip,dev);
+            if (!dont_send) {
+                n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+                if (n) {
+                    arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+                    neigh_release(n);
+                }
             }
             goto out;
         } else if (IN_DEV_FORWARD(in_dev)) {
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 95a9c65003f8..18bb383ea393 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -413,6 +413,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 
 static void esp4_err(struct sk_buff *skb, u32 info)
 {
+    struct net *net = dev_net(skb->dev);
     struct iphdr *iph = (struct iphdr *)skb->data;
     struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
     struct xfrm_state *x;
@@ -421,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
         icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
         return;
 
-    x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+    x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
     if (!x)
         return;
     NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -618,6 +619,7 @@ static struct net_protocol esp4_protocol = {
     .handler     = xfrm4_rcv,
     .err_handler = esp4_err,
     .no_policy   = 1,
+    .netns_ok    = 1,
 };
 
 static int __init esp4_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 21e497efbd7f..705b33b184a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -321,12 +321,12 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
 }
 
 static void icmp_push_reply(struct icmp_bxm *icmp_param,
-                struct ipcm_cookie *ipc, struct rtable *rt)
+                struct ipcm_cookie *ipc, struct rtable **rt)
 {
     struct sock *sk;
     struct sk_buff *skb;
 
-    sk = icmp_sk(dev_net(rt->u.dst.dev));
+    sk = icmp_sk(dev_net((*rt)->u.dst.dev));
     if (ip_append_data(sk, icmp_glue_bits, icmp_param,
                icmp_param->data_len+icmp_param->head_len,
                icmp_param->head_len,
@@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
     }
     if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
                    icmp_param->data.icmph.code))
-        icmp_push_reply(icmp_param, &ipc, rt);
+        icmp_push_reply(icmp_param, &ipc, &rt);
     ip_rt_put(rt);
 out_unlock:
     icmp_xmit_unlock(sk);
@@ -562,7 +562,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
         /* No need to clone since we're just using its address. */
         rt2 = rt;
 
-        err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0);
+        err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
         switch (err) {
         case 0:
             if (rt != rt2)
@@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
         if (err)
             goto relookup_failed;
 
-        err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL,
+        err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
                   XFRM_LOOKUP_ICMP);
         switch (err) {
         case 0:
@@ -635,7 +635,7 @@ route_done:
     icmp_param.data_len = room;
     icmp_param.head_len = sizeof(struct icmphdr);
 
-    icmp_push_reply(&icmp_param, &ipc, rt);
+    icmp_push_reply(&icmp_param, &ipc, &rt);
 ende:
     ip_rt_put(rt);
 out_unlock:
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 36f4cbc7da3a..1ccdbba528be 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -109,7 +109,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
                     hashinfo->bhash_size)];
             spin_lock(&head->lock);
             inet_bind_bucket_for_each(tb, node, &head->chain)
-                if (tb->ib_net == net && tb->port == rover)
+                if (ib_net(tb) == net && tb->port == rover)
                     goto next;
             break;
         next:
@@ -137,7 +137,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
                 hashinfo->bhash_size)];
         spin_lock(&head->lock);
         inet_bind_bucket_for_each(tb, node, &head->chain)
-            if (tb->ib_net == net && tb->port == snum)
+            if (ib_net(tb) == net && tb->port == snum)
                 goto tb_found;
     }
     tb = NULL;
@@ -561,7 +561,7 @@ void inet_csk_destroy_sock(struct sock *sk)
 
     sk_refcnt_debug_release(sk);
 
-    atomic_dec(sk->sk_prot->orphan_count);
+    percpu_counter_dec(sk->sk_prot->orphan_count);
     sock_put(sk);
 }
 
@@ -641,7 +641,7 @@ void inet_csk_listen_stop(struct sock *sk)
 
         sock_orphan(child);
 
-        atomic_inc(sk->sk_prot->orphan_count);
+        percpu_counter_inc(sk->sk_prot->orphan_count);
 
         inet_csk_destroy_sock(child);
 
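
Note: the two orphan_count hunks above are part of a wider move from one shared atomic_t to a percpu_counter, which keeps the hot inc/dec paths cache-local. The following is a minimal, illustrative sketch of that percpu_counter pattern, not code from this patch; the names (my_orphan_count, my_init, my_exit) are placeholders.

    #include <linux/percpu_counter.h>

    static struct percpu_counter my_orphan_count;

    static int __init my_init(void)
    {
        /* second argument is the counter's initial value */
        return percpu_counter_init(&my_orphan_count, 0);
    }

    static void my_track_orphan(int add)
    {
        if (add)
            percpu_counter_inc(&my_orphan_count);  /* cheap per-cpu update */
        else
            percpu_counter_dec(&my_orphan_count);
    }

    static long my_read_orphans(void)
    {
        /* folds the per-cpu deltas into one (slightly approximate) total */
        return percpu_counter_sum_positive(&my_orphan_count);
    }

    static void __exit my_exit(void)
    {
        percpu_counter_destroy(&my_orphan_count);
    }
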
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 564230dabcb8..588a7796e3e3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
         if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
             goto skip_listen_ht;
 
-        inet_listen_lock(hashinfo);
         for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
             struct sock *sk;
-            struct hlist_node *node;
+            struct hlist_nulls_node *node;
+            struct inet_listen_hashbucket *ilb;
 
             num = 0;
-            sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+            ilb = &hashinfo->listening_hash[i];
+            spin_lock_bh(&ilb->lock);
+            sk_nulls_for_each(sk, node, &ilb->head) {
                 struct inet_sock *inet = inet_sk(sk);
 
                 if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
                     goto syn_recv;
 
                 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-                    inet_listen_unlock(hashinfo);
+                    spin_unlock_bh(&ilb->lock);
                     goto done;
                 }
 
@@ -751,7 +753,7 @@ syn_recv:
                     goto next_listen;
 
                 if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
-                    inet_listen_unlock(hashinfo);
+                    spin_unlock_bh(&ilb->lock);
                     goto done;
                 }
 
@@ -760,12 +762,12 @@ next_listen:
                 cb->args[4] = 0;
                 ++num;
             }
+            spin_unlock_bh(&ilb->lock);
 
             s_num = 0;
             cb->args[3] = 0;
             cb->args[4] = 0;
         }
-        inet_listen_unlock(hashinfo);
 skip_listen_ht:
         cb->args[0] = 1;
         s_i = num = s_num = 0;
@@ -776,20 +778,21 @@ skip_listen_ht:
 
     for (i = s_i; i < hashinfo->ehash_size; i++) {
         struct inet_ehash_bucket *head = &hashinfo->ehash[i];
-        rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
+        spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
         struct sock *sk;
-        struct hlist_node *node;
+        struct hlist_nulls_node *node;
 
         num = 0;
 
-        if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+        if (hlist_nulls_empty(&head->chain) &&
+            hlist_nulls_empty(&head->twchain))
             continue;
 
         if (i > s_i)
             s_num = 0;
 
-        read_lock_bh(lock);
-        sk_for_each(sk, node, &head->chain) {
+        spin_lock_bh(lock);
+        sk_nulls_for_each(sk, node, &head->chain) {
             struct inet_sock *inet = inet_sk(sk);
 
             if (num < s_num)
@@ -803,7 +806,7 @@ skip_listen_ht:
                 r->id.idiag_dport)
                 goto next_normal;
             if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-                read_unlock_bh(lock);
+                spin_unlock_bh(lock);
                 goto done;
             }
 next_normal:
@@ -825,14 +828,14 @@ next_normal:
                     r->id.idiag_dport)
                     goto next_dying;
                 if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-                    read_unlock_bh(lock);
+                    spin_unlock_bh(lock);
                     goto done;
                 }
 next_dying:
                 ++num;
             }
         }
-        read_unlock_bh(lock);
+        spin_unlock_bh(lock);
     }
 
 done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 44981906fb91..6a1045da48d2 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -35,7 +35,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
     struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
     if (tb != NULL) {
-        tb->ib_net = hold_net(net);
+        write_pnet(&tb->ib_net, hold_net(net));
         tb->port = snum;
         tb->fastreuse = 0;
         INIT_HLIST_HEAD(&tb->owners);
@@ -51,7 +51,7 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
 {
     if (hlist_empty(&tb->owners)) {
         __hlist_del(&tb->node);
-        release_net(tb->ib_net);
+        release_net(ib_net(tb));
         kmem_cache_free(cachep, tb);
     }
 }
@@ -110,33 +110,29 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
 
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
-/*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
-    __acquires(hashinfo->lhash_lock)
+static inline int compute_score(struct sock *sk, struct net *net,
+                const unsigned short hnum, const __be32 daddr,
+                const int dif)
 {
-    write_lock(&hashinfo->lhash_lock);
-
-    if (atomic_read(&hashinfo->lhash_users)) {
-        DEFINE_WAIT(wait);
-
-        for (;;) {
-            prepare_to_wait_exclusive(&hashinfo->lhash_wait,
-                          &wait, TASK_UNINTERRUPTIBLE);
-            if (!atomic_read(&hashinfo->lhash_users))
-                break;
-            write_unlock_bh(&hashinfo->lhash_lock);
-            schedule();
-            write_lock_bh(&hashinfo->lhash_lock);
+    int score = -1;
+    struct inet_sock *inet = inet_sk(sk);
+
+    if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+            !ipv6_only_sock(sk)) {
+        __be32 rcv_saddr = inet->rcv_saddr;
+        score = sk->sk_family == PF_INET ? 1 : 0;
+        if (rcv_saddr) {
+            if (rcv_saddr != daddr)
+                return -1;
+            score += 2;
+        }
+        if (sk->sk_bound_dev_if) {
+            if (sk->sk_bound_dev_if != dif)
+                return -1;
+            score += 2;
         }
-
-        finish_wait(&hashinfo->lhash_wait, &wait);
     }
+    return score;
 }
 
 /*
@@ -145,72 +141,48 @@ void inet_listen_wlock(struct inet_hashinfo *hashinfo)
  * remote address for the connection. So always assume those are both
  * wildcarded during the search since they can never be otherwise.
  */
-static struct sock *inet_lookup_listener_slow(struct net *net,
-                          const struct hlist_head *head,
-                          const __be32 daddr,
-                          const unsigned short hnum,
-                          const int dif)
-{
-    struct sock *result = NULL, *sk;
-    const struct hlist_node *node;
-    int hiscore = -1;
-
-    sk_for_each(sk, node, head) {
-        const struct inet_sock *inet = inet_sk(sk);
-
-        if (net_eq(sock_net(sk), net) && inet->num == hnum &&
-            !ipv6_only_sock(sk)) {
-            const __be32 rcv_saddr = inet->rcv_saddr;
-            int score = sk->sk_family == PF_INET ? 1 : 0;
-
-            if (rcv_saddr) {
-                if (rcv_saddr != daddr)
-                    continue;
-                score += 2;
-            }
-            if (sk->sk_bound_dev_if) {
-                if (sk->sk_bound_dev_if != dif)
-                    continue;
-                score += 2;
-            }
-            if (score == 5)
-                return sk;
-            if (score > hiscore) {
-                hiscore = score;
-                result = sk;
-            }
-        }
-    }
-    return result;
-}
 
-/* Optimize the common listener case. */
+
 struct sock *__inet_lookup_listener(struct net *net,
                     struct inet_hashinfo *hashinfo,
                     const __be32 daddr, const unsigned short hnum,
                     const int dif)
 {
-    struct sock *sk = NULL;
-    const struct hlist_head *head;
-
-    read_lock(&hashinfo->lhash_lock);
-    head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-    if (!hlist_empty(head)) {
-        const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
-        if (inet->num == hnum && !sk->sk_node.next &&
-            (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-            (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-            !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
-            goto sherry_cache;
-        sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
+    struct sock *sk, *result;
+    struct hlist_nulls_node *node;
+    unsigned int hash = inet_lhashfn(net, hnum);
+    struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+    int score, hiscore;
+
+    rcu_read_lock();
+begin:
+    result = NULL;
+    hiscore = -1;
+    sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+        score = compute_score(sk, net, hnum, daddr, dif);
+        if (score > hiscore) {
+            result = sk;
+            hiscore = score;
+        }
     }
-    if (sk) {
-sherry_cache:
-        sock_hold(sk);
+    /*
+     * if the nulls value we got at the end of this lookup is
+     * not the expected one, we must restart lookup.
+     * We probably met an item that was moved to another chain.
+     */
+    if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+        goto begin;
+    if (result) {
+        if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+            result = NULL;
+        else if (unlikely(compute_score(result, net, hnum, daddr,
+                dif) < hiscore)) {
+            sock_put(result);
+            goto begin;
+        }
     }
-    read_unlock(&hashinfo->lhash_lock);
-    return sk;
+    rcu_read_unlock();
+    return result;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
 
@@ -223,35 +195,65 @@ struct sock * __inet_lookup_established(struct net *net,
     INET_ADDR_COOKIE(acookie, saddr, daddr)
     const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
     struct sock *sk;
-    const struct hlist_node *node;
+    const struct hlist_nulls_node *node;
     /* Optimize here for direct hit, only listening connections can
      * have wildcards anyways.
      */
     unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
-    struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-    rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+    unsigned int slot = hash & (hashinfo->ehash_size - 1);
+    struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-    prefetch(head->chain.first);
-    read_lock(lock);
-    sk_for_each(sk, node, &head->chain) {
+    rcu_read_lock();
+begin:
+    sk_nulls_for_each_rcu(sk, node, &head->chain) {
         if (INET_MATCH(sk, net, hash, acookie,
-                    saddr, daddr, ports, dif))
-            goto hit; /* You sunk my battleship! */
+                    saddr, daddr, ports, dif)) {
+            if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+                goto begintw;
+            if (unlikely(!INET_MATCH(sk, net, hash, acookie,
+                    saddr, daddr, ports, dif))) {
+                sock_put(sk);
+                goto begin;
+            }
+            goto out;
+        }
     }
+    /*
+     * if the nulls value we got at the end of this lookup is
+     * not the expected one, we must restart lookup.
+     * We probably met an item that was moved to another chain.
+     */
+    if (get_nulls_value(node) != slot)
+        goto begin;
 
+begintw:
     /* Must check for a TIME_WAIT'er before going to listener hash. */
-    sk_for_each(sk, node, &head->twchain) {
+    sk_nulls_for_each_rcu(sk, node, &head->twchain) {
         if (INET_TW_MATCH(sk, net, hash, acookie,
-                    saddr, daddr, ports, dif))
-            goto hit;
+                    saddr, daddr, ports, dif)) {
+            if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+                sk = NULL;
+                goto out;
+            }
+            if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
+                    saddr, daddr, ports, dif))) {
+                sock_put(sk);
+                goto begintw;
+            }
+            goto out;
+        }
     }
+    /*
+     * if the nulls value we got at the end of this lookup is
+     * not the expected one, we must restart lookup.
+     * We probably met an item that was moved to another chain.
+     */
+    if (get_nulls_value(node) != slot)
+        goto begintw;
     sk = NULL;
 out:
-    read_unlock(lock);
+    rcu_read_unlock();
     return sk;
-hit:
-    sock_hold(sk);
-    goto out;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
@@ -270,16 +272,15 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
     struct net *net = sock_net(sk);
     unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
     struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
-    rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
+    spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
     struct sock *sk2;
-    const struct hlist_node *node;
+    const struct hlist_nulls_node *node;
     struct inet_timewait_sock *tw;
 
-    prefetch(head->chain.first);
-    write_lock(lock);
+    spin_lock(lock);
 
     /* Check TIME-WAIT sockets first. */
-    sk_for_each(sk2, node, &head->twchain) {
+    sk_nulls_for_each(sk2, node, &head->twchain) {
         tw = inet_twsk(sk2);
 
         if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +294,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
     tw = NULL;
 
     /* And established part... */
-    sk_for_each(sk2, node, &head->chain) {
+    sk_nulls_for_each(sk2, node, &head->chain) {
         if (INET_MATCH(sk2, net, hash, acookie,
                     saddr, daddr, ports, dif))
             goto not_unique;
@@ -306,9 +307,9 @@ unique:
     inet->sport = htons(lport);
     sk->sk_hash = hash;
     WARN_ON(!sk_unhashed(sk));
-    __sk_add_node(sk, &head->chain);
+    __sk_nulls_add_node_rcu(sk, &head->chain);
+    spin_unlock(lock);
     sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-    write_unlock(lock);
 
     if (twp) {
         *twp = tw;
@@ -324,7 +325,7 @@ unique:
     return 0;
 
 not_unique:
-    write_unlock(lock);
+    spin_unlock(lock);
     return -EADDRNOTAVAIL;
 }
 
@@ -338,8 +339,8 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 void __inet_hash_nolisten(struct sock *sk)
 {
     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-    struct hlist_head *list;
-    rwlock_t *lock;
+    struct hlist_nulls_head *list;
+    spinlock_t *lock;
     struct inet_ehash_bucket *head;
 
     WARN_ON(!sk_unhashed(sk));
@@ -349,18 +350,17 @@ void __inet_hash_nolisten(struct sock *sk)
     list = &head->chain;
     lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
-    write_lock(lock);
-    __sk_add_node(sk, list);
+    spin_lock(lock);
+    __sk_nulls_add_node_rcu(sk, list);
+    spin_unlock(lock);
     sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-    write_unlock(lock);
 }
 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
 static void __inet_hash(struct sock *sk)
 {
     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-    struct hlist_head *list;
-    rwlock_t *lock;
+    struct inet_listen_hashbucket *ilb;
 
     if (sk->sk_state != TCP_LISTEN) {
         __inet_hash_nolisten(sk);
@@ -368,14 +368,12 @@ static void __inet_hash(struct sock *sk)
     }
 
     WARN_ON(!sk_unhashed(sk));
-    list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-    lock = &hashinfo->lhash_lock;
+    ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
-    inet_listen_wlock(hashinfo);
-    __sk_add_node(sk, list);
+    spin_lock(&ilb->lock);
+    __sk_nulls_add_node_rcu(sk, &ilb->head);
     sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-    write_unlock(lock);
-    wake_up(&hashinfo->lhash_wait);
+    spin_unlock(&ilb->lock);
 }
 
 void inet_hash(struct sock *sk)
@@ -390,27 +388,23 @@ EXPORT_SYMBOL_GPL(inet_hash);
 
 void inet_unhash(struct sock *sk)
 {
-    rwlock_t *lock;
     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+    spinlock_t *lock;
+    int done;
 
     if (sk_unhashed(sk))
-        goto out;
+        return;
 
-    if (sk->sk_state == TCP_LISTEN) {
-        local_bh_disable();
-        inet_listen_wlock(hashinfo);
-        lock = &hashinfo->lhash_lock;
-    } else {
+    if (sk->sk_state == TCP_LISTEN)
+        lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+    else
         lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-        write_lock_bh(lock);
-    }
 
-    if (__sk_del_node_init(sk))
+    spin_lock_bh(lock);
+    done =__sk_nulls_del_node_init_rcu(sk);
+    if (done)
         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-    write_unlock_bh(lock);
-out:
-    if (sk->sk_state == TCP_LISTEN)
-        wake_up(&hashinfo->lhash_wait);
+    spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
@@ -449,7 +443,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
              * unique enough.
              */
             inet_bind_bucket_for_each(tb, node, &head->chain) {
-                if (tb->ib_net == net && tb->port == port) {
+                if (ib_net(tb) == net && tb->port == port) {
                     WARN_ON(hlist_empty(&tb->owners));
                     if (tb->fastreuse >= 0)
                         goto next_port;
@@ -524,3 +518,16 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
 }
 
 EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+    int i;
+
+    for (i = 0; i < INET_LHTABLE_SIZE; i++) {
+        spin_lock_init(&h->listening_hash[i].lock);
+        INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+                      i + LISTENING_NULLS_BASE);
+    }
+}
+
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
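
For readers unfamiliar with the idiom used throughout the new lookup paths above: with hlist_nulls, the end-of-chain marker encodes the chain it terminates, so a lockless RCU reader can detect that the entry it was walking has been moved to another chain and restart. The sketch below is a condensed illustration of that reader pattern, not code from the patch; "my_match()" and "struct lookup_key" are stand-ins for the real INET_MATCH() test and its arguments.

    /* Illustrative RCU + hlist_nulls reader sketch (simplified from
     * __inet_lookup_established() above). my_match()/lookup_key are
     * hypothetical helpers used only for this example. */
    static bool my_match(const struct sock *sk, const struct lookup_key *key);

    static struct sock *my_lookup(struct inet_hashinfo *h, unsigned int hash,
                                  const struct lookup_key *key)
    {
        unsigned int slot = hash & (h->ehash_size - 1);
        struct inet_ehash_bucket *head = &h->ehash[slot];
        struct hlist_nulls_node *node;
        struct sock *sk;

        rcu_read_lock();
    begin:
        sk_nulls_for_each_rcu(sk, node, &head->chain) {
            if (!my_match(sk, key))
                continue;
            if (!atomic_inc_not_zero(&sk->sk_refcnt))
                goto begin;             /* entry is being freed, retry */
            if (!my_match(sk, key)) {   /* re-check once we hold a reference */
                sock_put(sk);
                goto begin;
            }
            goto out;
        }
        sk = NULL;
        /* the nulls value names the slot the chain belongs to; a mismatch
         * means we were moved onto a different chain mid-walk, so restart */
        if (get_nulls_value(node) != slot)
            goto begin;
    out:
        rcu_read_unlock();
        return sk;
    }

The double match test is the key design point: the first test runs without any reference held, so the socket may be reused for a different connection between the match and the refcount increment, and the second test catches that race.
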
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cfd034a2b96e..6a667dae315e 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -120,7 +120,7 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
     iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
 
     tcph->check = 0;
-    tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0);
+    tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
     lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
     tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
                     lro_desc->ip_tot_len -
@@ -135,7 +135,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
     __wsum tcp_ps_hdr_csum;
 
     tcp_csum = ~csum_unfold(tcph->check);
-    tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum);
+    tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
 
     tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
                          len + TCP_HDR_LEN(tcph),
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1c5fd38f8824..8554d0ea1719 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
     struct inet_bind_hashbucket *bhead;
     struct inet_bind_bucket *tb;
     /* Unlink from established hashes. */
-    rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+    spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 
-    write_lock(lock);
-    if (hlist_unhashed(&tw->tw_node)) {
-        write_unlock(lock);
+    spin_lock(lock);
+    if (hlist_nulls_unhashed(&tw->tw_node)) {
+        spin_unlock(lock);
         return;
     }
-    __hlist_del(&tw->tw_node);
-    sk_node_init(&tw->tw_node);
-    write_unlock(lock);
+    hlist_nulls_del_rcu(&tw->tw_node);
+    sk_nulls_node_init(&tw->tw_node);
+    spin_unlock(lock);
 
     /* Disassociate with bind bucket. */
     bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
@@ -76,7 +76,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
     const struct inet_sock *inet = inet_sk(sk);
     const struct inet_connection_sock *icsk = inet_csk(sk);
     struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
-    rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+    spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
     struct inet_bind_hashbucket *bhead;
     /* Step 1: Put TW into bind hash. Original socket stays there too.
        Note, that any socket with inet->num != 0 MUST be bound in
@@ -90,17 +90,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
     inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
     spin_unlock(&bhead->lock);
 
-    write_lock(lock);
+    spin_lock(lock);
 
-    /* Step 2: Remove SK from established hash. */
-    if (__sk_del_node_init(sk))
-        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-    /* Step 3: Hash TW into TIMEWAIT chain. */
-    inet_twsk_add_node(tw, &ehead->twchain);
+    /*
+     * Step 2: Hash TW into TIMEWAIT chain.
+     * Should be done before removing sk from established chain
+     * because readers are lockless and search established first.
+     */
     atomic_inc(&tw->tw_refcnt);
+    inet_twsk_add_node_rcu(tw, &ehead->twchain);
 
-    write_unlock(lock);
+    /* Step 3: Remove SK from established hash. */
+    if (__sk_nulls_del_node_init_rcu(sk))
+        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+    spin_unlock(lock);
 }
 
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -416,17 +420,17 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
 {
     struct inet_timewait_sock *tw;
     struct sock *sk;
-    struct hlist_node *node;
+    struct hlist_nulls_node *node;
     int h;
 
     local_bh_disable();
     for (h = 0; h < (hashinfo->ehash_size); h++) {
         struct inet_ehash_bucket *head =
             inet_ehash_bucket(hashinfo, h);
-        rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
+        spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
 restart:
-        write_lock(lock);
-        sk_for_each(sk, node, &head->twchain) {
+        spin_lock(lock);
+        sk_nulls_for_each(sk, node, &head->twchain) {
 
             tw = inet_twsk(sk);
             if (!net_eq(twsk_net(tw), net) ||
@@ -434,13 +438,13 @@ restart:
                 continue;
 
             atomic_inc(&tw->tw_refcnt);
-            write_unlock(lock);
+            spin_unlock(lock);
             inet_twsk_deschedule(tw, twdr);
             inet_twsk_put(tw);
 
             goto restart;
         }
-        write_unlock(lock);
+        spin_unlock(lock);
     }
     local_bh_enable();
 }
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 191ef7588134..0101521f366b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -126,8 +126,6 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
 
 /* Fallback tunnel: no source, no destination, no key, no options */
 
-static int ipgre_fb_tunnel_init(struct net_device *dev);
-
 #define HASH_SIZE  16
 
 static int ipgre_net_id;
@@ -1142,6 +1140,7 @@ static int ipgre_open(struct net_device *dev)
 static int ipgre_close(struct net_device *dev)
 {
     struct ip_tunnel *t = netdev_priv(dev);
+
     if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
         struct in_device *in_dev;
         in_dev = inetdev_by_index(dev_net(dev), t->mlink);
@@ -1155,14 +1154,22 @@ static int ipgre_close(struct net_device *dev)
 
 #endif
 
+static const struct net_device_ops ipgre_netdev_ops = {
+    .ndo_init       = ipgre_tunnel_init,
+    .ndo_uninit     = ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+    .ndo_open       = ipgre_open,
+    .ndo_stop       = ipgre_close,
+#endif
+    .ndo_start_xmit = ipgre_tunnel_xmit,
+    .ndo_do_ioctl   = ipgre_tunnel_ioctl,
+    .ndo_change_mtu = ipgre_tunnel_change_mtu,
+};
+
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
-    dev->init        = ipgre_tunnel_init;
-    dev->uninit      = ipgre_tunnel_uninit;
+    dev->netdev_ops  = &ipgre_netdev_ops;
     dev->destructor  = free_netdev;
-    dev->hard_start_xmit = ipgre_tunnel_xmit;
-    dev->do_ioctl    = ipgre_tunnel_ioctl;
-    dev->change_mtu  = ipgre_tunnel_change_mtu;
 
     dev->type        = ARPHRD_IPGRE;
     dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1194,8 +1201,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
                 return -EINVAL;
             dev->flags = IFF_BROADCAST;
             dev->header_ops = &ipgre_header_ops;
-            dev->open = ipgre_open;
-            dev->stop = ipgre_close;
         }
 #endif
     } else
@@ -1204,7 +1209,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
     return 0;
 }
 
-static int ipgre_fb_tunnel_init(struct net_device *dev)
+static void ipgre_fb_tunnel_init(struct net_device *dev)
 {
     struct ip_tunnel *tunnel = netdev_priv(dev);
     struct iphdr *iph = &tunnel->parms.iph;
@@ -1220,7 +1225,6 @@ static int ipgre_fb_tunnel_init(struct net_device *dev)
 
     dev_hold(dev);
     ign->tunnels_wc[0] = tunnel;
-    return 0;
 }
 
 
@@ -1264,9 +1268,9 @@ static int ipgre_init_net(struct net *net)
         err = -ENOMEM;
         goto err_alloc_dev;
     }
-
-    ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
     dev_net_set(ign->fb_tunnel_dev, net);
+
+    ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
     ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
 
     if ((err = register_netdev(ign->fb_tunnel_dev)))
@@ -1397,16 +1401,22 @@ static int ipgre_tap_init(struct net_device *dev)
     return 0;
 }
 
+static const struct net_device_ops ipgre_tap_netdev_ops = {
+    .ndo_init           = ipgre_tap_init,
+    .ndo_uninit         = ipgre_tunnel_uninit,
+    .ndo_start_xmit     = ipgre_tunnel_xmit,
+    .ndo_set_mac_address = eth_mac_addr,
+    .ndo_validate_addr  = eth_validate_addr,
+    .ndo_change_mtu     = ipgre_tunnel_change_mtu,
+};
+
 static void ipgre_tap_setup(struct net_device *dev)
 {
 
     ether_setup(dev);
 
-    dev->init        = ipgre_tap_init;
-    dev->uninit      = ipgre_tunnel_uninit;
+    dev->netdev_ops  = &ipgre_netdev_ops;
     dev->destructor  = free_netdev;
-    dev->hard_start_xmit = ipgre_tunnel_xmit;
-    dev->change_mtu  = ipgre_tunnel_change_mtu;
 
     dev->iflink      = 0;
     dev->features    |= NETIF_F_NETNS_LOCAL;
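
The ip_gre.c (and later ipip.c/ipmr.c) hunks follow the tree-wide conversion from per-device function pointers (dev->init, dev->hard_start_xmit, ...) to a shared, const struct net_device_ops. A minimal sketch of the same conversion for a hypothetical "foo" tunnel driver is shown below; the foo_* names are placeholders, not symbols from this patch.

    static int  foo_init(struct net_device *dev);
    static void foo_uninit(struct net_device *dev);
    static int  foo_xmit(struct sk_buff *skb, struct net_device *dev);
    static int  foo_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
    static int  foo_change_mtu(struct net_device *dev, int new_mtu);

    static const struct net_device_ops foo_netdev_ops = {
        .ndo_init       = foo_init,         /* was dev->init */
        .ndo_uninit     = foo_uninit,       /* was dev->uninit */
        .ndo_start_xmit = foo_xmit,         /* was dev->hard_start_xmit */
        .ndo_do_ioctl   = foo_ioctl,        /* was dev->do_ioctl */
        .ndo_change_mtu = foo_change_mtu,   /* was dev->change_mtu */
    };

    static void foo_setup(struct net_device *dev)
    {
        dev->netdev_ops = &foo_netdev_ops;  /* one pointer instead of five */
        dev->destructor = free_netdev;
    }

One consequence visible in the diff: fallback devices can no longer rely on dev->init being called by register_netdev(), so ipgre_fb_tunnel_init()/ipip_fb_tunnel_init() become void and are called explicitly before registration.
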
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 70bedab03b09..1a58a6fa1dc0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -209,9 +209,17 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 
         hash = protocol & (MAX_INET_PROTOS - 1);
         ipprot = rcu_dereference(inet_protos[hash]);
-        if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
+        if (ipprot != NULL) {
             int ret;
 
+            if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
+                if (net_ratelimit())
+                    printk("%s: proto %d isn't netns-ready\n",
+                        __func__, protocol);
+                kfree_skb(skb);
+                goto out;
+            }
+
             if (!ipprot->no_policy) {
                 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                     kfree_skb(skb);
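
The new check above drops (and rate-limited-logs) packets delivered in a non-initial network namespace to any protocol handler that has not declared itself namespace-aware. The ah4/esp4 hunks earlier in this patch show the flag being set for real handlers; the fragment below is only an illustrative sketch of what such a registration looks like, with foo_rcv/foo_err/IPPROTO_FOO as placeholder names.

    static int  foo_rcv(struct sk_buff *skb);
    static void foo_err(struct sk_buff *skb, u32 info);

    static struct net_protocol foo_protocol = {
        .handler     = foo_rcv,
        .err_handler = foo_err,
        .no_policy   = 1,
        .netns_ok    = 1,   /* without this, ip_local_deliver_finish()
                             * now drops the packet outside init_net */
    };

    static int __init foo_init(void)
    {
        return inet_add_protocol(&foo_protocol, IPPROTO_FOO);
    }
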
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 46d7be233eac..8ebe86dd72af 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -778,7 +778,7 @@ int ip_append_data(struct sock *sk,
            int getfrag(void *from, char *to, int offset, int len,
                    int odd, struct sk_buff *skb),
            void *from, int length, int transhdrlen,
-           struct ipcm_cookie *ipc, struct rtable *rt,
+           struct ipcm_cookie *ipc, struct rtable **rtp,
            unsigned int flags)
 {
     struct inet_sock *inet = inet_sk(sk);
@@ -793,6 +793,7 @@ int ip_append_data(struct sock *sk,
     int offset = 0;
     unsigned int maxfraglen, fragheaderlen;
     int csummode = CHECKSUM_NONE;
+    struct rtable *rt;
 
     if (flags&MSG_PROBE)
         return 0;
@@ -812,7 +813,11 @@ int ip_append_data(struct sock *sk,
             inet->cork.flags |= IPCORK_OPT;
             inet->cork.addr = ipc->addr;
         }
-        dst_hold(&rt->u.dst);
+        rt = *rtp;
+        /*
+         * We steal reference to this route, caller should not release it
+         */
+        *rtp = NULL;
         inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                     rt->u.dst.dev->mtu :
                     dst_mtu(rt->u.dst.path);
@@ -1279,7 +1284,12 @@ int ip_push_pending_frames(struct sock *sk)
 
     skb->priority = sk->sk_priority;
     skb->mark = sk->sk_mark;
-    skb->dst = dst_clone(&rt->u.dst);
+    /*
+     * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+     * on dst refcount
+     */
+    inet->cork.dst = NULL;
+    skb->dst = &rt->u.dst;
 
     if (iph->protocol == IPPROTO_ICMP)
         icmp_out_count(net, ((struct icmphdr *)
@@ -1391,7 +1401,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
         sk->sk_protocol = ip_hdr(skb)->protocol;
         sk->sk_bound_dev_if = arg->bound_dev_if;
         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
-                   &ipc, rt, MSG_DONTWAIT);
+                   &ipc, &rt, MSG_DONTWAIT);
         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
             if (arg->csumoffset >= 0)
                 *((__sum16 *)skb_transport_header(skb) +
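
The caller-side consequence of the ip_append_data() change: the function now receives a struct rtable ** and, when it sets up the cork, takes the caller's reference and NULLs the pointer. The fragment below is an illustrative sketch of that calling convention (modeled on ip_send_reply/icmp_reply above, not itself part of the patch; ip_rt_put(NULL) is a no-op, so the unconditional release stays correct whether or not the reference was stolen).

    static int send_reply(struct sock *sk, struct ipcm_cookie *ipc,
                          struct rtable *rt,  /* caller holds one reference */
                          struct iovec *iov, unsigned int len)
    {
        int err;

        err = ip_append_data(sk, ip_reply_glue_bits, iov->iov_base, len, 0,
                             ipc, &rt, MSG_DONTWAIT);
        /* rt may have been set to NULL above; releasing NULL is safe */
        ip_rt_put(rt);
        return err;
    }
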
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e976efeb1456..43c05854d752 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -48,6 +48,7 @@
 #define IP_CMSG_RECVOPTS    8
 #define IP_CMSG_RETOPTS     16
 #define IP_CMSG_PASSSEC     32
+#define IP_CMSG_ORIGDSTADDR 64
 
 /*
  *  SOL_IP control messages.
@@ -126,6 +127,27 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
     security_release_secctx(secdata, seclen);
 }
 
+static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+    struct sockaddr_in sin;
+    struct iphdr *iph = ip_hdr(skb);
+    __be16 *ports = (__be16 *)skb_transport_header(skb);
+
+    if (skb_transport_offset(skb) + 4 > skb->len)
+        return;
+
+    /* All current transport protocols have the port numbers in the
+     * first four bytes of the transport header and this function is
+     * written with this assumption in mind.
+     */
+
+    sin.sin_family = AF_INET;
+    sin.sin_addr.s_addr = iph->daddr;
+    sin.sin_port = ports[1];
+    memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+    put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
+}
 
 void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 {
@@ -160,6 +182,12 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 
     if (flags & 1)
         ip_cmsg_recv_security(msg, skb);
+
+    if ((flags>>=1) == 0)
+        return;
+    if (flags & 1)
+        ip_cmsg_recv_dstaddr(msg, skb);
+
 }
 
 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
@@ -421,7 +449,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
              (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
              (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
         optname == IP_MULTICAST_TTL ||
-        optname == IP_MULTICAST_LOOP) {
+        optname == IP_MULTICAST_LOOP ||
+        optname == IP_RECVORIGDSTADDR) {
         if (optlen >= sizeof(int)) {
             if (get_user(val, (int __user *) optval))
                 return -EFAULT;
@@ -509,6 +538,12 @@ static int do_ip_setsockopt(struct sock *sk, int level,
         else
             inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
         break;
+    case IP_RECVORIGDSTADDR:
+        if (val)
+            inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
+        else
+            inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
+        break;
     case IP_TOS:    /* This sets both TOS and Precedence */
         if (sk->sk_type == SOCK_STREAM) {
             val &= ~3;
@@ -1022,6 +1057,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
     case IP_PASSSEC:
         val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
         break;
+    case IP_RECVORIGDSTADDR:
+        val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
+        break;
     case IP_TOS:
         val = inet->tos;
         break;
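
The ip_sockglue.c additions expose the original destination address of a (typically TPROXY-redirected) packet to userspace via a new IP_RECVORIGDSTADDR socket option and an IP_ORIGDSTADDR control message. The userspace sketch below illustrates how a transparent UDP proxy might consume it; it assumes the option constants are available from the installed kernel headers (the fallback value 20 matches the definition added by this series, but treat it as an assumption).

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    #ifndef IP_RECVORIGDSTADDR
    #define IP_ORIGDSTADDR     20
    #define IP_RECVORIGDSTADDR IP_ORIGDSTADDR
    #endif

    static void dump_orig_dst(int fd)
    {
        char data[2048], cbuf[256];
        struct sockaddr_in peer;
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
            .msg_name = &peer, .msg_namelen = sizeof(peer),
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg;
        int on = 1;

        /* ask the kernel to attach the original destination as a cmsg */
        setsockopt(fd, SOL_IP, IP_RECVORIGDSTADDR, &on, sizeof(on));

        if (recvmsg(fd, &msg, 0) < 0)
            return;

        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
            if (cmsg->cmsg_level == SOL_IP &&
                cmsg->cmsg_type == IP_ORIGDSTADDR) {
                struct sockaddr_in orig;

                memcpy(&orig, CMSG_DATA(cmsg), sizeof(orig));
                printf("original destination %s:%d\n",
                       inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
            }
        }
    }
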
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index ec8264ae45c2..3262ce06294c 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -35,7 +35,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
         return;
 
     spi = htonl(ntohs(ipch->cpi));
-    x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
+    x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr,
                   spi, IPPROTO_COMP, AF_INET);
     if (!x)
         return;
@@ -49,7 +49,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
 {
     struct xfrm_state *t;
 
-    t = xfrm_state_alloc();
+    t = xfrm_state_alloc(&init_net);
     if (t == NULL)
         goto out;
 
@@ -85,7 +85,7 @@ static int ipcomp_tunnel_attach(struct xfrm_state *x)
     int err = 0;
     struct xfrm_state *t;
 
-    t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
+    t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4,
                   x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
     if (!t) {
         t = ipcomp_tunnel_create(x);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index b3c3d7b0d116..5079dfbc6f38 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -130,8 +130,8 @@ struct ipip_net {
     struct net_device *fb_tunnel_dev;
 };
 
-static int ipip_fb_tunnel_init(struct net_device *dev);
-static int ipip_tunnel_init(struct net_device *dev);
+static void ipip_fb_tunnel_init(struct net_device *dev);
+static void ipip_tunnel_init(struct net_device *dev);
 static void ipip_tunnel_setup(struct net_device *dev);
 
 static DEFINE_RWLOCK(ipip_lock);
@@ -245,9 +245,10 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
     }
 
     nt = netdev_priv(dev);
-    dev->init = ipip_tunnel_init;
     nt->parms = *parms;
 
+    ipip_tunnel_init(dev);
+
     if (register_netdevice(dev) < 0)
         goto failed_free;
 
@@ -691,12 +692,17 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
     return 0;
 }
 
+static const struct net_device_ops ipip_netdev_ops = {
+    .ndo_uninit     = ipip_tunnel_uninit,
+    .ndo_start_xmit = ipip_tunnel_xmit,
+    .ndo_do_ioctl   = ipip_tunnel_ioctl,
+    .ndo_change_mtu = ipip_tunnel_change_mtu,
+
+};
+
 static void ipip_tunnel_setup(struct net_device *dev)
 {
-    dev->uninit      = ipip_tunnel_uninit;
-    dev->hard_start_xmit = ipip_tunnel_xmit;
-    dev->do_ioctl    = ipip_tunnel_ioctl;
-    dev->change_mtu  = ipip_tunnel_change_mtu;
+    dev->netdev_ops  = &ipip_netdev_ops;
     dev->destructor  = free_netdev;
 
     dev->type        = ARPHRD_TUNNEL;
@@ -708,11 +714,9 @@ static void ipip_tunnel_setup(struct net_device *dev)
     dev->features    |= NETIF_F_NETNS_LOCAL;
 }
 
-static int ipip_tunnel_init(struct net_device *dev)
+static void ipip_tunnel_init(struct net_device *dev)
 {
-    struct ip_tunnel *tunnel;
-
-    tunnel = netdev_priv(dev);
+    struct ip_tunnel *tunnel = netdev_priv(dev);
 
     tunnel->dev = dev;
     strcpy(tunnel->parms.name, dev->name);
@@ -721,11 +725,9 @@ static int ipip_tunnel_init(struct net_device *dev)
     memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 
     ipip_tunnel_bind_dev(dev);
-
-    return 0;
 }
 
-static int ipip_fb_tunnel_init(struct net_device *dev)
+static void ipip_fb_tunnel_init(struct net_device *dev)
 {
     struct ip_tunnel *tunnel = netdev_priv(dev);
     struct iphdr *iph = &tunnel->parms.iph;
@@ -740,7 +742,6 @@ static int ipip_fb_tunnel_init(struct net_device *dev)
 
     dev_hold(dev);
     ipn->tunnels_wc[0] = tunnel;
-    return 0;
 }
 
 static struct xfrm_tunnel ipip_handler = {
@@ -792,10 +793,10 @@ static int ipip_init_net(struct net *net)
         err = -ENOMEM;
         goto err_alloc_dev;
     }
-
-    ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init;
     dev_net_set(ipn->fb_tunnel_dev, net);
 
+    ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+
     if ((err = register_netdev(ipn->fb_tunnel_dev)))
         goto err_reg_dev;
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 05ed336f798a..77fc4d3fdf61 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,8 +124,8 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
124 124
125 dev = __dev_get_by_name(&init_net, "tunl0"); 125 dev = __dev_get_by_name(&init_net, "tunl0");
126 if (dev) { 126 if (dev) {
127 const struct net_device_ops *ops = dev->netdev_ops;
127 struct ifreq ifr; 128 struct ifreq ifr;
128 mm_segment_t oldfs;
129 struct ip_tunnel_parm p; 129 struct ip_tunnel_parm p;
130 130
131 memset(&p, 0, sizeof(p)); 131 memset(&p, 0, sizeof(p));
@@ -137,9 +137,13 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
137 sprintf(p.name, "dvmrp%d", v->vifc_vifi); 137 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
138 ifr.ifr_ifru.ifru_data = (__force void __user *)&p; 138 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
139 139
140 oldfs = get_fs(); set_fs(KERNEL_DS); 140 if (ops->ndo_do_ioctl) {
141 dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL); 141 mm_segment_t oldfs = get_fs();
142 set_fs(oldfs); 142
143 set_fs(KERNEL_DS);
144 ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
145 set_fs(oldfs);
146 }
143 } 147 }
144} 148}
145 149
@@ -151,9 +155,9 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
151 dev = __dev_get_by_name(&init_net, "tunl0"); 155 dev = __dev_get_by_name(&init_net, "tunl0");
152 156
153 if (dev) { 157 if (dev) {
158 const struct net_device_ops *ops = dev->netdev_ops;
154 int err; 159 int err;
155 struct ifreq ifr; 160 struct ifreq ifr;
156 mm_segment_t oldfs;
157 struct ip_tunnel_parm p; 161 struct ip_tunnel_parm p;
158 struct in_device *in_dev; 162 struct in_device *in_dev;
159 163
@@ -166,9 +170,14 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
166 sprintf(p.name, "dvmrp%d", v->vifc_vifi); 170 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
167 ifr.ifr_ifru.ifru_data = (__force void __user *)&p; 171 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
168 172
169 oldfs = get_fs(); set_fs(KERNEL_DS); 173 if (ops->ndo_do_ioctl) {
170 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); 174 mm_segment_t oldfs = get_fs();
171 set_fs(oldfs); 175
176 set_fs(KERNEL_DS);
177 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
178 set_fs(oldfs);
179 } else
180 err = -EOPNOTSUPP;
172 181
173 dev = NULL; 182 dev = NULL;
174 183
@@ -213,12 +222,16 @@ static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
213 return 0; 222 return 0;
214} 223}
215 224
225static const struct net_device_ops reg_vif_netdev_ops = {
226 .ndo_start_xmit = reg_vif_xmit,
227};
228
216static void reg_vif_setup(struct net_device *dev) 229static void reg_vif_setup(struct net_device *dev)
217{ 230{
218 dev->type = ARPHRD_PIMREG; 231 dev->type = ARPHRD_PIMREG;
219 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; 232 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
220 dev->flags = IFF_NOARP; 233 dev->flags = IFF_NOARP;
221 	dev->hard_start_xmit	= reg_vif_xmit;			 234	dev->netdev_ops		= &reg_vif_netdev_ops;
222 dev->destructor = free_netdev; 235 dev->destructor = free_netdev;
223} 236}
224 237
@@ -1945,13 +1958,14 @@ int __init ip_mr_init(void)
1945 goto proc_cache_fail; 1958 goto proc_cache_fail;
1946#endif 1959#endif
1947 return 0; 1960 return 0;
1948reg_notif_fail:
1949 kmem_cache_destroy(mrt_cachep);
1950#ifdef CONFIG_PROC_FS 1961#ifdef CONFIG_PROC_FS
1951proc_vif_fail:
1952 unregister_netdevice_notifier(&ip_mr_notifier);
1953proc_cache_fail: 1962proc_cache_fail:
1954 proc_net_remove(&init_net, "ip_mr_vif"); 1963 proc_net_remove(&init_net, "ip_mr_vif");
1964proc_vif_fail:
1965 unregister_netdevice_notifier(&ip_mr_notifier);
1955#endif 1966#endif
1967reg_notif_fail:
1968 del_timer(&ipmr_expire_timer);
1969 kmem_cache_destroy(mrt_cachep);
1956 return err; 1970 return err;
1957} 1971}
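
In both ipmr hunks the direct dev->do_ioctl() call is replaced by a NULL-checked call through dev->netdev_ops->ndo_do_ioctl, still bracketed by get_fs()/set_fs(KERNEL_DS) because the handler expects a user pointer, and the add-tunnel path now falls back to -EOPNOTSUPP when the hook is absent. A hedged user-space sketch of that guard-and-fallback shape (plain function pointers, no kernel types):

#include <stdio.h>
#include <errno.h>

struct ifreq_stub { void *data; };

struct netdev_ops {
	int (*ndo_do_ioctl)(struct ifreq_stub *ifr, int cmd);
};

static int tunl_ioctl(struct ifreq_stub *ifr, int cmd)
{
	printf("ioctl cmd=%d\n", cmd);
	return 0;
}

static const struct netdev_ops with_ioctl    = { .ndo_do_ioctl = tunl_ioctl };
static const struct netdev_ops without_ioctl = { .ndo_do_ioctl = NULL };

/* Call through the ops table only if the hook exists, otherwise report
 * -EOPNOTSUPP, mirroring the ipmr_new_tunnel() change above. */
static int do_tunnel_ioctl(const struct netdev_ops *ops, struct ifreq_stub *ifr, int cmd)
{
	if (ops->ndo_do_ioctl)
		return ops->ndo_do_ioctl(ifr, cmd);
	return -EOPNOTSUPP;
}

int main(void)
{
	struct ifreq_stub ifr = { 0 };

	printf("%d\n", do_tunnel_ioctl(&with_ioctl, &ifr, 1));
	printf("%d\n", do_tunnel_ioctl(&without_ioctl, &ifr, 1));
	return 0;
}
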
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 7c145d76384d..fdf6811c31a2 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -66,7 +66,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
66#ifdef CONFIG_XFRM 66#ifdef CONFIG_XFRM
67 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 67 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
68 xfrm_decode_session(skb, &fl, AF_INET) == 0) 68 xfrm_decode_session(skb, &fl, AF_INET) == 0)
69 if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) 69 if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0))
70 return -1; 70 return -1;
71#endif 71#endif
72 72
@@ -97,7 +97,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
97 dst = ((struct xfrm_dst *)dst)->route; 97 dst = ((struct xfrm_dst *)dst)->route;
98 dst_hold(dst); 98 dst_hold(dst);
99 99
100 if (xfrm_lookup(&dst, &fl, skb->sk, 0) < 0) 100 if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0)
101 return -1; 101 return -1;
102 102
103 dst_release(skb->dst); 103 dst_release(skb->dst);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8f5a403f6f6b..614958b7c276 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,8 +54,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
54 socket_seq_show(seq); 54 socket_seq_show(seq);
55 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 55 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
56 sock_prot_inuse_get(net, &tcp_prot), 56 sock_prot_inuse_get(net, &tcp_prot),
57 atomic_read(&tcp_orphan_count), 57 (int)percpu_counter_sum_positive(&tcp_orphan_count),
58 tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), 58 tcp_death_row.tw_count,
59 (int)percpu_counter_sum_positive(&tcp_sockets_allocated),
59 atomic_read(&tcp_memory_allocated)); 60 atomic_read(&tcp_memory_allocated));
60 seq_printf(seq, "UDP: inuse %d mem %d\n", 61 seq_printf(seq, "UDP: inuse %d mem %d\n",
61 sock_prot_inuse_get(net, &udp_prot), 62 sock_prot_inuse_get(net, &udp_prot),
@@ -234,46 +235,51 @@ static const struct snmp_mib snmp4_net_list[] = {
234 SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), 235 SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
235 SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), 236 SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
236 SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), 237 SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
238 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
239 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
240 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
237 SNMP_MIB_SENTINEL 241 SNMP_MIB_SENTINEL
238}; 242};
239 243
244static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
245 unsigned short *type, int count)
246{
247 int j;
248
249 if (count) {
250 seq_printf(seq, "\nIcmpMsg:");
251 for (j = 0; j < count; ++j)
252 seq_printf(seq, " %sType%u",
253 type[j] & 0x100 ? "Out" : "In",
254 type[j] & 0xff);
255 seq_printf(seq, "\nIcmpMsg:");
256 for (j = 0; j < count; ++j)
257 seq_printf(seq, " %lu", vals[j]);
258 }
259}
260
240static void icmpmsg_put(struct seq_file *seq) 261static void icmpmsg_put(struct seq_file *seq)
241{ 262{
242#define PERLINE 16 263#define PERLINE 16
243 264
244 int j, i, count; 265 int i, count;
245 static int out[PERLINE]; 266 unsigned short type[PERLINE];
267 unsigned long vals[PERLINE], val;
246 struct net *net = seq->private; 268 struct net *net = seq->private;
247 269
248 count = 0; 270 count = 0;
249 for (i = 0; i < ICMPMSG_MIB_MAX; i++) { 271 for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
250 272 val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i);
251 if (snmp_fold_field((void **) net->mib.icmpmsg_statistics, i)) 273 if (val) {
252 out[count++] = i; 274 type[count] = i;
253 if (count < PERLINE) 275 vals[count++] = val;
254 continue; 276 }
255 277 if (count == PERLINE) {
256 seq_printf(seq, "\nIcmpMsg:"); 278 icmpmsg_put_line(seq, vals, type, count);
257 for (j = 0; j < PERLINE; ++j) 279 count = 0;
258 seq_printf(seq, " %sType%u", i & 0x100 ? "Out" : "In", 280 }
259 i & 0xff);
260 seq_printf(seq, "\nIcmpMsg: ");
261 for (j = 0; j < PERLINE; ++j)
262 seq_printf(seq, " %lu",
263 snmp_fold_field((void **) net->mib.icmpmsg_statistics,
264 out[j]));
265 seq_putc(seq, '\n');
266 }
267 if (count) {
268 seq_printf(seq, "\nIcmpMsg:");
269 for (j = 0; j < count; ++j)
270 seq_printf(seq, " %sType%u", out[j] & 0x100 ? "Out" :
271 "In", out[j] & 0xff);
272 seq_printf(seq, "\nIcmpMsg:");
273 for (j = 0; j < count; ++j)
274 seq_printf(seq, " %lu", snmp_fold_field((void **)
275 net->mib.icmpmsg_statistics, out[j]));
276 } 281 }
282 icmpmsg_put_line(seq, vals, type, count);
277 283
278#undef PERLINE 284#undef PERLINE
279} 285}
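
The icmpmsg_put() rewrite above collects non-zero counters into fixed PERLINE-sized type/value arrays, flushes a pair of "IcmpMsg:" lines through icmpmsg_put_line() whenever the batch fills, and flushes the remainder once at the end; snmp_fold_field() is now called only once per counter. A stand-alone sketch of that batching loop, with the per-CPU MIB replaced by a plain array for illustration:

#include <stdio.h>

#define PERLINE 16
#define NSTATS  512		/* stands in for ICMPMSG_MIB_MAX */

/* Emit one pair of lines: the type names, then the matching values. */
static void put_line(const unsigned short *type, const unsigned long *vals, int count)
{
	int j;

	if (!count)
		return;
	printf("\nIcmpMsg:");
	for (j = 0; j < count; j++)
		printf(" %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff);
	printf("\nIcmpMsg:");
	for (j = 0; j < count; j++)
		printf(" %lu", vals[j]);
}

int main(void)
{
	unsigned long stats[NSTATS] = { [3] = 7, [8] = 2, [0x103] = 4 };
	unsigned short type[PERLINE];
	unsigned long vals[PERLINE];
	int i, count = 0;

	for (i = 0; i < NSTATS; i++) {
		if (stats[i]) {			/* only non-zero counters are printed */
			type[count] = (unsigned short)i;
			vals[count++] = stats[i];
		}
		if (count == PERLINE) {		/* flush a full batch */
			put_line(type, vals, count);
			count = 0;
		}
	}
	put_line(type, vals, count);		/* flush the partial remainder */
	printf("\n");
	return 0;
}
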
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 998fcffc9e15..dff8bc4e0fac 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -572,7 +572,7 @@ back_from_confirm:
572 ipc.addr = rt->rt_dst; 572 ipc.addr = rt->rt_dst;
573 lock_sock(sk); 573 lock_sock(sk);
574 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, 574 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
575 &ipc, rt, msg->msg_flags); 575 &ipc, &rt, msg->msg_flags);
576 if (err) 576 if (err)
577 ip_flush_pending_frames(sk); 577 ip_flush_pending_frames(sk);
578 else if (!(msg->msg_flags & MSG_MORE)) 578 else if (!(msg->msg_flags & MSG_MORE))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0dc0c3826763..77bfba975959 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -160,7 +160,6 @@ static struct dst_ops ipv4_dst_ops = {
160 .link_failure = ipv4_link_failure, 160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu, 161 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out, 162 .local_out = __ip_local_out,
163 .entry_size = sizeof(struct rtable),
164 .entries = ATOMIC_INIT(0), 163 .entries = ATOMIC_INIT(0),
165}; 164};
166 165
@@ -2701,7 +2700,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2701 .destroy = ipv4_dst_destroy, 2700 .destroy = ipv4_dst_destroy,
2702 .check = ipv4_dst_check, 2701 .check = ipv4_dst_check,
2703 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2702 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2704 .entry_size = sizeof(struct rtable),
2705 .entries = ATOMIC_INIT(0), 2703 .entries = ATOMIC_INIT(0),
2706}; 2704};
2707 2705
@@ -2763,7 +2761,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2763 flp->fl4_src = (*rp)->rt_src; 2761 flp->fl4_src = (*rp)->rt_src;
2764 if (!flp->fl4_dst) 2762 if (!flp->fl4_dst)
2765 flp->fl4_dst = (*rp)->rt_dst; 2763 flp->fl4_dst = (*rp)->rt_dst;
2766 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, 2764 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2767 flags ? XFRM_LOOKUP_WAIT : 0); 2765 flags ? XFRM_LOOKUP_WAIT : 0);
2768 if (err == -EREMOTE) 2766 if (err == -EREMOTE)
2769 err = ipv4_dst_blackhole(net, rp, flp); 2767 err = ipv4_dst_blackhole(net, rp, flp);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 60c28add96b8..019243408623 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,8 +277,7 @@
277 277
278int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; 278int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
279 279
280atomic_t tcp_orphan_count = ATOMIC_INIT(0); 280struct percpu_counter tcp_orphan_count;
281
282EXPORT_SYMBOL_GPL(tcp_orphan_count); 281EXPORT_SYMBOL_GPL(tcp_orphan_count);
283 282
284int sysctl_tcp_mem[3] __read_mostly; 283int sysctl_tcp_mem[3] __read_mostly;
@@ -290,9 +289,12 @@ EXPORT_SYMBOL(sysctl_tcp_rmem);
290EXPORT_SYMBOL(sysctl_tcp_wmem); 289EXPORT_SYMBOL(sysctl_tcp_wmem);
291 290
292atomic_t tcp_memory_allocated; /* Current allocated memory. */ 291atomic_t tcp_memory_allocated; /* Current allocated memory. */
293atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
294
295EXPORT_SYMBOL(tcp_memory_allocated); 292EXPORT_SYMBOL(tcp_memory_allocated);
293
294/*
295 * Current number of TCP sockets.
296 */
297struct percpu_counter tcp_sockets_allocated;
296EXPORT_SYMBOL(tcp_sockets_allocated); 298EXPORT_SYMBOL(tcp_sockets_allocated);
297 299
298/* 300/*
@@ -1374,8 +1376,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1374 sk->sk_state == TCP_CLOSE || 1376 sk->sk_state == TCP_CLOSE ||
1375 (sk->sk_shutdown & RCV_SHUTDOWN) || 1377 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1376 !timeo || 1378 !timeo ||
1377 signal_pending(current) || 1379 signal_pending(current))
1378 (flags & MSG_PEEK))
1379 break; 1380 break;
1380 } else { 1381 } else {
1381 if (sock_flag(sk, SOCK_DONE)) 1382 if (sock_flag(sk, SOCK_DONE))
@@ -1835,7 +1836,7 @@ adjudge_to_death:
1835 state = sk->sk_state; 1836 state = sk->sk_state;
1836 sock_hold(sk); 1837 sock_hold(sk);
1837 sock_orphan(sk); 1838 sock_orphan(sk);
1838 atomic_inc(sk->sk_prot->orphan_count); 1839 percpu_counter_inc(sk->sk_prot->orphan_count);
1839 1840
1840 /* It is the last release_sock in its life. It will remove backlog. */ 1841 /* It is the last release_sock in its life. It will remove backlog. */
1841 release_sock(sk); 1842 release_sock(sk);
@@ -1886,9 +1887,11 @@ adjudge_to_death:
1886 } 1887 }
1887 } 1888 }
1888 if (sk->sk_state != TCP_CLOSE) { 1889 if (sk->sk_state != TCP_CLOSE) {
1890 int orphan_count = percpu_counter_read_positive(
1891 sk->sk_prot->orphan_count);
1892
1889 sk_mem_reclaim(sk); 1893 sk_mem_reclaim(sk);
1890 if (tcp_too_many_orphans(sk, 1894 if (tcp_too_many_orphans(sk, orphan_count)) {
1891 atomic_read(sk->sk_prot->orphan_count))) {
1892 if (net_ratelimit()) 1895 if (net_ratelimit())
1893 printk(KERN_INFO "TCP: too many of orphaned " 1896 printk(KERN_INFO "TCP: too many of orphaned "
1894 "sockets\n"); 1897 "sockets\n");
@@ -2686,6 +2689,8 @@ void __init tcp_init(void)
2686 2689
2687 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 2690 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2688 2691
2692 percpu_counter_init(&tcp_sockets_allocated, 0);
2693 percpu_counter_init(&tcp_orphan_count, 0);
2689 tcp_hashinfo.bind_bucket_cachep = 2694 tcp_hashinfo.bind_bucket_cachep =
2690 kmem_cache_create("tcp_bind_bucket", 2695 kmem_cache_create("tcp_bind_bucket",
2691 sizeof(struct inet_bind_bucket), 0, 2696 sizeof(struct inet_bind_bucket), 0,
@@ -2708,8 +2713,8 @@ void __init tcp_init(void)
2708 thash_entries ? 0 : 512 * 1024); 2713 thash_entries ? 0 : 512 * 1024);
2709 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; 2714 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2710 for (i = 0; i < tcp_hashinfo.ehash_size; i++) { 2715 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2711 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); 2716 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
2712 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); 2717 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
2713 } 2718 }
2714 if (inet_ehash_locks_alloc(&tcp_hashinfo)) 2719 if (inet_ehash_locks_alloc(&tcp_hashinfo))
2715 panic("TCP: failed to alloc ehash_locks"); 2720 panic("TCP: failed to alloc ehash_locks");
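
The tcp.c hunks convert tcp_orphan_count and tcp_sockets_allocated from a single atomic_t into a struct percpu_counter, initialised in tcp_init(): hot paths pay only a per-CPU increment, fast readers use the approximate percpu_counter_read_positive(), and the /proc/net/sockstat path pays for the exact percpu_counter_sum_positive(). The following is a loose user-space model of that approximate-versus-exact trade-off using per-slot counters and pthreads; it is not the kernel percpu_counter implementation.

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 4	/* one slot per "CPU" */

static long counters[NSLOTS];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void counter_inc(int slot)
{
	pthread_mutex_lock(&lock);	/* the real per-CPU increment needs no shared lock */
	counters[slot]++;
	pthread_mutex_unlock(&lock);
}

/* Cheap, possibly stale estimate: look at a single slot and clamp at zero. */
static long counter_read_positive(void)
{
	long v = counters[0];

	return v > 0 ? v : 0;
}

/* Expensive, exact value: sum every slot under the lock, clamp at zero. */
static long counter_sum_positive(void)
{
	long sum = 0;
	int i;

	pthread_mutex_lock(&lock);
	for (i = 0; i < NSLOTS; i++)
		sum += counters[i];
	pthread_mutex_unlock(&lock);
	return sum > 0 ? sum : 0;
}

int main(void)
{
	counter_inc(0);
	counter_inc(2);
	counter_inc(3);
	printf("approx=%ld exact=%ld\n",
	       counter_read_positive(), counter_sum_positive());
	return 0;
}
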
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 838d491dfda7..fcbcd4ff6c5f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -34,7 +34,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
34 tcp_get_info(sk, info); 34 tcp_get_info(sk, info);
35} 35}
36 36
37static struct inet_diag_handler tcp_diag_handler = { 37static const struct inet_diag_handler tcp_diag_handler = {
38 .idiag_hashinfo = &tcp_hashinfo, 38 .idiag_hashinfo = &tcp_hashinfo,
39 .idiag_get_info = tcp_diag_get_info, 39 .idiag_get_info = tcp_diag_get_info,
40 .idiag_type = TCPDIAG_GETSOCK, 40 .idiag_type = TCPDIAG_GETSOCK,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index af99776146ff..937549b8a921 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -69,9 +69,12 @@ static u32 htcp_cwnd_undo(struct sock *sk)
69 const struct tcp_sock *tp = tcp_sk(sk); 69 const struct tcp_sock *tp = tcp_sk(sk);
70 struct htcp *ca = inet_csk_ca(sk); 70 struct htcp *ca = inet_csk_ca(sk);
71 71
72 ca->last_cong = ca->undo_last_cong; 72 if (ca->undo_last_cong) {
73 ca->maxRTT = ca->undo_maxRTT; 73 ca->last_cong = ca->undo_last_cong;
74 ca->old_maxB = ca->undo_old_maxB; 74 ca->maxRTT = ca->undo_maxRTT;
75 ca->old_maxB = ca->undo_old_maxB;
76 ca->undo_last_cong = 0;
77 }
75 78
76 return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); 79 return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta);
77} 80}
@@ -268,7 +271,10 @@ static void htcp_state(struct sock *sk, u8 new_state)
268 case TCP_CA_Open: 271 case TCP_CA_Open:
269 { 272 {
270 struct htcp *ca = inet_csk_ca(sk); 273 struct htcp *ca = inet_csk_ca(sk);
271 ca->last_cong = jiffies; 274 if (ca->undo_last_cong) {
275 ca->last_cong = jiffies;
276 ca->undo_last_cong = 0;
277 }
272 } 278 }
273 break; 279 break;
274 case TCP_CA_CWR: 280 case TCP_CA_CWR:
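
Both H-TCP hunks make the undo state one-shot: the saved values are applied (or last_cong refreshed on return to TCP_CA_Open) only while undo_last_cong is non-zero, and the flag is cleared right away so a later undo cannot re-apply stale data. A tiny sketch of that pattern, assuming only that undo_last_cong == 0 means "nothing to undo":

#include <stdio.h>

struct htcp_stub {
	unsigned long last_cong;
	unsigned long undo_last_cong;	/* 0 means the undo state is disarmed */
};

/* Restore once, then disarm, mirroring the guarded htcp_cwnd_undo() above. */
static void cwnd_undo(struct htcp_stub *ca)
{
	if (ca->undo_last_cong) {
		ca->last_cong = ca->undo_last_cong;
		ca->undo_last_cong = 0;
	}
}

int main(void)
{
	struct htcp_stub ca = { .last_cong = 100, .undo_last_cong = 42 };

	cwnd_undo(&ca);			/* applies the saved value */
	printf("after first undo: %lu\n", ca.last_cong);
	ca.last_cong = 200;
	cwnd_undo(&ca);			/* no-op: the flag is already cleared */
	printf("after second undo: %lu\n", ca.last_cong);
	return 0;
}
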
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 097294b7da3e..d67b6e9cc540 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1002,7 +1002,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
1002 } 1002 }
1003} 1003}
1004 1004
1005void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) 1005static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
1006 struct sk_buff *skb)
1006{ 1007{
1007 tcp_verify_retransmit_hint(tp, skb); 1008 tcp_verify_retransmit_hint(tp, skb);
1008 1009
@@ -1241,26 +1242,47 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1241 * aligned portion of it that matches. Therefore we might need to fragment 1242 * aligned portion of it that matches. Therefore we might need to fragment
1242 * which may fail and creates some hassle (caller must handle error case 1243 * which may fail and creates some hassle (caller must handle error case
1243 * returns). 1244 * returns).
1245 *
1246 * FIXME: this could be merged to shift decision code
1244 */ 1247 */
1245static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1248static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1246 u32 start_seq, u32 end_seq) 1249 u32 start_seq, u32 end_seq)
1247{ 1250{
1248 int in_sack, err; 1251 int in_sack, err;
1249 unsigned int pkt_len; 1252 unsigned int pkt_len;
1253 unsigned int mss;
1250 1254
1251 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && 1255 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1252 !before(end_seq, TCP_SKB_CB(skb)->end_seq); 1256 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1253 1257
1254 if (tcp_skb_pcount(skb) > 1 && !in_sack && 1258 if (tcp_skb_pcount(skb) > 1 && !in_sack &&
1255 after(TCP_SKB_CB(skb)->end_seq, start_seq)) { 1259 after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1256 1260 mss = tcp_skb_mss(skb);
1257 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); 1261 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1258 1262
1259 if (!in_sack) 1263 if (!in_sack) {
1260 pkt_len = start_seq - TCP_SKB_CB(skb)->seq; 1264 pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1261 else 1265 if (pkt_len < mss)
1266 pkt_len = mss;
1267 } else {
1262 pkt_len = end_seq - TCP_SKB_CB(skb)->seq; 1268 pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1263 err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); 1269 if (pkt_len < mss)
1270 return -EINVAL;
1271 }
1272
1273 /* Round if necessary so that SACKs cover only full MSSes
1274 * and/or the remaining small portion (if present)
1275 */
1276 if (pkt_len > mss) {
1277 unsigned int new_len = (pkt_len / mss) * mss;
1278 if (!in_sack && new_len < pkt_len) {
1279 new_len += mss;
1280 if (new_len > skb->len)
1281 return 0;
1282 }
1283 pkt_len = new_len;
1284 }
1285 err = tcp_fragment(sk, skb, pkt_len, mss);
1264 if (err < 0) 1286 if (err < 0)
1265 return err; 1287 return err;
1266 } 1288 }
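
The hunk above rounds the fragmentation point to a multiple of the skb's MSS, bumping it up by one MSS in the head-trim (!in_sack) case, so SACK-driven splits never leave sub-MSS leftovers inside an skb. A stand-alone sketch of just that arithmetic, with pkt_len, mss and skb->len reduced to plain integers:

#include <stdio.h>

/* Returns the adjusted split point, 0 for "don't split", -1 for invalid input.
 * in_sack mirrors "the start of the skb is already inside the SACK block". */
static int round_split(unsigned int pkt_len, unsigned int mss,
		       unsigned int skb_len, int in_sack)
{
	if (!in_sack) {
		if (pkt_len < mss)
			pkt_len = mss;
	} else {
		if (pkt_len < mss)
			return -1;
	}

	if (pkt_len > mss) {
		unsigned int new_len = (pkt_len / mss) * mss;

		if (!in_sack && new_len < pkt_len) {
			new_len += mss;
			if (new_len > skb_len)
				return 0;	/* nothing sensible left to split off */
		}
		pkt_len = new_len;
	}
	return (int)pkt_len;
}

int main(void)
{
	/* A 3000-byte skb with a 1460-byte MSS and a SACK edge 2000 bytes in. */
	printf("%d\n", round_split(2000, 1460, 3000, 0));	/* 2000 rounds up to 2920 */
	printf("%d\n", round_split(2000, 1460, 3000, 1));	/* 2000 rounds down to 1460 */
	return 0;
}
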
@@ -1269,7 +1291,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1269} 1291}
1270 1292
1271static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, 1293static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1272 int *reord, int dup_sack, int fack_count) 1294 int *reord, int dup_sack, int fack_count,
1295 u8 *sackedto, int pcount)
1273{ 1296{
1274 struct tcp_sock *tp = tcp_sk(sk); 1297 struct tcp_sock *tp = tcp_sk(sk);
1275 u8 sacked = TCP_SKB_CB(skb)->sacked; 1298 u8 sacked = TCP_SKB_CB(skb)->sacked;
@@ -1294,10 +1317,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1294 * that retransmission is still in flight. 1317 * that retransmission is still in flight.
1295 */ 1318 */
1296 if (sacked & TCPCB_LOST) { 1319 if (sacked & TCPCB_LOST) {
1297 TCP_SKB_CB(skb)->sacked &= 1320 *sackedto &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1298 ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); 1321 tp->lost_out -= pcount;
1299 tp->lost_out -= tcp_skb_pcount(skb); 1322 tp->retrans_out -= pcount;
1300 tp->retrans_out -= tcp_skb_pcount(skb);
1301 } 1323 }
1302 } else { 1324 } else {
1303 if (!(sacked & TCPCB_RETRANS)) { 1325 if (!(sacked & TCPCB_RETRANS)) {
@@ -1314,48 +1336,280 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1314 } 1336 }
1315 1337
1316 if (sacked & TCPCB_LOST) { 1338 if (sacked & TCPCB_LOST) {
1317 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1339 *sackedto &= ~TCPCB_LOST;
1318 tp->lost_out -= tcp_skb_pcount(skb); 1340 tp->lost_out -= pcount;
1319 } 1341 }
1320 } 1342 }
1321 1343
1322 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; 1344 *sackedto |= TCPCB_SACKED_ACKED;
1323 flag |= FLAG_DATA_SACKED; 1345 flag |= FLAG_DATA_SACKED;
1324 tp->sacked_out += tcp_skb_pcount(skb); 1346 tp->sacked_out += pcount;
1325 1347
1326 fack_count += tcp_skb_pcount(skb); 1348 fack_count += pcount;
1327 1349
1328 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1350 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1329 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && 1351 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
1330 before(TCP_SKB_CB(skb)->seq, 1352 before(TCP_SKB_CB(skb)->seq,
1331 TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1353 TCP_SKB_CB(tp->lost_skb_hint)->seq))
1332 tp->lost_cnt_hint += tcp_skb_pcount(skb); 1354 tp->lost_cnt_hint += pcount;
1333 1355
1334 if (fack_count > tp->fackets_out) 1356 if (fack_count > tp->fackets_out)
1335 tp->fackets_out = fack_count; 1357 tp->fackets_out = fack_count;
1336
1337 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
1338 tcp_advance_highest_sack(sk, skb);
1339 } 1358 }
1340 1359
1341 /* D-SACK. We can detect redundant retransmission in S|R and plain R 1360 /* D-SACK. We can detect redundant retransmission in S|R and plain R
1342 * frames and clear it. undo_retrans is decreased above, L|R frames 1361 * frames and clear it. undo_retrans is decreased above, L|R frames
1343 * are accounted above as well. 1362 * are accounted above as well.
1344 */ 1363 */
1345 if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { 1364 if (dup_sack && (*sackedto & TCPCB_SACKED_RETRANS)) {
1346 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1365 *sackedto &= ~TCPCB_SACKED_RETRANS;
1347 tp->retrans_out -= tcp_skb_pcount(skb); 1366 tp->retrans_out -= pcount;
1348 } 1367 }
1349 1368
1350 return flag; 1369 return flag;
1351} 1370}
1352 1371
1372static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1373 struct sk_buff *skb, unsigned int pcount,
1374 int shifted, int fack_count, int *reord,
1375 int *flag, int mss)
1376{
1377 struct tcp_sock *tp = tcp_sk(sk);
1378 u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */
1379
1380 BUG_ON(!pcount);
1381
1382 /* Tweak before seqno plays */
1383 if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint &&
1384 !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
1385 tp->lost_cnt_hint += pcount;
1386
1387 TCP_SKB_CB(prev)->end_seq += shifted;
1388 TCP_SKB_CB(skb)->seq += shifted;
1389
1390 skb_shinfo(prev)->gso_segs += pcount;
1391 BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
1392 skb_shinfo(skb)->gso_segs -= pcount;
1393
1394	/* When we're adding to gso_segs == 1, gso_size will be zero.
1395	 * In theory this shouldn't be necessary, but as long as DSACK
1396	 * code can come after this skb later on, it's better to keep
1397	 * setting gso_size to something.
1398 */
1399 if (!skb_shinfo(prev)->gso_size) {
1400 skb_shinfo(prev)->gso_size = mss;
1401 skb_shinfo(prev)->gso_type = sk->sk_gso_type;
1402 }
1403
1404 /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1405 if (skb_shinfo(skb)->gso_segs <= 1) {
1406 skb_shinfo(skb)->gso_size = 0;
1407 skb_shinfo(skb)->gso_type = 0;
1408 }
1409
1410 *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
1411 pcount);
1412
1413 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1414 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1415
1416 if (skb->len > 0) {
1417 BUG_ON(!tcp_skb_pcount(skb));
1418 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1419 return 0;
1420 }
1421
1422 /* Whole SKB was eaten :-) */
1423
1424 if (skb == tp->retransmit_skb_hint)
1425 tp->retransmit_skb_hint = prev;
1426 if (skb == tp->scoreboard_skb_hint)
1427 tp->scoreboard_skb_hint = prev;
1428 if (skb == tp->lost_skb_hint) {
1429 tp->lost_skb_hint = prev;
1430 tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1431 }
1432
1433 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
1434 if (skb == tcp_highest_sack(sk))
1435 tcp_advance_highest_sack(sk, skb);
1436
1437 tcp_unlink_write_queue(skb, sk);
1438 sk_wmem_free_skb(sk, skb);
1439
1440 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
1441
1442 return 1;
1443}
1444
1445/* I wish gso_size would have a bit more sane initialization than
1446 * something-or-zero which complicates things
1447 */
1448static int tcp_shift_mss(struct sk_buff *skb)
1449{
1450 int mss = tcp_skb_mss(skb);
1451
1452 if (!mss)
1453 mss = skb->len;
1454
1455 return mss;
1456}
1457
1458/* Shifting pages past head area doesn't work */
1459static int skb_can_shift(struct sk_buff *skb)
1460{
1461 return !skb_headlen(skb) && skb_is_nonlinear(skb);
1462}
1463
1464/* Try collapsing SACK blocks spanning across multiple skbs to a single
1465 * skb.
1466 */
1467static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1468 u32 start_seq, u32 end_seq,
1469 int dup_sack, int *fack_count,
1470 int *reord, int *flag)
1471{
1472 struct tcp_sock *tp = tcp_sk(sk);
1473 struct sk_buff *prev;
1474 int mss;
1475 int pcount = 0;
1476 int len;
1477 int in_sack;
1478
1479 if (!sk_can_gso(sk))
1480 goto fallback;
1481
1482 /* Normally R but no L won't result in plain S */
1483 if (!dup_sack &&
1484 (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
1485 goto fallback;
1486 if (!skb_can_shift(skb))
1487 goto fallback;
1488 /* This frame is about to be dropped (was ACKed). */
1489 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1490 goto fallback;
1491
1492 /* Can only happen with delayed DSACK + discard craziness */
1493 if (unlikely(skb == tcp_write_queue_head(sk)))
1494 goto fallback;
1495 prev = tcp_write_queue_prev(sk, skb);
1496
1497 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1498 goto fallback;
1499
1500 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1501 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1502
1503 if (in_sack) {
1504 len = skb->len;
1505 pcount = tcp_skb_pcount(skb);
1506 mss = tcp_shift_mss(skb);
1507
1508 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1509 * drop this restriction as unnecessary
1510 */
1511 if (mss != tcp_shift_mss(prev))
1512 goto fallback;
1513 } else {
1514 if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1515 goto noop;
1516		/* CHECKME: This is the non-MSS split case only? This will
1517		 * cause skipped skbs due to the advancing loop, btw; the original
1518		 * has that feature too
1519 */
1520 if (tcp_skb_pcount(skb) <= 1)
1521 goto noop;
1522
1523 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1524 if (!in_sack) {
1525 /* TODO: head merge to next could be attempted here
1526 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1527			 * though it might not be worth the additional hassle
1528 *
1529 * ...we can probably just fallback to what was done
1530 * previously. We could try merging non-SACKed ones
1531			 * as well, but it probably isn't going to pay off
1532			 * because later SACKs might again split them, and
1533			 * it would make skb timestamp tracking a considerably
1534			 * harder problem.
1535 */
1536 goto fallback;
1537 }
1538
1539 len = end_seq - TCP_SKB_CB(skb)->seq;
1540 BUG_ON(len < 0);
1541 BUG_ON(len > skb->len);
1542
1543 /* MSS boundaries should be honoured or else pcount will
1544		 * severely break even though it makes things a bit trickier.
1545 * Optimize common case to avoid most of the divides
1546 */
1547 mss = tcp_skb_mss(skb);
1548
1549 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1550 * drop this restriction as unnecessary
1551 */
1552 if (mss != tcp_shift_mss(prev))
1553 goto fallback;
1554
1555 if (len == mss) {
1556 pcount = 1;
1557 } else if (len < mss) {
1558 goto noop;
1559 } else {
1560 pcount = len / mss;
1561 len = pcount * mss;
1562 }
1563 }
1564
1565 if (!skb_shift(prev, skb, len))
1566 goto fallback;
1567 if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
1568 flag, mss))
1569 goto out;
1570
1571	/* Filling the hole allows collapsing with the next skb as well; this is
1572	 * very useful when a hole-on-every-nth-skb pattern happens
1573 */
1574 if (prev == tcp_write_queue_tail(sk))
1575 goto out;
1576 skb = tcp_write_queue_next(sk, prev);
1577
1578 if (!skb_can_shift(skb))
1579 goto out;
1580 if (skb == tcp_send_head(sk))
1581 goto out;
1582 if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1583 goto out;
1584
1585 len = skb->len;
1586 if (skb_shift(prev, skb, len)) {
1587 pcount += tcp_skb_pcount(skb);
1588 tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
1589 *fack_count, reord, flag, mss);
1590 }
1591
1592out:
1593 *fack_count += pcount;
1594 return prev;
1595
1596noop:
1597 return skb;
1598
1599fallback:
1600 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1601 return NULL;
1602}
1603
1353static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, 1604static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1354 struct tcp_sack_block *next_dup, 1605 struct tcp_sack_block *next_dup,
1355 u32 start_seq, u32 end_seq, 1606 u32 start_seq, u32 end_seq,
1356 int dup_sack_in, int *fack_count, 1607 int dup_sack_in, int *fack_count,
1357 int *reord, int *flag) 1608 int *reord, int *flag)
1358{ 1609{
1610 struct tcp_sock *tp = tcp_sk(sk);
1611 struct sk_buff *tmp;
1612
1359 tcp_for_write_queue_from(skb, sk) { 1613 tcp_for_write_queue_from(skb, sk) {
1360 int in_sack = 0; 1614 int in_sack = 0;
1361 int dup_sack = dup_sack_in; 1615 int dup_sack = dup_sack_in;
@@ -1376,15 +1630,41 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1376 dup_sack = 1; 1630 dup_sack = 1;
1377 } 1631 }
1378 1632
1379 if (in_sack <= 0) 1633 /* skb reference here is a bit tricky to get right, since
1380 in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, 1634 * shifting can eat and free both this skb and the next,
1381 						    end_seq);		1635		 * so not even the _safe variant of the loop is enough.
1636 */
1637 if (in_sack <= 0) {
1638 tmp = tcp_shift_skb_data(sk, skb, start_seq,
1639 end_seq, dup_sack,
1640 fack_count, reord, flag);
1641 if (tmp != NULL) {
1642 if (tmp != skb) {
1643 skb = tmp;
1644 continue;
1645 }
1646
1647 in_sack = 0;
1648 } else {
1649 in_sack = tcp_match_skb_to_sack(sk, skb,
1650 start_seq,
1651 end_seq);
1652 }
1653 }
1654
1382 if (unlikely(in_sack < 0)) 1655 if (unlikely(in_sack < 0))
1383 break; 1656 break;
1384 1657
1385 if (in_sack) 1658 if (in_sack) {
1386 *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, 1659 *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
1387 *fack_count); 1660 *fack_count,
1661 &(TCP_SKB_CB(skb)->sacked),
1662 tcp_skb_pcount(skb));
1663
1664 if (!before(TCP_SKB_CB(skb)->seq,
1665 tcp_highest_sack_seq(tp)))
1666 tcp_advance_highest_sack(sk, skb);
1667 }
1388 1668
1389 *fack_count += tcp_skb_pcount(skb); 1669 *fack_count += tcp_skb_pcount(skb);
1390 } 1670 }
@@ -1401,7 +1681,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1401 if (skb == tcp_send_head(sk)) 1681 if (skb == tcp_send_head(sk))
1402 break; 1682 break;
1403 1683
1404 if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) 1684 if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
1405 break; 1685 break;
1406 1686
1407 *fack_count += tcp_skb_pcount(skb); 1687 *fack_count += tcp_skb_pcount(skb);
@@ -1660,7 +1940,7 @@ out:
1660/* Limits sacked_out so that sum with lost_out isn't ever larger than 1940/* Limits sacked_out so that sum with lost_out isn't ever larger than
1661 * packets_out. Returns zero if sacked_out adjustment wasn't necessary.	1941 * packets_out. Returns zero if sacked_out adjustment wasn't necessary.
1662 */ 1942 */
1663int tcp_limit_reno_sacked(struct tcp_sock *tp) 1943static int tcp_limit_reno_sacked(struct tcp_sock *tp)
1664{ 1944{
1665 u32 holes; 1945 u32 holes;
1666 1946
@@ -2559,6 +2839,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
2559 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 2839 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
2560} 2840}
2561 2841
2842/* Do a simple retransmit without using the backoff mechanisms in
2843 * tcp_timer. This is used for path mtu discovery.
2844 * The socket is already locked here.
2845 */
2846void tcp_simple_retransmit(struct sock *sk)
2847{
2848 const struct inet_connection_sock *icsk = inet_csk(sk);
2849 struct tcp_sock *tp = tcp_sk(sk);
2850 struct sk_buff *skb;
2851 unsigned int mss = tcp_current_mss(sk, 0);
2852 u32 prior_lost = tp->lost_out;
2853
2854 tcp_for_write_queue(skb, sk) {
2855 if (skb == tcp_send_head(sk))
2856 break;
2857 if (skb->len > mss &&
2858 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2859 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2860 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2861 tp->retrans_out -= tcp_skb_pcount(skb);
2862 }
2863 tcp_skb_mark_lost_uncond_verify(tp, skb);
2864 }
2865 }
2866
2867 tcp_clear_retrans_hints_partial(tp);
2868
2869 if (prior_lost == tp->lost_out)
2870 return;
2871
2872 if (tcp_is_reno(tp))
2873 tcp_limit_reno_sacked(tp);
2874
2875 tcp_verify_left_out(tp);
2876
2877 /* Don't muck with the congestion window here.
2878	 * The reason is that we do not increase the amount of _data_
2879	 * in the network, but the units changed and the effective
2880	 * cwnd/ssthresh are really reduced now.
2881 */
2882 if (icsk->icsk_ca_state != TCP_CA_Loss) {
2883 tp->high_seq = tp->snd_nxt;
2884 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2885 tp->prior_ssthresh = 0;
2886 tp->undo_marker = 0;
2887 tcp_set_ca_state(sk, TCP_CA_Loss);
2888 }
2889 tcp_xmit_retransmit_queue(sk);
2890}
2891
2562/* Process an event, which can update packets-in-flight not trivially. 2892/* Process an event, which can update packets-in-flight not trivially.
2563 * Main goal of this function is to calculate new estimate for left_out, 2893 * Main goal of this function is to calculate new estimate for left_out,
2564 * taking into account both packets sitting in receiver's buffer and 2894 * taking into account both packets sitting in receiver's buffer and
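
The new tcp_shift_skb_data()/tcp_shifted_skb() path merges newly SACKed payload into the previous, already SACKed skb instead of fragmenting: prev->end_seq grows by the shifted byte count, skb->seq advances by the same amount, and the gso_segs packet count migrates along with the data; an skb whose payload was consumed entirely is unlinked from the write queue. A sketch of that sequence and segment-count bookkeeping on two toy segments (no real skbs or socket state):

#include <assert.h>
#include <stdio.h>

struct seg {
	unsigned int seq;	/* first sequence number covered */
	unsigned int end_seq;	/* one past the last covered byte */
	unsigned int gso_segs;	/* packet count this buffer represents */
};

/* Move 'shifted' bytes / 'pcount' packets from skb into prev, echoing the
 * seq/end_seq/gso_segs updates done by tcp_shifted_skb(). */
static int shift_into_prev(struct seg *prev, struct seg *skb,
			   unsigned int shifted, unsigned int pcount)
{
	assert(pcount && skb->gso_segs >= pcount);

	prev->end_seq  += shifted;
	skb->seq       += shifted;
	prev->gso_segs += pcount;
	skb->gso_segs  -= pcount;

	/* Whole skb eaten: the caller would unlink and free it. */
	return skb->seq == skb->end_seq;
}

int main(void)
{
	struct seg prev = { .seq = 1000, .end_seq = 2460, .gso_segs = 1 };
	struct seg skb  = { .seq = 2460, .end_seq = 5380, .gso_segs = 2 };
	int eaten = shift_into_prev(&prev, &skb, 2920, 2);

	printf("prev %u-%u segs=%u, skb eaten=%d\n",
	       prev.seq, prev.end_seq, prev.gso_segs, eaten);
	return 0;
}
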
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d49233f409b5..26b9030747cc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
97} 97}
98#endif 98#endif
99 99
100struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { 100struct inet_hashinfo tcp_hashinfo;
101 .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102 .lhash_users = ATOMIC_INIT(0),
103 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104};
105 101
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 102static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{ 103{
@@ -492,7 +488,7 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
492 skb->csum_offset = offsetof(struct tcphdr, check); 488 skb->csum_offset = offsetof(struct tcphdr, check);
493 } else { 489 } else {
494 th->check = tcp_v4_check(len, inet->saddr, inet->daddr, 490 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495 csum_partial((char *)th, 491 csum_partial(th,
496 th->doff << 2, 492 th->doff << 2,
497 skb->csum)); 493 skb->csum));
498 } 494 }
@@ -726,7 +722,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
726 th->check = tcp_v4_check(skb->len, 722 th->check = tcp_v4_check(skb->len,
727 ireq->loc_addr, 723 ireq->loc_addr,
728 ireq->rmt_addr, 724 ireq->rmt_addr,
729 csum_partial((char *)th, skb->len, 725 csum_partial(th, skb->len,
730 skb->csum)); 726 skb->csum));
731 727
732 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, 728 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
@@ -1801,7 +1797,7 @@ static int tcp_v4_init_sock(struct sock *sk)
1801 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1797 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1802 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1798 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1803 1799
1804 atomic_inc(&tcp_sockets_allocated); 1800 percpu_counter_inc(&tcp_sockets_allocated);
1805 1801
1806 return 0; 1802 return 0;
1807} 1803}
@@ -1849,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1849 sk->sk_sndmsg_page = NULL; 1845 sk->sk_sndmsg_page = NULL;
1850 } 1846 }
1851 1847
1852 atomic_dec(&tcp_sockets_allocated); 1848 percpu_counter_dec(&tcp_sockets_allocated);
1853} 1849}
1854 1850
1855EXPORT_SYMBOL(tcp_v4_destroy_sock); 1851EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -1857,32 +1853,35 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
1857#ifdef CONFIG_PROC_FS 1853#ifdef CONFIG_PROC_FS
1858/* Proc filesystem TCP sock list dumping. */ 1854/* Proc filesystem TCP sock list dumping. */
1859 1855
1860static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) 1856static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1861{ 1857{
1862 return hlist_empty(head) ? NULL : 1858 return hlist_nulls_empty(head) ? NULL :
1863 list_entry(head->first, struct inet_timewait_sock, tw_node); 1859 list_entry(head->first, struct inet_timewait_sock, tw_node);
1864} 1860}
1865 1861
1866static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) 1862static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1867{ 1863{
1868 return tw->tw_node.next ? 1864 return !is_a_nulls(tw->tw_node.next) ?
1869 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 1865 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1870} 1866}
1871 1867
1872static void *listening_get_next(struct seq_file *seq, void *cur) 1868static void *listening_get_next(struct seq_file *seq, void *cur)
1873{ 1869{
1874 struct inet_connection_sock *icsk; 1870 struct inet_connection_sock *icsk;
1875 struct hlist_node *node; 1871 struct hlist_nulls_node *node;
1876 struct sock *sk = cur; 1872 struct sock *sk = cur;
1873 struct inet_listen_hashbucket *ilb;
1877 struct tcp_iter_state *st = seq->private; 1874 struct tcp_iter_state *st = seq->private;
1878 struct net *net = seq_file_net(seq); 1875 struct net *net = seq_file_net(seq);
1879 1876
1880 if (!sk) { 1877 if (!sk) {
1881 st->bucket = 0; 1878 st->bucket = 0;
1882 sk = sk_head(&tcp_hashinfo.listening_hash[0]); 1879 ilb = &tcp_hashinfo.listening_hash[0];
1880 spin_lock_bh(&ilb->lock);
1881 sk = sk_nulls_head(&ilb->head);
1883 goto get_sk; 1882 goto get_sk;
1884 } 1883 }
1885 1884 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1886 ++st->num; 1885 ++st->num;
1887 1886
1888 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1887 if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1915,7 +1914,7 @@ get_req:
1915 sk = sk_next(sk); 1914 sk = sk_next(sk);
1916 } 1915 }
1917get_sk: 1916get_sk:
1918 sk_for_each_from(sk, node) { 1917 sk_nulls_for_each_from(sk, node) {
1919 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { 1918 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1920 cur = sk; 1919 cur = sk;
1921 goto out; 1920 goto out;
@@ -1932,8 +1931,11 @@ start_req:
1932 } 1931 }
1933 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 1932 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1934 } 1933 }
1934 spin_unlock_bh(&ilb->lock);
1935 if (++st->bucket < INET_LHTABLE_SIZE) { 1935 if (++st->bucket < INET_LHTABLE_SIZE) {
1936 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); 1936 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1937 spin_lock_bh(&ilb->lock);
1938 sk = sk_nulls_head(&ilb->head);
1937 goto get_sk; 1939 goto get_sk;
1938 } 1940 }
1939 cur = NULL; 1941 cur = NULL;
@@ -1954,8 +1956,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1954 1956
1955static inline int empty_bucket(struct tcp_iter_state *st) 1957static inline int empty_bucket(struct tcp_iter_state *st)
1956{ 1958{
1957 return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && 1959 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1958 hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 1960 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1959} 1961}
1960 1962
1961static void *established_get_first(struct seq_file *seq) 1963static void *established_get_first(struct seq_file *seq)
@@ -1966,16 +1968,16 @@ static void *established_get_first(struct seq_file *seq)
1966 1968
1967 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { 1969 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1968 struct sock *sk; 1970 struct sock *sk;
1969 struct hlist_node *node; 1971 struct hlist_nulls_node *node;
1970 struct inet_timewait_sock *tw; 1972 struct inet_timewait_sock *tw;
1971 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 1973 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1972 1974
1973 /* Lockless fast path for the common case of empty buckets */ 1975 /* Lockless fast path for the common case of empty buckets */
1974 if (empty_bucket(st)) 1976 if (empty_bucket(st))
1975 continue; 1977 continue;
1976 1978
1977 read_lock_bh(lock); 1979 spin_lock_bh(lock);
1978 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 1980 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1979 if (sk->sk_family != st->family || 1981 if (sk->sk_family != st->family ||
1980 !net_eq(sock_net(sk), net)) { 1982 !net_eq(sock_net(sk), net)) {
1981 continue; 1983 continue;
@@ -1993,7 +1995,7 @@ static void *established_get_first(struct seq_file *seq)
1993 rc = tw; 1995 rc = tw;
1994 goto out; 1996 goto out;
1995 } 1997 }
1996 read_unlock_bh(lock); 1998 spin_unlock_bh(lock);
1997 st->state = TCP_SEQ_STATE_ESTABLISHED; 1999 st->state = TCP_SEQ_STATE_ESTABLISHED;
1998 } 2000 }
1999out: 2001out:
@@ -2004,7 +2006,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
2004{ 2006{
2005 struct sock *sk = cur; 2007 struct sock *sk = cur;
2006 struct inet_timewait_sock *tw; 2008 struct inet_timewait_sock *tw;
2007 struct hlist_node *node; 2009 struct hlist_nulls_node *node;
2008 struct tcp_iter_state *st = seq->private; 2010 struct tcp_iter_state *st = seq->private;
2009 struct net *net = seq_file_net(seq); 2011 struct net *net = seq_file_net(seq);
2010 2012
@@ -2021,7 +2023,7 @@ get_tw:
2021 cur = tw; 2023 cur = tw;
2022 goto out; 2024 goto out;
2023 } 2025 }
2024 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2026 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2025 st->state = TCP_SEQ_STATE_ESTABLISHED; 2027 st->state = TCP_SEQ_STATE_ESTABLISHED;
2026 2028
2027 /* Look for next non empty bucket */ 2029 /* Look for next non empty bucket */
@@ -2031,12 +2033,12 @@ get_tw:
2031 if (st->bucket >= tcp_hashinfo.ehash_size) 2033 if (st->bucket >= tcp_hashinfo.ehash_size)
2032 return NULL; 2034 return NULL;
2033 2035
2034 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2036 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2035 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); 2037 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2036 } else 2038 } else
2037 sk = sk_next(sk); 2039 sk = sk_nulls_next(sk);
2038 2040
2039 sk_for_each_from(sk, node) { 2041 sk_nulls_for_each_from(sk, node) {
2040 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) 2042 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2041 goto found; 2043 goto found;
2042 } 2044 }
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2066 void *rc; 2068 void *rc;
2067 struct tcp_iter_state *st = seq->private; 2069 struct tcp_iter_state *st = seq->private;
2068 2070
2069 inet_listen_lock(&tcp_hashinfo);
2070 st->state = TCP_SEQ_STATE_LISTENING; 2071 st->state = TCP_SEQ_STATE_LISTENING;
2071 rc = listening_get_idx(seq, &pos); 2072 rc = listening_get_idx(seq, &pos);
2072 2073
2073 if (!rc) { 2074 if (!rc) {
2074 inet_listen_unlock(&tcp_hashinfo);
2075 st->state = TCP_SEQ_STATE_ESTABLISHED; 2075 st->state = TCP_SEQ_STATE_ESTABLISHED;
2076 rc = established_get_idx(seq, pos); 2076 rc = established_get_idx(seq, pos);
2077 } 2077 }
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2103 case TCP_SEQ_STATE_LISTENING: 2103 case TCP_SEQ_STATE_LISTENING:
2104 rc = listening_get_next(seq, v); 2104 rc = listening_get_next(seq, v);
2105 if (!rc) { 2105 if (!rc) {
2106 inet_listen_unlock(&tcp_hashinfo);
2107 st->state = TCP_SEQ_STATE_ESTABLISHED; 2106 st->state = TCP_SEQ_STATE_ESTABLISHED;
2108 rc = established_get_first(seq); 2107 rc = established_get_first(seq);
2109 } 2108 }
@@ -2130,12 +2129,12 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2130 } 2129 }
2131 case TCP_SEQ_STATE_LISTENING: 2130 case TCP_SEQ_STATE_LISTENING:
2132 if (v != SEQ_START_TOKEN) 2131 if (v != SEQ_START_TOKEN)
2133 inet_listen_unlock(&tcp_hashinfo); 2132 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2134 break; 2133 break;
2135 case TCP_SEQ_STATE_TIME_WAIT: 2134 case TCP_SEQ_STATE_TIME_WAIT:
2136 case TCP_SEQ_STATE_ESTABLISHED: 2135 case TCP_SEQ_STATE_ESTABLISHED:
2137 if (v) 2136 if (v)
2138 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2137 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2139 break; 2138 break;
2140 } 2139 }
2141} 2140}
@@ -2375,6 +2374,7 @@ struct proto tcp_prot = {
2375 .sysctl_rmem = sysctl_tcp_rmem, 2374 .sysctl_rmem = sysctl_tcp_rmem,
2376 .max_header = MAX_TCP_HEADER, 2375 .max_header = MAX_TCP_HEADER,
2377 .obj_size = sizeof(struct tcp_sock), 2376 .obj_size = sizeof(struct tcp_sock),
2377 .slab_flags = SLAB_DESTROY_BY_RCU,
2378 .twsk_prot = &tcp_timewait_sock_ops, 2378 .twsk_prot = &tcp_timewait_sock_ops,
2379 .rsk_prot = &tcp_request_sock_ops, 2379 .rsk_prot = &tcp_request_sock_ops,
2380 .h.hashinfo = &tcp_hashinfo, 2380 .h.hashinfo = &tcp_hashinfo,
@@ -2404,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
2404 2404
2405void __init tcp_v4_init(void) 2405void __init tcp_v4_init(void)
2406{ 2406{
2407 inet_hashinfo_init(&tcp_hashinfo);
2407 if (register_pernet_device(&tcp_sk_ops)) 2408 if (register_pernet_device(&tcp_sk_ops))
2408 panic("Failed to create the TCP control socket.\n"); 2409 panic("Failed to create the TCP control socket.\n");
2409} 2410}
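
The /proc TCP iterator now locks one bucket of tcp_hashinfo.listening_hash[] at a time (spin_lock_bh(&ilb->lock)) instead of taking the old global inet_listen_lock for the whole dump, dropping each bucket's lock before moving on to the next. A rough user-space shape of that walk, with pthread mutexes standing in for the per-bucket spinlocks:

#include <pthread.h>
#include <stdio.h>

#define LHTABLE_SIZE 4

struct lnode { int value; struct lnode *next; };

static struct bucket {
	pthread_mutex_t lock;		/* stands in for ilb->lock */
	struct lnode *head;
} table[LHTABLE_SIZE];

/* Visit every entry while holding only the lock of the bucket being walked,
 * as listening_get_next() does after the per-bucket conversion. */
static void walk_all(void (*visit)(const struct lnode *))
{
	int b;

	for (b = 0; b < LHTABLE_SIZE; b++) {
		const struct lnode *n;

		pthread_mutex_lock(&table[b].lock);
		for (n = table[b].head; n; n = n->next)
			visit(n);
		pthread_mutex_unlock(&table[b].lock);
	}
}

static void print_node(const struct lnode *n) { printf("%d ", n->value); }

int main(void)
{
	static struct lnode a = { 1, NULL }, b = { 2, NULL };
	int i;

	for (i = 0; i < LHTABLE_SIZE; i++)
		pthread_mutex_init(&table[i].lock, NULL);
	table[0].head = &a;
	table[3].head = &b;
	walk_all(print_node);
	printf("\n");
	return 0;
}
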
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a524627923ae..76f840917bcb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -722,7 +722,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
722static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, 722static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
723 unsigned int mss_now) 723 unsigned int mss_now)
724{ 724{
725 if (skb->len <= mss_now || !sk_can_gso(sk)) { 725 if (skb->len <= mss_now || !sk_can_gso(sk) ||
726 tcp_urg_mode(tcp_sk(sk))) {
726 /* Avoid the costly divide in the normal 727 /* Avoid the costly divide in the normal
727 * non-TSO case. 728 * non-TSO case.
728 */ 729 */
@@ -1163,7 +1164,9 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
1163{ 1164{
1164 int tso_segs = tcp_skb_pcount(skb); 1165 int tso_segs = tcp_skb_pcount(skb);
1165 1166
1166 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { 1167 if (!tso_segs ||
1168 (tso_segs > 1 && (tcp_skb_mss(skb) != mss_now ||
1169 tcp_urg_mode(tcp_sk(sk))))) {
1167 tcp_set_skb_tso_segs(sk, skb, mss_now); 1170 tcp_set_skb_tso_segs(sk, skb, mss_now);
1168 tso_segs = tcp_skb_pcount(skb); 1171 tso_segs = tcp_skb_pcount(skb);
1169 } 1172 }
@@ -1766,46 +1769,22 @@ u32 __tcp_select_window(struct sock *sk)
1766 return window; 1769 return window;
1767} 1770}
1768 1771
1769/* Attempt to collapse two adjacent SKB's during retransmission. */ 1772/* Collapses two adjacent SKB's during retransmission. */
1770static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, 1773static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1771 int mss_now)
1772{ 1774{
1773 struct tcp_sock *tp = tcp_sk(sk); 1775 struct tcp_sock *tp = tcp_sk(sk);
1774 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 1776 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1775 int skb_size, next_skb_size; 1777 int skb_size, next_skb_size;
1776 u16 flags; 1778 u16 flags;
1777 1779
1778 /* The first test we must make is that neither of these two
1779 * SKB's are still referenced by someone else.
1780 */
1781 if (skb_cloned(skb) || skb_cloned(next_skb))
1782 return;
1783
1784 skb_size = skb->len; 1780 skb_size = skb->len;
1785 next_skb_size = next_skb->len; 1781 next_skb_size = next_skb->len;
1786 flags = TCP_SKB_CB(skb)->flags; 1782 flags = TCP_SKB_CB(skb)->flags;
1787 1783
1788 /* Also punt if next skb has been SACK'd. */
1789 if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
1790 return;
1791
1792 /* Next skb is out of window. */
1793 if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
1794 return;
1795
1796 /* Punt if not enough space exists in the first SKB for
1797 * the data in the second, or the total combined payload
1798 * would exceed the MSS.
1799 */
1800 if ((next_skb_size > skb_tailroom(skb)) ||
1801 ((skb_size + next_skb_size) > mss_now))
1802 return;
1803
1804 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 1784 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
1805 1785
1806 tcp_highest_sack_combine(sk, next_skb, skb); 1786 tcp_highest_sack_combine(sk, next_skb, skb);
1807 1787
1808 /* Ok. We will be able to collapse the packet. */
1809 tcp_unlink_write_queue(next_skb, sk); 1788 tcp_unlink_write_queue(next_skb, sk);
1810 1789
1811 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), 1790 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
@@ -1847,54 +1826,60 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
1847 sk_wmem_free_skb(sk, next_skb); 1826 sk_wmem_free_skb(sk, next_skb);
1848} 1827}
1849 1828
1850/* Do a simple retransmit without using the backoff mechanisms in 1829static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
1851 * tcp_timer. This is used for path mtu discovery. 1830{
1852 * The socket is already locked here. 1831 if (tcp_skb_pcount(skb) > 1)
1853 */ 1832 return 0;
1854void tcp_simple_retransmit(struct sock *sk) 1833 /* TODO: SACK collapsing could be used to remove this condition */
1834 if (skb_shinfo(skb)->nr_frags != 0)
1835 return 0;
1836 if (skb_cloned(skb))
1837 return 0;
1838 if (skb == tcp_send_head(sk))
1839 return 0;
1840	/* Some heuristics for collapsing over SACK'd could be invented */
1841 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1842 return 0;
1843
1844 return 1;
1845}
1846
1847static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
1848 int space)
1855{ 1849{
1856 const struct inet_connection_sock *icsk = inet_csk(sk);
1857 struct tcp_sock *tp = tcp_sk(sk); 1850 struct tcp_sock *tp = tcp_sk(sk);
1858 struct sk_buff *skb; 1851 struct sk_buff *skb = to, *tmp;
1859 unsigned int mss = tcp_current_mss(sk, 0); 1852 int first = 1;
1860 u32 prior_lost = tp->lost_out;
1861 1853
1862 tcp_for_write_queue(skb, sk) { 1854 if (!sysctl_tcp_retrans_collapse)
1863 if (skb == tcp_send_head(sk)) 1855 return;
1856 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
1857 return;
1858
1859 tcp_for_write_queue_from_safe(skb, tmp, sk) {
1860 if (!tcp_can_collapse(sk, skb))
1864 break; 1861 break;
1865 if (skb->len > mss &&
1866 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1867 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
1868 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1869 tp->retrans_out -= tcp_skb_pcount(skb);
1870 }
1871 tcp_skb_mark_lost_uncond_verify(tp, skb);
1872 }
1873 }
1874 1862
1875 tcp_clear_retrans_hints_partial(tp); 1863 space -= skb->len;
1876 1864
1877 if (prior_lost == tp->lost_out) 1865 if (first) {
1878 return; 1866 first = 0;
1867 continue;
1868 }
1879 1869
1880 if (tcp_is_reno(tp)) 1870 if (space < 0)
1881 tcp_limit_reno_sacked(tp); 1871 break;
1872 /* Punt if not enough space exists in the first SKB for
1873 * the data in the second
1874 */
1875 if (skb->len > skb_tailroom(to))
1876 break;
1882 1877
1883 tcp_verify_left_out(tp); 1878 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
1879 break;
1884 1880
1885 /* Don't muck with the congestion window here. 1881 tcp_collapse_retrans(sk, to);
1886 * Reason is that we do not increase amount of _data_
1887 * in network, but units changed and effective
1888 * cwnd/ssthresh really reduced now.
1889 */
1890 if (icsk->icsk_ca_state != TCP_CA_Loss) {
1891 tp->high_seq = tp->snd_nxt;
1892 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1893 tp->prior_ssthresh = 0;
1894 tp->undo_marker = 0;
1895 tcp_set_ca_state(sk, TCP_CA_Loss);
1896 } 1882 }
1897 tcp_xmit_retransmit_queue(sk);
1898} 1883}
1899 1884
1900/* This retransmits one SKB. Policy decisions and retransmit queue 1885/* This retransmits one SKB. Policy decisions and retransmit queue
@@ -1946,17 +1931,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1946 return -ENOMEM; /* We'll try again later. */ 1931 return -ENOMEM; /* We'll try again later. */
1947 } 1932 }
1948 1933
1949 /* Collapse two adjacent packets if worthwhile and we can. */ 1934 tcp_retrans_try_collapse(sk, skb, cur_mss);
1950 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1951 (skb->len < (cur_mss >> 1)) &&
1952 (!tcp_skb_is_last(sk, skb)) &&
1953 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1954 (skb_shinfo(skb)->nr_frags == 0 &&
1955 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1956 (tcp_skb_pcount(skb) == 1 &&
1957 tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
1958 (sysctl_tcp_retrans_collapse != 0))
1959 tcp_retrans_try_collapse(sk, skb, cur_mss);
1960 1935
1961 /* Some Solaris stacks overoptimize and ignore the FIN on a 1936 /* Some Solaris stacks overoptimize and ignore the FIN on a
1962 * retransmit when old data is attached. So strip it off 1937 * retransmit when old data is attached. So strip it off
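
tcp_retrans_try_collapse() is now a loop: starting from the skb being retransmitted it keeps folding the following skbs into it while each candidate passes tcp_can_collapse() and the space budget derived from cur_mss is not exhausted, replacing the old single-shot merge behind one long condition in tcp_retransmit_skb(). A sketch of that budgeted loop over toy buffer lengths, with tcp_can_collapse() reduced to a trivial predicate:

#include <stdio.h>

#define NSKB 5

/* Greedily fold following buffers into the first one while they fit in
 * 'space', echoing the control flow of the reworked tcp_retrans_try_collapse(). */
static int collapse(int lens[], int n, int space)
{
	int merged = 0, first = 1, i;

	for (i = 0; i < n; i++) {
		if (lens[i] <= 0)	/* stand-in for !tcp_can_collapse() */
			break;

		space -= lens[i];

		if (first) {		/* the first skb is only the merge target */
			first = 0;
			continue;
		}
		if (space < 0)		/* budget exhausted: stop merging */
			break;

		lens[0] += lens[i];	/* "tcp_collapse_retrans(sk, to)" */
		lens[i] = 0;
		merged++;
	}
	return merged;
}

int main(void)
{
	int lens[NSKB] = { 400, 300, 200, 600, 500 };

	/* With a 1460-byte budget the 300- and 200-byte buffers are folded in;
	 * the 600-byte one would push the running total past the budget. */
	printf("merged %d, head now %d bytes\n", collapse(lens, NSKB, 1460), lens[0]);
	return 0;
}
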
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3df339e3e363..cc4e6d27dedc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -65,7 +65,7 @@ static void tcp_write_err(struct sock *sk)
65static int tcp_out_of_resources(struct sock *sk, int do_reset) 65static int tcp_out_of_resources(struct sock *sk, int do_reset)
66{ 66{
67 struct tcp_sock *tp = tcp_sk(sk); 67 struct tcp_sock *tp = tcp_sk(sk);
68 int orphans = atomic_read(&tcp_orphan_count); 68 int orphans = percpu_counter_read_positive(&tcp_orphan_count);
69 69
70 /* If peer does not open window for long time, or did not transmit 70 /* If peer does not open window for long time, or did not transmit
71 * anything for long time, penalize it. */ 71 * anything for long time, penalize it. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7e4d9c871153..cf5ab0581eba 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
127 const struct sock *sk2)) 127 const struct sock *sk2))
128{ 128{
129 struct sock *sk2; 129 struct sock *sk2;
130 struct hlist_node *node; 130 struct hlist_nulls_node *node;
131 131
132 sk_for_each(sk2, node, &hslot->head) 132 sk_nulls_for_each(sk2, node, &hslot->head)
133 if (net_eq(sock_net(sk2), net) && 133 if (net_eq(sock_net(sk2), net) &&
134 sk2 != sk && 134 sk2 != sk &&
135 sk2->sk_hash == num && 135 sk2->sk_hash == num &&
@@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
189 inet_sk(sk)->num = snum; 189 inet_sk(sk)->num = snum;
190 sk->sk_hash = snum; 190 sk->sk_hash = snum;
191 if (sk_unhashed(sk)) { 191 if (sk_unhashed(sk)) {
192 /* 192 sk_nulls_add_node_rcu(sk, &hslot->head);
193 * We need that previous write to sk->sk_hash committed
194 * before write to sk->next done in following add_node() variant
195 */
196 smp_wmb();
197 sk_add_node_rcu(sk, &hslot->head);
198 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 193 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
199 } 194 }
200 error = 0; 195 error = 0;
@@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
261 int dif, struct udp_table *udptable) 256 int dif, struct udp_table *udptable)
262{ 257{
263 struct sock *sk, *result; 258 struct sock *sk, *result;
264 struct hlist_node *node, *next; 259 struct hlist_nulls_node *node;
265 unsigned short hnum = ntohs(dport); 260 unsigned short hnum = ntohs(dport);
266 unsigned int hash = udp_hashfn(net, hnum); 261 unsigned int hash = udp_hashfn(net, hnum);
267 struct udp_hslot *hslot = &udptable->hash[hash]; 262 struct udp_hslot *hslot = &udptable->hash[hash];
@@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
271begin: 266begin:
272 result = NULL; 267 result = NULL;
273 badness = -1; 268 badness = -1;
274 sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { 269 sk_nulls_for_each_rcu(sk, node, &hslot->head) {
275 /*
276 * lockless reader, and SLAB_DESTROY_BY_RCU items:
277 * We must check this item was not moved to another chain
278 */
279 if (udp_hashfn(net, sk->sk_hash) != hash)
280 goto begin;
281 score = compute_score(sk, net, saddr, hnum, sport, 270 score = compute_score(sk, net, saddr, hnum, sport,
282 daddr, dport, dif); 271 daddr, dport, dif);
283 if (score > badness) { 272 if (score > badness) {
@@ -285,6 +274,14 @@ begin:
285 badness = score; 274 badness = score;
286 } 275 }
287 } 276 }
277 /*
278 * if the nulls value we got at the end of this lookup is
279 * not the expected one, we must restart lookup.
280 * We probably met an item that was moved to another chain.
281 */
282 if (get_nulls_value(node) != hash)
283 goto begin;
284
288 if (result) { 285 if (result) {
289 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 286 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
290 result = NULL; 287 result = NULL;
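This is the core of the conversion: the per-entry re-hash check inside the loop is replaced by one test of the chain's terminating "nulls" value. With SLAB_DESTROY_BY_RCU a socket can be freed and reused (and re-hashed into another bucket) while a lockless reader still holds a pointer to it, so the reader may silently wander onto a different chain; because every chain now ends in a nulls marker naming its own bucket, the reader detects that at the end of the walk and restarts. A generic sketch of the pattern (struct item is illustrative; the caller is assumed to hold rcu_read_lock()):

    #include <linux/rculist_nulls.h>

    struct item {
            struct hlist_nulls_node node;
            unsigned int            hash;
    };

    static struct item *nulls_lookup(struct hlist_nulls_head *table,
                                     unsigned int hash, unsigned int mask)
    {
            struct hlist_nulls_node *pos;
            struct item *it;
            unsigned int slot = hash & mask;

    begin:
            hlist_nulls_for_each_entry_rcu(it, pos, &table[slot], node) {
                    if (it->hash == hash)
                            return it;
            }
            /* The terminating pointer encodes the bucket it belongs to.  If it
             * is not ours, a concurrent free-and-reuse moved us onto another
             * chain: restart the walk from our own bucket head. */
            if (get_nulls_value(pos) != slot)
                    goto begin;
            return NULL;
    }

As the surrounding lines show, the real lookup still takes a reference with atomic_inc_not_zero() and re-validates the winner afterwards, since the entry it chose can be reused at any moment.
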
@@ -320,19 +317,20 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
320} 317}
321EXPORT_SYMBOL_GPL(udp4_lib_lookup); 318EXPORT_SYMBOL_GPL(udp4_lib_lookup);
322 319
323static inline struct sock *udp_v4_mcast_next(struct sock *sk, 320static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
324 __be16 loc_port, __be32 loc_addr, 321 __be16 loc_port, __be32 loc_addr,
325 __be16 rmt_port, __be32 rmt_addr, 322 __be16 rmt_port, __be32 rmt_addr,
326 int dif) 323 int dif)
327{ 324{
328 struct hlist_node *node; 325 struct hlist_nulls_node *node;
329 struct sock *s = sk; 326 struct sock *s = sk;
330 unsigned short hnum = ntohs(loc_port); 327 unsigned short hnum = ntohs(loc_port);
331 328
332 sk_for_each_from(s, node) { 329 sk_nulls_for_each_from(s, node) {
333 struct inet_sock *inet = inet_sk(s); 330 struct inet_sock *inet = inet_sk(s);
334 331
335 if (s->sk_hash != hnum || 332 if (!net_eq(sock_net(s), net) ||
333 s->sk_hash != hnum ||
336 (inet->daddr && inet->daddr != rmt_addr) || 334 (inet->daddr && inet->daddr != rmt_addr) ||
337 (inet->dport != rmt_port && inet->dport) || 335 (inet->dport != rmt_port && inet->dport) ||
338 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || 336 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
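udp_v4_mcast_next() also gains the packet's struct net and skips sockets from other namespaces: the UDP hash table is shared by all namespaces, so every walker has to filter on sock_net(). A one-line sketch of the check being added throughout this file:

    #include <net/net_namespace.h>
    #include <net/sock.h>

    /* Sketch: shared hash table, per-namespace delivery. */
    static inline int belongs_to_ns(const struct net *net, const struct sock *sk)
    {
            return net_eq(sock_net(sk), net);
    }
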
@@ -668,6 +666,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
668 .saddr = saddr, 666 .saddr = saddr,
669 .tos = tos } }, 667 .tos = tos } },
670 .proto = sk->sk_protocol, 668 .proto = sk->sk_protocol,
669 .flags = inet_sk_flowi_flags(sk),
671 .uli_u = { .ports = 670 .uli_u = { .ports =
672 { .sport = inet->sport, 671 { .sport = inet->sport,
673 .dport = dport } } }; 672 .dport = dport } } };
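The new .flags member lets per-socket properties reach the route lookup performed for this flow. At this point inet_sk_flowi_flags() is essentially a one-liner of the following shape (a sketch: the transparent-proxy bit is assumed to be its only input here):

    #include <net/inet_sock.h>
    #include <net/flow.h>

    /* Approximation of inet_sk_flowi_flags(): an IP_TRANSPARENT socket may
     * send from a non-local source address, and FLOWI_FLAG_ANYSRC tells the
     * routing code to accept that. */
    static inline __u8 flowi_flags_for(const struct sock *sk)
    {
            return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
    }
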
@@ -720,7 +719,7 @@ do_append_data:
720 up->len += ulen; 719 up->len += ulen;
721 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; 720 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
722 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, 721 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
723 sizeof(struct udphdr), &ipc, rt, 722 sizeof(struct udphdr), &ipc, &rt,
724 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); 723 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
725 if (err) 724 if (err)
726 udp_flush_pending_frames(sk); 725 udp_flush_pending_frames(sk);
@@ -971,16 +970,18 @@ int udp_disconnect(struct sock *sk, int flags)
971 970
972void udp_lib_unhash(struct sock *sk) 971void udp_lib_unhash(struct sock *sk)
973{ 972{
974 struct udp_table *udptable = sk->sk_prot->h.udp_table; 973 if (sk_hashed(sk)) {
975 unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); 974 struct udp_table *udptable = sk->sk_prot->h.udp_table;
976 struct udp_hslot *hslot = &udptable->hash[hash]; 975 unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
976 struct udp_hslot *hslot = &udptable->hash[hash];
977 977
978 spin_lock_bh(&hslot->lock); 978 spin_lock_bh(&hslot->lock);
979 if (sk_del_node_init_rcu(sk)) { 979 if (sk_nulls_del_node_init_rcu(sk)) {
980 inet_sk(sk)->num = 0; 980 inet_sk(sk)->num = 0;
981 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 981 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
982 }
983 spin_unlock_bh(&hslot->lock);
982 } 984 }
983 spin_unlock_bh(&hslot->lock);
984} 985}
985EXPORT_SYMBOL(udp_lib_unhash); 986EXPORT_SYMBOL(udp_lib_unhash);
986 987
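Because the old and new columns interleave above, the reshaped function is easier to read reconstructed from the right-hand side; the point of the change is the sk_hashed() guard, which keeps sockets that were never hashed from taking the slot lock at all:

    void udp_lib_unhash(struct sock *sk)
    {
            if (sk_hashed(sk)) {
                    struct udp_table *udptable = sk->sk_prot->h.udp_table;
                    unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
                    struct udp_hslot *hslot = &udptable->hash[hash];

                    spin_lock_bh(&hslot->lock);
                    if (sk_nulls_del_node_init_rcu(sk)) {
                            inet_sk(sk)->num = 0;
                            sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
                    }
                    spin_unlock_bh(&hslot->lock);
            }
    }
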
@@ -1129,17 +1130,18 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1129 int dif; 1130 int dif;
1130 1131
1131 spin_lock(&hslot->lock); 1132 spin_lock(&hslot->lock);
1132 sk = sk_head(&hslot->head); 1133 sk = sk_nulls_head(&hslot->head);
1133 dif = skb->dev->ifindex; 1134 dif = skb->dev->ifindex;
1134 sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); 1135 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
1135 if (sk) { 1136 if (sk) {
1136 struct sock *sknext = NULL; 1137 struct sock *sknext = NULL;
1137 1138
1138 do { 1139 do {
1139 struct sk_buff *skb1 = skb; 1140 struct sk_buff *skb1 = skb;
1140 1141
1141 sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, 1142 sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1142 uh->source, saddr, dif); 1143 daddr, uh->source, saddr,
1144 dif);
1143 if (sknext) 1145 if (sknext)
1144 skb1 = skb_clone(skb, GFP_ATOMIC); 1146 skb1 = skb_clone(skb, GFP_ATOMIC);
1145 1147
@@ -1558,10 +1560,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
1558 struct net *net = seq_file_net(seq); 1560 struct net *net = seq_file_net(seq);
1559 1561
1560 for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { 1562 for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
1561 struct hlist_node *node; 1563 struct hlist_nulls_node *node;
1562 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; 1564 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
1563 spin_lock_bh(&hslot->lock); 1565 spin_lock_bh(&hslot->lock);
1564 sk_for_each(sk, node, &hslot->head) { 1566 sk_nulls_for_each(sk, node, &hslot->head) {
1565 if (!net_eq(sock_net(sk), net)) 1567 if (!net_eq(sock_net(sk), net))
1566 continue; 1568 continue;
1567 if (sk->sk_family == state->family) 1569 if (sk->sk_family == state->family)
@@ -1580,7 +1582,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1580 struct net *net = seq_file_net(seq); 1582 struct net *net = seq_file_net(seq);
1581 1583
1582 do { 1584 do {
1583 sk = sk_next(sk); 1585 sk = sk_nulls_next(sk);
1584 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); 1586 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1585 1587
1586 if (!sk) { 1588 if (!sk) {
@@ -1751,7 +1753,7 @@ void __init udp_table_init(struct udp_table *table)
1751 int i; 1753 int i;
1752 1754
1753 for (i = 0; i < UDP_HTABLE_SIZE; i++) { 1755 for (i = 0; i < UDP_HTABLE_SIZE; i++) {
1754 INIT_HLIST_HEAD(&table->hash[i].head); 1756 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
1755 spin_lock_init(&table->hash[i].lock); 1757 spin_lock_init(&table->hash[i].lock);
1756 } 1758 }
1757} 1759}
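Each bucket's list is initialised with its own index as the nulls value; that per-bucket marker is what the end-of-chain test in the lookup above compares against. A standalone sketch of the same initialisation (sizes and names illustrative):

    #include <linux/list_nulls.h>
    #include <linux/spinlock.h>

    #define MY_HTABLE_SIZE 128

    struct my_hslot {
            struct hlist_nulls_head head;
            spinlock_t              lock;
    };

    static struct my_hslot my_table[MY_HTABLE_SIZE];

    static void my_table_init(void)
    {
            int i;

            for (i = 0; i < MY_HTABLE_SIZE; i++) {
                    /* The second argument becomes the chain's terminating nulls
                     * value, so each chain ends by naming its own bucket. */
                    INIT_HLIST_NULLS_HEAD(&my_table[i].head, i);
                    spin_lock_init(&my_table[i].lock);
            }
    }
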
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index f9a775b7e796..2ad24ba31f9d 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -18,7 +18,8 @@
18static struct dst_ops xfrm4_dst_ops; 18static struct dst_ops xfrm4_dst_ops;
19static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
20 20
21static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, 21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
22 xfrm_address_t *saddr,
22 xfrm_address_t *daddr) 23 xfrm_address_t *daddr)
23{ 24{
24 struct flowi fl = { 25 struct flowi fl = {
@@ -36,19 +37,20 @@ static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr,
36 if (saddr) 37 if (saddr)
37 fl.fl4_src = saddr->a4; 38 fl.fl4_src = saddr->a4;
38 39
39 err = __ip_route_output_key(&init_net, &rt, &fl); 40 err = __ip_route_output_key(net, &rt, &fl);
40 dst = &rt->u.dst; 41 dst = &rt->u.dst;
41 if (err) 42 if (err)
42 dst = ERR_PTR(err); 43 dst = ERR_PTR(err);
43 return dst; 44 return dst;
44} 45}
45 46
46static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) 47static int xfrm4_get_saddr(struct net *net,
48 xfrm_address_t *saddr, xfrm_address_t *daddr)
47{ 49{
48 struct dst_entry *dst; 50 struct dst_entry *dst;
49 struct rtable *rt; 51 struct rtable *rt;
50 52
51 dst = xfrm4_dst_lookup(0, NULL, daddr); 53 dst = xfrm4_dst_lookup(net, 0, NULL, daddr);
52 if (IS_ERR(dst)) 54 if (IS_ERR(dst))
53 return -EHOSTUNREACH; 55 return -EHOSTUNREACH;
54 56
@@ -187,7 +189,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
187 189
188static inline int xfrm4_garbage_collect(struct dst_ops *ops) 190static inline int xfrm4_garbage_collect(struct dst_ops *ops)
189{ 191{
190 xfrm4_policy_afinfo.garbage_collect(); 192 xfrm4_policy_afinfo.garbage_collect(&init_net);
191 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); 193 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
192} 194}
193 195
@@ -246,7 +248,6 @@ static struct dst_ops xfrm4_dst_ops = {
246 .ifdown = xfrm4_dst_ifdown, 248 .ifdown = xfrm4_dst_ifdown,
247 .local_out = __ip_local_out, 249 .local_out = __ip_local_out,
248 .gc_thresh = 1024, 250 .gc_thresh = 1024,
249 .entry_size = sizeof(struct xfrm_dst),
250 .entries = ATOMIC_INIT(0), 251 .entries = ATOMIC_INIT(0),
251}; 252};
252 253
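xfrm4_dst_lookup() and xfrm4_get_saddr() now take an explicit struct net, so IPsec route lookups run in the caller's namespace rather than the hard-coded init_net (the garbage-collect path above still passes &init_net). Condensed from the hunk, the namespace-aware lookup amounts to the following sketch:

    #include <linux/err.h>
    #include <net/route.h>

    /* Sketch following the hunk: route the flow in the given namespace and
     * hand back either the dst or an ERR_PTR. */
    static struct dst_entry *dst_lookup_in(struct net *net, struct flowi *fl)
    {
            struct rtable *rt;
            int err = __ip_route_output_key(net, &rt, fl);

            return err ? ERR_PTR(err) : &rt->u.dst;
    }
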
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 07735ed280d7..1ef1366a0a03 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -13,8 +13,6 @@
13#include <linux/ipsec.h> 13#include <linux/ipsec.h>
14#include <linux/netfilter_ipv4.h> 14#include <linux/netfilter_ipv4.h>
15 15
16static struct xfrm_state_afinfo xfrm4_state_afinfo;
17
18static int xfrm4_init_flags(struct xfrm_state *x) 16static int xfrm4_init_flags(struct xfrm_state *x)
19{ 17{
20 if (ipv4_config.no_pmtu_disc) 18 if (ipv4_config.no_pmtu_disc)
@@ -33,6 +31,7 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
33 x->sel.dport_mask = htons(0xffff); 31 x->sel.dport_mask = htons(0xffff);
34 x->sel.sport = xfrm_flowi_sport(fl); 32 x->sel.sport = xfrm_flowi_sport(fl);
35 x->sel.sport_mask = htons(0xffff); 33 x->sel.sport_mask = htons(0xffff);
34 x->sel.family = AF_INET;
36 x->sel.prefixlen_d = 32; 35 x->sel.prefixlen_d = 32;
37 x->sel.prefixlen_s = 32; 36 x->sel.prefixlen_s = 32;
38 x->sel.proto = fl->proto; 37 x->sel.proto = fl->proto;