aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig5
-rw-r--r--net/ipv4/datagram.c5
-rw-r--r--net/ipv4/fib_frontend.c15
-rw-r--r--net/ipv4/fib_trie.c8
-rw-r--r--net/ipv4/igmp.c12
-rw-r--r--net/ipv4/ip_gre.c8
-rw-r--r--net/ipv4/ip_output.c19
-rw-r--r--net/ipv4/ip_sockglue.c3
-rw-r--r--net/ipv4/netfilter/arp_tables.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c5
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c6
-rw-r--r--net/ipv4/route.c9
-rw-r--r--net/ipv4/tcp.c41
-rw-r--r--net/ipv4/tcp_cong.c5
-rw-r--r--net/ipv4/tcp_input.c5
-rw-r--r--net/ipv4/tcp_timer.c32
-rw-r--r--net/ipv4/udp.c44
-rw-r--r--net/ipv4/xfrm4_policy.c2
-rw-r--r--net/ipv4/xfrm4_state.c33
21 files changed, 188 insertions, 79 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d191249..7cd7760144f7 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
46 rp_filter on use: 46 rp_filter on use:
47 47
48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter 48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
49 and 49 or
50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter 50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
51 51
52 Note that some distributions enable it in startup scripts. 52 Note that some distributions enable it in startup scripts.
@@ -217,6 +217,7 @@ config NET_IPIP
217 217
218config NET_IPGRE 218config NET_IPGRE
219 tristate "IP: GRE tunnels over IP" 219 tristate "IP: GRE tunnels over IP"
220 depends on IPV6 || IPV6=n
220 help 221 help
221 Tunneling means encapsulating data of one protocol type within 222 Tunneling means encapsulating data of one protocol type within
222 another protocol and sending it over a channel that understands the 223 another protocol and sending it over a channel that understands the
@@ -412,7 +413,7 @@ config INET_XFRM_MODE_BEET
412 If unsure, say Y. 413 If unsure, say Y.
413 414
414config INET_LRO 415config INET_LRO
415 bool "Large Receive Offload (ipv4/tcp)" 416 tristate "Large Receive Offload (ipv4/tcp)"
416 default y 417 default y
417 ---help--- 418 ---help---
418 Support for Large Receive Offload (ipv4/tcp). 419 Support for Large Receive Offload (ipv4/tcp).
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f0550941df7b..721a8a37b45c 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,8 +62,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
62 } 62 }
63 if (!inet->inet_saddr) 63 if (!inet->inet_saddr)
64 inet->inet_saddr = rt->rt_src; /* Update source address */ 64 inet->inet_saddr = rt->rt_src; /* Update source address */
65 if (!inet->inet_rcv_saddr) 65 if (!inet->inet_rcv_saddr) {
66 inet->inet_rcv_saddr = rt->rt_src; 66 inet->inet_rcv_saddr = rt->rt_src;
67 if (sk->sk_prot->rehash)
68 sk->sk_prot->rehash(sk);
69 }
67 inet->inet_daddr = rt->rt_dst; 70 inet->inet_daddr = rt->rt_dst;
68 inet->inet_dport = usin->sin_port; 71 inet->inet_dport = usin->sin_port;
69 sk->sk_state = TCP_ESTABLISHED; 72 sk->sk_state = TCP_ESTABLISHED;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a43968918350..7d02a9f999fa 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -246,6 +246,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
246 246
247 struct fib_result res; 247 struct fib_result res;
248 int no_addr, rpf, accept_local; 248 int no_addr, rpf, accept_local;
249 bool dev_match;
249 int ret; 250 int ret;
250 struct net *net; 251 struct net *net;
251 252
@@ -273,12 +274,22 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
273 } 274 }
274 *spec_dst = FIB_RES_PREFSRC(res); 275 *spec_dst = FIB_RES_PREFSRC(res);
275 fib_combine_itag(itag, &res); 276 fib_combine_itag(itag, &res);
277 dev_match = false;
278
276#ifdef CONFIG_IP_ROUTE_MULTIPATH 279#ifdef CONFIG_IP_ROUTE_MULTIPATH
277 if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) 280 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
281 struct fib_nh *nh = &res.fi->fib_nh[ret];
282
283 if (nh->nh_dev == dev) {
284 dev_match = true;
285 break;
286 }
287 }
278#else 288#else
279 if (FIB_RES_DEV(res) == dev) 289 if (FIB_RES_DEV(res) == dev)
290 dev_match = true;
280#endif 291#endif
281 { 292 if (dev_match) {
282 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 293 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
283 fib_res_put(&res); 294 fib_res_put(&res);
284 return ret; 295 return ret;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 79d057a939ba..4a8e370862bc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,7 +186,9 @@ static inline struct tnode *node_parent_rcu(struct node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
189 return rcu_dereference(ret); 189 return rcu_dereference_check(ret,
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
190} 192}
191 193
192/* Same as rcu_assign_pointer 194/* Same as rcu_assign_pointer
@@ -1753,7 +1755,9 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1753 1755
1754static struct leaf *trie_firstleaf(struct trie *t) 1756static struct leaf *trie_firstleaf(struct trie *t)
1755{ 1757{
1756 struct tnode *n = (struct tnode *) rcu_dereference(t->trie); 1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
1757 1761
1758 if (!n) 1762 if (!n)
1759 return NULL; 1763 return NULL;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a1ad0e7180d2..2a4bb76f2132 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
856 igmpv3_clear_delrec(in_dev); 856 igmpv3_clear_delrec(in_dev);
857 } else if (len < 12) { 857 } else if (len < 12) {
858 return; /* ignore bogus packet; freed by caller */ 858 return; /* ignore bogus packet; freed by caller */
859 } else if (IGMP_V1_SEEN(in_dev)) {
860 /* This is a v3 query with v1 queriers present */
861 max_delay = IGMP_Query_Response_Interval;
862 group = 0;
863 } else if (IGMP_V2_SEEN(in_dev)) {
864 /* this is a v3 query with v2 queriers present;
865 * Interpretation of the max_delay code is problematic here.
866 * A real v2 host would use ih_code directly, while v3 has a
867 * different encoding. We use the v3 encoding as more likely
868 * to be intended in a v3 query.
869 */
870 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
859 } else { /* v3 */ 871 } else { /* v3 */
860 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 872 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
861 return; 873 return;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 945b20a5ad50..35c93e8b6a46 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -45,7 +45,7 @@
45#include <net/netns/generic.h> 45#include <net/netns/generic.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47 47
48#ifdef CONFIG_IPV6 48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49#include <net/ipv6.h> 49#include <net/ipv6.h>
50#include <net/ip6_fib.h> 50#include <net/ip6_fib.h>
51#include <net/ip6_route.h> 51#include <net/ip6_route.h>
@@ -699,7 +699,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
699 if ((dst = rt->rt_gateway) == 0) 699 if ((dst = rt->rt_gateway) == 0)
700 goto tx_error_icmp; 700 goto tx_error_icmp;
701 } 701 }
702#ifdef CONFIG_IPV6 702#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
703 else if (skb->protocol == htons(ETH_P_IPV6)) { 703 else if (skb->protocol == htons(ETH_P_IPV6)) {
704 struct in6_addr *addr6; 704 struct in6_addr *addr6;
705 int addr_type; 705 int addr_type;
@@ -774,7 +774,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
774 goto tx_error; 774 goto tx_error;
775 } 775 }
776 } 776 }
777#ifdef CONFIG_IPV6 777#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
778 else if (skb->protocol == htons(ETH_P_IPV6)) { 778 else if (skb->protocol == htons(ETH_P_IPV6)) {
779 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 779 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
780 780
@@ -850,7 +850,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
850 if ((iph->ttl = tiph->ttl) == 0) { 850 if ((iph->ttl = tiph->ttl) == 0) {
851 if (skb->protocol == htons(ETH_P_IP)) 851 if (skb->protocol == htons(ETH_P_IP))
852 iph->ttl = old_iph->ttl; 852 iph->ttl = old_iph->ttl;
853#ifdef CONFIG_IPV6 853#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
854 else if (skb->protocol == htons(ETH_P_IPV6)) 854 else if (skb->protocol == htons(ETH_P_IPV6))
855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
856#endif 856#endif
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04b69896df5f..7649d7750075 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -488,9 +488,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
488 * we can switch to copy when see the first bad fragment. 488 * we can switch to copy when see the first bad fragment.
489 */ 489 */
490 if (skb_has_frags(skb)) { 490 if (skb_has_frags(skb)) {
491 struct sk_buff *frag; 491 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 492 int first_len = skb_pagelen(skb);
493 int truesizes = 0;
494 493
495 if (first_len - hlen > mtu || 494 if (first_len - hlen > mtu ||
496 ((first_len - hlen) & 7) || 495 ((first_len - hlen) & 7) ||
@@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
503 if (frag->len > mtu || 502 if (frag->len > mtu ||
504 ((frag->len & 7) && frag->next) || 503 ((frag->len & 7) && frag->next) ||
505 skb_headroom(frag) < hlen) 504 skb_headroom(frag) < hlen)
506 goto slow_path; 505 goto slow_path_clean;
507 506
508 /* Partially cloned skb? */ 507 /* Partially cloned skb? */
509 if (skb_shared(frag)) 508 if (skb_shared(frag))
510 goto slow_path; 509 goto slow_path_clean;
511 510
512 BUG_ON(frag->sk); 511 BUG_ON(frag->sk);
513 if (skb->sk) { 512 if (skb->sk) {
514 frag->sk = skb->sk; 513 frag->sk = skb->sk;
515 frag->destructor = sock_wfree; 514 frag->destructor = sock_wfree;
516 } 515 }
517 truesizes += frag->truesize; 516 skb->truesize -= frag->truesize;
518 } 517 }
519 518
520 /* Everything is OK. Generate! */ 519 /* Everything is OK. Generate! */
@@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
524 frag = skb_shinfo(skb)->frag_list; 523 frag = skb_shinfo(skb)->frag_list;
525 skb_frag_list_init(skb); 524 skb_frag_list_init(skb);
526 skb->data_len = first_len - skb_headlen(skb); 525 skb->data_len = first_len - skb_headlen(skb);
527 skb->truesize -= truesizes;
528 skb->len = first_len; 526 skb->len = first_len;
529 iph->tot_len = htons(first_len); 527 iph->tot_len = htons(first_len);
530 iph->frag_off = htons(IP_MF); 528 iph->frag_off = htons(IP_MF);
@@ -576,6 +574,15 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
576 } 574 }
577 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 575 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
578 return err; 576 return err;
577
578slow_path_clean:
579 skb_walk_frags(skb, frag2) {
580 if (frag2 == frag)
581 break;
582 frag2->sk = NULL;
583 frag2->destructor = NULL;
584 skb->truesize += frag2->truesize;
585 }
579 } 586 }
580 587
581slow_path: 588slow_path:
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6c40a8c46e79..64b70ad162e3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1129,6 +1129,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1129 case IP_HDRINCL: 1129 case IP_HDRINCL:
1130 val = inet->hdrincl; 1130 val = inet->hdrincl;
1131 break; 1131 break;
1132 case IP_NODEFRAG:
1133 val = inet->nodefrag;
1134 break;
1132 case IP_MTU_DISCOVER: 1135 case IP_MTU_DISCOVER:
1133 val = inet->pmtudisc; 1136 val = inet->pmtudisc;
1134 break; 1137 break;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 6bccba31d132..e8f4f9a57f12 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -735,6 +735,7 @@ static void get_counters(const struct xt_table_info *t,
735 if (cpu == curcpu) 735 if (cpu == curcpu)
736 continue; 736 continue;
737 i = 0; 737 i = 0;
738 local_bh_disable();
738 xt_info_wrlock(cpu); 739 xt_info_wrlock(cpu);
739 xt_entry_foreach(iter, t->entries[cpu], t->size) { 740 xt_entry_foreach(iter, t->entries[cpu], t->size) {
740 ADD_COUNTER(counters[i], iter->counters.bcnt, 741 ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -742,6 +743,7 @@ static void get_counters(const struct xt_table_info *t,
742 ++i; 743 ++i;
743 } 744 }
744 xt_info_wrunlock(cpu); 745 xt_info_wrunlock(cpu);
746 local_bh_enable();
745 } 747 }
746 put_cpu(); 748 put_cpu();
747} 749}
@@ -1418,6 +1420,9 @@ static int translate_compat_table(const char *name,
1418 if (ret != 0) 1420 if (ret != 0)
1419 break; 1421 break;
1420 ++i; 1422 ++i;
1423 if (strcmp(arpt_get_target(iter1)->u.user.name,
1424 XT_ERROR_TARGET) == 0)
1425 ++newinfo->stacksize;
1421 } 1426 }
1422 if (ret) { 1427 if (ret) {
1423 /* 1428 /*
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c439721b165a..d163f2e3b2e9 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -909,6 +909,7 @@ get_counters(const struct xt_table_info *t,
909 if (cpu == curcpu) 909 if (cpu == curcpu)
910 continue; 910 continue;
911 i = 0; 911 i = 0;
912 local_bh_disable();
912 xt_info_wrlock(cpu); 913 xt_info_wrlock(cpu);
913 xt_entry_foreach(iter, t->entries[cpu], t->size) { 914 xt_entry_foreach(iter, t->entries[cpu], t->size) {
914 ADD_COUNTER(counters[i], iter->counters.bcnt, 915 ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -916,6 +917,7 @@ get_counters(const struct xt_table_info *t,
916 ++i; /* macro does multi eval of i */ 917 ++i; /* macro does multi eval of i */
917 } 918 }
918 xt_info_wrunlock(cpu); 919 xt_info_wrunlock(cpu);
920 local_bh_enable();
919 } 921 }
920 put_cpu(); 922 put_cpu();
921} 923}
@@ -1749,6 +1751,9 @@ translate_compat_table(struct net *net,
1749 if (ret != 0) 1751 if (ret != 0)
1750 break; 1752 break;
1751 ++i; 1753 ++i;
1754 if (strcmp(ipt_get_target(iter1)->u.user.name,
1755 XT_ERROR_TARGET) == 0)
1756 ++newinfo->stacksize;
1752 } 1757 }
1753 if (ret) { 1758 if (ret) {
1754 /* 1759 /*
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b254dafaf429..43eec80c0e7c 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -112,6 +112,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
112 /* ip_route_me_harder expects skb->dst to be set */ 112 /* ip_route_me_harder expects skb->dst to be set */
113 skb_dst_set_noref(nskb, skb_dst(oldskb)); 113 skb_dst_set_noref(nskb, skb_dst(oldskb));
114 114
115 nskb->protocol = htons(ETH_P_IP);
115 if (ip_route_me_harder(nskb, addr_type)) 116 if (ip_route_me_harder(nskb, addr_type))
116 goto free_nskb; 117 goto free_nskb;
117 118
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index eab8de32f200..f3a9b42b16c6 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,9 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
66 const struct net_device *out, 66 const struct net_device *out,
67 int (*okfn)(struct sk_buff *)) 67 int (*okfn)(struct sk_buff *))
68{ 68{
69 struct sock *sk = skb->sk;
69 struct inet_sock *inet = inet_sk(skb->sk); 70 struct inet_sock *inet = inet_sk(skb->sk);
70 71
71 if (inet && inet->nodefrag) 72 if (sk && (sk->sk_family == PF_INET) &&
73 inet->nodefrag)
72 return NF_ACCEPT; 74 return NF_ACCEPT;
73 75
74#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 76#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 1679e2c0963d..ee5f419d0a56 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum,
893 unsigned char s[4]; 893 unsigned char s[4];
894 894
895 if (offset & 1) { 895 if (offset & 1) {
896 s[0] = s[2] = 0; 896 s[0] = ~0;
897 s[1] = ~*optr; 897 s[1] = ~*optr;
898 s[2] = 0;
898 s[3] = *nptr; 899 s[3] = *nptr;
899 } else { 900 } else {
900 s[1] = s[3] = 0;
901 s[0] = ~*optr; 901 s[0] = ~*optr;
902 s[1] = ~0;
902 s[2] = *nptr; 903 s[2] = *nptr;
904 s[3] = 0;
903 } 905 }
904 906
905 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); 907 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f56b6e6c6aa..ac6559cb54f9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1231,7 +1231,7 @@ restart:
1231 } 1231 }
1232 1232
1233 if (net_ratelimit()) 1233 if (net_ratelimit())
1234 printk(KERN_WARNING "Neighbour table overflow.\n"); 1234 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1235 rt_drop(rt); 1235 rt_drop(rt);
1236 return -ENOBUFS; 1236 return -ENOBUFS;
1237 } 1237 }
@@ -2738,6 +2738,11 @@ slow_output:
2738} 2738}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2739EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2740
2741static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2742{
2743 return NULL;
2744}
2745
2741static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2746static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2742{ 2747{
2743} 2748}
@@ -2746,7 +2751,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2746 .family = AF_INET, 2751 .family = AF_INET,
2747 .protocol = cpu_to_be16(ETH_P_IP), 2752 .protocol = cpu_to_be16(ETH_P_IP),
2748 .destroy = ipv4_dst_destroy, 2753 .destroy = ipv4_dst_destroy,
2749 .check = ipv4_dst_check, 2754 .check = ipv4_blackhole_dst_check,
2750 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2755 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2751 .entries = ATOMIC_INIT(0), 2756 .entries = ATOMIC_INIT(0),
2752}; 2757};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 176e11aaea77..f115ea68a4ef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -386,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
386 */ 386 */
387 387
388 mask = 0; 388 mask = 0;
389 if (sk->sk_err)
390 mask = POLLERR;
391 389
392 /* 390 /*
393 * POLLHUP is certainly not done right. But poll() doesn't 391 * POLLHUP is certainly not done right. But poll() doesn't
@@ -451,11 +449,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
451 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 449 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
452 mask |= POLLOUT | POLLWRNORM; 450 mask |= POLLOUT | POLLWRNORM;
453 } 451 }
454 } 452 } else
453 mask |= POLLOUT | POLLWRNORM;
455 454
456 if (tp->urg_data & TCP_URG_VALID) 455 if (tp->urg_data & TCP_URG_VALID)
457 mask |= POLLPRI; 456 mask |= POLLPRI;
458 } 457 }
458 /* This barrier is coupled with smp_wmb() in tcp_reset() */
459 smp_rmb();
460 if (sk->sk_err)
461 mask |= POLLERR;
462
459 return mask; 463 return mask;
460} 464}
461EXPORT_SYMBOL(tcp_poll); 465EXPORT_SYMBOL(tcp_poll);
@@ -939,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
939 sg = sk->sk_route_caps & NETIF_F_SG; 943 sg = sk->sk_route_caps & NETIF_F_SG;
940 944
941 while (--iovlen >= 0) { 945 while (--iovlen >= 0) {
942 int seglen = iov->iov_len; 946 size_t seglen = iov->iov_len;
943 unsigned char __user *from = iov->iov_base; 947 unsigned char __user *from = iov->iov_base;
944 948
945 iov++; 949 iov++;
@@ -2011,11 +2015,8 @@ adjudge_to_death:
2011 } 2015 }
2012 } 2016 }
2013 if (sk->sk_state != TCP_CLOSE) { 2017 if (sk->sk_state != TCP_CLOSE) {
2014 int orphan_count = percpu_counter_read_positive(
2015 sk->sk_prot->orphan_count);
2016
2017 sk_mem_reclaim(sk); 2018 sk_mem_reclaim(sk);
2018 if (tcp_too_many_orphans(sk, orphan_count)) { 2019 if (tcp_too_many_orphans(sk, 0)) {
2019 if (net_ratelimit()) 2020 if (net_ratelimit())
2020 printk(KERN_INFO "TCP: too many of orphaned " 2021 printk(KERN_INFO "TCP: too many of orphaned "
2021 "sockets\n"); 2022 "sockets\n");
@@ -3212,7 +3213,7 @@ void __init tcp_init(void)
3212{ 3213{
3213 struct sk_buff *skb = NULL; 3214 struct sk_buff *skb = NULL;
3214 unsigned long nr_pages, limit; 3215 unsigned long nr_pages, limit;
3215 int order, i, max_share; 3216 int i, max_share, cnt;
3216 unsigned long jiffy = jiffies; 3217 unsigned long jiffy = jiffies;
3217 3218
3218 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3219 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3261,22 +3262,12 @@ void __init tcp_init(void)
3261 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3262 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3262 } 3263 }
3263 3264
3264 /* Try to be a bit smarter and adjust defaults depending 3265
3265 * on available memory. 3266 cnt = tcp_hashinfo.ehash_mask + 1;
3266 */ 3267
3267 for (order = 0; ((1 << order) << PAGE_SHIFT) < 3268 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3268 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); 3269 sysctl_tcp_max_orphans = cnt / 2;
3269 order++) 3270 sysctl_max_syn_backlog = max(128, cnt / 256);
3270 ;
3271 if (order >= 4) {
3272 tcp_death_row.sysctl_max_tw_buckets = 180000;
3273 sysctl_tcp_max_orphans = 4096 << (order - 4);
3274 sysctl_max_syn_backlog = 1024;
3275 } else if (order < 3) {
3276 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
3277 sysctl_tcp_max_orphans >>= (3 - order);
3278 sysctl_max_syn_backlog = 128;
3279 }
3280 3271
3281 /* Set the pressure threshold to be a fraction of global memory that 3272 /* Set the pressure threshold to be a fraction of global memory that
3282 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of 3273 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 0ec9bd0ae94f..850c737e08e2 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
196int tcp_set_allowed_congestion_control(char *val) 196int tcp_set_allowed_congestion_control(char *val)
197{ 197{
198 struct tcp_congestion_ops *ca; 198 struct tcp_congestion_ops *ca;
199 char *clone, *name; 199 char *saved_clone, *clone, *name;
200 int ret = 0; 200 int ret = 0;
201 201
202 clone = kstrdup(val, GFP_USER); 202 saved_clone = clone = kstrdup(val, GFP_USER);
203 if (!clone) 203 if (!clone)
204 return -ENOMEM; 204 return -ENOMEM;
205 205
@@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val)
226 } 226 }
227out: 227out:
228 spin_unlock(&tcp_cong_list_lock); 228 spin_unlock(&tcp_cong_list_lock);
229 kfree(saved_clone);
229 230
230 return ret; 231 return ret;
231} 232}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e663b78a2ef6..b55f60f6fcbe 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2545,7 +2545,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2545 cnt += tcp_skb_pcount(skb); 2545 cnt += tcp_skb_pcount(skb);
2546 2546
2547 if (cnt > packets) { 2547 if (cnt > packets) {
2548 if (tcp_is_sack(tp) || (oldcnt >= packets)) 2548 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2549 (oldcnt >= packets))
2549 break; 2550 break;
2550 2551
2551 mss = skb_shinfo(skb)->gso_size; 2552 mss = skb_shinfo(skb)->gso_size;
@@ -4048,6 +4049,8 @@ static void tcp_reset(struct sock *sk)
4048 default: 4049 default:
4049 sk->sk_err = ECONNRESET; 4050 sk->sk_err = ECONNRESET;
4050 } 4051 }
4052 /* This barrier is coupled with smp_rmb() in tcp_poll() */
4053 smp_wmb();
4051 4054
4052 if (!sock_flag(sk, SOCK_DEAD)) 4055 if (!sock_flag(sk, SOCK_DEAD))
4053 sk->sk_error_report(sk); 4056 sk->sk_error_report(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 808bb920c9f5..74c54b30600f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -66,18 +66,18 @@ static void tcp_write_err(struct sock *sk)
66static int tcp_out_of_resources(struct sock *sk, int do_reset) 66static int tcp_out_of_resources(struct sock *sk, int do_reset)
67{ 67{
68 struct tcp_sock *tp = tcp_sk(sk); 68 struct tcp_sock *tp = tcp_sk(sk);
69 int orphans = percpu_counter_read_positive(&tcp_orphan_count); 69 int shift = 0;
70 70
71 /* If peer does not open window for long time, or did not transmit 71 /* If peer does not open window for long time, or did not transmit
72 * anything for long time, penalize it. */ 72 * anything for long time, penalize it. */
73 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) 73 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
74 orphans <<= 1; 74 shift++;
75 75
76 /* If some dubious ICMP arrived, penalize even more. */ 76 /* If some dubious ICMP arrived, penalize even more. */
77 if (sk->sk_err_soft) 77 if (sk->sk_err_soft)
78 orphans <<= 1; 78 shift++;
79 79
80 if (tcp_too_many_orphans(sk, orphans)) { 80 if (tcp_too_many_orphans(sk, shift)) {
81 if (net_ratelimit()) 81 if (net_ratelimit())
82 printk(KERN_INFO "Out of socket memory\n"); 82 printk(KERN_INFO "Out of socket memory\n");
83 83
@@ -135,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
135 135
136/* This function calculates a "timeout" which is equivalent to the timeout of a 136/* This function calculates a "timeout" which is equivalent to the timeout of a
137 * TCP connection after "boundary" unsuccessful, exponentially backed-off 137 * TCP connection after "boundary" unsuccessful, exponentially backed-off
138 * retransmissions with an initial RTO of TCP_RTO_MIN. 138 * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
139 * syn_set flag is set.
139 */ 140 */
140static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
141 unsigned int boundary) 142 unsigned int boundary,
143 bool syn_set)
142{ 144{
143 unsigned int timeout, linear_backoff_thresh; 145 unsigned int timeout, linear_backoff_thresh;
144 unsigned int start_ts; 146 unsigned int start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
145 148
146 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
147 return false; 150 return false;
@@ -151,12 +154,12 @@ static bool retransmits_timed_out(struct sock *sk,
151 else 154 else
152 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
153 156
154 linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); 157 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
155 158
156 if (boundary <= linear_backoff_thresh) 159 if (boundary <= linear_backoff_thresh)
157 timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; 160 timeout = ((2 << boundary) - 1) * rto_base;
158 else 161 else
159 timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + 162 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
160 (boundary - linear_backoff_thresh) * TCP_RTO_MAX; 163 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
161 164
162 return (tcp_time_stamp - start_ts) >= timeout; 165 return (tcp_time_stamp - start_ts) >= timeout;
@@ -167,14 +170,15 @@ static int tcp_write_timeout(struct sock *sk)
167{ 170{
168 struct inet_connection_sock *icsk = inet_csk(sk); 171 struct inet_connection_sock *icsk = inet_csk(sk);
169 int retry_until; 172 int retry_until;
170 bool do_reset; 173 bool do_reset, syn_set = 0;
171 174
172 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 175 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
173 if (icsk->icsk_retransmits) 176 if (icsk->icsk_retransmits)
174 dst_negative_advice(sk); 177 dst_negative_advice(sk);
175 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 178 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
179 syn_set = 1;
176 } else { 180 } else {
177 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 181 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) {
178 /* Black hole detection */ 182 /* Black hole detection */
179 tcp_mtu_probing(icsk, sk); 183 tcp_mtu_probing(icsk, sk);
180 184
@@ -187,14 +191,14 @@ static int tcp_write_timeout(struct sock *sk)
187 191
188 retry_until = tcp_orphan_retries(sk, alive); 192 retry_until = tcp_orphan_retries(sk, alive);
189 do_reset = alive || 193 do_reset = alive ||
190 !retransmits_timed_out(sk, retry_until); 194 !retransmits_timed_out(sk, retry_until, 0);
191 195
192 if (tcp_out_of_resources(sk, do_reset)) 196 if (tcp_out_of_resources(sk, do_reset))
193 return 1; 197 return 1;
194 } 198 }
195 } 199 }
196 200
197 if (retransmits_timed_out(sk, retry_until)) { 201 if (retransmits_timed_out(sk, retry_until, syn_set)) {
198 /* Has it gone just too far? */ 202 /* Has it gone just too far? */
199 tcp_write_err(sk); 203 tcp_write_err(sk);
200 return 1; 204 return 1;
@@ -436,7 +440,7 @@ out_reset_timer:
436 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 440 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
437 } 441 }
438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 442 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
439 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) 443 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0))
440 __sk_dst_reset(sk); 444 __sk_dst_reset(sk);
441 445
442out:; 446out:;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32e0bef60d0a..fb23c2e63b52 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk)
1260} 1260}
1261EXPORT_SYMBOL(udp_lib_unhash); 1261EXPORT_SYMBOL(udp_lib_unhash);
1262 1262
1263/*
1264 * inet_rcv_saddr was changed, we must rehash secondary hash
1265 */
1266void udp_lib_rehash(struct sock *sk, u16 newhash)
1267{
1268 if (sk_hashed(sk)) {
1269 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1270 struct udp_hslot *hslot, *hslot2, *nhslot2;
1271
1272 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1273 nhslot2 = udp_hashslot2(udptable, newhash);
1274 udp_sk(sk)->udp_portaddr_hash = newhash;
1275 if (hslot2 != nhslot2) {
1276 hslot = udp_hashslot(udptable, sock_net(sk),
1277 udp_sk(sk)->udp_port_hash);
1278 /* we must lock primary chain too */
1279 spin_lock_bh(&hslot->lock);
1280
1281 spin_lock(&hslot2->lock);
1282 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1283 hslot2->count--;
1284 spin_unlock(&hslot2->lock);
1285
1286 spin_lock(&nhslot2->lock);
1287 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
1288 &nhslot2->head);
1289 nhslot2->count++;
1290 spin_unlock(&nhslot2->lock);
1291
1292 spin_unlock_bh(&hslot->lock);
1293 }
1294 }
1295}
1296EXPORT_SYMBOL(udp_lib_rehash);
1297
1298static void udp_v4_rehash(struct sock *sk)
1299{
1300 u16 new_hash = udp4_portaddr_hash(sock_net(sk),
1301 inet_sk(sk)->inet_rcv_saddr,
1302 inet_sk(sk)->inet_num);
1303 udp_lib_rehash(sk, new_hash);
1304}
1305
1263static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1306static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1264{ 1307{
1265 int rc; 1308 int rc;
@@ -1843,6 +1886,7 @@ struct proto udp_prot = {
1843 .backlog_rcv = __udp_queue_rcv_skb, 1886 .backlog_rcv = __udp_queue_rcv_skb,
1844 .hash = udp_lib_hash, 1887 .hash = udp_lib_hash,
1845 .unhash = udp_lib_unhash, 1888 .unhash = udp_lib_unhash,
1889 .rehash = udp_v4_rehash,
1846 .get_port = udp_v4_get_port, 1890 .get_port = udp_v4_get_port,
1847 .memory_allocated = &udp_memory_allocated, 1891 .memory_allocated = &udp_memory_allocated,
1848 .sysctl_mem = sysctl_udp_mem, 1892 .sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 869078d4eeb9..a580349f0b8a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net,
61 61
62static int xfrm4_get_tos(struct flowi *fl) 62static int xfrm4_get_tos(struct flowi *fl)
63{ 63{
64 return fl->fl4_tos; 64 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
65} 65}
66 66
67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a03..47947624eccc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
25 struct xfrm_tmpl *tmpl, 25{
26 xfrm_address_t *daddr, xfrm_address_t *saddr) 26 sel->daddr.a4 = fl->fl4_dst;
27 sel->saddr.a4 = fl->fl4_src;
28 sel->dport = xfrm_flowi_dport(fl);
29 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl);
31 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET;
33 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32;
35 sel->proto = fl->proto;
36 sel->ifindex = fl->oif;
37}
38
39static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr)
27{ 42{
28 x->sel.daddr.a4 = fl->fl4_dst;
29 x->sel.saddr.a4 = fl->fl4_src;
30 x->sel.dport = xfrm_flowi_dport(fl);
31 x->sel.dport_mask = htons(0xffff);
32 x->sel.sport = xfrm_flowi_sport(fl);
33 x->sel.sport_mask = htons(0xffff);
34 x->sel.family = AF_INET;
35 x->sel.prefixlen_d = 32;
36 x->sel.prefixlen_s = 32;
37 x->sel.proto = fl->proto;
38 x->sel.ifindex = fl->oif;
39 x->id = tmpl->id; 43 x->id = tmpl->id;
40 if (x->id.daddr.a4 == 0) 44 if (x->id.daddr.a4 == 0)
41 x->id.daddr.a4 = daddr->a4; 45 x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
70 .owner = THIS_MODULE, 74 .owner = THIS_MODULE,
71 .init_flags = xfrm4_init_flags, 75 .init_flags = xfrm4_init_flags,
72 .init_tempsel = __xfrm4_init_tempsel, 76 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop,
73 .output = xfrm4_output, 78 .output = xfrm4_output,
74 .extract_input = xfrm4_extract_input, 79 .extract_input = xfrm4_extract_input,
75 .extract_output = xfrm4_extract_output, 80 .extract_output = xfrm4_extract_output,