aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c116
-rw-r--r--net/ipv4/ah4.c10
-rw-r--r--net/ipv4/arp.c36
-rw-r--r--net/ipv4/cipso_ipv4.c1
-rw-r--r--net/ipv4/devinet.c19
-rw-r--r--net/ipv4/esp4.c8
-rw-r--r--net/ipv4/fib_frontend.c10
-rw-r--r--net/ipv4/fib_hash.c12
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/fib_trie.c6
-rw-r--r--net/ipv4/icmp.c39
-rw-r--r--net/ipv4/igmp.c95
-rw-r--r--net/ipv4/inet_connection_sock.c31
-rw-r--r--net/ipv4/inet_diag.c31
-rw-r--r--net/ipv4/inet_hashtables.c277
-rw-r--r--net/ipv4/inet_lro.c4
-rw-r--r--net/ipv4/inet_timewait_sock.c48
-rw-r--r--net/ipv4/inetpeer.c2
-rw-r--r--net/ipv4/ip_forward.c2
-rw-r--r--net/ipv4/ip_fragment.c21
-rw-r--r--net/ipv4/ip_gre.c58
-rw-r--r--net/ipv4/ip_input.c10
-rw-r--r--net/ipv4/ip_output.c24
-rw-r--r--net/ipv4/ip_sockglue.c72
-rw-r--r--net/ipv4/ipcomp.c10
-rw-r--r--net/ipv4/ipconfig.c40
-rw-r--r--net/ipv4/ipip.c37
-rw-r--r--net/ipv4/ipmr.c280
-rw-r--r--net/ipv4/netfilter.c7
-rw-r--r--net/ipv4/netfilter/arp_tables.c16
-rw-r--r--net/ipv4/netfilter/arptable_filter.c12
-rw-r--r--net/ipv4/netfilter/ip_tables.c12
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c9
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c7
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c16
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c27
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c58
-rw-r--r--net/ipv4/netfilter/nf_nat_irc.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c27
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c18
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c9
-rw-r--r--net/ipv4/proc.c8
-rw-r--r--net/ipv4/raw.c10
-rw-r--r--net/ipv4/route.c227
-rw-r--r--net/ipv4/sysctl_net_ipv4.c188
-rw-r--r--net/ipv4/tcp.c130
-rw-r--r--net/ipv4/tcp_cubic.c120
-rw-r--r--net/ipv4/tcp_diag.c2
-rw-r--r--net/ipv4/tcp_input.c511
-rw-r--r--net/ipv4/tcp_ipv4.c137
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c219
-rw-r--r--net/ipv4/tcp_probe.c7
-rw-r--r--net/ipv4/tcp_timer.c14
-rw-r--r--net/ipv4/tcp_yeah.c4
-rw-r--r--net/ipv4/udp.c271
-rw-r--r--net/ipv4/udp_impl.h4
-rw-r--r--net/ipv4/udplite.c14
-rw-r--r--net/ipv4/xfrm4_input.c4
-rw-r--r--net/ipv4/xfrm4_policy.c15
-rw-r--r--net/ipv4/xfrm4_state.c2
62 files changed, 2136 insertions, 1286 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1aa2dc9e380e..743f5542d65a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -94,6 +94,7 @@
94#include <linux/igmp.h> 94#include <linux/igmp.h>
95#include <linux/inetdevice.h> 95#include <linux/inetdevice.h>
96#include <linux/netdevice.h> 96#include <linux/netdevice.h>
97#include <net/checksum.h>
97#include <net/ip.h> 98#include <net/ip.h>
98#include <net/protocol.h> 99#include <net/protocol.h>
99#include <net/arp.h> 100#include <net/arp.h>
@@ -245,7 +246,7 @@ static inline int inet_netns_ok(struct net *net, int protocol)
245 int hash; 246 int hash;
246 struct net_protocol *ipprot; 247 struct net_protocol *ipprot;
247 248
248 if (net == &init_net) 249 if (net_eq(net, &init_net))
249 return 1; 250 return 1;
250 251
251 hash = protocol & (MAX_INET_PROTOS - 1); 252 hash = protocol & (MAX_INET_PROTOS - 1);
@@ -272,10 +273,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol)
272 int try_loading_module = 0; 273 int try_loading_module = 0;
273 int err; 274 int err;
274 275
275 if (sock->type != SOCK_RAW && 276 if (unlikely(!inet_ehash_secret))
276 sock->type != SOCK_DGRAM && 277 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
277 !inet_ehash_secret) 278 build_ehash_secret();
278 build_ehash_secret();
279 279
280 sock->state = SS_UNCONNECTED; 280 sock->state = SS_UNCONNECTED;
281 281
@@ -1070,11 +1070,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
1070 return 0; 1070 return 0;
1071 1071
1072 if (sysctl_ip_dynaddr > 1) { 1072 if (sysctl_ip_dynaddr > 1) {
1073 printk(KERN_INFO "%s(): shifting inet->" 1073 printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
1074 "saddr from " NIPQUAD_FMT " to " NIPQUAD_FMT "\n", 1074 __func__, &old_saddr, &new_saddr);
1075 __func__,
1076 NIPQUAD(old_saddr),
1077 NIPQUAD(new_saddr));
1078 } 1075 }
1079 1076
1080 inet->saddr = inet->rcv_saddr = new_saddr; 1077 inet->saddr = inet->rcv_saddr = new_saddr;
@@ -1245,6 +1242,100 @@ out:
1245 return segs; 1242 return segs;
1246} 1243}
1247 1244
1245static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1246 struct sk_buff *skb)
1247{
1248 struct net_protocol *ops;
1249 struct sk_buff **pp = NULL;
1250 struct sk_buff *p;
1251 struct iphdr *iph;
1252 int flush = 1;
1253 int proto;
1254 int id;
1255
1256 if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
1257 goto out;
1258
1259 iph = ip_hdr(skb);
1260 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1261
1262 rcu_read_lock();
1263 ops = rcu_dereference(inet_protos[proto]);
1264 if (!ops || !ops->gro_receive)
1265 goto out_unlock;
1266
1267 if (iph->version != 4 || iph->ihl != 5)
1268 goto out_unlock;
1269
1270 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
1271 goto out_unlock;
1272
1273 flush = ntohs(iph->tot_len) != skb->len ||
1274 iph->frag_off != htons(IP_DF);
1275 id = ntohs(iph->id);
1276
1277 for (p = *head; p; p = p->next) {
1278 struct iphdr *iph2;
1279
1280 if (!NAPI_GRO_CB(p)->same_flow)
1281 continue;
1282
1283 iph2 = ip_hdr(p);
1284
1285 if (iph->protocol != iph2->protocol ||
1286 iph->tos != iph2->tos ||
1287 memcmp(&iph->saddr, &iph2->saddr, 8)) {
1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue;
1290 }
1291
1292 /* All fields must match except length and checksum. */
1293 NAPI_GRO_CB(p)->flush |=
1294 memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
1295 (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
1296
1297 NAPI_GRO_CB(p)->flush |= flush;
1298 }
1299
1300 NAPI_GRO_CB(skb)->flush |= flush;
1301 __skb_pull(skb, sizeof(*iph));
1302 skb_reset_transport_header(skb);
1303
1304 pp = ops->gro_receive(head, skb);
1305
1306out_unlock:
1307 rcu_read_unlock();
1308
1309out:
1310 NAPI_GRO_CB(skb)->flush |= flush;
1311
1312 return pp;
1313}
1314
1315static int inet_gro_complete(struct sk_buff *skb)
1316{
1317 struct net_protocol *ops;
1318 struct iphdr *iph = ip_hdr(skb);
1319 int proto = iph->protocol & (MAX_INET_PROTOS - 1);
1320 int err = -ENOSYS;
1321 __be16 newlen = htons(skb->len - skb_network_offset(skb));
1322
1323 csum_replace2(&iph->check, iph->tot_len, newlen);
1324 iph->tot_len = newlen;
1325
1326 rcu_read_lock();
1327 ops = rcu_dereference(inet_protos[proto]);
1328 if (WARN_ON(!ops || !ops->gro_complete))
1329 goto out_unlock;
1330
1331 err = ops->gro_complete(skb);
1332
1333out_unlock:
1334 rcu_read_unlock();
1335
1336 return err;
1337}
1338
1248int inet_ctl_sock_create(struct sock **sk, unsigned short family, 1339int inet_ctl_sock_create(struct sock **sk, unsigned short family,
1249 unsigned short type, unsigned char protocol, 1340 unsigned short type, unsigned char protocol,
1250 struct net *net) 1341 struct net *net)
@@ -1311,6 +1402,7 @@ EXPORT_SYMBOL_GPL(snmp_mib_free);
1311#ifdef CONFIG_IP_MULTICAST 1402#ifdef CONFIG_IP_MULTICAST
1312static struct net_protocol igmp_protocol = { 1403static struct net_protocol igmp_protocol = {
1313 .handler = igmp_rcv, 1404 .handler = igmp_rcv,
1405 .netns_ok = 1,
1314}; 1406};
1315#endif 1407#endif
1316 1408
@@ -1319,6 +1411,8 @@ static struct net_protocol tcp_protocol = {
1319 .err_handler = tcp_v4_err, 1411 .err_handler = tcp_v4_err,
1320 .gso_send_check = tcp_v4_gso_send_check, 1412 .gso_send_check = tcp_v4_gso_send_check,
1321 .gso_segment = tcp_tso_segment, 1413 .gso_segment = tcp_tso_segment,
1414 .gro_receive = tcp4_gro_receive,
1415 .gro_complete = tcp4_gro_complete,
1322 .no_policy = 1, 1416 .no_policy = 1,
1323 .netns_ok = 1, 1417 .netns_ok = 1,
1324}; 1418};
@@ -1411,6 +1505,8 @@ static struct packet_type ip_packet_type = {
1411 .func = ip_rcv, 1505 .func = ip_rcv,
1412 .gso_send_check = inet_gso_send_check, 1506 .gso_send_check = inet_gso_send_check,
1413 .gso_segment = inet_gso_segment, 1507 .gso_segment = inet_gso_segment,
1508 .gro_receive = inet_gro_receive,
1509 .gro_complete = inet_gro_complete,
1414}; 1510};
1415 1511
1416static int __init inet_init(void) 1512static int __init inet_init(void)
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 8219b7e0968d..e878e494296e 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,15 +201,16 @@ out:
201 201
202static void ah4_err(struct sk_buff *skb, u32 info) 202static void ah4_err(struct sk_buff *skb, u32 info)
203{ 203{
204 struct iphdr *iph = (struct iphdr*)skb->data; 204 struct net *net = dev_net(skb->dev);
205 struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2)); 205 struct iphdr *iph = (struct iphdr *)skb->data;
206 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
206 struct xfrm_state *x; 207 struct xfrm_state *x;
207 208
208 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 209 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
209 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 210 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
210 return; 211 return;
211 212
212 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); 213 x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
213 if (!x) 214 if (!x)
214 return; 215 return;
215 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", 216 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -293,9 +294,7 @@ static void ah_destroy(struct xfrm_state *x)
293 return; 294 return;
294 295
295 kfree(ahp->work_icv); 296 kfree(ahp->work_icv);
296 ahp->work_icv = NULL;
297 crypto_free_hash(ahp->tfm); 297 crypto_free_hash(ahp->tfm);
298 ahp->tfm = NULL;
299 kfree(ahp); 298 kfree(ahp);
300} 299}
301 300
@@ -316,6 +315,7 @@ static struct net_protocol ah4_protocol = {
316 .handler = xfrm4_rcv, 315 .handler = xfrm4_rcv,
317 .err_handler = ah4_err, 316 .err_handler = ah4_err,
318 .no_policy = 1, 317 .no_policy = 1,
318 .netns_ok = 1,
319}; 319};
320 320
321static int __init ah4_init(void) 321static int __init ah4_init(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 1a9dd66511fc..29a74c01d8de 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -506,7 +506,7 @@ int arp_bind_neighbour(struct dst_entry *dst)
506 if (dev == NULL) 506 if (dev == NULL)
507 return -EINVAL; 507 return -EINVAL;
508 if (n == NULL) { 508 if (n == NULL) {
509 __be32 nexthop = ((struct rtable*)dst)->rt_gateway; 509 __be32 nexthop = ((struct rtable *)dst)->rt_gateway;
510 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) 510 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
511 nexthop = 0; 511 nexthop = 0;
512 n = __neigh_lookup_errno( 512 n = __neigh_lookup_errno(
@@ -640,14 +640,14 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
640 arp_ptr=(unsigned char *)(arp+1); 640 arp_ptr=(unsigned char *)(arp+1);
641 641
642 memcpy(arp_ptr, src_hw, dev->addr_len); 642 memcpy(arp_ptr, src_hw, dev->addr_len);
643 arp_ptr+=dev->addr_len; 643 arp_ptr += dev->addr_len;
644 memcpy(arp_ptr, &src_ip,4); 644 memcpy(arp_ptr, &src_ip, 4);
645 arp_ptr+=4; 645 arp_ptr += 4;
646 if (target_hw != NULL) 646 if (target_hw != NULL)
647 memcpy(arp_ptr, target_hw, dev->addr_len); 647 memcpy(arp_ptr, target_hw, dev->addr_len);
648 else 648 else
649 memset(arp_ptr, 0, dev->addr_len); 649 memset(arp_ptr, 0, dev->addr_len);
650 arp_ptr+=dev->addr_len; 650 arp_ptr += dev->addr_len;
651 memcpy(arp_ptr, &dest_ip, 4); 651 memcpy(arp_ptr, &dest_ip, 4);
652 652
653 return skb; 653 return skb;
@@ -818,18 +818,18 @@ static int arp_process(struct sk_buff *skb)
818 addr_type = rt->rt_type; 818 addr_type = rt->rt_type;
819 819
820 if (addr_type == RTN_LOCAL) { 820 if (addr_type == RTN_LOCAL) {
821 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 821 int dont_send = 0;
822 if (n) {
823 int dont_send = 0;
824
825 if (!dont_send)
826 dont_send |= arp_ignore(in_dev,sip,tip);
827 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
828 dont_send |= arp_filter(sip,tip,dev);
829 if (!dont_send)
830 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
831 822
832 neigh_release(n); 823 if (!dont_send)
824 dont_send |= arp_ignore(in_dev,sip,tip);
825 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
826 dont_send |= arp_filter(sip,tip,dev);
827 if (!dont_send) {
828 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
829 if (n) {
830 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
831 neigh_release(n);
832 }
833 } 833 }
834 goto out; 834 goto out;
835 } else if (IN_DEV_FORWARD(in_dev)) { 835 } else if (IN_DEV_FORWARD(in_dev)) {
@@ -1308,7 +1308,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1308#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) 1308#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1309 } 1309 }
1310#endif 1310#endif
1311 sprintf(tbuf, NIPQUAD_FMT, NIPQUAD(*(u32*)n->primary_key)); 1311 sprintf(tbuf, "%pI4", n->primary_key);
1312 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", 1312 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
1313 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); 1313 tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
1314 read_unlock(&n->lock); 1314 read_unlock(&n->lock);
@@ -1321,7 +1321,7 @@ static void arp_format_pneigh_entry(struct seq_file *seq,
1321 int hatype = dev ? dev->type : 0; 1321 int hatype = dev ? dev->type : 0;
1322 char tbuf[16]; 1322 char tbuf[16];
1323 1323
1324 sprintf(tbuf, NIPQUAD_FMT, NIPQUAD(*(u32*)n->key)); 1324 sprintf(tbuf, "%pI4", n->key);
1325 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", 1325 seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
1326 tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", 1326 tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
1327 dev ? dev->name : "*"); 1327 dev ? dev->name : "*");
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 2e78f6bd9775..e52799047a5f 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -490,7 +490,6 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def)
490 } 490 }
491 491
492 atomic_set(&doi_def->refcount, 1); 492 atomic_set(&doi_def->refcount, 1);
493 INIT_RCU_HEAD(&doi_def->rcu);
494 493
495 spin_lock(&cipso_v4_doi_list_lock); 494 spin_lock(&cipso_v4_doi_list_lock);
496 if (cipso_v4_doi_search(doi_def->doi) != NULL) 495 if (cipso_v4_doi_search(doi_def->doi) != NULL)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 56fce3ab6c55..309997edc8a5 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -112,13 +112,7 @@ static inline void devinet_sysctl_unregister(struct in_device *idev)
112 112
113static struct in_ifaddr *inet_alloc_ifa(void) 113static struct in_ifaddr *inet_alloc_ifa(void)
114{ 114{
115 struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); 115 return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
116
117 if (ifa) {
118 INIT_RCU_HEAD(&ifa->rcu_head);
119 }
120
121 return ifa;
122} 116}
123 117
124static void inet_rcu_free_ifa(struct rcu_head *head) 118static void inet_rcu_free_ifa(struct rcu_head *head)
@@ -161,7 +155,6 @@ static struct in_device *inetdev_init(struct net_device *dev)
161 in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); 155 in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);
162 if (!in_dev) 156 if (!in_dev)
163 goto out; 157 goto out;
164 INIT_RCU_HEAD(&in_dev->rcu_head);
165 memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, 158 memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
166 sizeof(in_dev->cnf)); 159 sizeof(in_dev->cnf));
167 in_dev->cnf.sysctl = NULL; 160 in_dev->cnf.sysctl = NULL;
@@ -1108,7 +1101,7 @@ out:
1108} 1101}
1109 1102
1110static struct notifier_block ip_netdev_notifier = { 1103static struct notifier_block ip_netdev_notifier = {
1111 .notifier_call =inetdev_event, 1104 .notifier_call = inetdev_event,
1112}; 1105};
1113 1106
1114static inline size_t inet_nlmsg_size(void) 1107static inline size_t inet_nlmsg_size(void)
@@ -1195,7 +1188,7 @@ done:
1195 return skb->len; 1188 return skb->len;
1196} 1189}
1197 1190
1198static void rtmsg_ifa(int event, struct in_ifaddr* ifa, struct nlmsghdr *nlh, 1191static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1199 u32 pid) 1192 u32 pid)
1200{ 1193{
1201 struct sk_buff *skb; 1194 struct sk_buff *skb;
@@ -1262,7 +1255,7 @@ static void inet_forward_change(struct net *net)
1262} 1255}
1263 1256
1264static int devinet_conf_proc(ctl_table *ctl, int write, 1257static int devinet_conf_proc(ctl_table *ctl, int write,
1265 struct file* filp, void __user *buffer, 1258 struct file *filp, void __user *buffer,
1266 size_t *lenp, loff_t *ppos) 1259 size_t *lenp, loff_t *ppos)
1267{ 1260{
1268 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 1261 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
@@ -1334,7 +1327,7 @@ static int devinet_conf_sysctl(ctl_table *table,
1334} 1327}
1335 1328
1336static int devinet_sysctl_forward(ctl_table *ctl, int write, 1329static int devinet_sysctl_forward(ctl_table *ctl, int write,
1337 struct file* filp, void __user *buffer, 1330 struct file *filp, void __user *buffer,
1338 size_t *lenp, loff_t *ppos) 1331 size_t *lenp, loff_t *ppos)
1339{ 1332{
1340 int *valp = ctl->data; 1333 int *valp = ctl->data;
@@ -1363,7 +1356,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
1363} 1356}
1364 1357
1365int ipv4_doint_and_flush(ctl_table *ctl, int write, 1358int ipv4_doint_and_flush(ctl_table *ctl, int write,
1366 struct file* filp, void __user *buffer, 1359 struct file *filp, void __user *buffer,
1367 size_t *lenp, loff_t *ppos) 1360 size_t *lenp, loff_t *ppos)
1368{ 1361{
1369 int *valp = ctl->data; 1362 int *valp = ctl->data;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 21515d4c49eb..18bb383ea393 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -413,15 +413,16 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
413 413
414static void esp4_err(struct sk_buff *skb, u32 info) 414static void esp4_err(struct sk_buff *skb, u32 info)
415{ 415{
416 struct iphdr *iph = (struct iphdr*)skb->data; 416 struct net *net = dev_net(skb->dev);
417 struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2)); 417 struct iphdr *iph = (struct iphdr *)skb->data;
418 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
418 struct xfrm_state *x; 419 struct xfrm_state *x;
419 420
420 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 421 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
421 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 422 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
422 return; 423 return;
423 424
424 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); 425 x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
425 if (!x) 426 if (!x)
426 return; 427 return;
427 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 428 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -618,6 +619,7 @@ static struct net_protocol esp4_protocol = {
618 .handler = xfrm4_rcv, 619 .handler = xfrm4_rcv,
619 .err_handler = esp4_err, 620 .err_handler = esp4_err,
620 .no_policy = 1, 621 .no_policy = 1,
622 .netns_ok = 1,
621}; 623};
622 624
623static int __init esp4_init(void) 625static int __init esp4_init(void)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 65c1503f8cc8..741e4fa3e474 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -578,7 +578,7 @@ errout:
578 return err; 578 return err;
579} 579}
580 580
581static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 581static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
582{ 582{
583 struct net *net = sock_net(skb->sk); 583 struct net *net = sock_net(skb->sk);
584 struct fib_config cfg; 584 struct fib_config cfg;
@@ -600,7 +600,7 @@ errout:
600 return err; 600 return err;
601} 601}
602 602
603static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 603static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
604{ 604{
605 struct net *net = sock_net(skb->sk); 605 struct net *net = sock_net(skb->sk);
606 struct fib_config cfg; 606 struct fib_config cfg;
@@ -903,7 +903,7 @@ static void fib_disable_ip(struct net_device *dev, int force)
903 903
904static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) 904static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
905{ 905{
906 struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; 906 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
907 struct net_device *dev = ifa->ifa_dev->dev; 907 struct net_device *dev = ifa->ifa_dev->dev;
908 908
909 switch (event) { 909 switch (event) {
@@ -964,11 +964,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
964} 964}
965 965
966static struct notifier_block fib_inetaddr_notifier = { 966static struct notifier_block fib_inetaddr_notifier = {
967 .notifier_call =fib_inetaddr_event, 967 .notifier_call = fib_inetaddr_event,
968}; 968};
969 969
970static struct notifier_block fib_netdev_notifier = { 970static struct notifier_block fib_netdev_notifier = {
971 .notifier_call =fib_netdev_event, 971 .notifier_call = fib_netdev_event,
972}; 972};
973 973
974static int __net_init ip_fib_net_init(struct net *net) 974static int __net_init ip_fib_net_init(struct net *net)
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index c8cac6c7f881..ded8c44fb848 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -247,7 +247,7 @@ fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
247{ 247{
248 int err; 248 int err;
249 struct fn_zone *fz; 249 struct fn_zone *fz;
250 struct fn_hash *t = (struct fn_hash*)tb->tb_data; 250 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
251 251
252 read_lock(&fib_hash_lock); 252 read_lock(&fib_hash_lock);
253 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { 253 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
@@ -283,7 +283,7 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
283 struct fib_node *f; 283 struct fib_node *f;
284 struct fib_info *fi = NULL; 284 struct fib_info *fi = NULL;
285 struct fib_info *last_resort; 285 struct fib_info *last_resort;
286 struct fn_hash *t = (struct fn_hash*)tb->tb_data; 286 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
287 struct fn_zone *fz = t->fn_zones[0]; 287 struct fn_zone *fz = t->fn_zones[0];
288 288
289 if (fz == NULL) 289 if (fz == NULL)
@@ -548,7 +548,7 @@ out:
548 548
549static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) 549static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
550{ 550{
551 struct fn_hash *table = (struct fn_hash*)tb->tb_data; 551 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
552 struct fib_node *f; 552 struct fib_node *f;
553 struct fib_alias *fa, *fa_to_delete; 553 struct fib_alias *fa, *fa_to_delete;
554 struct fn_zone *fz; 554 struct fn_zone *fz;
@@ -748,7 +748,7 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
748{ 748{
749 int m, s_m; 749 int m, s_m;
750 struct fn_zone *fz; 750 struct fn_zone *fz;
751 struct fn_hash *table = (struct fn_hash*)tb->tb_data; 751 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
752 752
753 s_m = cb->args[2]; 753 s_m = cb->args[2];
754 read_lock(&fib_hash_lock); 754 read_lock(&fib_hash_lock);
@@ -845,10 +845,10 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
845 struct hlist_node *node; 845 struct hlist_node *node;
846 struct fib_node *fn; 846 struct fib_node *fn;
847 847
848 hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) { 848 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
849 struct fib_alias *fa; 849 struct fib_alias *fa;
850 850
851 list_for_each_entry(fa,&fn->fn_alias,fa_list) { 851 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
852 iter->fn = fn; 852 iter->fn = fn;
853 iter->fa = fa; 853 iter->fa = fa;
854 goto out; 854 goto out;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ded2ae34eab1..4817dea3bc73 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -63,16 +63,16 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
63for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 63for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
64 64
65#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ 65#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
66for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) 66for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
67 67
68#else /* CONFIG_IP_ROUTE_MULTIPATH */ 68#else /* CONFIG_IP_ROUTE_MULTIPATH */
69 69
70/* Hope, that gcc will optimize it to get rid of dummy loop */ 70/* Hope, that gcc will optimize it to get rid of dummy loop */
71 71
72#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \ 72#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
73for (nhsel=0; nhsel < 1; nhsel++) 73for (nhsel=0; nhsel < 1; nhsel++)
74 74
75#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \ 75#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \
76for (nhsel=0; nhsel < 1; nhsel++) 76for (nhsel=0; nhsel < 1; nhsel++)
77 77
78#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 78#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -358,7 +358,7 @@ int fib_detect_death(struct fib_info *fi, int order,
358 state = n->nud_state; 358 state = n->nud_state;
359 neigh_release(n); 359 neigh_release(n);
360 } 360 }
361 if (state==NUD_REACHABLE) 361 if (state == NUD_REACHABLE)
362 return 0; 362 return 0;
363 if ((state&NUD_VALID) && order != dflt) 363 if ((state&NUD_VALID) && order != dflt)
364 return 0; 364 return 0;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5cb72786a8af..ec0ae490f0b6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2399,8 +2399,8 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2399 __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); 2399 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2400 2400
2401 seq_indent(seq, iter->depth-1); 2401 seq_indent(seq, iter->depth-1);
2402 seq_printf(seq, " +-- " NIPQUAD_FMT "/%d %d %d %d\n", 2402 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2403 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children, 2403 &prf, tn->pos, tn->bits, tn->full_children,
2404 tn->empty_children); 2404 tn->empty_children);
2405 2405
2406 } else { 2406 } else {
@@ -2410,7 +2410,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2410 __be32 val = htonl(l->key); 2410 __be32 val = htonl(l->key);
2411 2411
2412 seq_indent(seq, iter->depth); 2412 seq_indent(seq, iter->depth);
2413 seq_printf(seq, " |-- " NIPQUAD_FMT "\n", NIPQUAD(val)); 2413 seq_printf(seq, " |-- %pI4\n", &val);
2414 2414
2415 hlist_for_each_entry_rcu(li, node, &l->list, hlist) { 2415 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2416 struct fib_alias *fa; 2416 struct fib_alias *fa;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 72b2de76f1cd..705b33b184a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -321,12 +321,12 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
321} 321}
322 322
323static void icmp_push_reply(struct icmp_bxm *icmp_param, 323static void icmp_push_reply(struct icmp_bxm *icmp_param,
324 struct ipcm_cookie *ipc, struct rtable *rt) 324 struct ipcm_cookie *ipc, struct rtable **rt)
325{ 325{
326 struct sock *sk; 326 struct sock *sk;
327 struct sk_buff *skb; 327 struct sk_buff *skb;
328 328
329 sk = icmp_sk(dev_net(rt->u.dst.dev)); 329 sk = icmp_sk(dev_net((*rt)->u.dst.dev));
330 if (ip_append_data(sk, icmp_glue_bits, icmp_param, 330 if (ip_append_data(sk, icmp_glue_bits, icmp_param,
331 icmp_param->data_len+icmp_param->head_len, 331 icmp_param->data_len+icmp_param->head_len,
332 icmp_param->head_len, 332 icmp_param->head_len,
@@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
392 } 392 }
393 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, 393 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
394 icmp_param->data.icmph.code)) 394 icmp_param->data.icmph.code))
395 icmp_push_reply(icmp_param, &ipc, rt); 395 icmp_push_reply(icmp_param, &ipc, &rt);
396 ip_rt_put(rt); 396 ip_rt_put(rt);
397out_unlock: 397out_unlock:
398 icmp_xmit_unlock(sk); 398 icmp_xmit_unlock(sk);
@@ -562,7 +562,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
562 /* No need to clone since we're just using its address. */ 562 /* No need to clone since we're just using its address. */
563 rt2 = rt; 563 rt2 = rt;
564 564
565 err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); 565 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
566 switch (err) { 566 switch (err) {
567 case 0: 567 case 0:
568 if (rt != rt2) 568 if (rt != rt2)
@@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
601 if (err) 601 if (err)
602 goto relookup_failed; 602 goto relookup_failed;
603 603
604 err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL, 604 err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
605 XFRM_LOOKUP_ICMP); 605 XFRM_LOOKUP_ICMP);
606 switch (err) { 606 switch (err) {
607 case 0: 607 case 0:
@@ -635,7 +635,7 @@ route_done:
635 icmp_param.data_len = room; 635 icmp_param.data_len = room;
636 icmp_param.head_len = sizeof(struct icmphdr); 636 icmp_param.head_len = sizeof(struct icmphdr);
637 637
638 icmp_push_reply(&icmp_param, &ipc, rt); 638 icmp_push_reply(&icmp_param, &ipc, &rt);
639ende: 639ende:
640 ip_rt_put(rt); 640 ip_rt_put(rt);
641out_unlock: 641out_unlock:
@@ -683,10 +683,8 @@ static void icmp_unreach(struct sk_buff *skb)
683 break; 683 break;
684 case ICMP_FRAG_NEEDED: 684 case ICMP_FRAG_NEEDED:
685 if (ipv4_config.no_pmtu_disc) { 685 if (ipv4_config.no_pmtu_disc) {
686 LIMIT_NETDEBUG(KERN_INFO "ICMP: " NIPQUAD_FMT ": " 686 LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n",
687 "fragmentation needed " 687 &iph->daddr);
688 "and DF set.\n",
689 NIPQUAD(iph->daddr));
690 } else { 688 } else {
691 info = ip_rt_frag_needed(net, iph, 689 info = ip_rt_frag_needed(net, iph,
692 ntohs(icmph->un.frag.mtu), 690 ntohs(icmph->un.frag.mtu),
@@ -696,9 +694,8 @@ static void icmp_unreach(struct sk_buff *skb)
696 } 694 }
697 break; 695 break;
698 case ICMP_SR_FAILED: 696 case ICMP_SR_FAILED:
699 LIMIT_NETDEBUG(KERN_INFO "ICMP: " NIPQUAD_FMT ": Source " 697 LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n",
700 "Route Failed.\n", 698 &iph->daddr);
701 NIPQUAD(iph->daddr));
702 break; 699 break;
703 default: 700 default:
704 break; 701 break;
@@ -729,12 +726,12 @@ static void icmp_unreach(struct sk_buff *skb)
729 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && 726 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
730 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { 727 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
731 if (net_ratelimit()) 728 if (net_ratelimit())
732 printk(KERN_WARNING NIPQUAD_FMT " sent an invalid ICMP " 729 printk(KERN_WARNING "%pI4 sent an invalid ICMP "
733 "type %u, code %u " 730 "type %u, code %u "
734 "error to a broadcast: " NIPQUAD_FMT " on %s\n", 731 "error to a broadcast: %pI4 on %s\n",
735 NIPQUAD(ip_hdr(skb)->saddr), 732 &ip_hdr(skb)->saddr,
736 icmph->type, icmph->code, 733 icmph->type, icmph->code,
737 NIPQUAD(iph->daddr), 734 &iph->daddr,
738 skb->dev->name); 735 skb->dev->name);
739 goto out; 736 goto out;
740 } 737 }
@@ -952,9 +949,8 @@ static void icmp_address_reply(struct sk_buff *skb)
952 break; 949 break;
953 } 950 }
954 if (!ifa && net_ratelimit()) { 951 if (!ifa && net_ratelimit()) {
955 printk(KERN_INFO "Wrong address mask " NIPQUAD_FMT " from " 952 printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
956 "%s/" NIPQUAD_FMT "\n", 953 mp, dev->name, &rt->rt_src);
957 NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
958 } 954 }
959 } 955 }
960 rcu_read_unlock(); 956 rcu_read_unlock();
@@ -976,9 +972,10 @@ int icmp_rcv(struct sk_buff *skb)
976 struct net *net = dev_net(rt->u.dst.dev); 972 struct net *net = dev_net(rt->u.dst.dev);
977 973
978 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 974 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
975 struct sec_path *sp = skb_sec_path(skb);
979 int nh; 976 int nh;
980 977
981 if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags & 978 if (!(sp && sp->xvec[sp->len - 1]->props.flags &
982 XFRM_STATE_ICMP)) 979 XFRM_STATE_ICMP))
983 goto drop; 980 goto drop;
984 981
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a0d86455c53e..9eb6219af615 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -167,7 +167,7 @@ static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
167 spin_lock_bh(&im->lock); 167 spin_lock_bh(&im->lock);
168 if (del_timer(&im->timer)) 168 if (del_timer(&im->timer))
169 atomic_dec(&im->refcnt); 169 atomic_dec(&im->refcnt);
170 im->tm_running=0; 170 im->tm_running = 0;
171 im->reporter = 0; 171 im->reporter = 0;
172 im->unsolicit_count = 0; 172 im->unsolicit_count = 0;
173 spin_unlock_bh(&im->lock); 173 spin_unlock_bh(&im->lock);
@@ -176,9 +176,9 @@ static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
176/* It must be called with locked im->lock */ 176/* It must be called with locked im->lock */
177static void igmp_start_timer(struct ip_mc_list *im, int max_delay) 177static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
178{ 178{
179 int tv=net_random() % max_delay; 179 int tv = net_random() % max_delay;
180 180
181 im->tm_running=1; 181 im->tm_running = 1;
182 if (!mod_timer(&im->timer, jiffies+tv+2)) 182 if (!mod_timer(&im->timer, jiffies+tv+2))
183 atomic_inc(&im->refcnt); 183 atomic_inc(&im->refcnt);
184} 184}
@@ -207,7 +207,7 @@ static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
207 if (del_timer(&im->timer)) { 207 if (del_timer(&im->timer)) {
208 if ((long)(im->timer.expires-jiffies) < max_delay) { 208 if ((long)(im->timer.expires-jiffies) < max_delay) {
209 add_timer(&im->timer); 209 add_timer(&im->timer);
210 im->tm_running=1; 210 im->tm_running = 1;
211 spin_unlock_bh(&im->lock); 211 spin_unlock_bh(&im->lock);
212 return; 212 return;
213 } 213 }
@@ -358,7 +358,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
358 358
359static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) 359static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
360{ 360{
361 return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel); 361 return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel);
362} 362}
363 363
364static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, 364static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -653,7 +653,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
653 return -1; 653 return -1;
654 } 654 }
655 655
656 skb=alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); 656 skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
657 if (skb == NULL) { 657 if (skb == NULL) {
658 ip_rt_put(rt); 658 ip_rt_put(rt);
659 return -1; 659 return -1;
@@ -682,11 +682,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
682 ((u8*)&iph[1])[3] = 0; 682 ((u8*)&iph[1])[3] = 0;
683 683
684 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 684 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
685 ih->type=type; 685 ih->type = type;
686 ih->code=0; 686 ih->code = 0;
687 ih->csum=0; 687 ih->csum = 0;
688 ih->group=group; 688 ih->group = group;
689 ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); 689 ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
690 690
691 return ip_local_out(skb); 691 return ip_local_out(skb);
692} 692}
@@ -728,7 +728,7 @@ static void igmp_timer_expire(unsigned long data)
728 struct in_device *in_dev = im->interface; 728 struct in_device *in_dev = im->interface;
729 729
730 spin_lock(&im->lock); 730 spin_lock(&im->lock);
731 im->tm_running=0; 731 im->tm_running = 0;
732 732
733 if (im->unsolicit_count) { 733 if (im->unsolicit_count) {
734 im->unsolicit_count--; 734 im->unsolicit_count--;
@@ -997,7 +997,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
997 --ANK 997 --ANK
998 */ 998 */
999 if (arp_mc_map(addr, buf, dev, 0) == 0) 999 if (arp_mc_map(addr, buf, dev, 0) == 0)
1000 dev_mc_add(dev,buf,dev->addr_len,0); 1000 dev_mc_add(dev, buf, dev->addr_len, 0);
1001} 1001}
1002 1002
1003/* 1003/*
@@ -1010,7 +1010,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
1010 struct net_device *dev = in_dev->dev; 1010 struct net_device *dev = in_dev->dev;
1011 1011
1012 if (arp_mc_map(addr, buf, dev, 0) == 0) 1012 if (arp_mc_map(addr, buf, dev, 0) == 0)
1013 dev_mc_delete(dev,buf,dev->addr_len,0); 1013 dev_mc_delete(dev, buf, dev->addr_len, 0);
1014} 1014}
1015 1015
1016#ifdef CONFIG_IP_MULTICAST 1016#ifdef CONFIG_IP_MULTICAST
@@ -1210,10 +1210,10 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1210 if (!im) 1210 if (!im)
1211 goto out; 1211 goto out;
1212 1212
1213 im->users=1; 1213 im->users = 1;
1214 im->interface=in_dev; 1214 im->interface = in_dev;
1215 in_dev_hold(in_dev); 1215 in_dev_hold(in_dev);
1216 im->multiaddr=addr; 1216 im->multiaddr = addr;
1217 /* initial mode is (EX, empty) */ 1217 /* initial mode is (EX, empty) */
1218 im->sfmode = MCAST_EXCLUDE; 1218 im->sfmode = MCAST_EXCLUDE;
1219 im->sfcount[MCAST_INCLUDE] = 0; 1219 im->sfcount[MCAST_INCLUDE] = 0;
@@ -1224,7 +1224,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1224 atomic_set(&im->refcnt, 1); 1224 atomic_set(&im->refcnt, 1);
1225 spin_lock_init(&im->lock); 1225 spin_lock_init(&im->lock);
1226#ifdef CONFIG_IP_MULTICAST 1226#ifdef CONFIG_IP_MULTICAST
1227 im->tm_running=0; 1227 im->tm_running = 0;
1228 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); 1228 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
1229 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1229 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1230 im->reporter = 0; 1230 im->reporter = 0;
@@ -1232,8 +1232,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1232#endif 1232#endif
1233 im->loaded = 0; 1233 im->loaded = 0;
1234 write_lock_bh(&in_dev->mc_list_lock); 1234 write_lock_bh(&in_dev->mc_list_lock);
1235 im->next=in_dev->mc_list; 1235 im->next = in_dev->mc_list;
1236 in_dev->mc_list=im; 1236 in_dev->mc_list = im;
1237 in_dev->mc_count++; 1237 in_dev->mc_count++;
1238 write_unlock_bh(&in_dev->mc_list_lock); 1238 write_unlock_bh(&in_dev->mc_list_lock);
1239#ifdef CONFIG_IP_MULTICAST 1239#ifdef CONFIG_IP_MULTICAST
@@ -1279,7 +1279,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
1279 ASSERT_RTNL(); 1279 ASSERT_RTNL();
1280 1280
1281 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { 1281 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
1282 if (i->multiaddr==addr) { 1282 if (i->multiaddr == addr) {
1283 if (--i->users == 0) { 1283 if (--i->users == 0) {
1284 write_lock_bh(&in_dev->mc_list_lock); 1284 write_lock_bh(&in_dev->mc_list_lock);
1285 *ip = i->next; 1285 *ip = i->next;
@@ -1738,7 +1738,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1738{ 1738{
1739 int err; 1739 int err;
1740 __be32 addr = imr->imr_multiaddr.s_addr; 1740 __be32 addr = imr->imr_multiaddr.s_addr;
1741 struct ip_mc_socklist *iml=NULL, *i; 1741 struct ip_mc_socklist *iml = NULL, *i;
1742 struct in_device *in_dev; 1742 struct in_device *in_dev;
1743 struct inet_sock *inet = inet_sk(sk); 1743 struct inet_sock *inet = inet_sk(sk);
1744 struct net *net = sock_net(sk); 1744 struct net *net = sock_net(sk);
@@ -1769,7 +1769,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1769 err = -ENOBUFS; 1769 err = -ENOBUFS;
1770 if (count >= sysctl_igmp_max_memberships) 1770 if (count >= sysctl_igmp_max_memberships)
1771 goto done; 1771 goto done;
1772 iml = sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL); 1772 iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1773 if (iml == NULL) 1773 if (iml == NULL)
1774 goto done; 1774 goto done;
1775 1775
@@ -2275,6 +2275,7 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2275 2275
2276#if defined(CONFIG_PROC_FS) 2276#if defined(CONFIG_PROC_FS)
2277struct igmp_mc_iter_state { 2277struct igmp_mc_iter_state {
2278 struct seq_net_private p;
2278 struct net_device *dev; 2279 struct net_device *dev;
2279 struct in_device *in_dev; 2280 struct in_device *in_dev;
2280}; 2281};
@@ -2283,11 +2284,12 @@ struct igmp_mc_iter_state {
2283 2284
2284static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) 2285static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2285{ 2286{
2287 struct net *net = seq_file_net(seq);
2286 struct ip_mc_list *im = NULL; 2288 struct ip_mc_list *im = NULL;
2287 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2289 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2288 2290
2289 state->in_dev = NULL; 2291 state->in_dev = NULL;
2290 for_each_netdev(&init_net, state->dev) { 2292 for_each_netdev(net, state->dev) {
2291 struct in_device *in_dev; 2293 struct in_device *in_dev;
2292 in_dev = in_dev_get(state->dev); 2294 in_dev = in_dev_get(state->dev);
2293 if (!in_dev) 2295 if (!in_dev)
@@ -2408,7 +2410,7 @@ static const struct seq_operations igmp_mc_seq_ops = {
2408 2410
2409static int igmp_mc_seq_open(struct inode *inode, struct file *file) 2411static int igmp_mc_seq_open(struct inode *inode, struct file *file)
2410{ 2412{
2411 return seq_open_private(file, &igmp_mc_seq_ops, 2413 return seq_open_net(inode, file, &igmp_mc_seq_ops,
2412 sizeof(struct igmp_mc_iter_state)); 2414 sizeof(struct igmp_mc_iter_state));
2413} 2415}
2414 2416
@@ -2417,10 +2419,11 @@ static const struct file_operations igmp_mc_seq_fops = {
2417 .open = igmp_mc_seq_open, 2419 .open = igmp_mc_seq_open,
2418 .read = seq_read, 2420 .read = seq_read,
2419 .llseek = seq_lseek, 2421 .llseek = seq_lseek,
2420 .release = seq_release_private, 2422 .release = seq_release_net,
2421}; 2423};
2422 2424
2423struct igmp_mcf_iter_state { 2425struct igmp_mcf_iter_state {
2426 struct seq_net_private p;
2424 struct net_device *dev; 2427 struct net_device *dev;
2425 struct in_device *idev; 2428 struct in_device *idev;
2426 struct ip_mc_list *im; 2429 struct ip_mc_list *im;
@@ -2430,13 +2433,14 @@ struct igmp_mcf_iter_state {
2430 2433
2431static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) 2434static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2432{ 2435{
2436 struct net *net = seq_file_net(seq);
2433 struct ip_sf_list *psf = NULL; 2437 struct ip_sf_list *psf = NULL;
2434 struct ip_mc_list *im = NULL; 2438 struct ip_mc_list *im = NULL;
2435 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); 2439 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2436 2440
2437 state->idev = NULL; 2441 state->idev = NULL;
2438 state->im = NULL; 2442 state->im = NULL;
2439 for_each_netdev(&init_net, state->dev) { 2443 for_each_netdev(net, state->dev) {
2440 struct in_device *idev; 2444 struct in_device *idev;
2441 idev = in_dev_get(state->dev); 2445 idev = in_dev_get(state->dev);
2442 if (unlikely(idev == NULL)) 2446 if (unlikely(idev == NULL))
@@ -2567,7 +2571,7 @@ static const struct seq_operations igmp_mcf_seq_ops = {
2567 2571
2568static int igmp_mcf_seq_open(struct inode *inode, struct file *file) 2572static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
2569{ 2573{
2570 return seq_open_private(file, &igmp_mcf_seq_ops, 2574 return seq_open_net(inode, file, &igmp_mcf_seq_ops,
2571 sizeof(struct igmp_mcf_iter_state)); 2575 sizeof(struct igmp_mcf_iter_state));
2572} 2576}
2573 2577
@@ -2576,14 +2580,41 @@ static const struct file_operations igmp_mcf_seq_fops = {
2576 .open = igmp_mcf_seq_open, 2580 .open = igmp_mcf_seq_open,
2577 .read = seq_read, 2581 .read = seq_read,
2578 .llseek = seq_lseek, 2582 .llseek = seq_lseek,
2579 .release = seq_release_private, 2583 .release = seq_release_net,
2580}; 2584};
2581 2585
2582int __init igmp_mc_proc_init(void) 2586static int igmp_net_init(struct net *net)
2583{ 2587{
2584 proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops); 2588 struct proc_dir_entry *pde;
2585 proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); 2589
2590 pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops);
2591 if (!pde)
2592 goto out_igmp;
2593 pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
2594 if (!pde)
2595 goto out_mcfilter;
2586 return 0; 2596 return 0;
2597
2598out_mcfilter:
2599 proc_net_remove(net, "igmp");
2600out_igmp:
2601 return -ENOMEM;
2602}
2603
2604static void igmp_net_exit(struct net *net)
2605{
2606 proc_net_remove(net, "mcfilter");
2607 proc_net_remove(net, "igmp");
2608}
2609
2610static struct pernet_operations igmp_net_ops = {
2611 .init = igmp_net_init,
2612 .exit = igmp_net_exit,
2613};
2614
2615int __init igmp_mc_proc_init(void)
2616{
2617 return register_pernet_subsys(&igmp_net_ops);
2587} 2618}
2588#endif 2619#endif
2589 2620
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index bd1278a2d828..c7cda1ca8e65 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -109,7 +109,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
109 hashinfo->bhash_size)]; 109 hashinfo->bhash_size)];
110 spin_lock(&head->lock); 110 spin_lock(&head->lock);
111 inet_bind_bucket_for_each(tb, node, &head->chain) 111 inet_bind_bucket_for_each(tb, node, &head->chain)
112 if (tb->ib_net == net && tb->port == rover) 112 if (ib_net(tb) == net && tb->port == rover)
113 goto next; 113 goto next;
114 break; 114 break;
115 next: 115 next:
@@ -137,7 +137,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
137 hashinfo->bhash_size)]; 137 hashinfo->bhash_size)];
138 spin_lock(&head->lock); 138 spin_lock(&head->lock);
139 inet_bind_bucket_for_each(tb, node, &head->chain) 139 inet_bind_bucket_for_each(tb, node, &head->chain)
140 if (tb->ib_net == net && tb->port == snum) 140 if (ib_net(tb) == net && tb->port == snum)
141 goto tb_found; 141 goto tb_found;
142 } 142 }
143 tb = NULL; 143 tb = NULL;
@@ -323,7 +323,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
323 323
324EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); 324EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
325 325
326struct dst_entry* inet_csk_route_req(struct sock *sk, 326struct dst_entry *inet_csk_route_req(struct sock *sk,
327 const struct request_sock *req) 327 const struct request_sock *req)
328{ 328{
329 struct rtable *rt; 329 struct rtable *rt;
@@ -344,16 +344,17 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
344 struct net *net = sock_net(sk); 344 struct net *net = sock_net(sk);
345 345
346 security_req_classify_flow(req, &fl); 346 security_req_classify_flow(req, &fl);
347 if (ip_route_output_flow(net, &rt, &fl, sk, 0)) { 347 if (ip_route_output_flow(net, &rt, &fl, sk, 0))
348 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 348 goto no_route;
349 return NULL; 349 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
350 } 350 goto route_err;
351 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
352 ip_rt_put(rt);
353 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
354 return NULL;
355 }
356 return &rt->u.dst; 351 return &rt->u.dst;
352
353route_err:
354 ip_rt_put(rt);
355no_route:
356 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
357 return NULL;
357} 358}
358 359
359EXPORT_SYMBOL_GPL(inet_csk_route_req); 360EXPORT_SYMBOL_GPL(inet_csk_route_req);
@@ -561,7 +562,7 @@ void inet_csk_destroy_sock(struct sock *sk)
561 562
562 sk_refcnt_debug_release(sk); 563 sk_refcnt_debug_release(sk);
563 564
564 atomic_dec(sk->sk_prot->orphan_count); 565 percpu_counter_dec(sk->sk_prot->orphan_count);
565 sock_put(sk); 566 sock_put(sk);
566} 567}
567 568
@@ -632,6 +633,8 @@ void inet_csk_listen_stop(struct sock *sk)
632 633
633 acc_req = req->dl_next; 634 acc_req = req->dl_next;
634 635
636 percpu_counter_inc(sk->sk_prot->orphan_count);
637
635 local_bh_disable(); 638 local_bh_disable();
636 bh_lock_sock(child); 639 bh_lock_sock(child);
637 WARN_ON(sock_owned_by_user(child)); 640 WARN_ON(sock_owned_by_user(child));
@@ -641,8 +644,6 @@ void inet_csk_listen_stop(struct sock *sk)
641 644
642 sock_orphan(child); 645 sock_orphan(child);
643 646
644 atomic_inc(sk->sk_prot->orphan_count);
645
646 inet_csk_destroy_sock(child); 647 inet_csk_destroy_sock(child);
647 648
648 bh_unlock_sock(child); 649 bh_unlock_sock(child);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 564230dabcb8..588a7796e3e3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
718 if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) 718 if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
719 goto skip_listen_ht; 719 goto skip_listen_ht;
720 720
721 inet_listen_lock(hashinfo);
722 for (i = s_i; i < INET_LHTABLE_SIZE; i++) { 721 for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
723 struct sock *sk; 722 struct sock *sk;
724 struct hlist_node *node; 723 struct hlist_nulls_node *node;
724 struct inet_listen_hashbucket *ilb;
725 725
726 num = 0; 726 num = 0;
727 sk_for_each(sk, node, &hashinfo->listening_hash[i]) { 727 ilb = &hashinfo->listening_hash[i];
728 spin_lock_bh(&ilb->lock);
729 sk_nulls_for_each(sk, node, &ilb->head) {
728 struct inet_sock *inet = inet_sk(sk); 730 struct inet_sock *inet = inet_sk(sk);
729 731
730 if (num < s_num) { 732 if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
742 goto syn_recv; 744 goto syn_recv;
743 745
744 if (inet_csk_diag_dump(sk, skb, cb) < 0) { 746 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
745 inet_listen_unlock(hashinfo); 747 spin_unlock_bh(&ilb->lock);
746 goto done; 748 goto done;
747 } 749 }
748 750
@@ -751,7 +753,7 @@ syn_recv:
751 goto next_listen; 753 goto next_listen;
752 754
753 if (inet_diag_dump_reqs(skb, sk, cb) < 0) { 755 if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
754 inet_listen_unlock(hashinfo); 756 spin_unlock_bh(&ilb->lock);
755 goto done; 757 goto done;
756 } 758 }
757 759
@@ -760,12 +762,12 @@ next_listen:
760 cb->args[4] = 0; 762 cb->args[4] = 0;
761 ++num; 763 ++num;
762 } 764 }
765 spin_unlock_bh(&ilb->lock);
763 766
764 s_num = 0; 767 s_num = 0;
765 cb->args[3] = 0; 768 cb->args[3] = 0;
766 cb->args[4] = 0; 769 cb->args[4] = 0;
767 } 770 }
768 inet_listen_unlock(hashinfo);
769skip_listen_ht: 771skip_listen_ht:
770 cb->args[0] = 1; 772 cb->args[0] = 1;
771 s_i = num = s_num = 0; 773 s_i = num = s_num = 0;
@@ -776,20 +778,21 @@ skip_listen_ht:
776 778
777 for (i = s_i; i < hashinfo->ehash_size; i++) { 779 for (i = s_i; i < hashinfo->ehash_size; i++) {
778 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 780 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
779 rwlock_t *lock = inet_ehash_lockp(hashinfo, i); 781 spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
780 struct sock *sk; 782 struct sock *sk;
781 struct hlist_node *node; 783 struct hlist_nulls_node *node;
782 784
783 num = 0; 785 num = 0;
784 786
785 if (hlist_empty(&head->chain) && hlist_empty(&head->twchain)) 787 if (hlist_nulls_empty(&head->chain) &&
788 hlist_nulls_empty(&head->twchain))
786 continue; 789 continue;
787 790
788 if (i > s_i) 791 if (i > s_i)
789 s_num = 0; 792 s_num = 0;
790 793
791 read_lock_bh(lock); 794 spin_lock_bh(lock);
792 sk_for_each(sk, node, &head->chain) { 795 sk_nulls_for_each(sk, node, &head->chain) {
793 struct inet_sock *inet = inet_sk(sk); 796 struct inet_sock *inet = inet_sk(sk);
794 797
795 if (num < s_num) 798 if (num < s_num)
@@ -803,7 +806,7 @@ skip_listen_ht:
803 r->id.idiag_dport) 806 r->id.idiag_dport)
804 goto next_normal; 807 goto next_normal;
805 if (inet_csk_diag_dump(sk, skb, cb) < 0) { 808 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
806 read_unlock_bh(lock); 809 spin_unlock_bh(lock);
807 goto done; 810 goto done;
808 } 811 }
809next_normal: 812next_normal:
@@ -825,14 +828,14 @@ next_normal:
825 r->id.idiag_dport) 828 r->id.idiag_dport)
826 goto next_dying; 829 goto next_dying;
827 if (inet_twsk_diag_dump(tw, skb, cb) < 0) { 830 if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
828 read_unlock_bh(lock); 831 spin_unlock_bh(lock);
829 goto done; 832 goto done;
830 } 833 }
831next_dying: 834next_dying:
832 ++num; 835 ++num;
833 } 836 }
834 } 837 }
835 read_unlock_bh(lock); 838 spin_unlock_bh(lock);
836 } 839 }
837 840
838done: 841done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 44981906fb91..6a1045da48d2 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -35,7 +35,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
35 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 35 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
36 36
37 if (tb != NULL) { 37 if (tb != NULL) {
38 tb->ib_net = hold_net(net); 38 write_pnet(&tb->ib_net, hold_net(net));
39 tb->port = snum; 39 tb->port = snum;
40 tb->fastreuse = 0; 40 tb->fastreuse = 0;
41 INIT_HLIST_HEAD(&tb->owners); 41 INIT_HLIST_HEAD(&tb->owners);
@@ -51,7 +51,7 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
51{ 51{
52 if (hlist_empty(&tb->owners)) { 52 if (hlist_empty(&tb->owners)) {
53 __hlist_del(&tb->node); 53 __hlist_del(&tb->node);
54 release_net(tb->ib_net); 54 release_net(ib_net(tb));
55 kmem_cache_free(cachep, tb); 55 kmem_cache_free(cachep, tb);
56 } 56 }
57} 57}
@@ -110,33 +110,29 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
110 110
111EXPORT_SYMBOL_GPL(__inet_inherit_port); 111EXPORT_SYMBOL_GPL(__inet_inherit_port);
112 112
113/* 113static inline int compute_score(struct sock *sk, struct net *net,
114 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. 114 const unsigned short hnum, const __be32 daddr,
115 * Look, when several writers sleep and reader wakes them up, all but one 115 const int dif)
116 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
117 * this, _but_ remember, it adds useless work on UP machines (wake up each
118 * exclusive lock release). It should be ifdefed really.
119 */
120void inet_listen_wlock(struct inet_hashinfo *hashinfo)
121 __acquires(hashinfo->lhash_lock)
122{ 116{
123 write_lock(&hashinfo->lhash_lock); 117 int score = -1;
124 118 struct inet_sock *inet = inet_sk(sk);
125 if (atomic_read(&hashinfo->lhash_users)) {
126 DEFINE_WAIT(wait);
127 119
128 for (;;) { 120 if (net_eq(sock_net(sk), net) && inet->num == hnum &&
129 prepare_to_wait_exclusive(&hashinfo->lhash_wait, 121 !ipv6_only_sock(sk)) {
130 &wait, TASK_UNINTERRUPTIBLE); 122 __be32 rcv_saddr = inet->rcv_saddr;
131 if (!atomic_read(&hashinfo->lhash_users)) 123 score = sk->sk_family == PF_INET ? 1 : 0;
132 break; 124 if (rcv_saddr) {
133 write_unlock_bh(&hashinfo->lhash_lock); 125 if (rcv_saddr != daddr)
134 schedule(); 126 return -1;
135 write_lock_bh(&hashinfo->lhash_lock); 127 score += 2;
128 }
129 if (sk->sk_bound_dev_if) {
130 if (sk->sk_bound_dev_if != dif)
131 return -1;
132 score += 2;
136 } 133 }
137
138 finish_wait(&hashinfo->lhash_wait, &wait);
139 } 134 }
135 return score;
140} 136}
141 137
142/* 138/*
@@ -145,72 +141,48 @@ void inet_listen_wlock(struct inet_hashinfo *hashinfo)
145 * remote address for the connection. So always assume those are both 141 * remote address for the connection. So always assume those are both
146 * wildcarded during the search since they can never be otherwise. 142 * wildcarded during the search since they can never be otherwise.
147 */ 143 */
148static struct sock *inet_lookup_listener_slow(struct net *net,
149 const struct hlist_head *head,
150 const __be32 daddr,
151 const unsigned short hnum,
152 const int dif)
153{
154 struct sock *result = NULL, *sk;
155 const struct hlist_node *node;
156 int hiscore = -1;
157
158 sk_for_each(sk, node, head) {
159 const struct inet_sock *inet = inet_sk(sk);
160
161 if (net_eq(sock_net(sk), net) && inet->num == hnum &&
162 !ipv6_only_sock(sk)) {
163 const __be32 rcv_saddr = inet->rcv_saddr;
164 int score = sk->sk_family == PF_INET ? 1 : 0;
165
166 if (rcv_saddr) {
167 if (rcv_saddr != daddr)
168 continue;
169 score += 2;
170 }
171 if (sk->sk_bound_dev_if) {
172 if (sk->sk_bound_dev_if != dif)
173 continue;
174 score += 2;
175 }
176 if (score == 5)
177 return sk;
178 if (score > hiscore) {
179 hiscore = score;
180 result = sk;
181 }
182 }
183 }
184 return result;
185}
186 144
187/* Optimize the common listener case. */ 145
188struct sock *__inet_lookup_listener(struct net *net, 146struct sock *__inet_lookup_listener(struct net *net,
189 struct inet_hashinfo *hashinfo, 147 struct inet_hashinfo *hashinfo,
190 const __be32 daddr, const unsigned short hnum, 148 const __be32 daddr, const unsigned short hnum,
191 const int dif) 149 const int dif)
192{ 150{
193 struct sock *sk = NULL; 151 struct sock *sk, *result;
194 const struct hlist_head *head; 152 struct hlist_nulls_node *node;
195 153 unsigned int hash = inet_lhashfn(net, hnum);
196 read_lock(&hashinfo->lhash_lock); 154 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
197 head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; 155 int score, hiscore;
198 if (!hlist_empty(head)) { 156
199 const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); 157 rcu_read_lock();
200 158begin:
201 if (inet->num == hnum && !sk->sk_node.next && 159 result = NULL;
202 (!inet->rcv_saddr || inet->rcv_saddr == daddr) && 160 hiscore = -1;
203 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && 161 sk_nulls_for_each_rcu(sk, node, &ilb->head) {
204 !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) 162 score = compute_score(sk, net, hnum, daddr, dif);
205 goto sherry_cache; 163 if (score > hiscore) {
206 sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif); 164 result = sk;
165 hiscore = score;
166 }
207 } 167 }
208 if (sk) { 168 /*
209sherry_cache: 169 * if the nulls value we got at the end of this lookup is
210 sock_hold(sk); 170 * not the expected one, we must restart lookup.
171 * We probably met an item that was moved to another chain.
172 */
173 if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
174 goto begin;
175 if (result) {
176 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
177 result = NULL;
178 else if (unlikely(compute_score(result, net, hnum, daddr,
179 dif) < hiscore)) {
180 sock_put(result);
181 goto begin;
182 }
211 } 183 }
212 read_unlock(&hashinfo->lhash_lock); 184 rcu_read_unlock();
213 return sk; 185 return result;
214} 186}
215EXPORT_SYMBOL_GPL(__inet_lookup_listener); 187EXPORT_SYMBOL_GPL(__inet_lookup_listener);
216 188
@@ -223,35 +195,65 @@ struct sock * __inet_lookup_established(struct net *net,
223 INET_ADDR_COOKIE(acookie, saddr, daddr) 195 INET_ADDR_COOKIE(acookie, saddr, daddr)
224 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 196 const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
225 struct sock *sk; 197 struct sock *sk;
226 const struct hlist_node *node; 198 const struct hlist_nulls_node *node;
227 /* Optimize here for direct hit, only listening connections can 199 /* Optimize here for direct hit, only listening connections can
228 * have wildcards anyways. 200 * have wildcards anyways.
229 */ 201 */
230 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 202 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
231 struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); 203 unsigned int slot = hash & (hashinfo->ehash_size - 1);
232 rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); 204 struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
233 205
234 prefetch(head->chain.first); 206 rcu_read_lock();
235 read_lock(lock); 207begin:
236 sk_for_each(sk, node, &head->chain) { 208 sk_nulls_for_each_rcu(sk, node, &head->chain) {
237 if (INET_MATCH(sk, net, hash, acookie, 209 if (INET_MATCH(sk, net, hash, acookie,
238 saddr, daddr, ports, dif)) 210 saddr, daddr, ports, dif)) {
239 goto hit; /* You sunk my battleship! */ 211 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
212 goto begintw;
213 if (unlikely(!INET_MATCH(sk, net, hash, acookie,
214 saddr, daddr, ports, dif))) {
215 sock_put(sk);
216 goto begin;
217 }
218 goto out;
219 }
240 } 220 }
221 /*
222 * if the nulls value we got at the end of this lookup is
223 * not the expected one, we must restart lookup.
224 * We probably met an item that was moved to another chain.
225 */
226 if (get_nulls_value(node) != slot)
227 goto begin;
241 228
229begintw:
242 /* Must check for a TIME_WAIT'er before going to listener hash. */ 230 /* Must check for a TIME_WAIT'er before going to listener hash. */
243 sk_for_each(sk, node, &head->twchain) { 231 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
244 if (INET_TW_MATCH(sk, net, hash, acookie, 232 if (INET_TW_MATCH(sk, net, hash, acookie,
245 saddr, daddr, ports, dif)) 233 saddr, daddr, ports, dif)) {
246 goto hit; 234 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
235 sk = NULL;
236 goto out;
237 }
238 if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
239 saddr, daddr, ports, dif))) {
240 sock_put(sk);
241 goto begintw;
242 }
243 goto out;
244 }
247 } 245 }
246 /*
247 * if the nulls value we got at the end of this lookup is
248 * not the expected one, we must restart lookup.
249 * We probably met an item that was moved to another chain.
250 */
251 if (get_nulls_value(node) != slot)
252 goto begintw;
248 sk = NULL; 253 sk = NULL;
249out: 254out:
250 read_unlock(lock); 255 rcu_read_unlock();
251 return sk; 256 return sk;
252hit:
253 sock_hold(sk);
254 goto out;
255} 257}
256EXPORT_SYMBOL_GPL(__inet_lookup_established); 258EXPORT_SYMBOL_GPL(__inet_lookup_established);
257 259
@@ -270,16 +272,15 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
270 struct net *net = sock_net(sk); 272 struct net *net = sock_net(sk);
271 unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); 273 unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
272 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 274 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
273 rwlock_t *lock = inet_ehash_lockp(hinfo, hash); 275 spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
274 struct sock *sk2; 276 struct sock *sk2;
275 const struct hlist_node *node; 277 const struct hlist_nulls_node *node;
276 struct inet_timewait_sock *tw; 278 struct inet_timewait_sock *tw;
277 279
278 prefetch(head->chain.first); 280 spin_lock(lock);
279 write_lock(lock);
280 281
281 /* Check TIME-WAIT sockets first. */ 282 /* Check TIME-WAIT sockets first. */
282 sk_for_each(sk2, node, &head->twchain) { 283 sk_nulls_for_each(sk2, node, &head->twchain) {
283 tw = inet_twsk(sk2); 284 tw = inet_twsk(sk2);
284 285
285 if (INET_TW_MATCH(sk2, net, hash, acookie, 286 if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +294,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
293 tw = NULL; 294 tw = NULL;
294 295
295 /* And established part... */ 296 /* And established part... */
296 sk_for_each(sk2, node, &head->chain) { 297 sk_nulls_for_each(sk2, node, &head->chain) {
297 if (INET_MATCH(sk2, net, hash, acookie, 298 if (INET_MATCH(sk2, net, hash, acookie,
298 saddr, daddr, ports, dif)) 299 saddr, daddr, ports, dif))
299 goto not_unique; 300 goto not_unique;
@@ -306,9 +307,9 @@ unique:
306 inet->sport = htons(lport); 307 inet->sport = htons(lport);
307 sk->sk_hash = hash; 308 sk->sk_hash = hash;
308 WARN_ON(!sk_unhashed(sk)); 309 WARN_ON(!sk_unhashed(sk));
309 __sk_add_node(sk, &head->chain); 310 __sk_nulls_add_node_rcu(sk, &head->chain);
311 spin_unlock(lock);
310 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 312 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
311 write_unlock(lock);
312 313
313 if (twp) { 314 if (twp) {
314 *twp = tw; 315 *twp = tw;
@@ -324,7 +325,7 @@ unique:
324 return 0; 325 return 0;
325 326
326not_unique: 327not_unique:
327 write_unlock(lock); 328 spin_unlock(lock);
328 return -EADDRNOTAVAIL; 329 return -EADDRNOTAVAIL;
329} 330}
330 331
@@ -338,8 +339,8 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
338void __inet_hash_nolisten(struct sock *sk) 339void __inet_hash_nolisten(struct sock *sk)
339{ 340{
340 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 341 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
341 struct hlist_head *list; 342 struct hlist_nulls_head *list;
342 rwlock_t *lock; 343 spinlock_t *lock;
343 struct inet_ehash_bucket *head; 344 struct inet_ehash_bucket *head;
344 345
345 WARN_ON(!sk_unhashed(sk)); 346 WARN_ON(!sk_unhashed(sk));
@@ -349,18 +350,17 @@ void __inet_hash_nolisten(struct sock *sk)
349 list = &head->chain; 350 list = &head->chain;
350 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 351 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
351 352
352 write_lock(lock); 353 spin_lock(lock);
353 __sk_add_node(sk, list); 354 __sk_nulls_add_node_rcu(sk, list);
355 spin_unlock(lock);
354 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 356 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
355 write_unlock(lock);
356} 357}
357EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 358EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
358 359
359static void __inet_hash(struct sock *sk) 360static void __inet_hash(struct sock *sk)
360{ 361{
361 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 362 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
362 struct hlist_head *list; 363 struct inet_listen_hashbucket *ilb;
363 rwlock_t *lock;
364 364
365 if (sk->sk_state != TCP_LISTEN) { 365 if (sk->sk_state != TCP_LISTEN) {
366 __inet_hash_nolisten(sk); 366 __inet_hash_nolisten(sk);
@@ -368,14 +368,12 @@ static void __inet_hash(struct sock *sk)
368 } 368 }
369 369
370 WARN_ON(!sk_unhashed(sk)); 370 WARN_ON(!sk_unhashed(sk));
371 list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 371 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
372 lock = &hashinfo->lhash_lock;
373 372
374 inet_listen_wlock(hashinfo); 373 spin_lock(&ilb->lock);
375 __sk_add_node(sk, list); 374 __sk_nulls_add_node_rcu(sk, &ilb->head);
376 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 375 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
377 write_unlock(lock); 376 spin_unlock(&ilb->lock);
378 wake_up(&hashinfo->lhash_wait);
379} 377}
380 378
381void inet_hash(struct sock *sk) 379void inet_hash(struct sock *sk)
@@ -390,27 +388,23 @@ EXPORT_SYMBOL_GPL(inet_hash);
390 388
391void inet_unhash(struct sock *sk) 389void inet_unhash(struct sock *sk)
392{ 390{
393 rwlock_t *lock;
394 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 391 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
392 spinlock_t *lock;
393 int done;
395 394
396 if (sk_unhashed(sk)) 395 if (sk_unhashed(sk))
397 goto out; 396 return;
398 397
399 if (sk->sk_state == TCP_LISTEN) { 398 if (sk->sk_state == TCP_LISTEN)
400 local_bh_disable(); 399 lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
401 inet_listen_wlock(hashinfo); 400 else
402 lock = &hashinfo->lhash_lock;
403 } else {
404 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 401 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
405 write_lock_bh(lock);
406 }
407 402
408 if (__sk_del_node_init(sk)) 403 spin_lock_bh(lock);
404 done =__sk_nulls_del_node_init_rcu(sk);
405 if (done)
409 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 406 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
410 write_unlock_bh(lock); 407 spin_unlock_bh(lock);
411out:
412 if (sk->sk_state == TCP_LISTEN)
413 wake_up(&hashinfo->lhash_wait);
414} 408}
415EXPORT_SYMBOL_GPL(inet_unhash); 409EXPORT_SYMBOL_GPL(inet_unhash);
416 410
@@ -449,7 +443,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
449 * unique enough. 443 * unique enough.
450 */ 444 */
451 inet_bind_bucket_for_each(tb, node, &head->chain) { 445 inet_bind_bucket_for_each(tb, node, &head->chain) {
452 if (tb->ib_net == net && tb->port == port) { 446 if (ib_net(tb) == net && tb->port == port) {
453 WARN_ON(hlist_empty(&tb->owners)); 447 WARN_ON(hlist_empty(&tb->owners));
454 if (tb->fastreuse >= 0) 448 if (tb->fastreuse >= 0)
455 goto next_port; 449 goto next_port;
@@ -524,3 +518,16 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
524} 518}
525 519
526EXPORT_SYMBOL_GPL(inet_hash_connect); 520EXPORT_SYMBOL_GPL(inet_hash_connect);
521
522void inet_hashinfo_init(struct inet_hashinfo *h)
523{
524 int i;
525
526 for (i = 0; i < INET_LHTABLE_SIZE; i++) {
527 spin_lock_init(&h->listening_hash[i].lock);
528 INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
529 i + LISTENING_NULLS_BASE);
530 }
531}
532
533EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cfd034a2b96e..6a667dae315e 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -120,7 +120,7 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
120 iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); 120 iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
121 121
122 tcph->check = 0; 122 tcph->check = 0;
123 tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0); 123 tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
124 lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); 124 lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
125 tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, 125 tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
126 lro_desc->ip_tot_len - 126 lro_desc->ip_tot_len -
@@ -135,7 +135,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
135 __wsum tcp_ps_hdr_csum; 135 __wsum tcp_ps_hdr_csum;
136 136
137 tcp_csum = ~csum_unfold(tcph->check); 137 tcp_csum = ~csum_unfold(tcph->check);
138 tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum); 138 tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
139 139
140 tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, 140 tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
141 len + TCP_HDR_LEN(tcph), 141 len + TCP_HDR_LEN(tcph),
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1c5fd38f8824..8554d0ea1719 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
20 struct inet_bind_hashbucket *bhead; 20 struct inet_bind_hashbucket *bhead;
21 struct inet_bind_bucket *tb; 21 struct inet_bind_bucket *tb;
22 /* Unlink from established hashes. */ 22 /* Unlink from established hashes. */
23 rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); 23 spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
24 24
25 write_lock(lock); 25 spin_lock(lock);
26 if (hlist_unhashed(&tw->tw_node)) { 26 if (hlist_nulls_unhashed(&tw->tw_node)) {
27 write_unlock(lock); 27 spin_unlock(lock);
28 return; 28 return;
29 } 29 }
30 __hlist_del(&tw->tw_node); 30 hlist_nulls_del_rcu(&tw->tw_node);
31 sk_node_init(&tw->tw_node); 31 sk_nulls_node_init(&tw->tw_node);
32 write_unlock(lock); 32 spin_unlock(lock);
33 33
34 /* Disassociate with bind bucket. */ 34 /* Disassociate with bind bucket. */
35 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, 35 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
@@ -76,7 +76,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
76 const struct inet_sock *inet = inet_sk(sk); 76 const struct inet_sock *inet = inet_sk(sk);
77 const struct inet_connection_sock *icsk = inet_csk(sk); 77 const struct inet_connection_sock *icsk = inet_csk(sk);
78 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); 78 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
79 rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 79 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
80 struct inet_bind_hashbucket *bhead; 80 struct inet_bind_hashbucket *bhead;
81 /* Step 1: Put TW into bind hash. Original socket stays there too. 81 /* Step 1: Put TW into bind hash. Original socket stays there too.
82 Note, that any socket with inet->num != 0 MUST be bound in 82 Note, that any socket with inet->num != 0 MUST be bound in
@@ -90,17 +90,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
90 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); 90 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
91 spin_unlock(&bhead->lock); 91 spin_unlock(&bhead->lock);
92 92
93 write_lock(lock); 93 spin_lock(lock);
94 94
95 /* Step 2: Remove SK from established hash. */ 95 /*
96 if (__sk_del_node_init(sk)) 96 * Step 2: Hash TW into TIMEWAIT chain.
97 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 97 * Should be done before removing sk from established chain
98 98 * because readers are lockless and search established first.
99 /* Step 3: Hash TW into TIMEWAIT chain. */ 99 */
100 inet_twsk_add_node(tw, &ehead->twchain);
101 atomic_inc(&tw->tw_refcnt); 100 atomic_inc(&tw->tw_refcnt);
101 inet_twsk_add_node_rcu(tw, &ehead->twchain);
102 102
103 write_unlock(lock); 103 /* Step 3: Remove SK from established hash. */
104 if (__sk_nulls_del_node_init_rcu(sk))
105 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
106
107 spin_unlock(lock);
104} 108}
105 109
106EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 110EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -416,17 +420,17 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
416{ 420{
417 struct inet_timewait_sock *tw; 421 struct inet_timewait_sock *tw;
418 struct sock *sk; 422 struct sock *sk;
419 struct hlist_node *node; 423 struct hlist_nulls_node *node;
420 int h; 424 int h;
421 425
422 local_bh_disable(); 426 local_bh_disable();
423 for (h = 0; h < (hashinfo->ehash_size); h++) { 427 for (h = 0; h < (hashinfo->ehash_size); h++) {
424 struct inet_ehash_bucket *head = 428 struct inet_ehash_bucket *head =
425 inet_ehash_bucket(hashinfo, h); 429 inet_ehash_bucket(hashinfo, h);
426 rwlock_t *lock = inet_ehash_lockp(hashinfo, h); 430 spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
427restart: 431restart:
428 write_lock(lock); 432 spin_lock(lock);
429 sk_for_each(sk, node, &head->twchain) { 433 sk_nulls_for_each(sk, node, &head->twchain) {
430 434
431 tw = inet_twsk(sk); 435 tw = inet_twsk(sk);
432 if (!net_eq(twsk_net(tw), net) || 436 if (!net_eq(twsk_net(tw), net) ||
@@ -434,13 +438,13 @@ restart:
434 continue; 438 continue;
435 439
436 atomic_inc(&tw->tw_refcnt); 440 atomic_inc(&tw->tw_refcnt);
437 write_unlock(lock); 441 spin_unlock(lock);
438 inet_twsk_deschedule(tw, twdr); 442 inet_twsk_deschedule(tw, twdr);
439 inet_twsk_put(tw); 443 inet_twsk_put(tw);
440 444
441 goto restart; 445 goto restart;
442 } 446 }
443 write_unlock(lock); 447 spin_unlock(lock);
444 } 448 }
445 local_bh_enable(); 449 local_bh_enable();
446} 450}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index a456ceeac3f2..b1fbe18feb5a 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -144,7 +144,7 @@ static void unlink_from_unused(struct inet_peer *p)
144 * _stack is known to be NULL or not at compile time, 144 * _stack is known to be NULL or not at compile time,
145 * so compiler will optimize the if (_stack) tests. 145 * so compiler will optimize the if (_stack) tests.
146 */ 146 */
147#define lookup(_daddr,_stack) \ 147#define lookup(_daddr, _stack) \
148({ \ 148({ \
149 struct inet_peer *u, **v; \ 149 struct inet_peer *u, **v; \
150 if (_stack != NULL) { \ 150 if (_stack != NULL) { \
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 450016b89a18..df3fe50bbf0d 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -106,7 +106,7 @@ int ip_forward(struct sk_buff *skb)
106 * We now generate an ICMP HOST REDIRECT giving the route 106 * We now generate an ICMP HOST REDIRECT giving the route
107 * we calculated. 107 * we calculated.
108 */ 108 */
109 if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp) 109 if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
110 ip_rt_send_redirect(skb); 110 ip_rt_send_redirect(skb);
111 111
112 skb->priority = rt_tos2priority(iph->tos); 112 skb->priority = rt_tos2priority(iph->tos);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e4f81f54befe..6659ac000eeb 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -56,7 +56,7 @@ struct ipfrag_skb_cb
56 int offset; 56 int offset;
57}; 57};
58 58
59#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb)) 59#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
60 60
61/* Describe an entry in the "incomplete datagrams" queue. */ 61/* Describe an entry in the "incomplete datagrams" queue. */
62struct ipq { 62struct ipq {
@@ -559,9 +559,8 @@ out_nomem:
559 goto out_fail; 559 goto out_fail;
560out_oversize: 560out_oversize:
561 if (net_ratelimit()) 561 if (net_ratelimit())
562 printk(KERN_INFO 562 printk(KERN_INFO "Oversized IP packet from %pI4.\n",
563 "Oversized IP packet from " NIPQUAD_FMT ".\n", 563 &qp->saddr);
564 NIPQUAD(qp->saddr));
565out_fail: 564out_fail:
566 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); 565 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS);
567 return err; 566 return err;
@@ -608,7 +607,7 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
608 .data = &init_net.ipv4.frags.high_thresh, 607 .data = &init_net.ipv4.frags.high_thresh,
609 .maxlen = sizeof(int), 608 .maxlen = sizeof(int),
610 .mode = 0644, 609 .mode = 0644,
611 .proc_handler = &proc_dointvec 610 .proc_handler = proc_dointvec
612 }, 611 },
613 { 612 {
614 .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, 613 .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
@@ -616,7 +615,7 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
616 .data = &init_net.ipv4.frags.low_thresh, 615 .data = &init_net.ipv4.frags.low_thresh,
617 .maxlen = sizeof(int), 616 .maxlen = sizeof(int),
618 .mode = 0644, 617 .mode = 0644,
619 .proc_handler = &proc_dointvec 618 .proc_handler = proc_dointvec
620 }, 619 },
621 { 620 {
622 .ctl_name = NET_IPV4_IPFRAG_TIME, 621 .ctl_name = NET_IPV4_IPFRAG_TIME,
@@ -624,8 +623,8 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
624 .data = &init_net.ipv4.frags.timeout, 623 .data = &init_net.ipv4.frags.timeout,
625 .maxlen = sizeof(int), 624 .maxlen = sizeof(int),
626 .mode = 0644, 625 .mode = 0644,
627 .proc_handler = &proc_dointvec_jiffies, 626 .proc_handler = proc_dointvec_jiffies,
628 .strategy = &sysctl_jiffies 627 .strategy = sysctl_jiffies
629 }, 628 },
630 { } 629 { }
631}; 630};
@@ -637,15 +636,15 @@ static struct ctl_table ip4_frags_ctl_table[] = {
637 .data = &ip4_frags.secret_interval, 636 .data = &ip4_frags.secret_interval,
638 .maxlen = sizeof(int), 637 .maxlen = sizeof(int),
639 .mode = 0644, 638 .mode = 0644,
640 .proc_handler = &proc_dointvec_jiffies, 639 .proc_handler = proc_dointvec_jiffies,
641 .strategy = &sysctl_jiffies 640 .strategy = sysctl_jiffies
642 }, 641 },
643 { 642 {
644 .procname = "ipfrag_max_dist", 643 .procname = "ipfrag_max_dist",
645 .data = &sysctl_ipfrag_max_dist, 644 .data = &sysctl_ipfrag_max_dist,
646 .maxlen = sizeof(int), 645 .maxlen = sizeof(int),
647 .mode = 0644, 646 .mode = 0644,
648 .proc_handler = &proc_dointvec_minmax, 647 .proc_handler = proc_dointvec_minmax,
649 .extra1 = &zero 648 .extra1 = &zero
650 }, 649 },
651 { } 650 { }
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 85c487b8572b..0101521f366b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -126,8 +126,6 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
126 126
127/* Fallback tunnel: no source, no destination, no key, no options */ 127/* Fallback tunnel: no source, no destination, no key, no options */
128 128
129static int ipgre_fb_tunnel_init(struct net_device *dev);
130
131#define HASH_SIZE 16 129#define HASH_SIZE 16
132 130
133static int ipgre_net_id; 131static int ipgre_net_id;
@@ -371,7 +369,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
371 by themself??? 369 by themself???
372 */ 370 */
373 371
374 struct iphdr *iph = (struct iphdr*)skb->data; 372 struct iphdr *iph = (struct iphdr *)skb->data;
375 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); 373 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
376 int grehlen = (iph->ihl<<2) + 4; 374 int grehlen = (iph->ihl<<2) + 4;
377 const int type = icmp_hdr(skb)->type; 375 const int type = icmp_hdr(skb)->type;
@@ -632,7 +630,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
632 630
633 if (dev->header_ops && dev->type == ARPHRD_IPGRE) { 631 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
634 gre_hlen = 0; 632 gre_hlen = 0;
635 tiph = (struct iphdr*)skb->data; 633 tiph = (struct iphdr *)skb->data;
636 } else { 634 } else {
637 gre_hlen = tunnel->hlen; 635 gre_hlen = tunnel->hlen;
638 tiph = &tunnel->parms.iph; 636 tiph = &tunnel->parms.iph;
@@ -660,7 +658,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
660 if (neigh == NULL) 658 if (neigh == NULL)
661 goto tx_error; 659 goto tx_error;
662 660
663 addr6 = (struct in6_addr*)&neigh->primary_key; 661 addr6 = (struct in6_addr *)&neigh->primary_key;
664 addr_type = ipv6_addr_type(addr6); 662 addr_type = ipv6_addr_type(addr6);
665 663
666 if (addr_type == IPV6_ADDR_ANY) { 664 if (addr_type == IPV6_ADDR_ANY) {
@@ -726,7 +724,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
726 } 724 }
727#ifdef CONFIG_IPV6 725#ifdef CONFIG_IPV6
728 else if (skb->protocol == htons(ETH_P_IPV6)) { 726 else if (skb->protocol == htons(ETH_P_IPV6)) {
729 struct rt6_info *rt6 = (struct rt6_info*)skb->dst; 727 struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
730 728
731 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { 729 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
732 if ((tunnel->parms.iph.daddr && 730 if ((tunnel->parms.iph.daddr &&
@@ -800,7 +798,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
800 iph->ttl = old_iph->ttl; 798 iph->ttl = old_iph->ttl;
801#ifdef CONFIG_IPV6 799#ifdef CONFIG_IPV6
802 else if (skb->protocol == htons(ETH_P_IPV6)) 800 else if (skb->protocol == htons(ETH_P_IPV6))
803 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit; 801 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
804#endif 802#endif
805 else 803 else
806 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); 804 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
@@ -962,7 +960,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
962 break; 960 break;
963 } 961 }
964 } else { 962 } else {
965 unsigned nflags=0; 963 unsigned nflags = 0;
966 964
967 t = netdev_priv(dev); 965 t = netdev_priv(dev);
968 966
@@ -1104,7 +1102,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1104 1102
1105static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) 1103static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1106{ 1104{
1107 struct iphdr *iph = (struct iphdr*) skb_mac_header(skb); 1105 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1108 memcpy(haddr, &iph->saddr, 4); 1106 memcpy(haddr, &iph->saddr, 4);
1109 return 4; 1107 return 4;
1110} 1108}
@@ -1142,6 +1140,7 @@ static int ipgre_open(struct net_device *dev)
1142static int ipgre_close(struct net_device *dev) 1140static int ipgre_close(struct net_device *dev)
1143{ 1141{
1144 struct ip_tunnel *t = netdev_priv(dev); 1142 struct ip_tunnel *t = netdev_priv(dev);
1143
1145 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 1144 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1146 struct in_device *in_dev; 1145 struct in_device *in_dev;
1147 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 1146 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
@@ -1155,14 +1154,22 @@ static int ipgre_close(struct net_device *dev)
1155 1154
1156#endif 1155#endif
1157 1156
1157static const struct net_device_ops ipgre_netdev_ops = {
1158 .ndo_init = ipgre_tunnel_init,
1159 .ndo_uninit = ipgre_tunnel_uninit,
1160#ifdef CONFIG_NET_IPGRE_BROADCAST
1161 .ndo_open = ipgre_open,
1162 .ndo_stop = ipgre_close,
1163#endif
1164 .ndo_start_xmit = ipgre_tunnel_xmit,
1165 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1166 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1167};
1168
1158static void ipgre_tunnel_setup(struct net_device *dev) 1169static void ipgre_tunnel_setup(struct net_device *dev)
1159{ 1170{
1160 dev->init = ipgre_tunnel_init; 1171 dev->netdev_ops = &ipgre_netdev_ops;
1161 dev->uninit = ipgre_tunnel_uninit;
1162 dev->destructor = free_netdev; 1172 dev->destructor = free_netdev;
1163 dev->hard_start_xmit = ipgre_tunnel_xmit;
1164 dev->do_ioctl = ipgre_tunnel_ioctl;
1165 dev->change_mtu = ipgre_tunnel_change_mtu;
1166 1173
1167 dev->type = ARPHRD_IPGRE; 1174 dev->type = ARPHRD_IPGRE;
1168 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1175 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1194,8 +1201,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
1194 return -EINVAL; 1201 return -EINVAL;
1195 dev->flags = IFF_BROADCAST; 1202 dev->flags = IFF_BROADCAST;
1196 dev->header_ops = &ipgre_header_ops; 1203 dev->header_ops = &ipgre_header_ops;
1197 dev->open = ipgre_open;
1198 dev->stop = ipgre_close;
1199 } 1204 }
1200#endif 1205#endif
1201 } else 1206 } else
@@ -1204,7 +1209,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
1204 return 0; 1209 return 0;
1205} 1210}
1206 1211
1207static int ipgre_fb_tunnel_init(struct net_device *dev) 1212static void ipgre_fb_tunnel_init(struct net_device *dev)
1208{ 1213{
1209 struct ip_tunnel *tunnel = netdev_priv(dev); 1214 struct ip_tunnel *tunnel = netdev_priv(dev);
1210 struct iphdr *iph = &tunnel->parms.iph; 1215 struct iphdr *iph = &tunnel->parms.iph;
@@ -1220,7 +1225,6 @@ static int ipgre_fb_tunnel_init(struct net_device *dev)
1220 1225
1221 dev_hold(dev); 1226 dev_hold(dev);
1222 ign->tunnels_wc[0] = tunnel; 1227 ign->tunnels_wc[0] = tunnel;
1223 return 0;
1224} 1228}
1225 1229
1226 1230
@@ -1264,9 +1268,9 @@ static int ipgre_init_net(struct net *net)
1264 err = -ENOMEM; 1268 err = -ENOMEM;
1265 goto err_alloc_dev; 1269 goto err_alloc_dev;
1266 } 1270 }
1267
1268 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1269 dev_net_set(ign->fb_tunnel_dev, net); 1271 dev_net_set(ign->fb_tunnel_dev, net);
1272
1273 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1270 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; 1274 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1271 1275
1272 if ((err = register_netdev(ign->fb_tunnel_dev))) 1276 if ((err = register_netdev(ign->fb_tunnel_dev)))
@@ -1397,16 +1401,22 @@ static int ipgre_tap_init(struct net_device *dev)
1397 return 0; 1401 return 0;
1398} 1402}
1399 1403
1404static const struct net_device_ops ipgre_tap_netdev_ops = {
1405 .ndo_init = ipgre_tap_init,
1406 .ndo_uninit = ipgre_tunnel_uninit,
1407 .ndo_start_xmit = ipgre_tunnel_xmit,
1408 .ndo_set_mac_address = eth_mac_addr,
1409 .ndo_validate_addr = eth_validate_addr,
1410 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1411};
1412
1400static void ipgre_tap_setup(struct net_device *dev) 1413static void ipgre_tap_setup(struct net_device *dev)
1401{ 1414{
1402 1415
1403 ether_setup(dev); 1416 ether_setup(dev);
1404 1417
1405 dev->init = ipgre_tap_init; 1418 dev->netdev_ops = &ipgre_tap_netdev_ops;
1406 dev->uninit = ipgre_tunnel_uninit;
1407 dev->destructor = free_netdev; 1419 dev->destructor = free_netdev;
1408 dev->hard_start_xmit = ipgre_tunnel_xmit;
1409 dev->change_mtu = ipgre_tunnel_change_mtu;
1410 1420
1411 dev->iflink = 0; 1421 dev->iflink = 0;
1412 dev->features |= NETIF_F_NETNS_LOCAL; 1422 dev->features |= NETIF_F_NETNS_LOCAL;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index cfb38ac9d698..1a58a6fa1dc0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -302,10 +302,8 @@ static inline int ip_rcv_options(struct sk_buff *skb)
302 if (!IN_DEV_SOURCE_ROUTE(in_dev)) { 302 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
303 if (IN_DEV_LOG_MARTIANS(in_dev) && 303 if (IN_DEV_LOG_MARTIANS(in_dev) &&
304 net_ratelimit()) 304 net_ratelimit())
305 printk(KERN_INFO "source route option " 305 printk(KERN_INFO "source route option %pI4 -> %pI4\n",
306 NIPQUAD_FMT " -> " NIPQUAD_FMT "\n", 306 &iph->saddr, &iph->daddr);
307 NIPQUAD(iph->saddr),
308 NIPQUAD(iph->daddr));
309 in_dev_put(in_dev); 307 in_dev_put(in_dev);
310 goto drop; 308 goto drop;
311 } 309 }
@@ -350,9 +348,9 @@ static int ip_rcv_finish(struct sk_buff *skb)
350 struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); 348 struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
351 u32 idx = skb->dst->tclassid; 349 u32 idx = skb->dst->tclassid;
352 st[idx&0xFF].o_packets++; 350 st[idx&0xFF].o_packets++;
353 st[idx&0xFF].o_bytes+=skb->len; 351 st[idx&0xFF].o_bytes += skb->len;
354 st[(idx>>16)&0xFF].i_packets++; 352 st[(idx>>16)&0xFF].i_packets++;
355 st[(idx>>16)&0xFF].i_bytes+=skb->len; 353 st[(idx>>16)&0xFF].i_bytes += skb->len;
356 } 354 }
357#endif 355#endif
358 356
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d2a8f8bb78a6..8ebe86dd72af 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -430,7 +430,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
430 * single device frame, and queue such a frame for sending. 430 * single device frame, and queue such a frame for sending.
431 */ 431 */
432 432
433int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) 433int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
434{ 434{
435 struct iphdr *iph; 435 struct iphdr *iph;
436 int raw = 0; 436 int raw = 0;
@@ -720,7 +720,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
720 int getfrag(void *from, char *to, int offset, int len, 720 int getfrag(void *from, char *to, int offset, int len,
721 int odd, struct sk_buff *skb), 721 int odd, struct sk_buff *skb),
722 void *from, int length, int hh_len, int fragheaderlen, 722 void *from, int length, int hh_len, int fragheaderlen,
723 int transhdrlen, int mtu,unsigned int flags) 723 int transhdrlen, int mtu, unsigned int flags)
724{ 724{
725 struct sk_buff *skb; 725 struct sk_buff *skb;
726 int err; 726 int err;
@@ -741,7 +741,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
741 skb_reserve(skb, hh_len); 741 skb_reserve(skb, hh_len);
742 742
743 /* create space for UDP/IP header */ 743 /* create space for UDP/IP header */
744 skb_put(skb,fragheaderlen + transhdrlen); 744 skb_put(skb, fragheaderlen + transhdrlen);
745 745
746 /* initialize network header pointer */ 746 /* initialize network header pointer */
747 skb_reset_network_header(skb); 747 skb_reset_network_header(skb);
@@ -778,7 +778,7 @@ int ip_append_data(struct sock *sk,
778 int getfrag(void *from, char *to, int offset, int len, 778 int getfrag(void *from, char *to, int offset, int len,
779 int odd, struct sk_buff *skb), 779 int odd, struct sk_buff *skb),
780 void *from, int length, int transhdrlen, 780 void *from, int length, int transhdrlen,
781 struct ipcm_cookie *ipc, struct rtable *rt, 781 struct ipcm_cookie *ipc, struct rtable **rtp,
782 unsigned int flags) 782 unsigned int flags)
783{ 783{
784 struct inet_sock *inet = inet_sk(sk); 784 struct inet_sock *inet = inet_sk(sk);
@@ -793,6 +793,7 @@ int ip_append_data(struct sock *sk,
793 int offset = 0; 793 int offset = 0;
794 unsigned int maxfraglen, fragheaderlen; 794 unsigned int maxfraglen, fragheaderlen;
795 int csummode = CHECKSUM_NONE; 795 int csummode = CHECKSUM_NONE;
796 struct rtable *rt;
796 797
797 if (flags&MSG_PROBE) 798 if (flags&MSG_PROBE)
798 return 0; 799 return 0;
@@ -812,7 +813,11 @@ int ip_append_data(struct sock *sk,
812 inet->cork.flags |= IPCORK_OPT; 813 inet->cork.flags |= IPCORK_OPT;
813 inet->cork.addr = ipc->addr; 814 inet->cork.addr = ipc->addr;
814 } 815 }
815 dst_hold(&rt->u.dst); 816 rt = *rtp;
817 /*
818 * We steal reference to this route, caller should not release it
819 */
820 *rtp = NULL;
816 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? 821 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
817 rt->u.dst.dev->mtu : 822 rt->u.dst.dev->mtu :
818 dst_mtu(rt->u.dst.path); 823 dst_mtu(rt->u.dst.path);
@@ -1279,7 +1284,12 @@ int ip_push_pending_frames(struct sock *sk)
1279 1284
1280 skb->priority = sk->sk_priority; 1285 skb->priority = sk->sk_priority;
1281 skb->mark = sk->sk_mark; 1286 skb->mark = sk->sk_mark;
1282 skb->dst = dst_clone(&rt->u.dst); 1287 /*
1288 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1289 * on dst refcount
1290 */
1291 inet->cork.dst = NULL;
1292 skb->dst = &rt->u.dst;
1283 1293
1284 if (iph->protocol == IPPROTO_ICMP) 1294 if (iph->protocol == IPPROTO_ICMP)
1285 icmp_out_count(net, ((struct icmphdr *) 1295 icmp_out_count(net, ((struct icmphdr *)
@@ -1391,7 +1401,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1391 sk->sk_protocol = ip_hdr(skb)->protocol; 1401 sk->sk_protocol = ip_hdr(skb)->protocol;
1392 sk->sk_bound_dev_if = arg->bound_dev_if; 1402 sk->sk_bound_dev_if = arg->bound_dev_if;
1393 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1403 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1394 &ipc, rt, MSG_DONTWAIT); 1404 &ipc, &rt, MSG_DONTWAIT);
1395 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1405 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1396 if (arg->csumoffset >= 0) 1406 if (arg->csumoffset >= 0)
1397 *((__sum16 *)skb_transport_header(skb) + 1407 *((__sum16 *)skb_transport_header(skb) +
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 465abf0a9869..43c05854d752 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -48,6 +48,7 @@
48#define IP_CMSG_RECVOPTS 8 48#define IP_CMSG_RECVOPTS 8
49#define IP_CMSG_RETOPTS 16 49#define IP_CMSG_RETOPTS 16
50#define IP_CMSG_PASSSEC 32 50#define IP_CMSG_PASSSEC 32
51#define IP_CMSG_ORIGDSTADDR 64
51 52
52/* 53/*
53 * SOL_IP control messages. 54 * SOL_IP control messages.
@@ -94,7 +95,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
94static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) 95static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
95{ 96{
96 unsigned char optbuf[sizeof(struct ip_options) + 40]; 97 unsigned char optbuf[sizeof(struct ip_options) + 40];
97 struct ip_options * opt = (struct ip_options*)optbuf; 98 struct ip_options * opt = (struct ip_options *)optbuf;
98 99
99 if (IPCB(skb)->opt.optlen == 0) 100 if (IPCB(skb)->opt.optlen == 0)
100 return; 101 return;
@@ -126,6 +127,27 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
126 security_release_secctx(secdata, seclen); 127 security_release_secctx(secdata, seclen);
127} 128}
128 129
130static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
131{
132 struct sockaddr_in sin;
133 struct iphdr *iph = ip_hdr(skb);
134 __be16 *ports = (__be16 *)skb_transport_header(skb);
135
136 if (skb_transport_offset(skb) + 4 > skb->len)
137 return;
138
139 /* All current transport protocols have the port numbers in the
140 * first four bytes of the transport header and this function is
141 * written with this assumption in mind.
142 */
143
144 sin.sin_family = AF_INET;
145 sin.sin_addr.s_addr = iph->daddr;
146 sin.sin_port = ports[1];
147 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
148
149 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
150}
129 151
130void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) 152void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
131{ 153{
@@ -160,6 +182,12 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
160 182
161 if (flags & 1) 183 if (flags & 1)
162 ip_cmsg_recv_security(msg, skb); 184 ip_cmsg_recv_security(msg, skb);
185
186 if ((flags>>=1) == 0)
187 return;
188 if (flags & 1)
189 ip_cmsg_recv_dstaddr(msg, skb);
190
163} 191}
164 192
165int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) 193int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
@@ -411,7 +439,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
411 int optname, char __user *optval, int optlen) 439 int optname, char __user *optval, int optlen)
412{ 440{
413 struct inet_sock *inet = inet_sk(sk); 441 struct inet_sock *inet = inet_sk(sk);
414 int val=0,err; 442 int val = 0, err;
415 443
416 if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | 444 if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
417 (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | 445 (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
@@ -421,7 +449,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
421 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | 449 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
422 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || 450 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
423 optname == IP_MULTICAST_TTL || 451 optname == IP_MULTICAST_TTL ||
424 optname == IP_MULTICAST_LOOP) { 452 optname == IP_MULTICAST_LOOP ||
453 optname == IP_RECVORIGDSTADDR) {
425 if (optlen >= sizeof(int)) { 454 if (optlen >= sizeof(int)) {
426 if (get_user(val, (int __user *) optval)) 455 if (get_user(val, (int __user *) optval))
427 return -EFAULT; 456 return -EFAULT;
@@ -437,7 +466,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
437 /* If optlen==0, it is equivalent to val == 0 */ 466 /* If optlen==0, it is equivalent to val == 0 */
438 467
439 if (ip_mroute_opt(optname)) 468 if (ip_mroute_opt(optname))
440 return ip_mroute_setsockopt(sk,optname,optval,optlen); 469 return ip_mroute_setsockopt(sk, optname, optval, optlen);
441 470
442 err = 0; 471 err = 0;
443 lock_sock(sk); 472 lock_sock(sk);
@@ -509,6 +538,12 @@ static int do_ip_setsockopt(struct sock *sk, int level,
509 else 538 else
510 inet->cmsg_flags &= ~IP_CMSG_PASSSEC; 539 inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
511 break; 540 break;
541 case IP_RECVORIGDSTADDR:
542 if (val)
543 inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
544 else
545 inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
546 break;
512 case IP_TOS: /* This sets both TOS and Precedence */ 547 case IP_TOS: /* This sets both TOS and Precedence */
513 if (sk->sk_type == SOCK_STREAM) { 548 if (sk->sk_type == SOCK_STREAM) {
514 val &= ~3; 549 val &= ~3;
@@ -549,7 +584,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
549 goto e_inval; 584 goto e_inval;
550 if (optlen<1) 585 if (optlen<1)
551 goto e_inval; 586 goto e_inval;
552 if (val==-1) 587 if (val == -1)
553 val = 1; 588 val = 1;
554 if (val < 0 || val > 255) 589 if (val < 0 || val > 255)
555 goto e_inval; 590 goto e_inval;
@@ -573,12 +608,12 @@ static int do_ip_setsockopt(struct sock *sk, int level,
573 608
574 err = -EFAULT; 609 err = -EFAULT;
575 if (optlen >= sizeof(struct ip_mreqn)) { 610 if (optlen >= sizeof(struct ip_mreqn)) {
576 if (copy_from_user(&mreq,optval,sizeof(mreq))) 611 if (copy_from_user(&mreq, optval, sizeof(mreq)))
577 break; 612 break;
578 } else { 613 } else {
579 memset(&mreq, 0, sizeof(mreq)); 614 memset(&mreq, 0, sizeof(mreq));
580 if (optlen >= sizeof(struct in_addr) && 615 if (optlen >= sizeof(struct in_addr) &&
581 copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) 616 copy_from_user(&mreq.imr_address, optval, sizeof(struct in_addr)))
582 break; 617 break;
583 } 618 }
584 619
@@ -626,11 +661,11 @@ static int do_ip_setsockopt(struct sock *sk, int level,
626 goto e_inval; 661 goto e_inval;
627 err = -EFAULT; 662 err = -EFAULT;
628 if (optlen >= sizeof(struct ip_mreqn)) { 663 if (optlen >= sizeof(struct ip_mreqn)) {
629 if (copy_from_user(&mreq,optval,sizeof(mreq))) 664 if (copy_from_user(&mreq, optval, sizeof(mreq)))
630 break; 665 break;
631 } else { 666 } else {
632 memset(&mreq, 0, sizeof(mreq)); 667 memset(&mreq, 0, sizeof(mreq));
633 if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) 668 if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
634 break; 669 break;
635 } 670 }
636 671
@@ -808,7 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
808 err = -ENOBUFS; 843 err = -ENOBUFS;
809 break; 844 break;
810 } 845 }
811 gsf = kmalloc(optlen,GFP_KERNEL); 846 gsf = kmalloc(optlen, GFP_KERNEL);
812 if (!gsf) { 847 if (!gsf) {
813 err = -ENOBUFS; 848 err = -ENOBUFS;
814 break; 849 break;
@@ -828,7 +863,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
828 goto mc_msf_out; 863 goto mc_msf_out;
829 } 864 }
830 msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); 865 msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
831 msf = kmalloc(msize,GFP_KERNEL); 866 msf = kmalloc(msize, GFP_KERNEL);
832 if (!msf) { 867 if (!msf) {
833 err = -ENOBUFS; 868 err = -ENOBUFS;
834 goto mc_msf_out; 869 goto mc_msf_out;
@@ -971,9 +1006,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
971 return -EOPNOTSUPP; 1006 return -EOPNOTSUPP;
972 1007
973 if (ip_mroute_opt(optname)) 1008 if (ip_mroute_opt(optname))
974 return ip_mroute_getsockopt(sk,optname,optval,optlen); 1009 return ip_mroute_getsockopt(sk, optname, optval, optlen);
975 1010
976 if (get_user(len,optlen)) 1011 if (get_user(len, optlen))
977 return -EFAULT; 1012 return -EFAULT;
978 if (len < 0) 1013 if (len < 0)
979 return -EINVAL; 1014 return -EINVAL;
@@ -984,7 +1019,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
984 case IP_OPTIONS: 1019 case IP_OPTIONS:
985 { 1020 {
986 unsigned char optbuf[sizeof(struct ip_options)+40]; 1021 unsigned char optbuf[sizeof(struct ip_options)+40];
987 struct ip_options * opt = (struct ip_options*)optbuf; 1022 struct ip_options * opt = (struct ip_options *)optbuf;
988 opt->optlen = 0; 1023 opt->optlen = 0;
989 if (inet->opt) 1024 if (inet->opt)
990 memcpy(optbuf, inet->opt, 1025 memcpy(optbuf, inet->opt,
@@ -1022,6 +1057,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1022 case IP_PASSSEC: 1057 case IP_PASSSEC:
1023 val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; 1058 val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
1024 break; 1059 break;
1060 case IP_RECVORIGDSTADDR:
1061 val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
1062 break;
1025 case IP_TOS: 1063 case IP_TOS:
1026 val = inet->tos; 1064 val = inet->tos;
1027 break; 1065 break;
@@ -1154,13 +1192,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1154 len = 1; 1192 len = 1;
1155 if (put_user(len, optlen)) 1193 if (put_user(len, optlen))
1156 return -EFAULT; 1194 return -EFAULT;
1157 if (copy_to_user(optval,&ucval,1)) 1195 if (copy_to_user(optval, &ucval, 1))
1158 return -EFAULT; 1196 return -EFAULT;
1159 } else { 1197 } else {
1160 len = min_t(unsigned int, sizeof(int), len); 1198 len = min_t(unsigned int, sizeof(int), len);
1161 if (put_user(len, optlen)) 1199 if (put_user(len, optlen))
1162 return -EFAULT; 1200 return -EFAULT;
1163 if (copy_to_user(optval,&val,len)) 1201 if (copy_to_user(optval, &val, len))
1164 return -EFAULT; 1202 return -EFAULT;
1165 } 1203 }
1166 return 0; 1204 return 0;
@@ -1178,7 +1216,7 @@ int ip_getsockopt(struct sock *sk, int level,
1178 !ip_mroute_opt(optname)) { 1216 !ip_mroute_opt(optname)) {
1179 int len; 1217 int len;
1180 1218
1181 if (get_user(len,optlen)) 1219 if (get_user(len, optlen))
1182 return -EFAULT; 1220 return -EFAULT;
1183 1221
1184 lock_sock(sk); 1222 lock_sock(sk);
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 38ccb6dfb02e..3262ce06294c 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -35,12 +35,12 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
35 return; 35 return;
36 36
37 spi = htonl(ntohs(ipch->cpi)); 37 spi = htonl(ntohs(ipch->cpi));
38 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, 38 x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr,
39 spi, IPPROTO_COMP, AF_INET); 39 spi, IPPROTO_COMP, AF_INET);
40 if (!x) 40 if (!x)
41 return; 41 return;
42 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIPQUAD_FMT "\n", 42 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
43 spi, NIPQUAD(iph->daddr)); 43 spi, &iph->daddr);
44 xfrm_state_put(x); 44 xfrm_state_put(x);
45} 45}
46 46
@@ -49,7 +49,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
49{ 49{
50 struct xfrm_state *t; 50 struct xfrm_state *t;
51 51
52 t = xfrm_state_alloc(); 52 t = xfrm_state_alloc(&init_net);
53 if (t == NULL) 53 if (t == NULL)
54 goto out; 54 goto out;
55 55
@@ -85,7 +85,7 @@ static int ipcomp_tunnel_attach(struct xfrm_state *x)
85 int err = 0; 85 int err = 0;
86 struct xfrm_state *t; 86 struct xfrm_state *t;
87 87
88 t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4, 88 t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4,
89 x->props.saddr.a4, IPPROTO_IPIP, AF_INET); 89 x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
90 if (!t) { 90 if (!t) {
91 t = ipcomp_tunnel_create(x); 91 t = ipcomp_tunnel_create(x);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 42065fff46c4..42a0f3dd3fd6 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -374,7 +374,7 @@ static int __init ic_defaults(void)
374 */ 374 */
375 375
376 if (!ic_host_name_set) 376 if (!ic_host_name_set)
377 sprintf(init_utsname()->nodename, NIPQUAD_FMT, NIPQUAD(ic_myaddr)); 377 sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr);
378 378
379 if (root_server_addr == NONE) 379 if (root_server_addr == NONE)
380 root_server_addr = ic_servaddr; 380 root_server_addr = ic_servaddr;
@@ -387,11 +387,11 @@ static int __init ic_defaults(void)
387 else if (IN_CLASSC(ntohl(ic_myaddr))) 387 else if (IN_CLASSC(ntohl(ic_myaddr)))
388 ic_netmask = htonl(IN_CLASSC_NET); 388 ic_netmask = htonl(IN_CLASSC_NET);
389 else { 389 else {
390 printk(KERN_ERR "IP-Config: Unable to guess netmask for address " NIPQUAD_FMT "\n", 390 printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n",
391 NIPQUAD(ic_myaddr)); 391 &ic_myaddr);
392 return -1; 392 return -1;
393 } 393 }
394 printk("IP-Config: Guessing netmask " NIPQUAD_FMT "\n", NIPQUAD(ic_netmask)); 394 printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
395 } 395 }
396 396
397 return 0; 397 return 0;
@@ -979,10 +979,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
979 ic_myaddr = b->your_ip; 979 ic_myaddr = b->your_ip;
980 ic_servaddr = server_id; 980 ic_servaddr = server_id;
981#ifdef IPCONFIG_DEBUG 981#ifdef IPCONFIG_DEBUG
982 printk("DHCP: Offered address " NIPQUAD_FMT, 982 printk("DHCP: Offered address %pI4 by server %pI4\n",
983 NIPQUAD(ic_myaddr)); 983 &ic_myaddr, &ic_servaddr);
984 printk(" by server " NIPQUAD_FMT "\n",
985 NIPQUAD(ic_servaddr));
986#endif 984#endif
987 /* The DHCP indicated server address takes 985 /* The DHCP indicated server address takes
988 * precedence over the bootp header one if 986 * precedence over the bootp header one if
@@ -1177,11 +1175,11 @@ static int __init ic_dynamic(void)
1177 return -1; 1175 return -1;
1178 } 1176 }
1179 1177
1180 printk("IP-Config: Got %s answer from " NIPQUAD_FMT ", ", 1178 printk("IP-Config: Got %s answer from %pI4, ",
1181 ((ic_got_reply & IC_RARP) ? "RARP" 1179 ((ic_got_reply & IC_RARP) ? "RARP"
1182 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1180 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1183 NIPQUAD(ic_servaddr)); 1181 &ic_servaddr);
1184 printk("my address is " NIPQUAD_FMT "\n", NIPQUAD(ic_myaddr)); 1182 printk("my address is %pI4\n", &ic_myaddr);
1185 1183
1186 return 0; 1184 return 0;
1187} 1185}
@@ -1206,14 +1204,12 @@ static int pnp_seq_show(struct seq_file *seq, void *v)
1206 "domain %s\n", ic_domain); 1204 "domain %s\n", ic_domain);
1207 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { 1205 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
1208 if (ic_nameservers[i] != NONE) 1206 if (ic_nameservers[i] != NONE)
1209 seq_printf(seq, 1207 seq_printf(seq, "nameserver %pI4\n",
1210 "nameserver " NIPQUAD_FMT "\n", 1208 &ic_nameservers[i]);
1211 NIPQUAD(ic_nameservers[i]));
1212 } 1209 }
1213 if (ic_servaddr != NONE) 1210 if (ic_servaddr != NONE)
1214 seq_printf(seq, 1211 seq_printf(seq, "bootserver %pI4\n",
1215 "bootserver " NIPQUAD_FMT "\n", 1212 &ic_servaddr);
1216 NIPQUAD(ic_servaddr));
1217 return 0; 1213 return 0;
1218} 1214}
1219 1215
@@ -1387,13 +1383,13 @@ static int __init ip_auto_config(void)
1387 */ 1383 */
1388 printk("IP-Config: Complete:"); 1384 printk("IP-Config: Complete:");
1389 printk("\n device=%s", ic_dev->name); 1385 printk("\n device=%s", ic_dev->name);
1390 printk(", addr=" NIPQUAD_FMT, NIPQUAD(ic_myaddr)); 1386 printk(", addr=%pI4", &ic_myaddr);
1391 printk(", mask=" NIPQUAD_FMT, NIPQUAD(ic_netmask)); 1387 printk(", mask=%pI4", &ic_netmask);
1392 printk(", gw=" NIPQUAD_FMT, NIPQUAD(ic_gateway)); 1388 printk(", gw=%pI4", &ic_gateway);
1393 printk(",\n host=%s, domain=%s, nis-domain=%s", 1389 printk(",\n host=%s, domain=%s, nis-domain=%s",
1394 utsname()->nodename, ic_domain, utsname()->domainname); 1390 utsname()->nodename, ic_domain, utsname()->domainname);
1395 printk(",\n bootserver=" NIPQUAD_FMT, NIPQUAD(ic_servaddr)); 1391 printk(",\n bootserver=%pI4", &ic_servaddr);
1396 printk(", rootserver=" NIPQUAD_FMT, NIPQUAD(root_server_addr)); 1392 printk(", rootserver=%pI4", &root_server_addr);
1397 printk(", rootpath=%s", root_server_path); 1393 printk(", rootpath=%s", root_server_path);
1398 printk("\n"); 1394 printk("\n");
1399#endif /* !SILENT */ 1395#endif /* !SILENT */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 29609d29df76..5079dfbc6f38 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -130,8 +130,8 @@ struct ipip_net {
130 struct net_device *fb_tunnel_dev; 130 struct net_device *fb_tunnel_dev;
131}; 131};
132 132
133static int ipip_fb_tunnel_init(struct net_device *dev); 133static void ipip_fb_tunnel_init(struct net_device *dev);
134static int ipip_tunnel_init(struct net_device *dev); 134static void ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136 136
137static DEFINE_RWLOCK(ipip_lock); 137static DEFINE_RWLOCK(ipip_lock);
@@ -245,9 +245,10 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
245 } 245 }
246 246
247 nt = netdev_priv(dev); 247 nt = netdev_priv(dev);
248 dev->init = ipip_tunnel_init;
249 nt->parms = *parms; 248 nt->parms = *parms;
250 249
250 ipip_tunnel_init(dev);
251
251 if (register_netdevice(dev) < 0) 252 if (register_netdevice(dev) < 0)
252 goto failed_free; 253 goto failed_free;
253 254
@@ -281,7 +282,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
281 8 bytes of packet payload. It means, that precise relaying of 282 8 bytes of packet payload. It means, that precise relaying of
282 ICMP in the real Internet is absolutely infeasible. 283 ICMP in the real Internet is absolutely infeasible.
283 */ 284 */
284 struct iphdr *iph = (struct iphdr*)skb->data; 285 struct iphdr *iph = (struct iphdr *)skb->data;
285 const int type = icmp_hdr(skb)->type; 286 const int type = icmp_hdr(skb)->type;
286 const int code = icmp_hdr(skb)->code; 287 const int code = icmp_hdr(skb)->code;
287 struct ip_tunnel *t; 288 struct ip_tunnel *t;
@@ -691,12 +692,17 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
691 return 0; 692 return 0;
692} 693}
693 694
695static const struct net_device_ops ipip_netdev_ops = {
696 .ndo_uninit = ipip_tunnel_uninit,
697 .ndo_start_xmit = ipip_tunnel_xmit,
698 .ndo_do_ioctl = ipip_tunnel_ioctl,
699 .ndo_change_mtu = ipip_tunnel_change_mtu,
700
701};
702
694static void ipip_tunnel_setup(struct net_device *dev) 703static void ipip_tunnel_setup(struct net_device *dev)
695{ 704{
696 dev->uninit = ipip_tunnel_uninit; 705 dev->netdev_ops = &ipip_netdev_ops;
697 dev->hard_start_xmit = ipip_tunnel_xmit;
698 dev->do_ioctl = ipip_tunnel_ioctl;
699 dev->change_mtu = ipip_tunnel_change_mtu;
700 dev->destructor = free_netdev; 706 dev->destructor = free_netdev;
701 707
702 dev->type = ARPHRD_TUNNEL; 708 dev->type = ARPHRD_TUNNEL;
@@ -708,11 +714,9 @@ static void ipip_tunnel_setup(struct net_device *dev)
708 dev->features |= NETIF_F_NETNS_LOCAL; 714 dev->features |= NETIF_F_NETNS_LOCAL;
709} 715}
710 716
711static int ipip_tunnel_init(struct net_device *dev) 717static void ipip_tunnel_init(struct net_device *dev)
712{ 718{
713 struct ip_tunnel *tunnel; 719 struct ip_tunnel *tunnel = netdev_priv(dev);
714
715 tunnel = netdev_priv(dev);
716 720
717 tunnel->dev = dev; 721 tunnel->dev = dev;
718 strcpy(tunnel->parms.name, dev->name); 722 strcpy(tunnel->parms.name, dev->name);
@@ -721,11 +725,9 @@ static int ipip_tunnel_init(struct net_device *dev)
721 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 725 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
722 726
723 ipip_tunnel_bind_dev(dev); 727 ipip_tunnel_bind_dev(dev);
724
725 return 0;
726} 728}
727 729
728static int ipip_fb_tunnel_init(struct net_device *dev) 730static void ipip_fb_tunnel_init(struct net_device *dev)
729{ 731{
730 struct ip_tunnel *tunnel = netdev_priv(dev); 732 struct ip_tunnel *tunnel = netdev_priv(dev);
731 struct iphdr *iph = &tunnel->parms.iph; 733 struct iphdr *iph = &tunnel->parms.iph;
@@ -740,7 +742,6 @@ static int ipip_fb_tunnel_init(struct net_device *dev)
740 742
741 dev_hold(dev); 743 dev_hold(dev);
742 ipn->tunnels_wc[0] = tunnel; 744 ipn->tunnels_wc[0] = tunnel;
743 return 0;
744} 745}
745 746
746static struct xfrm_tunnel ipip_handler = { 747static struct xfrm_tunnel ipip_handler = {
@@ -792,10 +793,10 @@ static int ipip_init_net(struct net *net)
792 err = -ENOMEM; 793 err = -ENOMEM;
793 goto err_alloc_dev; 794 goto err_alloc_dev;
794 } 795 }
795
796 ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init;
797 dev_net_set(ipn->fb_tunnel_dev, net); 796 dev_net_set(ipn->fb_tunnel_dev, net);
798 797
798 ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
799
799 if ((err = register_netdev(ipn->fb_tunnel_dev))) 800 if ((err = register_netdev(ipn->fb_tunnel_dev)))
800 goto err_reg_dev; 801 goto err_reg_dev;
801 802
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 25924b1eb2ef..14666449dc1c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,8 +124,8 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
124 124
125 dev = __dev_get_by_name(&init_net, "tunl0"); 125 dev = __dev_get_by_name(&init_net, "tunl0");
126 if (dev) { 126 if (dev) {
127 const struct net_device_ops *ops = dev->netdev_ops;
127 struct ifreq ifr; 128 struct ifreq ifr;
128 mm_segment_t oldfs;
129 struct ip_tunnel_parm p; 129 struct ip_tunnel_parm p;
130 130
131 memset(&p, 0, sizeof(p)); 131 memset(&p, 0, sizeof(p));
@@ -137,9 +137,13 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
137 sprintf(p.name, "dvmrp%d", v->vifc_vifi); 137 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
138 ifr.ifr_ifru.ifru_data = (__force void __user *)&p; 138 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
139 139
140 oldfs = get_fs(); set_fs(KERNEL_DS); 140 if (ops->ndo_do_ioctl) {
141 dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL); 141 mm_segment_t oldfs = get_fs();
142 set_fs(oldfs); 142
143 set_fs(KERNEL_DS);
144 ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
145 set_fs(oldfs);
146 }
143 } 147 }
144} 148}
145 149
@@ -151,9 +155,9 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
151 dev = __dev_get_by_name(&init_net, "tunl0"); 155 dev = __dev_get_by_name(&init_net, "tunl0");
152 156
153 if (dev) { 157 if (dev) {
158 const struct net_device_ops *ops = dev->netdev_ops;
154 int err; 159 int err;
155 struct ifreq ifr; 160 struct ifreq ifr;
156 mm_segment_t oldfs;
157 struct ip_tunnel_parm p; 161 struct ip_tunnel_parm p;
158 struct in_device *in_dev; 162 struct in_device *in_dev;
159 163
@@ -166,9 +170,14 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
166 sprintf(p.name, "dvmrp%d", v->vifc_vifi); 170 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
167 ifr.ifr_ifru.ifru_data = (__force void __user *)&p; 171 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
168 172
169 oldfs = get_fs(); set_fs(KERNEL_DS); 173 if (ops->ndo_do_ioctl) {
170 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); 174 mm_segment_t oldfs = get_fs();
171 set_fs(oldfs); 175
176 set_fs(KERNEL_DS);
177 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
178 set_fs(oldfs);
179 } else
180 err = -EOPNOTSUPP;
172 181
173 dev = NULL; 182 dev = NULL;
174 183
@@ -213,12 +222,16 @@ static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
213 return 0; 222 return 0;
214} 223}
215 224
225static const struct net_device_ops reg_vif_netdev_ops = {
226 .ndo_start_xmit = reg_vif_xmit,
227};
228
216static void reg_vif_setup(struct net_device *dev) 229static void reg_vif_setup(struct net_device *dev)
217{ 230{
218 dev->type = ARPHRD_PIMREG; 231 dev->type = ARPHRD_PIMREG;
219 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; 232 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
220 dev->flags = IFF_NOARP; 233 dev->flags = IFF_NOARP;
221 dev->hard_start_xmit = reg_vif_xmit; 234 dev->netdev_ops = &reg_vif_netdev_ops,
222 dev->destructor = free_netdev; 235 dev->destructor = free_netdev;
223} 236}
224 237
@@ -331,7 +344,7 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
331 344
332 atomic_dec(&cache_resolve_queue_len); 345 atomic_dec(&cache_resolve_queue_len);
333 346
334 while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { 347 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
335 if (ip_hdr(skb)->version == 0) { 348 if (ip_hdr(skb)->version == 0) {
336 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 349 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
337 nlh->nlmsg_type = NLMSG_ERROR; 350 nlh->nlmsg_type = NLMSG_ERROR;
@@ -477,13 +490,13 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
477 /* 490 /*
478 * Fill in the VIF structures 491 * Fill in the VIF structures
479 */ 492 */
480 v->rate_limit=vifc->vifc_rate_limit; 493 v->rate_limit = vifc->vifc_rate_limit;
481 v->local=vifc->vifc_lcl_addr.s_addr; 494 v->local = vifc->vifc_lcl_addr.s_addr;
482 v->remote=vifc->vifc_rmt_addr.s_addr; 495 v->remote = vifc->vifc_rmt_addr.s_addr;
483 v->flags=vifc->vifc_flags; 496 v->flags = vifc->vifc_flags;
484 if (!mrtsock) 497 if (!mrtsock)
485 v->flags |= VIFF_STATIC; 498 v->flags |= VIFF_STATIC;
486 v->threshold=vifc->vifc_threshold; 499 v->threshold = vifc->vifc_threshold;
487 v->bytes_in = 0; 500 v->bytes_in = 0;
488 v->bytes_out = 0; 501 v->bytes_out = 0;
489 v->pkt_in = 0; 502 v->pkt_in = 0;
@@ -494,7 +507,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
494 507
495 /* And finish update writing critical data */ 508 /* And finish update writing critical data */
496 write_lock_bh(&mrt_lock); 509 write_lock_bh(&mrt_lock);
497 v->dev=dev; 510 v->dev = dev;
498#ifdef CONFIG_IP_PIMSM 511#ifdef CONFIG_IP_PIMSM
499 if (v->flags&VIFF_REGISTER) 512 if (v->flags&VIFF_REGISTER)
500 reg_vif_num = vifi; 513 reg_vif_num = vifi;
@@ -507,7 +520,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
507 520
508static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) 521static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
509{ 522{
510 int line=MFC_HASH(mcastgrp,origin); 523 int line = MFC_HASH(mcastgrp, origin);
511 struct mfc_cache *c; 524 struct mfc_cache *c;
512 525
513 for (c=mfc_cache_array[line]; c; c = c->next) { 526 for (c=mfc_cache_array[line]; c; c = c->next) {
@@ -522,8 +535,8 @@ static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
522 */ 535 */
523static struct mfc_cache *ipmr_cache_alloc(void) 536static struct mfc_cache *ipmr_cache_alloc(void)
524{ 537{
525 struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 538 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
526 if (c==NULL) 539 if (c == NULL)
527 return NULL; 540 return NULL;
528 c->mfc_un.res.minvif = MAXVIFS; 541 c->mfc_un.res.minvif = MAXVIFS;
529 return c; 542 return c;
@@ -531,8 +544,8 @@ static struct mfc_cache *ipmr_cache_alloc(void)
531 544
532static struct mfc_cache *ipmr_cache_alloc_unres(void) 545static struct mfc_cache *ipmr_cache_alloc_unres(void)
533{ 546{
534 struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 547 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
535 if (c==NULL) 548 if (c == NULL)
536 return NULL; 549 return NULL;
537 skb_queue_head_init(&c->mfc_un.unres.unresolved); 550 skb_queue_head_init(&c->mfc_un.unres.unresolved);
538 c->mfc_un.unres.expires = jiffies + 10*HZ; 551 c->mfc_un.unres.expires = jiffies + 10*HZ;
@@ -552,7 +565,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
552 * Play the pending entries through our router 565 * Play the pending entries through our router
553 */ 566 */
554 567
555 while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { 568 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
556 if (ip_hdr(skb)->version == 0) { 569 if (ip_hdr(skb)->version == 0) {
557 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 570 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
558 571
@@ -637,7 +650,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
637 * Add our header 650 * Add our header
638 */ 651 */
639 652
640 igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); 653 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
641 igmp->type = 654 igmp->type =
642 msg->im_msgtype = assert; 655 msg->im_msgtype = assert;
643 igmp->code = 0; 656 igmp->code = 0;
@@ -653,7 +666,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
653 /* 666 /*
654 * Deliver to mrouted 667 * Deliver to mrouted
655 */ 668 */
656 if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { 669 if ((ret = sock_queue_rcv_skb(mroute_socket, skb))<0) {
657 if (net_ratelimit()) 670 if (net_ratelimit())
658 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 671 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
659 kfree_skb(skb); 672 kfree_skb(skb);
@@ -685,7 +698,7 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
685 * Create a new entry if allowable 698 * Create a new entry if allowable
686 */ 699 */
687 700
688 if (atomic_read(&cache_resolve_queue_len)>=10 || 701 if (atomic_read(&cache_resolve_queue_len) >= 10 ||
689 (c=ipmr_cache_alloc_unres())==NULL) { 702 (c=ipmr_cache_alloc_unres())==NULL) {
690 spin_unlock_bh(&mfc_unres_lock); 703 spin_unlock_bh(&mfc_unres_lock);
691 704
@@ -728,7 +741,7 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
728 kfree_skb(skb); 741 kfree_skb(skb);
729 err = -ENOBUFS; 742 err = -ENOBUFS;
730 } else { 743 } else {
731 skb_queue_tail(&c->mfc_un.unres.unresolved,skb); 744 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
732 err = 0; 745 err = 0;
733 } 746 }
734 747
@@ -745,7 +758,7 @@ static int ipmr_mfc_delete(struct mfcctl *mfc)
745 int line; 758 int line;
746 struct mfc_cache *c, **cp; 759 struct mfc_cache *c, **cp;
747 760
748 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 761 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
749 762
750 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { 763 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
751 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 764 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
@@ -766,7 +779,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
766 int line; 779 int line;
767 struct mfc_cache *uc, *c, **cp; 780 struct mfc_cache *uc, *c, **cp;
768 781
769 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 782 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
770 783
771 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { 784 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
772 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 785 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
@@ -787,13 +800,13 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
787 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) 800 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
788 return -EINVAL; 801 return -EINVAL;
789 802
790 c=ipmr_cache_alloc(); 803 c = ipmr_cache_alloc();
791 if (c==NULL) 804 if (c == NULL)
792 return -ENOMEM; 805 return -ENOMEM;
793 806
794 c->mfc_origin=mfc->mfcc_origin.s_addr; 807 c->mfc_origin = mfc->mfcc_origin.s_addr;
795 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; 808 c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
796 c->mfc_parent=mfc->mfcc_parent; 809 c->mfc_parent = mfc->mfcc_parent;
797 ipmr_update_thresholds(c, mfc->mfcc_ttls); 810 ipmr_update_thresholds(c, mfc->mfcc_ttls);
798 if (!mrtsock) 811 if (!mrtsock)
799 c->mfc_flags |= MFC_STATIC; 812 c->mfc_flags |= MFC_STATIC;
@@ -846,7 +859,7 @@ static void mroute_clean_tables(struct sock *sk)
846 /* 859 /*
847 * Wipe the cache 860 * Wipe the cache
848 */ 861 */
849 for (i=0;i<MFC_LINES;i++) { 862 for (i=0; i<MFC_LINES; i++) {
850 struct mfc_cache *c, **cp; 863 struct mfc_cache *c, **cp;
851 864
852 cp = &mfc_cache_array[i]; 865 cp = &mfc_cache_array[i];
@@ -887,7 +900,7 @@ static void mrtsock_destruct(struct sock *sk)
887 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--; 900 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
888 901
889 write_lock_bh(&mrt_lock); 902 write_lock_bh(&mrt_lock);
890 mroute_socket=NULL; 903 mroute_socket = NULL;
891 write_unlock_bh(&mrt_lock); 904 write_unlock_bh(&mrt_lock);
892 905
893 mroute_clean_tables(sk); 906 mroute_clean_tables(sk);
@@ -902,7 +915,7 @@ static void mrtsock_destruct(struct sock *sk)
902 * MOSPF/PIM router set up we can clean this up. 915 * MOSPF/PIM router set up we can clean this up.
903 */ 916 */
904 917
905int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen) 918int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
906{ 919{
907 int ret; 920 int ret;
908 struct vifctl vif; 921 struct vifctl vif;
@@ -918,7 +931,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
918 if (sk->sk_type != SOCK_RAW || 931 if (sk->sk_type != SOCK_RAW ||
919 inet_sk(sk)->num != IPPROTO_IGMP) 932 inet_sk(sk)->num != IPPROTO_IGMP)
920 return -EOPNOTSUPP; 933 return -EOPNOTSUPP;
921 if (optlen!=sizeof(int)) 934 if (optlen != sizeof(int))
922 return -ENOPROTOOPT; 935 return -ENOPROTOOPT;
923 936
924 rtnl_lock(); 937 rtnl_lock();
@@ -930,7 +943,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
930 ret = ip_ra_control(sk, 1, mrtsock_destruct); 943 ret = ip_ra_control(sk, 1, mrtsock_destruct);
931 if (ret == 0) { 944 if (ret == 0) {
932 write_lock_bh(&mrt_lock); 945 write_lock_bh(&mrt_lock);
933 mroute_socket=sk; 946 mroute_socket = sk;
934 write_unlock_bh(&mrt_lock); 947 write_unlock_bh(&mrt_lock);
935 948
936 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++; 949 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
@@ -938,19 +951,19 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
938 rtnl_unlock(); 951 rtnl_unlock();
939 return ret; 952 return ret;
940 case MRT_DONE: 953 case MRT_DONE:
941 if (sk!=mroute_socket) 954 if (sk != mroute_socket)
942 return -EACCES; 955 return -EACCES;
943 return ip_ra_control(sk, 0, NULL); 956 return ip_ra_control(sk, 0, NULL);
944 case MRT_ADD_VIF: 957 case MRT_ADD_VIF:
945 case MRT_DEL_VIF: 958 case MRT_DEL_VIF:
946 if (optlen!=sizeof(vif)) 959 if (optlen != sizeof(vif))
947 return -EINVAL; 960 return -EINVAL;
948 if (copy_from_user(&vif,optval,sizeof(vif))) 961 if (copy_from_user(&vif, optval, sizeof(vif)))
949 return -EFAULT; 962 return -EFAULT;
950 if (vif.vifc_vifi >= MAXVIFS) 963 if (vif.vifc_vifi >= MAXVIFS)
951 return -ENFILE; 964 return -ENFILE;
952 rtnl_lock(); 965 rtnl_lock();
953 if (optname==MRT_ADD_VIF) { 966 if (optname == MRT_ADD_VIF) {
954 ret = vif_add(&vif, sk==mroute_socket); 967 ret = vif_add(&vif, sk==mroute_socket);
955 } else { 968 } else {
956 ret = vif_delete(vif.vifc_vifi, 0); 969 ret = vif_delete(vif.vifc_vifi, 0);
@@ -964,12 +977,12 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
964 */ 977 */
965 case MRT_ADD_MFC: 978 case MRT_ADD_MFC:
966 case MRT_DEL_MFC: 979 case MRT_DEL_MFC:
967 if (optlen!=sizeof(mfc)) 980 if (optlen != sizeof(mfc))
968 return -EINVAL; 981 return -EINVAL;
969 if (copy_from_user(&mfc,optval, sizeof(mfc))) 982 if (copy_from_user(&mfc, optval, sizeof(mfc)))
970 return -EFAULT; 983 return -EFAULT;
971 rtnl_lock(); 984 rtnl_lock();
972 if (optname==MRT_DEL_MFC) 985 if (optname == MRT_DEL_MFC)
973 ret = ipmr_mfc_delete(&mfc); 986 ret = ipmr_mfc_delete(&mfc);
974 else 987 else
975 ret = ipmr_mfc_add(&mfc, sk==mroute_socket); 988 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
@@ -1028,12 +1041,12 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
1028 * Getsock opt support for the multicast routing system. 1041 * Getsock opt support for the multicast routing system.
1029 */ 1042 */
1030 1043
1031int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen) 1044int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1032{ 1045{
1033 int olr; 1046 int olr;
1034 int val; 1047 int val;
1035 1048
1036 if (optname!=MRT_VERSION && 1049 if (optname != MRT_VERSION &&
1037#ifdef CONFIG_IP_PIMSM 1050#ifdef CONFIG_IP_PIMSM
1038 optname!=MRT_PIM && 1051 optname!=MRT_PIM &&
1039#endif 1052#endif
@@ -1047,17 +1060,17 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u
1047 if (olr < 0) 1060 if (olr < 0)
1048 return -EINVAL; 1061 return -EINVAL;
1049 1062
1050 if (put_user(olr,optlen)) 1063 if (put_user(olr, optlen))
1051 return -EFAULT; 1064 return -EFAULT;
1052 if (optname==MRT_VERSION) 1065 if (optname == MRT_VERSION)
1053 val=0x0305; 1066 val = 0x0305;
1054#ifdef CONFIG_IP_PIMSM 1067#ifdef CONFIG_IP_PIMSM
1055 else if (optname==MRT_PIM) 1068 else if (optname == MRT_PIM)
1056 val=mroute_do_pim; 1069 val = mroute_do_pim;
1057#endif 1070#endif
1058 else 1071 else
1059 val=mroute_do_assert; 1072 val = mroute_do_assert;
1060 if (copy_to_user(optval,&val,olr)) 1073 if (copy_to_user(optval, &val, olr))
1061 return -EFAULT; 1074 return -EFAULT;
1062 return 0; 1075 return 0;
1063} 1076}
@@ -1075,27 +1088,27 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1075 1088
1076 switch (cmd) { 1089 switch (cmd) {
1077 case SIOCGETVIFCNT: 1090 case SIOCGETVIFCNT:
1078 if (copy_from_user(&vr,arg,sizeof(vr))) 1091 if (copy_from_user(&vr, arg, sizeof(vr)))
1079 return -EFAULT; 1092 return -EFAULT;
1080 if (vr.vifi>=maxvif) 1093 if (vr.vifi >= maxvif)
1081 return -EINVAL; 1094 return -EINVAL;
1082 read_lock(&mrt_lock); 1095 read_lock(&mrt_lock);
1083 vif=&vif_table[vr.vifi]; 1096 vif=&vif_table[vr.vifi];
1084 if (VIF_EXISTS(vr.vifi)) { 1097 if (VIF_EXISTS(vr.vifi)) {
1085 vr.icount=vif->pkt_in; 1098 vr.icount = vif->pkt_in;
1086 vr.ocount=vif->pkt_out; 1099 vr.ocount = vif->pkt_out;
1087 vr.ibytes=vif->bytes_in; 1100 vr.ibytes = vif->bytes_in;
1088 vr.obytes=vif->bytes_out; 1101 vr.obytes = vif->bytes_out;
1089 read_unlock(&mrt_lock); 1102 read_unlock(&mrt_lock);
1090 1103
1091 if (copy_to_user(arg,&vr,sizeof(vr))) 1104 if (copy_to_user(arg, &vr, sizeof(vr)))
1092 return -EFAULT; 1105 return -EFAULT;
1093 return 0; 1106 return 0;
1094 } 1107 }
1095 read_unlock(&mrt_lock); 1108 read_unlock(&mrt_lock);
1096 return -EADDRNOTAVAIL; 1109 return -EADDRNOTAVAIL;
1097 case SIOCGETSGCNT: 1110 case SIOCGETSGCNT:
1098 if (copy_from_user(&sr,arg,sizeof(sr))) 1111 if (copy_from_user(&sr, arg, sizeof(sr)))
1099 return -EFAULT; 1112 return -EFAULT;
1100 1113
1101 read_lock(&mrt_lock); 1114 read_lock(&mrt_lock);
@@ -1106,7 +1119,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1106 sr.wrong_if = c->mfc_un.res.wrong_if; 1119 sr.wrong_if = c->mfc_un.res.wrong_if;
1107 read_unlock(&mrt_lock); 1120 read_unlock(&mrt_lock);
1108 1121
1109 if (copy_to_user(arg,&sr,sizeof(sr))) 1122 if (copy_to_user(arg, &sr, sizeof(sr)))
1110 return -EFAULT; 1123 return -EFAULT;
1111 return 0; 1124 return 0;
1112 } 1125 }
@@ -1130,15 +1143,15 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1130 if (event != NETDEV_UNREGISTER) 1143 if (event != NETDEV_UNREGISTER)
1131 return NOTIFY_DONE; 1144 return NOTIFY_DONE;
1132 v=&vif_table[0]; 1145 v=&vif_table[0];
1133 for (ct=0;ct<maxvif;ct++,v++) { 1146 for (ct=0; ct<maxvif; ct++,v++) {
1134 if (v->dev==dev) 1147 if (v->dev == dev)
1135 vif_delete(ct, 1); 1148 vif_delete(ct, 1);
1136 } 1149 }
1137 return NOTIFY_DONE; 1150 return NOTIFY_DONE;
1138} 1151}
1139 1152
1140 1153
1141static struct notifier_block ip_mr_notifier={ 1154static struct notifier_block ip_mr_notifier = {
1142 .notifier_call = ipmr_device_event, 1155 .notifier_call = ipmr_device_event,
1143}; 1156};
1144 1157
@@ -1204,7 +1217,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1204#ifdef CONFIG_IP_PIMSM 1217#ifdef CONFIG_IP_PIMSM
1205 if (vif->flags & VIFF_REGISTER) { 1218 if (vif->flags & VIFF_REGISTER) {
1206 vif->pkt_out++; 1219 vif->pkt_out++;
1207 vif->bytes_out+=skb->len; 1220 vif->bytes_out += skb->len;
1208 vif->dev->stats.tx_bytes += skb->len; 1221 vif->dev->stats.tx_bytes += skb->len;
1209 vif->dev->stats.tx_packets++; 1222 vif->dev->stats.tx_packets++;
1210 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); 1223 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
@@ -1254,7 +1267,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1254 } 1267 }
1255 1268
1256 vif->pkt_out++; 1269 vif->pkt_out++;
1257 vif->bytes_out+=skb->len; 1270 vif->bytes_out += skb->len;
1258 1271
1259 dst_release(skb->dst); 1272 dst_release(skb->dst);
1260 skb->dst = &rt->u.dst; 1273 skb->dst = &rt->u.dst;
@@ -1352,7 +1365,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1352 } 1365 }
1353 1366
1354 vif_table[vif].pkt_in++; 1367 vif_table[vif].pkt_in++;
1355 vif_table[vif].bytes_in+=skb->len; 1368 vif_table[vif].bytes_in += skb->len;
1356 1369
1357 /* 1370 /*
1358 * Forward the frame 1371 * Forward the frame
@@ -1364,7 +1377,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1364 if (skb2) 1377 if (skb2)
1365 ipmr_queue_xmit(skb2, cache, psend); 1378 ipmr_queue_xmit(skb2, cache, psend);
1366 } 1379 }
1367 psend=ct; 1380 psend = ct;
1368 } 1381 }
1369 } 1382 }
1370 if (psend != -1) { 1383 if (psend != -1) {
@@ -1428,7 +1441,7 @@ int ip_mr_input(struct sk_buff *skb)
1428 /* 1441 /*
1429 * No usable cache entry 1442 * No usable cache entry
1430 */ 1443 */
1431 if (cache==NULL) { 1444 if (cache == NULL) {
1432 int vif; 1445 int vif;
1433 1446
1434 if (local) { 1447 if (local) {
@@ -1469,29 +1482,13 @@ dont_forward:
1469 return 0; 1482 return 0;
1470} 1483}
1471 1484
1472#ifdef CONFIG_IP_PIMSM_V1 1485#ifdef CONFIG_IP_PIMSM
1473/* 1486static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1474 * Handle IGMP messages of PIMv1
1475 */
1476
1477int pim_rcv_v1(struct sk_buff * skb)
1478{ 1487{
1479 struct igmphdr *pim; 1488 struct net_device *reg_dev = NULL;
1480 struct iphdr *encap; 1489 struct iphdr *encap;
1481 struct net_device *reg_dev = NULL;
1482
1483 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1484 goto drop;
1485 1490
1486 pim = igmp_hdr(skb); 1491 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1487
1488 if (!mroute_do_pim ||
1489 skb->len < sizeof(*pim) + sizeof(*encap) ||
1490 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1491 goto drop;
1492
1493 encap = (struct iphdr *)(skb_transport_header(skb) +
1494 sizeof(struct igmphdr));
1495 /* 1492 /*
1496 Check that: 1493 Check that:
1497 a. packet is really destinted to a multicast group 1494 a. packet is really destinted to a multicast group
@@ -1500,8 +1497,8 @@ int pim_rcv_v1(struct sk_buff * skb)
1500 */ 1497 */
1501 if (!ipv4_is_multicast(encap->daddr) || 1498 if (!ipv4_is_multicast(encap->daddr) ||
1502 encap->tot_len == 0 || 1499 encap->tot_len == 0 ||
1503 ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 1500 ntohs(encap->tot_len) + pimlen > skb->len)
1504 goto drop; 1501 return 1;
1505 1502
1506 read_lock(&mrt_lock); 1503 read_lock(&mrt_lock);
1507 if (reg_vif_num >= 0) 1504 if (reg_vif_num >= 0)
@@ -1511,7 +1508,7 @@ int pim_rcv_v1(struct sk_buff * skb)
1511 read_unlock(&mrt_lock); 1508 read_unlock(&mrt_lock);
1512 1509
1513 if (reg_dev == NULL) 1510 if (reg_dev == NULL)
1514 goto drop; 1511 return 1;
1515 1512
1516 skb->mac_header = skb->network_header; 1513 skb->mac_header = skb->network_header;
1517 skb_pull(skb, (u8*)encap - skb->data); 1514 skb_pull(skb, (u8*)encap - skb->data);
@@ -1527,9 +1524,33 @@ int pim_rcv_v1(struct sk_buff * skb)
1527 nf_reset(skb); 1524 nf_reset(skb);
1528 netif_rx(skb); 1525 netif_rx(skb);
1529 dev_put(reg_dev); 1526 dev_put(reg_dev);
1527
1530 return 0; 1528 return 0;
1531 drop: 1529}
1532 kfree_skb(skb); 1530#endif
1531
1532#ifdef CONFIG_IP_PIMSM_V1
1533/*
1534 * Handle IGMP messages of PIMv1
1535 */
1536
1537int pim_rcv_v1(struct sk_buff * skb)
1538{
1539 struct igmphdr *pim;
1540
1541 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1542 goto drop;
1543
1544 pim = igmp_hdr(skb);
1545
1546 if (!mroute_do_pim ||
1547 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1548 goto drop;
1549
1550 if (__pim_rcv(skb, sizeof(*pim))) {
1551drop:
1552 kfree_skb(skb);
1553 }
1533 return 0; 1554 return 0;
1534} 1555}
1535#endif 1556#endif
@@ -1538,10 +1559,8 @@ int pim_rcv_v1(struct sk_buff * skb)
1538static int pim_rcv(struct sk_buff * skb) 1559static int pim_rcv(struct sk_buff * skb)
1539{ 1560{
1540 struct pimreghdr *pim; 1561 struct pimreghdr *pim;
1541 struct iphdr *encap;
1542 struct net_device *reg_dev = NULL;
1543 1562
1544 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 1563 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1545 goto drop; 1564 goto drop;
1546 1565
1547 pim = (struct pimreghdr *)skb_transport_header(skb); 1566 pim = (struct pimreghdr *)skb_transport_header(skb);
@@ -1551,41 +1570,10 @@ static int pim_rcv(struct sk_buff * skb)
1551 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1570 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1552 goto drop; 1571 goto drop;
1553 1572
1554 /* check if the inner packet is destined to mcast group */ 1573 if (__pim_rcv(skb, sizeof(*pim))) {
1555 encap = (struct iphdr *)(skb_transport_header(skb) + 1574drop:
1556 sizeof(struct pimreghdr)); 1575 kfree_skb(skb);
1557 if (!ipv4_is_multicast(encap->daddr) || 1576 }
1558 encap->tot_len == 0 ||
1559 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1560 goto drop;
1561
1562 read_lock(&mrt_lock);
1563 if (reg_vif_num >= 0)
1564 reg_dev = vif_table[reg_vif_num].dev;
1565 if (reg_dev)
1566 dev_hold(reg_dev);
1567 read_unlock(&mrt_lock);
1568
1569 if (reg_dev == NULL)
1570 goto drop;
1571
1572 skb->mac_header = skb->network_header;
1573 skb_pull(skb, (u8*)encap - skb->data);
1574 skb_reset_network_header(skb);
1575 skb->dev = reg_dev;
1576 skb->protocol = htons(ETH_P_IP);
1577 skb->ip_summed = 0;
1578 skb->pkt_type = PACKET_HOST;
1579 dst_release(skb->dst);
1580 reg_dev->stats.rx_bytes += skb->len;
1581 reg_dev->stats.rx_packets++;
1582 skb->dst = NULL;
1583 nf_reset(skb);
1584 netif_rx(skb);
1585 dev_put(reg_dev);
1586 return 0;
1587 drop:
1588 kfree_skb(skb);
1589 return 0; 1577 return 0;
1590} 1578}
1591#endif 1579#endif
@@ -1602,13 +1590,13 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1602 if (dev) 1590 if (dev)
1603 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); 1591 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1604 1592
1605 mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0)); 1593 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1606 1594
1607 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 1595 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1608 if (c->mfc_un.res.ttls[ct] < 255) { 1596 if (c->mfc_un.res.ttls[ct] < 255) {
1609 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 1597 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1610 goto rtattr_failure; 1598 goto rtattr_failure;
1611 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 1599 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1612 nhp->rtnh_flags = 0; 1600 nhp->rtnh_flags = 0;
1613 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 1601 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1614 nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; 1602 nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
@@ -1634,7 +1622,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1634 read_lock(&mrt_lock); 1622 read_lock(&mrt_lock);
1635 cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); 1623 cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1636 1624
1637 if (cache==NULL) { 1625 if (cache == NULL) {
1638 struct sk_buff *skb2; 1626 struct sk_buff *skb2;
1639 struct iphdr *iph; 1627 struct iphdr *iph;
1640 struct net_device *dev; 1628 struct net_device *dev;
@@ -1866,15 +1854,16 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1866 const struct mfc_cache *mfc = v; 1854 const struct mfc_cache *mfc = v;
1867 const struct ipmr_mfc_iter *it = seq->private; 1855 const struct ipmr_mfc_iter *it = seq->private;
1868 1856
1869 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld", 1857 seq_printf(seq, "%08lX %08lX %-3hd",
1870 (unsigned long) mfc->mfc_mcastgrp, 1858 (unsigned long) mfc->mfc_mcastgrp,
1871 (unsigned long) mfc->mfc_origin, 1859 (unsigned long) mfc->mfc_origin,
1872 mfc->mfc_parent, 1860 mfc->mfc_parent);
1873 mfc->mfc_un.res.pkt,
1874 mfc->mfc_un.res.bytes,
1875 mfc->mfc_un.res.wrong_if);
1876 1861
1877 if (it->cache != &mfc_unres_queue) { 1862 if (it->cache != &mfc_unres_queue) {
1863 seq_printf(seq, " %8lu %8lu %8lu",
1864 mfc->mfc_un.res.pkt,
1865 mfc->mfc_un.res.bytes,
1866 mfc->mfc_un.res.wrong_if);
1878 for (n = mfc->mfc_un.res.minvif; 1867 for (n = mfc->mfc_un.res.minvif;
1879 n < mfc->mfc_un.res.maxvif; n++ ) { 1868 n < mfc->mfc_un.res.maxvif; n++ ) {
1880 if (VIF_EXISTS(n) 1869 if (VIF_EXISTS(n)
@@ -1883,6 +1872,11 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1883 " %2d:%-3d", 1872 " %2d:%-3d",
1884 n, mfc->mfc_un.res.ttls[n]); 1873 n, mfc->mfc_un.res.ttls[n]);
1885 } 1874 }
1875 } else {
1876 /* unresolved mfc_caches don't contain
1877 * pkt, bytes and wrong_if values
1878 */
1879 seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
1886 } 1880 }
1887 seq_putc(seq, '\n'); 1881 seq_putc(seq, '\n');
1888 } 1882 }
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 6efdb70b3eb2..fdf6811c31a2 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -66,7 +66,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
66#ifdef CONFIG_XFRM 66#ifdef CONFIG_XFRM
67 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 67 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
68 xfrm_decode_session(skb, &fl, AF_INET) == 0) 68 xfrm_decode_session(skb, &fl, AF_INET) == 0)
69 if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) 69 if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0))
70 return -1; 70 return -1;
71#endif 71#endif
72 72
@@ -97,7 +97,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
97 dst = ((struct xfrm_dst *)dst)->route; 97 dst = ((struct xfrm_dst *)dst)->route;
98 dst_hold(dst); 98 dst_hold(dst);
99 99
100 if (xfrm_lookup(&dst, &fl, skb->sk, 0) < 0) 100 if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0)
101 return -1; 101 return -1;
102 102
103 dst_release(skb->dst); 103 dst_release(skb->dst);
@@ -125,6 +125,7 @@ struct ip_rt_info {
125 __be32 daddr; 125 __be32 daddr;
126 __be32 saddr; 126 __be32 saddr;
127 u_int8_t tos; 127 u_int8_t tos;
128 u_int32_t mark;
128}; 129};
129 130
130static void nf_ip_saveroute(const struct sk_buff *skb, 131static void nf_ip_saveroute(const struct sk_buff *skb,
@@ -138,6 +139,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb,
138 rt_info->tos = iph->tos; 139 rt_info->tos = iph->tos;
139 rt_info->daddr = iph->daddr; 140 rt_info->daddr = iph->daddr;
140 rt_info->saddr = iph->saddr; 141 rt_info->saddr = iph->saddr;
142 rt_info->mark = skb->mark;
141 } 143 }
142} 144}
143 145
@@ -150,6 +152,7 @@ static int nf_ip_reroute(struct sk_buff *skb,
150 const struct iphdr *iph = ip_hdr(skb); 152 const struct iphdr *iph = ip_hdr(skb);
151 153
152 if (!(iph->tos == rt_info->tos 154 if (!(iph->tos == rt_info->tos
155 && skb->mark == rt_info->mark
153 && iph->daddr == rt_info->daddr 156 && iph->daddr == rt_info->daddr
154 && iph->saddr == rt_info->saddr)) 157 && iph->saddr == rt_info->saddr))
155 return ip_route_me_harder(skb, RTN_UNSPEC); 158 return ip_route_me_harder(skb, RTN_UNSPEC);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 8d70d29f1ccf..7ea88b61cb0d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -142,15 +142,15 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
142 ARPT_INV_TGTIP)) { 142 ARPT_INV_TGTIP)) {
143 dprintf("Source or target IP address mismatch.\n"); 143 dprintf("Source or target IP address mismatch.\n");
144 144
145 dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", 145 dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
146 NIPQUAD(src_ipaddr), 146 &src_ipaddr,
147 NIPQUAD(arpinfo->smsk.s_addr), 147 &arpinfo->smsk.s_addr,
148 NIPQUAD(arpinfo->src.s_addr), 148 &arpinfo->src.s_addr,
149 arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); 149 arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
150 dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", 150 dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n",
151 NIPQUAD(tgt_ipaddr), 151 &tgt_ipaddr,
152 NIPQUAD(arpinfo->tmsk.s_addr), 152 &arpinfo->tmsk.s_addr,
153 NIPQUAD(arpinfo->tgt.s_addr), 153 &arpinfo->tgt.s_addr,
154 arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); 154 arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
155 return 0; 155 return 0;
156 } 156 }
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index bee3d117661a..e091187e864f 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -75,16 +75,6 @@ static unsigned int arpt_out_hook(unsigned int hook,
75 dev_net(out)->ipv4.arptable_filter); 75 dev_net(out)->ipv4.arptable_filter);
76} 76}
77 77
78static unsigned int arpt_forward_hook(unsigned int hook,
79 struct sk_buff *skb,
80 const struct net_device *in,
81 const struct net_device *out,
82 int (*okfn)(struct sk_buff *))
83{
84 return arpt_do_table(skb, hook, in, out,
85 dev_net(in)->ipv4.arptable_filter);
86}
87
88static struct nf_hook_ops arpt_ops[] __read_mostly = { 78static struct nf_hook_ops arpt_ops[] __read_mostly = {
89 { 79 {
90 .hook = arpt_in_hook, 80 .hook = arpt_in_hook,
@@ -101,7 +91,7 @@ static struct nf_hook_ops arpt_ops[] __read_mostly = {
101 .priority = NF_IP_PRI_FILTER, 91 .priority = NF_IP_PRI_FILTER,
102 }, 92 },
103 { 93 {
104 .hook = arpt_forward_hook, 94 .hook = arpt_in_hook,
105 .owner = THIS_MODULE, 95 .owner = THIS_MODULE,
106 .pf = NFPROTO_ARP, 96 .pf = NFPROTO_ARP,
107 .hooknum = NF_ARP_FORWARD, 97 .hooknum = NF_ARP_FORWARD,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 213fb27debc1..ef8b6ca068b2 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -94,15 +94,11 @@ ip_packet_match(const struct iphdr *ip,
94 IPT_INV_DSTIP)) { 94 IPT_INV_DSTIP)) {
95 dprintf("Source or dest mismatch.\n"); 95 dprintf("Source or dest mismatch.\n");
96 96
97 dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", 97 dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
98 NIPQUAD(ip->saddr), 98 &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr,
99 NIPQUAD(ipinfo->smsk.s_addr),
100 NIPQUAD(ipinfo->src.s_addr),
101 ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); 99 ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
102 dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", 100 dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n",
103 NIPQUAD(ip->daddr), 101 &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr,
104 NIPQUAD(ipinfo->dmsk.s_addr),
105 NIPQUAD(ipinfo->dst.s_addr),
106 ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); 102 ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
107 return false; 103 return false;
108 } 104 }
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 7ac1677419a9..2e4f98b85524 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -168,7 +168,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
168 char buffer[16]; 168 char buffer[16];
169 169
170 /* create proc dir entry */ 170 /* create proc dir entry */
171 sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); 171 sprintf(buffer, "%pI4", &ip);
172 c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, 172 c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
173 clusterip_procdir, 173 clusterip_procdir,
174 &clusterip_proc_fops, c); 174 &clusterip_proc_fops, c);
@@ -373,7 +373,7 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
373 config = clusterip_config_find_get(e->ip.dst.s_addr, 1); 373 config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
374 if (!config) { 374 if (!config) {
375 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { 375 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
376 printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr)); 376 printk(KERN_WARNING "CLUSTERIP: no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr);
377 return false; 377 return false;
378 } else { 378 } else {
379 struct net_device *dev; 379 struct net_device *dev;
@@ -478,9 +478,8 @@ static void arp_print(struct arp_payload *payload)
478 } 478 }
479 hbuffer[--k]='\0'; 479 hbuffer[--k]='\0';
480 480
481 printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n", 481 printk("src %pI4@%s, dst %pI4\n",
482 NIPQUAD(payload->src_ip), hbuffer, 482 &payload->src_ip, hbuffer, &payload->dst_ip);
483 NIPQUAD(payload->dst_ip));
484} 483}
485#endif 484#endif
486 485
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 7b5dbe118c09..27a78fbbd92b 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -54,8 +54,8 @@ static void dump_packet(const struct nf_loginfo *info,
54 /* Important fields: 54 /* Important fields:
55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ 55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ 56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
57 printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", 57 printk("SRC=%pI4 DST=%pI4 ",
58 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); 58 &ih->saddr, &ih->daddr);
59 59
60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ 60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
61 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", 61 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
@@ -262,8 +262,7 @@ static void dump_packet(const struct nf_loginfo *info,
262 break; 262 break;
263 case ICMP_REDIRECT: 263 case ICMP_REDIRECT:
264 /* Max length: 24 "GATEWAY=255.255.255.255 " */ 264 /* Max length: 24 "GATEWAY=255.255.255.255 " */
265 printk("GATEWAY=%u.%u.%u.%u ", 265 printk("GATEWAY=%pI4 ", &ich->un.gateway);
266 NIPQUAD(ich->un.gateway));
267 /* Fall through */ 266 /* Fall through */
268 case ICMP_DEST_UNREACH: 267 case ICMP_DEST_UNREACH:
269 case ICMP_SOURCE_QUENCH: 268 case ICMP_SOURCE_QUENCH:
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 88762f02779d..3b216be3bc9f 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -23,24 +23,25 @@ MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24MODULE_DESCRIPTION("Xtables: address type match for IPv4"); 24MODULE_DESCRIPTION("Xtables: address type match for IPv4");
25 25
26static inline bool match_type(const struct net_device *dev, __be32 addr, 26static inline bool match_type(struct net *net, const struct net_device *dev,
27 u_int16_t mask) 27 __be32 addr, u_int16_t mask)
28{ 28{
29 return !!(mask & (1 << inet_dev_addr_type(&init_net, dev, addr))); 29 return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
30} 30}
31 31
32static bool 32static bool
33addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) 33addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
34{ 34{
35 struct net *net = dev_net(par->in ? par->in : par->out);
35 const struct ipt_addrtype_info *info = par->matchinfo; 36 const struct ipt_addrtype_info *info = par->matchinfo;
36 const struct iphdr *iph = ip_hdr(skb); 37 const struct iphdr *iph = ip_hdr(skb);
37 bool ret = true; 38 bool ret = true;
38 39
39 if (info->source) 40 if (info->source)
40 ret &= match_type(NULL, iph->saddr, info->source) ^ 41 ret &= match_type(net, NULL, iph->saddr, info->source) ^
41 info->invert_source; 42 info->invert_source;
42 if (info->dest) 43 if (info->dest)
43 ret &= match_type(NULL, iph->daddr, info->dest) ^ 44 ret &= match_type(net, NULL, iph->daddr, info->dest) ^
44 info->invert_dest; 45 info->invert_dest;
45 46
46 return ret; 47 return ret;
@@ -49,6 +50,7 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
49static bool 50static bool
50addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) 51addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
51{ 52{
53 struct net *net = dev_net(par->in ? par->in : par->out);
52 const struct ipt_addrtype_info_v1 *info = par->matchinfo; 54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
53 const struct iphdr *iph = ip_hdr(skb); 55 const struct iphdr *iph = ip_hdr(skb);
54 const struct net_device *dev = NULL; 56 const struct net_device *dev = NULL;
@@ -60,10 +62,10 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
60 dev = par->out; 62 dev = par->out;
61 63
62 if (info->source) 64 if (info->source)
63 ret &= match_type(dev, iph->saddr, info->source) ^ 65 ret &= match_type(net, dev, iph->saddr, info->source) ^
64 (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); 66 (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
65 if (ret && info->dest) 67 if (ret && info->dest)
66 ret &= match_type(dev, iph->daddr, info->dest) ^ 68 ret &= match_type(net, dev, iph->daddr, info->dest) ^
67 !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); 69 !!(info->flags & IPT_ADDRTYPE_INVERT_DEST);
68 return ret; 70 return ret;
69} 71}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 4a7c35275396..b2141e11575e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -60,9 +60,8 @@ static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
60static int ipv4_print_tuple(struct seq_file *s, 60static int ipv4_print_tuple(struct seq_file *s,
61 const struct nf_conntrack_tuple *tuple) 61 const struct nf_conntrack_tuple *tuple)
62{ 62{
63 return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", 63 return seq_printf(s, "src=%pI4 dst=%pI4 ",
64 NIPQUAD(tuple->src.u3.ip), 64 &tuple->src.u3.ip, &tuple->dst.u3.ip);
65 NIPQUAD(tuple->dst.u3.ip));
66} 65}
67 66
68static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 67static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
@@ -198,7 +197,7 @@ static ctl_table ip_ct_sysctl_table[] = {
198 .data = &nf_conntrack_max, 197 .data = &nf_conntrack_max,
199 .maxlen = sizeof(int), 198 .maxlen = sizeof(int),
200 .mode = 0644, 199 .mode = 0644,
201 .proc_handler = &proc_dointvec, 200 .proc_handler = proc_dointvec,
202 }, 201 },
203 { 202 {
204 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, 203 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
@@ -206,7 +205,7 @@ static ctl_table ip_ct_sysctl_table[] = {
206 .data = &init_net.ct.count, 205 .data = &init_net.ct.count,
207 .maxlen = sizeof(int), 206 .maxlen = sizeof(int),
208 .mode = 0444, 207 .mode = 0444,
209 .proc_handler = &proc_dointvec, 208 .proc_handler = proc_dointvec,
210 }, 209 },
211 { 210 {
212 .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, 211 .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
@@ -214,7 +213,7 @@ static ctl_table ip_ct_sysctl_table[] = {
214 .data = &nf_conntrack_htable_size, 213 .data = &nf_conntrack_htable_size,
215 .maxlen = sizeof(unsigned int), 214 .maxlen = sizeof(unsigned int),
216 .mode = 0444, 215 .mode = 0444,
217 .proc_handler = &proc_dointvec, 216 .proc_handler = proc_dointvec,
218 }, 217 },
219 { 218 {
220 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, 219 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
@@ -222,7 +221,7 @@ static ctl_table ip_ct_sysctl_table[] = {
222 .data = &init_net.ct.sysctl_checksum, 221 .data = &init_net.ct.sysctl_checksum,
223 .maxlen = sizeof(int), 222 .maxlen = sizeof(int),
224 .mode = 0644, 223 .mode = 0644,
225 .proc_handler = &proc_dointvec, 224 .proc_handler = proc_dointvec,
226 }, 225 },
227 { 226 {
228 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, 227 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
@@ -230,8 +229,8 @@ static ctl_table ip_ct_sysctl_table[] = {
230 .data = &init_net.ct.sysctl_log_invalid, 229 .data = &init_net.ct.sysctl_log_invalid,
231 .maxlen = sizeof(unsigned int), 230 .maxlen = sizeof(unsigned int),
232 .mode = 0644, 231 .mode = 0644,
233 .proc_handler = &proc_dointvec_minmax, 232 .proc_handler = proc_dointvec_minmax,
234 .strategy = &sysctl_intvec, 233 .strategy = sysctl_intvec,
235 .extra1 = &log_invalid_proto_min, 234 .extra1 = &log_invalid_proto_min,
236 .extra2 = &log_invalid_proto_max, 235 .extra2 = &log_invalid_proto_max,
237 }, 236 },
@@ -284,17 +283,17 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
284 .tuple.dst.u3.ip; 283 .tuple.dst.u3.ip;
285 memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); 284 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
286 285
287 pr_debug("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", 286 pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
288 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 287 &sin.sin_addr.s_addr, ntohs(sin.sin_port));
289 nf_ct_put(ct); 288 nf_ct_put(ct);
290 if (copy_to_user(user, &sin, sizeof(sin)) != 0) 289 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
291 return -EFAULT; 290 return -EFAULT;
292 else 291 else
293 return 0; 292 return 0;
294 } 293 }
295 pr_debug("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", 294 pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
296 NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port), 295 &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
297 NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port)); 296 &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
298 return -ENOENT; 297 return -ENOENT;
299} 298}
300 299
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4e8879220222..1fd3ef7718b6 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -272,7 +272,7 @@ static struct ctl_table icmp_sysctl_table[] = {
272 .data = &nf_ct_icmp_timeout, 272 .data = &nf_ct_icmp_timeout,
273 .maxlen = sizeof(unsigned int), 273 .maxlen = sizeof(unsigned int),
274 .mode = 0644, 274 .mode = 0644,
275 .proc_handler = &proc_dointvec_jiffies, 275 .proc_handler = proc_dointvec_jiffies,
276 }, 276 },
277 { 277 {
278 .ctl_name = 0 278 .ctl_name = 0
@@ -285,7 +285,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
285 .data = &nf_ct_icmp_timeout, 285 .data = &nf_ct_icmp_timeout,
286 .maxlen = sizeof(unsigned int), 286 .maxlen = sizeof(unsigned int),
287 .mode = 0644, 287 .mode = 0644,
288 .proc_handler = &proc_dointvec_jiffies, 288 .proc_handler = proc_dointvec_jiffies,
289 }, 289 },
290 { 290 {
291 .ctl_name = 0 291 .ctl_name = 0
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index ee47bf28c825..7e8e6fc75413 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -119,10 +119,9 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
119 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) 119 (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
120 i = 0; 120 i = 0;
121 121
122 pr_debug("nf_nat_ras: set signal address " 122 pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
123 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 123 &addr.ip, port,
124 NIPQUAD(addr.ip), port, 124 &ct->tuplehash[!dir].tuple.dst.u3.ip,
125 NIPQUAD(ct->tuplehash[!dir].tuple.dst.u3.ip),
126 info->sig_port[!dir]); 125 info->sig_port[!dir]);
127 return set_h225_addr(skb, data, 0, &taddr[i], 126 return set_h225_addr(skb, data, 0, &taddr[i],
128 &ct->tuplehash[!dir]. 127 &ct->tuplehash[!dir].
@@ -131,10 +130,9 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
131 } else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && 130 } else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
132 port == info->sig_port[dir]) { 131 port == info->sig_port[dir]) {
133 /* GK->GW */ 132 /* GK->GW */
134 pr_debug("nf_nat_ras: set signal address " 133 pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
135 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 134 &addr.ip, port,
136 NIPQUAD(addr.ip), port, 135 &ct->tuplehash[!dir].tuple.src.u3.ip,
137 NIPQUAD(ct->tuplehash[!dir].tuple.src.u3.ip),
138 info->sig_port[!dir]); 136 info->sig_port[!dir]);
139 return set_h225_addr(skb, data, 0, &taddr[i], 137 return set_h225_addr(skb, data, 0, &taddr[i],
140 &ct->tuplehash[!dir]. 138 &ct->tuplehash[!dir].
@@ -162,10 +160,9 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
162 if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) && 160 if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
163 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && 161 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
164 port == ct->tuplehash[dir].tuple.src.u.udp.port) { 162 port == ct->tuplehash[dir].tuple.src.u.udp.port) {
165 pr_debug("nf_nat_ras: set rasAddress " 163 pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n",
166 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 164 &addr.ip, ntohs(port),
167 NIPQUAD(addr.ip), ntohs(port), 165 &ct->tuplehash[!dir].tuple.dst.u3.ip,
168 NIPQUAD(ct->tuplehash[!dir].tuple.dst.u3.ip),
169 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); 166 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
170 return set_h225_addr(skb, data, 0, &taddr[i], 167 return set_h225_addr(skb, data, 0, &taddr[i],
171 &ct->tuplehash[!dir].tuple.dst.u3, 168 &ct->tuplehash[!dir].tuple.dst.u3,
@@ -257,15 +254,15 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
257 } 254 }
258 255
259 /* Success */ 256 /* Success */
260 pr_debug("nf_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 257 pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
261 NIPQUAD(rtp_exp->tuple.src.u3.ip), 258 &rtp_exp->tuple.src.u3.ip,
262 ntohs(rtp_exp->tuple.src.u.udp.port), 259 ntohs(rtp_exp->tuple.src.u.udp.port),
263 NIPQUAD(rtp_exp->tuple.dst.u3.ip), 260 &rtp_exp->tuple.dst.u3.ip,
264 ntohs(rtp_exp->tuple.dst.u.udp.port)); 261 ntohs(rtp_exp->tuple.dst.u.udp.port));
265 pr_debug("nf_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 262 pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n",
266 NIPQUAD(rtcp_exp->tuple.src.u3.ip), 263 &rtcp_exp->tuple.src.u3.ip,
267 ntohs(rtcp_exp->tuple.src.u.udp.port), 264 ntohs(rtcp_exp->tuple.src.u.udp.port),
268 NIPQUAD(rtcp_exp->tuple.dst.u3.ip), 265 &rtcp_exp->tuple.dst.u3.ip,
269 ntohs(rtcp_exp->tuple.dst.u.udp.port)); 266 ntohs(rtcp_exp->tuple.dst.u.udp.port));
270 267
271 return 0; 268 return 0;
@@ -307,10 +304,10 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
307 return -1; 304 return -1;
308 } 305 }
309 306
310 pr_debug("nf_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 307 pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n",
311 NIPQUAD(exp->tuple.src.u3.ip), 308 &exp->tuple.src.u3.ip,
312 ntohs(exp->tuple.src.u.tcp.port), 309 ntohs(exp->tuple.src.u.tcp.port),
313 NIPQUAD(exp->tuple.dst.u3.ip), 310 &exp->tuple.dst.u3.ip,
314 ntohs(exp->tuple.dst.u.tcp.port)); 311 ntohs(exp->tuple.dst.u.tcp.port));
315 312
316 return 0; 313 return 0;
@@ -361,10 +358,10 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
361 return -1; 358 return -1;
362 } 359 }
363 360
364 pr_debug("nf_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 361 pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
365 NIPQUAD(exp->tuple.src.u3.ip), 362 &exp->tuple.src.u3.ip,
366 ntohs(exp->tuple.src.u.tcp.port), 363 ntohs(exp->tuple.src.u.tcp.port),
367 NIPQUAD(exp->tuple.dst.u3.ip), 364 &exp->tuple.dst.u3.ip,
368 ntohs(exp->tuple.dst.u.tcp.port)); 365 ntohs(exp->tuple.dst.u.tcp.port));
369 366
370 return 0; 367 return 0;
@@ -455,10 +452,10 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
455 } 452 }
456 453
457 /* Success */ 454 /* Success */
458 pr_debug("nf_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 455 pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
459 NIPQUAD(exp->tuple.src.u3.ip), 456 &exp->tuple.src.u3.ip,
460 ntohs(exp->tuple.src.u.tcp.port), 457 ntohs(exp->tuple.src.u.tcp.port),
461 NIPQUAD(exp->tuple.dst.u3.ip), 458 &exp->tuple.dst.u3.ip,
462 ntohs(exp->tuple.dst.u.tcp.port)); 459 ntohs(exp->tuple.dst.u.tcp.port));
463 460
464 return 0; 461 return 0;
@@ -524,11 +521,10 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
524 } 521 }
525 522
526 /* Success */ 523 /* Success */
527 pr_debug("nf_nat_q931: expect Call Forwarding " 524 pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n",
528 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", 525 &exp->tuple.src.u3.ip,
529 NIPQUAD(exp->tuple.src.u3.ip),
530 ntohs(exp->tuple.src.u.tcp.port), 526 ntohs(exp->tuple.src.u.tcp.port),
531 NIPQUAD(exp->tuple.dst.u3.ip), 527 &exp->tuple.dst.u3.ip,
532 ntohs(exp->tuple.dst.u.tcp.port)); 528 ntohs(exp->tuple.dst.u.tcp.port));
533 529
534 return 0; 530 return 0;
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index fe6f9cef6c85..ea83a886b03e 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -55,8 +55,8 @@ static unsigned int help(struct sk_buff *skb,
55 55
56 ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip); 56 ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip);
57 sprintf(buffer, "%u %u", ip, port); 57 sprintf(buffer, "%u %u", ip, port);
58 pr_debug("nf_nat_irc: inserting '%s' == %u.%u.%u.%u, port %u\n", 58 pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
59 buffer, NIPQUAD(ip), port); 59 buffer, &ip, port);
60 60
61 ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo, 61 ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
62 matchoff, matchlen, buffer, 62 matchoff, matchlen, buffer,
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 8d489e746b21..a7eb04719044 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -86,25 +86,6 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par)
86 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); 86 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
87} 87}
88 88
89/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
90static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip)
91{
92 static int warned = 0;
93 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
94 struct rtable *rt;
95
96 if (ip_route_output_key(net, &rt, &fl) != 0)
97 return;
98
99 if (rt->rt_src != srcip && !warned) {
100 printk("NAT: no longer support implicit source local NAT\n");
101 printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
102 NIPQUAD(srcip), NIPQUAD(dstip));
103 warned = 1;
104 }
105 ip_rt_put(rt);
106}
107
108static unsigned int 89static unsigned int
109ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) 90ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
110{ 91{
@@ -120,11 +101,6 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
120 /* Connection must be valid and new. */ 101 /* Connection must be valid and new. */
121 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); 102 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
122 103
123 if (par->hooknum == NF_INET_LOCAL_OUT &&
124 mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
125 warn_if_extra_mangle(dev_net(par->out), ip_hdr(skb)->daddr,
126 mr->range[0].min_ip);
127
128 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); 104 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
129} 105}
130 106
@@ -166,8 +142,7 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
166 struct nf_nat_range range 142 struct nf_nat_range range
167 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; 143 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
168 144
169 pr_debug("Allocating NULL binding for %p (%u.%u.%u.%u)\n", 145 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
170 ct, NIPQUAD(ip));
171 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); 146 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
172} 147}
173 148
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 14544320c545..07d61a57613c 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -74,8 +74,7 @@ static int map_addr(struct sk_buff *skb,
74 if (newaddr == addr->ip && newport == port) 74 if (newaddr == addr->ip && newport == port)
75 return 1; 75 return 1;
76 76
77 buflen = sprintf(buffer, "%u.%u.%u.%u:%u", 77 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
78 NIPQUAD(newaddr), ntohs(newport));
79 78
80 return mangle_packet(skb, dptr, datalen, matchoff, matchlen, 79 return mangle_packet(skb, dptr, datalen, matchoff, matchlen,
81 buffer, buflen); 80 buffer, buflen);
@@ -152,8 +151,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
152 &addr) > 0 && 151 &addr) > 0 &&
153 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && 152 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
154 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { 153 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
155 __be32 ip = ct->tuplehash[!dir].tuple.dst.u3.ip; 154 buflen = sprintf(buffer, "%pI4",
156 buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); 155 &ct->tuplehash[!dir].tuple.dst.u3.ip);
157 if (!mangle_packet(skb, dptr, datalen, poff, plen, 156 if (!mangle_packet(skb, dptr, datalen, poff, plen,
158 buffer, buflen)) 157 buffer, buflen))
159 return NF_DROP; 158 return NF_DROP;
@@ -166,8 +165,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
166 &addr) > 0 && 165 &addr) > 0 &&
167 addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && 166 addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
168 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { 167 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
169 __be32 ip = ct->tuplehash[!dir].tuple.src.u3.ip; 168 buflen = sprintf(buffer, "%pI4",
170 buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); 169 &ct->tuplehash[!dir].tuple.src.u3.ip);
171 if (!mangle_packet(skb, dptr, datalen, poff, plen, 170 if (!mangle_packet(skb, dptr, datalen, poff, plen,
172 buffer, buflen)) 171 buffer, buflen))
173 return NF_DROP; 172 return NF_DROP;
@@ -279,8 +278,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
279 278
280 if (exp->tuple.dst.u3.ip != exp->saved_ip || 279 if (exp->tuple.dst.u3.ip != exp->saved_ip ||
281 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { 280 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
282 buflen = sprintf(buffer, "%u.%u.%u.%u:%u", 281 buflen = sprintf(buffer, "%pI4:%u", &newip, port);
283 NIPQUAD(newip), port);
284 if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, 282 if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen,
285 buffer, buflen)) 283 buffer, buflen))
286 goto err; 284 goto err;
@@ -345,7 +343,7 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr,
345 char buffer[sizeof("nnn.nnn.nnn.nnn")]; 343 char buffer[sizeof("nnn.nnn.nnn.nnn")];
346 unsigned int buflen; 344 unsigned int buflen;
347 345
348 buflen = sprintf(buffer, NIPQUAD_FMT, NIPQUAD(addr->ip)); 346 buflen = sprintf(buffer, "%pI4", &addr->ip);
349 if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, 347 if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term,
350 buffer, buflen)) 348 buffer, buflen))
351 return 0; 349 return 0;
@@ -380,7 +378,7 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
380 unsigned int buflen; 378 unsigned int buflen;
381 379
382 /* Mangle session description owner and contact addresses */ 380 /* Mangle session description owner and contact addresses */
383 buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(addr->ip)); 381 buflen = sprintf(buffer, "%pI4", &addr->ip);
384 if (mangle_sdp_packet(skb, dptr, dataoff, datalen, 382 if (mangle_sdp_packet(skb, dptr, dataoff, datalen,
385 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, 383 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
386 buffer, buflen)) 384 buffer, buflen))
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 8303e4b406c0..182f845de92f 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -930,8 +930,8 @@ static inline void mangle_address(unsigned char *begin,
930 } 930 }
931 931
932 if (debug) 932 if (debug)
933 printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to " 933 printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n",
934 "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr)); 934 &old, addr);
935 } 935 }
936} 936}
937 937
@@ -1267,9 +1267,8 @@ static int help(struct sk_buff *skb, unsigned int protoff,
1267 */ 1267 */
1268 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { 1268 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
1269 if (net_ratelimit()) 1269 if (net_ratelimit())
1270 printk(KERN_WARNING "SNMP: dropping malformed packet " 1270 printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
1271 "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n", 1271 &iph->saddr, &iph->daddr);
1272 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1273 return NF_DROP; 1272 return NF_DROP;
1274 } 1273 }
1275 1274
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a631a1f110ca..614958b7c276 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,8 +54,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
54 socket_seq_show(seq); 54 socket_seq_show(seq);
55 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 55 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
56 sock_prot_inuse_get(net, &tcp_prot), 56 sock_prot_inuse_get(net, &tcp_prot),
57 atomic_read(&tcp_orphan_count), 57 (int)percpu_counter_sum_positive(&tcp_orphan_count),
58 tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), 58 tcp_death_row.tw_count,
59 (int)percpu_counter_sum_positive(&tcp_sockets_allocated),
59 atomic_read(&tcp_memory_allocated)); 60 atomic_read(&tcp_memory_allocated));
60 seq_printf(seq, "UDP: inuse %d mem %d\n", 61 seq_printf(seq, "UDP: inuse %d mem %d\n",
61 sock_prot_inuse_get(net, &udp_prot), 62 sock_prot_inuse_get(net, &udp_prot),
@@ -234,6 +235,9 @@ static const struct snmp_mib snmp4_net_list[] = {
234 SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), 235 SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
235 SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), 236 SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
236 SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), 237 SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
238 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
239 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
240 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
237 SNMP_MIB_SENTINEL 241 SNMP_MIB_SENTINEL
238}; 242};
239 243
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index cd975743bcd2..dff8bc4e0fac 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -247,7 +247,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
247 } 247 }
248 248
249 if (inet->recverr) { 249 if (inet->recverr) {
250 struct iphdr *iph = (struct iphdr*)skb->data; 250 struct iphdr *iph = (struct iphdr *)skb->data;
251 u8 *payload = skb->data + (iph->ihl << 2); 251 u8 *payload = skb->data + (iph->ihl << 2);
252 252
253 if (inet->hdrincl) 253 if (inet->hdrincl)
@@ -465,7 +465,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
465 */ 465 */
466 466
467 if (msg->msg_namelen) { 467 if (msg->msg_namelen) {
468 struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; 468 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
469 err = -EINVAL; 469 err = -EINVAL;
470 if (msg->msg_namelen < sizeof(*usin)) 470 if (msg->msg_namelen < sizeof(*usin))
471 goto out; 471 goto out;
@@ -572,7 +572,7 @@ back_from_confirm:
572 ipc.addr = rt->rt_dst; 572 ipc.addr = rt->rt_dst;
573 lock_sock(sk); 573 lock_sock(sk);
574 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, 574 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
575 &ipc, rt, msg->msg_flags); 575 &ipc, &rt, msg->msg_flags);
576 if (err) 576 if (err)
577 ip_flush_pending_frames(sk); 577 ip_flush_pending_frames(sk);
578 else if (!(msg->msg_flags & MSG_MORE)) 578 else if (!(msg->msg_flags & MSG_MORE))
@@ -851,7 +851,7 @@ struct proto raw_prot = {
851static struct sock *raw_get_first(struct seq_file *seq) 851static struct sock *raw_get_first(struct seq_file *seq)
852{ 852{
853 struct sock *sk; 853 struct sock *sk;
854 struct raw_iter_state* state = raw_seq_private(seq); 854 struct raw_iter_state *state = raw_seq_private(seq);
855 855
856 for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; 856 for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
857 ++state->bucket) { 857 ++state->bucket) {
@@ -868,7 +868,7 @@ found:
868 868
869static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) 869static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
870{ 870{
871 struct raw_iter_state* state = raw_seq_private(seq); 871 struct raw_iter_state *state = raw_seq_private(seq);
872 872
873 do { 873 do {
874 sk = sk_next(sk); 874 sk = sk_next(sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2ea6dcc3e2cc..77bfba975959 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256; 130static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; 131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
132static int rt_chain_length_max __read_mostly = 20;
132 133
133static void rt_worker_func(struct work_struct *work); 134static void rt_worker_func(struct work_struct *work);
134static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); 135static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
@@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb); 146static void ipv4_link_failure(struct sk_buff *skb);
146static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147static int rt_garbage_collect(struct dst_ops *ops); 148static int rt_garbage_collect(struct dst_ops *ops);
149static void rt_emergency_hash_rebuild(struct net *net);
148 150
149 151
150static struct dst_ops ipv4_dst_ops = { 152static struct dst_ops ipv4_dst_ops = {
@@ -158,7 +160,6 @@ static struct dst_ops ipv4_dst_ops = {
158 .link_failure = ipv4_link_failure, 160 .link_failure = ipv4_link_failure,
159 .update_pmtu = ip_rt_update_pmtu, 161 .update_pmtu = ip_rt_update_pmtu,
160 .local_out = __ip_local_out, 162 .local_out = __ip_local_out,
161 .entry_size = sizeof(struct rtable),
162 .entries = ATOMIC_INIT(0), 163 .entries = ATOMIC_INIT(0),
163}; 164};
164 165
@@ -201,6 +202,7 @@ const __u8 ip_tos2prio[16] = {
201struct rt_hash_bucket { 202struct rt_hash_bucket {
202 struct rtable *chain; 203 struct rtable *chain;
203}; 204};
205
204#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ 206#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 defined(CONFIG_PROVE_LOCKING) 207 defined(CONFIG_PROVE_LOCKING)
206/* 208/*
@@ -674,6 +676,20 @@ static inline u32 rt_score(struct rtable *rt)
674 return score; 676 return score;
675} 677}
676 678
679static inline bool rt_caching(const struct net *net)
680{
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
683}
684
685static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
687{
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
691}
692
677static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 693static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
678{ 694{
679 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | 695 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
@@ -753,11 +769,24 @@ static void rt_do_flush(int process_context)
753 } 769 }
754} 770}
755 771
772/*
773 * While freeing expired entries, we compute the average chain length
774 * and its standard deviation, using fixed-point arithmetic.
775 * This gives an estimate for rt_chain_length_max:
776 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
777 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
778 */
779
/* Number of fractional bits in the fixed-point chain-length values. */
780#define FRACT_BITS 3
/* The value 1 in that fixed-point format; added once per counted entry. */
781#define ONE (1UL << FRACT_BITS)
782
756static void rt_check_expire(void) 783static void rt_check_expire(void)
757{ 784{
758 static unsigned int rover; 785 static unsigned int rover;
759 unsigned int i = rover, goal; 786 unsigned int i = rover, goal;
760 struct rtable *rth, **rthp; 787 struct rtable *rth, **rthp;
788 unsigned long length = 0, samples = 0;
789 unsigned long sum = 0, sum2 = 0;
761 u64 mult; 790 u64 mult;
762 791
763 mult = ((u64)ip_rt_gc_interval) << rt_hash_log; 792 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
@@ -766,6 +795,7 @@ static void rt_check_expire(void)
766 goal = (unsigned int)mult; 795 goal = (unsigned int)mult;
767 if (goal > rt_hash_mask) 796 if (goal > rt_hash_mask)
768 goal = rt_hash_mask + 1; 797 goal = rt_hash_mask + 1;
798 length = 0;
769 for (; goal > 0; goal--) { 799 for (; goal > 0; goal--) {
770 unsigned long tmo = ip_rt_gc_timeout; 800 unsigned long tmo = ip_rt_gc_timeout;
771 801
@@ -775,6 +805,8 @@ static void rt_check_expire(void)
775 if (need_resched()) 805 if (need_resched())
776 cond_resched(); 806 cond_resched();
777 807
808 samples++;
809
778 if (*rthp == NULL) 810 if (*rthp == NULL)
779 continue; 811 continue;
780 spin_lock_bh(rt_hash_lock_addr(i)); 812 spin_lock_bh(rt_hash_lock_addr(i));
@@ -789,11 +821,29 @@ static void rt_check_expire(void)
789 if (time_before_eq(jiffies, rth->u.dst.expires)) { 821 if (time_before_eq(jiffies, rth->u.dst.expires)) {
790 tmo >>= 1; 822 tmo >>= 1;
791 rthp = &rth->u.dst.rt_next; 823 rthp = &rth->u.dst.rt_next;
824 /*
825 * Only bump our length if the hash
826 * inputs on entries n and n+1 are not
827 * the same, we only count entries on
828 * a chain with equal hash inputs once
829 * so that entries for different QOS
830 * levels, and other non-hash input
831 * attributes don't unfairly skew
832 * the length computation
833 */
834 if ((*rthp == NULL) ||
835 !compare_hash_inputs(&(*rthp)->fl,
836 &rth->fl))
837 length += ONE;
792 continue; 838 continue;
793 } 839 }
794 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { 840 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
795 tmo >>= 1; 841 tmo >>= 1;
796 rthp = &rth->u.dst.rt_next; 842 rthp = &rth->u.dst.rt_next;
843 if ((*rthp == NULL) ||
844 !compare_hash_inputs(&(*rthp)->fl,
845 &rth->fl))
846 length += ONE;
797 continue; 847 continue;
798 } 848 }
799 849
@@ -802,6 +852,15 @@ static void rt_check_expire(void)
802 rt_free(rth); 852 rt_free(rth);
803 } 853 }
804 spin_unlock_bh(rt_hash_lock_addr(i)); 854 spin_unlock_bh(rt_hash_lock_addr(i));
855 sum += length;
856 sum2 += length*length;
857 }
858 if (samples) {
859 unsigned long avg = sum / samples;
860 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
861 rt_chain_length_max = max_t(unsigned long,
862 ip_rt_gc_elasticity,
863 (avg + 4*sd) >> FRACT_BITS);
805 } 864 }
806 rover = i; 865 rover = i;
807} 866}
@@ -851,6 +910,26 @@ static void rt_secret_rebuild(unsigned long __net)
851 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); 910 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
852} 911}
853 912
913static void rt_secret_rebuild_oneshot(struct net *net)
914{
915 del_timer_sync(&net->ipv4.rt_secret_timer);
916 rt_cache_invalidate(net);
917 if (ip_rt_secret_interval) {
918 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
919 add_timer(&net->ipv4.rt_secret_timer);
920 }
921}
922
923static void rt_emergency_hash_rebuild(struct net *net)
924{
925 if (net_ratelimit()) {
926 printk(KERN_WARNING "Route hash chain too long!\n");
927 printk(KERN_WARNING "Adjust your secret_interval!\n");
928 }
929
930 rt_secret_rebuild_oneshot(net);
931}
932
854/* 933/*
855 Short description of GC goals. 934 Short description of GC goals.
856 935
@@ -989,6 +1068,7 @@ out: return 0;
989static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) 1068static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
990{ 1069{
991 struct rtable *rth, **rthp; 1070 struct rtable *rth, **rthp;
1071 struct rtable *rthi;
992 unsigned long now; 1072 unsigned long now;
993 struct rtable *cand, **candp; 1073 struct rtable *cand, **candp;
994 u32 min_score; 1074 u32 min_score;
@@ -1002,7 +1082,13 @@ restart:
1002 candp = NULL; 1082 candp = NULL;
1003 now = jiffies; 1083 now = jiffies;
1004 1084
1085 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1086 rt_drop(rt);
1087 return 0;
1088 }
1089
1005 rthp = &rt_hash_table[hash].chain; 1090 rthp = &rt_hash_table[hash].chain;
1091 rthi = NULL;
1006 1092
1007 spin_lock_bh(rt_hash_lock_addr(hash)); 1093 spin_lock_bh(rt_hash_lock_addr(hash));
1008 while ((rth = *rthp) != NULL) { 1094 while ((rth = *rthp) != NULL) {
@@ -1048,6 +1134,17 @@ restart:
1048 chain_length++; 1134 chain_length++;
1049 1135
1050 rthp = &rth->u.dst.rt_next; 1136 rthp = &rth->u.dst.rt_next;
1137
1138 /*
1139 * check to see if the next entry in the chain
1140 * contains the same hash input values as rt. If it does
1141 * This is where we will insert into the list, instead of
1142 * at the head. This groups entries that differ by aspects not
1143 * relvant to the hash function together, which we use to adjust
1144 * our chain length
1145 */
1146 if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
1147 rthi = rth;
1051 } 1148 }
1052 1149
1053 if (cand) { 1150 if (cand) {
@@ -1061,6 +1158,16 @@ restart:
1061 *candp = cand->u.dst.rt_next; 1158 *candp = cand->u.dst.rt_next;
1062 rt_free(cand); 1159 rt_free(cand);
1063 } 1160 }
1161 } else {
1162 if (chain_length > rt_chain_length_max) {
1163 struct net *net = dev_net(rt->u.dst.dev);
1164 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1165 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1166 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1167 rt->u.dst.dev->name, num);
1168 }
1169 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1170 }
1064 } 1171 }
1065 1172
1066 /* Try to bind route to arp only if it is output 1173 /* Try to bind route to arp only if it is output
@@ -1098,14 +1205,17 @@ restart:
1098 } 1205 }
1099 } 1206 }
1100 1207
1101 rt->u.dst.rt_next = rt_hash_table[hash].chain; 1208 if (rthi)
1209 rt->u.dst.rt_next = rthi->u.dst.rt_next;
1210 else
1211 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1212
1102#if RT_CACHE_DEBUG >= 2 1213#if RT_CACHE_DEBUG >= 2
1103 if (rt->u.dst.rt_next) { 1214 if (rt->u.dst.rt_next) {
1104 struct rtable *trt; 1215 struct rtable *trt;
1105 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash, 1216 printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
1106 NIPQUAD(rt->rt_dst));
1107 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) 1217 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1108 printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst)); 1218 printk(" . %pI4", &trt->rt_dst);
1109 printk("\n"); 1219 printk("\n");
1110 } 1220 }
1111#endif 1221#endif
@@ -1114,7 +1224,11 @@ restart:
1114 * previous writes to rt are comitted to memory 1224 * previous writes to rt are comitted to memory
1115 * before making rt visible to other CPUS. 1225 * before making rt visible to other CPUS.
1116 */ 1226 */
1117 rcu_assign_pointer(rt_hash_table[hash].chain, rt); 1227 if (rthi)
1228 rcu_assign_pointer(rthi->u.dst.rt_next, rt);
1229 else
1230 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1231
1118 spin_unlock_bh(rt_hash_lock_addr(hash)); 1232 spin_unlock_bh(rt_hash_lock_addr(hash));
1119 *rp = rt; 1233 *rp = rt;
1120 return 0; 1234 return 0;
@@ -1217,6 +1331,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1217 || ipv4_is_zeronet(new_gw)) 1331 || ipv4_is_zeronet(new_gw))
1218 goto reject_redirect; 1332 goto reject_redirect;
1219 1333
1334 if (!rt_caching(net))
1335 goto reject_redirect;
1336
1220 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1337 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1221 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1338 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1222 goto reject_redirect; 1339 goto reject_redirect;
@@ -1267,7 +1384,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1267 1384
1268 /* Copy all the information. */ 1385 /* Copy all the information. */
1269 *rt = *rth; 1386 *rt = *rth;
1270 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1271 rt->u.dst.__use = 1; 1387 rt->u.dst.__use = 1;
1272 atomic_set(&rt->u.dst.__refcnt, 1); 1388 atomic_set(&rt->u.dst.__refcnt, 1);
1273 rt->u.dst.child = NULL; 1389 rt->u.dst.child = NULL;
@@ -1280,7 +1396,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1280 rt->u.dst.path = &rt->u.dst; 1396 rt->u.dst.path = &rt->u.dst;
1281 rt->u.dst.neighbour = NULL; 1397 rt->u.dst.neighbour = NULL;
1282 rt->u.dst.hh = NULL; 1398 rt->u.dst.hh = NULL;
1399#ifdef CONFIG_XFRM
1283 rt->u.dst.xfrm = NULL; 1400 rt->u.dst.xfrm = NULL;
1401#endif
1284 rt->rt_genid = rt_genid(net); 1402 rt->rt_genid = rt_genid(net);
1285 rt->rt_flags |= RTCF_REDIRECTED; 1403 rt->rt_flags |= RTCF_REDIRECTED;
1286 1404
@@ -1324,11 +1442,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1324reject_redirect: 1442reject_redirect:
1325#ifdef CONFIG_IP_ROUTE_VERBOSE 1443#ifdef CONFIG_IP_ROUTE_VERBOSE
1326 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 1444 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1327 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about " 1445 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1328 NIPQUAD_FMT " ignored.\n" 1446 " Advised path = %pI4 -> %pI4\n",
1329 " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n", 1447 &old_gw, dev->name, &new_gw,
1330 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), 1448 &saddr, &daddr);
1331 NIPQUAD(saddr), NIPQUAD(daddr));
1332#endif 1449#endif
1333 in_dev_put(in_dev); 1450 in_dev_put(in_dev);
1334} 1451}
@@ -1348,9 +1465,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1348 rt->fl.oif, 1465 rt->fl.oif,
1349 rt_genid(dev_net(dst->dev))); 1466 rt_genid(dev_net(dst->dev)));
1350#if RT_CACHE_DEBUG >= 1 1467#if RT_CACHE_DEBUG >= 1
1351 printk(KERN_DEBUG "ipv4_negative_advice: redirect to " 1468 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1352 NIPQUAD_FMT "/%02x dropped\n", 1469 &rt->rt_dst, rt->fl.fl4_tos);
1353 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1354#endif 1470#endif
1355 rt_del(hash, rt); 1471 rt_del(hash, rt);
1356 ret = NULL; 1472 ret = NULL;
@@ -1414,10 +1530,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1414 if (IN_DEV_LOG_MARTIANS(in_dev) && 1530 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1415 rt->u.dst.rate_tokens == ip_rt_redirect_number && 1531 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1416 net_ratelimit()) 1532 net_ratelimit())
1417 printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores " 1533 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1418 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n", 1534 &rt->rt_src, rt->rt_iif,
1419 NIPQUAD(rt->rt_src), rt->rt_iif, 1535 &rt->rt_dst, &rt->rt_gateway);
1420 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1421#endif 1536#endif
1422 } 1537 }
1423out: 1538out:
@@ -1610,8 +1725,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
1610 1725
1611static int ip_rt_bug(struct sk_buff *skb) 1726static int ip_rt_bug(struct sk_buff *skb)
1612{ 1727{
1613 printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n", 1728 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1614 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr), 1729 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1615 skb->dev ? skb->dev->name : "?"); 1730 skb->dev ? skb->dev->name : "?");
1616 kfree_skb(skb); 1731 kfree_skb(skb);
1617 return 0; 1732 return 0;
@@ -1788,9 +1903,8 @@ static void ip_handle_martian_source(struct net_device *dev,
1788 * RFC1812 recommendation, if source is martian, 1903 * RFC1812 recommendation, if source is martian,
1789 * the only hint is MAC header. 1904 * the only hint is MAC header.
1790 */ 1905 */
1791 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from " 1906 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1792 NIPQUAD_FMT", on dev %s\n", 1907 &daddr, &saddr, dev->name);
1793 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1794 if (dev->hard_header_len && skb_mac_header_was_set(skb)) { 1908 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1795 int i; 1909 int i;
1796 const unsigned char *p = skb_mac_header(skb); 1910 const unsigned char *p = skb_mac_header(skb);
@@ -2099,9 +2213,8 @@ martian_destination:
2099 RT_CACHE_STAT_INC(in_martian_dst); 2213 RT_CACHE_STAT_INC(in_martian_dst);
2100#ifdef CONFIG_IP_ROUTE_VERBOSE 2214#ifdef CONFIG_IP_ROUTE_VERBOSE
2101 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 2215 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2102 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from " 2216 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2103 NIPQUAD_FMT ", dev %s\n", 2217 &daddr, &saddr, dev->name);
2104 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2105#endif 2218#endif
2106 2219
2107e_hostunreach: 2220e_hostunreach:
@@ -2130,6 +2243,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2130 struct net *net; 2243 struct net *net;
2131 2244
2132 net = dev_net(dev); 2245 net = dev_net(dev);
2246
2247 if (!rt_caching(net))
2248 goto skip_cache;
2249
2133 tos &= IPTOS_RT_MASK; 2250 tos &= IPTOS_RT_MASK;
2134 hash = rt_hash(daddr, saddr, iif, rt_genid(net)); 2251 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2135 2252
@@ -2154,6 +2271,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2154 } 2271 }
2155 rcu_read_unlock(); 2272 rcu_read_unlock();
2156 2273
2274skip_cache:
2157 /* Multicast recognition logic is moved from route cache to here. 2275 /* Multicast recognition logic is moved from route cache to here.
2158 The problem was that too many Ethernet cards have broken/missing 2276 The problem was that too many Ethernet cards have broken/missing
2159 hardware multicast filters :-( As result the host on multicasting 2277 hardware multicast filters :-( As result the host on multicasting
@@ -2539,6 +2657,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2539 unsigned hash; 2657 unsigned hash;
2540 struct rtable *rth; 2658 struct rtable *rth;
2541 2659
2660 if (!rt_caching(net))
2661 goto slow_output;
2662
2542 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2663 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2543 2664
2544 rcu_read_lock_bh(); 2665 rcu_read_lock_bh();
@@ -2563,6 +2684,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2563 } 2684 }
2564 rcu_read_unlock_bh(); 2685 rcu_read_unlock_bh();
2565 2686
2687slow_output:
2566 return ip_route_output_slow(net, rp, flp); 2688 return ip_route_output_slow(net, rp, flp);
2567} 2689}
2568 2690
@@ -2578,7 +2700,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2578 .destroy = ipv4_dst_destroy, 2700 .destroy = ipv4_dst_destroy,
2579 .check = ipv4_dst_check, 2701 .check = ipv4_dst_check,
2580 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2702 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2581 .entry_size = sizeof(struct rtable),
2582 .entries = ATOMIC_INIT(0), 2703 .entries = ATOMIC_INIT(0),
2583}; 2704};
2584 2705
@@ -2640,7 +2761,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2640 flp->fl4_src = (*rp)->rt_src; 2761 flp->fl4_src = (*rp)->rt_src;
2641 if (!flp->fl4_dst) 2762 if (!flp->fl4_dst)
2642 flp->fl4_dst = (*rp)->rt_dst; 2763 flp->fl4_dst = (*rp)->rt_dst;
2643 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, 2764 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2644 flags ? XFRM_LOOKUP_WAIT : 0); 2765 flags ? XFRM_LOOKUP_WAIT : 0);
2645 if (err == -EREMOTE) 2766 if (err == -EREMOTE)
2646 err = ipv4_dst_blackhole(net, rp, flp); 2767 err = ipv4_dst_blackhole(net, rp, flp);
@@ -2995,7 +3116,7 @@ static ctl_table ipv4_route_table[] = {
2995 .data = &ipv4_dst_ops.gc_thresh, 3116 .data = &ipv4_dst_ops.gc_thresh,
2996 .maxlen = sizeof(int), 3117 .maxlen = sizeof(int),
2997 .mode = 0644, 3118 .mode = 0644,
2998 .proc_handler = &proc_dointvec, 3119 .proc_handler = proc_dointvec,
2999 }, 3120 },
3000 { 3121 {
3001 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, 3122 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
@@ -3003,7 +3124,7 @@ static ctl_table ipv4_route_table[] = {
3003 .data = &ip_rt_max_size, 3124 .data = &ip_rt_max_size,
3004 .maxlen = sizeof(int), 3125 .maxlen = sizeof(int),
3005 .mode = 0644, 3126 .mode = 0644,
3006 .proc_handler = &proc_dointvec, 3127 .proc_handler = proc_dointvec,
3007 }, 3128 },
3008 { 3129 {
3009 /* Deprecated. Use gc_min_interval_ms */ 3130 /* Deprecated. Use gc_min_interval_ms */
@@ -3013,8 +3134,8 @@ static ctl_table ipv4_route_table[] = {
3013 .data = &ip_rt_gc_min_interval, 3134 .data = &ip_rt_gc_min_interval,
3014 .maxlen = sizeof(int), 3135 .maxlen = sizeof(int),
3015 .mode = 0644, 3136 .mode = 0644,
3016 .proc_handler = &proc_dointvec_jiffies, 3137 .proc_handler = proc_dointvec_jiffies,
3017 .strategy = &sysctl_jiffies, 3138 .strategy = sysctl_jiffies,
3018 }, 3139 },
3019 { 3140 {
3020 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, 3141 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
@@ -3022,8 +3143,8 @@ static ctl_table ipv4_route_table[] = {
3022 .data = &ip_rt_gc_min_interval, 3143 .data = &ip_rt_gc_min_interval,
3023 .maxlen = sizeof(int), 3144 .maxlen = sizeof(int),
3024 .mode = 0644, 3145 .mode = 0644,
3025 .proc_handler = &proc_dointvec_ms_jiffies, 3146 .proc_handler = proc_dointvec_ms_jiffies,
3026 .strategy = &sysctl_ms_jiffies, 3147 .strategy = sysctl_ms_jiffies,
3027 }, 3148 },
3028 { 3149 {
3029 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, 3150 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
@@ -3031,8 +3152,8 @@ static ctl_table ipv4_route_table[] = {
3031 .data = &ip_rt_gc_timeout, 3152 .data = &ip_rt_gc_timeout,
3032 .maxlen = sizeof(int), 3153 .maxlen = sizeof(int),
3033 .mode = 0644, 3154 .mode = 0644,
3034 .proc_handler = &proc_dointvec_jiffies, 3155 .proc_handler = proc_dointvec_jiffies,
3035 .strategy = &sysctl_jiffies, 3156 .strategy = sysctl_jiffies,
3036 }, 3157 },
3037 { 3158 {
3038 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, 3159 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
@@ -3040,8 +3161,8 @@ static ctl_table ipv4_route_table[] = {
3040 .data = &ip_rt_gc_interval, 3161 .data = &ip_rt_gc_interval,
3041 .maxlen = sizeof(int), 3162 .maxlen = sizeof(int),
3042 .mode = 0644, 3163 .mode = 0644,
3043 .proc_handler = &proc_dointvec_jiffies, 3164 .proc_handler = proc_dointvec_jiffies,
3044 .strategy = &sysctl_jiffies, 3165 .strategy = sysctl_jiffies,
3045 }, 3166 },
3046 { 3167 {
3047 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, 3168 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
@@ -3049,7 +3170,7 @@ static ctl_table ipv4_route_table[] = {
3049 .data = &ip_rt_redirect_load, 3170 .data = &ip_rt_redirect_load,
3050 .maxlen = sizeof(int), 3171 .maxlen = sizeof(int),
3051 .mode = 0644, 3172 .mode = 0644,
3052 .proc_handler = &proc_dointvec, 3173 .proc_handler = proc_dointvec,
3053 }, 3174 },
3054 { 3175 {
3055 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, 3176 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
@@ -3057,7 +3178,7 @@ static ctl_table ipv4_route_table[] = {
3057 .data = &ip_rt_redirect_number, 3178 .data = &ip_rt_redirect_number,
3058 .maxlen = sizeof(int), 3179 .maxlen = sizeof(int),
3059 .mode = 0644, 3180 .mode = 0644,
3060 .proc_handler = &proc_dointvec, 3181 .proc_handler = proc_dointvec,
3061 }, 3182 },
3062 { 3183 {
3063 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, 3184 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
@@ -3065,7 +3186,7 @@ static ctl_table ipv4_route_table[] = {
3065 .data = &ip_rt_redirect_silence, 3186 .data = &ip_rt_redirect_silence,
3066 .maxlen = sizeof(int), 3187 .maxlen = sizeof(int),
3067 .mode = 0644, 3188 .mode = 0644,
3068 .proc_handler = &proc_dointvec, 3189 .proc_handler = proc_dointvec,
3069 }, 3190 },
3070 { 3191 {
3071 .ctl_name = NET_IPV4_ROUTE_ERROR_COST, 3192 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
@@ -3073,7 +3194,7 @@ static ctl_table ipv4_route_table[] = {
3073 .data = &ip_rt_error_cost, 3194 .data = &ip_rt_error_cost,
3074 .maxlen = sizeof(int), 3195 .maxlen = sizeof(int),
3075 .mode = 0644, 3196 .mode = 0644,
3076 .proc_handler = &proc_dointvec, 3197 .proc_handler = proc_dointvec,
3077 }, 3198 },
3078 { 3199 {
3079 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, 3200 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
@@ -3081,7 +3202,7 @@ static ctl_table ipv4_route_table[] = {
3081 .data = &ip_rt_error_burst, 3202 .data = &ip_rt_error_burst,
3082 .maxlen = sizeof(int), 3203 .maxlen = sizeof(int),
3083 .mode = 0644, 3204 .mode = 0644,
3084 .proc_handler = &proc_dointvec, 3205 .proc_handler = proc_dointvec,
3085 }, 3206 },
3086 { 3207 {
3087 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, 3208 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
@@ -3089,7 +3210,7 @@ static ctl_table ipv4_route_table[] = {
3089 .data = &ip_rt_gc_elasticity, 3210 .data = &ip_rt_gc_elasticity,
3090 .maxlen = sizeof(int), 3211 .maxlen = sizeof(int),
3091 .mode = 0644, 3212 .mode = 0644,
3092 .proc_handler = &proc_dointvec, 3213 .proc_handler = proc_dointvec,
3093 }, 3214 },
3094 { 3215 {
3095 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, 3216 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
@@ -3097,8 +3218,8 @@ static ctl_table ipv4_route_table[] = {
3097 .data = &ip_rt_mtu_expires, 3218 .data = &ip_rt_mtu_expires,
3098 .maxlen = sizeof(int), 3219 .maxlen = sizeof(int),
3099 .mode = 0644, 3220 .mode = 0644,
3100 .proc_handler = &proc_dointvec_jiffies, 3221 .proc_handler = proc_dointvec_jiffies,
3101 .strategy = &sysctl_jiffies, 3222 .strategy = sysctl_jiffies,
3102 }, 3223 },
3103 { 3224 {
3104 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, 3225 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
@@ -3106,7 +3227,7 @@ static ctl_table ipv4_route_table[] = {
3106 .data = &ip_rt_min_pmtu, 3227 .data = &ip_rt_min_pmtu,
3107 .maxlen = sizeof(int), 3228 .maxlen = sizeof(int),
3108 .mode = 0644, 3229 .mode = 0644,
3109 .proc_handler = &proc_dointvec, 3230 .proc_handler = proc_dointvec,
3110 }, 3231 },
3111 { 3232 {
3112 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, 3233 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
@@ -3114,7 +3235,7 @@ static ctl_table ipv4_route_table[] = {
3114 .data = &ip_rt_min_advmss, 3235 .data = &ip_rt_min_advmss,
3115 .maxlen = sizeof(int), 3236 .maxlen = sizeof(int),
3116 .mode = 0644, 3237 .mode = 0644,
3117 .proc_handler = &proc_dointvec, 3238 .proc_handler = proc_dointvec,
3118 }, 3239 },
3119 { 3240 {
3120 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, 3241 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
@@ -3122,8 +3243,8 @@ static ctl_table ipv4_route_table[] = {
3122 .data = &ip_rt_secret_interval, 3243 .data = &ip_rt_secret_interval,
3123 .maxlen = sizeof(int), 3244 .maxlen = sizeof(int),
3124 .mode = 0644, 3245 .mode = 0644,
3125 .proc_handler = &ipv4_sysctl_rt_secret_interval, 3246 .proc_handler = ipv4_sysctl_rt_secret_interval,
3126 .strategy = &ipv4_sysctl_rt_secret_interval_strategy, 3247 .strategy = ipv4_sysctl_rt_secret_interval_strategy,
3127 }, 3248 },
3128 { .ctl_name = 0 } 3249 { .ctl_name = 0 }
3129}; 3250};
@@ -3151,8 +3272,8 @@ static struct ctl_table ipv4_route_flush_table[] = {
3151 .procname = "flush", 3272 .procname = "flush",
3152 .maxlen = sizeof(int), 3273 .maxlen = sizeof(int),
3153 .mode = 0200, 3274 .mode = 0200,
3154 .proc_handler = &ipv4_sysctl_rtcache_flush, 3275 .proc_handler = ipv4_sysctl_rtcache_flush,
3155 .strategy = &ipv4_sysctl_rtcache_flush_strategy, 3276 .strategy = ipv4_sysctl_rtcache_flush_strategy,
3156 }, 3277 },
3157 { .ctl_name = 0 }, 3278 { .ctl_name = 0 },
3158}; 3279};
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1bb10df8ce7d..4710d219f06a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -195,7 +195,7 @@ static struct ctl_table ipv4_table[] = {
195 .data = &sysctl_tcp_timestamps, 195 .data = &sysctl_tcp_timestamps,
196 .maxlen = sizeof(int), 196 .maxlen = sizeof(int),
197 .mode = 0644, 197 .mode = 0644,
198 .proc_handler = &proc_dointvec 198 .proc_handler = proc_dointvec
199 }, 199 },
200 { 200 {
201 .ctl_name = NET_IPV4_TCP_WINDOW_SCALING, 201 .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
@@ -203,7 +203,7 @@ static struct ctl_table ipv4_table[] = {
203 .data = &sysctl_tcp_window_scaling, 203 .data = &sysctl_tcp_window_scaling,
204 .maxlen = sizeof(int), 204 .maxlen = sizeof(int),
205 .mode = 0644, 205 .mode = 0644,
206 .proc_handler = &proc_dointvec 206 .proc_handler = proc_dointvec
207 }, 207 },
208 { 208 {
209 .ctl_name = NET_IPV4_TCP_SACK, 209 .ctl_name = NET_IPV4_TCP_SACK,
@@ -211,7 +211,7 @@ static struct ctl_table ipv4_table[] = {
211 .data = &sysctl_tcp_sack, 211 .data = &sysctl_tcp_sack,
212 .maxlen = sizeof(int), 212 .maxlen = sizeof(int),
213 .mode = 0644, 213 .mode = 0644,
214 .proc_handler = &proc_dointvec 214 .proc_handler = proc_dointvec
215 }, 215 },
216 { 216 {
217 .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE, 217 .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
@@ -219,7 +219,7 @@ static struct ctl_table ipv4_table[] = {
219 .data = &sysctl_tcp_retrans_collapse, 219 .data = &sysctl_tcp_retrans_collapse,
220 .maxlen = sizeof(int), 220 .maxlen = sizeof(int),
221 .mode = 0644, 221 .mode = 0644,
222 .proc_handler = &proc_dointvec 222 .proc_handler = proc_dointvec
223 }, 223 },
224 { 224 {
225 .ctl_name = NET_IPV4_DEFAULT_TTL, 225 .ctl_name = NET_IPV4_DEFAULT_TTL,
@@ -227,8 +227,8 @@ static struct ctl_table ipv4_table[] = {
227 .data = &sysctl_ip_default_ttl, 227 .data = &sysctl_ip_default_ttl,
228 .maxlen = sizeof(int), 228 .maxlen = sizeof(int),
229 .mode = 0644, 229 .mode = 0644,
230 .proc_handler = &ipv4_doint_and_flush, 230 .proc_handler = ipv4_doint_and_flush,
231 .strategy = &ipv4_doint_and_flush_strategy, 231 .strategy = ipv4_doint_and_flush_strategy,
232 .extra2 = &init_net, 232 .extra2 = &init_net,
233 }, 233 },
234 { 234 {
@@ -237,7 +237,7 @@ static struct ctl_table ipv4_table[] = {
237 .data = &ipv4_config.no_pmtu_disc, 237 .data = &ipv4_config.no_pmtu_disc,
238 .maxlen = sizeof(int), 238 .maxlen = sizeof(int),
239 .mode = 0644, 239 .mode = 0644,
240 .proc_handler = &proc_dointvec 240 .proc_handler = proc_dointvec
241 }, 241 },
242 { 242 {
243 .ctl_name = NET_IPV4_NONLOCAL_BIND, 243 .ctl_name = NET_IPV4_NONLOCAL_BIND,
@@ -245,7 +245,7 @@ static struct ctl_table ipv4_table[] = {
245 .data = &sysctl_ip_nonlocal_bind, 245 .data = &sysctl_ip_nonlocal_bind,
246 .maxlen = sizeof(int), 246 .maxlen = sizeof(int),
247 .mode = 0644, 247 .mode = 0644,
248 .proc_handler = &proc_dointvec 248 .proc_handler = proc_dointvec
249 }, 249 },
250 { 250 {
251 .ctl_name = NET_IPV4_TCP_SYN_RETRIES, 251 .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
@@ -253,7 +253,7 @@ static struct ctl_table ipv4_table[] = {
253 .data = &sysctl_tcp_syn_retries, 253 .data = &sysctl_tcp_syn_retries,
254 .maxlen = sizeof(int), 254 .maxlen = sizeof(int),
255 .mode = 0644, 255 .mode = 0644,
256 .proc_handler = &proc_dointvec 256 .proc_handler = proc_dointvec
257 }, 257 },
258 { 258 {
259 .ctl_name = NET_TCP_SYNACK_RETRIES, 259 .ctl_name = NET_TCP_SYNACK_RETRIES,
@@ -261,7 +261,7 @@ static struct ctl_table ipv4_table[] = {
261 .data = &sysctl_tcp_synack_retries, 261 .data = &sysctl_tcp_synack_retries,
262 .maxlen = sizeof(int), 262 .maxlen = sizeof(int),
263 .mode = 0644, 263 .mode = 0644,
264 .proc_handler = &proc_dointvec 264 .proc_handler = proc_dointvec
265 }, 265 },
266 { 266 {
267 .ctl_name = NET_TCP_MAX_ORPHANS, 267 .ctl_name = NET_TCP_MAX_ORPHANS,
@@ -269,7 +269,7 @@ static struct ctl_table ipv4_table[] = {
269 .data = &sysctl_tcp_max_orphans, 269 .data = &sysctl_tcp_max_orphans,
270 .maxlen = sizeof(int), 270 .maxlen = sizeof(int),
271 .mode = 0644, 271 .mode = 0644,
272 .proc_handler = &proc_dointvec 272 .proc_handler = proc_dointvec
273 }, 273 },
274 { 274 {
275 .ctl_name = NET_TCP_MAX_TW_BUCKETS, 275 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
@@ -277,7 +277,7 @@ static struct ctl_table ipv4_table[] = {
277 .data = &tcp_death_row.sysctl_max_tw_buckets, 277 .data = &tcp_death_row.sysctl_max_tw_buckets,
278 .maxlen = sizeof(int), 278 .maxlen = sizeof(int),
279 .mode = 0644, 279 .mode = 0644,
280 .proc_handler = &proc_dointvec 280 .proc_handler = proc_dointvec
281 }, 281 },
282 { 282 {
283 .ctl_name = NET_IPV4_DYNADDR, 283 .ctl_name = NET_IPV4_DYNADDR,
@@ -285,7 +285,7 @@ static struct ctl_table ipv4_table[] = {
285 .data = &sysctl_ip_dynaddr, 285 .data = &sysctl_ip_dynaddr,
286 .maxlen = sizeof(int), 286 .maxlen = sizeof(int),
287 .mode = 0644, 287 .mode = 0644,
288 .proc_handler = &proc_dointvec 288 .proc_handler = proc_dointvec
289 }, 289 },
290 { 290 {
291 .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME, 291 .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
@@ -293,8 +293,8 @@ static struct ctl_table ipv4_table[] = {
293 .data = &sysctl_tcp_keepalive_time, 293 .data = &sysctl_tcp_keepalive_time,
294 .maxlen = sizeof(int), 294 .maxlen = sizeof(int),
295 .mode = 0644, 295 .mode = 0644,
296 .proc_handler = &proc_dointvec_jiffies, 296 .proc_handler = proc_dointvec_jiffies,
297 .strategy = &sysctl_jiffies 297 .strategy = sysctl_jiffies
298 }, 298 },
299 { 299 {
300 .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES, 300 .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
@@ -302,7 +302,7 @@ static struct ctl_table ipv4_table[] = {
302 .data = &sysctl_tcp_keepalive_probes, 302 .data = &sysctl_tcp_keepalive_probes,
303 .maxlen = sizeof(int), 303 .maxlen = sizeof(int),
304 .mode = 0644, 304 .mode = 0644,
305 .proc_handler = &proc_dointvec 305 .proc_handler = proc_dointvec
306 }, 306 },
307 { 307 {
308 .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL, 308 .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
@@ -310,8 +310,8 @@ static struct ctl_table ipv4_table[] = {
310 .data = &sysctl_tcp_keepalive_intvl, 310 .data = &sysctl_tcp_keepalive_intvl,
311 .maxlen = sizeof(int), 311 .maxlen = sizeof(int),
312 .mode = 0644, 312 .mode = 0644,
313 .proc_handler = &proc_dointvec_jiffies, 313 .proc_handler = proc_dointvec_jiffies,
314 .strategy = &sysctl_jiffies 314 .strategy = sysctl_jiffies
315 }, 315 },
316 { 316 {
317 .ctl_name = NET_IPV4_TCP_RETRIES1, 317 .ctl_name = NET_IPV4_TCP_RETRIES1,
@@ -319,8 +319,8 @@ static struct ctl_table ipv4_table[] = {
319 .data = &sysctl_tcp_retries1, 319 .data = &sysctl_tcp_retries1,
320 .maxlen = sizeof(int), 320 .maxlen = sizeof(int),
321 .mode = 0644, 321 .mode = 0644,
322 .proc_handler = &proc_dointvec_minmax, 322 .proc_handler = proc_dointvec_minmax,
323 .strategy = &sysctl_intvec, 323 .strategy = sysctl_intvec,
324 .extra2 = &tcp_retr1_max 324 .extra2 = &tcp_retr1_max
325 }, 325 },
326 { 326 {
@@ -329,7 +329,7 @@ static struct ctl_table ipv4_table[] = {
329 .data = &sysctl_tcp_retries2, 329 .data = &sysctl_tcp_retries2,
330 .maxlen = sizeof(int), 330 .maxlen = sizeof(int),
331 .mode = 0644, 331 .mode = 0644,
332 .proc_handler = &proc_dointvec 332 .proc_handler = proc_dointvec
333 }, 333 },
334 { 334 {
335 .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT, 335 .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
@@ -337,8 +337,8 @@ static struct ctl_table ipv4_table[] = {
337 .data = &sysctl_tcp_fin_timeout, 337 .data = &sysctl_tcp_fin_timeout,
338 .maxlen = sizeof(int), 338 .maxlen = sizeof(int),
339 .mode = 0644, 339 .mode = 0644,
340 .proc_handler = &proc_dointvec_jiffies, 340 .proc_handler = proc_dointvec_jiffies,
341 .strategy = &sysctl_jiffies 341 .strategy = sysctl_jiffies
342 }, 342 },
343#ifdef CONFIG_SYN_COOKIES 343#ifdef CONFIG_SYN_COOKIES
344 { 344 {
@@ -347,7 +347,7 @@ static struct ctl_table ipv4_table[] = {
347 .data = &sysctl_tcp_syncookies, 347 .data = &sysctl_tcp_syncookies,
348 .maxlen = sizeof(int), 348 .maxlen = sizeof(int),
349 .mode = 0644, 349 .mode = 0644,
350 .proc_handler = &proc_dointvec 350 .proc_handler = proc_dointvec
351 }, 351 },
352#endif 352#endif
353 { 353 {
@@ -356,7 +356,7 @@ static struct ctl_table ipv4_table[] = {
356 .data = &tcp_death_row.sysctl_tw_recycle, 356 .data = &tcp_death_row.sysctl_tw_recycle,
357 .maxlen = sizeof(int), 357 .maxlen = sizeof(int),
358 .mode = 0644, 358 .mode = 0644,
359 .proc_handler = &proc_dointvec 359 .proc_handler = proc_dointvec
360 }, 360 },
361 { 361 {
362 .ctl_name = NET_TCP_ABORT_ON_OVERFLOW, 362 .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
@@ -364,7 +364,7 @@ static struct ctl_table ipv4_table[] = {
364 .data = &sysctl_tcp_abort_on_overflow, 364 .data = &sysctl_tcp_abort_on_overflow,
365 .maxlen = sizeof(int), 365 .maxlen = sizeof(int),
366 .mode = 0644, 366 .mode = 0644,
367 .proc_handler = &proc_dointvec 367 .proc_handler = proc_dointvec
368 }, 368 },
369 { 369 {
370 .ctl_name = NET_TCP_STDURG, 370 .ctl_name = NET_TCP_STDURG,
@@ -372,7 +372,7 @@ static struct ctl_table ipv4_table[] = {
372 .data = &sysctl_tcp_stdurg, 372 .data = &sysctl_tcp_stdurg,
373 .maxlen = sizeof(int), 373 .maxlen = sizeof(int),
374 .mode = 0644, 374 .mode = 0644,
375 .proc_handler = &proc_dointvec 375 .proc_handler = proc_dointvec
376 }, 376 },
377 { 377 {
378 .ctl_name = NET_TCP_RFC1337, 378 .ctl_name = NET_TCP_RFC1337,
@@ -380,7 +380,7 @@ static struct ctl_table ipv4_table[] = {
380 .data = &sysctl_tcp_rfc1337, 380 .data = &sysctl_tcp_rfc1337,
381 .maxlen = sizeof(int), 381 .maxlen = sizeof(int),
382 .mode = 0644, 382 .mode = 0644,
383 .proc_handler = &proc_dointvec 383 .proc_handler = proc_dointvec
384 }, 384 },
385 { 385 {
386 .ctl_name = NET_TCP_MAX_SYN_BACKLOG, 386 .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
@@ -388,7 +388,7 @@ static struct ctl_table ipv4_table[] = {
388 .data = &sysctl_max_syn_backlog, 388 .data = &sysctl_max_syn_backlog,
389 .maxlen = sizeof(int), 389 .maxlen = sizeof(int),
390 .mode = 0644, 390 .mode = 0644,
391 .proc_handler = &proc_dointvec 391 .proc_handler = proc_dointvec
392 }, 392 },
393 { 393 {
394 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, 394 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
@@ -396,8 +396,8 @@ static struct ctl_table ipv4_table[] = {
396 .data = &sysctl_local_ports.range, 396 .data = &sysctl_local_ports.range,
397 .maxlen = sizeof(sysctl_local_ports.range), 397 .maxlen = sizeof(sysctl_local_ports.range),
398 .mode = 0644, 398 .mode = 0644,
399 .proc_handler = &ipv4_local_port_range, 399 .proc_handler = ipv4_local_port_range,
400 .strategy = &ipv4_sysctl_local_port_range, 400 .strategy = ipv4_sysctl_local_port_range,
401 }, 401 },
402#ifdef CONFIG_IP_MULTICAST 402#ifdef CONFIG_IP_MULTICAST
403 { 403 {
@@ -406,7 +406,7 @@ static struct ctl_table ipv4_table[] = {
406 .data = &sysctl_igmp_max_memberships, 406 .data = &sysctl_igmp_max_memberships,
407 .maxlen = sizeof(int), 407 .maxlen = sizeof(int),
408 .mode = 0644, 408 .mode = 0644,
409 .proc_handler = &proc_dointvec 409 .proc_handler = proc_dointvec
410 }, 410 },
411 411
412#endif 412#endif
@@ -416,7 +416,7 @@ static struct ctl_table ipv4_table[] = {
416 .data = &sysctl_igmp_max_msf, 416 .data = &sysctl_igmp_max_msf,
417 .maxlen = sizeof(int), 417 .maxlen = sizeof(int),
418 .mode = 0644, 418 .mode = 0644,
419 .proc_handler = &proc_dointvec 419 .proc_handler = proc_dointvec
420 }, 420 },
421 { 421 {
422 .ctl_name = NET_IPV4_INET_PEER_THRESHOLD, 422 .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
@@ -424,7 +424,7 @@ static struct ctl_table ipv4_table[] = {
424 .data = &inet_peer_threshold, 424 .data = &inet_peer_threshold,
425 .maxlen = sizeof(int), 425 .maxlen = sizeof(int),
426 .mode = 0644, 426 .mode = 0644,
427 .proc_handler = &proc_dointvec 427 .proc_handler = proc_dointvec
428 }, 428 },
429 { 429 {
430 .ctl_name = NET_IPV4_INET_PEER_MINTTL, 430 .ctl_name = NET_IPV4_INET_PEER_MINTTL,
@@ -432,8 +432,8 @@ static struct ctl_table ipv4_table[] = {
432 .data = &inet_peer_minttl, 432 .data = &inet_peer_minttl,
433 .maxlen = sizeof(int), 433 .maxlen = sizeof(int),
434 .mode = 0644, 434 .mode = 0644,
435 .proc_handler = &proc_dointvec_jiffies, 435 .proc_handler = proc_dointvec_jiffies,
436 .strategy = &sysctl_jiffies 436 .strategy = sysctl_jiffies
437 }, 437 },
438 { 438 {
439 .ctl_name = NET_IPV4_INET_PEER_MAXTTL, 439 .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
@@ -441,8 +441,8 @@ static struct ctl_table ipv4_table[] = {
441 .data = &inet_peer_maxttl, 441 .data = &inet_peer_maxttl,
442 .maxlen = sizeof(int), 442 .maxlen = sizeof(int),
443 .mode = 0644, 443 .mode = 0644,
444 .proc_handler = &proc_dointvec_jiffies, 444 .proc_handler = proc_dointvec_jiffies,
445 .strategy = &sysctl_jiffies 445 .strategy = sysctl_jiffies
446 }, 446 },
447 { 447 {
448 .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME, 448 .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
@@ -450,8 +450,8 @@ static struct ctl_table ipv4_table[] = {
450 .data = &inet_peer_gc_mintime, 450 .data = &inet_peer_gc_mintime,
451 .maxlen = sizeof(int), 451 .maxlen = sizeof(int),
452 .mode = 0644, 452 .mode = 0644,
453 .proc_handler = &proc_dointvec_jiffies, 453 .proc_handler = proc_dointvec_jiffies,
454 .strategy = &sysctl_jiffies 454 .strategy = sysctl_jiffies
455 }, 455 },
456 { 456 {
457 .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME, 457 .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
@@ -459,8 +459,8 @@ static struct ctl_table ipv4_table[] = {
459 .data = &inet_peer_gc_maxtime, 459 .data = &inet_peer_gc_maxtime,
460 .maxlen = sizeof(int), 460 .maxlen = sizeof(int),
461 .mode = 0644, 461 .mode = 0644,
462 .proc_handler = &proc_dointvec_jiffies, 462 .proc_handler = proc_dointvec_jiffies,
463 .strategy = &sysctl_jiffies 463 .strategy = sysctl_jiffies
464 }, 464 },
465 { 465 {
466 .ctl_name = NET_TCP_ORPHAN_RETRIES, 466 .ctl_name = NET_TCP_ORPHAN_RETRIES,
@@ -468,7 +468,7 @@ static struct ctl_table ipv4_table[] = {
468 .data = &sysctl_tcp_orphan_retries, 468 .data = &sysctl_tcp_orphan_retries,
469 .maxlen = sizeof(int), 469 .maxlen = sizeof(int),
470 .mode = 0644, 470 .mode = 0644,
471 .proc_handler = &proc_dointvec 471 .proc_handler = proc_dointvec
472 }, 472 },
473 { 473 {
474 .ctl_name = NET_TCP_FACK, 474 .ctl_name = NET_TCP_FACK,
@@ -476,7 +476,7 @@ static struct ctl_table ipv4_table[] = {
476 .data = &sysctl_tcp_fack, 476 .data = &sysctl_tcp_fack,
477 .maxlen = sizeof(int), 477 .maxlen = sizeof(int),
478 .mode = 0644, 478 .mode = 0644,
479 .proc_handler = &proc_dointvec 479 .proc_handler = proc_dointvec
480 }, 480 },
481 { 481 {
482 .ctl_name = NET_TCP_REORDERING, 482 .ctl_name = NET_TCP_REORDERING,
@@ -484,7 +484,7 @@ static struct ctl_table ipv4_table[] = {
484 .data = &sysctl_tcp_reordering, 484 .data = &sysctl_tcp_reordering,
485 .maxlen = sizeof(int), 485 .maxlen = sizeof(int),
486 .mode = 0644, 486 .mode = 0644,
487 .proc_handler = &proc_dointvec 487 .proc_handler = proc_dointvec
488 }, 488 },
489 { 489 {
490 .ctl_name = NET_TCP_ECN, 490 .ctl_name = NET_TCP_ECN,
@@ -492,7 +492,7 @@ static struct ctl_table ipv4_table[] = {
492 .data = &sysctl_tcp_ecn, 492 .data = &sysctl_tcp_ecn,
493 .maxlen = sizeof(int), 493 .maxlen = sizeof(int),
494 .mode = 0644, 494 .mode = 0644,
495 .proc_handler = &proc_dointvec 495 .proc_handler = proc_dointvec
496 }, 496 },
497 { 497 {
498 .ctl_name = NET_TCP_DSACK, 498 .ctl_name = NET_TCP_DSACK,
@@ -500,7 +500,7 @@ static struct ctl_table ipv4_table[] = {
500 .data = &sysctl_tcp_dsack, 500 .data = &sysctl_tcp_dsack,
501 .maxlen = sizeof(int), 501 .maxlen = sizeof(int),
502 .mode = 0644, 502 .mode = 0644,
503 .proc_handler = &proc_dointvec 503 .proc_handler = proc_dointvec
504 }, 504 },
505 { 505 {
506 .ctl_name = NET_TCP_MEM, 506 .ctl_name = NET_TCP_MEM,
@@ -508,7 +508,7 @@ static struct ctl_table ipv4_table[] = {
508 .data = &sysctl_tcp_mem, 508 .data = &sysctl_tcp_mem,
509 .maxlen = sizeof(sysctl_tcp_mem), 509 .maxlen = sizeof(sysctl_tcp_mem),
510 .mode = 0644, 510 .mode = 0644,
511 .proc_handler = &proc_dointvec 511 .proc_handler = proc_dointvec
512 }, 512 },
513 { 513 {
514 .ctl_name = NET_TCP_WMEM, 514 .ctl_name = NET_TCP_WMEM,
@@ -516,7 +516,7 @@ static struct ctl_table ipv4_table[] = {
516 .data = &sysctl_tcp_wmem, 516 .data = &sysctl_tcp_wmem,
517 .maxlen = sizeof(sysctl_tcp_wmem), 517 .maxlen = sizeof(sysctl_tcp_wmem),
518 .mode = 0644, 518 .mode = 0644,
519 .proc_handler = &proc_dointvec 519 .proc_handler = proc_dointvec
520 }, 520 },
521 { 521 {
522 .ctl_name = NET_TCP_RMEM, 522 .ctl_name = NET_TCP_RMEM,
@@ -524,7 +524,7 @@ static struct ctl_table ipv4_table[] = {
524 .data = &sysctl_tcp_rmem, 524 .data = &sysctl_tcp_rmem,
525 .maxlen = sizeof(sysctl_tcp_rmem), 525 .maxlen = sizeof(sysctl_tcp_rmem),
526 .mode = 0644, 526 .mode = 0644,
527 .proc_handler = &proc_dointvec 527 .proc_handler = proc_dointvec
528 }, 528 },
529 { 529 {
530 .ctl_name = NET_TCP_APP_WIN, 530 .ctl_name = NET_TCP_APP_WIN,
@@ -532,7 +532,7 @@ static struct ctl_table ipv4_table[] = {
532 .data = &sysctl_tcp_app_win, 532 .data = &sysctl_tcp_app_win,
533 .maxlen = sizeof(int), 533 .maxlen = sizeof(int),
534 .mode = 0644, 534 .mode = 0644,
535 .proc_handler = &proc_dointvec 535 .proc_handler = proc_dointvec
536 }, 536 },
537 { 537 {
538 .ctl_name = NET_TCP_ADV_WIN_SCALE, 538 .ctl_name = NET_TCP_ADV_WIN_SCALE,
@@ -540,7 +540,7 @@ static struct ctl_table ipv4_table[] = {
540 .data = &sysctl_tcp_adv_win_scale, 540 .data = &sysctl_tcp_adv_win_scale,
541 .maxlen = sizeof(int), 541 .maxlen = sizeof(int),
542 .mode = 0644, 542 .mode = 0644,
543 .proc_handler = &proc_dointvec 543 .proc_handler = proc_dointvec
544 }, 544 },
545 { 545 {
546 .ctl_name = NET_TCP_TW_REUSE, 546 .ctl_name = NET_TCP_TW_REUSE,
@@ -548,7 +548,7 @@ static struct ctl_table ipv4_table[] = {
548 .data = &sysctl_tcp_tw_reuse, 548 .data = &sysctl_tcp_tw_reuse,
549 .maxlen = sizeof(int), 549 .maxlen = sizeof(int),
550 .mode = 0644, 550 .mode = 0644,
551 .proc_handler = &proc_dointvec 551 .proc_handler = proc_dointvec
552 }, 552 },
553 { 553 {
554 .ctl_name = NET_TCP_FRTO, 554 .ctl_name = NET_TCP_FRTO,
@@ -556,7 +556,7 @@ static struct ctl_table ipv4_table[] = {
556 .data = &sysctl_tcp_frto, 556 .data = &sysctl_tcp_frto,
557 .maxlen = sizeof(int), 557 .maxlen = sizeof(int),
558 .mode = 0644, 558 .mode = 0644,
559 .proc_handler = &proc_dointvec 559 .proc_handler = proc_dointvec
560 }, 560 },
561 { 561 {
562 .ctl_name = NET_TCP_FRTO_RESPONSE, 562 .ctl_name = NET_TCP_FRTO_RESPONSE,
@@ -564,7 +564,7 @@ static struct ctl_table ipv4_table[] = {
564 .data = &sysctl_tcp_frto_response, 564 .data = &sysctl_tcp_frto_response,
565 .maxlen = sizeof(int), 565 .maxlen = sizeof(int),
566 .mode = 0644, 566 .mode = 0644,
567 .proc_handler = &proc_dointvec 567 .proc_handler = proc_dointvec
568 }, 568 },
569 { 569 {
570 .ctl_name = NET_TCP_LOW_LATENCY, 570 .ctl_name = NET_TCP_LOW_LATENCY,
@@ -572,7 +572,7 @@ static struct ctl_table ipv4_table[] = {
572 .data = &sysctl_tcp_low_latency, 572 .data = &sysctl_tcp_low_latency,
573 .maxlen = sizeof(int), 573 .maxlen = sizeof(int),
574 .mode = 0644, 574 .mode = 0644,
575 .proc_handler = &proc_dointvec 575 .proc_handler = proc_dointvec
576 }, 576 },
577 { 577 {
578 .ctl_name = NET_TCP_NO_METRICS_SAVE, 578 .ctl_name = NET_TCP_NO_METRICS_SAVE,
@@ -580,7 +580,7 @@ static struct ctl_table ipv4_table[] = {
580 .data = &sysctl_tcp_nometrics_save, 580 .data = &sysctl_tcp_nometrics_save,
581 .maxlen = sizeof(int), 581 .maxlen = sizeof(int),
582 .mode = 0644, 582 .mode = 0644,
583 .proc_handler = &proc_dointvec, 583 .proc_handler = proc_dointvec,
584 }, 584 },
585 { 585 {
586 .ctl_name = NET_TCP_MODERATE_RCVBUF, 586 .ctl_name = NET_TCP_MODERATE_RCVBUF,
@@ -588,7 +588,7 @@ static struct ctl_table ipv4_table[] = {
588 .data = &sysctl_tcp_moderate_rcvbuf, 588 .data = &sysctl_tcp_moderate_rcvbuf,
589 .maxlen = sizeof(int), 589 .maxlen = sizeof(int),
590 .mode = 0644, 590 .mode = 0644,
591 .proc_handler = &proc_dointvec, 591 .proc_handler = proc_dointvec,
592 }, 592 },
593 { 593 {
594 .ctl_name = NET_TCP_TSO_WIN_DIVISOR, 594 .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
@@ -596,15 +596,15 @@ static struct ctl_table ipv4_table[] = {
596 .data = &sysctl_tcp_tso_win_divisor, 596 .data = &sysctl_tcp_tso_win_divisor,
597 .maxlen = sizeof(int), 597 .maxlen = sizeof(int),
598 .mode = 0644, 598 .mode = 0644,
599 .proc_handler = &proc_dointvec, 599 .proc_handler = proc_dointvec,
600 }, 600 },
601 { 601 {
602 .ctl_name = NET_TCP_CONG_CONTROL, 602 .ctl_name = NET_TCP_CONG_CONTROL,
603 .procname = "tcp_congestion_control", 603 .procname = "tcp_congestion_control",
604 .mode = 0644, 604 .mode = 0644,
605 .maxlen = TCP_CA_NAME_MAX, 605 .maxlen = TCP_CA_NAME_MAX,
606 .proc_handler = &proc_tcp_congestion_control, 606 .proc_handler = proc_tcp_congestion_control,
607 .strategy = &sysctl_tcp_congestion_control, 607 .strategy = sysctl_tcp_congestion_control,
608 }, 608 },
609 { 609 {
610 .ctl_name = NET_TCP_ABC, 610 .ctl_name = NET_TCP_ABC,
@@ -612,7 +612,7 @@ static struct ctl_table ipv4_table[] = {
612 .data = &sysctl_tcp_abc, 612 .data = &sysctl_tcp_abc,
613 .maxlen = sizeof(int), 613 .maxlen = sizeof(int),
614 .mode = 0644, 614 .mode = 0644,
615 .proc_handler = &proc_dointvec, 615 .proc_handler = proc_dointvec,
616 }, 616 },
617 { 617 {
618 .ctl_name = NET_TCP_MTU_PROBING, 618 .ctl_name = NET_TCP_MTU_PROBING,
@@ -620,7 +620,7 @@ static struct ctl_table ipv4_table[] = {
620 .data = &sysctl_tcp_mtu_probing, 620 .data = &sysctl_tcp_mtu_probing,
621 .maxlen = sizeof(int), 621 .maxlen = sizeof(int),
622 .mode = 0644, 622 .mode = 0644,
623 .proc_handler = &proc_dointvec, 623 .proc_handler = proc_dointvec,
624 }, 624 },
625 { 625 {
626 .ctl_name = NET_TCP_BASE_MSS, 626 .ctl_name = NET_TCP_BASE_MSS,
@@ -628,7 +628,7 @@ static struct ctl_table ipv4_table[] = {
628 .data = &sysctl_tcp_base_mss, 628 .data = &sysctl_tcp_base_mss,
629 .maxlen = sizeof(int), 629 .maxlen = sizeof(int),
630 .mode = 0644, 630 .mode = 0644,
631 .proc_handler = &proc_dointvec, 631 .proc_handler = proc_dointvec,
632 }, 632 },
633 { 633 {
634 .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, 634 .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,
@@ -636,7 +636,7 @@ static struct ctl_table ipv4_table[] = {
636 .data = &sysctl_tcp_workaround_signed_windows, 636 .data = &sysctl_tcp_workaround_signed_windows,
637 .maxlen = sizeof(int), 637 .maxlen = sizeof(int),
638 .mode = 0644, 638 .mode = 0644,
639 .proc_handler = &proc_dointvec 639 .proc_handler = proc_dointvec
640 }, 640 },
641#ifdef CONFIG_NET_DMA 641#ifdef CONFIG_NET_DMA
642 { 642 {
@@ -645,7 +645,7 @@ static struct ctl_table ipv4_table[] = {
645 .data = &sysctl_tcp_dma_copybreak, 645 .data = &sysctl_tcp_dma_copybreak,
646 .maxlen = sizeof(int), 646 .maxlen = sizeof(int),
647 .mode = 0644, 647 .mode = 0644,
648 .proc_handler = &proc_dointvec 648 .proc_handler = proc_dointvec
649 }, 649 },
650#endif 650#endif
651 { 651 {
@@ -654,7 +654,7 @@ static struct ctl_table ipv4_table[] = {
654 .data = &sysctl_tcp_slow_start_after_idle, 654 .data = &sysctl_tcp_slow_start_after_idle,
655 .maxlen = sizeof(int), 655 .maxlen = sizeof(int),
656 .mode = 0644, 656 .mode = 0644,
657 .proc_handler = &proc_dointvec 657 .proc_handler = proc_dointvec
658 }, 658 },
659#ifdef CONFIG_NETLABEL 659#ifdef CONFIG_NETLABEL
660 { 660 {
@@ -663,7 +663,7 @@ static struct ctl_table ipv4_table[] = {
663 .data = &cipso_v4_cache_enabled, 663 .data = &cipso_v4_cache_enabled,
664 .maxlen = sizeof(int), 664 .maxlen = sizeof(int),
665 .mode = 0644, 665 .mode = 0644,
666 .proc_handler = &proc_dointvec, 666 .proc_handler = proc_dointvec,
667 }, 667 },
668 { 668 {
669 .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE, 669 .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE,
@@ -671,7 +671,7 @@ static struct ctl_table ipv4_table[] = {
671 .data = &cipso_v4_cache_bucketsize, 671 .data = &cipso_v4_cache_bucketsize,
672 .maxlen = sizeof(int), 672 .maxlen = sizeof(int),
673 .mode = 0644, 673 .mode = 0644,
674 .proc_handler = &proc_dointvec, 674 .proc_handler = proc_dointvec,
675 }, 675 },
676 { 676 {
677 .ctl_name = NET_CIPSOV4_RBM_OPTFMT, 677 .ctl_name = NET_CIPSOV4_RBM_OPTFMT,
@@ -679,7 +679,7 @@ static struct ctl_table ipv4_table[] = {
679 .data = &cipso_v4_rbm_optfmt, 679 .data = &cipso_v4_rbm_optfmt,
680 .maxlen = sizeof(int), 680 .maxlen = sizeof(int),
681 .mode = 0644, 681 .mode = 0644,
682 .proc_handler = &proc_dointvec, 682 .proc_handler = proc_dointvec,
683 }, 683 },
684 { 684 {
685 .ctl_name = NET_CIPSOV4_RBM_STRICTVALID, 685 .ctl_name = NET_CIPSOV4_RBM_STRICTVALID,
@@ -687,22 +687,22 @@ static struct ctl_table ipv4_table[] = {
687 .data = &cipso_v4_rbm_strictvalid, 687 .data = &cipso_v4_rbm_strictvalid,
688 .maxlen = sizeof(int), 688 .maxlen = sizeof(int),
689 .mode = 0644, 689 .mode = 0644,
690 .proc_handler = &proc_dointvec, 690 .proc_handler = proc_dointvec,
691 }, 691 },
692#endif /* CONFIG_NETLABEL */ 692#endif /* CONFIG_NETLABEL */
693 { 693 {
694 .procname = "tcp_available_congestion_control", 694 .procname = "tcp_available_congestion_control",
695 .maxlen = TCP_CA_BUF_MAX, 695 .maxlen = TCP_CA_BUF_MAX,
696 .mode = 0444, 696 .mode = 0444,
697 .proc_handler = &proc_tcp_available_congestion_control, 697 .proc_handler = proc_tcp_available_congestion_control,
698 }, 698 },
699 { 699 {
700 .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL, 700 .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL,
701 .procname = "tcp_allowed_congestion_control", 701 .procname = "tcp_allowed_congestion_control",
702 .maxlen = TCP_CA_BUF_MAX, 702 .maxlen = TCP_CA_BUF_MAX,
703 .mode = 0644, 703 .mode = 0644,
704 .proc_handler = &proc_allowed_congestion_control, 704 .proc_handler = proc_allowed_congestion_control,
705 .strategy = &strategy_allowed_congestion_control, 705 .strategy = strategy_allowed_congestion_control,
706 }, 706 },
707 { 707 {
708 .ctl_name = NET_TCP_MAX_SSTHRESH, 708 .ctl_name = NET_TCP_MAX_SSTHRESH,
@@ -710,7 +710,7 @@ static struct ctl_table ipv4_table[] = {
710 .data = &sysctl_tcp_max_ssthresh, 710 .data = &sysctl_tcp_max_ssthresh,
711 .maxlen = sizeof(int), 711 .maxlen = sizeof(int),
712 .mode = 0644, 712 .mode = 0644,
713 .proc_handler = &proc_dointvec, 713 .proc_handler = proc_dointvec,
714 }, 714 },
715 { 715 {
716 .ctl_name = CTL_UNNUMBERED, 716 .ctl_name = CTL_UNNUMBERED,
@@ -718,8 +718,8 @@ static struct ctl_table ipv4_table[] = {
718 .data = &sysctl_udp_mem, 718 .data = &sysctl_udp_mem,
719 .maxlen = sizeof(sysctl_udp_mem), 719 .maxlen = sizeof(sysctl_udp_mem),
720 .mode = 0644, 720 .mode = 0644,
721 .proc_handler = &proc_dointvec_minmax, 721 .proc_handler = proc_dointvec_minmax,
722 .strategy = &sysctl_intvec, 722 .strategy = sysctl_intvec,
723 .extra1 = &zero 723 .extra1 = &zero
724 }, 724 },
725 { 725 {
@@ -728,8 +728,8 @@ static struct ctl_table ipv4_table[] = {
728 .data = &sysctl_udp_rmem_min, 728 .data = &sysctl_udp_rmem_min,
729 .maxlen = sizeof(sysctl_udp_rmem_min), 729 .maxlen = sizeof(sysctl_udp_rmem_min),
730 .mode = 0644, 730 .mode = 0644,
731 .proc_handler = &proc_dointvec_minmax, 731 .proc_handler = proc_dointvec_minmax,
732 .strategy = &sysctl_intvec, 732 .strategy = sysctl_intvec,
733 .extra1 = &zero 733 .extra1 = &zero
734 }, 734 },
735 { 735 {
@@ -738,8 +738,8 @@ static struct ctl_table ipv4_table[] = {
738 .data = &sysctl_udp_wmem_min, 738 .data = &sysctl_udp_wmem_min,
739 .maxlen = sizeof(sysctl_udp_wmem_min), 739 .maxlen = sizeof(sysctl_udp_wmem_min),
740 .mode = 0644, 740 .mode = 0644,
741 .proc_handler = &proc_dointvec_minmax, 741 .proc_handler = proc_dointvec_minmax,
742 .strategy = &sysctl_intvec, 742 .strategy = sysctl_intvec,
743 .extra1 = &zero 743 .extra1 = &zero
744 }, 744 },
745 { .ctl_name = 0 } 745 { .ctl_name = 0 }
@@ -752,7 +752,7 @@ static struct ctl_table ipv4_net_table[] = {
752 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, 752 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
753 .maxlen = sizeof(int), 753 .maxlen = sizeof(int),
754 .mode = 0644, 754 .mode = 0644,
755 .proc_handler = &proc_dointvec 755 .proc_handler = proc_dointvec
756 }, 756 },
757 { 757 {
758 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, 758 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
@@ -760,7 +760,7 @@ static struct ctl_table ipv4_net_table[] = {
760 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, 760 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
761 .maxlen = sizeof(int), 761 .maxlen = sizeof(int),
762 .mode = 0644, 762 .mode = 0644,
763 .proc_handler = &proc_dointvec 763 .proc_handler = proc_dointvec
764 }, 764 },
765 { 765 {
766 .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, 766 .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
@@ -768,7 +768,7 @@ static struct ctl_table ipv4_net_table[] = {
768 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, 768 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
769 .maxlen = sizeof(int), 769 .maxlen = sizeof(int),
770 .mode = 0644, 770 .mode = 0644,
771 .proc_handler = &proc_dointvec 771 .proc_handler = proc_dointvec
772 }, 772 },
773 { 773 {
774 .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, 774 .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
@@ -776,7 +776,7 @@ static struct ctl_table ipv4_net_table[] = {
776 .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, 776 .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
777 .maxlen = sizeof(int), 777 .maxlen = sizeof(int),
778 .mode = 0644, 778 .mode = 0644,
779 .proc_handler = &proc_dointvec 779 .proc_handler = proc_dointvec
780 }, 780 },
781 { 781 {
782 .ctl_name = NET_IPV4_ICMP_RATELIMIT, 782 .ctl_name = NET_IPV4_ICMP_RATELIMIT,
@@ -784,8 +784,8 @@ static struct ctl_table ipv4_net_table[] = {
784 .data = &init_net.ipv4.sysctl_icmp_ratelimit, 784 .data = &init_net.ipv4.sysctl_icmp_ratelimit,
785 .maxlen = sizeof(int), 785 .maxlen = sizeof(int),
786 .mode = 0644, 786 .mode = 0644,
787 .proc_handler = &proc_dointvec_ms_jiffies, 787 .proc_handler = proc_dointvec_ms_jiffies,
788 .strategy = &sysctl_ms_jiffies 788 .strategy = sysctl_ms_jiffies
789 }, 789 },
790 { 790 {
791 .ctl_name = NET_IPV4_ICMP_RATEMASK, 791 .ctl_name = NET_IPV4_ICMP_RATEMASK,
@@ -793,7 +793,15 @@ static struct ctl_table ipv4_net_table[] = {
793 .data = &init_net.ipv4.sysctl_icmp_ratemask, 793 .data = &init_net.ipv4.sysctl_icmp_ratemask,
794 .maxlen = sizeof(int), 794 .maxlen = sizeof(int),
795 .mode = 0644, 795 .mode = 0644,
796 .proc_handler = &proc_dointvec 796 .proc_handler = proc_dointvec
797 },
798 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "rt_cache_rebuild_count",
801 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
802 .maxlen = sizeof(int),
803 .mode = 0644,
804 .proc_handler = proc_dointvec
797 }, 805 },
798 { } 806 { }
799}; 807};
@@ -827,8 +835,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
827 &net->ipv4.sysctl_icmp_ratelimit; 835 &net->ipv4.sysctl_icmp_ratelimit;
828 table[5].data = 836 table[5].data =
829 &net->ipv4.sysctl_icmp_ratemask; 837 &net->ipv4.sysctl_icmp_ratemask;
838 table[6].data =
839 &net->ipv4.sysctl_rt_cache_rebuild_count;
830 } 840 }
831 841
842 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
843
832 net->ipv4.ipv4_hdr = register_net_sysctl_table(net, 844 net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
833 net_ipv4_ctl_path, table); 845 net_ipv4_ctl_path, table);
834 if (net->ipv4.ipv4_hdr == NULL) 846 if (net->ipv4.ipv4_hdr == NULL)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c5aca0bb116a..1f3d52946b3b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,8 +277,7 @@
277 277
278int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; 278int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
279 279
280atomic_t tcp_orphan_count = ATOMIC_INIT(0); 280struct percpu_counter tcp_orphan_count;
281
282EXPORT_SYMBOL_GPL(tcp_orphan_count); 281EXPORT_SYMBOL_GPL(tcp_orphan_count);
283 282
284int sysctl_tcp_mem[3] __read_mostly; 283int sysctl_tcp_mem[3] __read_mostly;
@@ -290,9 +289,12 @@ EXPORT_SYMBOL(sysctl_tcp_rmem);
290EXPORT_SYMBOL(sysctl_tcp_wmem); 289EXPORT_SYMBOL(sysctl_tcp_wmem);
291 290
292atomic_t tcp_memory_allocated; /* Current allocated memory. */ 291atomic_t tcp_memory_allocated; /* Current allocated memory. */
293atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
294
295EXPORT_SYMBOL(tcp_memory_allocated); 292EXPORT_SYMBOL(tcp_memory_allocated);
293
294/*
295 * Current number of TCP sockets.
296 */
297struct percpu_counter tcp_sockets_allocated;
296EXPORT_SYMBOL(tcp_sockets_allocated); 298EXPORT_SYMBOL(tcp_sockets_allocated);
297 299
298/* 300/*
@@ -1680,7 +1682,7 @@ void tcp_set_state(struct sock *sk, int state)
1680 inet_put_port(sk); 1682 inet_put_port(sk);
1681 /* fall through */ 1683 /* fall through */
1682 default: 1684 default:
1683 if (oldstate==TCP_ESTABLISHED) 1685 if (oldstate == TCP_ESTABLISHED)
1684 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); 1686 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1685 } 1687 }
1686 1688
@@ -1690,7 +1692,7 @@ void tcp_set_state(struct sock *sk, int state)
1690 sk->sk_state = state; 1692 sk->sk_state = state;
1691 1693
1692#ifdef STATE_TRACE 1694#ifdef STATE_TRACE
1693 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); 1695 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1694#endif 1696#endif
1695} 1697}
1696EXPORT_SYMBOL_GPL(tcp_set_state); 1698EXPORT_SYMBOL_GPL(tcp_set_state);
@@ -1834,7 +1836,7 @@ adjudge_to_death:
1834 state = sk->sk_state; 1836 state = sk->sk_state;
1835 sock_hold(sk); 1837 sock_hold(sk);
1836 sock_orphan(sk); 1838 sock_orphan(sk);
1837 atomic_inc(sk->sk_prot->orphan_count); 1839 percpu_counter_inc(sk->sk_prot->orphan_count);
1838 1840
1839 /* It is the last release_sock in its life. It will remove backlog. */ 1841 /* It is the last release_sock in its life. It will remove backlog. */
1840 release_sock(sk); 1842 release_sock(sk);
@@ -1885,9 +1887,11 @@ adjudge_to_death:
1885 } 1887 }
1886 } 1888 }
1887 if (sk->sk_state != TCP_CLOSE) { 1889 if (sk->sk_state != TCP_CLOSE) {
1890 int orphan_count = percpu_counter_read_positive(
1891 sk->sk_prot->orphan_count);
1892
1888 sk_mem_reclaim(sk); 1893 sk_mem_reclaim(sk);
1889 if (tcp_too_many_orphans(sk, 1894 if (tcp_too_many_orphans(sk, orphan_count)) {
1890 atomic_read(sk->sk_prot->orphan_count))) {
1891 if (net_ratelimit()) 1895 if (net_ratelimit())
1892 printk(KERN_INFO "TCP: too many of orphaned " 1896 printk(KERN_INFO "TCP: too many of orphaned "
1893 "sockets\n"); 1897 "sockets\n");
@@ -2461,6 +2465,106 @@ out:
2461} 2465}
2462EXPORT_SYMBOL(tcp_tso_segment); 2466EXPORT_SYMBOL(tcp_tso_segment);
2463 2467
2468struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2469{
2470 struct sk_buff **pp = NULL;
2471 struct sk_buff *p;
2472 struct tcphdr *th;
2473 struct tcphdr *th2;
2474 unsigned int thlen;
2475 unsigned int flags;
2476 unsigned int total;
2477 unsigned int mss = 1;
2478 int flush = 1;
2479
2480 if (!pskb_may_pull(skb, sizeof(*th)))
2481 goto out;
2482
2483 th = tcp_hdr(skb);
2484 thlen = th->doff * 4;
2485 if (thlen < sizeof(*th))
2486 goto out;
2487
2488 if (!pskb_may_pull(skb, thlen))
2489 goto out;
2490
2491 th = tcp_hdr(skb);
2492 __skb_pull(skb, thlen);
2493
2494 flags = tcp_flag_word(th);
2495
2496 for (; (p = *head); head = &p->next) {
2497 if (!NAPI_GRO_CB(p)->same_flow)
2498 continue;
2499
2500 th2 = tcp_hdr(p);
2501
2502 if (th->source != th2->source || th->dest != th2->dest) {
2503 NAPI_GRO_CB(p)->same_flow = 0;
2504 continue;
2505 }
2506
2507 goto found;
2508 }
2509
2510 goto out_check_final;
2511
2512found:
2513 flush = NAPI_GRO_CB(p)->flush;
2514 flush |= flags & TCP_FLAG_CWR;
2515 flush |= (flags ^ tcp_flag_word(th2)) &
2516 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
2517 flush |= th->ack_seq != th2->ack_seq || th->window != th2->window;
2518 flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th));
2519
2520 total = p->len;
2521 mss = total;
2522 if (skb_shinfo(p)->frag_list)
2523 mss = skb_shinfo(p)->frag_list->len;
2524
2525 flush |= skb->len > mss || skb->len <= 0;
2526 flush |= ntohl(th2->seq) + total != ntohl(th->seq);
2527
2528 if (flush || skb_gro_receive(head, skb)) {
2529 mss = 1;
2530 goto out_check_final;
2531 }
2532
2533 p = *head;
2534 th2 = tcp_hdr(p);
2535 tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
2536
2537out_check_final:
2538 flush = skb->len < mss;
2539 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
2540 TCP_FLAG_SYN | TCP_FLAG_FIN);
2541
2542 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
2543 pp = head;
2544
2545out:
2546 NAPI_GRO_CB(skb)->flush |= flush;
2547
2548 return pp;
2549}
2550
2551int tcp_gro_complete(struct sk_buff *skb)
2552{
2553 struct tcphdr *th = tcp_hdr(skb);
2554
2555 skb->csum_start = skb_transport_header(skb) - skb->head;
2556 skb->csum_offset = offsetof(struct tcphdr, check);
2557 skb->ip_summed = CHECKSUM_PARTIAL;
2558
2559 skb_shinfo(skb)->gso_size = skb_shinfo(skb)->frag_list->len;
2560 skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
2561
2562 if (th->cwr)
2563 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
2564
2565 return 0;
2566}
2567
2464#ifdef CONFIG_TCP_MD5SIG 2568#ifdef CONFIG_TCP_MD5SIG
2465static unsigned long tcp_md5sig_users; 2569static unsigned long tcp_md5sig_users;
2466static struct tcp_md5sig_pool **tcp_md5sig_pool; 2570static struct tcp_md5sig_pool **tcp_md5sig_pool;
@@ -2650,7 +2754,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
2650 2754
2651void tcp_done(struct sock *sk) 2755void tcp_done(struct sock *sk)
2652{ 2756{
2653 if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 2757 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2654 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 2758 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
2655 2759
2656 tcp_set_state(sk, TCP_CLOSE); 2760 tcp_set_state(sk, TCP_CLOSE);
@@ -2685,6 +2789,8 @@ void __init tcp_init(void)
2685 2789
2686 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 2790 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2687 2791
2792 percpu_counter_init(&tcp_sockets_allocated, 0);
2793 percpu_counter_init(&tcp_orphan_count, 0);
2688 tcp_hashinfo.bind_bucket_cachep = 2794 tcp_hashinfo.bind_bucket_cachep =
2689 kmem_cache_create("tcp_bind_bucket", 2795 kmem_cache_create("tcp_bind_bucket",
2690 sizeof(struct inet_bind_bucket), 0, 2796 sizeof(struct inet_bind_bucket), 0,
@@ -2707,8 +2813,8 @@ void __init tcp_init(void)
2707 thash_entries ? 0 : 512 * 1024); 2813 thash_entries ? 0 : 512 * 1024);
2708 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; 2814 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2709 for (i = 0; i < tcp_hashinfo.ehash_size; i++) { 2815 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2710 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); 2816 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
2711 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); 2817 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
2712 } 2818 }
2713 if (inet_ehash_locks_alloc(&tcp_hashinfo)) 2819 if (inet_ehash_locks_alloc(&tcp_hashinfo))
2714 panic("TCP: failed to alloc ehash_locks"); 2820 panic("TCP: failed to alloc ehash_locks");
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 4a1221e5e8ee..ee467ec40c4f 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -1,13 +1,23 @@
1/* 1/*
2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.2 2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
3 * Home page: 3 * Home page:
4 * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC 4 * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
5 * This is from the implementation of CUBIC TCP in 5 * This is from the implementation of CUBIC TCP in
6 * Injong Rhee, Lisong Xu. 6 * Sangtae Ha, Injong Rhee and Lisong Xu,
7 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant 7 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
8 * in PFLDnet 2005 8 * in ACM SIGOPS Operating System Review, July 2008.
9 * Available from: 9 * Available from:
10 * http://netsrv.csc.ncsu.edu/export/cubic-paper.pdf 10 * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
11 *
12 * CUBIC integrates a new slow start algorithm, called HyStart.
13 * The details of HyStart are presented in
14 * Sangtae Ha and Injong Rhee,
15 * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
16 * Available from:
17 * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
18 *
19 * All testing results are available from:
20 * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
11 * 21 *
12 * Unless CUBIC is enabled and congestion window is large 22 * Unless CUBIC is enabled and congestion window is large
13 * this behaves the same as the original Reno. 23 * this behaves the same as the original Reno.
@@ -23,12 +33,26 @@
23 */ 33 */
24#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ 34#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
25 35
36/* Two methods of hybrid slow start */
37#define HYSTART_ACK_TRAIN 0x1
38#define HYSTART_DELAY 0x2
39
40/* Number of delay samples for detecting the increase of delay */
41#define HYSTART_MIN_SAMPLES 8
42#define HYSTART_DELAY_MIN (2U<<3)
43#define HYSTART_DELAY_MAX (16U<<3)
44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
45
26static int fast_convergence __read_mostly = 1; 46static int fast_convergence __read_mostly = 1;
27static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ 47static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */
28static int initial_ssthresh __read_mostly; 48static int initial_ssthresh __read_mostly;
29static int bic_scale __read_mostly = 41; 49static int bic_scale __read_mostly = 41;
30static int tcp_friendliness __read_mostly = 1; 50static int tcp_friendliness __read_mostly = 1;
31 51
52static int hystart __read_mostly = 1;
53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
54static int hystart_low_window __read_mostly = 16;
55
32static u32 cube_rtt_scale __read_mostly; 56static u32 cube_rtt_scale __read_mostly;
33static u32 beta_scale __read_mostly; 57static u32 beta_scale __read_mostly;
34static u64 cube_factor __read_mostly; 58static u64 cube_factor __read_mostly;
@@ -44,6 +68,13 @@ module_param(bic_scale, int, 0444);
44MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); 68MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
45module_param(tcp_friendliness, int, 0644); 69module_param(tcp_friendliness, int, 0644);
46MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); 70MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
71module_param(hystart, int, 0644);
72MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
73module_param(hystart_detect, int, 0644);
74MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
75 " 1: packet-train 2: delay 3: both packet-train and delay");
76module_param(hystart_low_window, int, 0644);
77MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
47 78
48/* BIC TCP Parameters */ 79/* BIC TCP Parameters */
49struct bictcp { 80struct bictcp {
@@ -59,7 +90,13 @@ struct bictcp {
59 u32 ack_cnt; /* number of acks */ 90 u32 ack_cnt; /* number of acks */
60 u32 tcp_cwnd; /* estimated tcp cwnd */ 91 u32 tcp_cwnd; /* estimated tcp cwnd */
61#define ACK_RATIO_SHIFT 4 92#define ACK_RATIO_SHIFT 4
62 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ 93 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
94 u8 sample_cnt; /* number of samples to decide curr_rtt */
95 u8 found; /* the exit point is found? */
96 u32 round_start; /* beginning of each round */
97 u32 end_seq; /* end_seq of the round */
98 u32 last_jiffies; /* last time when the ACK spacing is close */
99 u32 curr_rtt; /* the minimum rtt of current round */
63}; 100};
64 101
65static inline void bictcp_reset(struct bictcp *ca) 102static inline void bictcp_reset(struct bictcp *ca)
@@ -76,12 +113,28 @@ static inline void bictcp_reset(struct bictcp *ca)
76 ca->delayed_ack = 2 << ACK_RATIO_SHIFT; 113 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
77 ca->ack_cnt = 0; 114 ca->ack_cnt = 0;
78 ca->tcp_cwnd = 0; 115 ca->tcp_cwnd = 0;
116 ca->found = 0;
117}
118
119static inline void bictcp_hystart_reset(struct sock *sk)
120{
121 struct tcp_sock *tp = tcp_sk(sk);
122 struct bictcp *ca = inet_csk_ca(sk);
123
124 ca->round_start = ca->last_jiffies = jiffies;
125 ca->end_seq = tp->snd_nxt;
126 ca->curr_rtt = 0;
127 ca->sample_cnt = 0;
79} 128}
80 129
81static void bictcp_init(struct sock *sk) 130static void bictcp_init(struct sock *sk)
82{ 131{
83 bictcp_reset(inet_csk_ca(sk)); 132 bictcp_reset(inet_csk_ca(sk));
84 if (initial_ssthresh) 133
134 if (hystart)
135 bictcp_hystart_reset(sk);
136
137 if (!hystart && initial_ssthresh)
85 tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 138 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
86} 139}
87 140
@@ -235,9 +288,11 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
235 if (!tcp_is_cwnd_limited(sk, in_flight)) 288 if (!tcp_is_cwnd_limited(sk, in_flight))
236 return; 289 return;
237 290
238 if (tp->snd_cwnd <= tp->snd_ssthresh) 291 if (tp->snd_cwnd <= tp->snd_ssthresh) {
292 if (hystart && after(ack, ca->end_seq))
293 bictcp_hystart_reset(sk);
239 tcp_slow_start(tp); 294 tcp_slow_start(tp);
240 else { 295 } else {
241 bictcp_update(ca, tp->snd_cwnd); 296 bictcp_update(ca, tp->snd_cwnd);
242 297
243 /* In dangerous area, increase slowly. 298 /* In dangerous area, increase slowly.
@@ -281,8 +336,45 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
281 336
282static void bictcp_state(struct sock *sk, u8 new_state) 337static void bictcp_state(struct sock *sk, u8 new_state)
283{ 338{
284 if (new_state == TCP_CA_Loss) 339 if (new_state == TCP_CA_Loss) {
285 bictcp_reset(inet_csk_ca(sk)); 340 bictcp_reset(inet_csk_ca(sk));
341 bictcp_hystart_reset(sk);
342 }
343}
344
345static void hystart_update(struct sock *sk, u32 delay)
346{
347 struct tcp_sock *tp = tcp_sk(sk);
348 struct bictcp *ca = inet_csk_ca(sk);
349
350 if (!(ca->found & hystart_detect)) {
351 u32 curr_jiffies = jiffies;
352
353 /* first detection parameter - ack-train detection */
354 if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) {
355 ca->last_jiffies = curr_jiffies;
356 if (curr_jiffies - ca->round_start >= ca->delay_min>>4)
357 ca->found |= HYSTART_ACK_TRAIN;
358 }
359
360 /* obtain the minimum delay of more than sampling packets */
361 if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
362 if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
363 ca->curr_rtt = delay;
364
365 ca->sample_cnt++;
366 } else {
367 if (ca->curr_rtt > ca->delay_min +
368 HYSTART_DELAY_THRESH(ca->delay_min>>4))
369 ca->found |= HYSTART_DELAY;
370 }
371 /*
372 * Either one of two conditions are met,
373 * we exit from slow start immediately.
374 */
375 if (ca->found & hystart_detect)
376 tp->snd_ssthresh = tp->snd_cwnd;
377 }
286} 378}
287 379
288/* Track delayed acknowledgment ratio using sliding window 380/* Track delayed acknowledgment ratio using sliding window
@@ -291,6 +383,7 @@ static void bictcp_state(struct sock *sk, u8 new_state)
291static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) 383static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
292{ 384{
293 const struct inet_connection_sock *icsk = inet_csk(sk); 385 const struct inet_connection_sock *icsk = inet_csk(sk);
386 const struct tcp_sock *tp = tcp_sk(sk);
294 struct bictcp *ca = inet_csk_ca(sk); 387 struct bictcp *ca = inet_csk_ca(sk);
295 u32 delay; 388 u32 delay;
296 389
@@ -314,6 +407,11 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
314 /* first time call or link delay decreases */ 407 /* first time call or link delay decreases */
315 if (ca->delay_min == 0 || ca->delay_min > delay) 408 if (ca->delay_min == 0 || ca->delay_min > delay)
316 ca->delay_min = delay; 409 ca->delay_min = delay;
410
411 /* hystart triggers when cwnd is larger than some threshold */
412 if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
413 tp->snd_cwnd >= hystart_low_window)
414 hystart_update(sk, delay);
317} 415}
318 416
319static struct tcp_congestion_ops cubictcp = { 417static struct tcp_congestion_ops cubictcp = {
@@ -372,4 +470,4 @@ module_exit(cubictcp_unregister);
372MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); 470MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
373MODULE_LICENSE("GPL"); 471MODULE_LICENSE("GPL");
374MODULE_DESCRIPTION("CUBIC TCP"); 472MODULE_DESCRIPTION("CUBIC TCP");
375MODULE_VERSION("2.2"); 473MODULE_VERSION("2.3");
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 838d491dfda7..fcbcd4ff6c5f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -34,7 +34,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
34 tcp_get_info(sk, info); 34 tcp_get_info(sk, info);
35} 35}
36 36
37static struct inet_diag_handler tcp_diag_handler = { 37static const struct inet_diag_handler tcp_diag_handler = {
38 .idiag_hashinfo = &tcp_hashinfo, 38 .idiag_hashinfo = &tcp_hashinfo,
39 .idiag_get_info = tcp_diag_get_info, 39 .idiag_get_info = tcp_diag_get_info,
40 .idiag_type = TCPDIAG_GETSOCK, 40 .idiag_type = TCPDIAG_GETSOCK,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d77c0d29e239..99b7ecbe8893 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -701,13 +701,10 @@ static inline void tcp_set_rto(struct sock *sk)
701 * all the algo is pure shit and should be replaced 701 * all the algo is pure shit and should be replaced
702 * with correct one. It is exactly, which we pretend to do. 702 * with correct one. It is exactly, which we pretend to do.
703 */ 703 */
704}
705 704
706/* NOTE: clamping at TCP_RTO_MIN is not required, current algo 705 /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
707 * guarantees that rto is higher. 706 * guarantees that rto is higher.
708 */ 707 */
709static inline void tcp_bound_rto(struct sock *sk)
710{
711 if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) 708 if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
712 inet_csk(sk)->icsk_rto = TCP_RTO_MAX; 709 inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
713} 710}
@@ -928,7 +925,6 @@ static void tcp_init_metrics(struct sock *sk)
928 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 925 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
929 } 926 }
930 tcp_set_rto(sk); 927 tcp_set_rto(sk);
931 tcp_bound_rto(sk);
932 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 928 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
933 goto reset; 929 goto reset;
934 tp->snd_cwnd = tcp_init_cwnd(tp, dst); 930 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
@@ -1002,7 +998,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
1002 } 998 }
1003} 999}
1004 1000
1005void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) 1001static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
1002 struct sk_buff *skb)
1006{ 1003{
1007 tcp_verify_retransmit_hint(tp, skb); 1004 tcp_verify_retransmit_hint(tp, skb);
1008 1005
@@ -1236,31 +1233,58 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1236 return dup_sack; 1233 return dup_sack;
1237} 1234}
1238 1235
1236struct tcp_sacktag_state {
1237 int reord;
1238 int fack_count;
1239 int flag;
1240};
1241
1239/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1242/* Check if skb is fully within the SACK block. In presence of GSO skbs,
1240 * the incoming SACK may not exactly match but we can find smaller MSS 1243 * the incoming SACK may not exactly match but we can find smaller MSS
1241 * aligned portion of it that matches. Therefore we might need to fragment 1244 * aligned portion of it that matches. Therefore we might need to fragment
1242 * which may fail and creates some hassle (caller must handle error case 1245 * which may fail and creates some hassle (caller must handle error case
1243 * returns). 1246 * returns).
1247 *
1248 * FIXME: this could be merged to shift decision code
1244 */ 1249 */
1245static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1250static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1246 u32 start_seq, u32 end_seq) 1251 u32 start_seq, u32 end_seq)
1247{ 1252{
1248 int in_sack, err; 1253 int in_sack, err;
1249 unsigned int pkt_len; 1254 unsigned int pkt_len;
1255 unsigned int mss;
1250 1256
1251 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && 1257 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1252 !before(end_seq, TCP_SKB_CB(skb)->end_seq); 1258 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1253 1259
1254 if (tcp_skb_pcount(skb) > 1 && !in_sack && 1260 if (tcp_skb_pcount(skb) > 1 && !in_sack &&
1255 after(TCP_SKB_CB(skb)->end_seq, start_seq)) { 1261 after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1256 1262 mss = tcp_skb_mss(skb);
1257 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); 1263 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1258 1264
1259 if (!in_sack) 1265 if (!in_sack) {
1260 pkt_len = start_seq - TCP_SKB_CB(skb)->seq; 1266 pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1261 else 1267 if (pkt_len < mss)
1268 pkt_len = mss;
1269 } else {
1262 pkt_len = end_seq - TCP_SKB_CB(skb)->seq; 1270 pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1263 err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); 1271 if (pkt_len < mss)
1272 return -EINVAL;
1273 }
1274
1275 /* Round if necessary so that SACKs cover only full MSSes
1276 * and/or the remaining small portion (if present)
1277 */
1278 if (pkt_len > mss) {
1279 unsigned int new_len = (pkt_len / mss) * mss;
1280 if (!in_sack && new_len < pkt_len) {
1281 new_len += mss;
1282 if (new_len > skb->len)
1283 return 0;
1284 }
1285 pkt_len = new_len;
1286 }
1287 err = tcp_fragment(sk, skb, pkt_len, mss);
1264 if (err < 0) 1288 if (err < 0)
1265 return err; 1289 return err;
1266 } 1290 }
@@ -1268,24 +1292,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1268 return in_sack; 1292 return in_sack;
1269} 1293}
1270 1294
1271static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, 1295static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1272 int *reord, int dup_sack, int fack_count) 1296 struct tcp_sacktag_state *state,
1297 int dup_sack, int pcount)
1273{ 1298{
1274 struct tcp_sock *tp = tcp_sk(sk); 1299 struct tcp_sock *tp = tcp_sk(sk);
1275 u8 sacked = TCP_SKB_CB(skb)->sacked; 1300 u8 sacked = TCP_SKB_CB(skb)->sacked;
1276 int flag = 0; 1301 int fack_count = state->fack_count;
1277 1302
1278 /* Account D-SACK for retransmitted packet. */ 1303 /* Account D-SACK for retransmitted packet. */
1279 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1304 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1280 if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) 1305 if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1281 tp->undo_retrans--; 1306 tp->undo_retrans--;
1282 if (sacked & TCPCB_SACKED_ACKED) 1307 if (sacked & TCPCB_SACKED_ACKED)
1283 *reord = min(fack_count, *reord); 1308 state->reord = min(fack_count, state->reord);
1284 } 1309 }
1285 1310
1286 /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1311 /* Nothing to do; acked frame is about to be dropped (was ACKed). */
1287 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) 1312 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1288 return flag; 1313 return sacked;
1289 1314
1290 if (!(sacked & TCPCB_SACKED_ACKED)) { 1315 if (!(sacked & TCPCB_SACKED_ACKED)) {
1291 if (sacked & TCPCB_SACKED_RETRANS) { 1316 if (sacked & TCPCB_SACKED_RETRANS) {
@@ -1294,10 +1319,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1294 * that retransmission is still in flight. 1319 * that retransmission is still in flight.
1295 */ 1320 */
1296 if (sacked & TCPCB_LOST) { 1321 if (sacked & TCPCB_LOST) {
1297 TCP_SKB_CB(skb)->sacked &= 1322 sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1298 ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); 1323 tp->lost_out -= pcount;
1299 tp->lost_out -= tcp_skb_pcount(skb); 1324 tp->retrans_out -= pcount;
1300 tp->retrans_out -= tcp_skb_pcount(skb);
1301 } 1325 }
1302 } else { 1326 } else {
1303 if (!(sacked & TCPCB_RETRANS)) { 1327 if (!(sacked & TCPCB_RETRANS)) {
@@ -1306,56 +1330,280 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1306 */ 1330 */
1307 if (before(TCP_SKB_CB(skb)->seq, 1331 if (before(TCP_SKB_CB(skb)->seq,
1308 tcp_highest_sack_seq(tp))) 1332 tcp_highest_sack_seq(tp)))
1309 *reord = min(fack_count, *reord); 1333 state->reord = min(fack_count,
1334 state->reord);
1310 1335
1311 /* SACK enhanced F-RTO (RFC4138; Appendix B) */ 1336 /* SACK enhanced F-RTO (RFC4138; Appendix B) */
1312 if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) 1337 if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
1313 flag |= FLAG_ONLY_ORIG_SACKED; 1338 state->flag |= FLAG_ONLY_ORIG_SACKED;
1314 } 1339 }
1315 1340
1316 if (sacked & TCPCB_LOST) { 1341 if (sacked & TCPCB_LOST) {
1317 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1342 sacked &= ~TCPCB_LOST;
1318 tp->lost_out -= tcp_skb_pcount(skb); 1343 tp->lost_out -= pcount;
1319 } 1344 }
1320 } 1345 }
1321 1346
1322 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; 1347 sacked |= TCPCB_SACKED_ACKED;
1323 flag |= FLAG_DATA_SACKED; 1348 state->flag |= FLAG_DATA_SACKED;
1324 tp->sacked_out += tcp_skb_pcount(skb); 1349 tp->sacked_out += pcount;
1325 1350
1326 fack_count += tcp_skb_pcount(skb); 1351 fack_count += pcount;
1327 1352
1328 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1353 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1329 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && 1354 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
1330 before(TCP_SKB_CB(skb)->seq, 1355 before(TCP_SKB_CB(skb)->seq,
1331 TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1356 TCP_SKB_CB(tp->lost_skb_hint)->seq))
1332 tp->lost_cnt_hint += tcp_skb_pcount(skb); 1357 tp->lost_cnt_hint += pcount;
1333 1358
1334 if (fack_count > tp->fackets_out) 1359 if (fack_count > tp->fackets_out)
1335 tp->fackets_out = fack_count; 1360 tp->fackets_out = fack_count;
1336
1337 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
1338 tcp_advance_highest_sack(sk, skb);
1339 } 1361 }
1340 1362
1341 /* D-SACK. We can detect redundant retransmission in S|R and plain R 1363 /* D-SACK. We can detect redundant retransmission in S|R and plain R
1342 * frames and clear it. undo_retrans is decreased above, L|R frames 1364 * frames and clear it. undo_retrans is decreased above, L|R frames
1343 * are accounted above as well. 1365 * are accounted above as well.
1344 */ 1366 */
1345 if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { 1367 if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
1346 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1368 sacked &= ~TCPCB_SACKED_RETRANS;
1347 tp->retrans_out -= tcp_skb_pcount(skb); 1369 tp->retrans_out -= pcount;
1348 } 1370 }
1349 1371
1350 return flag; 1372 return sacked;
1373}
1374
1375static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1376 struct tcp_sacktag_state *state,
1377 unsigned int pcount, int shifted, int mss)
1378{
1379 struct tcp_sock *tp = tcp_sk(sk);
1380 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
1381
1382 BUG_ON(!pcount);
1383
1384 /* Tweak before seqno plays */
1385 if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint &&
1386 !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
1387 tp->lost_cnt_hint += pcount;
1388
1389 TCP_SKB_CB(prev)->end_seq += shifted;
1390 TCP_SKB_CB(skb)->seq += shifted;
1391
1392 skb_shinfo(prev)->gso_segs += pcount;
1393 BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
1394 skb_shinfo(skb)->gso_segs -= pcount;
1395
1396 /* When we're adding to gso_segs == 1, gso_size will be zero,
1397 * in theory this shouldn't be necessary but as long as DSACK
1398 * code can come after this skb later on it's better to keep
1399 * setting gso_size to something.
1400 */
1401 if (!skb_shinfo(prev)->gso_size) {
1402 skb_shinfo(prev)->gso_size = mss;
1403 skb_shinfo(prev)->gso_type = sk->sk_gso_type;
1404 }
1405
1406 /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1407 if (skb_shinfo(skb)->gso_segs <= 1) {
1408 skb_shinfo(skb)->gso_size = 0;
1409 skb_shinfo(skb)->gso_type = 0;
1410 }
1411
1412 /* We discard results */
1413 tcp_sacktag_one(skb, sk, state, 0, pcount);
1414
1415 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1416 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1417
1418 if (skb->len > 0) {
1419 BUG_ON(!tcp_skb_pcount(skb));
1420 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1421 return 0;
1422 }
1423
1424 /* Whole SKB was eaten :-) */
1425
1426 if (skb == tp->retransmit_skb_hint)
1427 tp->retransmit_skb_hint = prev;
1428 if (skb == tp->scoreboard_skb_hint)
1429 tp->scoreboard_skb_hint = prev;
1430 if (skb == tp->lost_skb_hint) {
1431 tp->lost_skb_hint = prev;
1432 tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1433 }
1434
1435 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
1436 if (skb == tcp_highest_sack(sk))
1437 tcp_advance_highest_sack(sk, skb);
1438
1439 tcp_unlink_write_queue(skb, sk);
1440 sk_wmem_free_skb(sk, skb);
1441
1442 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
1443
1444 return 1;
1445}
1446
1447/* I wish gso_size would have a bit more sane initialization than
1448 * something-or-zero which complicates things
1449 */
1450static int tcp_skb_seglen(struct sk_buff *skb)
1451{
1452 return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
1453}
1454
1455/* Shifting pages past head area doesn't work */
1456static int skb_can_shift(struct sk_buff *skb)
1457{
1458 return !skb_headlen(skb) && skb_is_nonlinear(skb);
1459}
1460
1461/* Try collapsing SACK blocks spanning across multiple skbs to a single
1462 * skb.
1463 */
1464static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1465 struct tcp_sacktag_state *state,
1466 u32 start_seq, u32 end_seq,
1467 int dup_sack)
1468{
1469 struct tcp_sock *tp = tcp_sk(sk);
1470 struct sk_buff *prev;
1471 int mss;
1472 int pcount = 0;
1473 int len;
1474 int in_sack;
1475
1476 if (!sk_can_gso(sk))
1477 goto fallback;
1478
1479 /* Normally R but no L won't result in plain S */
1480 if (!dup_sack &&
1481 (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
1482 goto fallback;
1483 if (!skb_can_shift(skb))
1484 goto fallback;
1485 /* This frame is about to be dropped (was ACKed). */
1486 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1487 goto fallback;
1488
1489 /* Can only happen with delayed DSACK + discard craziness */
1490 if (unlikely(skb == tcp_write_queue_head(sk)))
1491 goto fallback;
1492 prev = tcp_write_queue_prev(sk, skb);
1493
1494 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1495 goto fallback;
1496
1497 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1498 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1499
1500 if (in_sack) {
1501 len = skb->len;
1502 pcount = tcp_skb_pcount(skb);
1503 mss = tcp_skb_seglen(skb);
1504
1505 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1506 * drop this restriction as unnecessary
1507 */
1508 if (mss != tcp_skb_seglen(prev))
1509 goto fallback;
1510 } else {
1511 if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1512 goto noop;
1513 /* CHECKME: This is non-MSS split case only?, this will
1514 * cause skipped skbs due to advancing loop btw, original
1515 * has that feature too
1516 */
1517 if (tcp_skb_pcount(skb) <= 1)
1518 goto noop;
1519
1520 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1521 if (!in_sack) {
1522 /* TODO: head merge to next could be attempted here
1523 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1524 * though it might not be worth of the additional hassle
1525 *
1526 * ...we can probably just fallback to what was done
1527 * previously. We could try merging non-SACKed ones
1528 * as well but it probably isn't going to buy off
1529 * because later SACKs might again split them, and
1530 * it would make skb timestamp tracking considerably
1531 * harder problem.
1532 */
1533 goto fallback;
1534 }
1535
1536 len = end_seq - TCP_SKB_CB(skb)->seq;
1537 BUG_ON(len < 0);
1538 BUG_ON(len > skb->len);
1539
1540 /* MSS boundaries should be honoured or else pcount will
1541 * severely break even though it makes things bit trickier.
1542 * Optimize common case to avoid most of the divides
1543 */
1544 mss = tcp_skb_mss(skb);
1545
1546 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1547 * drop this restriction as unnecessary
1548 */
1549 if (mss != tcp_skb_seglen(prev))
1550 goto fallback;
1551
1552 if (len == mss) {
1553 pcount = 1;
1554 } else if (len < mss) {
1555 goto noop;
1556 } else {
1557 pcount = len / mss;
1558 len = pcount * mss;
1559 }
1560 }
1561
1562 if (!skb_shift(prev, skb, len))
1563 goto fallback;
1564 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss))
1565 goto out;
1566
1567 /* Hole filled allows collapsing with the next as well, this is very
1568 * useful when hole on every nth skb pattern happens
1569 */
1570 if (prev == tcp_write_queue_tail(sk))
1571 goto out;
1572 skb = tcp_write_queue_next(sk, prev);
1573
1574 if (!skb_can_shift(skb) ||
1575 (skb == tcp_send_head(sk)) ||
1576 ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
1577 (mss != tcp_skb_seglen(skb)))
1578 goto out;
1579
1580 len = skb->len;
1581 if (skb_shift(prev, skb, len)) {
1582 pcount += tcp_skb_pcount(skb);
1583 tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss);
1584 }
1585
1586out:
1587 state->fack_count += pcount;
1588 return prev;
1589
1590noop:
1591 return skb;
1592
1593fallback:
1594 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1595 return NULL;
1351} 1596}
1352 1597
1353static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, 1598static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1354 struct tcp_sack_block *next_dup, 1599 struct tcp_sack_block *next_dup,
1600 struct tcp_sacktag_state *state,
1355 u32 start_seq, u32 end_seq, 1601 u32 start_seq, u32 end_seq,
1356 int dup_sack_in, int *fack_count, 1602 int dup_sack_in)
1357 int *reord, int *flag)
1358{ 1603{
1604 struct tcp_sock *tp = tcp_sk(sk);
1605 struct sk_buff *tmp;
1606
1359 tcp_for_write_queue_from(skb, sk) { 1607 tcp_for_write_queue_from(skb, sk) {
1360 int in_sack = 0; 1608 int in_sack = 0;
1361 int dup_sack = dup_sack_in; 1609 int dup_sack = dup_sack_in;
@@ -1376,17 +1624,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1376 dup_sack = 1; 1624 dup_sack = 1;
1377 } 1625 }
1378 1626
1379 if (in_sack <= 0) 1627 /* skb reference here is a bit tricky to get right, since
1380 in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, 1628 * shifting can eat and free both this skb and the next,
1381 end_seq); 1629 * so not even _safe variant of the loop is enough.
1630 */
1631 if (in_sack <= 0) {
1632 tmp = tcp_shift_skb_data(sk, skb, state,
1633 start_seq, end_seq, dup_sack);
1634 if (tmp != NULL) {
1635 if (tmp != skb) {
1636 skb = tmp;
1637 continue;
1638 }
1639
1640 in_sack = 0;
1641 } else {
1642 in_sack = tcp_match_skb_to_sack(sk, skb,
1643 start_seq,
1644 end_seq);
1645 }
1646 }
1647
1382 if (unlikely(in_sack < 0)) 1648 if (unlikely(in_sack < 0))
1383 break; 1649 break;
1384 1650
1385 if (in_sack) 1651 if (in_sack) {
1386 *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, 1652 TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
1387 *fack_count); 1653 state,
1654 dup_sack,
1655 tcp_skb_pcount(skb));
1656
1657 if (!before(TCP_SKB_CB(skb)->seq,
1658 tcp_highest_sack_seq(tp)))
1659 tcp_advance_highest_sack(sk, skb);
1660 }
1388 1661
1389 *fack_count += tcp_skb_pcount(skb); 1662 state->fack_count += tcp_skb_pcount(skb);
1390 } 1663 }
1391 return skb; 1664 return skb;
1392} 1665}
@@ -1395,16 +1668,17 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1395 * a normal way 1668 * a normal way
1396 */ 1669 */
1397static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, 1670static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1398 u32 skip_to_seq, int *fack_count) 1671 struct tcp_sacktag_state *state,
1672 u32 skip_to_seq)
1399{ 1673{
1400 tcp_for_write_queue_from(skb, sk) { 1674 tcp_for_write_queue_from(skb, sk) {
1401 if (skb == tcp_send_head(sk)) 1675 if (skb == tcp_send_head(sk))
1402 break; 1676 break;
1403 1677
1404 if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) 1678 if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
1405 break; 1679 break;
1406 1680
1407 *fack_count += tcp_skb_pcount(skb); 1681 state->fack_count += tcp_skb_pcount(skb);
1408 } 1682 }
1409 return skb; 1683 return skb;
1410} 1684}
@@ -1412,18 +1686,17 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1412static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, 1686static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1413 struct sock *sk, 1687 struct sock *sk,
1414 struct tcp_sack_block *next_dup, 1688 struct tcp_sack_block *next_dup,
1415 u32 skip_to_seq, 1689 struct tcp_sacktag_state *state,
1416 int *fack_count, int *reord, 1690 u32 skip_to_seq)
1417 int *flag)
1418{ 1691{
1419 if (next_dup == NULL) 1692 if (next_dup == NULL)
1420 return skb; 1693 return skb;
1421 1694
1422 if (before(next_dup->start_seq, skip_to_seq)) { 1695 if (before(next_dup->start_seq, skip_to_seq)) {
1423 skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); 1696 skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1424 skb = tcp_sacktag_walk(skb, sk, NULL, 1697 skb = tcp_sacktag_walk(skb, sk, NULL, state,
1425 next_dup->start_seq, next_dup->end_seq, 1698 next_dup->start_seq, next_dup->end_seq,
1426 1, fack_count, reord, flag); 1699 1);
1427 } 1700 }
1428 1701
1429 return skb; 1702 return skb;
@@ -1445,16 +1718,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1445 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); 1718 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1446 struct tcp_sack_block sp[TCP_NUM_SACKS]; 1719 struct tcp_sack_block sp[TCP_NUM_SACKS];
1447 struct tcp_sack_block *cache; 1720 struct tcp_sack_block *cache;
1721 struct tcp_sacktag_state state;
1448 struct sk_buff *skb; 1722 struct sk_buff *skb;
1449 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); 1723 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1450 int used_sacks; 1724 int used_sacks;
1451 int reord = tp->packets_out;
1452 int flag = 0;
1453 int found_dup_sack = 0; 1725 int found_dup_sack = 0;
1454 int fack_count;
1455 int i, j; 1726 int i, j;
1456 int first_sack_index; 1727 int first_sack_index;
1457 1728
1729 state.flag = 0;
1730 state.reord = tp->packets_out;
1731
1458 if (!tp->sacked_out) { 1732 if (!tp->sacked_out) {
1459 if (WARN_ON(tp->fackets_out)) 1733 if (WARN_ON(tp->fackets_out))
1460 tp->fackets_out = 0; 1734 tp->fackets_out = 0;
@@ -1464,7 +1738,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1464 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, 1738 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1465 num_sacks, prior_snd_una); 1739 num_sacks, prior_snd_una);
1466 if (found_dup_sack) 1740 if (found_dup_sack)
1467 flag |= FLAG_DSACKING_ACK; 1741 state.flag |= FLAG_DSACKING_ACK;
1468 1742
1469 /* Eliminate too old ACKs, but take into 1743 /* Eliminate too old ACKs, but take into
1470 * account more or less fresh ones, they can 1744 * account more or less fresh ones, they can
@@ -1533,7 +1807,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1533 } 1807 }
1534 1808
1535 skb = tcp_write_queue_head(sk); 1809 skb = tcp_write_queue_head(sk);
1536 fack_count = 0; 1810 state.fack_count = 0;
1537 i = 0; 1811 i = 0;
1538 1812
1539 if (!tp->sacked_out) { 1813 if (!tp->sacked_out) {
@@ -1558,7 +1832,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1558 1832
1559 /* Event "B" in the comment above. */ 1833 /* Event "B" in the comment above. */
1560 if (after(end_seq, tp->high_seq)) 1834 if (after(end_seq, tp->high_seq))
1561 flag |= FLAG_DATA_LOST; 1835 state.flag |= FLAG_DATA_LOST;
1562 1836
1563 /* Skip too early cached blocks */ 1837 /* Skip too early cached blocks */
1564 while (tcp_sack_cache_ok(tp, cache) && 1838 while (tcp_sack_cache_ok(tp, cache) &&
@@ -1571,13 +1845,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1571 1845
1572 /* Head todo? */ 1846 /* Head todo? */
1573 if (before(start_seq, cache->start_seq)) { 1847 if (before(start_seq, cache->start_seq)) {
1574 skb = tcp_sacktag_skip(skb, sk, start_seq, 1848 skb = tcp_sacktag_skip(skb, sk, &state,
1575 &fack_count); 1849 start_seq);
1576 skb = tcp_sacktag_walk(skb, sk, next_dup, 1850 skb = tcp_sacktag_walk(skb, sk, next_dup,
1851 &state,
1577 start_seq, 1852 start_seq,
1578 cache->start_seq, 1853 cache->start_seq,
1579 dup_sack, &fack_count, 1854 dup_sack);
1580 &reord, &flag);
1581 } 1855 }
1582 1856
1583 /* Rest of the block already fully processed? */ 1857 /* Rest of the block already fully processed? */
@@ -1585,9 +1859,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1585 goto advance_sp; 1859 goto advance_sp;
1586 1860
1587 skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, 1861 skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
1588 cache->end_seq, 1862 &state,
1589 &fack_count, &reord, 1863 cache->end_seq);
1590 &flag);
1591 1864
1592 /* ...tail remains todo... */ 1865 /* ...tail remains todo... */
1593 if (tcp_highest_sack_seq(tp) == cache->end_seq) { 1866 if (tcp_highest_sack_seq(tp) == cache->end_seq) {
@@ -1595,13 +1868,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1595 skb = tcp_highest_sack(sk); 1868 skb = tcp_highest_sack(sk);
1596 if (skb == NULL) 1869 if (skb == NULL)
1597 break; 1870 break;
1598 fack_count = tp->fackets_out; 1871 state.fack_count = tp->fackets_out;
1599 cache++; 1872 cache++;
1600 goto walk; 1873 goto walk;
1601 } 1874 }
1602 1875
1603 skb = tcp_sacktag_skip(skb, sk, cache->end_seq, 1876 skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
1604 &fack_count);
1605 /* Check overlap against next cached too (past this one already) */ 1877 /* Check overlap against next cached too (past this one already) */
1606 cache++; 1878 cache++;
1607 continue; 1879 continue;
@@ -1611,20 +1883,20 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1611 skb = tcp_highest_sack(sk); 1883 skb = tcp_highest_sack(sk);
1612 if (skb == NULL) 1884 if (skb == NULL)
1613 break; 1885 break;
1614 fack_count = tp->fackets_out; 1886 state.fack_count = tp->fackets_out;
1615 } 1887 }
1616 skb = tcp_sacktag_skip(skb, sk, start_seq, &fack_count); 1888 skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
1617 1889
1618walk: 1890walk:
1619 skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, 1891 skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
1620 dup_sack, &fack_count, &reord, &flag); 1892 start_seq, end_seq, dup_sack);
1621 1893
1622advance_sp: 1894advance_sp:
1623 /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct 1895 /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
1624 * due to in-order walk 1896 * due to in-order walk
1625 */ 1897 */
1626 if (after(end_seq, tp->frto_highmark)) 1898 if (after(end_seq, tp->frto_highmark))
1627 flag &= ~FLAG_ONLY_ORIG_SACKED; 1899 state.flag &= ~FLAG_ONLY_ORIG_SACKED;
1628 1900
1629 i++; 1901 i++;
1630 } 1902 }
@@ -1641,10 +1913,10 @@ advance_sp:
1641 1913
1642 tcp_verify_left_out(tp); 1914 tcp_verify_left_out(tp);
1643 1915
1644 if ((reord < tp->fackets_out) && 1916 if ((state.reord < tp->fackets_out) &&
1645 ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && 1917 ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
1646 (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) 1918 (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
1647 tcp_update_reordering(sk, tp->fackets_out - reord, 0); 1919 tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
1648 1920
1649out: 1921out:
1650 1922
@@ -1654,13 +1926,13 @@ out:
1654 WARN_ON((int)tp->retrans_out < 0); 1926 WARN_ON((int)tp->retrans_out < 0);
1655 WARN_ON((int)tcp_packets_in_flight(tp) < 0); 1927 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1656#endif 1928#endif
1657 return flag; 1929 return state.flag;
1658} 1930}
1659 1931
1660/* Limits sacked_out so that sum with lost_out isn't ever larger than 1932/* Limits sacked_out so that sum with lost_out isn't ever larger than
1661 * packets_out. Returns zero if sacked_out adjustement wasn't necessary. 1933 * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
1662 */ 1934 */
1663int tcp_limit_reno_sacked(struct tcp_sock *tp) 1935static int tcp_limit_reno_sacked(struct tcp_sock *tp)
1664{ 1936{
1665 u32 holes; 1937 u32 holes;
1666 1938
@@ -2336,9 +2608,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2336 struct inet_sock *inet = inet_sk(sk); 2608 struct inet_sock *inet = inet_sk(sk);
2337 2609
2338 if (sk->sk_family == AF_INET) { 2610 if (sk->sk_family == AF_INET) {
2339 printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n", 2611 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2340 msg, 2612 msg,
2341 NIPQUAD(inet->daddr), ntohs(inet->dport), 2613 &inet->daddr, ntohs(inet->dport),
2342 tp->snd_cwnd, tcp_left_out(tp), 2614 tp->snd_cwnd, tcp_left_out(tp),
2343 tp->snd_ssthresh, tp->prior_ssthresh, 2615 tp->snd_ssthresh, tp->prior_ssthresh,
2344 tp->packets_out); 2616 tp->packets_out);
@@ -2346,9 +2618,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2346#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 2618#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2347 else if (sk->sk_family == AF_INET6) { 2619 else if (sk->sk_family == AF_INET6) {
2348 struct ipv6_pinfo *np = inet6_sk(sk); 2620 struct ipv6_pinfo *np = inet6_sk(sk);
2349 printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n", 2621 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2350 msg, 2622 msg,
2351 NIP6(np->daddr), ntohs(inet->dport), 2623 &np->daddr, ntohs(inet->dport),
2352 tp->snd_cwnd, tcp_left_out(tp), 2624 tp->snd_cwnd, tcp_left_out(tp),
2353 tp->snd_ssthresh, tp->prior_ssthresh, 2625 tp->snd_ssthresh, tp->prior_ssthresh,
2354 tp->packets_out); 2626 tp->packets_out);
@@ -2559,6 +2831,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
2559 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 2831 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
2560} 2832}
2561 2833
2834/* Do a simple retransmit without using the backoff mechanisms in
2835 * tcp_timer. This is used for path mtu discovery.
2836 * The socket is already locked here.
2837 */
2838void tcp_simple_retransmit(struct sock *sk)
2839{
2840 const struct inet_connection_sock *icsk = inet_csk(sk);
2841 struct tcp_sock *tp = tcp_sk(sk);
2842 struct sk_buff *skb;
2843 unsigned int mss = tcp_current_mss(sk, 0);
2844 u32 prior_lost = tp->lost_out;
2845
2846 tcp_for_write_queue(skb, sk) {
2847 if (skb == tcp_send_head(sk))
2848 break;
2849 if (tcp_skb_seglen(skb) > mss &&
2850 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2851 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2852 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2853 tp->retrans_out -= tcp_skb_pcount(skb);
2854 }
2855 tcp_skb_mark_lost_uncond_verify(tp, skb);
2856 }
2857 }
2858
2859 tcp_clear_retrans_hints_partial(tp);
2860
2861 if (prior_lost == tp->lost_out)
2862 return;
2863
2864 if (tcp_is_reno(tp))
2865 tcp_limit_reno_sacked(tp);
2866
2867 tcp_verify_left_out(tp);
2868
2869 /* Don't muck with the congestion window here.
2870 * Reason is that we do not increase amount of _data_
2871 * in network, but units changed and effective
2872 * cwnd/ssthresh really reduced now.
2873 */
2874 if (icsk->icsk_ca_state != TCP_CA_Loss) {
2875 tp->high_seq = tp->snd_nxt;
2876 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2877 tp->prior_ssthresh = 0;
2878 tp->undo_marker = 0;
2879 tcp_set_ca_state(sk, TCP_CA_Loss);
2880 }
2881 tcp_xmit_retransmit_queue(sk);
2882}
2883
2562/* Process an event, which can update packets-in-flight not trivially. 2884/* Process an event, which can update packets-in-flight not trivially.
2563 * Main goal of this function is to calculate new estimate for left_out, 2885 * Main goal of this function is to calculate new estimate for left_out,
2564 * taking into account both packets sitting in receiver's buffer and 2886 * taking into account both packets sitting in receiver's buffer and
@@ -2730,6 +3052,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2730 tcp_xmit_retransmit_queue(sk); 3052 tcp_xmit_retransmit_queue(sk);
2731} 3053}
2732 3054
3055static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
3056{
3057 tcp_rtt_estimator(sk, seq_rtt);
3058 tcp_set_rto(sk);
3059 inet_csk(sk)->icsk_backoff = 0;
3060}
3061
2733/* Read draft-ietf-tcplw-high-performance before mucking 3062/* Read draft-ietf-tcplw-high-performance before mucking
2734 * with this code. (Supersedes RFC1323) 3063 * with this code. (Supersedes RFC1323)
2735 */ 3064 */
@@ -2751,11 +3080,8 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
2751 * in window is lost... Voila. --ANK (010210) 3080 * in window is lost... Voila. --ANK (010210)
2752 */ 3081 */
2753 struct tcp_sock *tp = tcp_sk(sk); 3082 struct tcp_sock *tp = tcp_sk(sk);
2754 const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 3083
2755 tcp_rtt_estimator(sk, seq_rtt); 3084 tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2756 tcp_set_rto(sk);
2757 inet_csk(sk)->icsk_backoff = 0;
2758 tcp_bound_rto(sk);
2759} 3085}
2760 3086
2761static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) 3087static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
@@ -2772,10 +3098,7 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
2772 if (flag & FLAG_RETRANS_DATA_ACKED) 3098 if (flag & FLAG_RETRANS_DATA_ACKED)
2773 return; 3099 return;
2774 3100
2775 tcp_rtt_estimator(sk, seq_rtt); 3101 tcp_valid_rtt_meas(sk, seq_rtt);
2776 tcp_set_rto(sk);
2777 inet_csk(sk)->icsk_backoff = 0;
2778 tcp_bound_rto(sk);
2779} 3102}
2780 3103
2781static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, 3104static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5c8fa7f1e327..10172487921b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
97} 97}
98#endif 98#endif
99 99
100struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { 100struct inet_hashinfo tcp_hashinfo;
101 .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102 .lhash_users = ATOMIC_INIT(0),
103 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104};
105 101
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 102static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{ 103{
@@ -492,7 +488,7 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
492 skb->csum_offset = offsetof(struct tcphdr, check); 488 skb->csum_offset = offsetof(struct tcphdr, check);
493 } else { 489 } else {
494 th->check = tcp_v4_check(len, inet->saddr, inet->daddr, 490 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495 csum_partial((char *)th, 491 csum_partial(th,
496 th->doff << 2, 492 th->doff << 2,
497 skb->csum)); 493 skb->csum));
498 } 494 }
@@ -726,7 +722,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
726 th->check = tcp_v4_check(skb->len, 722 th->check = tcp_v4_check(skb->len,
727 ireq->loc_addr, 723 ireq->loc_addr,
728 ireq->rmt_addr, 724 ireq->rmt_addr,
729 csum_partial((char *)th, skb->len, 725 csum_partial(th, skb->len,
730 skb->csum)); 726 skb->csum));
731 727
732 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, 728 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
@@ -1139,10 +1135,9 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1139 1135
1140 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1136 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1141 if (net_ratelimit()) { 1137 if (net_ratelimit()) {
1142 printk(KERN_INFO "MD5 Hash failed for " 1138 printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1143 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n", 1139 &iph->saddr, ntohs(th->source),
1144 NIPQUAD(iph->saddr), ntohs(th->source), 1140 &iph->daddr, ntohs(th->dest),
1145 NIPQUAD(iph->daddr), ntohs(th->dest),
1146 genhash ? " tcp_v4_calc_md5_hash failed" : ""); 1141 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1147 } 1142 }
1148 return 1; 1143 return 1;
@@ -1297,10 +1292,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1297 * to destinations, already remembered 1292 * to destinations, already remembered
1298 * to the moment of synflood. 1293 * to the moment of synflood.
1299 */ 1294 */
1300 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " 1295 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1301 "request from " NIPQUAD_FMT "/%u\n", 1296 &saddr, ntohs(tcp_hdr(skb)->source));
1302 NIPQUAD(saddr),
1303 ntohs(tcp_hdr(skb)->source));
1304 goto drop_and_release; 1297 goto drop_and_release;
1305 } 1298 }
1306 1299
@@ -1804,7 +1797,7 @@ static int tcp_v4_init_sock(struct sock *sk)
1804 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1797 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1805 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1798 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1806 1799
1807 atomic_inc(&tcp_sockets_allocated); 1800 percpu_counter_inc(&tcp_sockets_allocated);
1808 1801
1809 return 0; 1802 return 0;
1810} 1803}
@@ -1852,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1852 sk->sk_sndmsg_page = NULL; 1845 sk->sk_sndmsg_page = NULL;
1853 } 1846 }
1854 1847
1855 atomic_dec(&tcp_sockets_allocated); 1848 percpu_counter_dec(&tcp_sockets_allocated);
1856} 1849}
1857 1850
1858EXPORT_SYMBOL(tcp_v4_destroy_sock); 1851EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -1860,32 +1853,35 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
1860#ifdef CONFIG_PROC_FS 1853#ifdef CONFIG_PROC_FS
1861/* Proc filesystem TCP sock list dumping. */ 1854/* Proc filesystem TCP sock list dumping. */
1862 1855
1863static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) 1856static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1864{ 1857{
1865 return hlist_empty(head) ? NULL : 1858 return hlist_nulls_empty(head) ? NULL :
1866 list_entry(head->first, struct inet_timewait_sock, tw_node); 1859 list_entry(head->first, struct inet_timewait_sock, tw_node);
1867} 1860}
1868 1861
1869static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) 1862static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1870{ 1863{
1871 return tw->tw_node.next ? 1864 return !is_a_nulls(tw->tw_node.next) ?
1872 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 1865 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1873} 1866}
1874 1867
1875static void *listening_get_next(struct seq_file *seq, void *cur) 1868static void *listening_get_next(struct seq_file *seq, void *cur)
1876{ 1869{
1877 struct inet_connection_sock *icsk; 1870 struct inet_connection_sock *icsk;
1878 struct hlist_node *node; 1871 struct hlist_nulls_node *node;
1879 struct sock *sk = cur; 1872 struct sock *sk = cur;
1880 struct tcp_iter_state* st = seq->private; 1873 struct inet_listen_hashbucket *ilb;
1874 struct tcp_iter_state *st = seq->private;
1881 struct net *net = seq_file_net(seq); 1875 struct net *net = seq_file_net(seq);
1882 1876
1883 if (!sk) { 1877 if (!sk) {
1884 st->bucket = 0; 1878 st->bucket = 0;
1885 sk = sk_head(&tcp_hashinfo.listening_hash[0]); 1879 ilb = &tcp_hashinfo.listening_hash[0];
1880 spin_lock_bh(&ilb->lock);
1881 sk = sk_nulls_head(&ilb->head);
1886 goto get_sk; 1882 goto get_sk;
1887 } 1883 }
1888 1884 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1889 ++st->num; 1885 ++st->num;
1890 1886
1891 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1887 if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1918,7 +1914,7 @@ get_req:
1918 sk = sk_next(sk); 1914 sk = sk_next(sk);
1919 } 1915 }
1920get_sk: 1916get_sk:
1921 sk_for_each_from(sk, node) { 1917 sk_nulls_for_each_from(sk, node) {
1922 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { 1918 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1923 cur = sk; 1919 cur = sk;
1924 goto out; 1920 goto out;
@@ -1935,8 +1931,11 @@ start_req:
1935 } 1931 }
1936 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 1932 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1937 } 1933 }
1934 spin_unlock_bh(&ilb->lock);
1938 if (++st->bucket < INET_LHTABLE_SIZE) { 1935 if (++st->bucket < INET_LHTABLE_SIZE) {
1939 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); 1936 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1937 spin_lock_bh(&ilb->lock);
1938 sk = sk_nulls_head(&ilb->head);
1940 goto get_sk; 1939 goto get_sk;
1941 } 1940 }
1942 cur = NULL; 1941 cur = NULL;
@@ -1957,28 +1956,28 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1957 1956
1958static inline int empty_bucket(struct tcp_iter_state *st) 1957static inline int empty_bucket(struct tcp_iter_state *st)
1959{ 1958{
1960 return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && 1959 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1961 hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 1960 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1962} 1961}
1963 1962
1964static void *established_get_first(struct seq_file *seq) 1963static void *established_get_first(struct seq_file *seq)
1965{ 1964{
1966 struct tcp_iter_state* st = seq->private; 1965 struct tcp_iter_state *st = seq->private;
1967 struct net *net = seq_file_net(seq); 1966 struct net *net = seq_file_net(seq);
1968 void *rc = NULL; 1967 void *rc = NULL;
1969 1968
1970 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { 1969 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1971 struct sock *sk; 1970 struct sock *sk;
1972 struct hlist_node *node; 1971 struct hlist_nulls_node *node;
1973 struct inet_timewait_sock *tw; 1972 struct inet_timewait_sock *tw;
1974 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 1973 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1975 1974
1976 /* Lockless fast path for the common case of empty buckets */ 1975 /* Lockless fast path for the common case of empty buckets */
1977 if (empty_bucket(st)) 1976 if (empty_bucket(st))
1978 continue; 1977 continue;
1979 1978
1980 read_lock_bh(lock); 1979 spin_lock_bh(lock);
1981 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 1980 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1982 if (sk->sk_family != st->family || 1981 if (sk->sk_family != st->family ||
1983 !net_eq(sock_net(sk), net)) { 1982 !net_eq(sock_net(sk), net)) {
1984 continue; 1983 continue;
@@ -1996,7 +1995,7 @@ static void *established_get_first(struct seq_file *seq)
1996 rc = tw; 1995 rc = tw;
1997 goto out; 1996 goto out;
1998 } 1997 }
1999 read_unlock_bh(lock); 1998 spin_unlock_bh(lock);
2000 st->state = TCP_SEQ_STATE_ESTABLISHED; 1999 st->state = TCP_SEQ_STATE_ESTABLISHED;
2001 } 2000 }
2002out: 2001out:
@@ -2007,8 +2006,8 @@ static void *established_get_next(struct seq_file *seq, void *cur)
2007{ 2006{
2008 struct sock *sk = cur; 2007 struct sock *sk = cur;
2009 struct inet_timewait_sock *tw; 2008 struct inet_timewait_sock *tw;
2010 struct hlist_node *node; 2009 struct hlist_nulls_node *node;
2011 struct tcp_iter_state* st = seq->private; 2010 struct tcp_iter_state *st = seq->private;
2012 struct net *net = seq_file_net(seq); 2011 struct net *net = seq_file_net(seq);
2013 2012
2014 ++st->num; 2013 ++st->num;
@@ -2024,7 +2023,7 @@ get_tw:
2024 cur = tw; 2023 cur = tw;
2025 goto out; 2024 goto out;
2026 } 2025 }
2027 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2026 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2028 st->state = TCP_SEQ_STATE_ESTABLISHED; 2027 st->state = TCP_SEQ_STATE_ESTABLISHED;
2029 2028
2030 /* Look for next non empty bucket */ 2029 /* Look for next non empty bucket */
@@ -2034,12 +2033,12 @@ get_tw:
2034 if (st->bucket >= tcp_hashinfo.ehash_size) 2033 if (st->bucket >= tcp_hashinfo.ehash_size)
2035 return NULL; 2034 return NULL;
2036 2035
2037 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2036 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2038 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); 2037 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2039 } else 2038 } else
2040 sk = sk_next(sk); 2039 sk = sk_nulls_next(sk);
2041 2040
2042 sk_for_each_from(sk, node) { 2041 sk_nulls_for_each_from(sk, node) {
2043 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) 2042 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2044 goto found; 2043 goto found;
2045 } 2044 }
@@ -2067,14 +2066,12 @@ static void *established_get_idx(struct seq_file *seq, loff_t pos)
2067static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2066static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2068{ 2067{
2069 void *rc; 2068 void *rc;
2070 struct tcp_iter_state* st = seq->private; 2069 struct tcp_iter_state *st = seq->private;
2071 2070
2072 inet_listen_lock(&tcp_hashinfo);
2073 st->state = TCP_SEQ_STATE_LISTENING; 2071 st->state = TCP_SEQ_STATE_LISTENING;
2074 rc = listening_get_idx(seq, &pos); 2072 rc = listening_get_idx(seq, &pos);
2075 2073
2076 if (!rc) { 2074 if (!rc) {
2077 inet_listen_unlock(&tcp_hashinfo);
2078 st->state = TCP_SEQ_STATE_ESTABLISHED; 2075 st->state = TCP_SEQ_STATE_ESTABLISHED;
2079 rc = established_get_idx(seq, pos); 2076 rc = established_get_idx(seq, pos);
2080 } 2077 }
@@ -2084,7 +2081,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2084 2081
2085static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2082static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2086{ 2083{
2087 struct tcp_iter_state* st = seq->private; 2084 struct tcp_iter_state *st = seq->private;
2088 st->state = TCP_SEQ_STATE_LISTENING; 2085 st->state = TCP_SEQ_STATE_LISTENING;
2089 st->num = 0; 2086 st->num = 0;
2090 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2087 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -2093,7 +2090,7 @@ static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2093static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2090static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2094{ 2091{
2095 void *rc = NULL; 2092 void *rc = NULL;
2096 struct tcp_iter_state* st; 2093 struct tcp_iter_state *st;
2097 2094
2098 if (v == SEQ_START_TOKEN) { 2095 if (v == SEQ_START_TOKEN) {
2099 rc = tcp_get_idx(seq, 0); 2096 rc = tcp_get_idx(seq, 0);
@@ -2106,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2106 case TCP_SEQ_STATE_LISTENING: 2103 case TCP_SEQ_STATE_LISTENING:
2107 rc = listening_get_next(seq, v); 2104 rc = listening_get_next(seq, v);
2108 if (!rc) { 2105 if (!rc) {
2109 inet_listen_unlock(&tcp_hashinfo);
2110 st->state = TCP_SEQ_STATE_ESTABLISHED; 2106 st->state = TCP_SEQ_STATE_ESTABLISHED;
2111 rc = established_get_first(seq); 2107 rc = established_get_first(seq);
2112 } 2108 }
@@ -2123,7 +2119,7 @@ out:
2123 2119
2124static void tcp_seq_stop(struct seq_file *seq, void *v) 2120static void tcp_seq_stop(struct seq_file *seq, void *v)
2125{ 2121{
2126 struct tcp_iter_state* st = seq->private; 2122 struct tcp_iter_state *st = seq->private;
2127 2123
2128 switch (st->state) { 2124 switch (st->state) {
2129 case TCP_SEQ_STATE_OPENREQ: 2125 case TCP_SEQ_STATE_OPENREQ:
@@ -2133,12 +2129,12 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2133 } 2129 }
2134 case TCP_SEQ_STATE_LISTENING: 2130 case TCP_SEQ_STATE_LISTENING:
2135 if (v != SEQ_START_TOKEN) 2131 if (v != SEQ_START_TOKEN)
2136 inet_listen_unlock(&tcp_hashinfo); 2132 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2137 break; 2133 break;
2138 case TCP_SEQ_STATE_TIME_WAIT: 2134 case TCP_SEQ_STATE_TIME_WAIT:
2139 case TCP_SEQ_STATE_ESTABLISHED: 2135 case TCP_SEQ_STATE_ESTABLISHED:
2140 if (v) 2136 if (v)
2141 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2137 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2142 break; 2138 break;
2143 } 2139 }
2144} 2140}
@@ -2284,7 +2280,7 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw,
2284 2280
2285static int tcp4_seq_show(struct seq_file *seq, void *v) 2281static int tcp4_seq_show(struct seq_file *seq, void *v)
2286{ 2282{
2287 struct tcp_iter_state* st; 2283 struct tcp_iter_state *st;
2288 int len; 2284 int len;
2289 2285
2290 if (v == SEQ_START_TOKEN) { 2286 if (v == SEQ_START_TOKEN) {
@@ -2350,6 +2346,41 @@ void tcp4_proc_exit(void)
2350} 2346}
2351#endif /* CONFIG_PROC_FS */ 2347#endif /* CONFIG_PROC_FS */
2352 2348
2349struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2350{
2351 struct iphdr *iph = ip_hdr(skb);
2352
2353 switch (skb->ip_summed) {
2354 case CHECKSUM_COMPLETE:
2355 if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr,
2356 skb->csum)) {
2357 skb->ip_summed = CHECKSUM_UNNECESSARY;
2358 break;
2359 }
2360
2361 /* fall through */
2362 case CHECKSUM_NONE:
2363 NAPI_GRO_CB(skb)->flush = 1;
2364 return NULL;
2365 }
2366
2367 return tcp_gro_receive(head, skb);
2368}
2369EXPORT_SYMBOL(tcp4_gro_receive);
2370
2371int tcp4_gro_complete(struct sk_buff *skb)
2372{
2373 struct iphdr *iph = ip_hdr(skb);
2374 struct tcphdr *th = tcp_hdr(skb);
2375
2376 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2377 iph->saddr, iph->daddr, 0);
2378 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2379
2380 return tcp_gro_complete(skb);
2381}
2382EXPORT_SYMBOL(tcp4_gro_complete);
2383
2353struct proto tcp_prot = { 2384struct proto tcp_prot = {
2354 .name = "TCP", 2385 .name = "TCP",
2355 .owner = THIS_MODULE, 2386 .owner = THIS_MODULE,
@@ -2378,6 +2409,7 @@ struct proto tcp_prot = {
2378 .sysctl_rmem = sysctl_tcp_rmem, 2409 .sysctl_rmem = sysctl_tcp_rmem,
2379 .max_header = MAX_TCP_HEADER, 2410 .max_header = MAX_TCP_HEADER,
2380 .obj_size = sizeof(struct tcp_sock), 2411 .obj_size = sizeof(struct tcp_sock),
2412 .slab_flags = SLAB_DESTROY_BY_RCU,
2381 .twsk_prot = &tcp_timewait_sock_ops, 2413 .twsk_prot = &tcp_timewait_sock_ops,
2382 .rsk_prot = &tcp_request_sock_ops, 2414 .rsk_prot = &tcp_request_sock_ops,
2383 .h.hashinfo = &tcp_hashinfo, 2415 .h.hashinfo = &tcp_hashinfo,
@@ -2407,6 +2439,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
2407 2439
2408void __init tcp_v4_init(void) 2440void __init tcp_v4_init(void)
2409{ 2441{
2442 inet_hashinfo_init(&tcp_hashinfo);
2410 if (register_pernet_device(&tcp_sk_ops)) 2443 if (register_pernet_device(&tcp_sk_ops))
2411 panic("Failed to create the TCP control socket.\n"); 2444 panic("Failed to create the TCP control socket.\n");
2412} 2445}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 779f2e9d0689..f67effbb102b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -491,7 +491,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
491 * as a request_sock. 491 * as a request_sock.
492 */ 492 */
493 493
494struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, 494struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
495 struct request_sock *req, 495 struct request_sock *req,
496 struct request_sock **prev) 496 struct request_sock **prev)
497{ 497{
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fe3b4bdfd251..557fe16cbfb0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -42,7 +42,7 @@
42/* People can turn this off for buggy TCP's found in printers etc. */ 42/* People can turn this off for buggy TCP's found in printers etc. */
43int sysctl_tcp_retrans_collapse __read_mostly = 1; 43int sysctl_tcp_retrans_collapse __read_mostly = 1;
44 44
45/* People can turn this on to work with those rare, broken TCPs that 45/* People can turn this on to work with those rare, broken TCPs that
46 * interpret the window field as a signed quantity. 46 * interpret the window field as a signed quantity.
47 */ 47 */
48int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 48int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
@@ -484,7 +484,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
484 } 484 }
485 if (likely(sysctl_tcp_window_scaling)) { 485 if (likely(sysctl_tcp_window_scaling)) {
486 opts->ws = tp->rx_opt.rcv_wscale; 486 opts->ws = tp->rx_opt.rcv_wscale;
487 if(likely(opts->ws)) 487 if (likely(opts->ws))
488 size += TCPOLEN_WSCALE_ALIGNED; 488 size += TCPOLEN_WSCALE_ALIGNED;
489 } 489 }
490 if (likely(sysctl_tcp_sack)) { 490 if (likely(sysctl_tcp_sack)) {
@@ -526,7 +526,7 @@ static unsigned tcp_synack_options(struct sock *sk,
526 526
527 if (likely(ireq->wscale_ok)) { 527 if (likely(ireq->wscale_ok)) {
528 opts->ws = ireq->rcv_wscale; 528 opts->ws = ireq->rcv_wscale;
529 if(likely(opts->ws)) 529 if (likely(opts->ws))
530 size += TCPOLEN_WSCALE_ALIGNED; 530 size += TCPOLEN_WSCALE_ALIGNED;
531 } 531 }
532 if (likely(doing_ts)) { 532 if (likely(doing_ts)) {
@@ -663,10 +663,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
663 th->urg_ptr = 0; 663 th->urg_ptr = 0;
664 664
665 /* The urg_mode check is necessary during a below snd_una win probe */ 665 /* The urg_mode check is necessary during a below snd_una win probe */
666 if (unlikely(tcp_urg_mode(tp) && 666 if (unlikely(tcp_urg_mode(tp))) {
667 between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { 667 if (between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF)) {
668 th->urg_ptr = htons(tp->snd_up - tcb->seq); 668 th->urg_ptr = htons(tp->snd_up - tcb->seq);
669 th->urg = 1; 669 th->urg = 1;
670 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
671 th->urg_ptr = 0xFFFF;
672 th->urg = 1;
673 }
670 } 674 }
671 675
672 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 676 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
@@ -1168,7 +1172,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
1168 1172
1169static inline int tcp_minshall_check(const struct tcp_sock *tp) 1173static inline int tcp_minshall_check(const struct tcp_sock *tp)
1170{ 1174{
1171 return after(tp->snd_sml,tp->snd_una) && 1175 return after(tp->snd_sml, tp->snd_una) &&
1172 !after(tp->snd_sml, tp->snd_nxt); 1176 !after(tp->snd_sml, tp->snd_nxt);
1173} 1177}
1174 1178
@@ -1334,7 +1338,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1334 1338
1335 /* Defer for less than two clock ticks. */ 1339 /* Defer for less than two clock ticks. */
1336 if (tp->tso_deferred && 1340 if (tp->tso_deferred &&
1337 ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) 1341 (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
1338 goto send_now; 1342 goto send_now;
1339 1343
1340 in_flight = tcp_packets_in_flight(tp); 1344 in_flight = tcp_packets_in_flight(tp);
@@ -1519,7 +1523,8 @@ static int tcp_mtu_probe(struct sock *sk)
1519 * Returns 1, if no segments are in flight and we have queued segments, but 1523 * Returns 1, if no segments are in flight and we have queued segments, but
1520 * cannot send anything now because of SWS or another problem. 1524 * cannot send anything now because of SWS or another problem.
1521 */ 1525 */
1522static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) 1526static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1527 int push_one, gfp_t gfp)
1523{ 1528{
1524 struct tcp_sock *tp = tcp_sk(sk); 1529 struct tcp_sock *tp = tcp_sk(sk);
1525 struct sk_buff *skb; 1530 struct sk_buff *skb;
@@ -1527,20 +1532,16 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1527 int cwnd_quota; 1532 int cwnd_quota;
1528 int result; 1533 int result;
1529 1534
1530 /* If we are closed, the bytes will have to remain here.
1531 * In time closedown will finish, we empty the write queue and all
1532 * will be happy.
1533 */
1534 if (unlikely(sk->sk_state == TCP_CLOSE))
1535 return 0;
1536
1537 sent_pkts = 0; 1535 sent_pkts = 0;
1538 1536
1539 /* Do MTU probing. */ 1537 if (!push_one) {
1540 if ((result = tcp_mtu_probe(sk)) == 0) { 1538 /* Do MTU probing. */
1541 return 0; 1539 result = tcp_mtu_probe(sk);
1542 } else if (result > 0) { 1540 if (!result) {
1543 sent_pkts = 1; 1541 return 0;
1542 } else if (result > 0) {
1543 sent_pkts = 1;
1544 }
1544 } 1545 }
1545 1546
1546 while ((skb = tcp_send_head(sk))) { 1547 while ((skb = tcp_send_head(sk))) {
@@ -1562,7 +1563,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1562 nonagle : TCP_NAGLE_PUSH)))) 1563 nonagle : TCP_NAGLE_PUSH))))
1563 break; 1564 break;
1564 } else { 1565 } else {
1565 if (tcp_tso_should_defer(sk, skb)) 1566 if (!push_one && tcp_tso_should_defer(sk, skb))
1566 break; 1567 break;
1567 } 1568 }
1568 1569
@@ -1577,7 +1578,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1577 1578
1578 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1579 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1579 1580
1580 if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) 1581 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
1581 break; 1582 break;
1582 1583
1583 /* Advance the send_head. This one is sent out. 1584 /* Advance the send_head. This one is sent out.
@@ -1587,6 +1588,9 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1587 1588
1588 tcp_minshall_update(tp, mss_now, skb); 1589 tcp_minshall_update(tp, mss_now, skb);
1589 sent_pkts++; 1590 sent_pkts++;
1591
1592 if (push_one)
1593 break;
1590 } 1594 }
1591 1595
1592 if (likely(sent_pkts)) { 1596 if (likely(sent_pkts)) {
@@ -1605,10 +1609,18 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1605{ 1609{
1606 struct sk_buff *skb = tcp_send_head(sk); 1610 struct sk_buff *skb = tcp_send_head(sk);
1607 1611
1608 if (skb) { 1612 if (!skb)
1609 if (tcp_write_xmit(sk, cur_mss, nonagle)) 1613 return;
1610 tcp_check_probe_timer(sk); 1614
1611 } 1615 /* If we are closed, the bytes will have to remain here.
1616 * In time closedown will finish, we empty the write queue and
1617 * all will be happy.
1618 */
1619 if (unlikely(sk->sk_state == TCP_CLOSE))
1620 return;
1621
1622 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
1623 tcp_check_probe_timer(sk);
1612} 1624}
1613 1625
1614/* Send _single_ skb sitting at the send head. This function requires 1626/* Send _single_ skb sitting at the send head. This function requires
@@ -1616,38 +1628,11 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1616 */ 1628 */
1617void tcp_push_one(struct sock *sk, unsigned int mss_now) 1629void tcp_push_one(struct sock *sk, unsigned int mss_now)
1618{ 1630{
1619 struct tcp_sock *tp = tcp_sk(sk);
1620 struct sk_buff *skb = tcp_send_head(sk); 1631 struct sk_buff *skb = tcp_send_head(sk);
1621 unsigned int tso_segs, cwnd_quota;
1622 1632
1623 BUG_ON(!skb || skb->len < mss_now); 1633 BUG_ON(!skb || skb->len < mss_now);
1624 1634
1625 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1635 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
1626 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1627
1628 if (likely(cwnd_quota)) {
1629 unsigned int limit;
1630
1631 BUG_ON(!tso_segs);
1632
1633 limit = mss_now;
1634 if (tso_segs > 1 && !tcp_urg_mode(tp))
1635 limit = tcp_mss_split_point(sk, skb, mss_now,
1636 cwnd_quota);
1637
1638 if (skb->len > limit &&
1639 unlikely(tso_fragment(sk, skb, limit, mss_now)))
1640 return;
1641
1642 /* Send it out now. */
1643 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1644
1645 if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
1646 tcp_event_new_data_sent(sk, skb);
1647 tcp_cwnd_validate(sk);
1648 return;
1649 }
1650 }
1651} 1636}
1652 1637
1653/* This function returns the amount that we can raise the 1638/* This function returns the amount that we can raise the
@@ -1767,46 +1752,22 @@ u32 __tcp_select_window(struct sock *sk)
1767 return window; 1752 return window;
1768} 1753}
1769 1754
1770/* Attempt to collapse two adjacent SKB's during retransmission. */ 1755/* Collapses two adjacent SKB's during retransmission. */
1771static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, 1756static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1772 int mss_now)
1773{ 1757{
1774 struct tcp_sock *tp = tcp_sk(sk); 1758 struct tcp_sock *tp = tcp_sk(sk);
1775 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 1759 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1776 int skb_size, next_skb_size; 1760 int skb_size, next_skb_size;
1777 u16 flags; 1761 u16 flags;
1778 1762
1779 /* The first test we must make is that neither of these two
1780 * SKB's are still referenced by someone else.
1781 */
1782 if (skb_cloned(skb) || skb_cloned(next_skb))
1783 return;
1784
1785 skb_size = skb->len; 1763 skb_size = skb->len;
1786 next_skb_size = next_skb->len; 1764 next_skb_size = next_skb->len;
1787 flags = TCP_SKB_CB(skb)->flags; 1765 flags = TCP_SKB_CB(skb)->flags;
1788 1766
1789 /* Also punt if next skb has been SACK'd. */
1790 if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
1791 return;
1792
1793 /* Next skb is out of window. */
1794 if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
1795 return;
1796
1797 /* Punt if not enough space exists in the first SKB for
1798 * the data in the second, or the total combined payload
1799 * would exceed the MSS.
1800 */
1801 if ((next_skb_size > skb_tailroom(skb)) ||
1802 ((skb_size + next_skb_size) > mss_now))
1803 return;
1804
1805 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 1767 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
1806 1768
1807 tcp_highest_sack_combine(sk, next_skb, skb); 1769 tcp_highest_sack_combine(sk, next_skb, skb);
1808 1770
1809 /* Ok. We will be able to collapse the packet. */
1810 tcp_unlink_write_queue(next_skb, sk); 1771 tcp_unlink_write_queue(next_skb, sk);
1811 1772
1812 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), 1773 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
@@ -1848,54 +1809,60 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
1848 sk_wmem_free_skb(sk, next_skb); 1809 sk_wmem_free_skb(sk, next_skb);
1849} 1810}
1850 1811
1851/* Do a simple retransmit without using the backoff mechanisms in 1812static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
1852 * tcp_timer. This is used for path mtu discovery. 1813{
1853 * The socket is already locked here. 1814 if (tcp_skb_pcount(skb) > 1)
1854 */ 1815 return 0;
1855void tcp_simple_retransmit(struct sock *sk) 1816 /* TODO: SACK collapsing could be used to remove this condition */
1817 if (skb_shinfo(skb)->nr_frags != 0)
1818 return 0;
1819 if (skb_cloned(skb))
1820 return 0;
1821 if (skb == tcp_send_head(sk))
1822 return 0;
1823 /* Some heurestics for collapsing over SACK'd could be invented */
1824 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1825 return 0;
1826
1827 return 1;
1828}
1829
1830static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
1831 int space)
1856{ 1832{
1857 const struct inet_connection_sock *icsk = inet_csk(sk);
1858 struct tcp_sock *tp = tcp_sk(sk); 1833 struct tcp_sock *tp = tcp_sk(sk);
1859 struct sk_buff *skb; 1834 struct sk_buff *skb = to, *tmp;
1860 unsigned int mss = tcp_current_mss(sk, 0); 1835 int first = 1;
1861 u32 prior_lost = tp->lost_out;
1862 1836
1863 tcp_for_write_queue(skb, sk) { 1837 if (!sysctl_tcp_retrans_collapse)
1864 if (skb == tcp_send_head(sk)) 1838 return;
1839 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
1840 return;
1841
1842 tcp_for_write_queue_from_safe(skb, tmp, sk) {
1843 if (!tcp_can_collapse(sk, skb))
1865 break; 1844 break;
1866 if (skb->len > mss &&
1867 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1868 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
1869 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1870 tp->retrans_out -= tcp_skb_pcount(skb);
1871 }
1872 tcp_skb_mark_lost_uncond_verify(tp, skb);
1873 }
1874 }
1875 1845
1876 tcp_clear_retrans_hints_partial(tp); 1846 space -= skb->len;
1877 1847
1878 if (prior_lost == tp->lost_out) 1848 if (first) {
1879 return; 1849 first = 0;
1850 continue;
1851 }
1880 1852
1881 if (tcp_is_reno(tp)) 1853 if (space < 0)
1882 tcp_limit_reno_sacked(tp); 1854 break;
1855 /* Punt if not enough space exists in the first SKB for
1856 * the data in the second
1857 */
1858 if (skb->len > skb_tailroom(to))
1859 break;
1883 1860
1884 tcp_verify_left_out(tp); 1861 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
1862 break;
1885 1863
1886 /* Don't muck with the congestion window here. 1864 tcp_collapse_retrans(sk, to);
1887 * Reason is that we do not increase amount of _data_
1888 * in network, but units changed and effective
1889 * cwnd/ssthresh really reduced now.
1890 */
1891 if (icsk->icsk_ca_state != TCP_CA_Loss) {
1892 tp->high_seq = tp->snd_nxt;
1893 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1894 tp->prior_ssthresh = 0;
1895 tp->undo_marker = 0;
1896 tcp_set_ca_state(sk, TCP_CA_Loss);
1897 } 1865 }
1898 tcp_xmit_retransmit_queue(sk);
1899} 1866}
1900 1867
1901/* This retransmits one SKB. Policy decisions and retransmit queue 1868/* This retransmits one SKB. Policy decisions and retransmit queue
@@ -1947,17 +1914,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1947 return -ENOMEM; /* We'll try again later. */ 1914 return -ENOMEM; /* We'll try again later. */
1948 } 1915 }
1949 1916
1950 /* Collapse two adjacent packets if worthwhile and we can. */ 1917 tcp_retrans_try_collapse(sk, skb, cur_mss);
1951 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1952 (skb->len < (cur_mss >> 1)) &&
1953 (!tcp_skb_is_last(sk, skb)) &&
1954 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1955 (skb_shinfo(skb)->nr_frags == 0 &&
1956 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1957 (tcp_skb_pcount(skb) == 1 &&
1958 tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
1959 (sysctl_tcp_retrans_collapse != 0))
1960 tcp_retrans_try_collapse(sk, skb, cur_mss);
1961 1918
1962 /* Some Solaris stacks overoptimize and ignore the FIN on a 1919 /* Some Solaris stacks overoptimize and ignore the FIN on a
1963 * retransmit when old data is attached. So strip it off 1920 * retransmit when old data is attached. So strip it off
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 7ddc30f0744f..25524d4e372a 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -153,12 +153,11 @@ static int tcpprobe_sprint(char *tbuf, int n)
153 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 153 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
154 154
155 return snprintf(tbuf, n, 155 return snprintf(tbuf, n,
156 "%lu.%09lu " NIPQUAD_FMT ":%u " NIPQUAD_FMT ":%u" 156 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
157 " %d %#x %#x %u %u %u %u\n",
158 (unsigned long) tv.tv_sec, 157 (unsigned long) tv.tv_sec,
159 (unsigned long) tv.tv_nsec, 158 (unsigned long) tv.tv_nsec,
160 NIPQUAD(p->saddr), ntohs(p->sport), 159 &p->saddr, ntohs(p->sport),
161 NIPQUAD(p->daddr), ntohs(p->dport), 160 &p->daddr, ntohs(p->dport),
162 p->length, p->snd_nxt, p->snd_una, 161 p->length, p->snd_nxt, p->snd_una,
163 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); 162 p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt);
164} 163}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6b6dff1164b9..0170e914f1b0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -65,7 +65,7 @@ static void tcp_write_err(struct sock *sk)
65static int tcp_out_of_resources(struct sock *sk, int do_reset) 65static int tcp_out_of_resources(struct sock *sk, int do_reset)
66{ 66{
67 struct tcp_sock *tp = tcp_sk(sk); 67 struct tcp_sock *tp = tcp_sk(sk);
68 int orphans = atomic_read(&tcp_orphan_count); 68 int orphans = percpu_counter_read_positive(&tcp_orphan_count);
69 69
70 /* If peer does not open window for long time, or did not transmit 70 /* If peer does not open window for long time, or did not transmit
71 * anything for long time, penalize it. */ 71 * anything for long time, penalize it. */
@@ -171,7 +171,7 @@ static int tcp_write_timeout(struct sock *sk)
171 171
172static void tcp_delack_timer(unsigned long data) 172static void tcp_delack_timer(unsigned long data)
173{ 173{
174 struct sock *sk = (struct sock*)data; 174 struct sock *sk = (struct sock *)data;
175 struct tcp_sock *tp = tcp_sk(sk); 175 struct tcp_sock *tp = tcp_sk(sk);
176 struct inet_connection_sock *icsk = inet_csk(sk); 176 struct inet_connection_sock *icsk = inet_csk(sk);
177 177
@@ -299,15 +299,15 @@ static void tcp_retransmit_timer(struct sock *sk)
299#ifdef TCP_DEBUG 299#ifdef TCP_DEBUG
300 struct inet_sock *inet = inet_sk(sk); 300 struct inet_sock *inet = inet_sk(sk);
301 if (sk->sk_family == AF_INET) { 301 if (sk->sk_family == AF_INET) {
302 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer " NIPQUAD_FMT ":%u/%u shrinks window %u:%u. Repaired.\n", 302 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
303 NIPQUAD(inet->daddr), ntohs(inet->dport), 303 &inet->daddr, ntohs(inet->dport),
304 inet->num, tp->snd_una, tp->snd_nxt); 304 inet->num, tp->snd_una, tp->snd_nxt);
305 } 305 }
306#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 306#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
307 else if (sk->sk_family == AF_INET6) { 307 else if (sk->sk_family == AF_INET6) {
308 struct ipv6_pinfo *np = inet6_sk(sk); 308 struct ipv6_pinfo *np = inet6_sk(sk);
309 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer " NIP6_FMT ":%u/%u shrinks window %u:%u. Repaired.\n", 309 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
310 NIP6(np->daddr), ntohs(inet->dport), 310 &np->daddr, ntohs(inet->dport),
311 inet->num, tp->snd_una, tp->snd_nxt); 311 inet->num, tp->snd_una, tp->snd_nxt);
312 } 312 }
313#endif 313#endif
@@ -396,7 +396,7 @@ out:;
396 396
397static void tcp_write_timer(unsigned long data) 397static void tcp_write_timer(unsigned long data)
398{ 398{
399 struct sock *sk = (struct sock*)data; 399 struct sock *sk = (struct sock *)data;
400 struct inet_connection_sock *icsk = inet_csk(sk); 400 struct inet_connection_sock *icsk = inet_csk(sk);
401 int event; 401 int event;
402 402
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index e03b10183a8b..9ec843a9bbb2 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -83,7 +83,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
83 else if (!yeah->doing_reno_now) { 83 else if (!yeah->doing_reno_now) {
84 /* Scalable */ 84 /* Scalable */
85 85
86 tp->snd_cwnd_cnt+=yeah->pkts_acked; 86 tp->snd_cwnd_cnt += yeah->pkts_acked;
87 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ 87 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
88 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 88 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
89 tp->snd_cwnd++; 89 tp->snd_cwnd++;
@@ -224,7 +224,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
224 224
225 reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); 225 reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
226 } else 226 } else
227 reduction = max(tp->snd_cwnd>>1,2U); 227 reduction = max(tp->snd_cwnd>>1, 2U);
228 228
229 yeah->fast_count = 0; 229 yeah->fast_count = 0;
230 yeah->reno_count = max(yeah->reno_count>>1, 2U); 230 yeah->reno_count = max(yeah->reno_count>>1, 2U);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 98c1fd09be88..cf5ab0581eba 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -81,6 +81,8 @@
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82#include <asm/ioctls.h> 82#include <asm/ioctls.h>
83#include <linux/bootmem.h> 83#include <linux/bootmem.h>
84#include <linux/highmem.h>
85#include <linux/swap.h>
84#include <linux/types.h> 86#include <linux/types.h>
85#include <linux/fcntl.h> 87#include <linux/fcntl.h>
86#include <linux/module.h> 88#include <linux/module.h>
@@ -104,12 +106,8 @@
104#include <net/xfrm.h> 106#include <net/xfrm.h>
105#include "udp_impl.h" 107#include "udp_impl.h"
106 108
107/* 109struct udp_table udp_table;
108 * Snmp MIB for the UDP layer 110EXPORT_SYMBOL(udp_table);
109 */
110
111struct hlist_head udp_hash[UDP_HTABLE_SIZE];
112DEFINE_RWLOCK(udp_hash_lock);
113 111
114int sysctl_udp_mem[3] __read_mostly; 112int sysctl_udp_mem[3] __read_mostly;
115int sysctl_udp_rmem_min __read_mostly; 113int sysctl_udp_rmem_min __read_mostly;
@@ -123,15 +121,15 @@ atomic_t udp_memory_allocated;
123EXPORT_SYMBOL(udp_memory_allocated); 121EXPORT_SYMBOL(udp_memory_allocated);
124 122
125static int udp_lib_lport_inuse(struct net *net, __u16 num, 123static int udp_lib_lport_inuse(struct net *net, __u16 num,
126 const struct hlist_head udptable[], 124 const struct udp_hslot *hslot,
127 struct sock *sk, 125 struct sock *sk,
128 int (*saddr_comp)(const struct sock *sk1, 126 int (*saddr_comp)(const struct sock *sk1,
129 const struct sock *sk2)) 127 const struct sock *sk2))
130{ 128{
131 struct sock *sk2; 129 struct sock *sk2;
132 struct hlist_node *node; 130 struct hlist_nulls_node *node;
133 131
134 sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)]) 132 sk_nulls_for_each(sk2, node, &hslot->head)
135 if (net_eq(sock_net(sk2), net) && 133 if (net_eq(sock_net(sk2), net) &&
136 sk2 != sk && 134 sk2 != sk &&
137 sk2->sk_hash == num && 135 sk2->sk_hash == num &&
@@ -154,12 +152,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
154 int (*saddr_comp)(const struct sock *sk1, 152 int (*saddr_comp)(const struct sock *sk1,
155 const struct sock *sk2 ) ) 153 const struct sock *sk2 ) )
156{ 154{
157 struct hlist_head *udptable = sk->sk_prot->h.udp_hash; 155 struct udp_hslot *hslot;
156 struct udp_table *udptable = sk->sk_prot->h.udp_table;
158 int error = 1; 157 int error = 1;
159 struct net *net = sock_net(sk); 158 struct net *net = sock_net(sk);
160 159
161 write_lock_bh(&udp_hash_lock);
162
163 if (!snum) { 160 if (!snum) {
164 int low, high, remaining; 161 int low, high, remaining;
165 unsigned rand; 162 unsigned rand;
@@ -171,26 +168,34 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
171 rand = net_random(); 168 rand = net_random();
172 snum = first = rand % remaining + low; 169 snum = first = rand % remaining + low;
173 rand |= 1; 170 rand |= 1;
174 while (udp_lib_lport_inuse(net, snum, udptable, sk, 171 for (;;) {
175 saddr_comp)) { 172 hslot = &udptable->hash[udp_hashfn(net, snum)];
173 spin_lock_bh(&hslot->lock);
174 if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
175 break;
176 spin_unlock_bh(&hslot->lock);
176 do { 177 do {
177 snum = snum + rand; 178 snum = snum + rand;
178 } while (snum < low || snum > high); 179 } while (snum < low || snum > high);
179 if (snum == first) 180 if (snum == first)
180 goto fail; 181 goto fail;
181 } 182 }
182 } else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp)) 183 } else {
183 goto fail; 184 hslot = &udptable->hash[udp_hashfn(net, snum)];
184 185 spin_lock_bh(&hslot->lock);
186 if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
187 goto fail_unlock;
188 }
185 inet_sk(sk)->num = snum; 189 inet_sk(sk)->num = snum;
186 sk->sk_hash = snum; 190 sk->sk_hash = snum;
187 if (sk_unhashed(sk)) { 191 if (sk_unhashed(sk)) {
188 sk_add_node(sk, &udptable[udp_hashfn(net, snum)]); 192 sk_nulls_add_node_rcu(sk, &hslot->head);
189 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 193 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
190 } 194 }
191 error = 0; 195 error = 0;
196fail_unlock:
197 spin_unlock_bh(&hslot->lock);
192fail: 198fail:
193 write_unlock_bh(&udp_hash_lock);
194 return error; 199 return error;
195} 200}
196 201
@@ -208,63 +213,91 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
208 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); 213 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
209} 214}
210 215
216static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
217 unsigned short hnum,
218 __be16 sport, __be32 daddr, __be16 dport, int dif)
219{
220 int score = -1;
221
222 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
223 !ipv6_only_sock(sk)) {
224 struct inet_sock *inet = inet_sk(sk);
225
226 score = (sk->sk_family == PF_INET ? 1 : 0);
227 if (inet->rcv_saddr) {
228 if (inet->rcv_saddr != daddr)
229 return -1;
230 score += 2;
231 }
232 if (inet->daddr) {
233 if (inet->daddr != saddr)
234 return -1;
235 score += 2;
236 }
237 if (inet->dport) {
238 if (inet->dport != sport)
239 return -1;
240 score += 2;
241 }
242 if (sk->sk_bound_dev_if) {
243 if (sk->sk_bound_dev_if != dif)
244 return -1;
245 score += 2;
246 }
247 }
248 return score;
249}
250
211/* UDP is nearly always wildcards out the wazoo, it makes no sense to try 251/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
212 * harder than this. -DaveM 252 * harder than this. -DaveM
213 */ 253 */
214static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, 254static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
215 __be16 sport, __be32 daddr, __be16 dport, 255 __be16 sport, __be32 daddr, __be16 dport,
216 int dif, struct hlist_head udptable[]) 256 int dif, struct udp_table *udptable)
217{ 257{
218 struct sock *sk, *result = NULL; 258 struct sock *sk, *result;
219 struct hlist_node *node; 259 struct hlist_nulls_node *node;
220 unsigned short hnum = ntohs(dport); 260 unsigned short hnum = ntohs(dport);
221 int badness = -1; 261 unsigned int hash = udp_hashfn(net, hnum);
222 262 struct udp_hslot *hslot = &udptable->hash[hash];
223 read_lock(&udp_hash_lock); 263 int score, badness;
224 sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { 264
225 struct inet_sock *inet = inet_sk(sk); 265 rcu_read_lock();
226 266begin:
227 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && 267 result = NULL;
228 !ipv6_only_sock(sk)) { 268 badness = -1;
229 int score = (sk->sk_family == PF_INET ? 1 : 0); 269 sk_nulls_for_each_rcu(sk, node, &hslot->head) {
230 if (inet->rcv_saddr) { 270 score = compute_score(sk, net, saddr, hnum, sport,
231 if (inet->rcv_saddr != daddr) 271 daddr, dport, dif);
232 continue; 272 if (score > badness) {
233 score+=2; 273 result = sk;
234 } 274 badness = score;
235 if (inet->daddr) {
236 if (inet->daddr != saddr)
237 continue;
238 score+=2;
239 }
240 if (inet->dport) {
241 if (inet->dport != sport)
242 continue;
243 score+=2;
244 }
245 if (sk->sk_bound_dev_if) {
246 if (sk->sk_bound_dev_if != dif)
247 continue;
248 score+=2;
249 }
250 if (score == 9) {
251 result = sk;
252 break;
253 } else if (score > badness) {
254 result = sk;
255 badness = score;
256 }
257 } 275 }
258 } 276 }
259 if (result) 277 /*
260 sock_hold(result); 278 * if the nulls value we got at the end of this lookup is
261 read_unlock(&udp_hash_lock); 279 * not the expected one, we must restart lookup.
280 * We probably met an item that was moved to another chain.
281 */
282 if (get_nulls_value(node) != hash)
283 goto begin;
284
285 if (result) {
286 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
287 result = NULL;
288 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
289 daddr, dport, dif) < badness)) {
290 sock_put(result);
291 goto begin;
292 }
293 }
294 rcu_read_unlock();
262 return result; 295 return result;
263} 296}
264 297
265static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, 298static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
266 __be16 sport, __be16 dport, 299 __be16 sport, __be16 dport,
267 struct hlist_head udptable[]) 300 struct udp_table *udptable)
268{ 301{
269 struct sock *sk; 302 struct sock *sk;
270 const struct iphdr *iph = ip_hdr(skb); 303 const struct iphdr *iph = ip_hdr(skb);
@@ -280,7 +313,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
280struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, 313struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
281 __be32 daddr, __be16 dport, int dif) 314 __be32 daddr, __be16 dport, int dif)
282{ 315{
283 return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash); 316 return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
284} 317}
285EXPORT_SYMBOL_GPL(udp4_lib_lookup); 318EXPORT_SYMBOL_GPL(udp4_lib_lookup);
286 319
@@ -289,11 +322,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
289 __be16 rmt_port, __be32 rmt_addr, 322 __be16 rmt_port, __be32 rmt_addr,
290 int dif) 323 int dif)
291{ 324{
292 struct hlist_node *node; 325 struct hlist_nulls_node *node;
293 struct sock *s = sk; 326 struct sock *s = sk;
294 unsigned short hnum = ntohs(loc_port); 327 unsigned short hnum = ntohs(loc_port);
295 328
296 sk_for_each_from(s, node) { 329 sk_nulls_for_each_from(s, node) {
297 struct inet_sock *inet = inet_sk(s); 330 struct inet_sock *inet = inet_sk(s);
298 331
299 if (!net_eq(sock_net(s), net) || 332 if (!net_eq(sock_net(s), net) ||
@@ -324,7 +357,7 @@ found:
324 * to find the appropriate port. 357 * to find the appropriate port.
325 */ 358 */
326 359
327void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) 360void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
328{ 361{
329 struct inet_sock *inet; 362 struct inet_sock *inet;
330 struct iphdr *iph = (struct iphdr*)skb->data; 363 struct iphdr *iph = (struct iphdr*)skb->data;
@@ -393,7 +426,7 @@ out:
393 426
394void udp_err(struct sk_buff *skb, u32 info) 427void udp_err(struct sk_buff *skb, u32 info)
395{ 428{
396 __udp4_lib_err(skb, info, udp_hash); 429 __udp4_lib_err(skb, info, &udp_table);
397} 430}
398 431
399/* 432/*
@@ -686,7 +719,7 @@ do_append_data:
686 up->len += ulen; 719 up->len += ulen;
687 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; 720 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
688 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, 721 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
689 sizeof(struct udphdr), &ipc, rt, 722 sizeof(struct udphdr), &ipc, &rt,
690 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); 723 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
691 if (err) 724 if (err)
692 udp_flush_pending_frames(sk); 725 udp_flush_pending_frames(sk);
@@ -935,6 +968,23 @@ int udp_disconnect(struct sock *sk, int flags)
935 return 0; 968 return 0;
936} 969}
937 970
971void udp_lib_unhash(struct sock *sk)
972{
973 if (sk_hashed(sk)) {
974 struct udp_table *udptable = sk->sk_prot->h.udp_table;
975 unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
976 struct udp_hslot *hslot = &udptable->hash[hash];
977
978 spin_lock_bh(&hslot->lock);
979 if (sk_nulls_del_node_init_rcu(sk)) {
980 inet_sk(sk)->num = 0;
981 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
982 }
983 spin_unlock_bh(&hslot->lock);
984 }
985}
986EXPORT_SYMBOL(udp_lib_unhash);
987
938static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 988static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
939{ 989{
940 int is_udplite = IS_UDPLITE(sk); 990 int is_udplite = IS_UDPLITE(sk);
@@ -1073,13 +1123,14 @@ drop:
1073static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, 1123static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1074 struct udphdr *uh, 1124 struct udphdr *uh,
1075 __be32 saddr, __be32 daddr, 1125 __be32 saddr, __be32 daddr,
1076 struct hlist_head udptable[]) 1126 struct udp_table *udptable)
1077{ 1127{
1078 struct sock *sk; 1128 struct sock *sk;
1129 struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
1079 int dif; 1130 int dif;
1080 1131
1081 read_lock(&udp_hash_lock); 1132 spin_lock(&hslot->lock);
1082 sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); 1133 sk = sk_nulls_head(&hslot->head);
1083 dif = skb->dev->ifindex; 1134 dif = skb->dev->ifindex;
1084 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); 1135 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
1085 if (sk) { 1136 if (sk) {
@@ -1088,7 +1139,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1088 do { 1139 do {
1089 struct sk_buff *skb1 = skb; 1140 struct sk_buff *skb1 = skb;
1090 1141
1091 sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest, 1142 sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1092 daddr, uh->source, saddr, 1143 daddr, uh->source, saddr,
1093 dif); 1144 dif);
1094 if (sknext) 1145 if (sknext)
@@ -1105,7 +1156,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1105 } while (sknext); 1156 } while (sknext);
1106 } else 1157 } else
1107 kfree_skb(skb); 1158 kfree_skb(skb);
1108 read_unlock(&udp_hash_lock); 1159 spin_unlock(&hslot->lock);
1109 return 0; 1160 return 0;
1110} 1161}
1111 1162
@@ -1151,7 +1202,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1151 * All we need to do is get the socket, and then do a checksum. 1202 * All we need to do is get the socket, and then do a checksum.
1152 */ 1203 */
1153 1204
1154int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], 1205int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1155 int proto) 1206 int proto)
1156{ 1207{
1157 struct sock *sk; 1208 struct sock *sk;
@@ -1219,13 +1270,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
1219 return 0; 1270 return 0;
1220 1271
1221short_packet: 1272short_packet:
1222 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From " NIPQUAD_FMT ":%u %d/%d to " NIPQUAD_FMT ":%u\n", 1273 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
1223 proto == IPPROTO_UDPLITE ? "-Lite" : "", 1274 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1224 NIPQUAD(saddr), 1275 &saddr,
1225 ntohs(uh->source), 1276 ntohs(uh->source),
1226 ulen, 1277 ulen,
1227 skb->len, 1278 skb->len,
1228 NIPQUAD(daddr), 1279 &daddr,
1229 ntohs(uh->dest)); 1280 ntohs(uh->dest));
1230 goto drop; 1281 goto drop;
1231 1282
@@ -1234,11 +1285,11 @@ csum_error:
1234 * RFC1122: OK. Discards the bad packet silently (as far as 1285 * RFC1122: OK. Discards the bad packet silently (as far as
1235 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1286 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1236 */ 1287 */
1237 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From " NIPQUAD_FMT ":%u to " NIPQUAD_FMT ":%u ulen %d\n", 1288 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
1238 proto == IPPROTO_UDPLITE ? "-Lite" : "", 1289 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1239 NIPQUAD(saddr), 1290 &saddr,
1240 ntohs(uh->source), 1291 ntohs(uh->source),
1241 NIPQUAD(daddr), 1292 &daddr,
1242 ntohs(uh->dest), 1293 ntohs(uh->dest),
1243 ulen); 1294 ulen);
1244drop: 1295drop:
@@ -1249,7 +1300,7 @@ drop:
1249 1300
1250int udp_rcv(struct sk_buff *skb) 1301int udp_rcv(struct sk_buff *skb)
1251{ 1302{
1252 return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); 1303 return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
1253} 1304}
1254 1305
1255void udp_destroy_sock(struct sock *sk) 1306void udp_destroy_sock(struct sock *sk)
@@ -1491,7 +1542,8 @@ struct proto udp_prot = {
1491 .sysctl_wmem = &sysctl_udp_wmem_min, 1542 .sysctl_wmem = &sysctl_udp_wmem_min,
1492 .sysctl_rmem = &sysctl_udp_rmem_min, 1543 .sysctl_rmem = &sysctl_udp_rmem_min,
1493 .obj_size = sizeof(struct udp_sock), 1544 .obj_size = sizeof(struct udp_sock),
1494 .h.udp_hash = udp_hash, 1545 .slab_flags = SLAB_DESTROY_BY_RCU,
1546 .h.udp_table = &udp_table,
1495#ifdef CONFIG_COMPAT 1547#ifdef CONFIG_COMPAT
1496 .compat_setsockopt = compat_udp_setsockopt, 1548 .compat_setsockopt = compat_udp_setsockopt,
1497 .compat_getsockopt = compat_udp_getsockopt, 1549 .compat_getsockopt = compat_udp_getsockopt,
@@ -1501,20 +1553,23 @@ struct proto udp_prot = {
1501/* ------------------------------------------------------------------------ */ 1553/* ------------------------------------------------------------------------ */
1502#ifdef CONFIG_PROC_FS 1554#ifdef CONFIG_PROC_FS
1503 1555
1504static struct sock *udp_get_first(struct seq_file *seq) 1556static struct sock *udp_get_first(struct seq_file *seq, int start)
1505{ 1557{
1506 struct sock *sk; 1558 struct sock *sk;
1507 struct udp_iter_state *state = seq->private; 1559 struct udp_iter_state *state = seq->private;
1508 struct net *net = seq_file_net(seq); 1560 struct net *net = seq_file_net(seq);
1509 1561
1510 for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { 1562 for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
1511 struct hlist_node *node; 1563 struct hlist_nulls_node *node;
1512 sk_for_each(sk, node, state->hashtable + state->bucket) { 1564 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
1565 spin_lock_bh(&hslot->lock);
1566 sk_nulls_for_each(sk, node, &hslot->head) {
1513 if (!net_eq(sock_net(sk), net)) 1567 if (!net_eq(sock_net(sk), net))
1514 continue; 1568 continue;
1515 if (sk->sk_family == state->family) 1569 if (sk->sk_family == state->family)
1516 goto found; 1570 goto found;
1517 } 1571 }
1572 spin_unlock_bh(&hslot->lock);
1518 } 1573 }
1519 sk = NULL; 1574 sk = NULL;
1520found: 1575found:
@@ -1527,21 +1582,19 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1527 struct net *net = seq_file_net(seq); 1582 struct net *net = seq_file_net(seq);
1528 1583
1529 do { 1584 do {
1530 sk = sk_next(sk); 1585 sk = sk_nulls_next(sk);
1531try_again:
1532 ;
1533 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); 1586 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1534 1587
1535 if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { 1588 if (!sk) {
1536 sk = sk_head(state->hashtable + state->bucket); 1589 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1537 goto try_again; 1590 return udp_get_first(seq, state->bucket + 1);
1538 } 1591 }
1539 return sk; 1592 return sk;
1540} 1593}
1541 1594
1542static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) 1595static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1543{ 1596{
1544 struct sock *sk = udp_get_first(seq); 1597 struct sock *sk = udp_get_first(seq, 0);
1545 1598
1546 if (sk) 1599 if (sk)
1547 while (pos && (sk = udp_get_next(seq, sk)) != NULL) 1600 while (pos && (sk = udp_get_next(seq, sk)) != NULL)
@@ -1550,9 +1603,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1550} 1603}
1551 1604
1552static void *udp_seq_start(struct seq_file *seq, loff_t *pos) 1605static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1553 __acquires(udp_hash_lock)
1554{ 1606{
1555 read_lock(&udp_hash_lock);
1556 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; 1607 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
1557} 1608}
1558 1609
@@ -1570,9 +1621,11 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1570} 1621}
1571 1622
1572static void udp_seq_stop(struct seq_file *seq, void *v) 1623static void udp_seq_stop(struct seq_file *seq, void *v)
1573 __releases(udp_hash_lock)
1574{ 1624{
1575 read_unlock(&udp_hash_lock); 1625 struct udp_iter_state *state = seq->private;
1626
1627 if (state->bucket < UDP_HTABLE_SIZE)
1628 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1576} 1629}
1577 1630
1578static int udp_seq_open(struct inode *inode, struct file *file) 1631static int udp_seq_open(struct inode *inode, struct file *file)
@@ -1588,7 +1641,7 @@ static int udp_seq_open(struct inode *inode, struct file *file)
1588 1641
1589 s = ((struct seq_file *)file->private_data)->private; 1642 s = ((struct seq_file *)file->private_data)->private;
1590 s->family = afinfo->family; 1643 s->family = afinfo->family;
1591 s->hashtable = afinfo->hashtable; 1644 s->udp_table = afinfo->udp_table;
1592 return err; 1645 return err;
1593} 1646}
1594 1647
@@ -1660,7 +1713,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
1660static struct udp_seq_afinfo udp4_seq_afinfo = { 1713static struct udp_seq_afinfo udp4_seq_afinfo = {
1661 .name = "udp", 1714 .name = "udp",
1662 .family = AF_INET, 1715 .family = AF_INET,
1663 .hashtable = udp_hash, 1716 .udp_table = &udp_table,
1664 .seq_fops = { 1717 .seq_fops = {
1665 .owner = THIS_MODULE, 1718 .owner = THIS_MODULE,
1666 }, 1719 },
@@ -1695,16 +1748,28 @@ void udp4_proc_exit(void)
1695} 1748}
1696#endif /* CONFIG_PROC_FS */ 1749#endif /* CONFIG_PROC_FS */
1697 1750
1751void __init udp_table_init(struct udp_table *table)
1752{
1753 int i;
1754
1755 for (i = 0; i < UDP_HTABLE_SIZE; i++) {
1756 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
1757 spin_lock_init(&table->hash[i].lock);
1758 }
1759}
1760
1698void __init udp_init(void) 1761void __init udp_init(void)
1699{ 1762{
1700 unsigned long limit; 1763 unsigned long nr_pages, limit;
1701 1764
1765 udp_table_init(&udp_table);
1702 /* Set the pressure threshold up by the same strategy of TCP. It is a 1766 /* Set the pressure threshold up by the same strategy of TCP. It is a
1703 * fraction of global memory that is up to 1/2 at 256 MB, decreasing 1767 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
1704 * toward zero with the amount of memory, with a floor of 128 pages. 1768 * toward zero with the amount of memory, with a floor of 128 pages.
1705 */ 1769 */
1706 limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); 1770 nr_pages = totalram_pages - totalhigh_pages;
1707 limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); 1771 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
1772 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
1708 limit = max(limit, 128UL); 1773 limit = max(limit, 128UL);
1709 sysctl_udp_mem[0] = limit / 4 * 3; 1774 sysctl_udp_mem[0] = limit / 4 * 3;
1710 sysctl_udp_mem[1] = limit; 1775 sysctl_udp_mem[1] = limit;
@@ -1715,8 +1780,6 @@ void __init udp_init(void)
1715} 1780}
1716 1781
1717EXPORT_SYMBOL(udp_disconnect); 1782EXPORT_SYMBOL(udp_disconnect);
1718EXPORT_SYMBOL(udp_hash);
1719EXPORT_SYMBOL(udp_hash_lock);
1720EXPORT_SYMBOL(udp_ioctl); 1783EXPORT_SYMBOL(udp_ioctl);
1721EXPORT_SYMBOL(udp_prot); 1784EXPORT_SYMBOL(udp_prot);
1722EXPORT_SYMBOL(udp_sendmsg); 1785EXPORT_SYMBOL(udp_sendmsg);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 2e9bad2fa1bc..9f4a6165f722 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,8 +5,8 @@
5#include <net/protocol.h> 5#include <net/protocol.h>
6#include <net/inet_common.h> 6#include <net/inet_common.h>
7 7
8extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int ); 8extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
9extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []); 9extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
10 10
11extern int udp_v4_get_port(struct sock *sk, unsigned short snum); 11extern int udp_v4_get_port(struct sock *sk, unsigned short snum);
12 12
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 3c807964da96..c784891cb7e5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,16 +12,17 @@
12 */ 12 */
13#include "udp_impl.h" 13#include "udp_impl.h"
14 14
15struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; 15struct udp_table udplite_table;
16EXPORT_SYMBOL(udplite_table);
16 17
17static int udplite_rcv(struct sk_buff *skb) 18static int udplite_rcv(struct sk_buff *skb)
18{ 19{
19 return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); 20 return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
20} 21}
21 22
22static void udplite_err(struct sk_buff *skb, u32 info) 23static void udplite_err(struct sk_buff *skb, u32 info)
23{ 24{
24 __udp4_lib_err(skb, info, udplite_hash); 25 __udp4_lib_err(skb, info, &udplite_table);
25} 26}
26 27
27static struct net_protocol udplite_protocol = { 28static struct net_protocol udplite_protocol = {
@@ -50,7 +51,8 @@ struct proto udplite_prot = {
50 .unhash = udp_lib_unhash, 51 .unhash = udp_lib_unhash,
51 .get_port = udp_v4_get_port, 52 .get_port = udp_v4_get_port,
52 .obj_size = sizeof(struct udp_sock), 53 .obj_size = sizeof(struct udp_sock),
53 .h.udp_hash = udplite_hash, 54 .slab_flags = SLAB_DESTROY_BY_RCU,
55 .h.udp_table = &udplite_table,
54#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
55 .compat_setsockopt = compat_udp_setsockopt, 57 .compat_setsockopt = compat_udp_setsockopt,
56 .compat_getsockopt = compat_udp_getsockopt, 58 .compat_getsockopt = compat_udp_getsockopt,
@@ -71,7 +73,7 @@ static struct inet_protosw udplite4_protosw = {
71static struct udp_seq_afinfo udplite4_seq_afinfo = { 73static struct udp_seq_afinfo udplite4_seq_afinfo = {
72 .name = "udplite", 74 .name = "udplite",
73 .family = AF_INET, 75 .family = AF_INET,
74 .hashtable = udplite_hash, 76 .udp_table = &udplite_table,
75 .seq_fops = { 77 .seq_fops = {
76 .owner = THIS_MODULE, 78 .owner = THIS_MODULE,
77 }, 79 },
@@ -108,6 +110,7 @@ static inline int udplite4_proc_init(void)
108 110
109void __init udplite4_register(void) 111void __init udplite4_register(void)
110{ 112{
113 udp_table_init(&udplite_table);
111 if (proto_register(&udplite_prot, 1)) 114 if (proto_register(&udplite_prot, 1))
112 goto out_register_err; 115 goto out_register_err;
113 116
@@ -126,5 +129,4 @@ out_register_err:
126 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); 129 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
127} 130}
128 131
129EXPORT_SYMBOL(udplite_hash);
130EXPORT_SYMBOL(udplite_prot); 132EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 390dcb1354a5..4ec2162a437e 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -78,7 +78,6 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
78 struct udphdr *uh; 78 struct udphdr *uh;
79 struct iphdr *iph; 79 struct iphdr *iph;
80 int iphlen, len; 80 int iphlen, len;
81 int ret;
82 81
83 __u8 *udpdata; 82 __u8 *udpdata;
84 __be32 *udpdata32; 83 __be32 *udpdata32;
@@ -152,8 +151,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
152 skb_reset_transport_header(skb); 151 skb_reset_transport_header(skb);
153 152
154 /* process ESP */ 153 /* process ESP */
155 ret = xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type); 154 return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type);
156 return ret;
157 155
158drop: 156drop:
159 kfree_skb(skb); 157 kfree_skb(skb);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index c63de0a72aba..2ad24ba31f9d 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -18,7 +18,8 @@
18static struct dst_ops xfrm4_dst_ops; 18static struct dst_ops xfrm4_dst_ops;
19static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
20 20
21static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, 21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
22 xfrm_address_t *saddr,
22 xfrm_address_t *daddr) 23 xfrm_address_t *daddr)
23{ 24{
24 struct flowi fl = { 25 struct flowi fl = {
@@ -36,19 +37,20 @@ static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr,
36 if (saddr) 37 if (saddr)
37 fl.fl4_src = saddr->a4; 38 fl.fl4_src = saddr->a4;
38 39
39 err = __ip_route_output_key(&init_net, &rt, &fl); 40 err = __ip_route_output_key(net, &rt, &fl);
40 dst = &rt->u.dst; 41 dst = &rt->u.dst;
41 if (err) 42 if (err)
42 dst = ERR_PTR(err); 43 dst = ERR_PTR(err);
43 return dst; 44 return dst;
44} 45}
45 46
46static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) 47static int xfrm4_get_saddr(struct net *net,
48 xfrm_address_t *saddr, xfrm_address_t *daddr)
47{ 49{
48 struct dst_entry *dst; 50 struct dst_entry *dst;
49 struct rtable *rt; 51 struct rtable *rt;
50 52
51 dst = xfrm4_dst_lookup(0, NULL, daddr); 53 dst = xfrm4_dst_lookup(net, 0, NULL, daddr);
52 if (IS_ERR(dst)) 54 if (IS_ERR(dst))
53 return -EHOSTUNREACH; 55 return -EHOSTUNREACH;
54 56
@@ -65,7 +67,7 @@ __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
65 67
66 read_lock_bh(&policy->lock); 68 read_lock_bh(&policy->lock);
67 for (dst = policy->bundles; dst; dst = dst->next) { 69 for (dst = policy->bundles; dst; dst = dst->next) {
68 struct xfrm_dst *xdst = (struct xfrm_dst*)dst; 70 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
69 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ 71 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
70 xdst->u.rt.fl.fl4_dst == fl->fl4_dst && 72 xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
71 xdst->u.rt.fl.fl4_src == fl->fl4_src && 73 xdst->u.rt.fl.fl4_src == fl->fl4_src &&
@@ -187,7 +189,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
187 189
188static inline int xfrm4_garbage_collect(struct dst_ops *ops) 190static inline int xfrm4_garbage_collect(struct dst_ops *ops)
189{ 191{
190 xfrm4_policy_afinfo.garbage_collect(); 192 xfrm4_policy_afinfo.garbage_collect(&init_net);
191 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); 193 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
192} 194}
193 195
@@ -246,7 +248,6 @@ static struct dst_ops xfrm4_dst_ops = {
246 .ifdown = xfrm4_dst_ifdown, 248 .ifdown = xfrm4_dst_ifdown,
247 .local_out = __ip_local_out, 249 .local_out = __ip_local_out,
248 .gc_thresh = 1024, 250 .gc_thresh = 1024,
249 .entry_size = sizeof(struct xfrm_dst),
250 .entries = ATOMIC_INIT(0), 251 .entries = ATOMIC_INIT(0),
251}; 252};
252 253
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 55dc6beab9aa..1ef1366a0a03 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -13,8 +13,6 @@
13#include <linux/ipsec.h> 13#include <linux/ipsec.h>
14#include <linux/netfilter_ipv4.h> 14#include <linux/netfilter_ipv4.h>
15 15
16static struct xfrm_state_afinfo xfrm4_state_afinfo;
17
18static int xfrm4_init_flags(struct xfrm_state *x) 16static int xfrm4_init_flags(struct xfrm_state *x)
19{ 17{
20 if (ipv4_config.no_pmtu_disc) 18 if (ipv4_config.no_pmtu_disc)