aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig52
-rw-r--r--net/ipv4/af_inet.c28
-rw-r--r--net/ipv4/arp.c13
-rw-r--r--net/ipv4/devinet.c12
-rw-r--r--net/ipv4/fib_frontend.c2
-rw-r--r--net/ipv4/fib_semantics.c5
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c42
-rw-r--r--net/ipv4/inet_fragment.c1
-rw-r--r--net/ipv4/inet_hashtables.c12
-rw-r--r--net/ipv4/ip_gre.c136
-rw-r--r--net/ipv4/ip_output.c6
-rw-r--r--net/ipv4/ipconfig.c8
-rw-r--r--net/ipv4/ipip.c7
-rw-r--r--net/ipv4/ipmr.c464
-rw-r--r--net/ipv4/netfilter/Kconfig30
-rw-r--r--net/ipv4/netfilter/Makefile2
-rw-r--r--net/ipv4/netfilter/arp_tables.c163
-rw-r--r--net/ipv4/netfilter/arptable_filter.c2
-rw-r--r--net/ipv4/netfilter/ip_queue.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c153
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c2
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c97
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c2
-rw-r--r--net/ipv4/netfilter/ipt_ttl.c63
-rw-r--r--net/ipv4/netfilter/iptable_filter.c1
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c1
-rw-r--r--net/ipv4/netfilter/iptable_raw.c1
-rw-r--r--net/ipv4/netfilter/iptable_security.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c1
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c4
-rw-r--r--net/ipv4/proc.c4
-rw-r--r--net/ipv4/raw.c1
-rw-r--r--net/ipv4/route.c15
-rw-r--r--net/ipv4/tcp.c89
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_cong.c21
-rw-r--r--net/ipv4/tcp_cubic.c11
-rw-r--r--net/ipv4/tcp_htcp.c3
-rw-r--r--net/ipv4/tcp_input.c198
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c94
-rw-r--r--net/ipv4/tcp_probe.c5
-rw-r--r--net/ipv4/tcp_scalable.c10
-rw-r--r--net/ipv4/tcp_timer.c23
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_yeah.c9
-rw-r--r--net/ipv4/udp.c9
-rw-r--r--net/ipv4/xfrm4_policy.c2
51 files changed, 1018 insertions, 833 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 691268f3a359..b2cf91e4ccaa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER
35 35
36 at boot time after the /proc file system has been mounted. 36 at boot time after the /proc file system has been mounted.
37 37
38 If you turn on IP forwarding, you will also get the rp_filter, which 38 If you turn on IP forwarding, you should consider the rp_filter, which
39 automatically rejects incoming packets if the routing table entry 39 automatically rejects incoming packets if the routing table entry
40 for their source address doesn't match the network interface they're 40 for their source address doesn't match the network interface they're
41 arriving on. This has security advantages because it prevents the 41 arriving on. This has security advantages because it prevents the
@@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER
46 rp_filter on use: 46 rp_filter on use:
47 47
48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter 48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
49 or 49 and
50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter 50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
51 51
52 Note that some distributions enable it in startup scripts.
53 For details about rp_filter strict and loose mode read
54 <file:Documentation/networking/ip-sysctl.txt>.
55
52 If unsure, say N here. 56 If unsure, say N here.
53 57
54choice 58choice
55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" 59 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
56 depends on IP_ADVANCED_ROUTER 60 depends on IP_ADVANCED_ROUTER
57 default ASK_IP_FIB_HASH 61 default ASK_IP_FIB_HASH
@@ -59,27 +63,29 @@ choice
59config ASK_IP_FIB_HASH 63config ASK_IP_FIB_HASH
60 bool "FIB_HASH" 64 bool "FIB_HASH"
61 ---help--- 65 ---help---
62 Current FIB is very proven and good enough for most users. 66 Current FIB is very proven and good enough for most users.
63 67
64config IP_FIB_TRIE 68config IP_FIB_TRIE
65 bool "FIB_TRIE" 69 bool "FIB_TRIE"
66 ---help--- 70 ---help---
67 Use new experimental LC-trie as FIB lookup algorithm. 71 Use new experimental LC-trie as FIB lookup algorithm.
68 This improves lookup performance if you have a large 72 This improves lookup performance if you have a large
69 number of routes. 73 number of routes.
70 74
71 LC-trie is a longest matching prefix lookup algorithm which 75 LC-trie is a longest matching prefix lookup algorithm which
72 performs better than FIB_HASH for large routing tables. 76 performs better than FIB_HASH for large routing tables.
73 But, it consumes more memory and is more complex. 77 But, it consumes more memory and is more complex.
74 78
75 LC-trie is described in: 79 LC-trie is described in:
76 80
77 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 81 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
78 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 82 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
79 An experimental study of compression methods for dynamic tries 83 June 1999
80 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 84
81 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 85 An experimental study of compression methods for dynamic tries
82 86 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
87 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
88
83endchoice 89endchoice
84 90
85config IP_FIB_HASH 91config IP_FIB_HASH
@@ -191,7 +197,7 @@ config IP_PNP_RARP
191 <file:Documentation/filesystems/nfsroot.txt> for details. 197 <file:Documentation/filesystems/nfsroot.txt> for details.
192 198
193# not yet ready.. 199# not yet ready..
194# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 200# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
195config NET_IPIP 201config NET_IPIP
196 tristate "IP: tunneling" 202 tristate "IP: tunneling"
197 select INET_TUNNEL 203 select INET_TUNNEL
@@ -361,7 +367,7 @@ config INET_IPCOMP
361 ---help--- 367 ---help---
362 Support for IP Payload Compression Protocol (IPComp) (RFC3173), 368 Support for IP Payload Compression Protocol (IPComp) (RFC3173),
363 typically needed for IPsec. 369 typically needed for IPsec.
364 370
365 If unsure, say Y. 371 If unsure, say Y.
366 372
367config INET_XFRM_TUNNEL 373config INET_XFRM_TUNNEL
@@ -415,7 +421,7 @@ config INET_DIAG
415 Support for INET (TCP, DCCP, etc) socket monitoring interface used by 421 Support for INET (TCP, DCCP, etc) socket monitoring interface used by
416 native Linux tools such as ss. ss is included in iproute2, currently 422 native Linux tools such as ss. ss is included in iproute2, currently
417 downloadable at <http://linux-net.osdl.org/index.php/Iproute2>. 423 downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
418 424
419 If unsure, say Y. 425 If unsure, say Y.
420 426
421config INET_TCP_DIAG 427config INET_TCP_DIAG
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 743f5542d65a..d5aaabbb7cb3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -369,7 +369,6 @@ lookup_protocol:
369 sock_init_data(sock, sk); 369 sock_init_data(sock, sk);
370 370
371 sk->sk_destruct = inet_sock_destruct; 371 sk->sk_destruct = inet_sock_destruct;
372 sk->sk_family = PF_INET;
373 sk->sk_protocol = protocol; 372 sk->sk_protocol = protocol;
374 sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; 373 sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
375 374
@@ -1253,10 +1252,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1253 int proto; 1252 int proto;
1254 int id; 1253 int id;
1255 1254
1256 if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) 1255 iph = skb_gro_header(skb, sizeof(*iph));
1256 if (unlikely(!iph))
1257 goto out; 1257 goto out;
1258 1258
1259 iph = ip_hdr(skb);
1260 proto = iph->protocol & (MAX_INET_PROTOS - 1); 1259 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1261 1260
1262 rcu_read_lock(); 1261 rcu_read_lock();
@@ -1264,13 +1263,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1264 if (!ops || !ops->gro_receive) 1263 if (!ops || !ops->gro_receive)
1265 goto out_unlock; 1264 goto out_unlock;
1266 1265
1267 if (iph->version != 4 || iph->ihl != 5) 1266 if (*(u8 *)iph != 0x45)
1268 goto out_unlock; 1267 goto out_unlock;
1269 1268
1270 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) 1269 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
1271 goto out_unlock; 1270 goto out_unlock;
1272 1271
1273 flush = ntohs(iph->tot_len) != skb->len || 1272 flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
1274 iph->frag_off != htons(IP_DF); 1273 iph->frag_off != htons(IP_DF);
1275 id = ntohs(iph->id); 1274 id = ntohs(iph->id);
1276 1275
@@ -1282,24 +1281,25 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1282 1281
1283 iph2 = ip_hdr(p); 1282 iph2 = ip_hdr(p);
1284 1283
1285 if (iph->protocol != iph2->protocol || 1284 if ((iph->protocol ^ iph2->protocol) |
1286 iph->tos != iph2->tos || 1285 (iph->tos ^ iph2->tos) |
1287 memcmp(&iph->saddr, &iph2->saddr, 8)) { 1286 (iph->saddr ^ iph2->saddr) |
1287 (iph->daddr ^ iph2->daddr)) {
1288 NAPI_GRO_CB(p)->same_flow = 0; 1288 NAPI_GRO_CB(p)->same_flow = 0;
1289 continue; 1289 continue;
1290 } 1290 }
1291 1291
1292 /* All fields must match except length and checksum. */ 1292 /* All fields must match except length and checksum. */
1293 NAPI_GRO_CB(p)->flush |= 1293 NAPI_GRO_CB(p)->flush |=
1294 memcmp(&iph->frag_off, &iph2->frag_off, 4) || 1294 (iph->ttl ^ iph2->ttl) |
1295 (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id; 1295 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
1296 1296
1297 NAPI_GRO_CB(p)->flush |= flush; 1297 NAPI_GRO_CB(p)->flush |= flush;
1298 } 1298 }
1299 1299
1300 NAPI_GRO_CB(skb)->flush |= flush; 1300 NAPI_GRO_CB(skb)->flush |= flush;
1301 __skb_pull(skb, sizeof(*iph)); 1301 skb_gro_pull(skb, sizeof(*iph));
1302 skb_reset_transport_header(skb); 1302 skb_set_transport_header(skb, skb_gro_offset(skb));
1303 1303
1304 pp = ops->gro_receive(head, skb); 1304 pp = ops->gro_receive(head, skb);
1305 1305
@@ -1500,8 +1500,8 @@ static int ipv4_proc_init(void);
1500 * IP protocol layer initialiser 1500 * IP protocol layer initialiser
1501 */ 1501 */
1502 1502
1503static struct packet_type ip_packet_type = { 1503static struct packet_type ip_packet_type __read_mostly = {
1504 .type = __constant_htons(ETH_P_IP), 1504 .type = cpu_to_be16(ETH_P_IP),
1505 .func = ip_rcv, 1505 .func = ip_rcv,
1506 .gso_send_check = inet_gso_send_check, 1506 .gso_send_check = inet_gso_send_check,
1507 .gso_segment = inet_gso_segment, 1507 .gso_segment = inet_gso_segment,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 29a74c01d8de..f11931c18381 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb)
801 * cache. 801 * cache.
802 */ 802 */
803 803
804 /* Special case: IPv4 duplicate address detection packet (RFC2131) */ 804 /*
805 if (sip == 0) { 805 * Special case: IPv4 duplicate address detection packet (RFC2131)
806 * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4)
807 */
808 if (sip == 0 || tip == sip) {
806 if (arp->ar_op == htons(ARPOP_REQUEST) && 809 if (arp->ar_op == htons(ARPOP_REQUEST) &&
807 inet_addr_type(net, tip) == RTN_LOCAL && 810 inet_addr_type(net, tip) == RTN_LOCAL &&
808 !arp_ignore(in_dev, sip, tip)) 811 !arp_ignore(in_dev, sip, tip))
@@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb)
892out: 895out:
893 if (in_dev) 896 if (in_dev)
894 in_dev_put(in_dev); 897 in_dev_put(in_dev);
895 kfree_skb(skb); 898 consume_skb(skb);
896 return 0; 899 return 0;
897} 900}
898 901
@@ -1225,8 +1228,8 @@ void arp_ifdown(struct net_device *dev)
1225 * Called once on startup. 1228 * Called once on startup.
1226 */ 1229 */
1227 1230
1228static struct packet_type arp_packet_type = { 1231static struct packet_type arp_packet_type __read_mostly = {
1229 .type = __constant_htons(ETH_P_ARP), 1232 .type = cpu_to_be16(ETH_P_ARP),
1230 .func = arp_rcv, 1233 .func = arp_rcv,
1231}; 1234};
1232 1235
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 309997edc8a5..126bb911880f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1075,6 +1075,14 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1075 } 1075 }
1076 } 1076 }
1077 ip_mc_up(in_dev); 1077 ip_mc_up(in_dev);
1078 /* fall through */
1079 case NETDEV_CHANGEADDR:
1080 if (IN_DEV_ARP_NOTIFY(in_dev))
1081 arp_send(ARPOP_REQUEST, ETH_P_ARP,
1082 in_dev->ifa_list->ifa_address,
1083 dev,
1084 in_dev->ifa_list->ifa_address,
1085 NULL, dev->dev_addr, NULL);
1078 break; 1086 break;
1079 case NETDEV_DOWN: 1087 case NETDEV_DOWN:
1080 ip_mc_down(in_dev); 1088 ip_mc_down(in_dev);
@@ -1208,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1208 kfree_skb(skb); 1216 kfree_skb(skb);
1209 goto errout; 1217 goto errout;
1210 } 1218 }
1211 err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); 1219 rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
1220 return;
1212errout: 1221errout:
1213 if (err < 0) 1222 if (err < 0)
1214 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); 1223 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
@@ -1439,6 +1448,7 @@ static struct devinet_sysctl_table {
1439 DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), 1448 DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
1440 DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), 1449 DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
1441 DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), 1450 DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
1451 DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
1442 1452
1443 DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), 1453 DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
1444 DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), 1454 DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 741e4fa3e474..cafcc49d0993 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
275 fib_res_put(&res); 275 fib_res_put(&res);
276 if (no_addr) 276 if (no_addr)
277 goto last_resort; 277 goto last_resort;
278 if (rpf) 278 if (rpf == 1)
279 goto e_inval; 279 goto e_inval;
280 fl.oif = dev->ifindex; 280 fl.oif = dev->ifindex;
281 281
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4817dea3bc73..f831df500907 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
322 kfree_skb(skb); 322 kfree_skb(skb);
323 goto errout; 323 goto errout;
324 } 324 }
325 err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, 325 rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
326 info->nlh, GFP_KERNEL); 326 info->nlh, GFP_KERNEL);
327 return;
327errout: 328errout:
328 if (err < 0) 329 if (err < 0)
329 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); 330 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fc562d29cc46..3f50807237e0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -375,6 +375,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
375 inet->tos = ip_hdr(skb)->tos; 375 inet->tos = ip_hdr(skb)->tos;
376 daddr = ipc.addr = rt->rt_src; 376 daddr = ipc.addr = rt->rt_src;
377 ipc.opt = NULL; 377 ipc.opt = NULL;
378 ipc.shtx.flags = 0;
378 if (icmp_param->replyopts.optlen) { 379 if (icmp_param->replyopts.optlen) {
379 ipc.opt = &icmp_param->replyopts; 380 ipc.opt = &icmp_param->replyopts;
380 if (ipc.opt->srr) 381 if (ipc.opt->srr)
@@ -532,6 +533,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
532 inet_sk(sk)->tos = tos; 533 inet_sk(sk)->tos = tos;
533 ipc.addr = iph->saddr; 534 ipc.addr = iph->saddr;
534 ipc.opt = &icmp_param.replyopts; 535 ipc.opt = &icmp_param.replyopts;
536 ipc.shtx.flags = 0;
535 537
536 { 538 {
537 struct flowi fl = { 539 struct flowi fl = {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f26ab38680de..22cd19ee44e5 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -93,24 +93,40 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
93 struct inet_bind_hashbucket *head; 93 struct inet_bind_hashbucket *head;
94 struct hlist_node *node; 94 struct hlist_node *node;
95 struct inet_bind_bucket *tb; 95 struct inet_bind_bucket *tb;
96 int ret; 96 int ret, attempts = 5;
97 struct net *net = sock_net(sk); 97 struct net *net = sock_net(sk);
98 int smallest_size = -1, smallest_rover;
98 99
99 local_bh_disable(); 100 local_bh_disable();
100 if (!snum) { 101 if (!snum) {
101 int remaining, rover, low, high; 102 int remaining, rover, low, high;
102 103
104again:
103 inet_get_local_port_range(&low, &high); 105 inet_get_local_port_range(&low, &high);
104 remaining = (high - low) + 1; 106 remaining = (high - low) + 1;
105 rover = net_random() % remaining + low; 107 smallest_rover = rover = net_random() % remaining + low;
106 108
109 smallest_size = -1;
107 do { 110 do {
108 head = &hashinfo->bhash[inet_bhashfn(net, rover, 111 head = &hashinfo->bhash[inet_bhashfn(net, rover,
109 hashinfo->bhash_size)]; 112 hashinfo->bhash_size)];
110 spin_lock(&head->lock); 113 spin_lock(&head->lock);
111 inet_bind_bucket_for_each(tb, node, &head->chain) 114 inet_bind_bucket_for_each(tb, node, &head->chain)
112 if (ib_net(tb) == net && tb->port == rover) 115 if (ib_net(tb) == net && tb->port == rover) {
116 if (tb->fastreuse > 0 &&
117 sk->sk_reuse &&
118 sk->sk_state != TCP_LISTEN &&
119 (tb->num_owners < smallest_size || smallest_size == -1)) {
120 smallest_size = tb->num_owners;
121 smallest_rover = rover;
122 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
123 spin_unlock(&head->lock);
124 snum = smallest_rover;
125 goto have_snum;
126 }
127 }
113 goto next; 128 goto next;
129 }
114 break; 130 break;
115 next: 131 next:
116 spin_unlock(&head->lock); 132 spin_unlock(&head->lock);
@@ -125,14 +141,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
125 * the top level, not from the 'break;' statement. 141 * the top level, not from the 'break;' statement.
126 */ 142 */
127 ret = 1; 143 ret = 1;
128 if (remaining <= 0) 144 if (remaining <= 0) {
145 if (smallest_size != -1) {
146 snum = smallest_rover;
147 goto have_snum;
148 }
129 goto fail; 149 goto fail;
130 150 }
131 /* OK, here is the one we will use. HEAD is 151 /* OK, here is the one we will use. HEAD is
132 * non-NULL and we hold it's mutex. 152 * non-NULL and we hold it's mutex.
133 */ 153 */
134 snum = rover; 154 snum = rover;
135 } else { 155 } else {
156have_snum:
136 head = &hashinfo->bhash[inet_bhashfn(net, snum, 157 head = &hashinfo->bhash[inet_bhashfn(net, snum,
137 hashinfo->bhash_size)]; 158 hashinfo->bhash_size)];
138 spin_lock(&head->lock); 159 spin_lock(&head->lock);
@@ -145,12 +166,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
145tb_found: 166tb_found:
146 if (!hlist_empty(&tb->owners)) { 167 if (!hlist_empty(&tb->owners)) {
147 if (tb->fastreuse > 0 && 168 if (tb->fastreuse > 0 &&
148 sk->sk_reuse && sk->sk_state != TCP_LISTEN) { 169 sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
170 smallest_size == -1) {
149 goto success; 171 goto success;
150 } else { 172 } else {
151 ret = 1; 173 ret = 1;
152 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) 174 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
175 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
176 smallest_size != -1 && --attempts >= 0) {
177 spin_unlock(&head->lock);
178 goto again;
179 }
153 goto fail_unlock; 180 goto fail_unlock;
181 }
154 } 182 }
155 } 183 }
156tb_not_found: 184tb_not_found:
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6c52e08f786e..eaf3e2c8646a 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
267 267
268struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, 268struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
269 struct inet_frags *f, void *key, unsigned int hash) 269 struct inet_frags *f, void *key, unsigned int hash)
270 __releases(&f->lock)
270{ 271{
271 struct inet_frag_queue *q; 272 struct inet_frag_queue *q;
272 struct hlist_node *n; 273 struct hlist_node *n;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6a1045da48d2..625cc5f64c94 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -38,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
38 write_pnet(&tb->ib_net, hold_net(net)); 38 write_pnet(&tb->ib_net, hold_net(net));
39 tb->port = snum; 39 tb->port = snum;
40 tb->fastreuse = 0; 40 tb->fastreuse = 0;
41 tb->num_owners = 0;
41 INIT_HLIST_HEAD(&tb->owners); 42 INIT_HLIST_HEAD(&tb->owners);
42 hlist_add_head(&tb->node, &head->chain); 43 hlist_add_head(&tb->node, &head->chain);
43 } 44 }
@@ -59,8 +60,13 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
59void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 60void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
60 const unsigned short snum) 61 const unsigned short snum)
61{ 62{
63 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
64
65 atomic_inc(&hashinfo->bsockets);
66
62 inet_sk(sk)->num = snum; 67 inet_sk(sk)->num = snum;
63 sk_add_bind_node(sk, &tb->owners); 68 sk_add_bind_node(sk, &tb->owners);
69 tb->num_owners++;
64 inet_csk(sk)->icsk_bind_hash = tb; 70 inet_csk(sk)->icsk_bind_hash = tb;
65} 71}
66 72
@@ -75,9 +81,12 @@ static void __inet_put_port(struct sock *sk)
75 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 81 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
76 struct inet_bind_bucket *tb; 82 struct inet_bind_bucket *tb;
77 83
84 atomic_dec(&hashinfo->bsockets);
85
78 spin_lock(&head->lock); 86 spin_lock(&head->lock);
79 tb = inet_csk(sk)->icsk_bind_hash; 87 tb = inet_csk(sk)->icsk_bind_hash;
80 __sk_del_bind_node(sk); 88 __sk_del_bind_node(sk);
89 tb->num_owners--;
81 inet_csk(sk)->icsk_bind_hash = NULL; 90 inet_csk(sk)->icsk_bind_hash = NULL;
82 inet_sk(sk)->num = 0; 91 inet_sk(sk)->num = 0;
83 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 92 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -444,9 +453,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
444 */ 453 */
445 inet_bind_bucket_for_each(tb, node, &head->chain) { 454 inet_bind_bucket_for_each(tb, node, &head->chain) {
446 if (ib_net(tb) == net && tb->port == port) { 455 if (ib_net(tb) == net && tb->port == port) {
447 WARN_ON(hlist_empty(&tb->owners));
448 if (tb->fastreuse >= 0) 456 if (tb->fastreuse >= 0)
449 goto next_port; 457 goto next_port;
458 WARN_ON(hlist_empty(&tb->owners));
450 if (!check_established(death_row, sk, 459 if (!check_established(death_row, sk,
451 port, &tw)) 460 port, &tw))
452 goto ok; 461 goto ok;
@@ -523,6 +532,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
523{ 532{
524 int i; 533 int i;
525 534
535 atomic_set(&h->bsockets, 0);
526 for (i = 0; i < INET_LHTABLE_SIZE; i++) { 536 for (i = 0; i < INET_LHTABLE_SIZE; i++) {
527 spin_lock_init(&h->listening_hash[i].lock); 537 spin_lock_init(&h->listening_hash[i].lock);
528 INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, 538 INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0101521f366b..e62510d5ea5a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -164,67 +164,124 @@ static DEFINE_RWLOCK(ipgre_lock);
164 164
165/* Given src, dst and key, find appropriate for input tunnel. */ 165/* Given src, dst and key, find appropriate for input tunnel. */
166 166
167static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net, 167static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
168 __be32 remote, __be32 local, 168 __be32 remote, __be32 local,
169 __be32 key, __be16 gre_proto) 169 __be32 key, __be16 gre_proto)
170{ 170{
171 struct net *net = dev_net(dev);
172 int link = dev->ifindex;
171 unsigned h0 = HASH(remote); 173 unsigned h0 = HASH(remote);
172 unsigned h1 = HASH(key); 174 unsigned h1 = HASH(key);
173 struct ip_tunnel *t; 175 struct ip_tunnel *t, *cand = NULL;
174 struct ip_tunnel *t2 = NULL;
175 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 176 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
176 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 177 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
177 ARPHRD_ETHER : ARPHRD_IPGRE; 178 ARPHRD_ETHER : ARPHRD_IPGRE;
179 int score, cand_score = 4;
178 180
179 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { 181 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
180 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { 182 if (local != t->parms.iph.saddr ||
181 if (t->parms.i_key == key && t->dev->flags & IFF_UP) { 183 remote != t->parms.iph.daddr ||
182 if (t->dev->type == dev_type) 184 key != t->parms.i_key ||
183 return t; 185 !(t->dev->flags & IFF_UP))
184 if (t->dev->type == ARPHRD_IPGRE && !t2) 186 continue;
185 t2 = t; 187
186 } 188 if (t->dev->type != ARPHRD_IPGRE &&
189 t->dev->type != dev_type)
190 continue;
191
192 score = 0;
193 if (t->parms.link != link)
194 score |= 1;
195 if (t->dev->type != dev_type)
196 score |= 2;
197 if (score == 0)
198 return t;
199
200 if (score < cand_score) {
201 cand = t;
202 cand_score = score;
187 } 203 }
188 } 204 }
189 205
190 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { 206 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
191 if (remote == t->parms.iph.daddr) { 207 if (remote != t->parms.iph.daddr ||
192 if (t->parms.i_key == key && t->dev->flags & IFF_UP) { 208 key != t->parms.i_key ||
193 if (t->dev->type == dev_type) 209 !(t->dev->flags & IFF_UP))
194 return t; 210 continue;
195 if (t->dev->type == ARPHRD_IPGRE && !t2) 211
196 t2 = t; 212 if (t->dev->type != ARPHRD_IPGRE &&
197 } 213 t->dev->type != dev_type)
214 continue;
215
216 score = 0;
217 if (t->parms.link != link)
218 score |= 1;
219 if (t->dev->type != dev_type)
220 score |= 2;
221 if (score == 0)
222 return t;
223
224 if (score < cand_score) {
225 cand = t;
226 cand_score = score;
198 } 227 }
199 } 228 }
200 229
201 for (t = ign->tunnels_l[h1]; t; t = t->next) { 230 for (t = ign->tunnels_l[h1]; t; t = t->next) {
202 if (local == t->parms.iph.saddr || 231 if ((local != t->parms.iph.saddr &&
203 (local == t->parms.iph.daddr && 232 (local != t->parms.iph.daddr ||
204 ipv4_is_multicast(local))) { 233 !ipv4_is_multicast(local))) ||
205 if (t->parms.i_key == key && t->dev->flags & IFF_UP) { 234 key != t->parms.i_key ||
206 if (t->dev->type == dev_type) 235 !(t->dev->flags & IFF_UP))
207 return t; 236 continue;
208 if (t->dev->type == ARPHRD_IPGRE && !t2) 237
209 t2 = t; 238 if (t->dev->type != ARPHRD_IPGRE &&
210 } 239 t->dev->type != dev_type)
240 continue;
241
242 score = 0;
243 if (t->parms.link != link)
244 score |= 1;
245 if (t->dev->type != dev_type)
246 score |= 2;
247 if (score == 0)
248 return t;
249
250 if (score < cand_score) {
251 cand = t;
252 cand_score = score;
211 } 253 }
212 } 254 }
213 255
214 for (t = ign->tunnels_wc[h1]; t; t = t->next) { 256 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
215 if (t->parms.i_key == key && t->dev->flags & IFF_UP) { 257 if (t->parms.i_key != key ||
216 if (t->dev->type == dev_type) 258 !(t->dev->flags & IFF_UP))
217 return t; 259 continue;
218 if (t->dev->type == ARPHRD_IPGRE && !t2) 260
219 t2 = t; 261 if (t->dev->type != ARPHRD_IPGRE &&
262 t->dev->type != dev_type)
263 continue;
264
265 score = 0;
266 if (t->parms.link != link)
267 score |= 1;
268 if (t->dev->type != dev_type)
269 score |= 2;
270 if (score == 0)
271 return t;
272
273 if (score < cand_score) {
274 cand = t;
275 cand_score = score;
220 } 276 }
221 } 277 }
222 278
223 if (t2) 279 if (cand != NULL)
224 return t2; 280 return cand;
225 281
226 if (ign->fb_tunnel_dev->flags&IFF_UP) 282 if (ign->fb_tunnel_dev->flags & IFF_UP)
227 return netdev_priv(ign->fb_tunnel_dev); 283 return netdev_priv(ign->fb_tunnel_dev);
284
228 return NULL; 285 return NULL;
229} 286}
230 287
@@ -284,6 +341,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
284 __be32 remote = parms->iph.daddr; 341 __be32 remote = parms->iph.daddr;
285 __be32 local = parms->iph.saddr; 342 __be32 local = parms->iph.saddr;
286 __be32 key = parms->i_key; 343 __be32 key = parms->i_key;
344 int link = parms->link;
287 struct ip_tunnel *t, **tp; 345 struct ip_tunnel *t, **tp;
288 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 346 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
289 347
@@ -291,6 +349,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
291 if (local == t->parms.iph.saddr && 349 if (local == t->parms.iph.saddr &&
292 remote == t->parms.iph.daddr && 350 remote == t->parms.iph.daddr &&
293 key == t->parms.i_key && 351 key == t->parms.i_key &&
352 link == t->parms.link &&
294 type == t->dev->type) 353 type == t->dev->type)
295 break; 354 break;
296 355
@@ -421,7 +480,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
421 } 480 }
422 481
423 read_lock(&ipgre_lock); 482 read_lock(&ipgre_lock);
424 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr, 483 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
425 flags & GRE_KEY ? 484 flags & GRE_KEY ?
426 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 485 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
427 p[1]); 486 p[1]);
@@ -432,7 +491,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
432 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 491 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
433 goto out; 492 goto out;
434 493
435 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) 494 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
436 t->err_count++; 495 t->err_count++;
437 else 496 else
438 t->err_count = 1; 497 t->err_count = 1;
@@ -518,7 +577,7 @@ static int ipgre_rcv(struct sk_buff *skb)
518 gre_proto = *(__be16 *)(h + 2); 577 gre_proto = *(__be16 *)(h + 2);
519 578
520 read_lock(&ipgre_lock); 579 read_lock(&ipgre_lock);
521 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), 580 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
522 iph->saddr, iph->daddr, key, 581 iph->saddr, iph->daddr, key,
523 gre_proto))) { 582 gre_proto))) {
524 struct net_device_stats *stats = &tunnel->dev->stats; 583 struct net_device_stats *stats = &tunnel->dev->stats;
@@ -744,7 +803,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
744#endif 803#endif
745 804
746 if (tunnel->err_count > 0) { 805 if (tunnel->err_count > 0) {
747 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { 806 if (time_before(jiffies,
807 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
748 tunnel->err_count--; 808 tunnel->err_count--;
749 809
750 dst_link_failure(skb); 810 dst_link_failure(skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8ebe86dd72af..3e7e910c7c0f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -935,6 +935,10 @@ alloc_new_skb:
935 sk->sk_allocation); 935 sk->sk_allocation);
936 if (unlikely(skb == NULL)) 936 if (unlikely(skb == NULL))
937 err = -ENOBUFS; 937 err = -ENOBUFS;
938 else
939 /* only the initial fragment is
940 time stamped */
941 ipc->shtx.flags = 0;
938 } 942 }
939 if (skb == NULL) 943 if (skb == NULL)
940 goto error; 944 goto error;
@@ -945,6 +949,7 @@ alloc_new_skb:
945 skb->ip_summed = csummode; 949 skb->ip_summed = csummode;
946 skb->csum = 0; 950 skb->csum = 0;
947 skb_reserve(skb, hh_len); 951 skb_reserve(skb, hh_len);
952 *skb_tx(skb) = ipc->shtx;
948 953
949 /* 954 /*
950 * Find where to start putting bytes. 955 * Find where to start putting bytes.
@@ -1364,6 +1369,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1364 1369
1365 daddr = ipc.addr = rt->rt_src; 1370 daddr = ipc.addr = rt->rt_src;
1366 ipc.opt = NULL; 1371 ipc.opt = NULL;
1372 ipc.shtx.flags = 0;
1367 1373
1368 if (replyopts.opt.optlen) { 1374 if (replyopts.opt.optlen) {
1369 ipc.opt = &replyopts.opt; 1375 ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d722013c1cae..90d22ae0a419 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -100,8 +100,8 @@
100#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers 100#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
101 - '3' from resolv.h */ 101 - '3' from resolv.h */
102 102
103#define NONE __constant_htonl(INADDR_NONE) 103#define NONE cpu_to_be32(INADDR_NONE)
104#define ANY __constant_htonl(INADDR_ANY) 104#define ANY cpu_to_be32(INADDR_ANY)
105 105
106/* 106/*
107 * Public IP configuration 107 * Public IP configuration
@@ -406,7 +406,7 @@ static int __init ic_defaults(void)
406static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); 406static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
407 407
408static struct packet_type rarp_packet_type __initdata = { 408static struct packet_type rarp_packet_type __initdata = {
409 .type = __constant_htons(ETH_P_RARP), 409 .type = cpu_to_be16(ETH_P_RARP),
410 .func = ic_rarp_recv, 410 .func = ic_rarp_recv,
411}; 411};
412 412
@@ -568,7 +568,7 @@ struct bootp_pkt { /* BOOTP packet format */
568static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); 568static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
569 569
570static struct packet_type bootp_packet_type __initdata = { 570static struct packet_type bootp_packet_type __initdata = {
571 .type = __constant_htons(ETH_P_IP), 571 .type = cpu_to_be16(ETH_P_IP),
572 .func = ic_bootp_recv, 572 .func = ic_bootp_recv,
573}; 573};
574 574
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 5079dfbc6f38..9054139795af 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -327,7 +327,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
327 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 327 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
328 goto out; 328 goto out;
329 329
330 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) 330 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
331 t->err_count++; 331 t->err_count++;
332 else 332 else
333 t->err_count = 1; 333 t->err_count = 1;
@@ -466,7 +466,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
466 } 466 }
467 467
468 if (tunnel->err_count > 0) { 468 if (tunnel->err_count > 0) {
469 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { 469 if (time_before(jiffies,
470 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
470 tunnel->err_count--; 471 tunnel->err_count--;
471 dst_link_failure(skb); 472 dst_link_failure(skb);
472 } else 473 } else
@@ -750,7 +751,7 @@ static struct xfrm_tunnel ipip_handler = {
750 .priority = 1, 751 .priority = 1,
751}; 752};
752 753
753static char banner[] __initdata = 754static const char banner[] __initconst =
754 KERN_INFO "IPv4 over IPv4 tunneling driver\n"; 755 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
755 756
756static void ipip_destroy_tunnels(struct ipip_net *ipn) 757static void ipip_destroy_tunnels(struct ipip_net *ipn)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 14666449dc1c..13e9dd3012b3 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -67,9 +67,6 @@
67#define CONFIG_IP_PIMSM 1 67#define CONFIG_IP_PIMSM 1
68#endif 68#endif
69 69
70static struct sock *mroute_socket;
71
72
73/* Big lock, protecting vif table, mrt cache and mroute socket state. 70/* Big lock, protecting vif table, mrt cache and mroute socket state.
74 Note that the changes are semaphored via rtnl_lock. 71 Note that the changes are semaphored via rtnl_lock.
75 */ 72 */
@@ -80,18 +77,9 @@ static DEFINE_RWLOCK(mrt_lock);
80 * Multicast router control variables 77 * Multicast router control variables
81 */ 78 */
82 79
83static struct vif_device vif_table[MAXVIFS]; /* Devices */ 80#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
84static int maxvif;
85
86#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
87
88static int mroute_do_assert; /* Set in PIM assert */
89static int mroute_do_pim;
90
91static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
92 81
93static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */ 82static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
94static atomic_t cache_resolve_queue_len; /* Size of unresolved */
95 83
96/* Special spinlock for queue of unresolved entries */ 84/* Special spinlock for queue of unresolved entries */
97static DEFINE_SPINLOCK(mfc_unres_lock); 85static DEFINE_SPINLOCK(mfc_unres_lock);
@@ -107,7 +95,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
107static struct kmem_cache *mrt_cachep __read_mostly; 95static struct kmem_cache *mrt_cachep __read_mostly;
108 96
109static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); 97static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
110static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); 98static int ipmr_cache_report(struct net *net,
99 struct sk_buff *pkt, vifi_t vifi, int assert);
111static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); 100static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
112 101
113#ifdef CONFIG_IP_PIMSM_V2 102#ifdef CONFIG_IP_PIMSM_V2
@@ -120,9 +109,11 @@ static struct timer_list ipmr_expire_timer;
120 109
121static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) 110static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
122{ 111{
112 struct net *net = dev_net(dev);
113
123 dev_close(dev); 114 dev_close(dev);
124 115
125 dev = __dev_get_by_name(&init_net, "tunl0"); 116 dev = __dev_get_by_name(net, "tunl0");
126 if (dev) { 117 if (dev) {
127 const struct net_device_ops *ops = dev->netdev_ops; 118 const struct net_device_ops *ops = dev->netdev_ops;
128 struct ifreq ifr; 119 struct ifreq ifr;
@@ -148,11 +139,11 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
148} 139}
149 140
150static 141static
151struct net_device *ipmr_new_tunnel(struct vifctl *v) 142struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
152{ 143{
153 struct net_device *dev; 144 struct net_device *dev;
154 145
155 dev = __dev_get_by_name(&init_net, "tunl0"); 146 dev = __dev_get_by_name(net, "tunl0");
156 147
157 if (dev) { 148 if (dev) {
158 const struct net_device_ops *ops = dev->netdev_ops; 149 const struct net_device_ops *ops = dev->netdev_ops;
@@ -181,7 +172,8 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
181 172
182 dev = NULL; 173 dev = NULL;
183 174
184 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { 175 if (err == 0 &&
176 (dev = __dev_get_by_name(net, p.name)) != NULL) {
185 dev->flags |= IFF_MULTICAST; 177 dev->flags |= IFF_MULTICAST;
186 178
187 in_dev = __in_dev_get_rtnl(dev); 179 in_dev = __in_dev_get_rtnl(dev);
@@ -209,14 +201,15 @@ failure:
209 201
210#ifdef CONFIG_IP_PIMSM 202#ifdef CONFIG_IP_PIMSM
211 203
212static int reg_vif_num = -1;
213
214static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) 204static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
215{ 205{
206 struct net *net = dev_net(dev);
207
216 read_lock(&mrt_lock); 208 read_lock(&mrt_lock);
217 dev->stats.tx_bytes += skb->len; 209 dev->stats.tx_bytes += skb->len;
218 dev->stats.tx_packets++; 210 dev->stats.tx_packets++;
219 ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); 211 ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num,
212 IGMPMSG_WHOLEPKT);
220 read_unlock(&mrt_lock); 213 read_unlock(&mrt_lock);
221 kfree_skb(skb); 214 kfree_skb(skb);
222 return 0; 215 return 0;
@@ -283,16 +276,16 @@ failure:
283 * @notify: Set to 1, if the caller is a notifier_call 276 * @notify: Set to 1, if the caller is a notifier_call
284 */ 277 */
285 278
286static int vif_delete(int vifi, int notify) 279static int vif_delete(struct net *net, int vifi, int notify)
287{ 280{
288 struct vif_device *v; 281 struct vif_device *v;
289 struct net_device *dev; 282 struct net_device *dev;
290 struct in_device *in_dev; 283 struct in_device *in_dev;
291 284
292 if (vifi < 0 || vifi >= maxvif) 285 if (vifi < 0 || vifi >= net->ipv4.maxvif)
293 return -EADDRNOTAVAIL; 286 return -EADDRNOTAVAIL;
294 287
295 v = &vif_table[vifi]; 288 v = &net->ipv4.vif_table[vifi];
296 289
297 write_lock_bh(&mrt_lock); 290 write_lock_bh(&mrt_lock);
298 dev = v->dev; 291 dev = v->dev;
@@ -304,17 +297,17 @@ static int vif_delete(int vifi, int notify)
304 } 297 }
305 298
306#ifdef CONFIG_IP_PIMSM 299#ifdef CONFIG_IP_PIMSM
307 if (vifi == reg_vif_num) 300 if (vifi == net->ipv4.mroute_reg_vif_num)
308 reg_vif_num = -1; 301 net->ipv4.mroute_reg_vif_num = -1;
309#endif 302#endif
310 303
311 if (vifi+1 == maxvif) { 304 if (vifi+1 == net->ipv4.maxvif) {
312 int tmp; 305 int tmp;
313 for (tmp=vifi-1; tmp>=0; tmp--) { 306 for (tmp=vifi-1; tmp>=0; tmp--) {
314 if (VIF_EXISTS(tmp)) 307 if (VIF_EXISTS(net, tmp))
315 break; 308 break;
316 } 309 }
317 maxvif = tmp+1; 310 net->ipv4.maxvif = tmp+1;
318 } 311 }
319 312
320 write_unlock_bh(&mrt_lock); 313 write_unlock_bh(&mrt_lock);
@@ -333,6 +326,12 @@ static int vif_delete(int vifi, int notify)
333 return 0; 326 return 0;
334} 327}
335 328
329static inline void ipmr_cache_free(struct mfc_cache *c)
330{
331 release_net(mfc_net(c));
332 kmem_cache_free(mrt_cachep, c);
333}
334
336/* Destroy an unresolved cache entry, killing queued skbs 335/* Destroy an unresolved cache entry, killing queued skbs
337 and reporting error to netlink readers. 336 and reporting error to netlink readers.
338 */ 337 */
@@ -341,8 +340,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
341{ 340{
342 struct sk_buff *skb; 341 struct sk_buff *skb;
343 struct nlmsgerr *e; 342 struct nlmsgerr *e;
343 struct net *net = mfc_net(c);
344 344
345 atomic_dec(&cache_resolve_queue_len); 345 atomic_dec(&net->ipv4.cache_resolve_queue_len);
346 346
347 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { 347 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
348 if (ip_hdr(skb)->version == 0) { 348 if (ip_hdr(skb)->version == 0) {
@@ -354,12 +354,12 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
354 e->error = -ETIMEDOUT; 354 e->error = -ETIMEDOUT;
355 memset(&e->msg, 0, sizeof(e->msg)); 355 memset(&e->msg, 0, sizeof(e->msg));
356 356
357 rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); 357 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
358 } else 358 } else
359 kfree_skb(skb); 359 kfree_skb(skb);
360 } 360 }
361 361
362 kmem_cache_free(mrt_cachep, c); 362 ipmr_cache_free(c);
363} 363}
364 364
365 365
@@ -376,7 +376,7 @@ static void ipmr_expire_process(unsigned long dummy)
376 return; 376 return;
377 } 377 }
378 378
379 if (atomic_read(&cache_resolve_queue_len) == 0) 379 if (mfc_unres_queue == NULL)
380 goto out; 380 goto out;
381 381
382 now = jiffies; 382 now = jiffies;
@@ -397,7 +397,7 @@ static void ipmr_expire_process(unsigned long dummy)
397 ipmr_destroy_unres(c); 397 ipmr_destroy_unres(c);
398 } 398 }
399 399
400 if (atomic_read(&cache_resolve_queue_len)) 400 if (mfc_unres_queue != NULL)
401 mod_timer(&ipmr_expire_timer, jiffies + expires); 401 mod_timer(&ipmr_expire_timer, jiffies + expires);
402 402
403out: 403out:
@@ -409,13 +409,15 @@ out:
409static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) 409static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
410{ 410{
411 int vifi; 411 int vifi;
412 struct net *net = mfc_net(cache);
412 413
413 cache->mfc_un.res.minvif = MAXVIFS; 414 cache->mfc_un.res.minvif = MAXVIFS;
414 cache->mfc_un.res.maxvif = 0; 415 cache->mfc_un.res.maxvif = 0;
415 memset(cache->mfc_un.res.ttls, 255, MAXVIFS); 416 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
416 417
417 for (vifi=0; vifi<maxvif; vifi++) { 418 for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) {
418 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) { 419 if (VIF_EXISTS(net, vifi) &&
420 ttls[vifi] && ttls[vifi] < 255) {
419 cache->mfc_un.res.ttls[vifi] = ttls[vifi]; 421 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
420 if (cache->mfc_un.res.minvif > vifi) 422 if (cache->mfc_un.res.minvif > vifi)
421 cache->mfc_un.res.minvif = vifi; 423 cache->mfc_un.res.minvif = vifi;
@@ -425,16 +427,16 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
425 } 427 }
426} 428}
427 429
428static int vif_add(struct vifctl *vifc, int mrtsock) 430static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
429{ 431{
430 int vifi = vifc->vifc_vifi; 432 int vifi = vifc->vifc_vifi;
431 struct vif_device *v = &vif_table[vifi]; 433 struct vif_device *v = &net->ipv4.vif_table[vifi];
432 struct net_device *dev; 434 struct net_device *dev;
433 struct in_device *in_dev; 435 struct in_device *in_dev;
434 int err; 436 int err;
435 437
436 /* Is vif busy ? */ 438 /* Is vif busy ? */
437 if (VIF_EXISTS(vifi)) 439 if (VIF_EXISTS(net, vifi))
438 return -EADDRINUSE; 440 return -EADDRINUSE;
439 441
440 switch (vifc->vifc_flags) { 442 switch (vifc->vifc_flags) {
@@ -444,7 +446,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
444 * Special Purpose VIF in PIM 446 * Special Purpose VIF in PIM
445 * All the packets will be sent to the daemon 447 * All the packets will be sent to the daemon
446 */ 448 */
447 if (reg_vif_num >= 0) 449 if (net->ipv4.mroute_reg_vif_num >= 0)
448 return -EADDRINUSE; 450 return -EADDRINUSE;
449 dev = ipmr_reg_vif(); 451 dev = ipmr_reg_vif();
450 if (!dev) 452 if (!dev)
@@ -458,7 +460,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
458 break; 460 break;
459#endif 461#endif
460 case VIFF_TUNNEL: 462 case VIFF_TUNNEL:
461 dev = ipmr_new_tunnel(vifc); 463 dev = ipmr_new_tunnel(net, vifc);
462 if (!dev) 464 if (!dev)
463 return -ENOBUFS; 465 return -ENOBUFS;
464 err = dev_set_allmulti(dev, 1); 466 err = dev_set_allmulti(dev, 1);
@@ -469,7 +471,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
469 } 471 }
470 break; 472 break;
471 case 0: 473 case 0:
472 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); 474 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
473 if (!dev) 475 if (!dev)
474 return -EADDRNOTAVAIL; 476 return -EADDRNOTAVAIL;
475 err = dev_set_allmulti(dev, 1); 477 err = dev_set_allmulti(dev, 1);
@@ -510,20 +512,22 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
510 v->dev = dev; 512 v->dev = dev;
511#ifdef CONFIG_IP_PIMSM 513#ifdef CONFIG_IP_PIMSM
512 if (v->flags&VIFF_REGISTER) 514 if (v->flags&VIFF_REGISTER)
513 reg_vif_num = vifi; 515 net->ipv4.mroute_reg_vif_num = vifi;
514#endif 516#endif
515 if (vifi+1 > maxvif) 517 if (vifi+1 > net->ipv4.maxvif)
516 maxvif = vifi+1; 518 net->ipv4.maxvif = vifi+1;
517 write_unlock_bh(&mrt_lock); 519 write_unlock_bh(&mrt_lock);
518 return 0; 520 return 0;
519} 521}
520 522
521static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) 523static struct mfc_cache *ipmr_cache_find(struct net *net,
524 __be32 origin,
525 __be32 mcastgrp)
522{ 526{
523 int line = MFC_HASH(mcastgrp, origin); 527 int line = MFC_HASH(mcastgrp, origin);
524 struct mfc_cache *c; 528 struct mfc_cache *c;
525 529
526 for (c=mfc_cache_array[line]; c; c = c->next) { 530 for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) {
527 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) 531 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
528 break; 532 break;
529 } 533 }
@@ -533,22 +537,24 @@ static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
533/* 537/*
534 * Allocate a multicast cache entry 538 * Allocate a multicast cache entry
535 */ 539 */
536static struct mfc_cache *ipmr_cache_alloc(void) 540static struct mfc_cache *ipmr_cache_alloc(struct net *net)
537{ 541{
538 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 542 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
539 if (c == NULL) 543 if (c == NULL)
540 return NULL; 544 return NULL;
541 c->mfc_un.res.minvif = MAXVIFS; 545 c->mfc_un.res.minvif = MAXVIFS;
546 mfc_net_set(c, net);
542 return c; 547 return c;
543} 548}
544 549
545static struct mfc_cache *ipmr_cache_alloc_unres(void) 550static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
546{ 551{
547 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 552 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
548 if (c == NULL) 553 if (c == NULL)
549 return NULL; 554 return NULL;
550 skb_queue_head_init(&c->mfc_un.unres.unresolved); 555 skb_queue_head_init(&c->mfc_un.unres.unresolved);
551 c->mfc_un.unres.expires = jiffies + 10*HZ; 556 c->mfc_un.unres.expires = jiffies + 10*HZ;
557 mfc_net_set(c, net);
552 return c; 558 return c;
553} 559}
554 560
@@ -581,7 +587,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
581 memset(&e->msg, 0, sizeof(e->msg)); 587 memset(&e->msg, 0, sizeof(e->msg));
582 } 588 }
583 589
584 rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); 590 rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid);
585 } else 591 } else
586 ip_mr_forward(skb, c, 0); 592 ip_mr_forward(skb, c, 0);
587 } 593 }
@@ -594,7 +600,8 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
594 * Called under mrt_lock. 600 * Called under mrt_lock.
595 */ 601 */
596 602
597static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) 603static int ipmr_cache_report(struct net *net,
604 struct sk_buff *pkt, vifi_t vifi, int assert)
598{ 605{
599 struct sk_buff *skb; 606 struct sk_buff *skb;
600 const int ihl = ip_hdrlen(pkt); 607 const int ihl = ip_hdrlen(pkt);
@@ -626,7 +633,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
626 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); 633 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
627 msg->im_msgtype = IGMPMSG_WHOLEPKT; 634 msg->im_msgtype = IGMPMSG_WHOLEPKT;
628 msg->im_mbz = 0; 635 msg->im_mbz = 0;
629 msg->im_vif = reg_vif_num; 636 msg->im_vif = net->ipv4.mroute_reg_vif_num;
630 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; 637 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
631 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + 638 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
632 sizeof(struct iphdr)); 639 sizeof(struct iphdr));
@@ -658,7 +665,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
658 skb->transport_header = skb->network_header; 665 skb->transport_header = skb->network_header;
659 } 666 }
660 667
661 if (mroute_socket == NULL) { 668 if (net->ipv4.mroute_sk == NULL) {
662 kfree_skb(skb); 669 kfree_skb(skb);
663 return -EINVAL; 670 return -EINVAL;
664 } 671 }
@@ -666,7 +673,8 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
666 /* 673 /*
667 * Deliver to mrouted 674 * Deliver to mrouted
668 */ 675 */
669 if ((ret = sock_queue_rcv_skb(mroute_socket, skb))<0) { 676 ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb);
677 if (ret < 0) {
670 if (net_ratelimit()) 678 if (net_ratelimit())
671 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 679 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
672 kfree_skb(skb); 680 kfree_skb(skb);
@@ -680,7 +688,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
680 */ 688 */
681 689
682static int 690static int
683ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) 691ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
684{ 692{
685 int err; 693 int err;
686 struct mfc_cache *c; 694 struct mfc_cache *c;
@@ -688,7 +696,8 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
688 696
689 spin_lock_bh(&mfc_unres_lock); 697 spin_lock_bh(&mfc_unres_lock);
690 for (c=mfc_unres_queue; c; c=c->next) { 698 for (c=mfc_unres_queue; c; c=c->next) {
691 if (c->mfc_mcastgrp == iph->daddr && 699 if (net_eq(mfc_net(c), net) &&
700 c->mfc_mcastgrp == iph->daddr &&
692 c->mfc_origin == iph->saddr) 701 c->mfc_origin == iph->saddr)
693 break; 702 break;
694 } 703 }
@@ -698,8 +707,8 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
698 * Create a new entry if allowable 707 * Create a new entry if allowable
699 */ 708 */
700 709
701 if (atomic_read(&cache_resolve_queue_len) >= 10 || 710 if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 ||
702 (c=ipmr_cache_alloc_unres())==NULL) { 711 (c = ipmr_cache_alloc_unres(net)) == NULL) {
703 spin_unlock_bh(&mfc_unres_lock); 712 spin_unlock_bh(&mfc_unres_lock);
704 713
705 kfree_skb(skb); 714 kfree_skb(skb);
@@ -716,18 +725,19 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
716 /* 725 /*
717 * Reflect first query at mrouted. 726 * Reflect first query at mrouted.
718 */ 727 */
719 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) { 728 err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE);
729 if (err < 0) {
720 /* If the report failed throw the cache entry 730 /* If the report failed throw the cache entry
721 out - Brad Parker 731 out - Brad Parker
722 */ 732 */
723 spin_unlock_bh(&mfc_unres_lock); 733 spin_unlock_bh(&mfc_unres_lock);
724 734
725 kmem_cache_free(mrt_cachep, c); 735 ipmr_cache_free(c);
726 kfree_skb(skb); 736 kfree_skb(skb);
727 return err; 737 return err;
728 } 738 }
729 739
730 atomic_inc(&cache_resolve_queue_len); 740 atomic_inc(&net->ipv4.cache_resolve_queue_len);
731 c->next = mfc_unres_queue; 741 c->next = mfc_unres_queue;
732 mfc_unres_queue = c; 742 mfc_unres_queue = c;
733 743
@@ -753,35 +763,37 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
753 * MFC cache manipulation by user space mroute daemon 763 * MFC cache manipulation by user space mroute daemon
754 */ 764 */
755 765
756static int ipmr_mfc_delete(struct mfcctl *mfc) 766static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
757{ 767{
758 int line; 768 int line;
759 struct mfc_cache *c, **cp; 769 struct mfc_cache *c, **cp;
760 770
761 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 771 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
762 772
763 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { 773 for (cp = &net->ipv4.mfc_cache_array[line];
774 (c = *cp) != NULL; cp = &c->next) {
764 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 775 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
765 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 776 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
766 write_lock_bh(&mrt_lock); 777 write_lock_bh(&mrt_lock);
767 *cp = c->next; 778 *cp = c->next;
768 write_unlock_bh(&mrt_lock); 779 write_unlock_bh(&mrt_lock);
769 780
770 kmem_cache_free(mrt_cachep, c); 781 ipmr_cache_free(c);
771 return 0; 782 return 0;
772 } 783 }
773 } 784 }
774 return -ENOENT; 785 return -ENOENT;
775} 786}
776 787
777static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) 788static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
778{ 789{
779 int line; 790 int line;
780 struct mfc_cache *uc, *c, **cp; 791 struct mfc_cache *uc, *c, **cp;
781 792
782 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 793 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
783 794
784 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { 795 for (cp = &net->ipv4.mfc_cache_array[line];
796 (c = *cp) != NULL; cp = &c->next) {
785 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 797 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
786 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) 798 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
787 break; 799 break;
@@ -800,7 +812,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
800 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) 812 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
801 return -EINVAL; 813 return -EINVAL;
802 814
803 c = ipmr_cache_alloc(); 815 c = ipmr_cache_alloc(net);
804 if (c == NULL) 816 if (c == NULL)
805 return -ENOMEM; 817 return -ENOMEM;
806 818
@@ -812,8 +824,8 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
812 c->mfc_flags |= MFC_STATIC; 824 c->mfc_flags |= MFC_STATIC;
813 825
814 write_lock_bh(&mrt_lock); 826 write_lock_bh(&mrt_lock);
815 c->next = mfc_cache_array[line]; 827 c->next = net->ipv4.mfc_cache_array[line];
816 mfc_cache_array[line] = c; 828 net->ipv4.mfc_cache_array[line] = c;
817 write_unlock_bh(&mrt_lock); 829 write_unlock_bh(&mrt_lock);
818 830
819 /* 831 /*
@@ -823,19 +835,21 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
823 spin_lock_bh(&mfc_unres_lock); 835 spin_lock_bh(&mfc_unres_lock);
824 for (cp = &mfc_unres_queue; (uc=*cp) != NULL; 836 for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
825 cp = &uc->next) { 837 cp = &uc->next) {
826 if (uc->mfc_origin == c->mfc_origin && 838 if (net_eq(mfc_net(uc), net) &&
839 uc->mfc_origin == c->mfc_origin &&
827 uc->mfc_mcastgrp == c->mfc_mcastgrp) { 840 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
828 *cp = uc->next; 841 *cp = uc->next;
829 if (atomic_dec_and_test(&cache_resolve_queue_len)) 842 atomic_dec(&net->ipv4.cache_resolve_queue_len);
830 del_timer(&ipmr_expire_timer);
831 break; 843 break;
832 } 844 }
833 } 845 }
846 if (mfc_unres_queue == NULL)
847 del_timer(&ipmr_expire_timer);
834 spin_unlock_bh(&mfc_unres_lock); 848 spin_unlock_bh(&mfc_unres_lock);
835 849
836 if (uc) { 850 if (uc) {
837 ipmr_cache_resolve(uc, c); 851 ipmr_cache_resolve(uc, c);
838 kmem_cache_free(mrt_cachep, uc); 852 ipmr_cache_free(uc);
839 } 853 }
840 return 0; 854 return 0;
841} 855}
@@ -844,16 +858,16 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
844 * Close the multicast socket, and clear the vif tables etc 858 * Close the multicast socket, and clear the vif tables etc
845 */ 859 */
846 860
847static void mroute_clean_tables(struct sock *sk) 861static void mroute_clean_tables(struct net *net)
848{ 862{
849 int i; 863 int i;
850 864
851 /* 865 /*
852 * Shut down all active vif entries 866 * Shut down all active vif entries
853 */ 867 */
854 for (i=0; i<maxvif; i++) { 868 for (i = 0; i < net->ipv4.maxvif; i++) {
855 if (!(vif_table[i].flags&VIFF_STATIC)) 869 if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
856 vif_delete(i, 0); 870 vif_delete(net, i, 0);
857 } 871 }
858 872
859 /* 873 /*
@@ -862,7 +876,7 @@ static void mroute_clean_tables(struct sock *sk)
862 for (i=0; i<MFC_LINES; i++) { 876 for (i=0; i<MFC_LINES; i++) {
863 struct mfc_cache *c, **cp; 877 struct mfc_cache *c, **cp;
864 878
865 cp = &mfc_cache_array[i]; 879 cp = &net->ipv4.mfc_cache_array[i];
866 while ((c = *cp) != NULL) { 880 while ((c = *cp) != NULL) {
867 if (c->mfc_flags&MFC_STATIC) { 881 if (c->mfc_flags&MFC_STATIC) {
868 cp = &c->next; 882 cp = &c->next;
@@ -872,22 +886,23 @@ static void mroute_clean_tables(struct sock *sk)
872 *cp = c->next; 886 *cp = c->next;
873 write_unlock_bh(&mrt_lock); 887 write_unlock_bh(&mrt_lock);
874 888
875 kmem_cache_free(mrt_cachep, c); 889 ipmr_cache_free(c);
876 } 890 }
877 } 891 }
878 892
879 if (atomic_read(&cache_resolve_queue_len) != 0) { 893 if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) {
880 struct mfc_cache *c; 894 struct mfc_cache *c, **cp;
881 895
882 spin_lock_bh(&mfc_unres_lock); 896 spin_lock_bh(&mfc_unres_lock);
883 while (mfc_unres_queue != NULL) { 897 cp = &mfc_unres_queue;
884 c = mfc_unres_queue; 898 while ((c = *cp) != NULL) {
885 mfc_unres_queue = c->next; 899 if (!net_eq(mfc_net(c), net)) {
886 spin_unlock_bh(&mfc_unres_lock); 900 cp = &c->next;
901 continue;
902 }
903 *cp = c->next;
887 904
888 ipmr_destroy_unres(c); 905 ipmr_destroy_unres(c);
889
890 spin_lock_bh(&mfc_unres_lock);
891 } 906 }
892 spin_unlock_bh(&mfc_unres_lock); 907 spin_unlock_bh(&mfc_unres_lock);
893 } 908 }
@@ -895,15 +910,17 @@ static void mroute_clean_tables(struct sock *sk)
895 910
896static void mrtsock_destruct(struct sock *sk) 911static void mrtsock_destruct(struct sock *sk)
897{ 912{
913 struct net *net = sock_net(sk);
914
898 rtnl_lock(); 915 rtnl_lock();
899 if (sk == mroute_socket) { 916 if (sk == net->ipv4.mroute_sk) {
900 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--; 917 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
901 918
902 write_lock_bh(&mrt_lock); 919 write_lock_bh(&mrt_lock);
903 mroute_socket = NULL; 920 net->ipv4.mroute_sk = NULL;
904 write_unlock_bh(&mrt_lock); 921 write_unlock_bh(&mrt_lock);
905 922
906 mroute_clean_tables(sk); 923 mroute_clean_tables(net);
907 } 924 }
908 rtnl_unlock(); 925 rtnl_unlock();
909} 926}
@@ -920,9 +937,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
920 int ret; 937 int ret;
921 struct vifctl vif; 938 struct vifctl vif;
922 struct mfcctl mfc; 939 struct mfcctl mfc;
940 struct net *net = sock_net(sk);
923 941
924 if (optname != MRT_INIT) { 942 if (optname != MRT_INIT) {
925 if (sk != mroute_socket && !capable(CAP_NET_ADMIN)) 943 if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
926 return -EACCES; 944 return -EACCES;
927 } 945 }
928 946
@@ -935,7 +953,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
935 return -ENOPROTOOPT; 953 return -ENOPROTOOPT;
936 954
937 rtnl_lock(); 955 rtnl_lock();
938 if (mroute_socket) { 956 if (net->ipv4.mroute_sk) {
939 rtnl_unlock(); 957 rtnl_unlock();
940 return -EADDRINUSE; 958 return -EADDRINUSE;
941 } 959 }
@@ -943,15 +961,15 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
943 ret = ip_ra_control(sk, 1, mrtsock_destruct); 961 ret = ip_ra_control(sk, 1, mrtsock_destruct);
944 if (ret == 0) { 962 if (ret == 0) {
945 write_lock_bh(&mrt_lock); 963 write_lock_bh(&mrt_lock);
946 mroute_socket = sk; 964 net->ipv4.mroute_sk = sk;
947 write_unlock_bh(&mrt_lock); 965 write_unlock_bh(&mrt_lock);
948 966
949 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++; 967 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
950 } 968 }
951 rtnl_unlock(); 969 rtnl_unlock();
952 return ret; 970 return ret;
953 case MRT_DONE: 971 case MRT_DONE:
954 if (sk != mroute_socket) 972 if (sk != net->ipv4.mroute_sk)
955 return -EACCES; 973 return -EACCES;
956 return ip_ra_control(sk, 0, NULL); 974 return ip_ra_control(sk, 0, NULL);
957 case MRT_ADD_VIF: 975 case MRT_ADD_VIF:
@@ -964,9 +982,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
964 return -ENFILE; 982 return -ENFILE;
965 rtnl_lock(); 983 rtnl_lock();
966 if (optname == MRT_ADD_VIF) { 984 if (optname == MRT_ADD_VIF) {
967 ret = vif_add(&vif, sk==mroute_socket); 985 ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
968 } else { 986 } else {
969 ret = vif_delete(vif.vifc_vifi, 0); 987 ret = vif_delete(net, vif.vifc_vifi, 0);
970 } 988 }
971 rtnl_unlock(); 989 rtnl_unlock();
972 return ret; 990 return ret;
@@ -983,9 +1001,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
983 return -EFAULT; 1001 return -EFAULT;
984 rtnl_lock(); 1002 rtnl_lock();
985 if (optname == MRT_DEL_MFC) 1003 if (optname == MRT_DEL_MFC)
986 ret = ipmr_mfc_delete(&mfc); 1004 ret = ipmr_mfc_delete(net, &mfc);
987 else 1005 else
988 ret = ipmr_mfc_add(&mfc, sk==mroute_socket); 1006 ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk);
989 rtnl_unlock(); 1007 rtnl_unlock();
990 return ret; 1008 return ret;
991 /* 1009 /*
@@ -996,7 +1014,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
996 int v; 1014 int v;
997 if (get_user(v,(int __user *)optval)) 1015 if (get_user(v,(int __user *)optval))
998 return -EFAULT; 1016 return -EFAULT;
999 mroute_do_assert=(v)?1:0; 1017 net->ipv4.mroute_do_assert = (v) ? 1 : 0;
1000 return 0; 1018 return 0;
1001 } 1019 }
1002#ifdef CONFIG_IP_PIMSM 1020#ifdef CONFIG_IP_PIMSM
@@ -1010,11 +1028,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
1010 1028
1011 rtnl_lock(); 1029 rtnl_lock();
1012 ret = 0; 1030 ret = 0;
1013 if (v != mroute_do_pim) { 1031 if (v != net->ipv4.mroute_do_pim) {
1014 mroute_do_pim = v; 1032 net->ipv4.mroute_do_pim = v;
1015 mroute_do_assert = v; 1033 net->ipv4.mroute_do_assert = v;
1016#ifdef CONFIG_IP_PIMSM_V2 1034#ifdef CONFIG_IP_PIMSM_V2
1017 if (mroute_do_pim) 1035 if (net->ipv4.mroute_do_pim)
1018 ret = inet_add_protocol(&pim_protocol, 1036 ret = inet_add_protocol(&pim_protocol,
1019 IPPROTO_PIM); 1037 IPPROTO_PIM);
1020 else 1038 else
@@ -1045,6 +1063,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1045{ 1063{
1046 int olr; 1064 int olr;
1047 int val; 1065 int val;
1066 struct net *net = sock_net(sk);
1048 1067
1049 if (optname != MRT_VERSION && 1068 if (optname != MRT_VERSION &&
1050#ifdef CONFIG_IP_PIMSM 1069#ifdef CONFIG_IP_PIMSM
@@ -1066,10 +1085,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1066 val = 0x0305; 1085 val = 0x0305;
1067#ifdef CONFIG_IP_PIMSM 1086#ifdef CONFIG_IP_PIMSM
1068 else if (optname == MRT_PIM) 1087 else if (optname == MRT_PIM)
1069 val = mroute_do_pim; 1088 val = net->ipv4.mroute_do_pim;
1070#endif 1089#endif
1071 else 1090 else
1072 val = mroute_do_assert; 1091 val = net->ipv4.mroute_do_assert;
1073 if (copy_to_user(optval, &val, olr)) 1092 if (copy_to_user(optval, &val, olr))
1074 return -EFAULT; 1093 return -EFAULT;
1075 return 0; 1094 return 0;
@@ -1085,16 +1104,17 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1085 struct sioc_vif_req vr; 1104 struct sioc_vif_req vr;
1086 struct vif_device *vif; 1105 struct vif_device *vif;
1087 struct mfc_cache *c; 1106 struct mfc_cache *c;
1107 struct net *net = sock_net(sk);
1088 1108
1089 switch (cmd) { 1109 switch (cmd) {
1090 case SIOCGETVIFCNT: 1110 case SIOCGETVIFCNT:
1091 if (copy_from_user(&vr, arg, sizeof(vr))) 1111 if (copy_from_user(&vr, arg, sizeof(vr)))
1092 return -EFAULT; 1112 return -EFAULT;
1093 if (vr.vifi >= maxvif) 1113 if (vr.vifi >= net->ipv4.maxvif)
1094 return -EINVAL; 1114 return -EINVAL;
1095 read_lock(&mrt_lock); 1115 read_lock(&mrt_lock);
1096 vif=&vif_table[vr.vifi]; 1116 vif = &net->ipv4.vif_table[vr.vifi];
1097 if (VIF_EXISTS(vr.vifi)) { 1117 if (VIF_EXISTS(net, vr.vifi)) {
1098 vr.icount = vif->pkt_in; 1118 vr.icount = vif->pkt_in;
1099 vr.ocount = vif->pkt_out; 1119 vr.ocount = vif->pkt_out;
1100 vr.ibytes = vif->bytes_in; 1120 vr.ibytes = vif->bytes_in;
@@ -1112,7 +1132,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1112 return -EFAULT; 1132 return -EFAULT;
1113 1133
1114 read_lock(&mrt_lock); 1134 read_lock(&mrt_lock);
1115 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); 1135 c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr);
1116 if (c) { 1136 if (c) {
1117 sr.pktcnt = c->mfc_un.res.pkt; 1137 sr.pktcnt = c->mfc_un.res.pkt;
1118 sr.bytecnt = c->mfc_un.res.bytes; 1138 sr.bytecnt = c->mfc_un.res.bytes;
@@ -1134,18 +1154,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1134static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) 1154static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1135{ 1155{
1136 struct net_device *dev = ptr; 1156 struct net_device *dev = ptr;
1157 struct net *net = dev_net(dev);
1137 struct vif_device *v; 1158 struct vif_device *v;
1138 int ct; 1159 int ct;
1139 1160
1140 if (!net_eq(dev_net(dev), &init_net)) 1161 if (!net_eq(dev_net(dev), net))
1141 return NOTIFY_DONE; 1162 return NOTIFY_DONE;
1142 1163
1143 if (event != NETDEV_UNREGISTER) 1164 if (event != NETDEV_UNREGISTER)
1144 return NOTIFY_DONE; 1165 return NOTIFY_DONE;
1145 v=&vif_table[0]; 1166 v = &net->ipv4.vif_table[0];
1146 for (ct=0; ct<maxvif; ct++,v++) { 1167 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
1147 if (v->dev == dev) 1168 if (v->dev == dev)
1148 vif_delete(ct, 1); 1169 vif_delete(net, ct, 1);
1149 } 1170 }
1150 return NOTIFY_DONE; 1171 return NOTIFY_DONE;
1151} 1172}
@@ -1205,8 +1226,9 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
1205 1226
1206static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) 1227static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1207{ 1228{
1229 struct net *net = mfc_net(c);
1208 const struct iphdr *iph = ip_hdr(skb); 1230 const struct iphdr *iph = ip_hdr(skb);
1209 struct vif_device *vif = &vif_table[vifi]; 1231 struct vif_device *vif = &net->ipv4.vif_table[vifi];
1210 struct net_device *dev; 1232 struct net_device *dev;
1211 struct rtable *rt; 1233 struct rtable *rt;
1212 int encap = 0; 1234 int encap = 0;
@@ -1220,9 +1242,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1220 vif->bytes_out += skb->len; 1242 vif->bytes_out += skb->len;
1221 vif->dev->stats.tx_bytes += skb->len; 1243 vif->dev->stats.tx_bytes += skb->len;
1222 vif->dev->stats.tx_packets++; 1244 vif->dev->stats.tx_packets++;
1223 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); 1245 ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT);
1224 kfree_skb(skb); 1246 goto out_free;
1225 return;
1226 } 1247 }
1227#endif 1248#endif
1228 1249
@@ -1233,7 +1254,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1233 .saddr = vif->local, 1254 .saddr = vif->local,
1234 .tos = RT_TOS(iph->tos) } }, 1255 .tos = RT_TOS(iph->tos) } },
1235 .proto = IPPROTO_IPIP }; 1256 .proto = IPPROTO_IPIP };
1236 if (ip_route_output_key(&init_net, &rt, &fl)) 1257 if (ip_route_output_key(net, &rt, &fl))
1237 goto out_free; 1258 goto out_free;
1238 encap = sizeof(struct iphdr); 1259 encap = sizeof(struct iphdr);
1239 } else { 1260 } else {
@@ -1242,7 +1263,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1242 { .daddr = iph->daddr, 1263 { .daddr = iph->daddr,
1243 .tos = RT_TOS(iph->tos) } }, 1264 .tos = RT_TOS(iph->tos) } },
1244 .proto = IPPROTO_IPIP }; 1265 .proto = IPPROTO_IPIP };
1245 if (ip_route_output_key(&init_net, &rt, &fl)) 1266 if (ip_route_output_key(net, &rt, &fl))
1246 goto out_free; 1267 goto out_free;
1247 } 1268 }
1248 1269
@@ -1306,9 +1327,10 @@ out_free:
1306 1327
1307static int ipmr_find_vif(struct net_device *dev) 1328static int ipmr_find_vif(struct net_device *dev)
1308{ 1329{
1330 struct net *net = dev_net(dev);
1309 int ct; 1331 int ct;
1310 for (ct=maxvif-1; ct>=0; ct--) { 1332 for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) {
1311 if (vif_table[ct].dev == dev) 1333 if (net->ipv4.vif_table[ct].dev == dev)
1312 break; 1334 break;
1313 } 1335 }
1314 return ct; 1336 return ct;
@@ -1320,6 +1342,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1320{ 1342{
1321 int psend = -1; 1343 int psend = -1;
1322 int vif, ct; 1344 int vif, ct;
1345 struct net *net = mfc_net(cache);
1323 1346
1324 vif = cache->mfc_parent; 1347 vif = cache->mfc_parent;
1325 cache->mfc_un.res.pkt++; 1348 cache->mfc_un.res.pkt++;
@@ -1328,7 +1351,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1328 /* 1351 /*
1329 * Wrong interface: drop packet and (maybe) send PIM assert. 1352 * Wrong interface: drop packet and (maybe) send PIM assert.
1330 */ 1353 */
1331 if (vif_table[vif].dev != skb->dev) { 1354 if (net->ipv4.vif_table[vif].dev != skb->dev) {
1332 int true_vifi; 1355 int true_vifi;
1333 1356
1334 if (skb->rtable->fl.iif == 0) { 1357 if (skb->rtable->fl.iif == 0) {
@@ -1349,23 +1372,24 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1349 cache->mfc_un.res.wrong_if++; 1372 cache->mfc_un.res.wrong_if++;
1350 true_vifi = ipmr_find_vif(skb->dev); 1373 true_vifi = ipmr_find_vif(skb->dev);
1351 1374
1352 if (true_vifi >= 0 && mroute_do_assert && 1375 if (true_vifi >= 0 && net->ipv4.mroute_do_assert &&
1353 /* pimsm uses asserts, when switching from RPT to SPT, 1376 /* pimsm uses asserts, when switching from RPT to SPT,
1354 so that we cannot check that packet arrived on an oif. 1377 so that we cannot check that packet arrived on an oif.
1355 It is bad, but otherwise we would need to move pretty 1378 It is bad, but otherwise we would need to move pretty
1356 large chunk of pimd to kernel. Ough... --ANK 1379 large chunk of pimd to kernel. Ough... --ANK
1357 */ 1380 */
1358 (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && 1381 (net->ipv4.mroute_do_pim ||
1382 cache->mfc_un.res.ttls[true_vifi] < 255) &&
1359 time_after(jiffies, 1383 time_after(jiffies,
1360 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { 1384 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1361 cache->mfc_un.res.last_assert = jiffies; 1385 cache->mfc_un.res.last_assert = jiffies;
1362 ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); 1386 ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF);
1363 } 1387 }
1364 goto dont_forward; 1388 goto dont_forward;
1365 } 1389 }
1366 1390
1367 vif_table[vif].pkt_in++; 1391 net->ipv4.vif_table[vif].pkt_in++;
1368 vif_table[vif].bytes_in += skb->len; 1392 net->ipv4.vif_table[vif].bytes_in += skb->len;
1369 1393
1370 /* 1394 /*
1371 * Forward the frame 1395 * Forward the frame
@@ -1405,6 +1429,7 @@ dont_forward:
1405int ip_mr_input(struct sk_buff *skb) 1429int ip_mr_input(struct sk_buff *skb)
1406{ 1430{
1407 struct mfc_cache *cache; 1431 struct mfc_cache *cache;
1432 struct net *net = dev_net(skb->dev);
1408 int local = skb->rtable->rt_flags&RTCF_LOCAL; 1433 int local = skb->rtable->rt_flags&RTCF_LOCAL;
1409 1434
1410 /* Packet is looped back after forward, it should not be 1435 /* Packet is looped back after forward, it should not be
@@ -1425,9 +1450,9 @@ int ip_mr_input(struct sk_buff *skb)
1425 that we can forward NO IGMP messages. 1450 that we can forward NO IGMP messages.
1426 */ 1451 */
1427 read_lock(&mrt_lock); 1452 read_lock(&mrt_lock);
1428 if (mroute_socket) { 1453 if (net->ipv4.mroute_sk) {
1429 nf_reset(skb); 1454 nf_reset(skb);
1430 raw_rcv(mroute_socket, skb); 1455 raw_rcv(net->ipv4.mroute_sk, skb);
1431 read_unlock(&mrt_lock); 1456 read_unlock(&mrt_lock);
1432 return 0; 1457 return 0;
1433 } 1458 }
@@ -1436,7 +1461,7 @@ int ip_mr_input(struct sk_buff *skb)
1436 } 1461 }
1437 1462
1438 read_lock(&mrt_lock); 1463 read_lock(&mrt_lock);
1439 cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1464 cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1440 1465
1441 /* 1466 /*
1442 * No usable cache entry 1467 * No usable cache entry
@@ -1456,7 +1481,7 @@ int ip_mr_input(struct sk_buff *skb)
1456 1481
1457 vif = ipmr_find_vif(skb->dev); 1482 vif = ipmr_find_vif(skb->dev);
1458 if (vif >= 0) { 1483 if (vif >= 0) {
1459 int err = ipmr_cache_unresolved(vif, skb); 1484 int err = ipmr_cache_unresolved(net, vif, skb);
1460 read_unlock(&mrt_lock); 1485 read_unlock(&mrt_lock);
1461 1486
1462 return err; 1487 return err;
@@ -1487,6 +1512,7 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1487{ 1512{
1488 struct net_device *reg_dev = NULL; 1513 struct net_device *reg_dev = NULL;
1489 struct iphdr *encap; 1514 struct iphdr *encap;
1515 struct net *net = dev_net(skb->dev);
1490 1516
1491 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1517 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1492 /* 1518 /*
@@ -1501,8 +1527,8 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1501 return 1; 1527 return 1;
1502 1528
1503 read_lock(&mrt_lock); 1529 read_lock(&mrt_lock);
1504 if (reg_vif_num >= 0) 1530 if (net->ipv4.mroute_reg_vif_num >= 0)
1505 reg_dev = vif_table[reg_vif_num].dev; 1531 reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev;
1506 if (reg_dev) 1532 if (reg_dev)
1507 dev_hold(reg_dev); 1533 dev_hold(reg_dev);
1508 read_unlock(&mrt_lock); 1534 read_unlock(&mrt_lock);
@@ -1537,13 +1563,14 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1537int pim_rcv_v1(struct sk_buff * skb) 1563int pim_rcv_v1(struct sk_buff * skb)
1538{ 1564{
1539 struct igmphdr *pim; 1565 struct igmphdr *pim;
1566 struct net *net = dev_net(skb->dev);
1540 1567
1541 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) 1568 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1542 goto drop; 1569 goto drop;
1543 1570
1544 pim = igmp_hdr(skb); 1571 pim = igmp_hdr(skb);
1545 1572
1546 if (!mroute_do_pim || 1573 if (!net->ipv4.mroute_do_pim ||
1547 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1574 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1548 goto drop; 1575 goto drop;
1549 1576
@@ -1583,7 +1610,8 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1583{ 1610{
1584 int ct; 1611 int ct;
1585 struct rtnexthop *nhp; 1612 struct rtnexthop *nhp;
1586 struct net_device *dev = vif_table[c->mfc_parent].dev; 1613 struct net *net = mfc_net(c);
1614 struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
1587 u8 *b = skb_tail_pointer(skb); 1615 u8 *b = skb_tail_pointer(skb);
1588 struct rtattr *mp_head; 1616 struct rtattr *mp_head;
1589 1617
@@ -1599,7 +1627,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1599 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 1627 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1600 nhp->rtnh_flags = 0; 1628 nhp->rtnh_flags = 0;
1601 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 1629 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1602 nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; 1630 nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex;
1603 nhp->rtnh_len = sizeof(*nhp); 1631 nhp->rtnh_len = sizeof(*nhp);
1604 } 1632 }
1605 } 1633 }
@@ -1613,14 +1641,15 @@ rtattr_failure:
1613 return -EMSGSIZE; 1641 return -EMSGSIZE;
1614} 1642}
1615 1643
1616int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) 1644int ipmr_get_route(struct net *net,
1645 struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1617{ 1646{
1618 int err; 1647 int err;
1619 struct mfc_cache *cache; 1648 struct mfc_cache *cache;
1620 struct rtable *rt = skb->rtable; 1649 struct rtable *rt = skb->rtable;
1621 1650
1622 read_lock(&mrt_lock); 1651 read_lock(&mrt_lock);
1623 cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); 1652 cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst);
1624 1653
1625 if (cache == NULL) { 1654 if (cache == NULL) {
1626 struct sk_buff *skb2; 1655 struct sk_buff *skb2;
@@ -1651,7 +1680,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1651 iph->saddr = rt->rt_src; 1680 iph->saddr = rt->rt_src;
1652 iph->daddr = rt->rt_dst; 1681 iph->daddr = rt->rt_dst;
1653 iph->version = 0; 1682 iph->version = 0;
1654 err = ipmr_cache_unresolved(vif, skb2); 1683 err = ipmr_cache_unresolved(net, vif, skb2);
1655 read_unlock(&mrt_lock); 1684 read_unlock(&mrt_lock);
1656 return err; 1685 return err;
1657 } 1686 }
@@ -1668,17 +1697,19 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1668 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 1697 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1669 */ 1698 */
1670struct ipmr_vif_iter { 1699struct ipmr_vif_iter {
1700 struct seq_net_private p;
1671 int ct; 1701 int ct;
1672}; 1702};
1673 1703
1674static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, 1704static struct vif_device *ipmr_vif_seq_idx(struct net *net,
1705 struct ipmr_vif_iter *iter,
1675 loff_t pos) 1706 loff_t pos)
1676{ 1707{
1677 for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { 1708 for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) {
1678 if (!VIF_EXISTS(iter->ct)) 1709 if (!VIF_EXISTS(net, iter->ct))
1679 continue; 1710 continue;
1680 if (pos-- == 0) 1711 if (pos-- == 0)
1681 return &vif_table[iter->ct]; 1712 return &net->ipv4.vif_table[iter->ct];
1682 } 1713 }
1683 return NULL; 1714 return NULL;
1684} 1715}
@@ -1686,23 +1717,26 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1686static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) 1717static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1687 __acquires(mrt_lock) 1718 __acquires(mrt_lock)
1688{ 1719{
1720 struct net *net = seq_file_net(seq);
1721
1689 read_lock(&mrt_lock); 1722 read_lock(&mrt_lock);
1690 return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) 1723 return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
1691 : SEQ_START_TOKEN; 1724 : SEQ_START_TOKEN;
1692} 1725}
1693 1726
1694static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1727static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1695{ 1728{
1696 struct ipmr_vif_iter *iter = seq->private; 1729 struct ipmr_vif_iter *iter = seq->private;
1730 struct net *net = seq_file_net(seq);
1697 1731
1698 ++*pos; 1732 ++*pos;
1699 if (v == SEQ_START_TOKEN) 1733 if (v == SEQ_START_TOKEN)
1700 return ipmr_vif_seq_idx(iter, 0); 1734 return ipmr_vif_seq_idx(net, iter, 0);
1701 1735
1702 while (++iter->ct < maxvif) { 1736 while (++iter->ct < net->ipv4.maxvif) {
1703 if (!VIF_EXISTS(iter->ct)) 1737 if (!VIF_EXISTS(net, iter->ct))
1704 continue; 1738 continue;
1705 return &vif_table[iter->ct]; 1739 return &net->ipv4.vif_table[iter->ct];
1706 } 1740 }
1707 return NULL; 1741 return NULL;
1708} 1742}
@@ -1715,6 +1749,8 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1715 1749
1716static int ipmr_vif_seq_show(struct seq_file *seq, void *v) 1750static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1717{ 1751{
1752 struct net *net = seq_file_net(seq);
1753
1718 if (v == SEQ_START_TOKEN) { 1754 if (v == SEQ_START_TOKEN) {
1719 seq_puts(seq, 1755 seq_puts(seq,
1720 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); 1756 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
@@ -1724,7 +1760,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1724 1760
1725 seq_printf(seq, 1761 seq_printf(seq,
1726 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", 1762 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1727 vif - vif_table, 1763 vif - net->ipv4.vif_table,
1728 name, vif->bytes_in, vif->pkt_in, 1764 name, vif->bytes_in, vif->pkt_in,
1729 vif->bytes_out, vif->pkt_out, 1765 vif->bytes_out, vif->pkt_out,
1730 vif->flags, vif->local, vif->remote); 1766 vif->flags, vif->local, vif->remote);
@@ -1741,8 +1777,8 @@ static const struct seq_operations ipmr_vif_seq_ops = {
1741 1777
1742static int ipmr_vif_open(struct inode *inode, struct file *file) 1778static int ipmr_vif_open(struct inode *inode, struct file *file)
1743{ 1779{
1744 return seq_open_private(file, &ipmr_vif_seq_ops, 1780 return seq_open_net(inode, file, &ipmr_vif_seq_ops,
1745 sizeof(struct ipmr_vif_iter)); 1781 sizeof(struct ipmr_vif_iter));
1746} 1782}
1747 1783
1748static const struct file_operations ipmr_vif_fops = { 1784static const struct file_operations ipmr_vif_fops = {
@@ -1750,23 +1786,26 @@ static const struct file_operations ipmr_vif_fops = {
1750 .open = ipmr_vif_open, 1786 .open = ipmr_vif_open,
1751 .read = seq_read, 1787 .read = seq_read,
1752 .llseek = seq_lseek, 1788 .llseek = seq_lseek,
1753 .release = seq_release_private, 1789 .release = seq_release_net,
1754}; 1790};
1755 1791
1756struct ipmr_mfc_iter { 1792struct ipmr_mfc_iter {
1793 struct seq_net_private p;
1757 struct mfc_cache **cache; 1794 struct mfc_cache **cache;
1758 int ct; 1795 int ct;
1759}; 1796};
1760 1797
1761 1798
1762static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) 1799static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
1800 struct ipmr_mfc_iter *it, loff_t pos)
1763{ 1801{
1764 struct mfc_cache *mfc; 1802 struct mfc_cache *mfc;
1765 1803
1766 it->cache = mfc_cache_array; 1804 it->cache = net->ipv4.mfc_cache_array;
1767 read_lock(&mrt_lock); 1805 read_lock(&mrt_lock);
1768 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 1806 for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1769 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) 1807 for (mfc = net->ipv4.mfc_cache_array[it->ct];
1808 mfc; mfc = mfc->next)
1770 if (pos-- == 0) 1809 if (pos-- == 0)
1771 return mfc; 1810 return mfc;
1772 read_unlock(&mrt_lock); 1811 read_unlock(&mrt_lock);
@@ -1774,7 +1813,8 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1774 it->cache = &mfc_unres_queue; 1813 it->cache = &mfc_unres_queue;
1775 spin_lock_bh(&mfc_unres_lock); 1814 spin_lock_bh(&mfc_unres_lock);
1776 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) 1815 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1777 if (pos-- == 0) 1816 if (net_eq(mfc_net(mfc), net) &&
1817 pos-- == 0)
1778 return mfc; 1818 return mfc;
1779 spin_unlock_bh(&mfc_unres_lock); 1819 spin_unlock_bh(&mfc_unres_lock);
1780 1820
@@ -1786,9 +1826,11 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1786static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) 1826static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1787{ 1827{
1788 struct ipmr_mfc_iter *it = seq->private; 1828 struct ipmr_mfc_iter *it = seq->private;
1829 struct net *net = seq_file_net(seq);
1830
1789 it->cache = NULL; 1831 it->cache = NULL;
1790 it->ct = 0; 1832 it->ct = 0;
1791 return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) 1833 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
1792 : SEQ_START_TOKEN; 1834 : SEQ_START_TOKEN;
1793} 1835}
1794 1836
@@ -1796,11 +1838,12 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1796{ 1838{
1797 struct mfc_cache *mfc = v; 1839 struct mfc_cache *mfc = v;
1798 struct ipmr_mfc_iter *it = seq->private; 1840 struct ipmr_mfc_iter *it = seq->private;
1841 struct net *net = seq_file_net(seq);
1799 1842
1800 ++*pos; 1843 ++*pos;
1801 1844
1802 if (v == SEQ_START_TOKEN) 1845 if (v == SEQ_START_TOKEN)
1803 return ipmr_mfc_seq_idx(seq->private, 0); 1846 return ipmr_mfc_seq_idx(net, seq->private, 0);
1804 1847
1805 if (mfc->next) 1848 if (mfc->next)
1806 return mfc->next; 1849 return mfc->next;
@@ -1808,10 +1851,10 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1808 if (it->cache == &mfc_unres_queue) 1851 if (it->cache == &mfc_unres_queue)
1809 goto end_of_list; 1852 goto end_of_list;
1810 1853
1811 BUG_ON(it->cache != mfc_cache_array); 1854 BUG_ON(it->cache != net->ipv4.mfc_cache_array);
1812 1855
1813 while (++it->ct < MFC_LINES) { 1856 while (++it->ct < MFC_LINES) {
1814 mfc = mfc_cache_array[it->ct]; 1857 mfc = net->ipv4.mfc_cache_array[it->ct];
1815 if (mfc) 1858 if (mfc)
1816 return mfc; 1859 return mfc;
1817 } 1860 }
@@ -1823,6 +1866,8 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1823 1866
1824 spin_lock_bh(&mfc_unres_lock); 1867 spin_lock_bh(&mfc_unres_lock);
1825 mfc = mfc_unres_queue; 1868 mfc = mfc_unres_queue;
1869 while (mfc && !net_eq(mfc_net(mfc), net))
1870 mfc = mfc->next;
1826 if (mfc) 1871 if (mfc)
1827 return mfc; 1872 return mfc;
1828 1873
@@ -1836,16 +1881,18 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1836static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) 1881static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1837{ 1882{
1838 struct ipmr_mfc_iter *it = seq->private; 1883 struct ipmr_mfc_iter *it = seq->private;
1884 struct net *net = seq_file_net(seq);
1839 1885
1840 if (it->cache == &mfc_unres_queue) 1886 if (it->cache == &mfc_unres_queue)
1841 spin_unlock_bh(&mfc_unres_lock); 1887 spin_unlock_bh(&mfc_unres_lock);
1842 else if (it->cache == mfc_cache_array) 1888 else if (it->cache == net->ipv4.mfc_cache_array)
1843 read_unlock(&mrt_lock); 1889 read_unlock(&mrt_lock);
1844} 1890}
1845 1891
1846static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 1892static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1847{ 1893{
1848 int n; 1894 int n;
1895 struct net *net = seq_file_net(seq);
1849 1896
1850 if (v == SEQ_START_TOKEN) { 1897 if (v == SEQ_START_TOKEN) {
1851 seq_puts(seq, 1898 seq_puts(seq,
@@ -1866,9 +1913,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1866 mfc->mfc_un.res.wrong_if); 1913 mfc->mfc_un.res.wrong_if);
1867 for (n = mfc->mfc_un.res.minvif; 1914 for (n = mfc->mfc_un.res.minvif;
1868 n < mfc->mfc_un.res.maxvif; n++ ) { 1915 n < mfc->mfc_un.res.maxvif; n++ ) {
1869 if (VIF_EXISTS(n) 1916 if (VIF_EXISTS(net, n) &&
1870 && mfc->mfc_un.res.ttls[n] < 255) 1917 mfc->mfc_un.res.ttls[n] < 255)
1871 seq_printf(seq, 1918 seq_printf(seq,
1872 " %2d:%-3d", 1919 " %2d:%-3d",
1873 n, mfc->mfc_un.res.ttls[n]); 1920 n, mfc->mfc_un.res.ttls[n]);
1874 } 1921 }
@@ -1892,8 +1939,8 @@ static const struct seq_operations ipmr_mfc_seq_ops = {
1892 1939
1893static int ipmr_mfc_open(struct inode *inode, struct file *file) 1940static int ipmr_mfc_open(struct inode *inode, struct file *file)
1894{ 1941{
1895 return seq_open_private(file, &ipmr_mfc_seq_ops, 1942 return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
1896 sizeof(struct ipmr_mfc_iter)); 1943 sizeof(struct ipmr_mfc_iter));
1897} 1944}
1898 1945
1899static const struct file_operations ipmr_mfc_fops = { 1946static const struct file_operations ipmr_mfc_fops = {
@@ -1901,7 +1948,7 @@ static const struct file_operations ipmr_mfc_fops = {
1901 .open = ipmr_mfc_open, 1948 .open = ipmr_mfc_open,
1902 .read = seq_read, 1949 .read = seq_read,
1903 .llseek = seq_lseek, 1950 .llseek = seq_lseek,
1904 .release = seq_release_private, 1951 .release = seq_release_net,
1905}; 1952};
1906#endif 1953#endif
1907 1954
@@ -1915,6 +1962,65 @@ static struct net_protocol pim_protocol = {
1915/* 1962/*
1916 * Setup for IP multicast routing 1963 * Setup for IP multicast routing
1917 */ 1964 */
1965static int __net_init ipmr_net_init(struct net *net)
1966{
1967 int err = 0;
1968
1969 net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
1970 GFP_KERNEL);
1971 if (!net->ipv4.vif_table) {
1972 err = -ENOMEM;
1973 goto fail;
1974 }
1975
1976 /* Forwarding cache */
1977 net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
1978 sizeof(struct mfc_cache *),
1979 GFP_KERNEL);
1980 if (!net->ipv4.mfc_cache_array) {
1981 err = -ENOMEM;
1982 goto fail_mfc_cache;
1983 }
1984
1985#ifdef CONFIG_IP_PIMSM
1986 net->ipv4.mroute_reg_vif_num = -1;
1987#endif
1988
1989#ifdef CONFIG_PROC_FS
1990 err = -ENOMEM;
1991 if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
1992 goto proc_vif_fail;
1993 if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1994 goto proc_cache_fail;
1995#endif
1996 return 0;
1997
1998#ifdef CONFIG_PROC_FS
1999proc_cache_fail:
2000 proc_net_remove(net, "ip_mr_vif");
2001proc_vif_fail:
2002 kfree(net->ipv4.mfc_cache_array);
2003#endif
2004fail_mfc_cache:
2005 kfree(net->ipv4.vif_table);
2006fail:
2007 return err;
2008}
2009
2010static void __net_exit ipmr_net_exit(struct net *net)
2011{
2012#ifdef CONFIG_PROC_FS
2013 proc_net_remove(net, "ip_mr_cache");
2014 proc_net_remove(net, "ip_mr_vif");
2015#endif
2016 kfree(net->ipv4.mfc_cache_array);
2017 kfree(net->ipv4.vif_table);
2018}
2019
2020static struct pernet_operations ipmr_net_ops = {
2021 .init = ipmr_net_init,
2022 .exit = ipmr_net_exit,
2023};
1918 2024
1919int __init ip_mr_init(void) 2025int __init ip_mr_init(void)
1920{ 2026{
@@ -1927,26 +2033,20 @@ int __init ip_mr_init(void)
1927 if (!mrt_cachep) 2033 if (!mrt_cachep)
1928 return -ENOMEM; 2034 return -ENOMEM;
1929 2035
2036 err = register_pernet_subsys(&ipmr_net_ops);
2037 if (err)
2038 goto reg_pernet_fail;
2039
1930 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); 2040 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1931 err = register_netdevice_notifier(&ip_mr_notifier); 2041 err = register_netdevice_notifier(&ip_mr_notifier);
1932 if (err) 2042 if (err)
1933 goto reg_notif_fail; 2043 goto reg_notif_fail;
1934#ifdef CONFIG_PROC_FS
1935 err = -ENOMEM;
1936 if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1937 goto proc_vif_fail;
1938 if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1939 goto proc_cache_fail;
1940#endif
1941 return 0; 2044 return 0;
1942#ifdef CONFIG_PROC_FS 2045
1943proc_cache_fail:
1944 proc_net_remove(&init_net, "ip_mr_vif");
1945proc_vif_fail:
1946 unregister_netdevice_notifier(&ip_mr_notifier);
1947#endif
1948reg_notif_fail: 2046reg_notif_fail:
1949 del_timer(&ipmr_expire_timer); 2047 del_timer(&ipmr_expire_timer);
2048 unregister_pernet_subsys(&ipmr_net_ops);
2049reg_pernet_fail:
1950 kmem_cache_destroy(mrt_cachep); 2050 kmem_cache_destroy(mrt_cachep);
1951 return err; 2051 return err;
1952} 2052}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 3816e1dc9295..1833bdbf9805 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -31,7 +31,7 @@ config NF_CONNTRACK_PROC_COMPAT
31 default y 31 default y
32 help 32 help
33 This option enables /proc and sysctl compatibility with the old 33 This option enables /proc and sysctl compatibility with the old
34 layer 3 dependant connection tracking. This is needed to keep 34 layer 3 dependent connection tracking. This is needed to keep
35 old programs that have not been adapted to the new names working. 35 old programs that have not been adapted to the new names working.
36 36
37 If unsure, say Y. 37 If unsure, say Y.
@@ -95,11 +95,11 @@ config IP_NF_MATCH_ECN
95config IP_NF_MATCH_TTL 95config IP_NF_MATCH_TTL
96 tristate '"ttl" match support' 96 tristate '"ttl" match support'
97 depends on NETFILTER_ADVANCED 97 depends on NETFILTER_ADVANCED
98 help 98 select NETFILTER_XT_MATCH_HL
99 This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user 99 ---help---
100 to match packets by their TTL value. 100 This is a backwards-compat option for the user's convenience
101 101 (e.g. when running oldconfig). It selects
102 To compile it as a module, choose M here. If unsure, say N. 102 CONFIG_NETFILTER_XT_MATCH_HL.
103 103
104# `filter', generic and specific targets 104# `filter', generic and specific targets
105config IP_NF_FILTER 105config IP_NF_FILTER
@@ -323,19 +323,13 @@ config IP_NF_TARGET_ECN
323 To compile it as a module, choose M here. If unsure, say N. 323 To compile it as a module, choose M here. If unsure, say N.
324 324
325config IP_NF_TARGET_TTL 325config IP_NF_TARGET_TTL
326 tristate 'TTL target support' 326 tristate '"TTL" target support'
327 depends on IP_NF_MANGLE
328 depends on NETFILTER_ADVANCED 327 depends on NETFILTER_ADVANCED
329 help 328 select NETFILTER_XT_TARGET_HL
330 This option adds a `TTL' target, which enables the user to modify 329 ---help---
331 the TTL value of the IP header. 330 This is a backwards-compat option for the user's convenience
332 331 (e.g. when running oldconfig). It selects
333 While it is safe to decrement/lower the TTL, this target also enables 332 CONFIG_NETFILTER_XT_TARGET_HL.
334 functionality to increment and set the TTL value of the IP header to
335 arbitrary values. This is EXTREMELY DANGEROUS since you can easily
336 create immortal packets that loop forever on the network.
337
338 To compile it as a module, choose M here. If unsure, say N.
339 333
340# raw + specific targets 334# raw + specific targets
341config IP_NF_RAW 335config IP_NF_RAW
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 5f9b650d90fc..48111594ee9b 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -51,7 +51,6 @@ obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o 51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
54obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
55 54
56# targets 55# targets
57obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 56obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
@@ -61,7 +60,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
61obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o 60obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
62obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o 61obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
63obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 62obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
64obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
65obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 63obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
66 64
67# generic ARP tables 65# generic ARP tables
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 7ea88b61cb0d..84b9c179df51 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -73,6 +73,40 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
73 return (ret != 0); 73 return (ret != 0);
74} 74}
75 75
76/*
77 * Unfortunatly, _b and _mask are not aligned to an int (or long int)
78 * Some arches dont care, unrolling the loop is a win on them.
79 * For other arches, we only have a 16bit alignement.
80 */
81static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask)
82{
83#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
84 const unsigned long *a = (const unsigned long *)_a;
85 const unsigned long *b = (const unsigned long *)_b;
86 const unsigned long *mask = (const unsigned long *)_mask;
87 unsigned long ret;
88
89 ret = (a[0] ^ b[0]) & mask[0];
90 if (IFNAMSIZ > sizeof(unsigned long))
91 ret |= (a[1] ^ b[1]) & mask[1];
92 if (IFNAMSIZ > 2 * sizeof(unsigned long))
93 ret |= (a[2] ^ b[2]) & mask[2];
94 if (IFNAMSIZ > 3 * sizeof(unsigned long))
95 ret |= (a[3] ^ b[3]) & mask[3];
96 BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));
97#else
98 unsigned long ret = 0;
99 const u16 *a = (const u16 *)_a;
100 const u16 *b = (const u16 *)_b;
101 const u16 *mask = (const u16 *)_mask;
102 int i;
103
104 for (i = 0; i < IFNAMSIZ/sizeof(u16); i++)
105 ret |= (a[i] ^ b[i]) & mask[i];
106#endif
107 return ret;
108}
109
76/* Returns whether packet matches rule or not. */ 110/* Returns whether packet matches rule or not. */
77static inline int arp_packet_match(const struct arphdr *arphdr, 111static inline int arp_packet_match(const struct arphdr *arphdr,
78 struct net_device *dev, 112 struct net_device *dev,
@@ -83,7 +117,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
83 const char *arpptr = (char *)(arphdr + 1); 117 const char *arpptr = (char *)(arphdr + 1);
84 const char *src_devaddr, *tgt_devaddr; 118 const char *src_devaddr, *tgt_devaddr;
85 __be32 src_ipaddr, tgt_ipaddr; 119 __be32 src_ipaddr, tgt_ipaddr;
86 int i, ret; 120 long ret;
87 121
88#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) 122#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
89 123
@@ -156,10 +190,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
156 } 190 }
157 191
158 /* Look for ifname matches. */ 192 /* Look for ifname matches. */
159 for (i = 0, ret = 0; i < IFNAMSIZ; i++) { 193 ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
160 ret |= (indev[i] ^ arpinfo->iniface[i])
161 & arpinfo->iniface_mask[i];
162 }
163 194
164 if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { 195 if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
165 dprintf("VIA in mismatch (%s vs %s).%s\n", 196 dprintf("VIA in mismatch (%s vs %s).%s\n",
@@ -168,10 +199,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
168 return 0; 199 return 0;
169 } 200 }
170 201
171 for (i = 0, ret = 0; i < IFNAMSIZ; i++) { 202 ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
172 ret |= (outdev[i] ^ arpinfo->outiface[i])
173 & arpinfo->outiface_mask[i];
174 }
175 203
176 if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { 204 if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
177 dprintf("VIA out mismatch (%s vs %s).%s\n", 205 dprintf("VIA out mismatch (%s vs %s).%s\n",
@@ -221,7 +249,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
221 const struct net_device *out, 249 const struct net_device *out,
222 struct xt_table *table) 250 struct xt_table *table)
223{ 251{
224 static const char nulldevname[IFNAMSIZ]; 252 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
225 unsigned int verdict = NF_DROP; 253 unsigned int verdict = NF_DROP;
226 const struct arphdr *arp; 254 const struct arphdr *arp;
227 bool hotdrop = false; 255 bool hotdrop = false;
@@ -237,9 +265,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
237 indev = in ? in->name : nulldevname; 265 indev = in ? in->name : nulldevname;
238 outdev = out ? out->name : nulldevname; 266 outdev = out ? out->name : nulldevname;
239 267
240 read_lock_bh(&table->lock); 268 rcu_read_lock();
241 private = table->private; 269 private = rcu_dereference(table->private);
242 table_base = (void *)private->entries[smp_processor_id()]; 270 table_base = rcu_dereference(private->entries[smp_processor_id()]);
271
243 e = get_entry(table_base, private->hook_entry[hook]); 272 e = get_entry(table_base, private->hook_entry[hook]);
244 back = get_entry(table_base, private->underflow[hook]); 273 back = get_entry(table_base, private->underflow[hook]);
245 274
@@ -311,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
311 e = (void *)e + e->next_offset; 340 e = (void *)e + e->next_offset;
312 } 341 }
313 } while (!hotdrop); 342 } while (!hotdrop);
314 read_unlock_bh(&table->lock); 343
344 rcu_read_unlock();
315 345
316 if (hotdrop) 346 if (hotdrop)
317 return NF_DROP; 347 return NF_DROP;
@@ -714,11 +744,65 @@ static void get_counters(const struct xt_table_info *t,
714 } 744 }
715} 745}
716 746
717static inline struct xt_counters *alloc_counters(struct xt_table *table) 747
748/* We're lazy, and add to the first CPU; overflow works its fey magic
749 * and everything is OK. */
750static int
751add_counter_to_entry(struct arpt_entry *e,
752 const struct xt_counters addme[],
753 unsigned int *i)
754{
755 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
756
757 (*i)++;
758 return 0;
759}
760
761/* Take values from counters and add them back onto the current cpu */
762static void put_counters(struct xt_table_info *t,
763 const struct xt_counters counters[])
764{
765 unsigned int i, cpu;
766
767 local_bh_disable();
768 cpu = smp_processor_id();
769 i = 0;
770 ARPT_ENTRY_ITERATE(t->entries[cpu],
771 t->size,
772 add_counter_to_entry,
773 counters,
774 &i);
775 local_bh_enable();
776}
777
778static inline int
779zero_entry_counter(struct arpt_entry *e, void *arg)
780{
781 e->counters.bcnt = 0;
782 e->counters.pcnt = 0;
783 return 0;
784}
785
786static void
787clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
788{
789 unsigned int cpu;
790 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
791
792 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
793 for_each_possible_cpu(cpu) {
794 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
795 ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
796 zero_entry_counter, NULL);
797 }
798}
799
800static struct xt_counters *alloc_counters(struct xt_table *table)
718{ 801{
719 unsigned int countersize; 802 unsigned int countersize;
720 struct xt_counters *counters; 803 struct xt_counters *counters;
721 const struct xt_table_info *private = table->private; 804 struct xt_table_info *private = table->private;
805 struct xt_table_info *info;
722 806
723 /* We need atomic snapshot of counters: rest doesn't change 807 /* We need atomic snapshot of counters: rest doesn't change
724 * (other than comefrom, which userspace doesn't care 808 * (other than comefrom, which userspace doesn't care
@@ -728,14 +812,30 @@ static inline struct xt_counters *alloc_counters(struct xt_table *table)
728 counters = vmalloc_node(countersize, numa_node_id()); 812 counters = vmalloc_node(countersize, numa_node_id());
729 813
730 if (counters == NULL) 814 if (counters == NULL)
731 return ERR_PTR(-ENOMEM); 815 goto nomem;
816
817 info = xt_alloc_table_info(private->size);
818 if (!info)
819 goto free_counters;
820
821 clone_counters(info, private);
822
823 mutex_lock(&table->lock);
824 xt_table_entry_swap_rcu(private, info);
825 synchronize_net(); /* Wait until smoke has cleared */
826
827 get_counters(info, counters);
828 put_counters(private, counters);
829 mutex_unlock(&table->lock);
732 830
733 /* First, sum counters... */ 831 xt_free_table_info(info);
734 write_lock_bh(&table->lock);
735 get_counters(private, counters);
736 write_unlock_bh(&table->lock);
737 832
738 return counters; 833 return counters;
834
835 free_counters:
836 vfree(counters);
837 nomem:
838 return ERR_PTR(-ENOMEM);
739} 839}
740 840
741static int copy_entries_to_user(unsigned int total_size, 841static int copy_entries_to_user(unsigned int total_size,
@@ -1075,20 +1175,6 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1075 return ret; 1175 return ret;
1076} 1176}
1077 1177
1078/* We're lazy, and add to the first CPU; overflow works its fey magic
1079 * and everything is OK.
1080 */
1081static inline int add_counter_to_entry(struct arpt_entry *e,
1082 const struct xt_counters addme[],
1083 unsigned int *i)
1084{
1085
1086 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1087
1088 (*i)++;
1089 return 0;
1090}
1091
1092static int do_add_counters(struct net *net, void __user *user, unsigned int len, 1178static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1093 int compat) 1179 int compat)
1094{ 1180{
@@ -1148,13 +1234,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1148 goto free; 1234 goto free;
1149 } 1235 }
1150 1236
1151 write_lock_bh(&t->lock); 1237 mutex_lock(&t->lock);
1152 private = t->private; 1238 private = t->private;
1153 if (private->number != num_counters) { 1239 if (private->number != num_counters) {
1154 ret = -EINVAL; 1240 ret = -EINVAL;
1155 goto unlock_up_free; 1241 goto unlock_up_free;
1156 } 1242 }
1157 1243
1244 preempt_disable();
1158 i = 0; 1245 i = 0;
1159 /* Choose the copy that is on our node */ 1246 /* Choose the copy that is on our node */
1160 loc_cpu_entry = private->entries[smp_processor_id()]; 1247 loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1163,8 +1250,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1163 add_counter_to_entry, 1250 add_counter_to_entry,
1164 paddc, 1251 paddc,
1165 &i); 1252 &i);
1253 preempt_enable();
1166 unlock_up_free: 1254 unlock_up_free:
1167 write_unlock_bh(&t->lock); 1255 mutex_unlock(&t->lock);
1256
1168 xt_table_unlock(t); 1257 xt_table_unlock(t);
1169 module_put(t->me); 1258 module_put(t->me);
1170 free: 1259 free:
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index e091187e864f..6ecfdae7c589 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -48,8 +48,6 @@ static struct
48static struct xt_table packet_filter = { 48static struct xt_table packet_filter = {
49 .name = "filter", 49 .name = "filter",
50 .valid_hooks = FILTER_VALID_HOOKS, 50 .valid_hooks = FILTER_VALID_HOOKS,
51 .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
52 .private = NULL,
53 .me = THIS_MODULE, 51 .me = THIS_MODULE,
54 .af = NFPROTO_ARP, 52 .af = NFPROTO_ARP,
55}; 53};
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 432ce9d1c11c..5f22c91c6e15 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -24,6 +24,7 @@
24#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/net.h>
27#include <linux/mutex.h> 28#include <linux/mutex.h>
28#include <net/net_namespace.h> 29#include <net/net_namespace.h>
29#include <net/sock.h> 30#include <net/sock.h>
@@ -640,6 +641,7 @@ static void __exit ip_queue_fini(void)
640MODULE_DESCRIPTION("IPv4 packet queue handler"); 641MODULE_DESCRIPTION("IPv4 packet queue handler");
641MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 642MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
642MODULE_LICENSE("GPL"); 643MODULE_LICENSE("GPL");
644MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL);
643 645
644module_init(ip_queue_init); 646module_init(ip_queue_init);
645module_exit(ip_queue_fini); 647module_exit(ip_queue_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ef8b6ca068b2..e5294aec967d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -74,6 +74,25 @@ do { \
74 74
75 Hence the start of any table is given by get_table() below. */ 75 Hence the start of any table is given by get_table() below. */
76 76
77static unsigned long ifname_compare(const char *_a, const char *_b,
78 const unsigned char *_mask)
79{
80 const unsigned long *a = (const unsigned long *)_a;
81 const unsigned long *b = (const unsigned long *)_b;
82 const unsigned long *mask = (const unsigned long *)_mask;
83 unsigned long ret;
84
85 ret = (a[0] ^ b[0]) & mask[0];
86 if (IFNAMSIZ > sizeof(unsigned long))
87 ret |= (a[1] ^ b[1]) & mask[1];
88 if (IFNAMSIZ > 2 * sizeof(unsigned long))
89 ret |= (a[2] ^ b[2]) & mask[2];
90 if (IFNAMSIZ > 3 * sizeof(unsigned long))
91 ret |= (a[3] ^ b[3]) & mask[3];
92 BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));
93 return ret;
94}
95
77/* Returns whether matches rule or not. */ 96/* Returns whether matches rule or not. */
78/* Performance critical - called for every packet */ 97/* Performance critical - called for every packet */
79static inline bool 98static inline bool
@@ -83,7 +102,6 @@ ip_packet_match(const struct iphdr *ip,
83 const struct ipt_ip *ipinfo, 102 const struct ipt_ip *ipinfo,
84 int isfrag) 103 int isfrag)
85{ 104{
86 size_t i;
87 unsigned long ret; 105 unsigned long ret;
88 106
89#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) 107#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
@@ -103,12 +121,7 @@ ip_packet_match(const struct iphdr *ip,
103 return false; 121 return false;
104 } 122 }
105 123
106 /* Look for ifname matches; this should unroll nicely. */ 124 ret = ifname_compare(indev, ipinfo->iniface, ipinfo->iniface_mask);
107 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
108 ret |= (((const unsigned long *)indev)[i]
109 ^ ((const unsigned long *)ipinfo->iniface)[i])
110 & ((const unsigned long *)ipinfo->iniface_mask)[i];
111 }
112 125
113 if (FWINV(ret != 0, IPT_INV_VIA_IN)) { 126 if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
114 dprintf("VIA in mismatch (%s vs %s).%s\n", 127 dprintf("VIA in mismatch (%s vs %s).%s\n",
@@ -117,11 +130,7 @@ ip_packet_match(const struct iphdr *ip,
117 return false; 130 return false;
118 } 131 }
119 132
120 for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { 133 ret = ifname_compare(outdev, ipinfo->outiface, ipinfo->outiface_mask);
121 ret |= (((const unsigned long *)outdev)[i]
122 ^ ((const unsigned long *)ipinfo->outiface)[i])
123 & ((const unsigned long *)ipinfo->outiface_mask)[i];
124 }
125 134
126 if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { 135 if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
127 dprintf("VIA out mismatch (%s vs %s).%s\n", 136 dprintf("VIA out mismatch (%s vs %s).%s\n",
@@ -347,10 +356,12 @@ ipt_do_table(struct sk_buff *skb,
347 mtpar.family = tgpar.family = NFPROTO_IPV4; 356 mtpar.family = tgpar.family = NFPROTO_IPV4;
348 tgpar.hooknum = hook; 357 tgpar.hooknum = hook;
349 358
350 read_lock_bh(&table->lock);
351 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 359 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
352 private = table->private; 360
353 table_base = (void *)private->entries[smp_processor_id()]; 361 rcu_read_lock();
362 private = rcu_dereference(table->private);
363 table_base = rcu_dereference(private->entries[smp_processor_id()]);
364
354 e = get_entry(table_base, private->hook_entry[hook]); 365 e = get_entry(table_base, private->hook_entry[hook]);
355 366
356 /* For return from builtin chain */ 367 /* For return from builtin chain */
@@ -445,7 +456,7 @@ ipt_do_table(struct sk_buff *skb,
445 } 456 }
446 } while (!hotdrop); 457 } while (!hotdrop);
447 458
448 read_unlock_bh(&table->lock); 459 rcu_read_unlock();
449 460
450#ifdef DEBUG_ALLOW_ALL 461#ifdef DEBUG_ALLOW_ALL
451 return NF_ACCEPT; 462 return NF_ACCEPT;
@@ -924,13 +935,68 @@ get_counters(const struct xt_table_info *t,
924 counters, 935 counters,
925 &i); 936 &i);
926 } 937 }
938
939}
940
941/* We're lazy, and add to the first CPU; overflow works its fey magic
942 * and everything is OK. */
943static int
944add_counter_to_entry(struct ipt_entry *e,
945 const struct xt_counters addme[],
946 unsigned int *i)
947{
948 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
949
950 (*i)++;
951 return 0;
952}
953
954/* Take values from counters and add them back onto the current cpu */
955static void put_counters(struct xt_table_info *t,
956 const struct xt_counters counters[])
957{
958 unsigned int i, cpu;
959
960 local_bh_disable();
961 cpu = smp_processor_id();
962 i = 0;
963 IPT_ENTRY_ITERATE(t->entries[cpu],
964 t->size,
965 add_counter_to_entry,
966 counters,
967 &i);
968 local_bh_enable();
969}
970
971
972static inline int
973zero_entry_counter(struct ipt_entry *e, void *arg)
974{
975 e->counters.bcnt = 0;
976 e->counters.pcnt = 0;
977 return 0;
978}
979
980static void
981clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
982{
983 unsigned int cpu;
984 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
985
986 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
987 for_each_possible_cpu(cpu) {
988 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
989 IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
990 zero_entry_counter, NULL);
991 }
927} 992}
928 993
929static struct xt_counters * alloc_counters(struct xt_table *table) 994static struct xt_counters * alloc_counters(struct xt_table *table)
930{ 995{
931 unsigned int countersize; 996 unsigned int countersize;
932 struct xt_counters *counters; 997 struct xt_counters *counters;
933 const struct xt_table_info *private = table->private; 998 struct xt_table_info *private = table->private;
999 struct xt_table_info *info;
934 1000
935 /* We need atomic snapshot of counters: rest doesn't change 1001 /* We need atomic snapshot of counters: rest doesn't change
936 (other than comefrom, which userspace doesn't care 1002 (other than comefrom, which userspace doesn't care
@@ -939,14 +1005,30 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
939 counters = vmalloc_node(countersize, numa_node_id()); 1005 counters = vmalloc_node(countersize, numa_node_id());
940 1006
941 if (counters == NULL) 1007 if (counters == NULL)
942 return ERR_PTR(-ENOMEM); 1008 goto nomem;
1009
1010 info = xt_alloc_table_info(private->size);
1011 if (!info)
1012 goto free_counters;
943 1013
944 /* First, sum counters... */ 1014 clone_counters(info, private);
945 write_lock_bh(&table->lock); 1015
946 get_counters(private, counters); 1016 mutex_lock(&table->lock);
947 write_unlock_bh(&table->lock); 1017 xt_table_entry_swap_rcu(private, info);
1018 synchronize_net(); /* Wait until smoke has cleared */
1019
1020 get_counters(info, counters);
1021 put_counters(private, counters);
1022 mutex_unlock(&table->lock);
1023
1024 xt_free_table_info(info);
948 1025
949 return counters; 1026 return counters;
1027
1028 free_counters:
1029 vfree(counters);
1030 nomem:
1031 return ERR_PTR(-ENOMEM);
950} 1032}
951 1033
952static int 1034static int
@@ -1312,27 +1394,6 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1312 return ret; 1394 return ret;
1313} 1395}
1314 1396
1315/* We're lazy, and add to the first CPU; overflow works its fey magic
1316 * and everything is OK. */
1317static int
1318add_counter_to_entry(struct ipt_entry *e,
1319 const struct xt_counters addme[],
1320 unsigned int *i)
1321{
1322#if 0
1323 duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
1324 *i,
1325 (long unsigned int)e->counters.pcnt,
1326 (long unsigned int)e->counters.bcnt,
1327 (long unsigned int)addme[*i].pcnt,
1328 (long unsigned int)addme[*i].bcnt);
1329#endif
1330
1331 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1332
1333 (*i)++;
1334 return 0;
1335}
1336 1397
1337static int 1398static int
1338do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) 1399do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1393,13 +1454,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1393 goto free; 1454 goto free;
1394 } 1455 }
1395 1456
1396 write_lock_bh(&t->lock); 1457 mutex_lock(&t->lock);
1397 private = t->private; 1458 private = t->private;
1398 if (private->number != num_counters) { 1459 if (private->number != num_counters) {
1399 ret = -EINVAL; 1460 ret = -EINVAL;
1400 goto unlock_up_free; 1461 goto unlock_up_free;
1401 } 1462 }
1402 1463
1464 preempt_disable();
1403 i = 0; 1465 i = 0;
1404 /* Choose the copy that is on our node */ 1466 /* Choose the copy that is on our node */
1405 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1467 loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1470,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1408 add_counter_to_entry, 1470 add_counter_to_entry,
1409 paddc, 1471 paddc,
1410 &i); 1472 &i);
1473 preempt_enable();
1411 unlock_up_free: 1474 unlock_up_free:
1412 write_unlock_bh(&t->lock); 1475 mutex_unlock(&t->lock);
1413 xt_table_unlock(t); 1476 xt_table_unlock(t);
1414 module_put(t->me); 1477 module_put(t->me);
1415 free: 1478 free:
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 27a78fbbd92b..acc44c69eb68 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -464,7 +464,7 @@ static struct xt_target log_tg_reg __read_mostly = {
464 .me = THIS_MODULE, 464 .me = THIS_MODULE,
465}; 465};
466 466
467static const struct nf_logger ipt_log_logger ={ 467static struct nf_logger ipt_log_logger __read_mostly = {
468 .name = "ipt_LOG", 468 .name = "ipt_LOG",
469 .logfn = &ipt_log_packet, 469 .logfn = &ipt_log_packet,
470 .me = THIS_MODULE, 470 .me = THIS_MODULE,
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
deleted file mode 100644
index 6d76aae90cc0..000000000000
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ /dev/null
@@ -1,97 +0,0 @@
1/* TTL modification target for IP tables
2 * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <net/checksum.h>
14
15#include <linux/netfilter/x_tables.h>
16#include <linux/netfilter_ipv4/ipt_TTL.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target");
20MODULE_LICENSE("GPL");
21
22static unsigned int
23ttl_tg(struct sk_buff *skb, const struct xt_target_param *par)
24{
25 struct iphdr *iph;
26 const struct ipt_TTL_info *info = par->targinfo;
27 int new_ttl;
28
29 if (!skb_make_writable(skb, skb->len))
30 return NF_DROP;
31
32 iph = ip_hdr(skb);
33
34 switch (info->mode) {
35 case IPT_TTL_SET:
36 new_ttl = info->ttl;
37 break;
38 case IPT_TTL_INC:
39 new_ttl = iph->ttl + info->ttl;
40 if (new_ttl > 255)
41 new_ttl = 255;
42 break;
43 case IPT_TTL_DEC:
44 new_ttl = iph->ttl - info->ttl;
45 if (new_ttl < 0)
46 new_ttl = 0;
47 break;
48 default:
49 new_ttl = iph->ttl;
50 break;
51 }
52
53 if (new_ttl != iph->ttl) {
54 csum_replace2(&iph->check, htons(iph->ttl << 8),
55 htons(new_ttl << 8));
56 iph->ttl = new_ttl;
57 }
58
59 return XT_CONTINUE;
60}
61
62static bool ttl_tg_check(const struct xt_tgchk_param *par)
63{
64 const struct ipt_TTL_info *info = par->targinfo;
65
66 if (info->mode > IPT_TTL_MAXMODE) {
67 printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
68 info->mode);
69 return false;
70 }
71 if (info->mode != IPT_TTL_SET && info->ttl == 0)
72 return false;
73 return true;
74}
75
76static struct xt_target ttl_tg_reg __read_mostly = {
77 .name = "TTL",
78 .family = NFPROTO_IPV4,
79 .target = ttl_tg,
80 .targetsize = sizeof(struct ipt_TTL_info),
81 .table = "mangle",
82 .checkentry = ttl_tg_check,
83 .me = THIS_MODULE,
84};
85
86static int __init ttl_tg_init(void)
87{
88 return xt_register_target(&ttl_tg_reg);
89}
90
91static void __exit ttl_tg_exit(void)
92{
93 xt_unregister_target(&ttl_tg_reg);
94}
95
96module_init(ttl_tg_init);
97module_exit(ttl_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 18a2826b57c6..d32cc4bb328a 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -379,7 +379,7 @@ static struct xt_target ulog_tg_reg __read_mostly = {
379 .me = THIS_MODULE, 379 .me = THIS_MODULE,
380}; 380};
381 381
382static struct nf_logger ipt_ulog_logger = { 382static struct nf_logger ipt_ulog_logger __read_mostly = {
383 .name = "ipt_ULOG", 383 .name = "ipt_ULOG",
384 .logfn = ipt_logfn, 384 .logfn = ipt_logfn,
385 .me = THIS_MODULE, 385 .me = THIS_MODULE,
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
deleted file mode 100644
index 297f1cbf4ff5..000000000000
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ /dev/null
@@ -1,63 +0,0 @@
1/* IP tables module for matching the value of the TTL
2 *
3 * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/ip.h>
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/netfilter_ipv4/ipt_ttl.h>
15#include <linux/netfilter/x_tables.h>
16
17MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
18MODULE_DESCRIPTION("Xtables: IPv4 TTL field match");
19MODULE_LICENSE("GPL");
20
21static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par)
22{
23 const struct ipt_ttl_info *info = par->matchinfo;
24 const u8 ttl = ip_hdr(skb)->ttl;
25
26 switch (info->mode) {
27 case IPT_TTL_EQ:
28 return ttl == info->ttl;
29 case IPT_TTL_NE:
30 return ttl != info->ttl;
31 case IPT_TTL_LT:
32 return ttl < info->ttl;
33 case IPT_TTL_GT:
34 return ttl > info->ttl;
35 default:
36 printk(KERN_WARNING "ipt_ttl: unknown mode %d\n",
37 info->mode);
38 return false;
39 }
40
41 return false;
42}
43
44static struct xt_match ttl_mt_reg __read_mostly = {
45 .name = "ttl",
46 .family = NFPROTO_IPV4,
47 .match = ttl_mt,
48 .matchsize = sizeof(struct ipt_ttl_info),
49 .me = THIS_MODULE,
50};
51
52static int __init ttl_mt_init(void)
53{
54 return xt_register_match(&ttl_mt_reg);
55}
56
57static void __exit ttl_mt_exit(void)
58{
59 xt_unregister_match(&ttl_mt_reg);
60}
61
62module_init(ttl_mt_init);
63module_exit(ttl_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 52cb6939d093..c30a969724f8 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -56,7 +56,6 @@ static struct
56static struct xt_table packet_filter = { 56static struct xt_table packet_filter = {
57 .name = "filter", 57 .name = "filter",
58 .valid_hooks = FILTER_VALID_HOOKS, 58 .valid_hooks = FILTER_VALID_HOOKS,
59 .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
60 .me = THIS_MODULE, 59 .me = THIS_MODULE,
61 .af = AF_INET, 60 .af = AF_INET,
62}; 61};
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 3929d20b9e45..4087614d9519 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -67,7 +67,6 @@ static struct
67static struct xt_table packet_mangler = { 67static struct xt_table packet_mangler = {
68 .name = "mangle", 68 .name = "mangle",
69 .valid_hooks = MANGLE_VALID_HOOKS, 69 .valid_hooks = MANGLE_VALID_HOOKS,
70 .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock),
71 .me = THIS_MODULE, 70 .me = THIS_MODULE,
72 .af = AF_INET, 71 .af = AF_INET,
73}; 72};
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 7f65d18333e3..e5356da1fb54 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -39,7 +39,6 @@ static struct
39static struct xt_table packet_raw = { 39static struct xt_table packet_raw = {
40 .name = "raw", 40 .name = "raw",
41 .valid_hooks = RAW_VALID_HOOKS, 41 .valid_hooks = RAW_VALID_HOOKS,
42 .lock = __RW_LOCK_UNLOCKED(packet_raw.lock),
43 .me = THIS_MODULE, 42 .me = THIS_MODULE,
44 .af = AF_INET, 43 .af = AF_INET,
45}; 44};
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index a52a35f4a584..29ab630f240a 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -60,7 +60,6 @@ static struct
60static struct xt_table security_table = { 60static struct xt_table security_table = {
61 .name = "security", 61 .name = "security",
62 .valid_hooks = SECURITY_VALID_HOOKS, 62 .valid_hooks = SECURITY_VALID_HOOKS,
63 .lock = __RW_LOCK_UNLOCKED(security_table.lock),
64 .me = THIS_MODULE, 63 .me = THIS_MODULE,
65 .af = AF_INET, 64 .af = AF_INET,
66}; 65};
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 4beb04fac588..8b681f24e271 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -120,8 +120,10 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
120 typeof(nf_nat_seq_adjust_hook) seq_adjust; 120 typeof(nf_nat_seq_adjust_hook) seq_adjust;
121 121
122 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); 122 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
123 if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) 123 if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) {
124 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
124 return NF_DROP; 125 return NF_DROP;
126 }
125 } 127 }
126out: 128out:
127 /* We've seen it coming out the other side: confirm it */ 129 /* We've seen it coming out the other side: confirm it */
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index a7eb04719044..6348a793936e 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -61,7 +61,6 @@ static struct
61static struct xt_table nat_table = { 61static struct xt_table nat_table = {
62 .name = "nat", 62 .name = "nat",
63 .valid_hooks = NAT_VALID_HOOKS, 63 .valid_hooks = NAT_VALID_HOOKS,
64 .lock = __RW_LOCK_UNLOCKED(nat_table.lock),
65 .me = THIS_MODULE, 64 .me = THIS_MODULE,
66 .af = AF_INET, 65 .af = AF_INET,
67}; 66};
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 182f845de92f..d9521f6f9ed0 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1292,7 +1292,7 @@ static struct nf_conntrack_helper snmp_helper __read_mostly = {
1292 .expect_policy = &snmp_exp_policy, 1292 .expect_policy = &snmp_exp_policy,
1293 .name = "snmp", 1293 .name = "snmp",
1294 .tuple.src.l3num = AF_INET, 1294 .tuple.src.l3num = AF_INET,
1295 .tuple.src.u.udp.port = __constant_htons(SNMP_PORT), 1295 .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
1296 .tuple.dst.protonum = IPPROTO_UDP, 1296 .tuple.dst.protonum = IPPROTO_UDP,
1297}; 1297};
1298 1298
@@ -1302,7 +1302,7 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
1302 .expect_policy = &snmp_exp_policy, 1302 .expect_policy = &snmp_exp_policy,
1303 .name = "snmp_trap", 1303 .name = "snmp_trap",
1304 .tuple.src.l3num = AF_INET, 1304 .tuple.src.l3num = AF_INET,
1305 .tuple.src.u.udp.port = __constant_htons(SNMP_TRAP_PORT), 1305 .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT),
1306 .tuple.dst.protonum = IPPROTO_UDP, 1306 .tuple.dst.protonum = IPPROTO_UDP,
1307}; 1307};
1308 1308
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index eb62e58bff79..cf0cdeeb1db0 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,8 +54,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
54 int orphans, sockets; 54 int orphans, sockets;
55 55
56 local_bh_disable(); 56 local_bh_disable();
57 orphans = percpu_counter_sum_positive(&tcp_orphan_count), 57 orphans = percpu_counter_sum_positive(&tcp_orphan_count);
58 sockets = percpu_counter_sum_positive(&tcp_sockets_allocated), 58 sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
59 local_bh_enable(); 59 local_bh_enable();
60 60
61 socket_seq_show(seq); 61 socket_seq_show(seq);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index dff8bc4e0fac..f774651f0a47 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -493,6 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
493 493
494 ipc.addr = inet->saddr; 494 ipc.addr = inet->saddr;
495 ipc.opt = NULL; 495 ipc.opt = NULL;
496 ipc.shtx.flags = 0;
496 ipc.oif = sk->sk_bound_dev_if; 497 ipc.oif = sk->sk_bound_dev_if;
497 498
498 if (msg->msg_controllen) { 499 if (msg->msg_controllen) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 97f71153584f..5caee609be06 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -151,7 +151,7 @@ static void rt_emergency_hash_rebuild(struct net *net);
151 151
152static struct dst_ops ipv4_dst_ops = { 152static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET, 153 .family = AF_INET,
154 .protocol = __constant_htons(ETH_P_IP), 154 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect, 155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check, 156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy, 157 .destroy = ipv4_dst_destroy,
@@ -2696,7 +2696,7 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2696 2696
2697static struct dst_ops ipv4_dst_blackhole_ops = { 2697static struct dst_ops ipv4_dst_blackhole_ops = {
2698 .family = AF_INET, 2698 .family = AF_INET,
2699 .protocol = __constant_htons(ETH_P_IP), 2699 .protocol = cpu_to_be16(ETH_P_IP),
2700 .destroy = ipv4_dst_destroy, 2700 .destroy = ipv4_dst_destroy,
2701 .check = ipv4_dst_check, 2701 .check = ipv4_dst_check,
2702 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2702 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
@@ -2779,7 +2779,8 @@ int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2779 return ip_route_output_flow(net, rp, flp, NULL, 0); 2779 return ip_route_output_flow(net, rp, flp, NULL, 0);
2780} 2780}
2781 2781
2782static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 2782static int rt_fill_info(struct net *net,
2783 struct sk_buff *skb, u32 pid, u32 seq, int event,
2783 int nowait, unsigned int flags) 2784 int nowait, unsigned int flags)
2784{ 2785{
2785 struct rtable *rt = skb->rtable; 2786 struct rtable *rt = skb->rtable;
@@ -2844,8 +2845,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2844 __be32 dst = rt->rt_dst; 2845 __be32 dst = rt->rt_dst;
2845 2846
2846 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2847 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2847 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) { 2848 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2848 int err = ipmr_get_route(skb, r, nowait); 2849 int err = ipmr_get_route(net, skb, r, nowait);
2849 if (err <= 0) { 2850 if (err <= 0) {
2850 if (!nowait) { 2851 if (!nowait) {
2851 if (err == 0) 2852 if (err == 0)
@@ -2950,7 +2951,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2950 if (rtm->rtm_flags & RTM_F_NOTIFY) 2951 if (rtm->rtm_flags & RTM_F_NOTIFY)
2951 rt->rt_flags |= RTCF_NOTIFY; 2952 rt->rt_flags |= RTCF_NOTIFY;
2952 2953
2953 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2954 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2954 RTM_NEWROUTE, 0, 0); 2955 RTM_NEWROUTE, 0, 0);
2955 if (err <= 0) 2956 if (err <= 0)
2956 goto errout_free; 2957 goto errout_free;
@@ -2988,7 +2989,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2988 if (rt_is_expired(rt)) 2989 if (rt_is_expired(rt))
2989 continue; 2990 continue;
2990 skb->dst = dst_clone(&rt->u.dst); 2991 skb->dst = dst_clone(&rt->u.dst);
2991 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 2992 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2992 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 2993 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2993 1, NLM_F_MULTI) <= 0) { 2994 1, NLM_F_MULTI) <= 0) {
2994 dst_release(xchg(&skb->dst, NULL)); 2995 dst_release(xchg(&skb->dst, NULL));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 76b148bcb0dc..2451aeb5ac23 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -661,6 +661,47 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
661 return NULL; 661 return NULL;
662} 662}
663 663
664static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
665 int large_allowed)
666{
667 struct tcp_sock *tp = tcp_sk(sk);
668 u32 xmit_size_goal, old_size_goal;
669
670 xmit_size_goal = mss_now;
671
672 if (large_allowed && sk_can_gso(sk)) {
673 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
674 inet_csk(sk)->icsk_af_ops->net_header_len -
675 inet_csk(sk)->icsk_ext_hdr_len -
676 tp->tcp_header_len);
677
678 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
679
680 /* We try hard to avoid divides here */
681 old_size_goal = tp->xmit_size_goal_segs * mss_now;
682
683 if (likely(old_size_goal <= xmit_size_goal &&
684 old_size_goal + mss_now > xmit_size_goal)) {
685 xmit_size_goal = old_size_goal;
686 } else {
687 tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
688 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
689 }
690 }
691
692 return max(xmit_size_goal, mss_now);
693}
694
695static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
696{
697 int mss_now;
698
699 mss_now = tcp_current_mss(sk);
700 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
701
702 return mss_now;
703}
704
664static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, 705static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
665 size_t psize, int flags) 706 size_t psize, int flags)
666{ 707{
@@ -677,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
677 718
678 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 719 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
679 720
680 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 721 mss_now = tcp_send_mss(sk, &size_goal, flags);
681 size_goal = tp->xmit_size_goal;
682 copied = 0; 722 copied = 0;
683 723
684 err = -EPIPE; 724 err = -EPIPE;
685 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 725 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
686 goto do_error; 726 goto out_err;
687 727
688 while (psize > 0) { 728 while (psize > 0) {
689 struct sk_buff *skb = tcp_write_queue_tail(sk); 729 struct sk_buff *skb = tcp_write_queue_tail(sk);
@@ -761,8 +801,7 @@ wait_for_memory:
761 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 801 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
762 goto do_error; 802 goto do_error;
763 803
764 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 804 mss_now = tcp_send_mss(sk, &size_goal, flags);
765 size_goal = tp->xmit_size_goal;
766 } 805 }
767 806
768out: 807out:
@@ -844,8 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
844 /* This should be in poll */ 883 /* This should be in poll */
845 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 884 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
846 885
847 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 886 mss_now = tcp_send_mss(sk, &size_goal, flags);
848 size_goal = tp->xmit_size_goal;
849 887
850 /* Ok commence sending. */ 888 /* Ok commence sending. */
851 iovlen = msg->msg_iovlen; 889 iovlen = msg->msg_iovlen;
@@ -854,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
854 892
855 err = -EPIPE; 893 err = -EPIPE;
856 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 894 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
857 goto do_error; 895 goto out_err;
858 896
859 while (--iovlen >= 0) { 897 while (--iovlen >= 0) {
860 int seglen = iov->iov_len; 898 int seglen = iov->iov_len;
@@ -1007,8 +1045,7 @@ wait_for_memory:
1007 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 1045 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1008 goto do_error; 1046 goto do_error;
1009 1047
1010 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 1048 mss_now = tcp_send_mss(sk, &size_goal, flags);
1011 size_goal = tp->xmit_size_goal;
1012 } 1049 }
1013 } 1050 }
1014 1051
@@ -1045,8 +1082,7 @@ out_err:
1045 */ 1082 */
1046 1083
1047static int tcp_recv_urg(struct sock *sk, long timeo, 1084static int tcp_recv_urg(struct sock *sk, long timeo,
1048 struct msghdr *msg, int len, int flags, 1085 struct msghdr *msg, int len, int flags)
1049 int *addr_len)
1050{ 1086{
1051 struct tcp_sock *tp = tcp_sk(sk); 1087 struct tcp_sock *tp = tcp_sk(sk);
1052 1088
@@ -1661,7 +1697,7 @@ out:
1661 return err; 1697 return err;
1662 1698
1663recv_urg: 1699recv_urg:
1664 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); 1700 err = tcp_recv_urg(sk, timeo, msg, len, flags);
1665 goto out; 1701 goto out;
1666} 1702}
1667 1703
@@ -2478,23 +2514,23 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2478 struct tcphdr *th2; 2514 struct tcphdr *th2;
2479 unsigned int thlen; 2515 unsigned int thlen;
2480 unsigned int flags; 2516 unsigned int flags;
2481 unsigned int total;
2482 unsigned int mss = 1; 2517 unsigned int mss = 1;
2483 int flush = 1; 2518 int flush = 1;
2519 int i;
2484 2520
2485 if (!pskb_may_pull(skb, sizeof(*th))) 2521 th = skb_gro_header(skb, sizeof(*th));
2522 if (unlikely(!th))
2486 goto out; 2523 goto out;
2487 2524
2488 th = tcp_hdr(skb);
2489 thlen = th->doff * 4; 2525 thlen = th->doff * 4;
2490 if (thlen < sizeof(*th)) 2526 if (thlen < sizeof(*th))
2491 goto out; 2527 goto out;
2492 2528
2493 if (!pskb_may_pull(skb, thlen)) 2529 th = skb_gro_header(skb, thlen);
2530 if (unlikely(!th))
2494 goto out; 2531 goto out;
2495 2532
2496 th = tcp_hdr(skb); 2533 skb_gro_pull(skb, thlen);
2497 __skb_pull(skb, thlen);
2498 2534
2499 flags = tcp_flag_word(th); 2535 flags = tcp_flag_word(th);
2500 2536
@@ -2504,7 +2540,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2504 2540
2505 th2 = tcp_hdr(p); 2541 th2 = tcp_hdr(p);
2506 2542
2507 if (th->source != th2->source || th->dest != th2->dest) { 2543 if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) {
2508 NAPI_GRO_CB(p)->same_flow = 0; 2544 NAPI_GRO_CB(p)->same_flow = 0;
2509 continue; 2545 continue;
2510 } 2546 }
@@ -2519,14 +2555,15 @@ found:
2519 flush |= flags & TCP_FLAG_CWR; 2555 flush |= flags & TCP_FLAG_CWR;
2520 flush |= (flags ^ tcp_flag_word(th2)) & 2556 flush |= (flags ^ tcp_flag_word(th2)) &
2521 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); 2557 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
2522 flush |= th->ack_seq != th2->ack_seq || th->window != th2->window; 2558 flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window);
2523 flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th)); 2559 for (i = sizeof(*th); !flush && i < thlen; i += 4)
2560 flush |= *(u32 *)((u8 *)th + i) ^
2561 *(u32 *)((u8 *)th2 + i);
2524 2562
2525 total = p->len;
2526 mss = skb_shinfo(p)->gso_size; 2563 mss = skb_shinfo(p)->gso_size;
2527 2564
2528 flush |= skb->len > mss || skb->len <= 0; 2565 flush |= (skb_gro_len(skb) > mss) | !skb_gro_len(skb);
2529 flush |= ntohl(th2->seq) + total != ntohl(th->seq); 2566 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
2530 2567
2531 if (flush || skb_gro_receive(head, skb)) { 2568 if (flush || skb_gro_receive(head, skb)) {
2532 mss = 1; 2569 mss = 1;
@@ -2538,7 +2575,7 @@ found:
2538 tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); 2575 tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
2539 2576
2540out_check_final: 2577out_check_final:
2541 flush = skb->len < mss; 2578 flush = skb_gro_len(skb) < mss;
2542 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | 2579 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
2543 TCP_FLAG_SYN | TCP_FLAG_FIN); 2580 TCP_FLAG_SYN | TCP_FLAG_FIN);
2544 2581
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 7eb7636db0d0..3b53fd1af23f 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -149,16 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
149 tcp_slow_start(tp); 149 tcp_slow_start(tp);
150 else { 150 else {
151 bictcp_update(ca, tp->snd_cwnd); 151 bictcp_update(ca, tp->snd_cwnd);
152 152 tcp_cong_avoid_ai(tp, ca->cnt);
153 /* In dangerous area, increase slowly.
154 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
155 */
156 if (tp->snd_cwnd_cnt >= ca->cnt) {
157 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
158 tp->snd_cwnd++;
159 tp->snd_cwnd_cnt = 0;
160 } else
161 tp->snd_cwnd_cnt++;
162 } 153 }
163 154
164} 155}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4ec5b4e97c4e..e92beb9e55e0 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -336,6 +336,19 @@ void tcp_slow_start(struct tcp_sock *tp)
336} 336}
337EXPORT_SYMBOL_GPL(tcp_slow_start); 337EXPORT_SYMBOL_GPL(tcp_slow_start);
338 338
339/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
340void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
341{
342 if (tp->snd_cwnd_cnt >= w) {
343 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
344 tp->snd_cwnd++;
345 tp->snd_cwnd_cnt = 0;
346 } else {
347 tp->snd_cwnd_cnt++;
348 }
349}
350EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
351
339/* 352/*
340 * TCP Reno congestion control 353 * TCP Reno congestion control
341 * This is special case used for fallback as well. 354 * This is special case used for fallback as well.
@@ -365,13 +378,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
365 tp->snd_cwnd++; 378 tp->snd_cwnd++;
366 } 379 }
367 } else { 380 } else {
368 /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ 381 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
369 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
370 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
371 tp->snd_cwnd++;
372 tp->snd_cwnd_cnt = 0;
373 } else
374 tp->snd_cwnd_cnt++;
375 } 382 }
376} 383}
377EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 384EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ee467ec40c4f..71d5f2f29fa6 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -294,16 +294,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
294 tcp_slow_start(tp); 294 tcp_slow_start(tp);
295 } else { 295 } else {
296 bictcp_update(ca, tp->snd_cwnd); 296 bictcp_update(ca, tp->snd_cwnd);
297 297 tcp_cong_avoid_ai(tp, ca->cnt);
298 /* In dangerous area, increase slowly.
299 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
300 */
301 if (tp->snd_cwnd_cnt >= ca->cnt) {
302 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
303 tp->snd_cwnd++;
304 tp->snd_cwnd_cnt = 0;
305 } else
306 tp->snd_cwnd_cnt++;
307 } 298 }
308 299
309} 300}
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 937549b8a921..26d5c7fc7de5 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -115,8 +115,7 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
115 return; 115 return;
116 116
117 /* achieved throughput calculations */ 117 /* achieved throughput calculations */
118 if (icsk->icsk_ca_state != TCP_CA_Open && 118 if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
119 icsk->icsk_ca_state != TCP_CA_Disorder) {
120 ca->packetcount = 0; 119 ca->packetcount = 0;
121 ca->lasttime = now; 120 ca->lasttime = now;
122 return; 121 return;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c28976a7e596..2bc8e27a163d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -64,6 +64,7 @@
64#include <linux/mm.h> 64#include <linux/mm.h>
65#include <linux/module.h> 65#include <linux/module.h>
66#include <linux/sysctl.h> 66#include <linux/sysctl.h>
67#include <linux/kernel.h>
67#include <net/dst.h> 68#include <net/dst.h>
68#include <net/tcp.h> 69#include <net/tcp.h>
69#include <net/inet_common.h> 70#include <net/inet_common.h>
@@ -1178,10 +1179,18 @@ static void tcp_mark_lost_retrans(struct sock *sk)
1178 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) 1179 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
1179 continue; 1180 continue;
1180 1181
1181 if (after(received_upto, ack_seq) && 1182 /* TODO: We would like to get rid of tcp_is_fack(tp) only
1182 (tcp_is_fack(tp) || 1183 * constraint here (see above) but figuring out that at
1183 !before(received_upto, 1184 * least tp->reordering SACK blocks reside between ack_seq
1184 ack_seq + tp->reordering * tp->mss_cache))) { 1185 * and received_upto is not easy task to do cheaply with
1186 * the available datastructures.
1187 *
1188 * Whether FACK should check here for tp->reordering segs
1189 * in-between one could argue for either way (it would be
1190 * rather simple to implement as we could count fack_count
1191 * during the walk and do tp->fackets_out - fack_count).
1192 */
1193 if (after(received_upto, ack_seq)) {
1185 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1194 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1186 tp->retrans_out -= tcp_skb_pcount(skb); 1195 tp->retrans_out -= tcp_skb_pcount(skb);
1187 1196
@@ -1794,11 +1803,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1794 for (i = used_sacks - 1; i > 0; i--) { 1803 for (i = used_sacks - 1; i > 0; i--) {
1795 for (j = 0; j < i; j++) { 1804 for (j = 0; j < i; j++) {
1796 if (after(sp[j].start_seq, sp[j + 1].start_seq)) { 1805 if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
1797 struct tcp_sack_block tmp; 1806 swap(sp[j], sp[j + 1]);
1798
1799 tmp = sp[j];
1800 sp[j] = sp[j + 1];
1801 sp[j + 1] = tmp;
1802 1807
1803 /* Track where the first SACK block goes to */ 1808 /* Track where the first SACK block goes to */
1804 if (j == first_sack_index) 1809 if (j == first_sack_index)
@@ -2453,6 +2458,44 @@ static int tcp_time_to_recover(struct sock *sk)
2453 return 0; 2458 return 0;
2454} 2459}
2455 2460
2461/* New heuristics: it is possible only after we switched to restart timer
2462 * each time when something is ACKed. Hence, we can detect timed out packets
2463 * during fast retransmit without falling to slow start.
2464 *
2465 * Usefulness of this as is very questionable, since we should know which of
2466 * the segments is the next to timeout which is relatively expensive to find
2467 * in general case unless we add some data structure just for that. The
2468 * current approach certainly won't find the right one too often and when it
2469 * finally does find _something_ it usually marks large part of the window
2470 * right away (because a retransmission with a larger timestamp blocks the
2471 * loop from advancing). -ij
2472 */
2473static void tcp_timeout_skbs(struct sock *sk)
2474{
2475 struct tcp_sock *tp = tcp_sk(sk);
2476 struct sk_buff *skb;
2477
2478 if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
2479 return;
2480
2481 skb = tp->scoreboard_skb_hint;
2482 if (tp->scoreboard_skb_hint == NULL)
2483 skb = tcp_write_queue_head(sk);
2484
2485 tcp_for_write_queue_from(skb, sk) {
2486 if (skb == tcp_send_head(sk))
2487 break;
2488 if (!tcp_skb_timedout(sk, skb))
2489 break;
2490
2491 tcp_skb_mark_lost(tp, skb);
2492 }
2493
2494 tp->scoreboard_skb_hint = skb;
2495
2496 tcp_verify_left_out(tp);
2497}
2498
2456/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2499/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2457 * is against sacked "cnt", otherwise it's against facked "cnt" 2500 * is against sacked "cnt", otherwise it's against facked "cnt"
2458 */ 2501 */
@@ -2525,30 +2568,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2525 tcp_mark_head_lost(sk, sacked_upto); 2568 tcp_mark_head_lost(sk, sacked_upto);
2526 } 2569 }
2527 2570
2528 /* New heuristics: it is possible only after we switched 2571 tcp_timeout_skbs(sk);
2529 * to restart timer each time when something is ACKed.
2530 * Hence, we can detect timed out packets during fast
2531 * retransmit without falling to slow start.
2532 */
2533 if (tcp_is_fack(tp) && tcp_head_timedout(sk)) {
2534 struct sk_buff *skb;
2535
2536 skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
2537 : tcp_write_queue_head(sk);
2538
2539 tcp_for_write_queue_from(skb, sk) {
2540 if (skb == tcp_send_head(sk))
2541 break;
2542 if (!tcp_skb_timedout(sk, skb))
2543 break;
2544
2545 tcp_skb_mark_lost(tp, skb);
2546 }
2547
2548 tp->scoreboard_skb_hint = skb;
2549
2550 tcp_verify_left_out(tp);
2551 }
2552} 2572}
2553 2573
2554/* CWND moderation, preventing bursts due to too big ACKs 2574/* CWND moderation, preventing bursts due to too big ACKs
@@ -2813,7 +2833,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
2813 icsk->icsk_mtup.probe_size = 0; 2833 icsk->icsk_mtup.probe_size = 0;
2814} 2834}
2815 2835
2816static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) 2836static void tcp_mtup_probe_success(struct sock *sk)
2817{ 2837{
2818 struct tcp_sock *tp = tcp_sk(sk); 2838 struct tcp_sock *tp = tcp_sk(sk);
2819 struct inet_connection_sock *icsk = inet_csk(sk); 2839 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2841,7 +2861,7 @@ void tcp_simple_retransmit(struct sock *sk)
2841 const struct inet_connection_sock *icsk = inet_csk(sk); 2861 const struct inet_connection_sock *icsk = inet_csk(sk);
2842 struct tcp_sock *tp = tcp_sk(sk); 2862 struct tcp_sock *tp = tcp_sk(sk);
2843 struct sk_buff *skb; 2863 struct sk_buff *skb;
2844 unsigned int mss = tcp_current_mss(sk, 0); 2864 unsigned int mss = tcp_current_mss(sk);
2845 u32 prior_lost = tp->lost_out; 2865 u32 prior_lost = tp->lost_out;
2846 2866
2847 tcp_for_write_queue(skb, sk) { 2867 tcp_for_write_queue(skb, sk) {
@@ -3178,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3178 3198
3179 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { 3199 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3180 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 3200 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3181 u32 end_seq;
3182 u32 acked_pcount; 3201 u32 acked_pcount;
3183 u8 sacked = scb->sacked; 3202 u8 sacked = scb->sacked;
3184 3203
@@ -3193,16 +3212,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3193 break; 3212 break;
3194 3213
3195 fully_acked = 0; 3214 fully_acked = 0;
3196 end_seq = tp->snd_una;
3197 } else { 3215 } else {
3198 acked_pcount = tcp_skb_pcount(skb); 3216 acked_pcount = tcp_skb_pcount(skb);
3199 end_seq = scb->end_seq;
3200 }
3201
3202 /* MTU probing checks */
3203 if (fully_acked && icsk->icsk_mtup.probe_size &&
3204 !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
3205 tcp_mtup_probe_success(sk, skb);
3206 } 3217 }
3207 3218
3208 if (sacked & TCPCB_RETRANS) { 3219 if (sacked & TCPCB_RETRANS) {
@@ -3267,24 +3278,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3267 const struct tcp_congestion_ops *ca_ops 3278 const struct tcp_congestion_ops *ca_ops
3268 = inet_csk(sk)->icsk_ca_ops; 3279 = inet_csk(sk)->icsk_ca_ops;
3269 3280
3281 if (unlikely(icsk->icsk_mtup.probe_size &&
3282 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3283 tcp_mtup_probe_success(sk);
3284 }
3285
3270 tcp_ack_update_rtt(sk, flag, seq_rtt); 3286 tcp_ack_update_rtt(sk, flag, seq_rtt);
3271 tcp_rearm_rto(sk); 3287 tcp_rearm_rto(sk);
3272 3288
3273 if (tcp_is_reno(tp)) { 3289 if (tcp_is_reno(tp)) {
3274 tcp_remove_reno_sacks(sk, pkts_acked); 3290 tcp_remove_reno_sacks(sk, pkts_acked);
3275 } else { 3291 } else {
3292 int delta;
3293
3276 /* Non-retransmitted hole got filled? That's reordering */ 3294 /* Non-retransmitted hole got filled? That's reordering */
3277 if (reord < prior_fackets) 3295 if (reord < prior_fackets)
3278 tcp_update_reordering(sk, tp->fackets_out - reord, 0); 3296 tcp_update_reordering(sk, tp->fackets_out - reord, 0);
3279 3297
3280 /* No need to care for underflows here because 3298 delta = tcp_is_fack(tp) ? pkts_acked :
3281 * the lost_skb_hint gets NULLed if we're past it 3299 prior_sacked - tp->sacked_out;
3282 * (or something non-trivial happened) 3300 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3283 */
3284 if (tcp_is_fack(tp))
3285 tp->lost_cnt_hint -= pkts_acked;
3286 else
3287 tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
3288 } 3301 }
3289 3302
3290 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3303 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
@@ -3396,7 +3409,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
3396 3409
3397 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { 3410 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3398 flag |= FLAG_WIN_UPDATE; 3411 flag |= FLAG_WIN_UPDATE;
3399 tcp_update_wl(tp, ack, ack_seq); 3412 tcp_update_wl(tp, ack_seq);
3400 3413
3401 if (tp->snd_wnd != nwin) { 3414 if (tp->snd_wnd != nwin) {
3402 tp->snd_wnd = nwin; 3415 tp->snd_wnd = nwin;
@@ -3572,15 +3585,18 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3572 int prior_packets; 3585 int prior_packets;
3573 int frto_cwnd = 0; 3586 int frto_cwnd = 0;
3574 3587
3575 /* If the ack is newer than sent or older than previous acks 3588 /* If the ack is older than previous acks
3576 * then we can probably ignore it. 3589 * then we can probably ignore it.
3577 */ 3590 */
3578 if (after(ack, tp->snd_nxt))
3579 goto uninteresting_ack;
3580
3581 if (before(ack, prior_snd_una)) 3591 if (before(ack, prior_snd_una))
3582 goto old_ack; 3592 goto old_ack;
3583 3593
3594 /* If the ack includes data we haven't sent yet, discard
3595 * this segment (RFC793 Section 3.9).
3596 */
3597 if (after(ack, tp->snd_nxt))
3598 goto invalid_ack;
3599
3584 if (after(ack, prior_snd_una)) 3600 if (after(ack, prior_snd_una))
3585 flag |= FLAG_SND_UNA_ADVANCED; 3601 flag |= FLAG_SND_UNA_ADVANCED;
3586 3602
@@ -3601,7 +3617,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3601 * No more checks are required. 3617 * No more checks are required.
3602 * Note, we use the fact that SND.UNA>=SND.WL2. 3618 * Note, we use the fact that SND.UNA>=SND.WL2.
3603 */ 3619 */
3604 tcp_update_wl(tp, ack, ack_seq); 3620 tcp_update_wl(tp, ack_seq);
3605 tp->snd_una = ack; 3621 tp->snd_una = ack;
3606 flag |= FLAG_WIN_UPDATE; 3622 flag |= FLAG_WIN_UPDATE;
3607 3623
@@ -3670,6 +3686,10 @@ no_queue:
3670 tcp_ack_probe(sk); 3686 tcp_ack_probe(sk);
3671 return 1; 3687 return 1;
3672 3688
3689invalid_ack:
3690 SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3691 return -1;
3692
3673old_ack: 3693old_ack:
3674 if (TCP_SKB_CB(skb)->sacked) { 3694 if (TCP_SKB_CB(skb)->sacked) {
3675 tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3695 tcp_sacktag_write_queue(sk, skb, prior_snd_una);
@@ -3677,8 +3697,7 @@ old_ack:
3677 tcp_try_keep_open(sk); 3697 tcp_try_keep_open(sk);
3678 } 3698 }
3679 3699
3680uninteresting_ack: 3700 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3681 SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3682 return 0; 3701 return 0;
3683} 3702}
3684 3703
@@ -3866,8 +3885,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3866 * Not only, also it occurs for expired timestamps. 3885 * Not only, also it occurs for expired timestamps.
3867 */ 3886 */
3868 3887
3869 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || 3888 if (tcp_paws_check(&tp->rx_opt, 0))
3870 get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
3871 tcp_store_ts_recent(tp); 3889 tcp_store_ts_recent(tp);
3872 } 3890 }
3873} 3891}
@@ -3919,9 +3937,9 @@ static inline int tcp_paws_discard(const struct sock *sk,
3919 const struct sk_buff *skb) 3937 const struct sk_buff *skb)
3920{ 3938{
3921 const struct tcp_sock *tp = tcp_sk(sk); 3939 const struct tcp_sock *tp = tcp_sk(sk);
3922 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && 3940
3923 get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && 3941 return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
3924 !tcp_disordered_ack(sk, skb)); 3942 !tcp_disordered_ack(sk, skb);
3925} 3943}
3926 3944
3927/* Check segment sequence number for validity. 3945/* Check segment sequence number for validity.
@@ -4079,7 +4097,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4079 tp->rx_opt.dsack = 1; 4097 tp->rx_opt.dsack = 1;
4080 tp->duplicate_sack[0].start_seq = seq; 4098 tp->duplicate_sack[0].start_seq = seq;
4081 tp->duplicate_sack[0].end_seq = end_seq; 4099 tp->duplicate_sack[0].end_seq = end_seq;
4082 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1;
4083 } 4100 }
4084} 4101}
4085 4102
@@ -4134,8 +4151,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4134 * Decrease num_sacks. 4151 * Decrease num_sacks.
4135 */ 4152 */
4136 tp->rx_opt.num_sacks--; 4153 tp->rx_opt.num_sacks--;
4137 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
4138 tp->rx_opt.dsack;
4139 for (i = this_sack; i < tp->rx_opt.num_sacks; i++) 4154 for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4140 sp[i] = sp[i + 1]; 4155 sp[i] = sp[i + 1];
4141 continue; 4156 continue;
@@ -4144,20 +4159,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4144 } 4159 }
4145} 4160}
4146 4161
4147static inline void tcp_sack_swap(struct tcp_sack_block *sack1,
4148 struct tcp_sack_block *sack2)
4149{
4150 __u32 tmp;
4151
4152 tmp = sack1->start_seq;
4153 sack1->start_seq = sack2->start_seq;
4154 sack2->start_seq = tmp;
4155
4156 tmp = sack1->end_seq;
4157 sack1->end_seq = sack2->end_seq;
4158 sack2->end_seq = tmp;
4159}
4160
4161static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) 4162static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4162{ 4163{
4163 struct tcp_sock *tp = tcp_sk(sk); 4164 struct tcp_sock *tp = tcp_sk(sk);
@@ -4172,7 +4173,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4172 if (tcp_sack_extend(sp, seq, end_seq)) { 4173 if (tcp_sack_extend(sp, seq, end_seq)) {
4173 /* Rotate this_sack to the first one. */ 4174 /* Rotate this_sack to the first one. */
4174 for (; this_sack > 0; this_sack--, sp--) 4175 for (; this_sack > 0; this_sack--, sp--)
4175 tcp_sack_swap(sp, sp - 1); 4176 swap(*sp, *(sp - 1));
4176 if (cur_sacks > 1) 4177 if (cur_sacks > 1)
4177 tcp_sack_maybe_coalesce(tp); 4178 tcp_sack_maybe_coalesce(tp);
4178 return; 4179 return;
@@ -4198,7 +4199,6 @@ new_sack:
4198 sp->start_seq = seq; 4199 sp->start_seq = seq;
4199 sp->end_seq = end_seq; 4200 sp->end_seq = end_seq;
4200 tp->rx_opt.num_sacks++; 4201 tp->rx_opt.num_sacks++;
4201 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
4202} 4202}
4203 4203
4204/* RCV.NXT advances, some SACKs should be eaten. */ 4204/* RCV.NXT advances, some SACKs should be eaten. */
@@ -4212,7 +4212,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4212 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 4212 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
4213 if (skb_queue_empty(&tp->out_of_order_queue)) { 4213 if (skb_queue_empty(&tp->out_of_order_queue)) {
4214 tp->rx_opt.num_sacks = 0; 4214 tp->rx_opt.num_sacks = 0;
4215 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
4216 return; 4215 return;
4217 } 4216 }
4218 4217
@@ -4233,11 +4232,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4233 this_sack++; 4232 this_sack++;
4234 sp++; 4233 sp++;
4235 } 4234 }
4236 if (num_sacks != tp->rx_opt.num_sacks) { 4235 tp->rx_opt.num_sacks = num_sacks;
4237 tp->rx_opt.num_sacks = num_sacks;
4238 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
4239 tp->rx_opt.dsack;
4240 }
4241} 4236}
4242 4237
4243/* This one checks to see if we can put data from the 4238/* This one checks to see if we can put data from the
@@ -4313,10 +4308,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4313 4308
4314 TCP_ECN_accept_cwr(tp, skb); 4309 TCP_ECN_accept_cwr(tp, skb);
4315 4310
4316 if (tp->rx_opt.dsack) { 4311 tp->rx_opt.dsack = 0;
4317 tp->rx_opt.dsack = 0;
4318 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
4319 }
4320 4312
4321 /* Queue data for delivery to the user. 4313 /* Queue data for delivery to the user.
4322 * Packets in sequence go to the receive queue. 4314 * Packets in sequence go to the receive queue.
@@ -4435,8 +4427,6 @@ drop:
4435 /* Initial out of order segment, build 1 SACK. */ 4427 /* Initial out of order segment, build 1 SACK. */
4436 if (tcp_is_sack(tp)) { 4428 if (tcp_is_sack(tp)) {
4437 tp->rx_opt.num_sacks = 1; 4429 tp->rx_opt.num_sacks = 1;
4438 tp->rx_opt.dsack = 0;
4439 tp->rx_opt.eff_sacks = 1;
4440 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; 4430 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4441 tp->selective_acks[0].end_seq = 4431 tp->selective_acks[0].end_seq =
4442 TCP_SKB_CB(skb)->end_seq; 4432 TCP_SKB_CB(skb)->end_seq;
@@ -5157,7 +5147,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5157 */ 5147 */
5158 5148
5159 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && 5149 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5160 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { 5150 TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5151 !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5161 int tcp_header_len = tp->tcp_header_len; 5152 int tcp_header_len = tp->tcp_header_len;
5162 5153
5163 /* Timestamp header prediction: tcp_header_len 5154 /* Timestamp header prediction: tcp_header_len
@@ -5310,8 +5301,8 @@ slow_path:
5310 return -res; 5301 return -res;
5311 5302
5312step5: 5303step5:
5313 if (th->ack) 5304 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
5314 tcp_ack(sk, skb, FLAG_SLOWPATH); 5305 goto discard;
5315 5306
5316 tcp_rcv_rtt_measure_ts(sk, skb); 5307 tcp_rcv_rtt_measure_ts(sk, skb);
5317 5308
@@ -5409,7 +5400,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5409 * never scaled. 5400 * never scaled.
5410 */ 5401 */
5411 tp->snd_wnd = ntohs(th->window); 5402 tp->snd_wnd = ntohs(th->window);
5412 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); 5403 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5413 5404
5414 if (!tp->rx_opt.wscale_ok) { 5405 if (!tp->rx_opt.wscale_ok) {
5415 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5406 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5510,7 +5501,7 @@ discard:
5510 5501
5511 /* PAWS check. */ 5502 /* PAWS check. */
5512 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && 5503 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
5513 tcp_paws_check(&tp->rx_opt, 0)) 5504 tcp_paws_reject(&tp->rx_opt, 0))
5514 goto discard_and_undo; 5505 goto discard_and_undo;
5515 5506
5516 if (th->syn) { 5507 if (th->syn) {
@@ -5648,7 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5648 5639
5649 /* step 5: check the ACK field */ 5640 /* step 5: check the ACK field */
5650 if (th->ack) { 5641 if (th->ack) {
5651 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); 5642 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
5652 5643
5653 switch (sk->sk_state) { 5644 switch (sk->sk_state) {
5654 case TCP_SYN_RECV: 5645 case TCP_SYN_RECV:
@@ -5670,8 +5661,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5670 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 5661 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5671 tp->snd_wnd = ntohs(th->window) << 5662 tp->snd_wnd = ntohs(th->window) <<
5672 tp->rx_opt.snd_wscale; 5663 tp->rx_opt.snd_wscale;
5673 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, 5664 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5674 TCP_SKB_CB(skb)->seq);
5675 5665
5676 /* tcp_ack considers this ACK as duplicate 5666 /* tcp_ack considers this ACK as duplicate
5677 * and does not calculate rtt. 5667 * and does not calculate rtt.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cf74c416831a..d0a314879d81 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1226,15 +1226,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1226 if (want_cookie && !tmp_opt.saw_tstamp) 1226 if (want_cookie && !tmp_opt.saw_tstamp)
1227 tcp_clear_options(&tmp_opt); 1227 tcp_clear_options(&tmp_opt);
1228 1228
1229 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1230 /* Some OSes (unknown ones, but I see them on web server, which
1231 * contains information interesting only for windows'
1232 * users) do not send their stamp in SYN. It is easy case.
1233 * We simply do not advertise TS support.
1234 */
1235 tmp_opt.saw_tstamp = 0;
1236 tmp_opt.tstamp_ok = 0;
1237 }
1238 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 1229 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1239 1230
1240 tcp_openreq_init(req, &tmp_opt, skb); 1231 tcp_openreq_init(req, &tmp_opt, skb);
@@ -2355,7 +2346,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2355 2346
2356 switch (skb->ip_summed) { 2347 switch (skb->ip_summed) {
2357 case CHECKSUM_COMPLETE: 2348 case CHECKSUM_COMPLETE:
2358 if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr, 2349 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2359 skb->csum)) { 2350 skb->csum)) {
2360 skb->ip_summed = CHECKSUM_UNNECESSARY; 2351 skb->ip_summed = CHECKSUM_UNNECESSARY;
2361 break; 2352 break;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f67effbb102b..43bbba7926ee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -107,7 +107,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
107 if (tmp_opt.saw_tstamp) { 107 if (tmp_opt.saw_tstamp) {
108 tmp_opt.ts_recent = tcptw->tw_ts_recent; 108 tmp_opt.ts_recent = tcptw->tw_ts_recent;
109 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 109 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
110 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 110 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
111 } 111 }
112 } 112 }
113 113
@@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
399 399
400 tcp_prequeue_init(newtp); 400 tcp_prequeue_init(newtp);
401 401
402 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); 402 tcp_init_wl(newtp, treq->rcv_isn);
403 403
404 newtp->srtt = 0; 404 newtp->srtt = 0;
405 newtp->mdev = TCP_TIMEOUT_INIT; 405 newtp->mdev = TCP_TIMEOUT_INIT;
@@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
434 newtp->rx_opt.saw_tstamp = 0; 434 newtp->rx_opt.saw_tstamp = 0;
435 435
436 newtp->rx_opt.dsack = 0; 436 newtp->rx_opt.dsack = 0;
437 newtp->rx_opt.eff_sacks = 0;
438
439 newtp->rx_opt.num_sacks = 0; 437 newtp->rx_opt.num_sacks = 0;
438
440 newtp->urg_data = 0; 439 newtp->urg_data = 0;
441 440
442 if (sock_flag(newsk, SOCK_KEEPOPEN)) 441 if (sock_flag(newsk, SOCK_KEEPOPEN))
@@ -512,7 +511,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
512 * from another data. 511 * from another data.
513 */ 512 */
514 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); 513 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
515 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 514 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
516 } 515 }
517 } 516 }
518 517
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index da2c3b8794f2..c1f259d2d33b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -441,10 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
441 *ptr++ = htonl(sp[this_sack].end_seq); 441 *ptr++ = htonl(sp[this_sack].end_seq);
442 } 442 }
443 443
444 if (tp->rx_opt.dsack) { 444 tp->rx_opt.dsack = 0;
445 tp->rx_opt.dsack = 0;
446 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
447 }
448 } 445 }
449} 446}
450 447
@@ -550,6 +547,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
550 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; 547 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
551 struct tcp_sock *tp = tcp_sk(sk); 548 struct tcp_sock *tp = tcp_sk(sk);
552 unsigned size = 0; 549 unsigned size = 0;
550 unsigned int eff_sacks;
553 551
554#ifdef CONFIG_TCP_MD5SIG 552#ifdef CONFIG_TCP_MD5SIG
555 *md5 = tp->af_specific->md5_lookup(sk, sk); 553 *md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -568,10 +566,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
568 size += TCPOLEN_TSTAMP_ALIGNED; 566 size += TCPOLEN_TSTAMP_ALIGNED;
569 } 567 }
570 568
571 if (unlikely(tp->rx_opt.eff_sacks)) { 569 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
570 if (unlikely(eff_sacks)) {
572 const unsigned remaining = MAX_TCP_OPTION_SPACE - size; 571 const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
573 opts->num_sack_blocks = 572 opts->num_sack_blocks =
574 min_t(unsigned, tp->rx_opt.eff_sacks, 573 min_t(unsigned, eff_sacks,
575 (remaining - TCPOLEN_SACK_BASE_ALIGNED) / 574 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
576 TCPOLEN_SACK_PERBLOCK); 575 TCPOLEN_SACK_PERBLOCK);
577 size += TCPOLEN_SACK_BASE_ALIGNED + 576 size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -663,10 +662,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
663 th->urg_ptr = 0; 662 th->urg_ptr = 0;
664 663
665 /* The urg_mode check is necessary during a below snd_una win probe */ 664 /* The urg_mode check is necessary during a below snd_una win probe */
666 if (unlikely(tcp_urg_mode(tp) && 665 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
667 between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { 666 if (before(tp->snd_up, tcb->seq + 0x10000)) {
668 th->urg_ptr = htons(tp->snd_up - tcb->seq); 667 th->urg_ptr = htons(tp->snd_up - tcb->seq);
669 th->urg = 1; 668 th->urg = 1;
669 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
670 th->urg_ptr = 0xFFFF;
671 th->urg = 1;
672 }
670 } 673 }
671 674
672 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 675 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
@@ -763,11 +766,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
763 struct sk_buff *buff; 766 struct sk_buff *buff;
764 int nsize, old_factor; 767 int nsize, old_factor;
765 int nlen; 768 int nlen;
766 u16 flags; 769 u8 flags;
767 770
768 BUG_ON(len > skb->len); 771 BUG_ON(len > skb->len);
769 772
770 tcp_clear_retrans_hints_partial(tp);
771 nsize = skb_headlen(skb) - len; 773 nsize = skb_headlen(skb) - len;
772 if (nsize < 0) 774 if (nsize < 0)
773 nsize = 0; 775 nsize = 0;
@@ -850,6 +852,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
850 tcp_verify_left_out(tp); 852 tcp_verify_left_out(tp);
851 } 853 }
852 tcp_adjust_fackets_out(sk, skb, diff); 854 tcp_adjust_fackets_out(sk, skb, diff);
855
856 if (tp->lost_skb_hint &&
857 before(TCP_SKB_CB(skb)->seq,
858 TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
859 (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked))
860 tp->lost_cnt_hint -= diff;
853 } 861 }
854 862
855 /* Link BUFF into the send queue. */ 863 /* Link BUFF into the send queue. */
@@ -913,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
913 * factor and mss. 921 * factor and mss.
914 */ 922 */
915 if (tcp_skb_pcount(skb) > 1) 923 if (tcp_skb_pcount(skb) > 1)
916 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1)); 924 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
917 925
918 return 0; 926 return 0;
919} 927}
@@ -974,15 +982,6 @@ void tcp_mtup_init(struct sock *sk)
974 icsk->icsk_mtup.probe_size = 0; 982 icsk->icsk_mtup.probe_size = 0;
975} 983}
976 984
977/* Bound MSS / TSO packet size with the half of the window */
978static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
979{
980 if (tp->max_window && pktsize > (tp->max_window >> 1))
981 return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
982 else
983 return pktsize;
984}
985
986/* This function synchronize snd mss to current pmtu/exthdr set. 985/* This function synchronize snd mss to current pmtu/exthdr set.
987 986
988 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts 987 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -1029,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1029/* Compute the current effective MSS, taking SACKs and IP options, 1028/* Compute the current effective MSS, taking SACKs and IP options,
1030 * and even PMTU discovery events into account. 1029 * and even PMTU discovery events into account.
1031 */ 1030 */
1032unsigned int tcp_current_mss(struct sock *sk, int large_allowed) 1031unsigned int tcp_current_mss(struct sock *sk)
1033{ 1032{
1034 struct tcp_sock *tp = tcp_sk(sk); 1033 struct tcp_sock *tp = tcp_sk(sk);
1035 struct dst_entry *dst = __sk_dst_get(sk); 1034 struct dst_entry *dst = __sk_dst_get(sk);
1036 u32 mss_now; 1035 u32 mss_now;
1037 u16 xmit_size_goal;
1038 int doing_tso = 0;
1039 unsigned header_len; 1036 unsigned header_len;
1040 struct tcp_out_options opts; 1037 struct tcp_out_options opts;
1041 struct tcp_md5sig_key *md5; 1038 struct tcp_md5sig_key *md5;
1042 1039
1043 mss_now = tp->mss_cache; 1040 mss_now = tp->mss_cache;
1044 1041
1045 if (large_allowed && sk_can_gso(sk))
1046 doing_tso = 1;
1047
1048 if (dst) { 1042 if (dst) {
1049 u32 mtu = dst_mtu(dst); 1043 u32 mtu = dst_mtu(dst);
1050 if (mtu != inet_csk(sk)->icsk_pmtu_cookie) 1044 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
@@ -1062,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
1062 mss_now -= delta; 1056 mss_now -= delta;
1063 } 1057 }
1064 1058
1065 xmit_size_goal = mss_now;
1066
1067 if (doing_tso) {
1068 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
1069 inet_csk(sk)->icsk_af_ops->net_header_len -
1070 inet_csk(sk)->icsk_ext_hdr_len -
1071 tp->tcp_header_len);
1072
1073 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
1074 xmit_size_goal -= (xmit_size_goal % mss_now);
1075 }
1076 tp->xmit_size_goal = xmit_size_goal;
1077
1078 return mss_now; 1059 return mss_now;
1079} 1060}
1080 1061
@@ -1256,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk)
1256 struct sk_buff *skb = tcp_send_head(sk); 1237 struct sk_buff *skb = tcp_send_head(sk);
1257 1238
1258 return (skb && 1239 return (skb &&
1259 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), 1240 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1260 (tcp_skb_is_last(sk, skb) ? 1241 (tcp_skb_is_last(sk, skb) ?
1261 tp->nonagle : TCP_NAGLE_PUSH))); 1242 tp->nonagle : TCP_NAGLE_PUSH)));
1262} 1243}
@@ -1273,7 +1254,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1273{ 1254{
1274 struct sk_buff *buff; 1255 struct sk_buff *buff;
1275 int nlen = skb->len - len; 1256 int nlen = skb->len - len;
1276 u16 flags; 1257 u8 flags;
1277 1258
1278 /* All of a TSO frame must be composed of paged data. */ 1259 /* All of a TSO frame must be composed of paged data. */
1279 if (skb->len != skb->data_len) 1260 if (skb->len != skb->data_len)
@@ -1352,6 +1333,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1352 if (limit >= sk->sk_gso_max_size) 1333 if (limit >= sk->sk_gso_max_size)
1353 goto send_now; 1334 goto send_now;
1354 1335
1336 /* Middle in queue won't get any more data, full sendable already? */
1337 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1338 goto send_now;
1339
1355 if (sysctl_tcp_tso_win_divisor) { 1340 if (sysctl_tcp_tso_win_divisor) {
1356 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1341 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1357 1342
@@ -1405,11 +1390,11 @@ static int tcp_mtu_probe(struct sock *sk)
1405 icsk->icsk_mtup.probe_size || 1390 icsk->icsk_mtup.probe_size ||
1406 inet_csk(sk)->icsk_ca_state != TCP_CA_Open || 1391 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1407 tp->snd_cwnd < 11 || 1392 tp->snd_cwnd < 11 ||
1408 tp->rx_opt.eff_sacks) 1393 tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1409 return -1; 1394 return -1;
1410 1395
1411 /* Very simple search strategy: just double the MSS. */ 1396 /* Very simple search strategy: just double the MSS. */
1412 mss_now = tcp_current_mss(sk, 0); 1397 mss_now = tcp_current_mss(sk);
1413 probe_size = 2 * tp->mss_cache; 1398 probe_size = 2 * tp->mss_cache;
1414 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; 1399 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1415 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { 1400 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
@@ -1754,11 +1739,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1754 struct tcp_sock *tp = tcp_sk(sk); 1739 struct tcp_sock *tp = tcp_sk(sk);
1755 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 1740 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1756 int skb_size, next_skb_size; 1741 int skb_size, next_skb_size;
1757 u16 flags;
1758 1742
1759 skb_size = skb->len; 1743 skb_size = skb->len;
1760 next_skb_size = next_skb->len; 1744 next_skb_size = next_skb->len;
1761 flags = TCP_SKB_CB(skb)->flags;
1762 1745
1763 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 1746 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
1764 1747
@@ -1778,9 +1761,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1778 /* Update sequence range on original skb. */ 1761 /* Update sequence range on original skb. */
1779 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; 1762 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
1780 1763
1781 /* Merge over control information. */ 1764 /* Merge over control information. This moves PSH/FIN etc. over */
1782 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ 1765 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
1783 TCP_SKB_CB(skb)->flags = flags;
1784 1766
1785 /* All done, get rid of second SKB and account for it so 1767 /* All done, get rid of second SKB and account for it so
1786 * packet counting does not break. 1768 * packet counting does not break.
@@ -1894,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1894 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) 1876 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1895 return -EHOSTUNREACH; /* Routing failure or similar. */ 1877 return -EHOSTUNREACH; /* Routing failure or similar. */
1896 1878
1897 cur_mss = tcp_current_mss(sk, 0); 1879 cur_mss = tcp_current_mss(sk);
1898 1880
1899 /* If receiver has shrunk his window, and skb is out of 1881 /* If receiver has shrunk his window, and skb is out of
1900 * new window, do not retransmit it. The exception is the 1882 * new window, do not retransmit it. The exception is the
@@ -1908,6 +1890,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1908 if (skb->len > cur_mss) { 1890 if (skb->len > cur_mss) {
1909 if (tcp_fragment(sk, skb, cur_mss, cur_mss)) 1891 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1910 return -ENOMEM; /* We'll try again later. */ 1892 return -ENOMEM; /* We'll try again later. */
1893 } else {
1894 tcp_init_tso_segs(sk, skb, cur_mss);
1911 } 1895 }
1912 1896
1913 tcp_retrans_try_collapse(sk, skb, cur_mss); 1897 tcp_retrans_try_collapse(sk, skb, cur_mss);
@@ -2061,7 +2045,7 @@ begin_fwd:
2061 goto begin_fwd; 2045 goto begin_fwd;
2062 2046
2063 } else if (!(sacked & TCPCB_LOST)) { 2047 } else if (!(sacked & TCPCB_LOST)) {
2064 if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) 2048 if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2065 hole = skb; 2049 hole = skb;
2066 continue; 2050 continue;
2067 2051
@@ -2100,7 +2084,7 @@ void tcp_send_fin(struct sock *sk)
2100 * unsent frames. But be careful about outgoing SACKS 2084 * unsent frames. But be careful about outgoing SACKS
2101 * and IP options. 2085 * and IP options.
2102 */ 2086 */
2103 mss_now = tcp_current_mss(sk, 1); 2087 mss_now = tcp_current_mss(sk);
2104 2088
2105 if (tcp_send_head(sk) != NULL) { 2089 if (tcp_send_head(sk) != NULL) {
2106 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; 2090 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
@@ -2325,7 +2309,7 @@ static void tcp_connect_init(struct sock *sk)
2325 sk->sk_err = 0; 2309 sk->sk_err = 0;
2326 sock_reset_flag(sk, SOCK_DONE); 2310 sock_reset_flag(sk, SOCK_DONE);
2327 tp->snd_wnd = 0; 2311 tp->snd_wnd = 0;
2328 tcp_init_wl(tp, tp->write_seq, 0); 2312 tcp_init_wl(tp, 0);
2329 tp->snd_una = tp->write_seq; 2313 tp->snd_una = tp->write_seq;
2330 tp->snd_sml = tp->write_seq; 2314 tp->snd_sml = tp->write_seq;
2331 tp->snd_up = tp->write_seq; 2315 tp->snd_up = tp->write_seq;
@@ -2512,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk)
2512 if ((skb = tcp_send_head(sk)) != NULL && 2496 if ((skb = tcp_send_head(sk)) != NULL &&
2513 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { 2497 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
2514 int err; 2498 int err;
2515 unsigned int mss = tcp_current_mss(sk, 0); 2499 unsigned int mss = tcp_current_mss(sk);
2516 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; 2500 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2517 2501
2518 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) 2502 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 25524d4e372a..59f5b5e7c566 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -165,9 +165,10 @@ static int tcpprobe_sprint(char *tbuf, int n)
165static ssize_t tcpprobe_read(struct file *file, char __user *buf, 165static ssize_t tcpprobe_read(struct file *file, char __user *buf,
166 size_t len, loff_t *ppos) 166 size_t len, loff_t *ppos)
167{ 167{
168 int error = 0, cnt = 0; 168 int error = 0;
169 size_t cnt = 0;
169 170
170 if (!buf || len < 0) 171 if (!buf)
171 return -EINVAL; 172 return -EINVAL;
172 173
173 while (cnt < len) { 174 while (cnt < len) {
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 4660b088a8ce..a76513779e2b 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -24,14 +24,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
24 24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) 25 if (tp->snd_cwnd <= tp->snd_ssthresh)
26 tcp_slow_start(tp); 26 tcp_slow_start(tp);
27 else { 27 else
28 tp->snd_cwnd_cnt++; 28 tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
31 tp->snd_cwnd++;
32 tp->snd_cwnd_cnt = 0;
33 }
34 }
35} 29}
36 30
37static u32 tcp_scalable_ssthresh(struct sock *sk) 31static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0170e914f1b0..b144a26359bc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -328,19 +328,16 @@ static void tcp_retransmit_timer(struct sock *sk)
328 if (icsk->icsk_retransmits == 0) { 328 if (icsk->icsk_retransmits == 0) {
329 int mib_idx; 329 int mib_idx;
330 330
331 if (icsk->icsk_ca_state == TCP_CA_Disorder || 331 if (icsk->icsk_ca_state == TCP_CA_Disorder) {
332 icsk->icsk_ca_state == TCP_CA_Recovery) { 332 if (tcp_is_sack(tp))
333 if (tcp_is_sack(tp)) { 333 mib_idx = LINUX_MIB_TCPSACKFAILURES;
334 if (icsk->icsk_ca_state == TCP_CA_Recovery) 334 else
335 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; 335 mib_idx = LINUX_MIB_TCPRENOFAILURES;
336 else 336 } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
337 mib_idx = LINUX_MIB_TCPSACKFAILURES; 337 if (tcp_is_sack(tp))
338 } else { 338 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
339 if (icsk->icsk_ca_state == TCP_CA_Recovery) 339 else
340 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; 340 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
341 else
342 mib_idx = LINUX_MIB_TCPRENOFAILURES;
343 }
344 } else if (icsk->icsk_ca_state == TCP_CA_Loss) { 341 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
345 mib_idx = LINUX_MIB_TCPLOSSFAILURES; 342 mib_idx = LINUX_MIB_TCPLOSSFAILURES;
346 } else { 343 } else {
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index d08b2e855c22..e9bbff746488 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,12 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
159 /* In the "non-congestive state", increase cwnd 159 /* In the "non-congestive state", increase cwnd
160 * every rtt. 160 * every rtt.
161 */ 161 */
162 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 162 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
163 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
164 tp->snd_cwnd++;
165 tp->snd_cwnd_cnt = 0;
166 } else
167 tp->snd_cwnd_cnt++;
168 } else { 163 } else {
169 /* In the "congestive state", increase cwnd 164 /* In the "congestive state", increase cwnd
170 * every other rtt. 165 * every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9ec843a9bbb2..66b6821b984e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -94,14 +94,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
94 94
95 } else { 95 } else {
96 /* Reno */ 96 /* Reno */
97 97 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
98 if (tp->snd_cwnd_cnt < tp->snd_cwnd)
99 tp->snd_cwnd_cnt++;
100
101 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
102 tp->snd_cwnd++;
103 tp->snd_cwnd_cnt = 0;
104 }
105 } 98 }
106 99
107 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. 100 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c8bee189a193..bda08a09357d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -222,7 +222,7 @@ fail:
222 return error; 222 return error;
223} 223}
224 224
225static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) 225int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
226{ 226{
227 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); 227 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
228 228
@@ -596,6 +596,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
596 return -EOPNOTSUPP; 596 return -EOPNOTSUPP;
597 597
598 ipc.opt = NULL; 598 ipc.opt = NULL;
599 ipc.shtx.flags = 0;
599 600
600 if (up->pending) { 601 if (up->pending) {
601 /* 602 /*
@@ -643,6 +644,9 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
643 ipc.addr = inet->saddr; 644 ipc.addr = inet->saddr;
644 645
645 ipc.oif = sk->sk_bound_dev_if; 646 ipc.oif = sk->sk_bound_dev_if;
647 err = sock_tx_timestamp(msg, sk, &ipc.shtx);
648 if (err)
649 return err;
646 if (msg->msg_controllen) { 650 if (msg->msg_controllen) {
647 err = ip_cmsg_send(sock_net(sk), msg, &ipc); 651 err = ip_cmsg_send(sock_net(sk), msg, &ipc);
648 if (err) 652 if (err)
@@ -1180,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1180 sk = sknext; 1184 sk = sknext;
1181 } while (sknext); 1185 } while (sknext);
1182 } else 1186 } else
1183 kfree_skb(skb); 1187 consume_skb(skb);
1184 spin_unlock(&hslot->lock); 1188 spin_unlock(&hslot->lock);
1185 return 0; 1189 return 0;
1186} 1190}
@@ -1819,6 +1823,7 @@ EXPORT_SYMBOL(udp_lib_getsockopt);
1819EXPORT_SYMBOL(udp_lib_setsockopt); 1823EXPORT_SYMBOL(udp_lib_setsockopt);
1820EXPORT_SYMBOL(udp_poll); 1824EXPORT_SYMBOL(udp_poll);
1821EXPORT_SYMBOL(udp_lib_get_port); 1825EXPORT_SYMBOL(udp_lib_get_port);
1826EXPORT_SYMBOL(ipv4_rcv_saddr_equal);
1822 1827
1823#ifdef CONFIG_PROC_FS 1828#ifdef CONFIG_PROC_FS
1824EXPORT_SYMBOL(udp_proc_register); 1829EXPORT_SYMBOL(udp_proc_register);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 2ad24ba31f9d..60d918c96a4f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -241,7 +241,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
241 241
242static struct dst_ops xfrm4_dst_ops = { 242static struct dst_ops xfrm4_dst_ops = {
243 .family = AF_INET, 243 .family = AF_INET,
244 .protocol = __constant_htons(ETH_P_IP), 244 .protocol = cpu_to_be16(ETH_P_IP),
245 .gc = xfrm4_garbage_collect, 245 .gc = xfrm4_garbage_collect,
246 .update_pmtu = xfrm4_update_pmtu, 246 .update_pmtu = xfrm4_update_pmtu,
247 .destroy = xfrm4_dst_destroy, 247 .destroy = xfrm4_dst_destroy,