Diffstat (limited to 'net')
-rw-r--r--  net/appletalk/aarp.c | 9
-rw-r--r--  net/appletalk/ddp.c | 2
-rw-r--r--  net/bridge/br_device.c | 15
-rw-r--r--  net/bridge/br_forward.c | 3
-rw-r--r--  net/bridge/br_if.c | 23
-rw-r--r--  net/bridge/br_input.c | 12
-rw-r--r--  net/bridge/br_netfilter.c | 38
-rw-r--r--  net/bridge/br_notify.c | 9
-rw-r--r--  net/bridge/br_private.h | 1
-rw-r--r--  net/bridge/br_stp_bpdu.c | 3
-rw-r--r--  net/bridge/netfilter/ebtables.c | 21
-rw-r--r--  net/core/Makefile | 3
-rw-r--r--  net/core/dev.c | 138
-rw-r--r--  net/core/ethtool.c | 22
-rw-r--r--  net/core/neighbour.c | 336
-rw-r--r--  net/core/net-sysfs.c | 20
-rw-r--r--  net/core/netfilter.c | 138
-rw-r--r--  net/core/netpoll.c | 80
-rw-r--r--  net/core/pktgen.c | 29
-rw-r--r--  net/core/request_sock.c | 64
-rw-r--r--  net/core/rtnetlink.c | 33
-rw-r--r--  net/core/skbuff.c | 163
-rw-r--r--  net/core/sock.c | 35
-rw-r--r--  net/core/sysctl_net_core.c | 61
-rw-r--r--  net/decnet/dn_dev.c | 9
-rw-r--r--  net/decnet/dn_neigh.c | 1
-rw-r--r--  net/decnet/dn_route.c | 11
-rw-r--r--  net/decnet/dn_rules.c | 7
-rw-r--r--  net/decnet/dn_table.c | 8
-rw-r--r--  net/ipv4/Kconfig | 145
-rw-r--r--  net/ipv4/Makefile | 14
-rw-r--r--  net/ipv4/af_inet.c | 13
-rw-r--r--  net/ipv4/ah4.c | 2
-rw-r--r--  net/ipv4/devinet.c | 45
-rw-r--r--  net/ipv4/esp4.c | 4
-rw-r--r--  net/ipv4/fib_frontend.c | 55
-rw-r--r--  net/ipv4/fib_hash.c | 3
-rw-r--r--  net/ipv4/fib_lookup.h | 3
-rw-r--r--  net/ipv4/fib_rules.c | 7
-rw-r--r--  net/ipv4/fib_semantics.c | 10
-rw-r--r--  net/ipv4/fib_trie.c | 2454
-rw-r--r--  net/ipv4/icmp.c | 9
-rw-r--r--  net/ipv4/ip_input.c | 5
-rw-r--r--  net/ipv4/ip_output.c | 11
-rw-r--r--  net/ipv4/ip_sockglue.c | 6
-rw-r--r--  net/ipv4/ipcomp.c | 11
-rw-r--r--  net/ipv4/ipmr.c | 1
-rw-r--r--  net/ipv4/ipvs/Makefile | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 8
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto.c | 3
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_icmp.c | 182
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 4
-rw-r--r--  net/ipv4/ipvs/ip_vs_xmit.c | 1
-rw-r--r--  net/ipv4/multipath_drr.c | 20
-rw-r--r--  net/ipv4/multipath_random.c | 2
-rw-r--r--  net/ipv4/multipath_rr.c | 22
-rw-r--r--  net/ipv4/multipath_wrandom.c | 2
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 1
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c | 7
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_core.c | 107
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_ftp.c | 7
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_irc.c | 7
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 23
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 27
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c | 23
-rw-r--r--  net/ipv4/netfilter/ip_nat_core.c | 32
-rw-r--r--  net/ipv4/netfilter/ip_nat_helper.c | 13
-rw-r--r--  net/ipv4/netfilter/ip_nat_rule.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_nat_standalone.c | 5
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 10
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 1
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 51
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 10
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 13
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 15
-rw-r--r--  net/ipv4/netfilter/ipt_hashlimit.c | 17
-rw-r--r--  net/ipv4/netfilter/ipt_helper.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c | 10
-rw-r--r--  net/ipv4/raw.c | 22
-rw-r--r--  net/ipv4/route.c | 19
-rw-r--r--  net/ipv4/syncookies.c | 49
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 123
-rw-r--r--  net/ipv4/tcp.c | 119
-rw-r--r--  net/ipv4/tcp_bic.c | 331
-rw-r--r--  net/ipv4/tcp_cong.c | 237
-rw-r--r--  net/ipv4/tcp_diag.c | 71
-rw-r--r--  net/ipv4/tcp_highspeed.c | 181
-rw-r--r--  net/ipv4/tcp_htcp.c | 289
-rw-r--r--  net/ipv4/tcp_hybla.c | 187
-rw-r--r--  net/ipv4/tcp_input.c | 737
-rw-r--r--  net/ipv4/tcp_ipv4.c | 175
-rw-r--r--  net/ipv4/tcp_minisocks.c | 72
-rw-r--r--  net/ipv4/tcp_output.c | 50
-rw-r--r--  net/ipv4/tcp_scalable.c | 68
-rw-r--r--  net/ipv4/tcp_timer.c | 18
-rw-r--r--  net/ipv4/tcp_vegas.c | 411
-rw-r--r--  net/ipv4/tcp_westwood.c | 259
-rw-r--r--  net/ipv4/udp.c | 12
-rw-r--r--  net/ipv4/xfrm4_output.c | 8
-rw-r--r--  net/ipv4/xfrm4_state.c | 9
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 2
-rw-r--r--  net/ipv6/addrconf.c | 75
-rw-r--r--  net/ipv6/ah6.c | 2
-rw-r--r--  net/ipv6/anycast.c | 4
-rw-r--r--  net/ipv6/datagram.c | 6
-rw-r--r--  net/ipv6/esp6.c | 2
-rw-r--r--  net/ipv6/icmp.c | 14
-rw-r--r--  net/ipv6/ip6_fib.c | 19
-rw-r--r--  net/ipv6/ip6_flowlabel.c | 10
-rw-r--r--  net/ipv6/ip6_output.c | 3
-rw-r--r--  net/ipv6/ip6_tunnel.c | 1
-rw-r--r--  net/ipv6/ipcomp6.c | 9
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 5
-rw-r--r--  net/ipv6/ipv6_syms.c | 1
-rw-r--r--  net/ipv6/mcast.c | 68
-rw-r--r--  net/ipv6/ndisc.c | 4
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c | 1
-rw-r--r--  net/ipv6/netfilter/ip6t_LOG.c | 54
-rw-r--r--  net/ipv6/netfilter/ip6table_raw.c | 6
-rw-r--r--  net/ipv6/raw.c | 8
-rw-r--r--  net/ipv6/route.c | 79
-rw-r--r--  net/ipv6/tcp_ipv6.c | 150
-rw-r--r--  net/ipv6/udp.c | 4
-rw-r--r--  net/ipv6/xfrm6_tunnel.c | 2
-rw-r--r--  net/key/af_key.c | 385
-rw-r--r--  net/netlink/af_netlink.c | 19
-rw-r--r--  net/rxrpc/krxiod.c | 2
-rw-r--r--  net/rxrpc/krxsecd.c | 2
-rw-r--r--  net/rxrpc/krxtimod.c | 2
-rw-r--r--  net/sched/Kconfig | 15
-rw-r--r--  net/sched/Makefile | 1
-rw-r--r--  net/sched/act_api.c | 13
-rw-r--r--  net/sched/cls_api.c | 5
-rw-r--r--  net/sched/cls_basic.c | 3
-rw-r--r--  net/sched/em_meta.c | 295
-rw-r--r--  net/sched/em_text.c | 157
-rw-r--r--  net/sched/sch_api.c | 10
-rw-r--r--  net/sched/sch_dsmark.c | 373
-rw-r--r--  net/sched/sch_fifo.c | 152
-rw-r--r--  net/sched/sch_generic.c | 84
-rw-r--r--  net/sctp/associola.c | 151
-rw-r--r--  net/sctp/endpointola.c | 1
-rw-r--r--  net/sctp/input.c | 51
-rw-r--r--  net/sctp/ipv6.c | 36
-rw-r--r--  net/sctp/outqueue.c | 11
-rw-r--r--  net/sctp/proc.c | 194
-rw-r--r--  net/sctp/protocol.c | 7
-rw-r--r--  net/sctp/sm_make_chunk.c | 20
-rw-r--r--  net/sctp/sm_sideeffect.c | 105
-rw-r--r--  net/sctp/sm_statefuns.c | 162
-rw-r--r--  net/sctp/sm_statetable.c | 6
-rw-r--r--  net/sctp/socket.c | 425
-rw-r--r--  net/sctp/transport.c | 4
-rw-r--r--  net/socket.c | 12
-rw-r--r--  net/sunrpc/auth.c | 6
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 18
-rw-r--r--  net/sunrpc/clnt.c | 205
-rw-r--r--  net/sunrpc/pmap_clnt.c | 9
-rw-r--r--  net/sunrpc/sched.c | 84
-rw-r--r--  net/sunrpc/sunrpc_syms.c | 7
-rw-r--r--  net/sunrpc/svc.c | 36
-rw-r--r--  net/sunrpc/svcauth_unix.c | 11
-rw-r--r--  net/sunrpc/svcsock.c | 6
-rw-r--r--  net/sunrpc/xdr.c | 298
-rw-r--r--  net/sunrpc/xprt.c | 71
-rw-r--r--  net/x25/af_x25.c | 110
-rw-r--r--  net/x25/x25_facilities.c | 34
-rw-r--r--  net/x25/x25_subr.c | 41
-rw-r--r--  net/xfrm/xfrm_policy.c | 9
-rw-r--r--  net/xfrm/xfrm_state.c | 118
-rw-r--r--  net/xfrm/xfrm_user.c | 297
172 files changed, 9399 insertions, 3366 deletions
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 54640c01b50c..c34614ea5fce 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -35,6 +35,7 @@
 #include <net/datalink.h>
 #include <net/psnap.h>
 #include <linux/atalk.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -462,8 +463,7 @@ void aarp_probe_network(struct atalk_iface *atif)
 		aarp_send_probe(atif->dev, &atif->address);
 
 		/* Defer 1/10th */
-		current->state = TASK_INTERRUPTIBLE;
-		schedule_timeout(HZ / 10);
+		msleep(100);
 
 		if (atif->status & ATIF_PROBE_FAIL)
 			break;
@@ -510,9 +510,8 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
 		aarp_send_probe(atif->dev, sa);
 
 		/* Defer 1/10th */
-		current->state = TASK_INTERRUPTIBLE;
 		write_unlock_bh(&aarp_lock);
-		schedule_timeout(HZ / 10);
+		msleep(100);
 		write_lock_bh(&aarp_lock);
 
 		if (entry->status & ATIF_PROBE_FAIL)
@@ -565,7 +564,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
 		 * numbers we just happen to need. Now put the
 		 * length in the lower two.
 		 */
-		*((__u16 *)skb->data) = htons(skb->len);
+		*((__be16 *)skb->data) = htons(skb->len);
 		ft = 1;
 	}
 	/*
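
The two aarp.c hunks above replace the open-coded "set TASK_INTERRUPTIBLE, then schedule_timeout(HZ / 10)" pattern with msleep(100), expressing the tenth-of-a-second pause in milliseconds instead of HZ ticks. A minimal userspace analogue of that 100 ms delay (standalone C using nanosleep, which is the libc counterpart here, not the kernel call):

    #define _POSIX_C_SOURCE 199309L
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        /* 100 ms = 1/10th of a second, independent of any tick rate. */
        struct timespec delay = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };

        nanosleep(&delay, NULL);
        puts("done waiting 100 ms");
        return 0;
    }
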
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 876dbac71060..192b529f86a4 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -401,7 +401,7 @@ out_err:
 }
 
 /* Find a match for a specific network:node pair */
-static struct atalk_iface *atalk_find_interface(int net, int node)
+static struct atalk_iface *atalk_find_interface(__be16 net, int node)
 {
 	struct atalk_iface *iface;
 
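
The __u16 to __be16 changes in these two AppleTalk files are endianness annotations: the 16-bit values are stored in network (big-endian) byte order, which is what htons() produces. A small standalone C sketch of the conversion (values chosen only for illustration):

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>   /* htons(), ntohs() */

    int main(void)
    {
        uint16_t len  = 0x0123;          /* host byte order */
        uint16_t wire = htons(len);      /* big-endian, as written to the packet */
        unsigned char *p = (unsigned char *)&wire;

        /* Regardless of host endianness, the wire form is 0x01 then 0x23. */
        printf("wire bytes: %02x %02x, back to host: 0x%04x\n",
               p[0], p[1], (unsigned)ntohs(wire));
        return 0;
    }
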
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index d9b72fde433c..f564ee99782d 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -21,10 +21,7 @@
 
 static struct net_device_stats *br_dev_get_stats(struct net_device *dev)
 {
-	struct net_bridge *br;
-
-	br = dev->priv;
-
+	struct net_bridge *br = netdev_priv(dev);
 	return &br->statistics;
 }
 
@@ -54,9 +51,11 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static int br_dev_open(struct net_device *dev)
 {
-	netif_start_queue(dev);
+	struct net_bridge *br = netdev_priv(dev);
 
-	br_stp_enable_bridge(dev->priv);
+	br_features_recompute(br);
+	netif_start_queue(dev);
+	br_stp_enable_bridge(br);
 
 	return 0;
 }
@@ -67,7 +66,7 @@ static void br_dev_set_multicast_list(struct net_device *dev)
 
 static int br_dev_stop(struct net_device *dev)
 {
-	br_stp_disable_bridge(dev->priv);
+	br_stp_disable_bridge(netdev_priv(dev));
 
 	netif_stop_queue(dev);
 
@@ -76,7 +75,7 @@ static int br_dev_stop(struct net_device *dev)
 
 static int br_change_mtu(struct net_device *dev, int new_mtu)
 {
-	if ((new_mtu < 68) || new_mtu > br_min_mtu(dev->priv))
+	if (new_mtu < 68 || new_mtu > br_min_mtu(netdev_priv(dev)))
 		return -EINVAL;
 
 	dev->mtu = new_mtu;
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ef9f2095f96e..069253f830c1 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -57,9 +57,6 @@ int br_forward_finish(struct sk_buff *skb)
 static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 {
 	skb->dev = to->dev;
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug = 0;
-#endif
 	NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
 			br_forward_finish);
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 69872bf3b87e..91bb895375f4 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -314,6 +314,28 @@ int br_min_mtu(const struct net_bridge *br)
 	return mtu;
 }
 
+/*
+ * Recomputes features using slave's features
+ */
+void br_features_recompute(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+	unsigned long features, checksum;
+
+	features = NETIF_F_SG | NETIF_F_FRAGLIST
+		| NETIF_F_HIGHDMA | NETIF_F_TSO;
+	checksum = NETIF_F_IP_CSUM;	/* least commmon subset */
+
+	list_for_each_entry(p, &br->port_list, list) {
+		if (!(p->dev->features
+		      & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
+			checksum = 0;
+		features &= p->dev->features;
+	}
+
+	br->dev->features = features | checksum | NETIF_F_LLTX;
+}
+
 /* called with RTNL */
 int br_add_if(struct net_bridge *br, struct net_device *dev)
 {
@@ -368,6 +390,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 
 	spin_lock_bh(&br->lock);
 	br_stp_recalculate_bridge_id(br);
+	br_features_recompute(br);
 	spin_unlock_bh(&br->lock);
 
 	return 0;
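
br_features_recompute() above intersects the feature flags of every port with a bitwise AND, and only keeps a checksum bit if every port can offload checksums. A small userspace model of that intersection (the flag values below are invented for illustration, not the real NETIF_F_* constants):

    #include <stdio.h>

    #define F_SG       0x01   /* scatter/gather  */
    #define F_CSUM     0x02   /* checksum offload */
    #define F_HIGHDMA  0x04
    #define F_TSO      0x08

    int main(void)
    {
        unsigned long ports[] = { F_SG | F_CSUM | F_TSO,
                                  F_SG | F_CSUM,
                                  F_SG | F_CSUM | F_HIGHDMA };
        unsigned long features = F_SG | F_HIGHDMA | F_TSO;  /* start optimistic */
        unsigned long checksum = F_CSUM;                     /* least common subset */
        size_t i, n = sizeof(ports) / sizeof(ports[0]);

        for (i = 0; i < n; i++) {
            if (!(ports[i] & F_CSUM))
                checksum = 0;         /* one port can't offload: nobody gets it */
            features &= ports[i];     /* keep only what every port supports */
        }
        printf("bridge features: 0x%lx\n", features | checksum);
        return 0;
    }
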
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 2b1cce46cab4..9a45e6279c57 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -23,11 +23,7 @@ const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
 
 static int br_pass_frame_up_finish(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug = 0;
-#endif
-	netif_rx(skb);
-
+	netif_receive_skb(skb);
 	return 0;
 }
 
@@ -54,6 +50,9 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	struct net_bridge_fdb_entry *dst;
 	int passedup = 0;
 
+	/* insert into forwarding database after filtering to avoid spoofing */
+	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+
 	if (br->dev->flags & IFF_PROMISC) {
 		struct sk_buff *skb2;
 
@@ -108,8 +107,7 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
 		goto err;
 
-	if (p->state == BR_STATE_LEARNING ||
-	    p->state == BR_STATE_FORWARDING)
+	if (p->state == BR_STATE_LEARNING)
 		br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
 
 	if (p->br->stp_enabled &&
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index be03d3ad2648..03ae4edddac3 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -102,10 +102,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -182,10 +178,6 @@ static void __br_dnat_complain(void)
  * --Bart, 20021007 (updated) */
 static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->pkt_type == PACKET_OTHERHOST) {
 		skb->pkt_type = PACKET_HOST;
 		skb->nf_bridge->mask |= BRNF_PKT_TYPE;
@@ -207,10 +199,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
 	struct iphdr *iph = skb->nh.iph;
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -382,9 +370,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
 		goto inhdr_error;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING);
-#endif
 	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
 		return NF_DROP;
 	setup_pre_routing(skb);
@@ -468,9 +453,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 		skb->ip_summed = CHECKSUM_NONE;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING);
-#endif
 	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
 		return NF_DROP;
 	setup_pre_routing(skb);
@@ -517,10 +499,6 @@ static int br_nf_forward_finish(struct sk_buff *skb)
 	struct net_device *in;
 	struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) {
 		in = nf_bridge->physindev;
 		if (nf_bridge->mask & BRNF_PKT_TYPE) {
@@ -566,9 +544,6 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
 		(*pskb)->nh.raw += VLAN_HLEN;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
 	nf_bridge = skb->nf_bridge;
 	if (skb->pkt_type == PACKET_OTHERHOST) {
 		skb->pkt_type = PACKET_HOST;
@@ -605,10 +580,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
 		(*pskb)->nh.raw += VLAN_HLEN;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->nh.arph->ar_pln != 4) {
 		if (IS_VLAN_ARP) {
 			skb_push(*pskb, VLAN_HLEN);
@@ -627,9 +598,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
 /* PF_BRIDGE/LOCAL_OUT ***********************************************/
 static int br_nf_local_out_finish(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT);
-#endif
 	if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
 		skb_push(skb, VLAN_HLEN);
 		skb->nh.raw -= VLAN_HLEN;
@@ -731,10 +699,6 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
 			       realoutdev, br_nf_local_out_finish,
 			       NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
 	} else {
-#ifdef CONFIG_NETFILTER_DEBUG
-		skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT);
-#endif
-
 		NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
 			       realoutdev, br_nf_local_out_finish,
 			       NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
@@ -779,8 +743,6 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
 		printk(KERN_CRIT "br_netfilter: skb->dst == NULL.");
 		goto print_error;
 	}
-
-	skb->nf_debug ^= (1 << NF_IP_POST_ROUTING);
 #endif
 
 	/* We assume any code from br_dev_queue_push_xmit onwards doesn't care
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index f8fb49e34764..917311c6828b 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -65,6 +65,15 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 		}
 		break;
 
+	case NETDEV_FEAT_CHANGE:
+		if (br->dev->flags & IFF_UP)
+			br_features_recompute(br);
+
+		/* could do recursive feature change notification
+		 * but who would care??
+		 */
+		break;
+
 	case NETDEV_DOWN:
 		if (br->dev->flags & IFF_UP)
 			br_stp_disable_port(p);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 54d63f1372a0..bdf95a74d8cd 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -174,6 +174,7 @@ extern int br_add_if(struct net_bridge *br,
 extern int br_del_if(struct net_bridge *br,
 	      struct net_device *dev);
 extern int br_min_mtu(const struct net_bridge *br);
+extern void br_features_recompute(struct net_bridge *br);
 
 /* br_input.c */
 extern int br_handle_frame_finish(struct sk_buff *skb);
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index b91a875aca01..d071f1c9ad0b 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -140,6 +140,9 @@ int br_stp_handle_bpdu(struct sk_buff *skb)
 	struct net_bridge *br = p->br;
 	unsigned char *buf;
 
+	/* insert into forwarding database after filtering to avoid spoofing */
+	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+
 	/* need at least the 802 and STP headers */
 	if (!pskb_may_pull(skb, sizeof(header)+1) ||
 	    memcmp(skb->data, header, sizeof(header)))
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 18ebc664769b..c4540144f0f4 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -859,8 +859,7 @@ static int translate_table(struct ebt_replace *repl,
 		if (repl->valid_hooks & (1 << i))
 			if (check_chainloops(newinfo->hook_entry[i],
 			   cl_s, udc_cnt, i, newinfo->entries)) {
-				if (cl_s)
-					vfree(cl_s);
+				vfree(cl_s);
 				return -EINVAL;
 			}
 
@@ -883,8 +882,7 @@ static int translate_table(struct ebt_replace *repl,
 		EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
 		   ebt_cleanup_entry, &i);
 	}
-	if (cl_s)
-		vfree(cl_s);
+	vfree(cl_s);
 	return ret;
 }
 
@@ -1030,8 +1028,7 @@ static int do_replace(void __user *user, unsigned int len)
 	}
 	vfree(table);
 
-	if (counterstmp)
-		vfree(counterstmp);
+	vfree(counterstmp);
 	return ret;
 
 free_unlock:
@@ -1040,8 +1037,7 @@ free_iterate:
 	EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
 	   ebt_cleanup_entry, NULL);
 free_counterstmp:
-	if (counterstmp)
-		vfree(counterstmp);
+	vfree(counterstmp);
 	/* can be initialized in translate_table() */
 	if (newinfo->chainstack) {
 		for (i = 0; i < num_possible_cpus(); i++)
@@ -1049,11 +1045,9 @@ free_counterstmp:
 		vfree(newinfo->chainstack);
 	}
 free_entries:
-	if (newinfo->entries)
-		vfree(newinfo->entries);
+	vfree(newinfo->entries);
 free_newinfo:
-	if (newinfo)
-		vfree(newinfo);
+	vfree(newinfo);
 	return ret;
 }
 
@@ -1213,8 +1207,7 @@ void ebt_unregister_table(struct ebt_table *table)
 	down(&ebt_mutex);
 	LIST_DELETE(&ebt_tables, table);
 	up(&ebt_mutex);
-	if (table->private->entries)
-		vfree(table->private->entries);
+	vfree(table->private->entries);
 	if (table->private->chainstack) {
 		for (i = 0; i < num_possible_cpus(); i++)
 			vfree(table->private->chainstack[i]);
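
All the ebtables hunks above are the same cleanup: the "if (ptr) free(ptr)" guard is redundant because freeing a NULL pointer is defined to do nothing, in the kernel's vfree()/kfree() just as in standard C. The userspace illustration of the idiom:

    #include <stdlib.h>

    int main(void)
    {
        char *buf = NULL;

        /* C guarantees free(NULL) is a no-op, so no guard is needed. */
        free(buf);

        buf = malloc(64);
        free(buf);        /* the same unconditional call handles both cases */
        return 0;
    }
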
diff --git a/net/core/Makefile b/net/core/Makefile
index 81f03243fe2f..5e0c56b7f607 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -2,7 +2,8 @@
 # Makefile for the Linux networking core.
 #
 
-obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o
+obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
+	 gen_stats.o gen_estimator.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d4d9e2680adb..7016e0c36b3d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate.  It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain;
  *	Device drivers call our routines to queue packets here. We empty the
  *	queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -761,6 +744,18 @@ int dev_change_name(struct net_device *dev, char *newname)
 }
 
 /**
+ *	netdev_features_change - device changes fatures
+ *	@dev: device to cause notification
+ *
+ *	Called to indicate a device has changed features.
+ */
+void netdev_features_change(struct net_device *dev)
+{
+	notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL(netdev_features_change);
+
+/**
  *	netdev_state_change - device changes state
  *	@dev: device to cause notification
  *
@@ -1351,71 +1346,13 @@ out:
 			  Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 300;
+int netdev_max_backlog = 1000;
+int netdev_budget = 300;
 int weight_p = 64;            /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else  /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1436,7 +1373,6 @@ static void sample_queue(unsigned long dummy)
 
 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
 	struct softnet_data *queue;
 	unsigned long flags;
 
@@ -1452,38 +1388,22 @@ int netif_rx(struct sk_buff *skb)
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	this_cpu = smp_processor_id();
 	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
 			local_irq_restore(flags);
-			return queue->cng_level;
+			return NET_RX_SUCCESS;
 		}
 
-		if (queue->throttle)
-			queue->throttle = 0;
-
 		netif_rx_schedule(&queue->backlog_dev);
 		goto enqueue;
 	}
 
-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
-	}
-
-drop:
 	__get_cpu_var(netdev_rx_stat).dropped++;
 	local_irq_restore(flags);
 
@@ -1732,6 +1652,7 @@ static int process_backlog(struct net_device *backlog_dev, int *budget)
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
 
+	backlog_dev->weight = weight_p;
 	for (;;) {
 		struct sk_buff *skb;
 		struct net_device *dev;
@@ -1767,8 +1688,6 @@ job_done:
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);
 
-	if (queue->throttle)
-		queue->throttle = 0;
 	local_irq_enable();
 	return 0;
 }
@@ -1777,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
-	int budget = netdev_max_backlog;
-
+	int budget = netdev_budget;
 
 	local_irq_disable();
 
@@ -2042,15 +1960,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 	struct netif_rx_stats *s = v;
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		   );
+		   s->total, s->dropped, s->time_squeeze, 0,
+		   0, 0, 0, 0, /* was fastroute */
+		   s->cpu_collision );
 	return 0;
 }
 
@@ -3292,9 +3204,6 @@ static int __init net_dev_init(void)
 
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3303,11 +3212,6 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
-
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
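
The netif_rx() rewrite above removes the congestion-level sampling (RAND_LIE/OFFLINE_SAMPLE, throttle flags) and keeps only a hard limit: enqueue while the per-CPU backlog is below netdev_max_backlog, otherwise count a drop. A userspace sketch of that policy (a counter stands in for the skb queue; the names here are illustrative, not kernel API):

    #include <stdio.h>

    #define MAX_BACKLOG 1000

    static int  qlen;        /* current backlog length */
    static long dropped;     /* dropped-packet counter */

    /* Returns 0 on success, -1 if the packet was dropped. */
    static int rx_enqueue(void)
    {
        if (qlen <= MAX_BACKLOG) {
            qlen++;          /* would queue the skb here */
            return 0;
        }
        dropped++;
        return -1;
    }

    int main(void)
    {
        for (int i = 0; i < 1500; i++)
            rx_enqueue();
        printf("queued=%d dropped=%ld\n", qlen, dropped);
        return 0;
    }
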
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f05fde97c43d..a3eeb88e1c81 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -29,7 +29,7 @@ u32 ethtool_op_get_link(struct net_device *dev)
 
 u32 ethtool_op_get_tx_csum(struct net_device *dev)
 {
-	return (dev->features & NETIF_F_IP_CSUM) != 0;
+	return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
 }
 
 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
@@ -42,6 +42,15 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
 	return 0;
 }
 
+int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
+{
+	if (data)
+		dev->features |= NETIF_F_HW_CSUM;
+	else
+		dev->features &= ~NETIF_F_HW_CSUM;
+
+	return 0;
+}
 u32 ethtool_op_get_sg(struct net_device *dev)
 {
 	return (dev->features & NETIF_F_SG) != 0;
@@ -347,7 +356,7 @@ static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_coalesce coalesce;
 
-	if (!dev->ethtool_ops->get_coalesce)
+	if (!dev->ethtool_ops->set_coalesce)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
@@ -682,6 +691,7 @@ int dev_ethtool(struct ifreq *ifr)
 	void __user *useraddr = ifr->ifr_data;
 	u32 ethcmd;
 	int rc;
+	unsigned long old_features;
 
 	/*
 	 * XXX: This can be pushed down into the ethtool_* handlers that
@@ -703,6 +713,8 @@ int dev_ethtool(struct ifreq *ifr)
 		if ((rc = dev->ethtool_ops->begin(dev)) < 0)
 			return rc;
 
+	old_features = dev->features;
+
 	switch (ethcmd) {
 	case ETHTOOL_GSET:
 		rc = ethtool_get_settings(dev, useraddr);
@@ -712,7 +724,6 @@ int dev_ethtool(struct ifreq *ifr)
 		break;
 	case ETHTOOL_GDRVINFO:
 		rc = ethtool_get_drvinfo(dev, useraddr);
-
 		break;
 	case ETHTOOL_GREGS:
 		rc = ethtool_get_regs(dev, useraddr);
@@ -801,6 +812,10 @@ int dev_ethtool(struct ifreq *ifr)
 
 	if(dev->ethtool_ops->complete)
 		dev->ethtool_ops->complete(dev);
+
+	if (old_features != dev->features)
+		netdev_features_change(dev);
+
 	return rc;
 
  ioctl:
@@ -817,3 +832,4 @@ EXPORT_SYMBOL(ethtool_op_get_tx_csum);
EXPORT_SYMBOL(ethtool_op_set_sg);
 EXPORT_SYMBOL(ethtool_op_set_tso);
 EXPORT_SYMBOL(ethtool_op_set_tx_csum);
+EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
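
dev_ethtool() above snapshots dev->features before dispatching the requested operation and fires a NETDEV_FEAT_CHANGE notification only if the mask actually changed afterwards. The same save/compare/notify pattern in plain C (standalone illustration, not kernel API):

    #include <stdio.h>

    static unsigned long features = 0x3;      /* device state */

    static void features_changed(void)
    {
        printf("notify: features are now 0x%lx\n", features);
    }

    static void run_op(unsigned long set_bits)
    {
        unsigned long old_features = features;   /* snapshot first */

        features |= set_bits;                     /* the operation may change flags */

        if (old_features != features)             /* notify only on a real change */
            features_changed();
    }

    int main(void)
    {
        run_op(0x4);   /* changes the mask -> notification */
        run_op(0x4);   /* no change -> silent */
        return 0;
    }
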
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 43bdc521e20d..851eb927ed97 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -32,6 +32,7 @@
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/random.h>
+#include <linux/string.h>
 
 #define NEIGH_DEBUG 1
 
@@ -1276,9 +1277,14 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 		INIT_RCU_HEAD(&p->rcu_head);
 		p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
-		if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) {
-			kfree(p);
-			return NULL;
+		if (dev) {
+			if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
+				kfree(p);
+				return NULL;
+			}
+
+			dev_hold(dev);
+			p->dev = dev;
 		}
 		p->sysctl_table = NULL;
 		write_lock_bh(&tbl->lock);
@@ -1309,6 +1315,8 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
 			*p = parms->next;
 			parms->dead = 1;
 			write_unlock_bh(&tbl->lock);
+			if (parms->dev)
+				dev_put(parms->dev);
 			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 			return;
 		}
@@ -1546,20 +1554,323 @@ out:
 	return err;
 }
 
+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+	struct rtattr *nest = NULL;
+
+	nest = RTA_NEST(skb, NDTA_PARMS);
+
+	if (parms->dev)
+		RTA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
+
+	RTA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
+	RTA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+	RTA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
+	RTA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
+	RTA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
+	RTA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
+	RTA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
+	RTA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
+		      parms->base_reachable_time);
+	RTA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
+	RTA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
+	RTA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
+	RTA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
+	RTA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
+	RTA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
+
+	return RTA_NEST_END(skb, nest);
+
+rtattr_failure:
+	return RTA_NEST_CANCEL(skb, nest);
+}
+
+static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
+			      struct netlink_callback *cb)
+{
+	struct nlmsghdr *nlh;
+	struct ndtmsg *ndtmsg;
+
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+			       NLM_F_MULTI);
+
+	ndtmsg = NLMSG_DATA(nlh);
+
+	read_lock_bh(&tbl->lock);
+	ndtmsg->ndtm_family = tbl->family;
+
+	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+	RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
+	RTA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
+	RTA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
+	RTA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
+
+	{
+		unsigned long now = jiffies;
+		unsigned int flush_delta = now - tbl->last_flush;
+		unsigned int rand_delta = now - tbl->last_rand;
+
+		struct ndt_config ndc = {
+			.ndtc_key_len		= tbl->key_len,
+			.ndtc_entry_size	= tbl->entry_size,
+			.ndtc_entries		= atomic_read(&tbl->entries),
+			.ndtc_last_flush	= jiffies_to_msecs(flush_delta),
+			.ndtc_last_rand		= jiffies_to_msecs(rand_delta),
+			.ndtc_hash_rnd		= tbl->hash_rnd,
+			.ndtc_hash_mask		= tbl->hash_mask,
+			.ndtc_hash_chain_gc	= tbl->hash_chain_gc,
+			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
+		};
+
+		RTA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+	}
+
+	{
+		int cpu;
+		struct ndt_stats ndst;
+
+		memset(&ndst, 0, sizeof(ndst));
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct neigh_statistics	*st;
+
+			if (!cpu_possible(cpu))
+				continue;
+
+			st = per_cpu_ptr(tbl->stats, cpu);
+			ndst.ndts_allocs		+= st->allocs;
+			ndst.ndts_destroys		+= st->destroys;
+			ndst.ndts_hash_grows		+= st->hash_grows;
+			ndst.ndts_res_failed		+= st->res_failed;
+			ndst.ndts_lookups		+= st->lookups;
+			ndst.ndts_hits			+= st->hits;
+			ndst.ndts_rcv_probes_mcast	+= st->rcv_probes_mcast;
+			ndst.ndts_rcv_probes_ucast	+= st->rcv_probes_ucast;
+			ndst.ndts_periodic_gc_runs	+= st->periodic_gc_runs;
+			ndst.ndts_forced_gc_runs	+= st->forced_gc_runs;
+		}
+
+		RTA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+	}
+
+	BUG_ON(tbl->parms.dev);
+	if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+		goto rtattr_failure;
+
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_CANCEL(skb, nlh);
+
+nlmsg_failure:
+	return -1;
+}
+
+static int neightbl_fill_param_info(struct neigh_table *tbl,
+				    struct neigh_parms *parms,
+				    struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	struct ndtmsg *ndtmsg;
+	struct nlmsghdr *nlh;
+
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+			       NLM_F_MULTI);
+
+	ndtmsg = NLMSG_DATA(nlh);
+
+	read_lock_bh(&tbl->lock);
+	ndtmsg->ndtm_family = tbl->family;
+	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+
+	if (neightbl_fill_parms(skb, parms) < 0)
+		goto rtattr_failure;
+
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_CANCEL(skb, nlh);
+
+nlmsg_failure:
+	return -1;
+}
+
+static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl,
+						      int ifindex)
+{
+	struct neigh_parms *p;
+
+	for (p = &tbl->parms; p; p = p->next)
+		if ((p->dev && p->dev->ifindex == ifindex) ||
+		    (!p->dev && !ifindex))
+			return p;
+
+	return NULL;
+}
+
+int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct neigh_table *tbl;
+	struct ndtmsg *ndtmsg = NLMSG_DATA(nlh);
+	struct rtattr **tb = arg;
+	int err = -EINVAL;
+
+	if (!tb[NDTA_NAME - 1] || !RTA_PAYLOAD(tb[NDTA_NAME - 1]))
+		return -EINVAL;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+		if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+			continue;
+
+		if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id))
+			break;
+	}
+
+	if (tbl == NULL) {
+		err = -ENOENT;
+		goto errout;
+	}
+
+	/*
+	 * We acquire tbl->lock to be nice to the periodic timers and
+	 * make sure they always see a consistent set of values.
+	 */
+	write_lock_bh(&tbl->lock);
+
+	if (tb[NDTA_THRESH1 - 1])
+		tbl->gc_thresh1 = RTA_GET_U32(tb[NDTA_THRESH1 - 1]);
+
+	if (tb[NDTA_THRESH2 - 1])
+		tbl->gc_thresh2 = RTA_GET_U32(tb[NDTA_THRESH2 - 1]);
+
+	if (tb[NDTA_THRESH3 - 1])
+		tbl->gc_thresh3 = RTA_GET_U32(tb[NDTA_THRESH3 - 1]);
+
+	if (tb[NDTA_GC_INTERVAL - 1])
+		tbl->gc_interval = RTA_GET_MSECS(tb[NDTA_GC_INTERVAL - 1]);
+
+	if (tb[NDTA_PARMS - 1]) {
+		struct rtattr *tbp[NDTPA_MAX];
+		struct neigh_parms *p;
+		u32 ifindex = 0;
+
+		if (rtattr_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS - 1]) < 0)
+			goto rtattr_failure;
+
+		if (tbp[NDTPA_IFINDEX - 1])
+			ifindex = RTA_GET_U32(tbp[NDTPA_IFINDEX - 1]);
+
+		p = lookup_neigh_params(tbl, ifindex);
+		if (p == NULL) {
+			err = -ENOENT;
+			goto rtattr_failure;
+		}
+
+		if (tbp[NDTPA_QUEUE_LEN - 1])
+			p->queue_len = RTA_GET_U32(tbp[NDTPA_QUEUE_LEN - 1]);
+
+		if (tbp[NDTPA_PROXY_QLEN - 1])
+			p->proxy_qlen = RTA_GET_U32(tbp[NDTPA_PROXY_QLEN - 1]);
+
+		if (tbp[NDTPA_APP_PROBES - 1])
+			p->app_probes = RTA_GET_U32(tbp[NDTPA_APP_PROBES - 1]);
+
+		if (tbp[NDTPA_UCAST_PROBES - 1])
+			p->ucast_probes =
+				RTA_GET_U32(tbp[NDTPA_UCAST_PROBES - 1]);
+
+		if (tbp[NDTPA_MCAST_PROBES - 1])
+			p->mcast_probes =
+				RTA_GET_U32(tbp[NDTPA_MCAST_PROBES - 1]);
+
+		if (tbp[NDTPA_BASE_REACHABLE_TIME - 1])
+			p->base_reachable_time =
+				RTA_GET_MSECS(tbp[NDTPA_BASE_REACHABLE_TIME - 1]);
+
+		if (tbp[NDTPA_GC_STALETIME - 1])
+			p->gc_staletime =
+				RTA_GET_MSECS(tbp[NDTPA_GC_STALETIME - 1]);
+
+		if (tbp[NDTPA_DELAY_PROBE_TIME - 1])
+			p->delay_probe_time =
+				RTA_GET_MSECS(tbp[NDTPA_DELAY_PROBE_TIME - 1]);
+
+		if (tbp[NDTPA_RETRANS_TIME - 1])
+			p->retrans_time =
+				RTA_GET_MSECS(tbp[NDTPA_RETRANS_TIME - 1]);
+
+		if (tbp[NDTPA_ANYCAST_DELAY - 1])
+			p->anycast_delay =
+				RTA_GET_MSECS(tbp[NDTPA_ANYCAST_DELAY - 1]);
+
+		if (tbp[NDTPA_PROXY_DELAY - 1])
+			p->proxy_delay =
+				RTA_GET_MSECS(tbp[NDTPA_PROXY_DELAY - 1]);
+
+		if (tbp[NDTPA_LOCKTIME - 1])
+			p->locktime = RTA_GET_MSECS(tbp[NDTPA_LOCKTIME - 1]);
+	}
+
+	err = 0;
+
+rtattr_failure:
+	write_unlock_bh(&tbl->lock);
+errout:
+	read_unlock(&neigh_tbl_lock);
+	return err;
+}
+
+int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx, family;
+	int s_idx = cb->args[0];
+	struct neigh_table *tbl;
+
+	family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables, idx = 0; tbl; tbl = tbl->next) {
+		struct neigh_parms *p;
+
+		if (idx < s_idx || (family && tbl->family != family))
+			continue;
+
+		if (neightbl_fill_info(tbl, skb, cb) <= 0)
+			break;
+
+		for (++idx, p = tbl->parms.next; p; p = p->next, idx++) {
+			if (idx < s_idx)
+				continue;
+
+			if (neightbl_fill_param_info(tbl, p, skb, cb) <= 0)
+				goto out;
+		}
+
+	}
+out:
+	read_unlock(&neigh_tbl_lock);
+	cb->args[0] = idx;
+
+	return skb->len;
+}
 
 static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
-			   u32 pid, u32 seq, int event)
+			   u32 pid, u32 seq, int event, unsigned int flags)
 {
 	unsigned long now = jiffies;
 	unsigned char *b = skb->tail;
 	struct nda_cacheinfo ci;
 	int locked = 0;
 	u32 probes;
-	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event,
-					 sizeof(struct ndmsg));
+	struct nlmsghdr *nlh = NLMSG_NEW(skb, pid, seq, event,
+					 sizeof(struct ndmsg), flags);
 	struct ndmsg *ndm = NLMSG_DATA(nlh);
 
-	nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0;
 	ndm->ndm_family	 = n->ops->family;
 	ndm->ndm_flags	 = n->flags;
 	ndm->ndm_type	 = n->type;
@@ -1609,7 +1920,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 			continue;
 		if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
 				    cb->nlh->nlmsg_seq,
-				    RTM_NEWNEIGH) <= 0) {
+				    RTM_NEWNEIGH,
+				    NLM_F_MULTI) <= 0) {
 			read_unlock_bh(&tbl->lock);
 			rc = -1;
 			goto out;
@@ -2018,7 +2330,7 @@ void neigh_app_ns(struct neighbour *n)
 	if (!skb)
 		return;
 
-	if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) {
+	if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
@@ -2037,7 +2349,7 @@ static void neigh_app_notify(struct neighbour *n)
 	if (!skb)
 		return;
 
-	if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) {
+	if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
@@ -2281,7 +2593,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 		t->neigh_vars[17].extra1 = dev;
 	}
 
-	dev_name = net_sysctl_strdup(dev_name_source);
+	dev_name = kstrdup(dev_name_source, GFP_KERNEL);
 	if (!dev_name) {
 		err = -ENOBUFS;
 		goto free;
@@ -2352,6 +2664,8 @@ EXPORT_SYMBOL(neigh_update);
 EXPORT_SYMBOL(neigh_update_hhs);
 EXPORT_SYMBOL(pneigh_enqueue);
 EXPORT_SYMBOL(pneigh_lookup);
+EXPORT_SYMBOL(neightbl_dump_info);
+EXPORT_SYMBOL(neightbl_set);
 
 #ifdef CONFIG_ARPD
 EXPORT_SYMBOL(neigh_app_ns);
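
One building block in the new neightbl_fill_info() above is folding the per-CPU neigh_statistics counters into a single ndt_stats block before it is put into the netlink message. A userspace model of that per-CPU aggregation (the struct and array names are invented for the example):

    #include <stdio.h>

    #define NR_CPUS 4

    struct stats { unsigned long lookups, hits; };

    int main(void)
    {
        struct stats percpu[NR_CPUS] = {
            { 10, 7 }, { 3, 1 }, { 0, 0 }, { 25, 20 }
        };
        struct stats total = { 0, 0 };

        /* Fold each CPU's counters into the totals, like the loop that
         * fills struct ndt_stats from per_cpu_ptr(tbl->stats, cpu). */
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            total.lookups += percpu[cpu].lookups;
            total.hits    += percpu[cpu].hits;
        }
        printf("lookups=%lu hits=%lu\n", total.lookups, total.hits);
        return 0;
    }
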
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 060f703659e8..e2137f3e489d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -21,6 +21,7 @@
 #define to_net_dev(class) container_of(class, struct net_device, class_dev)
 
 static const char fmt_hex[] = "%#x\n";
+static const char fmt_long_hex[] = "%#lx\n";
 static const char fmt_dec[] = "%d\n";
 static const char fmt_ulong[] = "%lu\n";
 
@@ -91,7 +92,7 @@ static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL) \
 NETDEVICE_ATTR(addr_len, fmt_dec);
 NETDEVICE_ATTR(iflink, fmt_dec);
 NETDEVICE_ATTR(ifindex, fmt_dec);
-NETDEVICE_ATTR(features, fmt_hex);
+NETDEVICE_ATTR(features, fmt_long_hex);
 NETDEVICE_ATTR(type, fmt_dec);
 
 /* use same locking rules as GIFHWADDR ioctl's */
@@ -184,6 +185,22 @@ static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, siz
 static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
			 store_tx_queue_len);
 
+NETDEVICE_SHOW(weight, fmt_dec);
+
+static int change_weight(struct net_device *net, unsigned long new_weight)
+{
+	net->weight = new_weight;
+	return 0;
+}
+
+static ssize_t store_weight(struct class_device *dev, const char *buf, size_t len)
+{
+	return netdev_store(dev, buf, len, change_weight);
+}
+
+static CLASS_DEVICE_ATTR(weight, S_IRUGO | S_IWUSR, show_weight,
+			 store_weight);
+
 
 static struct class_device_attribute *net_class_attributes[] = {
 	&class_device_attr_ifindex,
@@ -193,6 +210,7 @@ static struct class_device_attribute *net_class_attributes[] = {
 	&class_device_attr_features,
 	&class_device_attr_mtu,
 	&class_device_attr_flags,
+	&class_device_attr_weight,
 	&class_device_attr_type,
 	&class_device_attr_address,
 	&class_device_attr_broadcast,
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
index 22a8f127c4aa..076c156d5eda 100644
--- a/net/core/netfilter.c
+++ b/net/core/netfilter.c
@@ -141,136 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
 	up(&nf_sockopt_mutex);
 }
 
-#ifdef CONFIG_NETFILTER_DEBUG
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4.h>
-
-static void debug_print_hooks_ip(unsigned int nf_debug)
-{
-	if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
-		printk("PRE_ROUTING ");
-		nf_debug ^= (1 << NF_IP_PRE_ROUTING);
-	}
-	if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
-		printk("LOCAL_IN ");
-		nf_debug ^= (1 << NF_IP_LOCAL_IN);
-	}
-	if (nf_debug & (1 << NF_IP_FORWARD)) {
-		printk("FORWARD ");
-		nf_debug ^= (1 << NF_IP_FORWARD);
-	}
-	if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
-		printk("LOCAL_OUT ");
-		nf_debug ^= (1 << NF_IP_LOCAL_OUT);
-	}
-	if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
-		printk("POST_ROUTING ");
-		nf_debug ^= (1 << NF_IP_POST_ROUTING);
-	}
-	if (nf_debug)
-		printk("Crap bits: 0x%04X", nf_debug);
-	printk("\n");
-}
-
-static void nf_dump_skb(int pf, struct sk_buff *skb)
-{
-	printk("skb: pf=%i %s dev=%s len=%u\n",
-	       pf,
-	       skb->sk ? "(owned)" : "(unowned)",
-	       skb->dev ? skb->dev->name : "(no dev)",
-	       skb->len);
-	switch (pf) {
-	case PF_INET: {
-		const struct iphdr *ip = skb->nh.iph;
-		__u32 *opt = (__u32 *) (ip + 1);
-		int opti;
-		__u16 src_port = 0, dst_port = 0;
-
-		if (ip->protocol == IPPROTO_TCP
-		    || ip->protocol == IPPROTO_UDP) {
-			struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
-			src_port = ntohs(tcp->source);
-			dst_port = ntohs(tcp->dest);
-		}
-
-		printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
-		       " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
-		       ip->protocol, NIPQUAD(ip->saddr),
-		       src_port, NIPQUAD(ip->daddr),
-		       dst_port,
-		       ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
-		       ntohs(ip->frag_off), ip->ttl);
-
-		for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
-			printk(" O=0x%8.8X", *opt++);
-		printk("\n");
-	}
-	}
-}
-
-void nf_debug_ip_local_deliver(struct sk_buff *skb)
-{
-	/* If it's a loopback packet, it must have come through
-	 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
-	 * NF_IP_LOCAL_IN.  Otherwise, must have gone through
-	 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING.  */
-	if (!skb->dev) {
-		printk("ip_local_deliver: skb->dev is NULL.\n");
-	} else {
-		if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
-				      | (1<<NF_IP_LOCAL_IN))) {
-			printk("ip_local_deliver: bad skb: ");
-			debug_print_hooks_ip(skb->nf_debug);
-			nf_dump_skb(PF_INET, skb);
-		}
-	}
-}
-
-void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
-{
-	if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-				 | (1 << NF_IP_POST_ROUTING))) {
-		printk("ip_dev_loopback_xmit: bad owned skb = %p: ",
-		       newskb);
-		debug_print_hooks_ip(newskb->nf_debug);
-		nf_dump_skb(PF_INET, newskb);
-	}
-}
-
-void nf_debug_ip_finish_output2(struct sk_buff *skb)
-{
-	/* If it's owned, it must have gone through the
-	 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
-	 * Otherwise, must have gone through
-	 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
-	 */
-	if (skb->sk) {
-		if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-				      | (1 << NF_IP_POST_ROUTING))) {
-			printk("ip_finish_output: bad owned skb = %p: ", skb);
-			debug_print_hooks_ip(skb->nf_debug);
-			nf_dump_skb(PF_INET, skb);
-		}
-	} else {
-		if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
-				      | (1 << NF_IP_FORWARD)
-				      | (1 << NF_IP_POST_ROUTING))) {
-			/* Fragments, entunnelled packets, TCP RSTs
-			   generated by ipt_REJECT will have no
-			   owners, but still may be local */
-			if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-					      | (1 << NF_IP_POST_ROUTING))){
-				printk("ip_finish_output:"
-				       " bad unowned skb = %p: ",skb);
-				debug_print_hooks_ip(skb->nf_debug);
-				nf_dump_skb(PF_INET, skb);
-			}
-		}
-	}
-}
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
 /* Call get/setsockopt() */
 static int nf_sockopt(struct sock *sk, int pf, int val,
 		      char __user *opt, int *len, int get)
@@ -488,14 +358,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	if (unlikely((*pskb)->nf_debug & (1 << hook))) {
-		printk("nf_hook: hook %i already set.\n", hook);
494 nf_dump_skb(pf, *pskb);
495 }
496 (*pskb)->nf_debug |= (1 << hook);
497#endif
498
499 elem = &nf_hooks[pf][hook]; 361 elem = &nf_hooks[pf][hook];
500next_hook: 362next_hook:
501 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, 363 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a119696d5521..c327c9edadc5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -130,19 +130,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
130 */ 130 */
131static void poll_napi(struct netpoll *np) 131static void poll_napi(struct netpoll *np)
132{ 132{
133 struct netpoll_info *npinfo = np->dev->npinfo;
133 int budget = 16; 134 int budget = 16;
134 135
135 if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && 136 if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
136 np->poll_owner != smp_processor_id() && 137 npinfo->poll_owner != smp_processor_id() &&
137 spin_trylock(&np->poll_lock)) { 138 spin_trylock(&npinfo->poll_lock)) {
138 np->rx_flags |= NETPOLL_RX_DROP; 139 npinfo->rx_flags |= NETPOLL_RX_DROP;
139 atomic_inc(&trapped); 140 atomic_inc(&trapped);
140 141
141 np->dev->poll(np->dev, &budget); 142 np->dev->poll(np->dev, &budget);
142 143
143 atomic_dec(&trapped); 144 atomic_dec(&trapped);
144 np->rx_flags &= ~NETPOLL_RX_DROP; 145 npinfo->rx_flags &= ~NETPOLL_RX_DROP;
145 spin_unlock(&np->poll_lock); 146 spin_unlock(&npinfo->poll_lock);
146 } 147 }
147} 148}
148 149
@@ -245,6 +246,7 @@ repeat:
245static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) 246static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
246{ 247{
247 int status; 248 int status;
249 struct netpoll_info *npinfo;
248 250
249repeat: 251repeat:
250 if(!np || !np->dev || !netif_running(np->dev)) { 252 if(!np || !np->dev || !netif_running(np->dev)) {
@@ -253,8 +255,9 @@ repeat:
253 } 255 }
254 256
255 /* avoid recursion */ 257 /* avoid recursion */
256 if(np->poll_owner == smp_processor_id() || 258 npinfo = np->dev->npinfo;
257 np->dev->xmit_lock_owner == smp_processor_id()) { 259 if (npinfo->poll_owner == smp_processor_id() ||
260 np->dev->xmit_lock_owner == smp_processor_id()) {
258 if (np->drop) 261 if (np->drop)
259 np->drop(skb); 262 np->drop(skb);
260 else 263 else
@@ -341,14 +344,22 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
341 344
342static void arp_reply(struct sk_buff *skb) 345static void arp_reply(struct sk_buff *skb)
343{ 346{
347 struct netpoll_info *npinfo = skb->dev->npinfo;
344 struct arphdr *arp; 348 struct arphdr *arp;
345 unsigned char *arp_ptr; 349 unsigned char *arp_ptr;
346 int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; 350 int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
347 u32 sip, tip; 351 u32 sip, tip;
352 unsigned long flags;
348 struct sk_buff *send_skb; 353 struct sk_buff *send_skb;
349 struct netpoll *np = skb->dev->np; 354 struct netpoll *np = NULL;
355
356 spin_lock_irqsave(&npinfo->rx_lock, flags);
357 if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
358 np = npinfo->rx_np;
359 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
350 360
351 if (!np) return; 361 if (!np)
362 return;
352 363
353 /* No arp on this interface */ 364 /* No arp on this interface */
354 if (skb->dev->flags & IFF_NOARP) 365 if (skb->dev->flags & IFF_NOARP)
@@ -429,9 +440,9 @@ int __netpoll_rx(struct sk_buff *skb)
429 int proto, len, ulen; 440 int proto, len, ulen;
430 struct iphdr *iph; 441 struct iphdr *iph;
431 struct udphdr *uh; 442 struct udphdr *uh;
432 struct netpoll *np = skb->dev->np; 443 struct netpoll *np = skb->dev->npinfo->rx_np;
433 444
434 if (!np->rx_hook) 445 if (!np)
435 goto out; 446 goto out;
436 if (skb->dev->type != ARPHRD_ETHER) 447 if (skb->dev->type != ARPHRD_ETHER)
437 goto out; 448 goto out;
@@ -611,9 +622,8 @@ int netpoll_setup(struct netpoll *np)
611{ 622{
612 struct net_device *ndev = NULL; 623 struct net_device *ndev = NULL;
613 struct in_device *in_dev; 624 struct in_device *in_dev;
614 625 struct netpoll_info *npinfo;
615 np->poll_lock = SPIN_LOCK_UNLOCKED; 626 unsigned long flags;
616 np->poll_owner = -1;
617 627
618 if (np->dev_name) 628 if (np->dev_name)
619 ndev = dev_get_by_name(np->dev_name); 629 ndev = dev_get_by_name(np->dev_name);
@@ -624,7 +634,17 @@ int netpoll_setup(struct netpoll *np)
624 } 634 }
625 635
626 np->dev = ndev; 636 np->dev = ndev;
627 ndev->np = np; 637 if (!ndev->npinfo) {
638 npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
639 if (!npinfo)
640 goto release;
641
642 npinfo->rx_np = NULL;
643 npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
644 npinfo->poll_owner = -1;
645 npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
646 } else
647 npinfo = ndev->npinfo;
628 648
629 if (!ndev->poll_controller) { 649 if (!ndev->poll_controller) {
630 printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", 650 printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
@@ -692,13 +712,20 @@ int netpoll_setup(struct netpoll *np)
692 np->name, HIPQUAD(np->local_ip)); 712 np->name, HIPQUAD(np->local_ip));
693 } 713 }
694 714
695 if(np->rx_hook) 715 if (np->rx_hook) {
696 np->rx_flags = NETPOLL_RX_ENABLED; 716 spin_lock_irqsave(&npinfo->rx_lock, flags);
717 npinfo->rx_flags |= NETPOLL_RX_ENABLED;
718 npinfo->rx_np = np;
719 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
720 }
721 /* last thing to do is link it to the net device structure */
722 ndev->npinfo = npinfo;
697 723
698 return 0; 724 return 0;
699 725
700 release: 726 release:
701 ndev->np = NULL; 727 if (!ndev->npinfo)
728 kfree(npinfo);
702 np->dev = NULL; 729 np->dev = NULL;
703 dev_put(ndev); 730 dev_put(ndev);
704 return -1; 731 return -1;
@@ -706,9 +733,20 @@ int netpoll_setup(struct netpoll *np)
706 733
707void netpoll_cleanup(struct netpoll *np) 734void netpoll_cleanup(struct netpoll *np)
708{ 735{
709 if (np->dev) 736 struct netpoll_info *npinfo;
710 np->dev->np = NULL; 737 unsigned long flags;
711 dev_put(np->dev); 738
739 if (np->dev) {
740 npinfo = np->dev->npinfo;
741 if (npinfo && npinfo->rx_np == np) {
742 spin_lock_irqsave(&npinfo->rx_lock, flags);
743 npinfo->rx_np = NULL;
744 npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
745 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
746 }
747 dev_put(np->dev);
748 }
749
712 np->dev = NULL; 750 np->dev = NULL;
713} 751}
714 752
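
With the per-device netpoll_info introduced above, poll_owner, poll_lock and the rx state now live in ndev->npinfo and are shared by every netpoll client attached to the device, while clients keep filling in a struct netpoll and calling netpoll_setup() as before. A minimal sketch of such a client follows; the struct netpoll field names are assumed from typical netconsole-style usage and are not spelled out in this diff.

	/* Hypothetical in-kernel client of the reworked netpoll API. */
	#include <linux/init.h>
	#include <linux/netpoll.h>

	static void my_rx(struct netpoll *np, int port, char *msg, int len)
	{
		/* payload trapped by __netpoll_rx() via npinfo->rx_np */
	}

	static struct netpoll my_np = {
		.name        = "example",
		.dev_name    = "eth0",		/* resolved with dev_get_by_name() */
		.local_port  = 6666,
		.remote_port = 6666,
		.rx_hook     = my_rx,		/* makes netpoll_setup() set npinfo->rx_np */
	};

	static int __init example_init(void)
	{
		/* allocates ndev->npinfo on first use, as in the hunk above */
		return netpoll_setup(&my_np);
	}

	static void __exit example_exit(void)
	{
		netpoll_cleanup(&my_np);	/* clears npinfo->rx_np under rx_lock */
	}
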
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06bc79f3..975d651312dc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
151#include <asm/timex.h> 151#include <asm/timex.h>
152 152
153 153
154#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" 154#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
155 155
156/* #define PG_DEBUG(a) a */ 156/* #define PG_DEBUG(a) a */
157#define PG_DEBUG(a) 157#define PG_DEBUG(a)
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1921 struct iphdr *iph; 1921 struct iphdr *iph;
1922 struct pktgen_hdr *pgh = NULL; 1922 struct pktgen_hdr *pgh = NULL;
1923 1923
1924 /* Update any of the values, used when we're incrementing various
1925 * fields.
1926 */
1927 mod_cur_headers(pkt_dev);
1928
1924 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 1929 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
1925 if (!skb) { 1930 if (!skb) {
1926 sprintf(pkt_dev->result, "No memory"); 1931 sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1934 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); 1939 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
1935 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 1940 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
1936 1941
1937 /* Update any of the values, used when we're incrementing various
1938 * fields.
1939 */
1940 mod_cur_headers(pkt_dev);
1941
1942 memcpy(eth, pkt_dev->hh, 12); 1942 memcpy(eth, pkt_dev->hh, 12);
1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP); 1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP);
1944 1944
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2192 int datalen; 2192 int datalen;
2193 struct ipv6hdr *iph; 2193 struct ipv6hdr *iph;
2194 struct pktgen_hdr *pgh = NULL; 2194 struct pktgen_hdr *pgh = NULL;
2195 2195
2196 /* Update any of the values, used when we're incrementing various
2197 * fields.
2198 */
2199 mod_cur_headers(pkt_dev);
2200
2196 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 2201 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
2197 if (!skb) { 2202 if (!skb) {
2198 sprintf(pkt_dev->result, "No memory"); 2203 sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2206 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); 2211 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
2207 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 2212 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
2208 2213
2209
2210 /* Update any of the values, used when we're incrementing various
2211 * fields.
2212 */
2213 mod_cur_headers(pkt_dev);
2214
2215
2216 memcpy(eth, pkt_dev->hh, 12); 2214 memcpy(eth, pkt_dev->hh, 12);
2217 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6); 2215 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
2218 2216
2219
2220 datalen = pkt_dev->cur_pkt_size-14- 2217 datalen = pkt_dev->cur_pkt_size-14-
2221 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ 2218 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
2222 2219
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 000000000000..bb55675f0685
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,64 @@
1/*
2 * NET Generic infrastructure for Network protocols.
3 *
4 * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
5 *
6 * From code originally in include/net/tcp.h
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/module.h>
15#include <linux/random.h>
16#include <linux/slab.h>
17#include <linux/string.h>
18
19#include <net/request_sock.h>
20
21/*
22 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
23 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
24 * It would be better to replace it with a global counter for all sockets
25 * but then some measure against one socket starving all other sockets
26 * would be needed.
27 *
28 * It was 128 by default. Experiments with real servers show that
29 * it is absolutely not enough even at 100 conn/sec. 256 cures most
30 * of the problems. This value is adjusted to 128 for very small machines
31 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
32 * Further increasing requires to change hash table size.
33 */
34int sysctl_max_syn_backlog = 256;
35EXPORT_SYMBOL(sysctl_max_syn_backlog);
36
37int reqsk_queue_alloc(struct request_sock_queue *queue,
38 const int nr_table_entries)
39{
40 const int lopt_size = sizeof(struct listen_sock) +
41 nr_table_entries * sizeof(struct request_sock *);
42 struct listen_sock *lopt = kmalloc(lopt_size, GFP_KERNEL);
43
44 if (lopt == NULL)
45 return -ENOMEM;
46
47 memset(lopt, 0, lopt_size);
48
49 for (lopt->max_qlen_log = 6;
50 (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
51 lopt->max_qlen_log++);
52
53 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
54 rwlock_init(&queue->syn_wait_lock);
55	queue->rskq_accept_head = NULL;
56
57 write_lock_bh(&queue->syn_wait_lock);
58 queue->listen_opt = lopt;
59 write_unlock_bh(&queue->syn_wait_lock);
60
61 return 0;
62}
63
64EXPORT_SYMBOL(reqsk_queue_alloc);
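
reqsk_queue_alloc() above sizes the SYN hash by rounding sysctl_max_syn_backlog up to a power of two and keeping only the exponent, starting from 2^6; with the default of 256 the loop stops at max_qlen_log = 8. A stand-alone re-run of the same arithmetic (userspace, for illustration only):

	#include <stdio.h>

	int main(void)
	{
		int backlog = 256;	/* sysctl_max_syn_backlog default above */
		int max_qlen_log;

		for (max_qlen_log = 6; (1 << max_qlen_log) < backlog; max_qlen_log++)
			;

		/* prints 8: 256 slots, the first power of two >= the backlog */
		printf("max_qlen_log = %d\n", max_qlen_log);
		return 0;
	}
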
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 00caf4b318b2..e013d836a7ab 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -100,6 +100,7 @@ static const int rtm_min[RTM_NR_FAMILIES] =
100 [RTM_FAM(RTM_NEWPREFIX)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), 100 [RTM_FAM(RTM_NEWPREFIX)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
101 [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), 101 [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
102 [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), 102 [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
103 [RTM_FAM(RTM_NEWNEIGHTBL)] = NLMSG_LENGTH(sizeof(struct ndtmsg)),
103}; 104};
104 105
105static const int rta_max[RTM_NR_FAMILIES] = 106static const int rta_max[RTM_NR_FAMILIES] =
@@ -113,6 +114,7 @@ static const int rta_max[RTM_NR_FAMILIES] =
113 [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, 114 [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
114 [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, 115 [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
115 [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, 116 [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
117 [RTM_FAM(RTM_NEWNEIGHTBL)] = NDTA_MAX,
116}; 118};
117 119
118void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) 120void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
@@ -176,14 +178,14 @@ rtattr_failure:
176 178
177 179
178static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, 180static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
179 int type, u32 pid, u32 seq, u32 change) 181 int type, u32 pid, u32 seq, u32 change,
182 unsigned int flags)
180{ 183{
181 struct ifinfomsg *r; 184 struct ifinfomsg *r;
182 struct nlmsghdr *nlh; 185 struct nlmsghdr *nlh;
183 unsigned char *b = skb->tail; 186 unsigned char *b = skb->tail;
184 187
185 nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); 188 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
186 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
187 r = NLMSG_DATA(nlh); 189 r = NLMSG_DATA(nlh);
188 r->ifi_family = AF_UNSPEC; 190 r->ifi_family = AF_UNSPEC;
189 r->ifi_type = dev->type; 191 r->ifi_type = dev->type;
@@ -273,7 +275,10 @@ static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *c
273 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { 275 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
274 if (idx < s_idx) 276 if (idx < s_idx)
275 continue; 277 continue;
276 if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) 278 if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK,
279 NETLINK_CB(cb->skb).pid,
280 cb->nlh->nlmsg_seq, 0,
281 NLM_F_MULTI) <= 0)
277 break; 282 break;
278 } 283 }
279 read_unlock(&dev_base_lock); 284 read_unlock(&dev_base_lock);
@@ -447,7 +452,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
447 if (!skb) 452 if (!skb)
448 return; 453 return;
449 454
450 if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) { 455 if (rtnetlink_fill_ifinfo(skb, dev, type, current->pid, 0, change, 0) < 0) {
451 kfree_skb(skb); 456 kfree_skb(skb);
452 return; 457 return;
453 } 458 }
@@ -649,14 +654,16 @@ static void rtnetlink_rcv(struct sock *sk, int len)
649 654
650static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] = 655static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
651{ 656{
652 [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo }, 657 [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
653 [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink }, 658 [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink },
654 [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, 659 [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
655 [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, 660 [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
656 [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add }, 661 [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add },
657 [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete }, 662 [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete },
658 [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info }, 663 [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info },
659 [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, 664 [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
665 [RTM_GETNEIGHTBL - RTM_BASE] = { .dumpit = neightbl_dump_info },
666 [RTM_SETNEIGHTBL - RTM_BASE] = { .doit = neightbl_set },
660}; 667};
661 668
662static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) 669static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
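
The rtnetlink change above is the template for the rest of this patch: fill routines grow an explicit flags argument, NLMSG_PUT becomes NLMSG_NEW(skb, pid, seq, type, len, flags), and dump callbacks pass NLM_F_MULTI instead of inferring multipart replies from a non-zero pid. A condensed sketch of the resulting shape, with hypothetical names (my_fill_info, struct my_msg):

	#include <linux/netlink.h>
	#include <linux/rtnetlink.h>

	/* Hypothetical fill routine following the NLMSG_NEW(..., flags) pattern;
	 * my_fill_info and struct my_msg are illustrative names only. */
	static int my_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
				int event, unsigned int flags)
	{
		struct nlmsghdr *nlh;
		unsigned char *b = skb->tail;

		nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(struct my_msg), flags);
		/* ... fill NLMSG_DATA(nlh) and append rtattrs here ... */
		nlh->nlmsg_len = skb->tail - b;
		return skb->len;

	nlmsg_failure:				/* jumped to by NLMSG_NEW on overflow */
		skb_trim(skb, b - skb->data);
		return -1;
	}

	/* Dump callbacks now mark every message as part of a multipart reply:
	 *	my_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
	 *		     RTM_NEWLINK, NLM_F_MULTI);
	 * while one-shot notifications pass flags == 0, as in rtmsg_ifinfo() above. */
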
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f65b3de590a9..bb73b2190ec7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -365,9 +365,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
365 C(nfct); 365 C(nfct);
366 nf_conntrack_get(skb->nfct); 366 nf_conntrack_get(skb->nfct);
367 C(nfctinfo); 367 C(nfctinfo);
368#ifdef CONFIG_NETFILTER_DEBUG
369 C(nf_debug);
370#endif
371#ifdef CONFIG_BRIDGE_NETFILTER 368#ifdef CONFIG_BRIDGE_NETFILTER
372 C(nf_bridge); 369 C(nf_bridge);
373 nf_bridge_get(skb->nf_bridge); 370 nf_bridge_get(skb->nf_bridge);
@@ -432,9 +429,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
432 new->nfct = old->nfct; 429 new->nfct = old->nfct;
433 nf_conntrack_get(old->nfct); 430 nf_conntrack_get(old->nfct);
434 new->nfctinfo = old->nfctinfo; 431 new->nfctinfo = old->nfctinfo;
435#ifdef CONFIG_NETFILTER_DEBUG
436 new->nf_debug = old->nf_debug;
437#endif
438#ifdef CONFIG_BRIDGE_NETFILTER 432#ifdef CONFIG_BRIDGE_NETFILTER
439 new->nf_bridge = old->nf_bridge; 433 new->nf_bridge = old->nf_bridge;
440 nf_bridge_get(old->nf_bridge); 434 nf_bridge_get(old->nf_bridge);
@@ -1506,6 +1500,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
1506 skb_split_no_header(skb, skb1, len, pos); 1500 skb_split_no_header(skb, skb1, len, pos);
1507} 1501}
1508 1502
1503/**
1504 * skb_prepare_seq_read - Prepare a sequential read of skb data
1505 * @skb: the buffer to read
1506 * @from: lower offset of data to be read
1507 * @to: upper offset of data to be read
1508 * @st: state variable
1509 *
1510 * Initializes the specified state variable. Must be called before
1511 * invoking skb_seq_read() for the first time.
1512 */
1513void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
1514 unsigned int to, struct skb_seq_state *st)
1515{
1516 st->lower_offset = from;
1517 st->upper_offset = to;
1518 st->root_skb = st->cur_skb = skb;
1519 st->frag_idx = st->stepped_offset = 0;
1520 st->frag_data = NULL;
1521}
1522
1523/**
1524 * skb_seq_read - Sequentially read skb data
1525 * @consumed: number of bytes consumed by the caller so far
1526 * @data: destination pointer for data to be returned
1527 * @st: state variable
1528 *
1529 * Reads a block of skb data at &consumed relative to the
1530 * lower offset specified to skb_prepare_seq_read(). Assigns
1531 * the head of the data block to &data and returns the length
1532 * of the block or 0 if the end of the skb data or the upper
1533 * offset has been reached.
1534 *
1535 * The caller is not required to consume all of the data
1536 * returned, i.e. &consumed is typically set to the number
1537 * of bytes already consumed and the next call to
1538 * skb_seq_read() will return the remaining part of the block.
1539 *
1540 * Note: The size of each block of data returned can be arbitrary;
1541 * this limitation is the cost of zero-copy sequential
1542 * reads of potentially non-linear data.
1543 *
1544 * Note: Fragment lists within fragments are not implemented
1545 * at the moment, state->root_skb could be replaced with
1546 * a stack for this purpose.
1547 */
1548unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
1549 struct skb_seq_state *st)
1550{
1551 unsigned int block_limit, abs_offset = consumed + st->lower_offset;
1552 skb_frag_t *frag;
1553
1554 if (unlikely(abs_offset >= st->upper_offset))
1555 return 0;
1556
1557next_skb:
1558 block_limit = skb_headlen(st->cur_skb);
1559
1560 if (abs_offset < block_limit) {
1561 *data = st->cur_skb->data + abs_offset;
1562 return block_limit - abs_offset;
1563 }
1564
1565 if (st->frag_idx == 0 && !st->frag_data)
1566 st->stepped_offset += skb_headlen(st->cur_skb);
1567
1568 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
1569 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
1570 block_limit = frag->size + st->stepped_offset;
1571
1572 if (abs_offset < block_limit) {
1573 if (!st->frag_data)
1574 st->frag_data = kmap_skb_frag(frag);
1575
1576 *data = (u8 *) st->frag_data + frag->page_offset +
1577 (abs_offset - st->stepped_offset);
1578
1579 return block_limit - abs_offset;
1580 }
1581
1582 if (st->frag_data) {
1583 kunmap_skb_frag(st->frag_data);
1584 st->frag_data = NULL;
1585 }
1586
1587 st->frag_idx++;
1588 st->stepped_offset += frag->size;
1589 }
1590
1591 if (st->cur_skb->next) {
1592 st->cur_skb = st->cur_skb->next;
1593 st->frag_idx = 0;
1594 goto next_skb;
1595 } else if (st->root_skb == st->cur_skb &&
1596 skb_shinfo(st->root_skb)->frag_list) {
1597 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
1598 goto next_skb;
1599 }
1600
1601 return 0;
1602}
1603
1604/**
1605 * skb_abort_seq_read - Abort a sequential read of skb data
1606 * @st: state variable
1607 *
1608 * Must be called if skb_seq_read() was not called until it
1609 * returned 0.
1610 */
1611void skb_abort_seq_read(struct skb_seq_state *st)
1612{
1613 if (st->frag_data)
1614 kunmap_skb_frag(st->frag_data);
1615}
1616
1617#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
1618
1619static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
1620 struct ts_config *conf,
1621 struct ts_state *state)
1622{
1623 return skb_seq_read(offset, text, TS_SKB_CB(state));
1624}
1625
1626static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
1627{
1628 skb_abort_seq_read(TS_SKB_CB(state));
1629}
1630
1631/**
1632 * skb_find_text - Find a text pattern in skb data
1633 * @skb: the buffer to look in
1634 * @from: search offset
1635 * @to: search limit
1636 * @config: textsearch configuration
1637 * @state: uninitialized textsearch state variable
1638 *
1639 * Finds a pattern in the skb data according to the specified
1640 * textsearch configuration. Use textsearch_next() to retrieve
1641 * subsequent occurrences of the pattern. Returns the offset
1642 * to the first occurrence or UINT_MAX if no match was found.
1643 */
1644unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
1645 unsigned int to, struct ts_config *config,
1646 struct ts_state *state)
1647{
1648 config->get_next_block = skb_ts_get_next_block;
1649 config->finish = skb_ts_finish;
1650
1651 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
1652
1653 return textsearch_find(config, state);
1654}
1655
1509void __init skb_init(void) 1656void __init skb_init(void)
1510{ 1657{
1511 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 1658 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1544,3 +1691,7 @@ EXPORT_SYMBOL(skb_queue_tail);
1544EXPORT_SYMBOL(skb_unlink); 1691EXPORT_SYMBOL(skb_unlink);
1545EXPORT_SYMBOL(skb_append); 1692EXPORT_SYMBOL(skb_append);
1546EXPORT_SYMBOL(skb_split); 1693EXPORT_SYMBOL(skb_split);
1694EXPORT_SYMBOL(skb_prepare_seq_read);
1695EXPORT_SYMBOL(skb_seq_read);
1696EXPORT_SYMBOL(skb_abort_seq_read);
1697EXPORT_SYMBOL(skb_find_text);
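
The intended calling sequence for the new interface is prepare, read until zero, and abort only on early exit, which is exactly what skb_find_text() does around the textsearch core. A minimal sketch of a raw consumer, with process_block() standing in for the caller's per-chunk work:

	#include <linux/skbuff.h>

	extern void process_block(const u8 *data, unsigned int len);	/* placeholder */

	static void walk_skb(struct sk_buff *skb, unsigned int from, unsigned int to)
	{
		struct skb_seq_state st;
		const u8 *data;
		unsigned int len, consumed = 0;

		skb_prepare_seq_read(skb, from, to, &st);
		while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
			process_block(data, len);	/* caller may consume less than len */
			consumed += len;
		}
		/* skb_abort_seq_read(&st) is only needed when bailing out before
		 * skb_seq_read() has returned 0. */
	}
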
diff --git a/net/core/sock.c b/net/core/sock.c
index 96e00b08698f..a6ec3ada7f9e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,6 +118,7 @@
118#include <linux/netdevice.h> 118#include <linux/netdevice.h>
119#include <net/protocol.h> 119#include <net/protocol.h>
120#include <linux/skbuff.h> 120#include <linux/skbuff.h>
121#include <net/request_sock.h>
121#include <net/sock.h> 122#include <net/sock.h>
122#include <net/xfrm.h> 123#include <net/xfrm.h>
123#include <linux/ipsec.h> 124#include <linux/ipsec.h>
@@ -1363,6 +1364,7 @@ static LIST_HEAD(proto_list);
1363 1364
1364int proto_register(struct proto *prot, int alloc_slab) 1365int proto_register(struct proto *prot, int alloc_slab)
1365{ 1366{
1367 char *request_sock_slab_name;
1366 int rc = -ENOBUFS; 1368 int rc = -ENOBUFS;
1367 1369
1368 if (alloc_slab) { 1370 if (alloc_slab) {
@@ -1374,6 +1376,25 @@ int proto_register(struct proto *prot, int alloc_slab)
1374 prot->name); 1376 prot->name);
1375 goto out; 1377 goto out;
1376 } 1378 }
1379
1380 if (prot->rsk_prot != NULL) {
1381 static const char mask[] = "request_sock_%s";
1382
1383 request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1384 if (request_sock_slab_name == NULL)
1385 goto out_free_sock_slab;
1386
1387 sprintf(request_sock_slab_name, mask, prot->name);
1388 prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1389 prot->rsk_prot->obj_size, 0,
1390 SLAB_HWCACHE_ALIGN, NULL, NULL);
1391
1392 if (prot->rsk_prot->slab == NULL) {
1393 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1394 prot->name);
1395 goto out_free_request_sock_slab_name;
1396 }
1397 }
1377 } 1398 }
1378 1399
1379 write_lock(&proto_list_lock); 1400 write_lock(&proto_list_lock);
@@ -1382,6 +1403,12 @@ int proto_register(struct proto *prot, int alloc_slab)
1382 rc = 0; 1403 rc = 0;
1383out: 1404out:
1384 return rc; 1405 return rc;
1406out_free_request_sock_slab_name:
1407 kfree(request_sock_slab_name);
1408out_free_sock_slab:
1409 kmem_cache_destroy(prot->slab);
1410 prot->slab = NULL;
1411 goto out;
1385} 1412}
1386 1413
1387EXPORT_SYMBOL(proto_register); 1414EXPORT_SYMBOL(proto_register);
@@ -1395,6 +1422,14 @@ void proto_unregister(struct proto *prot)
1395 prot->slab = NULL; 1422 prot->slab = NULL;
1396 } 1423 }
1397 1424
1425 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1426 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1427
1428 kmem_cache_destroy(prot->rsk_prot->slab);
1429 kfree(name);
1430 prot->rsk_prot->slab = NULL;
1431 }
1432
1398 list_del(&prot->node); 1433 list_del(&prot->node);
1399 write_unlock(&proto_list_lock); 1434 write_unlock(&proto_list_lock);
1400} 1435}
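
With the request_sock slab handling added to proto_register()/proto_unregister() above, a protocol that wants per-request objects only points prot->rsk_prot at a request_sock_ops with obj_size set and registers with alloc_slab enabled; the request_sock_<name> cache is then created and destroyed for it. A sketch with hypothetical type and protocol names:

	#include <linux/init.h>
	#include <net/sock.h>
	#include <net/request_sock.h>

	/* Hypothetical per-request and per-socket types; only their sizes
	 * matter to the slab setup shown above. */
	struct my_request_sock {
		struct request_sock req;
		/* protocol-specific request state ... */
	};

	struct my_sock {
		struct sock sk;
		/* protocol-specific socket state ... */
	};

	static struct request_sock_ops my_request_sock_ops = {
		.obj_size = sizeof(struct my_request_sock),
	};

	static struct proto my_proto = {
		.name     = "MYPROTO",
		.obj_size = sizeof(struct my_sock),
		.rsk_prot = &my_request_sock_ops,
	};

	static int __init my_proto_init(void)
	{
		/* creates both the "MYPROTO" and "request_sock_MYPROTO" caches */
		return proto_register(&my_proto, 1);
	}

	static void __exit my_proto_exit(void)
	{
		proto_unregister(&my_proto);	/* tears the request_sock cache down too */
	}
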
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8be646cb191..8f817ad9f546 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,12 +13,8 @@
13#ifdef CONFIG_SYSCTL 13#ifdef CONFIG_SYSCTL
14 14
15extern int netdev_max_backlog; 15extern int netdev_max_backlog;
16extern int netdev_budget;
16extern int weight_p; 17extern int weight_p;
17extern int no_cong_thresh;
18extern int no_cong;
19extern int lo_cong;
20extern int mod_cong;
21extern int netdev_fastroute;
22extern int net_msg_cost; 18extern int net_msg_cost;
23extern int net_msg_burst; 19extern int net_msg_burst;
24 20
@@ -35,19 +31,6 @@ extern int sysctl_somaxconn;
35extern char sysctl_divert_version[]; 31extern char sysctl_divert_version[];
36#endif /* CONFIG_NET_DIVERT */ 32#endif /* CONFIG_NET_DIVERT */
37 33
38/*
39 * This strdup() is used for creating copies of network
40 * device names to be handed over to sysctl.
41 */
42
43char *net_sysctl_strdup(const char *s)
44{
45 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
46 if (rv)
47 strcpy(rv, s);
48 return rv;
49}
50
51ctl_table core_table[] = { 34ctl_table core_table[] = {
52#ifdef CONFIG_NET 35#ifdef CONFIG_NET
53 { 36 {
@@ -99,38 +82,6 @@ ctl_table core_table[] = {
99 .proc_handler = &proc_dointvec 82 .proc_handler = &proc_dointvec
100 }, 83 },
101 { 84 {
102 .ctl_name = NET_CORE_NO_CONG_THRESH,
103 .procname = "no_cong_thresh",
104 .data = &no_cong_thresh,
105 .maxlen = sizeof(int),
106 .mode = 0644,
107 .proc_handler = &proc_dointvec
108 },
109 {
110 .ctl_name = NET_CORE_NO_CONG,
111 .procname = "no_cong",
112 .data = &no_cong,
113 .maxlen = sizeof(int),
114 .mode = 0644,
115 .proc_handler = &proc_dointvec
116 },
117 {
118 .ctl_name = NET_CORE_LO_CONG,
119 .procname = "lo_cong",
120 .data = &lo_cong,
121 .maxlen = sizeof(int),
122 .mode = 0644,
123 .proc_handler = &proc_dointvec
124 },
125 {
126 .ctl_name = NET_CORE_MOD_CONG,
127 .procname = "mod_cong",
128 .data = &mod_cong,
129 .maxlen = sizeof(int),
130 .mode = 0644,
131 .proc_handler = &proc_dointvec
132 },
133 {
134 .ctl_name = NET_CORE_MSG_COST, 85 .ctl_name = NET_CORE_MSG_COST,
135 .procname = "message_cost", 86 .procname = "message_cost",
136 .data = &net_msg_cost, 87 .data = &net_msg_cost,
@@ -174,9 +125,15 @@ ctl_table core_table[] = {
174 .mode = 0644, 125 .mode = 0644,
175 .proc_handler = &proc_dointvec 126 .proc_handler = &proc_dointvec
176 }, 127 },
128 {
129 .ctl_name = NET_CORE_BUDGET,
130 .procname = "netdev_budget",
131 .data = &netdev_budget,
132 .maxlen = sizeof(int),
133 .mode = 0644,
134 .proc_handler = &proc_dointvec
135 },
177 { .ctl_name = 0 } 136 { .ctl_name = 0 }
178}; 137};
179 138
180EXPORT_SYMBOL(net_sysctl_strdup);
181
182#endif 139#endif
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index ee7bf46eb78a..00233ecbc9cb 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -716,13 +716,13 @@ static int dn_dev_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *a
716} 716}
717 717
718static int dn_dev_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, 718static int dn_dev_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
719 u32 pid, u32 seq, int event) 719 u32 pid, u32 seq, int event, unsigned int flags)
720{ 720{
721 struct ifaddrmsg *ifm; 721 struct ifaddrmsg *ifm;
722 struct nlmsghdr *nlh; 722 struct nlmsghdr *nlh;
723 unsigned char *b = skb->tail; 723 unsigned char *b = skb->tail;
724 724
725 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 725 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
726 ifm = NLMSG_DATA(nlh); 726 ifm = NLMSG_DATA(nlh);
727 727
728 ifm->ifa_family = AF_DECnet; 728 ifm->ifa_family = AF_DECnet;
@@ -755,7 +755,7 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa)
755 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS); 755 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS);
756 return; 756 return;
757 } 757 }
758 if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { 758 if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
759 kfree_skb(skb); 759 kfree_skb(skb);
760 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL); 760 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL);
761 return; 761 return;
@@ -790,7 +790,8 @@ static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
790 if (dn_dev_fill_ifaddr(skb, ifa, 790 if (dn_dev_fill_ifaddr(skb, ifa,
791 NETLINK_CB(cb->skb).pid, 791 NETLINK_CB(cb->skb).pid,
792 cb->nlh->nlmsg_seq, 792 cb->nlh->nlmsg_seq,
793 RTM_NEWADDR) <= 0) 793 RTM_NEWADDR,
794 NLM_F_MULTI) <= 0)
794 goto done; 795 goto done;
795 } 796 }
796 } 797 }
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f6dfe96f45b7..f32dba9e26fe 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -101,7 +101,6 @@ struct neigh_table dn_neigh_table = {
101 .id = "dn_neigh_cache", 101 .id = "dn_neigh_cache",
102 .parms ={ 102 .parms ={
103 .tbl = &dn_neigh_table, 103 .tbl = &dn_neigh_table,
104 .entries = 0,
105 .base_reachable_time = 30 * HZ, 104 .base_reachable_time = 30 * HZ,
106 .retrans_time = 1 * HZ, 105 .retrans_time = 1 * HZ,
107 .gc_staletime = 60 * HZ, 106 .gc_staletime = 60 * HZ,
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 1e7b5c3ea215..2399fa8a3f86 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1465,7 +1465,8 @@ int dn_route_input(struct sk_buff *skb)
1465 return dn_route_input_slow(skb); 1465 return dn_route_input_slow(skb);
1466} 1466}
1467 1467
1468static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) 1468static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
1469 int event, int nowait, unsigned int flags)
1469{ 1470{
1470 struct dn_route *rt = (struct dn_route *)skb->dst; 1471 struct dn_route *rt = (struct dn_route *)skb->dst;
1471 struct rtmsg *r; 1472 struct rtmsg *r;
@@ -1473,9 +1474,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int
1473 unsigned char *b = skb->tail; 1474 unsigned char *b = skb->tail;
1474 struct rta_cacheinfo ci; 1475 struct rta_cacheinfo ci;
1475 1476
1476 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); 1477 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
1477 r = NLMSG_DATA(nlh); 1478 r = NLMSG_DATA(nlh);
1478 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1479 r->rtm_family = AF_DECnet; 1479 r->rtm_family = AF_DECnet;
1480 r->rtm_dst_len = 16; 1480 r->rtm_dst_len = 16;
1481 r->rtm_src_len = 0; 1481 r->rtm_src_len = 0;
@@ -1596,7 +1596,7 @@ int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
1596 1596
1597 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; 1597 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1598 1598
1599 err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); 1599 err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
1600 1600
1601 if (err == 0) 1601 if (err == 0)
1602 goto out_free; 1602 goto out_free;
@@ -1644,7 +1644,8 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
1644 continue; 1644 continue;
1645 skb->dst = dst_clone(&rt->u.dst); 1645 skb->dst = dst_clone(&rt->u.dst);
1646 if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 1646 if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
1647 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { 1647 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1648 1, NLM_F_MULTI) <= 0) {
1648 dst_release(xchg(&skb->dst, NULL)); 1649 dst_release(xchg(&skb->dst, NULL));
1649 rcu_read_unlock_bh(); 1650 rcu_read_unlock_bh();
1650 goto done; 1651 goto done;
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 597587d170d8..1060de70bc0c 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -342,14 +342,15 @@ static struct notifier_block dn_fib_rules_notifier = {
342 .notifier_call = dn_fib_rules_event, 342 .notifier_call = dn_fib_rules_event,
343}; 343};
344 344
345static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r, struct netlink_callback *cb) 345static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r,
346 struct netlink_callback *cb, unsigned int flags)
346{ 347{
347 struct rtmsg *rtm; 348 struct rtmsg *rtm;
348 struct nlmsghdr *nlh; 349 struct nlmsghdr *nlh;
349 unsigned char *b = skb->tail; 350 unsigned char *b = skb->tail;
350 351
351 352
352 nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); 353 nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
353 rtm = NLMSG_DATA(nlh); 354 rtm = NLMSG_DATA(nlh);
354 rtm->rtm_family = AF_DECnet; 355 rtm->rtm_family = AF_DECnet;
355 rtm->rtm_dst_len = r->r_dst_len; 356 rtm->rtm_dst_len = r->r_dst_len;
@@ -394,7 +395,7 @@ int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
394 for(r = dn_fib_rules, idx = 0; r; r = r->r_next, idx++) { 395 for(r = dn_fib_rules, idx = 0; r; r = r->r_next, idx++) {
395 if (idx < s_idx) 396 if (idx < s_idx)
396 continue; 397 continue;
397 if (dn_fib_fill_rule(skb, r, cb) < 0) 398 if (dn_fib_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
398 break; 399 break;
399 } 400 }
400 read_unlock(&dn_fib_rules_lock); 401 read_unlock(&dn_fib_rules_lock);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index dad5603912be..28ba5777a25a 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -270,13 +270,13 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern
270 270
271static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 271static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
272 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, 272 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len,
273 struct dn_fib_info *fi) 273 struct dn_fib_info *fi, unsigned int flags)
274{ 274{
275 struct rtmsg *rtm; 275 struct rtmsg *rtm;
276 struct nlmsghdr *nlh; 276 struct nlmsghdr *nlh;
277 unsigned char *b = skb->tail; 277 unsigned char *b = skb->tail;
278 278
279 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); 279 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
280 rtm = NLMSG_DATA(nlh); 280 rtm = NLMSG_DATA(nlh);
281 rtm->rtm_family = AF_DECnet; 281 rtm->rtm_family = AF_DECnet;
282 rtm->rtm_dst_len = dst_len; 282 rtm->rtm_dst_len = dst_len;
@@ -345,7 +345,7 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
345 345
346 if (dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id, 346 if (dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id,
347 f->fn_type, f->fn_scope, &f->fn_key, z, 347 f->fn_type, f->fn_scope, &f->fn_key, z,
348 DN_FIB_INFO(f)) < 0) { 348 DN_FIB_INFO(f), 0) < 0) {
349 kfree_skb(skb); 349 kfree_skb(skb);
350 return; 350 return;
351 } 351 }
@@ -377,7 +377,7 @@ static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
377 tb->n, 377 tb->n,
378 (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type, 378 (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
379 f->fn_scope, &f->fn_key, dz->dz_order, 379 f->fn_scope, &f->fn_key, dz->dz_order,
380 f->fn_info) < 0) { 380 f->fn_info, NLM_F_MULTI) < 0) {
381 cb->args[3] = i; 381 cb->args[3] = i;
382 return -1; 382 return -1;
383 } 383 }
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d3e8b1bd1f2..3e63123f7bbd 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -53,6 +53,44 @@ config IP_ADVANCED_ROUTER
53 53
54 If unsure, say N here. 54 If unsure, say N here.
55 55
56choice
57 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
58 depends on IP_ADVANCED_ROUTER
59 default IP_FIB_HASH
60
61config IP_FIB_HASH
62 bool "FIB_HASH"
63 ---help---
64 The current FIB implementation is well proven and good enough for most users.
65
66config IP_FIB_TRIE
67 bool "FIB_TRIE"
68 ---help---
69 Use the new experimental LC-trie as the FIB lookup algorithm.
70 This improves lookup performance if you have a large
71 number of routes.
72
73 LC-trie is a longest matching prefix lookup algorithm which
74 performs better than FIB_HASH for large routing tables.
75 But, it consumes more memory and is more complex.
76
77 LC-trie is described in:
78
79 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson.
80 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999.
81 An experimental study of compression methods for dynamic tries.
82 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
83 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
84
85endchoice
86
87# If the user does not enable advanced routing, he gets the safe
88# default of the fib-hash algorithm.
89config IP_FIB_HASH
90 bool
91 depends on !IP_ADVANCED_ROUTER
92 default y
93
56config IP_MULTIPLE_TABLES 94config IP_MULTIPLE_TABLES
57 bool "IP: policy routing" 95 bool "IP: policy routing"
58 depends on IP_ADVANCED_ROUTER 96 depends on IP_ADVANCED_ROUTER
@@ -407,5 +445,112 @@ config IP_TCPDIAG
407config IP_TCPDIAG_IPV6 445config IP_TCPDIAG_IPV6
408 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) 446 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
409 447
448config TCP_CONG_ADVANCED
449 bool "TCP: advanced congestion control"
450 depends on INET
451 ---help---
452 Support for selection of various TCP congestion control
453 modules.
454
455 Nearly all users can safely say no here, and a safe default
456 selection will be made (BIC-TCP with new Reno as a fallback).
457
458 If unsure, say N.
459
460# TCP Reno is builtin (required as fallback)
461menu "TCP congestion control"
462 depends on TCP_CONG_ADVANCED
463
464config TCP_CONG_BIC
465 tristate "Binary Increase Congestion (BIC) control"
466 depends on INET
467 default y
468 ---help---
469 BIC-TCP is a sender-side only change that ensures linear RTT
470 fairness under large windows while offering both scalability and
471 bounded TCP-friendliness. The protocol combines two schemes
472 called additive increase and binary search increase. When the
473 congestion window is large, additive increase with a large
474 increment ensures linear RTT fairness as well as good
475 scalability. Under small congestion windows, binary search
476 increase provides TCP friendliness.
477 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
478
479config TCP_CONG_WESTWOOD
480 tristate "TCP Westwood+"
481 depends on INET
482 default m
483 ---help---
484 TCP Westwood+ is a sender-side only modification of the TCP Reno
485 protocol stack that optimizes the performance of TCP congestion
486 control. It is based on end-to-end bandwidth estimation to set
487 congestion window and slow start threshold after a congestion
488 episode. Using this estimation, TCP Westwood+ adaptively sets a
489 slow start threshold and a congestion window which takes into
490 account the bandwidth used at the time congestion is experienced.
491 TCP Westwood+ significantly increases fairness with respect to TCP Reno in
492 wired networks and throughput over wireless links.
493
494config TCP_CONG_HTCP
495 tristate "H-TCP"
496 depends on INET
497 default m
498 ---help---
499 H-TCP is a sender-side only modification of the TCP Reno
500 protocol stack that optimizes the performance of TCP
501 congestion control for high speed network links. It uses a
502 mode switch to change the alpha and beta parameters of TCP Reno
503 based on network conditions, in a way that keeps it fair with
504 other Reno and H-TCP flows.
505
506config TCP_CONG_HSTCP
507 tristate "High Speed TCP"
508 depends on INET && EXPERIMENTAL
509 default n
510 ---help---
511 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
512 A modification to TCP's congestion control mechanism for use
513 with large congestion windows. A table indicates how much to
514 increase the congestion window by when an ACK is received.
515 For more detail see http://www.icir.org/floyd/hstcp.html
516
517config TCP_CONG_HYBLA
518 tristate "TCP-Hybla congestion control algorithm"
519 depends on INET && EXPERIMENTAL
520 default n
521 ---help---
522 TCP-Hybla is a sender-side only change that eliminates penalization of
523 long-RTT, large-bandwidth connections, such as when satellite legs are
524 involved, especially when sharing a common bottleneck with normal
525 terrestrial connections.
526
527config TCP_CONG_VEGAS
528 tristate "TCP Vegas"
529 depends on INET && EXPERIMENTAL
530 default n
531 ---help---
532 TCP Vegas is a sender-side only change to TCP that anticipates
533 the onset of congestion by estimating the bandwidth. TCP Vegas
534 adjusts the sending rate by modifying the congestion
535 window. TCP Vegas should provide less packet loss, but it is
536 not as aggressive as TCP Reno.
537
538config TCP_CONG_SCALABLE
539 tristate "Scalable TCP"
540 depends on INET && EXPERIMENTAL
541 default n
542 ---help---
543 Scalable TCP is a sender-side only change to TCP which uses a
544 MIMD congestion control algorithm which has some nice scaling
545 properties, though is known to have fairness issues.
546 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
547
548endmenu
549
550config TCP_CONG_BIC
551 tristate
552 depends on !TCP_CONG_ADVANCED
553 default y
554
410source "net/ipv4/ipvs/Kconfig" 555source "net/ipv4/ipvs/Kconfig"
411 556
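
Each tristate above builds one pluggable congestion control module (see the Makefile hunk that follows); such a module fills in a struct tcp_congestion_ops and registers it with tcp_register_congestion_control(), both introduced by the tcp_cong.c part of this patch. The skeleton below is only a rough sketch of that shape; the concrete callback set and prototypes live in tcp_cong.c and net/tcp.h, so everything beyond the registration calls is an assumption here.

	#include <linux/module.h>
	#include <net/tcp.h>

	static struct tcp_congestion_ops example_cong_ops = {
		.name	= "example",
		.owner	= THIS_MODULE,
		/* .ssthresh, .cong_avoid, ... : the algorithm's hooks go here;
		 * their exact prototypes are defined by the tcp_cong.c changes. */
	};

	static int __init example_cong_init(void)
	{
		return tcp_register_congestion_control(&example_cong_ops);
	}

	static void __exit example_cong_exit(void)
	{
		tcp_unregister_congestion_control(&example_cong_ops);
	}

	module_init(example_cong_init);
	module_exit(example_cong_exit);
	MODULE_LICENSE("GPL");
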
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b379627ebb6..5718cdb3a61e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,10 +5,13 @@
5obj-y := utils.o route.o inetpeer.o protocol.o \ 5obj-y := utils.o route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
9 tcp_minisocks.o tcp_cong.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 10 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o 11 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 12
13obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
14obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
12obj-$(CONFIG_PROC_FS) += proc.o 15obj-$(CONFIG_PROC_FS) += proc.o
13obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 16obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
14obj-$(CONFIG_IP_MROUTE) += ipmr.o 17obj-$(CONFIG_IP_MROUTE) += ipmr.o
@@ -28,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
28obj-$(CONFIG_IP_VS) += ipvs/ 31obj-$(CONFIG_IP_VS) += ipvs/
29obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 32obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
30obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 33obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
34obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
37obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
38obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
39obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
40obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
31 41
32obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 42obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
33 xfrm4_output.o 43 xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b3cb49ce5fad..658e7977924d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1119,6 +1119,10 @@ module_init(inet_init);
1119#ifdef CONFIG_PROC_FS 1119#ifdef CONFIG_PROC_FS
1120extern int fib_proc_init(void); 1120extern int fib_proc_init(void);
1121extern void fib_proc_exit(void); 1121extern void fib_proc_exit(void);
1122#ifdef CONFIG_IP_FIB_TRIE
1123extern int fib_stat_proc_init(void);
1124extern void fib_stat_proc_exit(void);
1125#endif
1122extern int ip_misc_proc_init(void); 1126extern int ip_misc_proc_init(void);
1123extern int raw_proc_init(void); 1127extern int raw_proc_init(void);
1124extern void raw_proc_exit(void); 1128extern void raw_proc_exit(void);
@@ -1139,11 +1143,19 @@ static int __init ipv4_proc_init(void)
1139 goto out_udp; 1143 goto out_udp;
1140 if (fib_proc_init()) 1144 if (fib_proc_init())
1141 goto out_fib; 1145 goto out_fib;
1146#ifdef CONFIG_IP_FIB_TRIE
1147 if (fib_stat_proc_init())
1148 goto out_fib_stat;
1149 #endif
1142 if (ip_misc_proc_init()) 1150 if (ip_misc_proc_init())
1143 goto out_misc; 1151 goto out_misc;
1144out: 1152out:
1145 return rc; 1153 return rc;
1146out_misc: 1154out_misc:
1155#ifdef CONFIG_IP_FIB_TRIE
1156 fib_stat_proc_exit();
1157out_fib_stat:
1158#endif
1147 fib_proc_exit(); 1159 fib_proc_exit();
1148out_fib: 1160out_fib:
1149 udp4_proc_exit(); 1161 udp4_proc_exit();
@@ -1181,6 +1193,7 @@ EXPORT_SYMBOL(inet_stream_connect);
1181EXPORT_SYMBOL(inet_stream_ops); 1193EXPORT_SYMBOL(inet_stream_ops);
1182EXPORT_SYMBOL(inet_unregister_protosw); 1194EXPORT_SYMBOL(inet_unregister_protosw);
1183EXPORT_SYMBOL(net_statistics); 1195EXPORT_SYMBOL(net_statistics);
1196EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
1184 1197
1185#ifdef INET_REFCNT_DEBUG 1198#ifdef INET_REFCNT_DEBUG
1186EXPORT_SYMBOL(inet_sock_nr); 1199EXPORT_SYMBOL(inet_sock_nr);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 0e98f2235b6e..514c85b2631a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -200,7 +200,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
200 xfrm_state_put(x); 200 xfrm_state_put(x);
201} 201}
202 202
203static int ah_init_state(struct xfrm_state *x, void *args) 203static int ah_init_state(struct xfrm_state *x)
204{ 204{
205 struct ah_data *ahp = NULL; 205 struct ah_data *ahp = NULL;
206 struct xfrm_algo_desc *aalg_desc; 206 struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3cc96730c4ed..d8a10e3dd77d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -233,11 +233,14 @@ int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
233static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, 233static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
234 int destroy) 234 int destroy)
235{ 235{
236 struct in_ifaddr *promote = NULL;
236 struct in_ifaddr *ifa1 = *ifap; 237 struct in_ifaddr *ifa1 = *ifap;
237 238
238 ASSERT_RTNL(); 239 ASSERT_RTNL();
239 240
240 /* 1. Deleting primary ifaddr forces deletion all secondaries */ 241 /* 1. Deleting primary ifaddr forces deletion all secondaries
242 * unless alias promotion is set
243 **/
241 244
242 if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { 245 if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
243 struct in_ifaddr *ifa; 246 struct in_ifaddr *ifa;
@@ -251,11 +254,16 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
251 continue; 254 continue;
252 } 255 }
253 256
254 *ifap1 = ifa->ifa_next; 257 if (!IN_DEV_PROMOTE_SECONDARIES(in_dev)) {
258 *ifap1 = ifa->ifa_next;
255 259
256 rtmsg_ifa(RTM_DELADDR, ifa); 260 rtmsg_ifa(RTM_DELADDR, ifa);
257 notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); 261 notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
258 inet_free_ifa(ifa); 262 inet_free_ifa(ifa);
263 } else {
264 promote = ifa;
265 break;
266 }
259 } 267 }
260 } 268 }
261 269
@@ -281,6 +289,13 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
281 if (!in_dev->ifa_list) 289 if (!in_dev->ifa_list)
282 inetdev_destroy(in_dev); 290 inetdev_destroy(in_dev);
283 } 291 }
292
293 if (promote && IN_DEV_PROMOTE_SECONDARIES(in_dev)) {
294 /* not sure if we should send a delete notify first? */
295 promote->ifa_flags &= ~IFA_F_SECONDARY;
296 rtmsg_ifa(RTM_NEWADDR, promote);
297 notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote);
298 }
284} 299}
285 300
286static int inet_insert_ifa(struct in_ifaddr *ifa) 301static int inet_insert_ifa(struct in_ifaddr *ifa)
@@ -1015,14 +1030,13 @@ static struct notifier_block ip_netdev_notifier = {
1015}; 1030};
1016 1031
1017static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, 1032static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1018 u32 pid, u32 seq, int event) 1033 u32 pid, u32 seq, int event, unsigned int flags)
1019{ 1034{
1020 struct ifaddrmsg *ifm; 1035 struct ifaddrmsg *ifm;
1021 struct nlmsghdr *nlh; 1036 struct nlmsghdr *nlh;
1022 unsigned char *b = skb->tail; 1037 unsigned char *b = skb->tail;
1023 1038
1024 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 1039 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
1025 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
1026 ifm = NLMSG_DATA(nlh); 1040 ifm = NLMSG_DATA(nlh);
1027 ifm->ifa_family = AF_INET; 1041 ifm->ifa_family = AF_INET;
1028 ifm->ifa_prefixlen = ifa->ifa_prefixlen; 1042 ifm->ifa_prefixlen = ifa->ifa_prefixlen;
@@ -1075,7 +1089,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1075 continue; 1089 continue;
1076 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, 1090 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
1077 cb->nlh->nlmsg_seq, 1091 cb->nlh->nlmsg_seq,
1078 RTM_NEWADDR) <= 0) { 1092 RTM_NEWADDR, NLM_F_MULTI) <= 0) {
1079 rcu_read_unlock(); 1093 rcu_read_unlock();
1080 goto done; 1094 goto done;
1081 } 1095 }
@@ -1098,7 +1112,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
1098 1112
1099 if (!skb) 1113 if (!skb)
1100 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); 1114 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
1101 else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { 1115 else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
1102 kfree_skb(skb); 1116 kfree_skb(skb);
1103 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); 1117 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
1104 } else { 1118 } else {
@@ -1384,6 +1398,15 @@ static struct devinet_sysctl_table {
1384 .proc_handler = &ipv4_doint_and_flush, 1398 .proc_handler = &ipv4_doint_and_flush,
1385 .strategy = &ipv4_doint_and_flush_strategy, 1399 .strategy = &ipv4_doint_and_flush_strategy,
1386 }, 1400 },
1401 {
1402 .ctl_name = NET_IPV4_CONF_PROMOTE_SECONDARIES,
1403 .procname = "promote_secondaries",
1404 .data = &ipv4_devconf.promote_secondaries,
1405 .maxlen = sizeof(int),
1406 .mode = 0644,
1407 .proc_handler = &ipv4_doint_and_flush,
1408 .strategy = &ipv4_doint_and_flush_strategy,
1409 },
1387 }, 1410 },
1388 .devinet_dev = { 1411 .devinet_dev = {
1389 { 1412 {
@@ -1448,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
1448 * by sysctl and we wouldn't want anyone to change it under our feet 1471 * by sysctl and we wouldn't want anyone to change it under our feet
1449 * (see SIOCSIFNAME). 1472 * (see SIOCSIFNAME).
1450 */ 1473 */
1451 dev_name = net_sysctl_strdup(dev_name); 1474 dev_name = kstrdup(dev_name, GFP_KERNEL);
1452 if (!dev_name) 1475 if (!dev_name)
1453 goto free; 1476 goto free;
1454 1477
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 053a883247ba..ba57446d5d1f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -362,7 +362,7 @@ static void esp_destroy(struct xfrm_state *x)
362 kfree(esp); 362 kfree(esp);
363} 363}
364 364
365static int esp_init_state(struct xfrm_state *x, void *args) 365static int esp_init_state(struct xfrm_state *x)
366{ 366{
367 struct esp_data *esp = NULL; 367 struct esp_data *esp = NULL;
368 368
@@ -478,7 +478,7 @@ static int __init esp4_init(void)
478{ 478{
479 struct xfrm_decap_state decap; 479 struct xfrm_decap_state decap;
480 480
481 if (sizeof(struct esp_decap_data) < 481 if (sizeof(struct esp_decap_data) >
482 sizeof(decap.decap_data)) { 482 sizeof(decap.decap_data)) {
483 extern void decap_data_too_small(void); 483 extern void decap_data_too_small(void);
484 484
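The esp4_init() change above flips an inverted size check: the deliberately undefined decap_data_too_small() is now referenced only when struct esp_decap_data is too large to fit in decap.decap_data, so an oversized structure breaks the build at link time instead of overflowing at run time. A minimal sketch of the same idiom, with hypothetical names and assuming the compiler folds the constant comparison and drops the dead call (as kernel builds do):

/* Link-time size assertion sketch (hypothetical names, not from the tree). */
struct my_payload { unsigned char bytes[8]; };

static void check_payload_fits(void)
{
	char buffer[16];
	/* Deliberately never defined anywhere: referencing it breaks the link. */
	extern void my_payload_too_big_for_buffer(void);

	if (sizeof(struct my_payload) > sizeof(buffer))
		my_payload_too_big_for_buffer();

	(void)buffer;
}

When the payload fits, the branch is dead code, the call is discarded, and the undefined symbol is never needed; grow the structure past the buffer and the link fails with a clear symbol name.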
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 563e7d612706..cd8e45ab9580 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -516,6 +516,60 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
516#undef BRD1_OK 516#undef BRD1_OK
517} 517}
518 518
519static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
520{
521
522 struct fib_result res;
523 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
524 .fwmark = frn->fl_fwmark,
525 .tos = frn->fl_tos,
526 .scope = frn->fl_scope } } };
527 if (tb) {
528 local_bh_disable();
529
530 frn->tb_id = tb->tb_id;
531 frn->err = tb->tb_lookup(tb, &fl, &res);
532
533 if (!frn->err) {
534 frn->prefixlen = res.prefixlen;
535 frn->nh_sel = res.nh_sel;
536 frn->type = res.type;
537 frn->scope = res.scope;
538 }
539 local_bh_enable();
540 }
541}
542
543static void nl_fib_input(struct sock *sk, int len)
544{
545 struct sk_buff *skb = NULL;
546 struct nlmsghdr *nlh = NULL;
547 struct fib_result_nl *frn;
548 int err;
549 u32 pid;
550 struct fib_table *tb;
551
552 skb = skb_recv_datagram(sk, 0, 0, &err);
553 nlh = (struct nlmsghdr *)skb->data;
554
555 frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
556 tb = fib_get_table(frn->tb_id_in);
557
558 nl_fib_lookup(frn, tb);
559
560 pid = nlh->nlmsg_pid; /*pid of sending process */
561 NETLINK_CB(skb).groups = 0; /* not in mcast group */
562 NETLINK_CB(skb).pid = 0; /* from kernel */
563 NETLINK_CB(skb).dst_pid = pid;
564 NETLINK_CB(skb).dst_groups = 0; /* unicast */
565 netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
566}
567
568static void nl_fib_lookup_init(void)
569{
570 netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
571}
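The two functions above expose FIB lookups to user space over a new NETLINK_FIB_LOOKUP socket: the caller sends a netlink message whose payload is a struct fib_result_nl with fl_addr and tb_id_in filled in, and nl_fib_input() unicasts the same structure back with err, prefixlen, nh_sel, type and scope completed. A rough user-space sketch, assuming struct fib_result_nl has been made visible to the program and NETLINK_FIB_LOOKUP is defined in <linux/netlink.h> (illustrative only, not part of the patch):

#include <sys/socket.h>
#include <linux/netlink.h>
#include <string.h>
#include <unistd.h>

static int fib_query(unsigned int daddr, int table_id, struct fib_result_nl *out)
{
	struct sockaddr_nl local = { .nl_family = AF_NETLINK, .nl_pid = getpid() };
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };  /* nl_pid 0 = kernel */
	struct {
		struct nlmsghdr nlh;
		struct fib_result_nl frn;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_FIB_LOOKUP);

	if (fd < 0 || bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
		goto fail;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.frn));
	req.nlh.nlmsg_pid = local.nl_pid;   /* nl_fib_input() unicasts the answer here */
	req.frn.fl_addr   = daddr;          /* destination address to look up */
	req.frn.tb_id_in  = table_id;       /* e.g. RT_TABLE_MAIN */

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0 ||
	    recv(fd, &req, sizeof(req), 0) < 0)
		goto fail;

	close(fd);
	*out = req.frn;                     /* err, prefixlen, nh_sel, type, scope filled in */
	return 0;
fail:
	if (fd >= 0)
		close(fd);
	return -1;
}

Error handling is minimal here; a real client would also validate nlmsg_len on the reply before trusting the payload.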
572
519static void fib_disable_ip(struct net_device *dev, int force) 573static void fib_disable_ip(struct net_device *dev, int force)
520{ 574{
521 if (fib_sync_down(0, dev, force)) 575 if (fib_sync_down(0, dev, force))
@@ -604,6 +658,7 @@ void __init ip_fib_init(void)
604 658
605 register_netdevice_notifier(&fib_netdev_notifier); 659 register_netdevice_notifier(&fib_netdev_notifier);
606 register_inetaddr_notifier(&fib_inetaddr_notifier); 660 register_inetaddr_notifier(&fib_inetaddr_notifier);
661 nl_fib_lookup_init();
607} 662}
608 663
609EXPORT_SYMBOL(inet_addr_type); 664EXPORT_SYMBOL(inet_addr_type);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 6506dcc01b46..b10d6bb5ef3d 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -703,7 +703,8 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
703 &f->fn_key, 703 &f->fn_key,
704 fz->fz_order, 704 fz->fz_order,
705 fa->fa_tos, 705 fa->fa_tos,
706 fa->fa_info) < 0) { 706 fa->fa_info,
707 NLM_F_MULTI) < 0) {
707 cb->args[3] = i; 708 cb->args[3] = i;
708 return -1; 709 return -1;
709 } 710 }
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index ac4485f75e97..b729d97cfa93 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -30,7 +30,8 @@ extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
30 struct kern_rta *rta, struct fib_info *fi); 30 struct kern_rta *rta, struct fib_info *fi);
31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
32 u8 tb_id, u8 type, u8 scope, void *dst, 32 u8 tb_id, u8 type, u8 scope, void *dst,
33 int dst_len, u8 tos, struct fib_info *fi); 33 int dst_len, u8 tos, struct fib_info *fi,
34 unsigned int);
34extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, 35extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
35 int z, int tb_id, 36 int z, int tb_id,
36 struct nlmsghdr *n, struct netlink_skb_parms *req); 37 struct nlmsghdr *n, struct netlink_skb_parms *req);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 39d0aadb9a2a..0b298bbc1518 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -367,13 +367,14 @@ static struct notifier_block fib_rules_notifier = {
367 367
368static __inline__ int inet_fill_rule(struct sk_buff *skb, 368static __inline__ int inet_fill_rule(struct sk_buff *skb,
369 struct fib_rule *r, 369 struct fib_rule *r,
370 struct netlink_callback *cb) 370 struct netlink_callback *cb,
371 unsigned int flags)
371{ 372{
372 struct rtmsg *rtm; 373 struct rtmsg *rtm;
373 struct nlmsghdr *nlh; 374 struct nlmsghdr *nlh;
374 unsigned char *b = skb->tail; 375 unsigned char *b = skb->tail;
375 376
376 nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); 377 nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
377 rtm = NLMSG_DATA(nlh); 378 rtm = NLMSG_DATA(nlh);
378 rtm->rtm_family = AF_INET; 379 rtm->rtm_family = AF_INET;
379 rtm->rtm_dst_len = r->r_dst_len; 380 rtm->rtm_dst_len = r->r_dst_len;
@@ -422,7 +423,7 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
422 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { 423 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
423 if (idx < s_idx) 424 if (idx < s_idx)
424 continue; 425 continue;
425 if (inet_fill_rule(skb, r, cb) < 0) 426 if (inet_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
426 break; 427 break;
427 } 428 }
428 read_unlock(&fib_rules_lock); 429 read_unlock(&fib_rules_lock);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 029362d66135..c886b28ba9f5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -276,7 +276,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
276 struct nlmsghdr *n, struct netlink_skb_parms *req) 276 struct nlmsghdr *n, struct netlink_skb_parms *req)
277{ 277{
278 struct sk_buff *skb; 278 struct sk_buff *skb;
279 u32 pid = req ? req->pid : 0; 279 u32 pid = req ? req->pid : n->nlmsg_pid;
280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); 280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
281 281
282 skb = alloc_skb(size, GFP_KERNEL); 282 skb = alloc_skb(size, GFP_KERNEL);
@@ -286,7 +286,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, 286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
287 fa->fa_type, fa->fa_scope, &key, z, 287 fa->fa_type, fa->fa_scope, &key, z,
288 fa->fa_tos, 288 fa->fa_tos,
289 fa->fa_info) < 0) { 289 fa->fa_info, 0) < 0) {
290 kfree_skb(skb); 290 kfree_skb(skb);
291 return; 291 return;
292 } 292 }
@@ -932,13 +932,13 @@ u32 __fib_res_prefsrc(struct fib_result *res)
932int 932int
933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, 934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
935 struct fib_info *fi) 935 struct fib_info *fi, unsigned int flags)
936{ 936{
937 struct rtmsg *rtm; 937 struct rtmsg *rtm;
938 struct nlmsghdr *nlh; 938 struct nlmsghdr *nlh;
939 unsigned char *b = skb->tail; 939 unsigned char *b = skb->tail;
940 940
941 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); 941 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
942 rtm = NLMSG_DATA(nlh); 942 rtm = NLMSG_DATA(nlh);
943 rtm->rtm_family = AF_INET; 943 rtm->rtm_family = AF_INET;
944 rtm->rtm_dst_len = dst_len; 944 rtm->rtm_dst_len = dst_len;
@@ -1035,7 +1035,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1035 } 1035 }
1036 1036
1037 nl->nlmsg_flags = NLM_F_REQUEST; 1037 nl->nlmsg_flags = NLM_F_REQUEST;
1038 nl->nlmsg_pid = 0; 1038 nl->nlmsg_pid = current->pid;
1039 nl->nlmsg_seq = 0; 1039 nl->nlmsg_seq = 0;
1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm)); 1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1041 if (cmd == SIOCDELRT) { 1041 if (cmd == SIOCDELRT) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 000000000000..0671569ee6f0
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2454 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:

16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.323"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \
95((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \
98((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \
100((_node)->_parent & NODE_TYPE_MASK)
101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104
105struct node {
106 t_key key;
107 unsigned long _parent;
108};
109
110struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114};
115
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
122struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS];
149};
150
151struct trie {
152 struct node *trie;
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats;
155#endif
156 int size;
157 unsigned int revision;
158};
159
160static int trie_debug = 0;
161
162static int tnode_full(struct tnode *tn, struct node *n);
163static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
169static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172extern int fib_detect_death(struct fib_info *fi, int order,
173 struct fib_info **last_resort, int *last_idx, int *dflt);
174
175extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
176 struct nlmsghdr *n, struct netlink_skb_parms *req);
177
178static kmem_cache_t *fn_alias_kmem;
179static struct trie *trie_local = NULL, *trie_main = NULL;
180
181static void trie_bug(char *err)
182{
183 printk("Trie Bug: %s\n", err);
184 BUG();
185}
186
187static inline struct node *tnode_get_child(struct tnode *tn, int i)
188{
189 if (i >= 1<<tn->bits)
190 trie_bug("tnode_get_child");
191
192 return tn->child[i];
193}
194
195static inline int tnode_child_length(struct tnode *tn)
196{
197 return 1<<tn->bits;
198}
199
200/*
201 _________________________________________________________________
202 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203 ----------------------------------------------------------------
204 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
205
206 _________________________________________________________________
207 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
208 -----------------------------------------------------------------
209 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
210
211 tp->pos = 7
212 tp->bits = 3
213 n->pos = 15
214 n->bits=4
215 KEYLENGTH=32
216*/
217
218static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
219{
220 if (offset < KEYLENGTH)
221 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
222 else
223 return 0;
224}
225
226static inline int tkey_equals(t_key a, t_key b)
227{
228 return a == b;
229}
230
231static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232{
233 if (bits == 0 || offset >= KEYLENGTH)
234 return 1;
235 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237}
238
239static inline int tkey_mismatch(t_key a, int offset, t_key b)
240{
241 t_key diff = a ^ b;
242 int i = offset;
243
244 if(!diff)
245 return 0;
246 while((diff << i) >> (KEYLENGTH-1) == 0)
247 i++;
248 return i;
249}
250
251/* Candidate for fib_semantics */
252
253static void fn_free_alias(struct fib_alias *fa)
254{
255 fib_release_info(fa->fa_info);
256 kmem_cache_free(fn_alias_kmem, fa);
257}
258
259/*
260 To understand this stuff, an understanding of keys and all their bits is
261 necessary. Every node in the trie has a key associated with it, but not
262 all of the bits in that key are significant.
263
264 Consider a node 'n' and its parent 'tp'.
265
266 If n is a leaf, every bit in its key is significant. Its presence is
267 necessitated by path compression, since during a tree traversal (when
268 searching for a leaf - unless we are doing an insertion) we will completely
269 ignore all skipped bits we encounter. Thus we need to verify, at the end of
270 a potentially successful search, that we have indeed been walking the
271 correct key path.
272
273 Note that we can never "miss" the correct key in the tree if present by
274 following the wrong path. Path compression ensures that segments of the key
275 that are the same for all keys with a given prefix are skipped, but the
276 skipped part *is* identical for each node in the subtrie below the skipped
277 bit! trie_insert() in this implementation takes care of that - note the
278 call to tkey_sub_equals() in trie_insert().
279
280 If n is an internal node - a 'tnode' here, the various parts of its key
281 have many different meanings.
282
283 Example:
284 _________________________________________________________________
285 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
286 -----------------------------------------------------------------
287 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
288
289 _________________________________________________________________
290 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
291 -----------------------------------------------------------------
292 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
293
294 tp->pos = 7
295 tp->bits = 3
296 n->pos = 15
297 n->bits=4
298
299 First, let's just ignore the bits that come before the parent tp, that is
300 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
301 not use them for anything.
302
303 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
304 index into the parent's child array. That is, they will be used to find
305 'n' among tp's children.
306
307 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
308 for the node n.
309
310 All the bits we have seen so far are significant to the node n. The rest
311 of the bits are really not needed or indeed known in n->key.
312
313 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314 n's child array, and will of course be different for each child.
315
316 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317 at this point.
318
319*/
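A small worked example, using the helpers defined earlier in this file and a made-up key, may make the bit arithmetic concrete (the function below is illustrative and not part of the patch):

static void trie_key_example(void)
{
	/* key = 10.0.0.0 in host byte order */
	t_key key = 0x0a000000;

	/* first 8 bits of the key: the leading "10" octet */
	t_key oct = tkey_extract_bits(key, 0, 8);      /* 0x0a */

	/* a tnode with pos=8, bits=2 indexes its children by key bits 8..9 */
	t_key cindex = tkey_extract_bits(key, 8, 2);   /* 0 */

	/* keep only the first 8 bits: the 10.0.0.0/8 prefix */
	t_key pfx = MASK_PFX(key, 8);                  /* 0x0a000000 */

	/* first bit where 10.0.0.0 and 10.128.0.0 differ: bit 8; this is
	 * the value fib_insert_node() uses as a new tnode's 'pos' */
	int pos = tkey_mismatch(key, 0, 0x0a800000);   /* 8 */

	(void)oct; (void)cindex; (void)pfx; (void)pos;
}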
320
321static void check_tnode(struct tnode *tn)
322{
323 if(tn && tn->pos+tn->bits > 32) {
324 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325 }
326}
327
328static int halve_threshold = 25;
329static int inflate_threshold = 50;
330
331static struct leaf *leaf_new(void)
332{
333 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
334 if(l) {
335 NODE_INIT_PARENT(l, T_LEAF);
336 INIT_HLIST_HEAD(&l->list);
337 }
338 return l;
339}
340
341static struct leaf_info *leaf_info_new(int plen)
342{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen;
345 INIT_LIST_HEAD(&li->falh);
346 return li;
347}
348
349static inline void free_leaf(struct leaf *l)
350{
351 kfree(l);
352}
353
354static inline void free_leaf_info(struct leaf_info *li)
355{
356 kfree(li);
357}
358
359static struct tnode* tnode_new(t_key key, int pos, int bits)
360{
361 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL);
364
365 if(tn) {
366 memset(tn, 0, sz);
367 NODE_INIT_PARENT(tn, T_TNODE);
368 tn->pos = pos;
369 tn->bits = bits;
370 tn->key = key;
371 tn->full_children = 0;
372 tn->empty_children = 1<<bits;
373 }
374 if(trie_debug > 0)
375 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
376 (unsigned int) (sizeof(struct node) * 1<<bits));
377 return tn;
378}
379
380static void tnode_free(struct tnode *tn)
381{
382 if(!tn) {
383 trie_bug("tnode_free\n");
384 }
385 if(IS_LEAF(tn)) {
386 free_leaf((struct leaf *)tn);
387 if(trie_debug > 0 )
388 printk("FL %p \n", tn);
389 }
390 else if(IS_TNODE(tn)) {
391 kfree(tn);
392 if(trie_debug > 0 )
393 printk("FT %p \n", tn);
394 }
395 else {
396 trie_bug("tnode_free\n");
397 }
398}
399
400/*
401 * Check whether a tnode 'n' is "full", i.e. it is an internal node
402 * and no bits are skipped. See discussion in dyntree paper p. 6
403 */
404
405static inline int tnode_full(struct tnode *tn, struct node *n)
406{
407 if(n == NULL || IS_LEAF(n))
408 return 0;
409
410 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
411}
412
413static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
414{
415 tnode_put_child_reorg(tn, i, n, -1);
416}
417
418 /*
419 * Add a child at position i overwriting the old value.
420 * Update the value of full_children and empty_children.
421 */
422
423static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
424{
425 struct node *chi;
426 int isfull;
427
428 if(i >= 1<<tn->bits) {
429 printk("bits=%d, i=%d\n", tn->bits, i);
430 trie_bug("tnode_put_child_reorg bits");
431 }
432 write_lock_bh(&fib_lock);
433 chi = tn->child[i];
434
435 /* update emptyChildren */
436 if (n == NULL && chi != NULL)
437 tn->empty_children++;
438 else if (n != NULL && chi == NULL)
439 tn->empty_children--;
440
441 /* update fullChildren */
442 if (wasfull == -1)
443 wasfull = tnode_full(tn, chi);
444
445 isfull = tnode_full(tn, n);
446 if (wasfull && !isfull)
447 tn->full_children--;
448
449 else if (!wasfull && isfull)
450 tn->full_children++;
451 if(n)
452 NODE_SET_PARENT(n, tn);
453
454 tn->child[i] = n;
455 write_unlock_bh(&fib_lock);
456}
457
458static struct node *resize(struct trie *t, struct tnode *tn)
459{
460 int i;
461
462 if (!tn)
463 return NULL;
464
465 if(trie_debug)
466 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 tn, inflate_threshold, halve_threshold);
468
469 /* No children */
470 if (tn->empty_children == tnode_child_length(tn)) {
471 tnode_free(tn);
472 return NULL;
473 }
474 /* One child */
475 if (tn->empty_children == tnode_child_length(tn) - 1)
476 for (i = 0; i < tnode_child_length(tn); i++) {
477
478 write_lock_bh(&fib_lock);
479 if (tn->child[i] != NULL) {
480
481 /* compress one level */
482 struct node *n = tn->child[i];
483 if(n)
484 NODE_INIT_PARENT(n, NODE_TYPE(n));
485
486 write_unlock_bh(&fib_lock);
487 tnode_free(tn);
488 return n;
489 }
490 write_unlock_bh(&fib_lock);
491 }
492 /*
493 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold.
495 */
496
497 /*
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'."
503 *
504 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50.
510 *
511 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here.
517 *
518 * A clearer way to write this would be:
519 *
520 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 * tn->full_children;
523 *
524 * new_child_length = tnode_child_length(tn) * 2;
525 *
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * new_child_length;
528 * if (new_fill_factor >= inflate_threshold)
529 *
530 * ...and so on, tho it would mess up the while() loop.
531 *
532 * anyway,
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 * inflate_threshold
535 *
536 * avoid a division:
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length
539 *
540 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children ) >= inflate_threshold * new_child_length
543 *
544 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children ) >=
547 * inflate_threshold * tnode_child_length(tn) * 2
548 *
549 * shorten again:
550 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children ) >= inflate_threshold *
552 * tnode_child_length(tn)
553 *
554 */
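To put concrete numbers on that last inequality (purely illustrative): take a tnode with bits = 4, so tnode_child_length(tn) = 16, and suppose empty_children = 6 and full_children = 6. The left-hand side is 50 * (6 + 16 - 6) = 800 and the right-hand side is inflate_threshold * 16 = 50 * 16 = 800, so the node is inflated; after doubling, the 32 child slots hold (16 - 6 - 6) + 2 * 6 = 16 non-empty entries, exactly the 50% fill factor the threshold demands.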
555
556 check_tnode(tn);
557
558 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) {
561
562 tn = inflate(t, tn);
563 }
564
565 check_tnode(tn);
566
567 /*
568 * Halve as long as the number of empty children in this
569 * node is above threshold.
570 */
571 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn))
574
575 tn = halve(t, tn);
576
577 /* Only one child remains */
578
579 if (tn->empty_children == tnode_child_length(tn) - 1)
580 for (i = 0; i < tnode_child_length(tn); i++) {
581
582 write_lock_bh(&fib_lock);
583 if (tn->child[i] != NULL) {
584 /* compress one level */
585 struct node *n = tn->child[i];
586
587 if(n)
588 NODE_INIT_PARENT(n, NODE_TYPE(n));
589
590 write_unlock_bh(&fib_lock);
591 tnode_free(tn);
592 return n;
593 }
594 write_unlock_bh(&fib_lock);
595 }
596
597 return (struct node *) tn;
598}
599
600static struct tnode *inflate(struct trie *t, struct tnode *tn)
601{
602 struct tnode *inode;
603 struct tnode *oldtnode = tn;
604 int olen = tnode_child_length(tn);
605 int i;
606
607 if(trie_debug)
608 printk("In inflate\n");
609
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611
612 if (!tn)
613 trie_bug("tnode_new failed");
614
615 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i);
617
618 /* An empty child */
619 if (node == NULL)
620 continue;
621
622 /* A leaf or an internal node with skipped bits */
623
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
627 1) == 0)
628 put_child(t, tn, 2*i, node);
629 else
630 put_child(t, tn, 2*i+1, node);
631 continue;
632 }
633
634 /* An internal node with two children */
635 inode = (struct tnode *) node;
636
637 if (inode->bits == 1) {
638 put_child(t, tn, 2*i, inode->child[0]);
639 put_child(t, tn, 2*i+1, inode->child[1]);
640
641 tnode_free(inode);
642 }
643
644 /* An internal node with more than two children */
645 else {
646 struct tnode *left, *right;
647 int size, j;
648
649 /* We will replace this node 'inode' with two new
650 * ones, 'left' and 'right', each with half of the
651 * original children. The two new nodes will have
652 * a position one bit further down the key and this
653 * means that the "significant" part of their keys
654 * (see the discussion near the top of this file)
655 * will differ by one bit, which will be "0" in
656 * left's key and "1" in right's key. Since we are
657 * moving the key position by one step, the bit that
658 * we are moving away from - the bit at position
659 * (inode->pos) - is the one that will differ between
660 * left and right. So... we synthesize that bit in the
661 * two new keys.
662 * The mask 'm' below will be a single "one" bit at
663 * the position (inode->pos)
664 */
665
666 t_key m = TKEY_GET_MASK(inode->pos, 1);
667
668 /* Use the old key, but set the new significant
669 * bit to zero.
670 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673
674 if(!left)
675 trie_bug("tnode_new failed");
676
677
678 /* Use the old key, but set the new significant
679 * bit to one.
680 */
681 right = tnode_new(inode->key|m, inode->pos + 1,
682 inode->bits - 1);
683
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]);
690 put_child(t, right, j, inode->child[j + size]);
691 }
692 put_child(t, tn, 2*i, resize(t, left));
693 put_child(t, tn, 2*i+1, resize(t, right));
694
695 tnode_free(inode);
696 }
697 }
698 tnode_free(oldtnode);
699 return tn;
700}
701
702static struct tnode *halve(struct trie *t, struct tnode *tn)
703{
704 struct tnode *oldtnode = tn;
705 struct node *left, *right;
706 int i;
707 int olen = tnode_child_length(tn);
708
709 if(trie_debug) printk("In halve\n");
710
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
712
713 if(!tn)
714 trie_bug("tnode_new failed");
715
716 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i);
718 right = tnode_get_child(oldtnode, i+1);
719
720 /* At least one of the children is empty */
721 if (left == NULL) {
722 if (right == NULL) /* Both are empty */
723 continue;
724 put_child(t, tn, i/2, right);
725 } else if (right == NULL)
726 put_child(t, tn, i/2, left);
727
728 /* Two nonempty children */
729 else {
730 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1);
732
733 if(!newBinNode)
734 trie_bug("tnode_new failed");
735
736 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right);
738 put_child(t, tn, i/2, resize(t, newBinNode));
739 }
740 }
741 tnode_free(oldtnode);
742 return tn;
743}
744
745static void *trie_init(struct trie *t)
746{
747 if(t) {
748 t->size = 0;
749 t->trie = NULL;
750 t->revision = 0;
751#ifdef CONFIG_IP_FIB_TRIE_STATS
752 memset(&t->stats, 0, sizeof(struct trie_use_stats));
753#endif
754 }
755 return t;
756}
757
758static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
759{
760 struct hlist_node *node;
761 struct leaf_info *li;
762
763 hlist_for_each_entry(li, node, head, hlist) {
764
765 if ( li->plen == plen )
766 return li;
767 }
768 return NULL;
769}
770
771static inline struct list_head * get_fa_head(struct leaf *l, int plen)
772{
773 struct list_head *fa_head=NULL;
774 struct leaf_info *li = find_leaf_info(&l->list, plen);
775
776 if(li)
777 fa_head = &li->falh;
778
779 return fa_head;
780}
781
782static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
783{
784 struct leaf_info *li=NULL, *last=NULL;
785 struct hlist_node *node, *tmp;
786
787 write_lock_bh(&fib_lock);
788
789 if(hlist_empty(head))
790 hlist_add_head(&new->hlist, head);
791 else {
792 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
793
794 if (new->plen > li->plen)
795 break;
796
797 last = li;
798 }
799 if(last)
800 hlist_add_after(&last->hlist, &new->hlist);
801 else
802 hlist_add_before(&new->hlist, &li->hlist);
803 }
804 write_unlock_bh(&fib_lock);
805}
806
807static struct leaf *
808fib_find_node(struct trie *t, u32 key)
809{
810 int pos;
811 struct tnode *tn;
812 struct node *n;
813
814 pos = 0;
815 n=t->trie;
816
817 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
818 tn = (struct tnode *) n;
819
820 check_tnode(tn);
821
822 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
823 pos=tn->pos + tn->bits;
824 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
825 }
826 else
827 break;
828 }
829 /* Case we have found a leaf. Compare prefixes */
830
831 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
832 struct leaf *l = (struct leaf *) n;
833 return l;
834 }
835 return NULL;
836}
837
838static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
839{
840 int i = 0;
841 int wasfull;
842 t_key cindex, key;
843 struct tnode *tp = NULL;
844
845 if(!tn)
846 BUG();
847
848 key = tn->key;
849 i = 0;
850
851 while (tn != NULL && NODE_PARENT(tn) != NULL) {
852
853 if( i > 10 ) {
854 printk("Rebalance tn=%p \n", tn);
855 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
856
857 printk("Rebalance tp=%p \n", tp);
858 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
859 }
860
861 if( i > 12 ) BUG();
862 i++;
863
864 tp = NODE_PARENT(tn);
865 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
866 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
867 tn = (struct tnode *) resize (t, (struct tnode *)tn);
868 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
869
870 if(!NODE_PARENT(tn))
871 break;
872
873 tn = NODE_PARENT(tn);
874 }
875 /* Handle last (top) tnode */
876 if (IS_TNODE(tn))
877 tn = (struct tnode*) resize(t, (struct tnode *)tn);
878
879 return (struct node*) tn;
880}
881
882static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen)
884{
885 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL;
887 struct node *n;
888 struct leaf *l;
889 int missbit;
890 struct list_head *fa_head=NULL;
891 struct leaf_info *li;
892 t_key cindex;
893
894 pos = 0;
895 n=t->trie;
896
897 /* If we point to NULL, stop. Either the tree is empty and we should
898 * just put a new leaf in it, or we have reached an empty child slot,
899 * and we should just put our new leaf in that.
900 * If we point to a T_TNODE, check if it matches our key. Note that
901 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 * not be the parent's 'pos'+'bits'!
903 *
904 * If it does match the current key, get pos/bits from it, extract
905 * the index from our key, push the T_TNODE and walk the tree.
906 *
907 * If it doesn't, we have to replace it with a new T_TNODE.
908 *
909 * If we point to a T_LEAF, it might or might not have the same key
910 * as we do. If it does, just change the value, update the T_LEAF's
911 * value, and return it.
912 * If it doesn't, we need to replace it with a T_TNODE.
913 */
914
915 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
916 tn = (struct tnode *) n;
917
918 check_tnode(tn);
919
920 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
921 tp = tn;
922 pos=tn->pos + tn->bits;
923 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924
925 if(n && NODE_PARENT(n) != tn) {
926 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
927 BUG();
928 }
929 }
930 else
931 break;
932 }
933
934 /*
935 * n ----> NULL, LEAF or TNODE
936 *
937 * tp is n's (parent) ----> NULL or TNODE
938 */
939
940 if(tp && IS_LEAF(tp))
941 BUG();
942
943 t->revision++;
944
945 /* Case 1: n is a leaf. Compare prefixes */
946
947 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
948 struct leaf *l = ( struct leaf *) n;
949
950 li = leaf_info_new(plen);
951
952 if(! li)
953 BUG();
954
955 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li);
957 goto done;
958 }
959 t->size++;
960 l = leaf_new();
961
962 if(! l)
963 BUG();
964
965 l->key = key;
966 li = leaf_info_new(plen);
967
968 if(! li)
969 BUG();
970
971 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li);
973
974 /* Case 2: n is NULL, and will just insert a new leaf */
975 if (t->trie && n == NULL) {
976
977 NODE_SET_PARENT(l, tp);
978
979 if (!tp)
980 BUG();
981
982 else {
983 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
984 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
985 }
986 }
987 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
988 else {
989 /*
990 * Add a new tnode here
991 * first tnode need some special handling
992 */
993
994 if (tp)
995 pos=tp->pos+tp->bits;
996 else
997 pos=0;
998 if(n) {
999 newpos = tkey_mismatch(key, pos, n->key);
1000 tn = tnode_new(n->key, newpos, 1);
1001 }
1002 else {
1003 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008
1009 NODE_SET_PARENT(tn, tp);
1010
1011 missbit=tkey_extract_bits(key, newpos, 1);
1012 put_child(t, tn, missbit, (struct node *)l);
1013 put_child(t, tn, 1-missbit, n);
1014
1015 if(tp) {
1016 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018 }
1019 else {
1020 t->trie = (struct node*) tn; /* First tnode */
1021 tp = tn;
1022 }
1023 }
1024 if(tp && tp->pos+tp->bits > 32) {
1025 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 tp, tp->pos, tp->bits, key, plen);
1027 }
1028 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp);
1030done:;
1031 return fa_head;
1032}
1033
1034static int
1035fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1036 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1037{
1038 struct trie *t = (struct trie *) tb->tb_data;
1039 struct fib_alias *fa, *new_fa;
1040 struct list_head *fa_head=NULL;
1041 struct fib_info *fi;
1042 int plen = r->rtm_dst_len;
1043 int type = r->rtm_type;
1044 u8 tos = r->rtm_tos;
1045 u32 key, mask;
1046 int err;
1047 struct leaf *l;
1048
1049 if (plen > 32)
1050 return -EINVAL;
1051
1052 key = 0;
1053 if (rta->rta_dst)
1054 memcpy(&key, rta->rta_dst, 4);
1055
1056 key = ntohl(key);
1057
1058 if(trie_debug)
1059 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060
1061 mask = ntohl( inet_make_mask(plen) );
1062
1063 if(key & ~mask)
1064 return -EINVAL;
1065
1066 key = key & mask;
1067
1068 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1069 goto err;
1070
1071 l = fib_find_node(t, key);
1072 fa = NULL;
1073
1074 if(l) {
1075 fa_head = get_fa_head(l, plen);
1076 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077 }
1078
1079 /* Now fa, if non-NULL, points to the first fib alias
1080 * with the same keys [prefix,tos,priority], if such key already
1081 * exists or to the node before which we will insert new one.
1082 *
1083 * If fa is NULL, we will need to allocate a new one and
1084 * insert to the head of f.
1085 *
1086 * If f is NULL, no fib node matched the destination key
1087 * and we need to allocate a new one of those as well.
1088 */
1089
1090 if (fa &&
1091 fa->fa_info->fib_priority == fi->fib_priority) {
1092 struct fib_alias *fa_orig;
1093
1094 err = -EEXIST;
1095 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1096 goto out;
1097
1098 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1099 struct fib_info *fi_drop;
1100 u8 state;
1101
1102 write_lock_bh(&fib_lock);
1103
1104 fi_drop = fa->fa_info;
1105 fa->fa_info = fi;
1106 fa->fa_type = type;
1107 fa->fa_scope = r->rtm_scope;
1108 state = fa->fa_state;
1109 fa->fa_state &= ~FA_S_ACCESSED;
1110
1111 write_unlock_bh(&fib_lock);
1112
1113 fib_release_info(fi_drop);
1114 if (state & FA_S_ACCESSED)
1115 rt_cache_flush(-1);
1116
1117 goto succeeded;
1118 }
1119 /* Error if we find a perfect match which
1120 * uses the same scope, type, and nexthop
1121 * information.
1122 */
1123 fa_orig = fa;
1124 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1125 if (fa->fa_tos != tos)
1126 break;
1127 if (fa->fa_info->fib_priority != fi->fib_priority)
1128 break;
1129 if (fa->fa_type == type &&
1130 fa->fa_scope == r->rtm_scope &&
1131 fa->fa_info == fi) {
1132 goto out;
1133 }
1134 }
1135 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1136 fa = fa_orig;
1137 }
1138 err = -ENOENT;
1139 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1140 goto out;
1141
1142 err = -ENOBUFS;
1143 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1144 if (new_fa == NULL)
1145 goto out;
1146
1147 new_fa->fa_info = fi;
1148 new_fa->fa_tos = tos;
1149 new_fa->fa_type = type;
1150 new_fa->fa_scope = r->rtm_scope;
1151 new_fa->fa_state = 0;
1152#if 0
1153 new_fa->dst = NULL;
1154#endif
1155 /*
1156 * Insert new entry to the list.
1157 */
1158
1159 if(!fa_head)
1160 fa_head = fib_insert_node(t, key, plen);
1161
1162 write_lock_bh(&fib_lock);
1163
1164 list_add_tail(&new_fa->fa_list,
1165 (fa ? &fa->fa_list : fa_head));
1166
1167 write_unlock_bh(&fib_lock);
1168
1169 rt_cache_flush(-1);
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded:
1172 return 0;
1173out:
1174 fib_release_info(fi);
1175err:;
1176 return err;
1177}
1178
1179static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1180 struct fib_result *res, int *err)
1181{
1182 int i;
1183 t_key mask;
1184 struct leaf_info *li;
1185 struct hlist_head *hhead = &l->list;
1186 struct hlist_node *node;
1187
1188 hlist_for_each_entry(li, node, hhead, hlist) {
1189
1190 i = li->plen;
1191 mask = ntohl(inet_make_mask(i));
1192 if (l->key != (key & mask))
1193 continue;
1194
1195 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1196 *plen = i;
1197#ifdef CONFIG_IP_FIB_TRIE_STATS
1198 t->stats.semantic_match_passed++;
1199#endif
1200 return 1;
1201 }
1202#ifdef CONFIG_IP_FIB_TRIE_STATS
1203 t->stats.semantic_match_miss++;
1204#endif
1205 }
1206 return 0;
1207}
1208
1209static int
1210fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1211{
1212 struct trie *t = (struct trie *) tb->tb_data;
1213 int plen, ret = 0;
1214 struct node *n;
1215 struct tnode *pn;
1216 int pos, bits;
1217 t_key key=ntohl(flp->fl4_dst);
1218 int chopped_off;
1219 t_key cindex = 0;
1220 int current_prefix_length = KEYLENGTH;
1221 n = t->trie;
1222
1223 read_lock(&fib_lock);
1224 if(!n)
1225 goto failed;
1226
1227#ifdef CONFIG_IP_FIB_TRIE_STATS
1228 t->stats.gets++;
1229#endif
1230
1231 /* Just a leaf? */
1232 if (IS_LEAF(n)) {
1233 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1234 goto found;
1235 goto failed;
1236 }
1237 pn = (struct tnode *) n;
1238 chopped_off = 0;
1239
1240 while (pn) {
1241
1242 pos = pn->pos;
1243 bits = pn->bits;
1244
1245 if(!chopped_off)
1246 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247
1248 n = tnode_get_child(pn, cindex);
1249
1250 if (n == NULL) {
1251#ifdef CONFIG_IP_FIB_TRIE_STATS
1252 t->stats.null_node_hit++;
1253#endif
1254 goto backtrace;
1255 }
1256
1257 if (IS_TNODE(n)) {
1258#define HL_OPTIMIZE
1259#ifdef HL_OPTIMIZE
1260 struct tnode *cn = (struct tnode *)n;
1261 t_key node_prefix, key_prefix, pref_mismatch;
1262 int mp;
1263
1264 /*
1265 * It's a tnode, and we can do some extra checks here if we
1266 * like, to avoid descending into a dead-end branch.
1267 * This tnode is in the parent's child array at index
1268 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 * chopped off, so in reality the index may be just a
1270 * subprefix, padded with zero at the end.
1271 * We can also take a look at any skipped bits in this
1272 * tnode - everything up to p_pos is supposed to be ok,
1273 * and the non-chopped bits of the index (see previous
1274 * paragraph) are also guaranteed ok, but the rest is
1275 * considered unknown.
1276 *
1277 * The skipped bits are key[pos+bits..cn->pos].
1278 */
1279
1280 /* If current_prefix_length < pos+bits, we are already doing
1281 * actual prefix matching, which means everything from
1282 * pos+(bits-chopped_off) onward must be zero along some
1283 * branch of this subtree - otherwise there is *no* valid
1284 * prefix present. Here we can only check the skipped
1285 * bits. Remember, since we have already indexed into the
1286 * parent's child array, we know that the bits we chopped off
1287 * *are* zero.
1288 */
1289
1290 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291
1292 if (current_prefix_length < pos+bits) {
1293 if (tkey_extract_bits(cn->key, current_prefix_length,
1294 cn->pos - current_prefix_length) != 0 ||
1295 !(cn->child[0]))
1296 goto backtrace;
1297 }
1298
1299 /*
1300 * If chopped_off=0, the index is fully validated and we
1301 * only need to look at the skipped bits for this, the new,
1302 * tnode. What we actually want to do is to find out if
1303 * these skipped bits match our key perfectly, or if we will
1304 * have to count on finding a matching prefix further down,
1305 * because if we do, we would like to have some way of
1306 * verifying the existence of such a prefix at this point.
1307 */
1308
1309 /* The only thing we can do at this point is to verify that
1310 * any such matching prefix can indeed be a prefix to our
1311 * key, and if the bits in the node we are inspecting that
1312 * do not match our key are not ZERO, this cannot be true.
1313 * Thus, find out where there is a mismatch (before cn->pos)
1314 * and verify that all the mismatching bits are zero in the
1315 * new tnode's key.
1316 */
1317
1318 /* Note: We aren't very concerned about the piece of the key
1319 * that precedes pn->pos+pn->bits, since these have already been
1320 * checked. The bits after cn->pos aren't checked since these are
1321 * by definition "unknown" at this point. Thus, what we want to
1322 * see is if we are about to enter the "prefix matching" state,
1323 * and in that case verify that the skipped bits that will prevail
1324 * throughout this subtree are zero, as they have to be if we are
1325 * to find a matching prefix.
1326 */
1327
1328 node_prefix = MASK_PFX(cn->key, cn->pos);
1329 key_prefix = MASK_PFX(key, cn->pos);
1330 pref_mismatch = key_prefix^node_prefix;
1331 mp = 0;
1332
1333 /* In short: If skipped bits in this node do not match the search
1334 * key, enter the "prefix matching" state directly.
1335 */
1336 if (pref_mismatch) {
1337 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1338 mp++;
1339 pref_mismatch = pref_mismatch <<1;
1340 }
1341 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342
1343 if (key_prefix != 0)
1344 goto backtrace;
1345
1346 if (current_prefix_length >= cn->pos)
1347 current_prefix_length=mp;
1348 }
1349#endif
1350 pn = (struct tnode *)n; /* Descend */
1351 chopped_off = 0;
1352 continue;
1353 }
1354 if (IS_LEAF(n)) {
1355 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1356 goto found;
1357 }
1358backtrace:
1359 chopped_off++;
1360
1361 /* Bits that are already zero don't change the child index (cindex), so skip them */
1362 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1363 chopped_off++;
1364 }
1365
1366 /* Decrease current_... with bits chopped off */
1367 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368 current_prefix_length = pn->pos + pn->bits - chopped_off;
1369
1370 /*
1371 * Either we do the actual chop-off here, or, if we have
1372 * chopped off all bits in this tnode, walk up to our parent.
1373 */
1374
1375 if(chopped_off <= pn->bits)
1376 cindex &= ~(1 << (chopped_off-1));
1377 else {
1378 if( NODE_PARENT(pn) == NULL)
1379 goto failed;
1380
1381 /* Get Child's index */
1382 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383 pn = NODE_PARENT(pn);
1384 chopped_off = 0;
1385
1386#ifdef CONFIG_IP_FIB_TRIE_STATS
1387 t->stats.backtrack++;
1388#endif
1389 goto backtrace;
1390 }
1391 }
1392failed:
1393 ret = 1;
1394found:
1395 read_unlock(&fib_lock);
1396 return ret;
1397}
1398
1399static int trie_leaf_remove(struct trie *t, t_key key)
1400{
1401 t_key cindex;
1402 struct tnode *tp = NULL;
1403 struct node *n = t->trie;
1404 struct leaf *l;
1405
1406 if(trie_debug)
1407 printk("entering trie_leaf_remove(%p)\n", n);
1408
1409 /* Note that in the case skipped bits, those bits are *not* checked!
1410 * When we finish this, we will have NULL or a T_LEAF, and the
1411 * T_LEAF may or may not match our key.
1412 */
1413
1414 while (n != NULL && IS_TNODE(n)) {
1415 struct tnode *tn = (struct tnode *) n;
1416 check_tnode(tn);
1417 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418
1419 if(n && NODE_PARENT(n) != tn) {
1420 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421 BUG();
1422 }
1423 }
1424 l = (struct leaf *) n;
1425
1426 if(!n || !tkey_equals(l->key, key))
1427 return 0;
1428
1429 /*
1430 * Key found.
1431 * Remove the leaf and rebalance the tree
1432 */
1433
1434 t->revision++;
1435 t->size--;
1436
1437 tp = NODE_PARENT(n);
1438 tnode_free((struct tnode *) n);
1439
1440 if(tp) {
1441 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442 put_child(t, (struct tnode *)tp, cindex, NULL);
1443 t->trie = trie_rebalance(t, tp);
1444 }
1445 else
1446 t->trie = NULL;
1447
1448 return 1;
1449}
1450
1451static int
1452fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1453 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1454{
1455 struct trie *t = (struct trie *) tb->tb_data;
1456 u32 key, mask;
1457 int plen = r->rtm_dst_len;
1458 u8 tos = r->rtm_tos;
1459 struct fib_alias *fa, *fa_to_delete;
1460 struct list_head *fa_head;
1461 struct leaf *l;
1462
1463 if (plen > 32)
1464 return -EINVAL;
1465
1466 key = 0;
1467 if (rta->rta_dst)
1468 memcpy(&key, rta->rta_dst, 4);
1469
1470 key = ntohl(key);
1471 mask = ntohl( inet_make_mask(plen) );
1472
1473 if(key & ~mask)
1474 return -EINVAL;
1475
1476 key = key & mask;
1477 l = fib_find_node(t, key);
1478
1479 if(!l)
1480 return -ESRCH;
1481
1482 fa_head = get_fa_head(l, plen);
1483 fa = fib_find_alias(fa_head, tos, 0);
1484
1485 if (!fa)
1486 return -ESRCH;
1487
1488 if (trie_debug)
1489 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1490
1491 fa_to_delete = NULL;
1492 fa_head = fa->fa_list.prev;
1493 list_for_each_entry(fa, fa_head, fa_list) {
1494 struct fib_info *fi = fa->fa_info;
1495
1496 if (fa->fa_tos != tos)
1497 break;
1498
1499 if ((!r->rtm_type ||
1500 fa->fa_type == r->rtm_type) &&
1501 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1502 fa->fa_scope == r->rtm_scope) &&
1503 (!r->rtm_protocol ||
1504 fi->fib_protocol == r->rtm_protocol) &&
1505 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1506 fa_to_delete = fa;
1507 break;
1508 }
1509 }
1510
1511 if (fa_to_delete) {
1512 int kill_li = 0;
1513 struct leaf_info *li;
1514
1515 fa = fa_to_delete;
1516 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1517
1518 l = fib_find_node(t, key);
1519 li = find_leaf_info(&l->list, plen);
1520
1521 write_lock_bh(&fib_lock);
1522
1523 list_del(&fa->fa_list);
1524
1525 if(list_empty(fa_head)) {
1526 hlist_del(&li->hlist);
1527 kill_li = 1;
1528 }
1529 write_unlock_bh(&fib_lock);
1530
1531 if(kill_li)
1532 free_leaf_info(li);
1533
1534 if(hlist_empty(&l->list))
1535 trie_leaf_remove(t, key);
1536
1537 if (fa->fa_state & FA_S_ACCESSED)
1538 rt_cache_flush(-1);
1539
1540 fn_free_alias(fa);
1541 return 0;
1542 }
1543 return -ESRCH;
1544}
1545
1546static int trie_flush_list(struct trie *t, struct list_head *head)
1547{
1548 struct fib_alias *fa, *fa_node;
1549 int found = 0;
1550
1551 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552 struct fib_info *fi = fa->fa_info;
1553
1554 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555
1556 write_lock_bh(&fib_lock);
1557 list_del(&fa->fa_list);
1558 write_unlock_bh(&fib_lock);
1559
1560 fn_free_alias(fa);
1561 found++;
1562 }
1563 }
1564 return found;
1565}
1566
1567static int trie_flush_leaf(struct trie *t, struct leaf *l)
1568{
1569 int found = 0;
1570 struct hlist_head *lih = &l->list;
1571 struct hlist_node *node, *tmp;
1572 struct leaf_info *li = NULL;
1573
1574 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575
1576 found += trie_flush_list(t, &li->falh);
1577
1578 if (list_empty(&li->falh)) {
1579
1580 write_lock_bh(&fib_lock);
1581 hlist_del(&li->hlist);
1582 write_unlock_bh(&fib_lock);
1583
1584 free_leaf_info(li);
1585 }
1586 }
1587 return found;
1588}
1589
1590static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1591{
1592 struct node *c = (struct node *) thisleaf;
1593 struct tnode *p;
1594 int idx;
1595
1596 if(c == NULL) {
1597 if(t->trie == NULL)
1598 return NULL;
1599
1600 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1601 return (struct leaf *) t->trie;
1602
1603 p = (struct tnode*) t->trie; /* Start */
1604 }
1605 else
1606 p = (struct tnode *) NODE_PARENT(c);
1607 while (p) {
1608 int pos, last;
1609
1610 /* Find the next child of the parent */
1611 if(c)
1612 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613 else
1614 pos = 0;
1615
1616 last = 1 << p->bits;
1617 for(idx = pos; idx < last ; idx++) {
1618 if( p->child[idx]) {
1619
1620 /* Descend if tnode */
1621
1622 while (IS_TNODE(p->child[idx])) {
1623 p = (struct tnode*) p->child[idx];
1624 idx = 0;
1625
1626 /* Rightmost non-NULL branch */
1627 if( p && IS_TNODE(p) )
1628 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1629
1630 /* Done with this tnode? */
1631 if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1632 goto up;
1633 }
1634 return (struct leaf*) p->child[idx];
1635 }
1636 }
1637up:
1638 /* No more children, go up one step */
1639 c = (struct node*) p;
1640 p = (struct tnode *) NODE_PARENT(p);
1641 }
1642 return NULL; /* Ready. Root of trie */
1643}
1644
1645static int fn_trie_flush(struct fib_table *tb)
1646{
1647 struct trie *t = (struct trie *) tb->tb_data;
1648 struct leaf *ll = NULL, *l = NULL;
1649 int found = 0, h;
1650
1651 t->revision++;
1652
1653 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1654 found += trie_flush_leaf(t, l);
1655
1656 if (ll && hlist_empty(&ll->list))
1657 trie_leaf_remove(t, ll->key);
1658 ll = l;
1659 }
1660
1661 if (ll && hlist_empty(&ll->list))
1662 trie_leaf_remove(t, ll->key);
1663
1664 if(trie_debug)
1665 printk("trie_flush found=%d\n", found);
1666 return found;
1667}
1668
1669static int trie_last_dflt=-1;
1670
1671static void
1672fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1673{
1674 struct trie *t = (struct trie *) tb->tb_data;
1675 int order, last_idx;
1676 struct fib_info *fi = NULL;
1677 struct fib_info *last_resort;
1678 struct fib_alias *fa = NULL;
1679 struct list_head *fa_head;
1680 struct leaf *l;
1681
1682 last_idx = -1;
1683 last_resort = NULL;
1684 order = -1;
1685
1686 read_lock(&fib_lock);
1687
1688 l = fib_find_node(t, 0);
1689 if(!l)
1690 goto out;
1691
1692 fa_head = get_fa_head(l, 0);
1693 if(!fa_head)
1694 goto out;
1695
1696 if (list_empty(fa_head))
1697 goto out;
1698
1699 list_for_each_entry(fa, fa_head, fa_list) {
1700 struct fib_info *next_fi = fa->fa_info;
1701
1702 if (fa->fa_scope != res->scope ||
1703 fa->fa_type != RTN_UNICAST)
1704 continue;
1705
1706 if (next_fi->fib_priority > res->fi->fib_priority)
1707 break;
1708 if (!next_fi->fib_nh[0].nh_gw ||
1709 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1710 continue;
1711 fa->fa_state |= FA_S_ACCESSED;
1712
1713 if (fi == NULL) {
1714 if (next_fi != res->fi)
1715 break;
1716 } else if (!fib_detect_death(fi, order, &last_resort,
1717 &last_idx, &trie_last_dflt)) {
1718 if (res->fi)
1719 fib_info_put(res->fi);
1720 res->fi = fi;
1721 atomic_inc(&fi->fib_clntref);
1722 trie_last_dflt = order;
1723 goto out;
1724 }
1725 fi = next_fi;
1726 order++;
1727 }
1728 if (order <= 0 || fi == NULL) {
1729 trie_last_dflt = -1;
1730 goto out;
1731 }
1732
1733 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1734 if (res->fi)
1735 fib_info_put(res->fi);
1736 res->fi = fi;
1737 atomic_inc(&fi->fib_clntref);
1738 trie_last_dflt = order;
1739 goto out;
1740 }
1741 if (last_idx >= 0) {
1742 if (res->fi)
1743 fib_info_put(res->fi);
1744 res->fi = last_resort;
1745 if (last_resort)
1746 atomic_inc(&last_resort->fib_clntref);
1747 }
1748 trie_last_dflt = last_idx;
1749 out:;
1750 read_unlock(&fib_lock);
1751}
1752
1753static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754 struct sk_buff *skb, struct netlink_callback *cb)
1755{
1756 int i, s_i;
1757 struct fib_alias *fa;
1758
1759 u32 xkey=htonl(key);
1760
1761 s_i=cb->args[3];
1762 i = 0;
1763
1764 list_for_each_entry(fa, fah, fa_list) {
1765 if (i < s_i) {
1766 i++;
1767 continue;
1768 }
1769 if (fa->fa_info->fib_nh == NULL) {
1770 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1771 i++;
1772 continue;
1773 }
1774 if (fa->fa_info == NULL) {
1775 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1776 i++;
1777 continue;
1778 }
1779
1780 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1781 cb->nlh->nlmsg_seq,
1782 RTM_NEWROUTE,
1783 tb->tb_id,
1784 fa->fa_type,
1785 fa->fa_scope,
1786 &xkey,
1787 plen,
1788 fa->fa_tos,
1789 fa->fa_info, 0) < 0) {
1790 cb->args[3] = i;
1791 return -1;
1792 }
1793 i++;
1794 }
1795 cb->args[3]=i;
1796 return skb->len;
1797}
1798
1799static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800 struct netlink_callback *cb)
1801{
1802 int h, s_h;
1803 struct list_head *fa_head;
1804 struct leaf *l = NULL;
1805 s_h=cb->args[2];
1806
1807 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1808
1809 if (h < s_h)
1810 continue;
1811 if (h > s_h)
1812 memset(&cb->args[3], 0,
1813 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814
1815 fa_head = get_fa_head(l, plen);
1816
1817 if(!fa_head)
1818 continue;
1819
1820 if(list_empty(fa_head))
1821 continue;
1822
1823 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1824 cb->args[2]=h;
1825 return -1;
1826 }
1827 }
1828 cb->args[2]=h;
1829 return skb->len;
1830}
1831
1832static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1833{
1834 int m, s_m;
1835 struct trie *t = (struct trie *) tb->tb_data;
1836
1837 s_m = cb->args[1];
1838
1839 read_lock(&fib_lock);
1840 for (m=0; m<=32; m++) {
1841
1842 if (m < s_m)
1843 continue;
1844 if (m > s_m)
1845 memset(&cb->args[2], 0,
1846 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1847
1848 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1849 cb->args[1] = m;
1850 goto out;
1851 }
1852 }
1853 read_unlock(&fib_lock);
1854 cb->args[1] = m;
1855 return skb->len;
1856 out:
1857 read_unlock(&fib_lock);
1858 return -1;
1859}
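
/*
 * The dump above is restartable: cb->args[1] is the prefix-length cursor,
 * cb->args[2] the leaf cursor within that prefix length, and cb->args[3]
 * the alias cursor within a leaf; whenever an outer cursor moves past its
 * saved position, the inner cursors are wiped with memset() so the next
 * level starts from the beginning.  A self-contained sketch of the same
 * two-level scheme (the element counts and names are made up):
 */
#include <stdio.h>
#include <string.h>

#define OUTER 3
#define INNER 4

/* Emit at most 'budget' items, saving progress in args[]; returns the
 * number of items written on this pass. */
static int dump_some(unsigned long args[2], int budget)
{
	int emitted = 0;
	unsigned long s_o = args[0];

	for (unsigned long o = 0; o < OUTER; o++) {
		if (o < s_o)
			continue;
		if (o > s_o)
			args[1] = 0;	/* new outer element: reset inner cursor */

		for (unsigned long i = args[1]; i < INNER; i++) {
			if (emitted == budget) {
				args[0] = o;	/* resume here next time */
				args[1] = i;
				return emitted;
			}
			printf("item %lu.%lu\n", o, i);
			emitted++;
		}
	}
	args[0] = OUTER;	/* everything dumped */
	args[1] = 0;
	return emitted;
}

int main(void)
{
	unsigned long args[2];

	memset(args, 0, sizeof(args));
	while (dump_some(args, 5) == 5)
		;	/* keep issuing partial passes until the dump completes */
	return 0;
}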
1860
 1861/* TODO: use more generic FIB names for init later */
1862
1863#ifdef CONFIG_IP_MULTIPLE_TABLES
1864struct fib_table * fib_hash_init(int id)
1865#else
1866struct fib_table * __init fib_hash_init(int id)
1867#endif
1868{
1869 struct fib_table *tb;
1870 struct trie *t;
1871
1872 if (fn_alias_kmem == NULL)
1873 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1874 sizeof(struct fib_alias),
1875 0, SLAB_HWCACHE_ALIGN,
1876 NULL, NULL);
1877
1878 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1879 GFP_KERNEL);
1880 if (tb == NULL)
1881 return NULL;
1882
1883 tb->tb_id = id;
1884 tb->tb_lookup = fn_trie_lookup;
1885 tb->tb_insert = fn_trie_insert;
1886 tb->tb_delete = fn_trie_delete;
1887 tb->tb_flush = fn_trie_flush;
1888 tb->tb_select_default = fn_trie_select_default;
1889 tb->tb_dump = fn_trie_dump;
1890 memset(tb->tb_data, 0, sizeof(struct trie));
1891
1892 t = (struct trie *) tb->tb_data;
1893
1894 trie_init(t);
1895
1896 if (id == RT_TABLE_LOCAL)
1897 trie_local=t;
1898 else if (id == RT_TABLE_MAIN)
1899 trie_main=t;
1900
1901 if (id == RT_TABLE_LOCAL)
1902 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1903
1904 return tb;
1905}
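
/*
 * Sketch of the single-allocation layout used just above: the table header
 * and its private trie come from one allocation, and the trailing tb_data
 * bytes are reinterpreted as the private structure.  The struct names and
 * fields here are illustrative stand-ins, not the kernel definitions.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct trie_priv {
	unsigned long revision;
};

struct table {
	unsigned int id;
	unsigned char data[];		/* private data lives right behind the header */
};

static struct table *table_new(unsigned int id)
{
	struct table *tb = malloc(sizeof(*tb) + sizeof(struct trie_priv));

	if (tb == NULL)
		return NULL;
	tb->id = id;
	memset(tb->data, 0, sizeof(struct trie_priv));
	return tb;
}

int main(void)
{
	struct table *tb = table_new(254);
	struct trie_priv *t;

	if (tb == NULL)
		return 1;
	t = (struct trie_priv *)tb->data;
	t->revision++;
	printf("table %u, trie revision %lu\n", tb->id, t->revision);
	free(tb);
	return 0;
}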
1906
1907/* Trie dump functions */
1908
1909static void putspace_seq(struct seq_file *seq, int n)
1910{
1911 while (n--) seq_printf(seq, " ");
1912}
1913
1914static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1915{
1916 while (bits--)
1917 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1918}
1919
1920static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921 int pend, int cindex, int bits)
1922{
1923 putspace_seq(seq, indent);
1924 if (IS_LEAF(n))
1925 seq_printf(seq, "|");
1926 else
1927 seq_printf(seq, "+");
1928 if (bits) {
1929 seq_printf(seq, "%d/", cindex);
1930 printbin_seq(seq, cindex, bits);
1931 seq_printf(seq, ": ");
1932 }
1933 else
1934 seq_printf(seq, "<root>: ");
1935 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936
1937 if (IS_LEAF(n))
1938 seq_printf(seq, "key=%d.%d.%d.%d\n",
1939 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940 else {
1941 int plen=((struct tnode *)n)->pos;
1942 t_key prf=MASK_PFX(n->key, plen);
1943 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945 }
1946 if (IS_LEAF(n)) {
1947 struct leaf *l=(struct leaf *)n;
1948 struct fib_alias *fa;
1949 int i;
1950 for (i=32; i>=0; i--)
1951 if(find_leaf_info(&l->list, i)) {
1952
1953 struct list_head *fa_head = get_fa_head(l, i);
1954
1955 if(!fa_head)
1956 continue;
1957
1958 if(list_empty(fa_head))
1959 continue;
1960
1961 putspace_seq(seq, indent+2);
1962 seq_printf(seq, "{/%d...dumping}\n", i);
1963
1964
1965 list_for_each_entry(fa, fa_head, fa_list) {
1966 putspace_seq(seq, indent+2);
 1967				if (fa->fa_info == NULL) {
 1968					seq_printf(seq, "Error fa_info=NULL\n");
 1969					continue;
 1970				}
 1971				if (fa->fa_info->fib_nh == NULL) {
 1972					seq_printf(seq, "Error _fib_nh=NULL\n");
 1973					continue;
 1974				}
1975
1976 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
1977 fa->fa_type,
1978 fa->fa_scope,
1979 fa->fa_tos);
1980 }
1981 }
1982 }
1983 else if (IS_TNODE(n)) {
1984 struct tnode *tn=(struct tnode *)n;
1985 putspace_seq(seq, indent); seq_printf(seq, "| ");
1986 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
1988 seq_printf(seq, "}\n");
1989 putspace_seq(seq, indent); seq_printf(seq, "| ");
1990 seq_printf(seq, "{pos=%d", tn->pos);
1991 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
1992 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
1993 putspace_seq(seq, indent); seq_printf(seq, "| ");
1994 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
1995 }
1996}
1997
1998static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999{
2000 struct node *n=t->trie;
2001 int cindex=0;
2002 int indent=1;
2003 int pend=0;
2004 int depth = 0;
2005
2006 read_lock(&fib_lock);
2007
2008 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2009 if (n) {
2010 printnode_seq(seq, indent, n, pend, cindex, 0);
2011 if (IS_TNODE(n)) {
2012 struct tnode *tn=(struct tnode *)n;
2013 pend = tn->pos+tn->bits;
2014 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015 indent += 3;
2016 depth++;
2017
2018 while (tn && cindex < (1 << tn->bits)) {
2019 if (tn->child[cindex]) {
2020
2021 /* Got a child */
2022
2023 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024 if (IS_LEAF(tn->child[cindex])) {
2025 cindex++;
2026
2027 }
2028 else {
2029 /*
 2030					 * New tnode. Descend one level
2031 */
2032
2033 depth++;
2034 n=tn->child[cindex];
2035 tn=(struct tnode *)n;
2036 pend=tn->pos+tn->bits;
2037 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038 indent+=3;
2039 cindex=0;
2040 }
2041 }
2042 else
2043 cindex++;
2044
2045 /*
2046 * Test if we are done
2047 */
2048
2049 while (cindex >= (1 << tn->bits)) {
2050
2051 /*
2052 * Move upwards and test for root
2053 * pop off all traversed nodes
2054 */
2055
2056 if (NODE_PARENT(tn) == NULL) {
2057 tn = NULL;
2058 n = NULL;
2059 break;
2060 }
2061 else {
2062 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063 tn = NODE_PARENT(tn);
2064 cindex++;
2065 n=(struct node *)tn;
2066 pend=tn->pos+tn->bits;
2067 indent-=3;
2068 depth--;
2069 }
2070 }
2071 }
2072 }
2073 else n = NULL;
2074 }
2075 else seq_printf(seq, "------ trie is empty\n");
2076
2077 read_unlock(&fib_lock);
2078}
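
/*
 * When the walk above pops back up a level, the child slot the current node
 * occupies in its parent is recomputed from the node's own key using the
 * parent's pos/bits, and then incremented to continue with the next sibling.
 * A worked example as a standalone snippet; extract() follows the
 * most-significant-bit-first key convention assumed throughout this walk,
 * and the key and node geometry below are made up.
 */
#include <stdio.h>

/* Take 'bits' bits of 'key', starting 'pos' bits in from the MSB end. */
static unsigned int extract(unsigned int key, int pos, int bits)
{
	return (key << pos) >> (32 - bits);
}

int main(void)
{
	unsigned int key = 0xc0a80100;	/* 192.168.1.0 */
	int parent_pos = 0, parent_bits = 4;
	unsigned int slot = extract(key, parent_pos, parent_bits);

	printf("child slot %u of %u; resume the walk at sibling %u\n",
	       slot, 1u << parent_bits, slot + 1);
	return 0;
}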
2079
2080static struct trie_stat *trie_stat_new(void)
2081{
2082 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083 int i;
2084
2085 if(s) {
2086 s->totdepth = 0;
2087 s->maxdepth = 0;
2088 s->tnodes = 0;
2089 s->leaves = 0;
2090 s->nullpointers = 0;
2091
2092 for(i=0; i< MAX_CHILDS; i++)
2093 s->nodesizes[i] = 0;
2094 }
2095 return s;
2096}
2097
2098static struct trie_stat *trie_collect_stats(struct trie *t)
2099{
2100 struct node *n=t->trie;
2101 struct trie_stat *s = trie_stat_new();
2102 int cindex = 0;
2103 int indent = 1;
2104 int pend = 0;
2105 int depth = 0;
2106
2107 read_lock(&fib_lock);
2108
2109 if (s) {
2110 if (n) {
2111 if (IS_TNODE(n)) {
2112 struct tnode *tn = (struct tnode *)n;
2113 pend=tn->pos+tn->bits;
2114 indent += 3;
2115 s->nodesizes[tn->bits]++;
2116 depth++;
2117
2118 while (tn && cindex < (1 << tn->bits)) {
2119 if (tn->child[cindex]) {
2120 /* Got a child */
2121
2122 if (IS_LEAF(tn->child[cindex])) {
2123 cindex++;
2124
2125 /* stats */
2126 if (depth > s->maxdepth)
2127 s->maxdepth = depth;
2128 s->totdepth += depth;
2129 s->leaves++;
2130 }
2131
2132 else {
2133 /*
 2134					 * New tnode. Descend one level
2135 */
2136
2137 s->tnodes++;
2138 s->nodesizes[tn->bits]++;
2139 depth++;
2140
2141 n = tn->child[cindex];
2142 tn = (struct tnode *)n;
2143 pend = tn->pos+tn->bits;
2144
2145 indent += 3;
2146 cindex = 0;
2147 }
2148 }
2149 else {
2150 cindex++;
2151 s->nullpointers++;
2152 }
2153
2154 /*
2155 * Test if we are done
2156 */
2157
2158 while (cindex >= (1 << tn->bits)) {
2159
2160 /*
2161 * Move upwards and test for root
2162 * pop off all traversed nodes
2163 */
2164
2165
2166 if (NODE_PARENT(tn) == NULL) {
2167 tn = NULL;
2168 n = NULL;
2169 break;
2170 }
2171 else {
2172 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173 tn = NODE_PARENT(tn);
2174 cindex++;
2175 n = (struct node *)tn;
2176 pend=tn->pos+tn->bits;
2177 indent -= 3;
2178 depth--;
2179 }
2180 }
2181 }
2182 }
2183 else n = NULL;
2184 }
2185 }
2186
2187 read_unlock(&fib_lock);
2188 return s;
2189}
2190
2191#ifdef CONFIG_PROC_FS
2192
2193static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2194{
2195 return NULL;
2196}
2197
2198static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2199{
2200 return NULL;
2201}
2202
2203static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2204{
2205 void *v = NULL;
2206
2207 if (ip_fib_main_table)
2208 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2209 return v;
2210}
2211
2212static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213{
2214 ++*pos;
2215 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2216}
2217
2218static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2219{
2220
2221}
2222
2223/*
 2224 *	This outputs /proc/net/fib_triestat
2225 *
2226 * It always works in backward compatibility mode.
2227 * The format of the file is not supposed to be changed.
2228 */
2229
2230static void collect_and_show(struct trie *t, struct seq_file *seq)
2231{
2232 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2233 int i, max, pointers;
2234 struct trie_stat *stat;
2235 int avdepth;
2236
2237 stat = trie_collect_stats(t);
2238
2239 bytes=0;
2240 seq_printf(seq, "trie=%p\n", t);
2241
2242 if (stat) {
2243 if (stat->leaves)
2244 avdepth=stat->totdepth*100 / stat->leaves;
2245 else
2246 avdepth=0;
2247 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249
2250 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251 bytes += sizeof(struct leaf) * stat->leaves;
2252 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2253 bytes += sizeof(struct tnode) * stat->tnodes;
2254
2255 max = MAX_CHILDS-1;
2256
2257 while (max >= 0 && stat->nodesizes[max] == 0)
2258 max--;
2259 pointers = 0;
2260
2261 for (i = 1; i <= max; i++)
2262 if (stat->nodesizes[i] != 0) {
2263 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2264 pointers += (1<<i) * stat->nodesizes[i];
2265 }
2266 seq_printf(seq, "\n");
2267 seq_printf(seq, "Pointers: %d\n", pointers);
2268 bytes += sizeof(struct node *) * pointers;
2269 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2270 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2271
2272 kfree(stat);
2273 }
2274
2275#ifdef CONFIG_IP_FIB_TRIE_STATS
2276 seq_printf(seq, "Counters:\n---------\n");
2277 seq_printf(seq,"gets = %d\n", t->stats.gets);
2278 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2282#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif
2285#endif /* CONFIG_IP_FIB_TRIE_STATS */
2286}
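
/*
 * The average depth printed above is kept as an integer number of
 * hundredths: avdepth = totdepth * 100 / leaves, then shown as
 * avdepth/100 "." avdepth%100.  A worked example with made-up counters:
 */
#include <stdio.h>

int main(void)
{
	int totdepth = 523;	/* sum of all leaf depths (illustrative) */
	int leaves = 200;
	int avdepth = totdepth * 100 / leaves;	/* 261 */

	printf("Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
	/* prints "Aver depth: 2.61" */
	return 0;
}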
2287
2288static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289{
2290 char bf[128];
2291
2292 if (v == SEQ_START_TOKEN) {
2293 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 sizeof(struct leaf), sizeof(struct tnode));
2295 if (trie_local)
2296 collect_and_show(trie_local, seq);
2297
2298 if (trie_main)
2299 collect_and_show(trie_main, seq);
2300 }
2301 else {
2302 snprintf(bf, sizeof(bf),
2303 "*\t%08X\t%08X", 200, 400);
2304
2305 seq_printf(seq, "%-127s\n", bf);
2306 }
2307 return 0;
2308}
2309
2310static struct seq_operations fib_triestat_seq_ops = {
2311 .start = fib_triestat_seq_start,
2312 .next = fib_triestat_seq_next,
2313 .stop = fib_triestat_seq_stop,
2314 .show = fib_triestat_seq_show,
2315};
2316
2317static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2318{
2319 struct seq_file *seq;
2320 int rc = -ENOMEM;
2321
2322 rc = seq_open(file, &fib_triestat_seq_ops);
2323 if (rc)
2324 goto out_kfree;
2325
2326 seq = file->private_data;
2327out:
2328 return rc;
2329out_kfree:
2330 goto out;
2331}
2332
2333static struct file_operations fib_triestat_seq_fops = {
2334 .owner = THIS_MODULE,
2335 .open = fib_triestat_seq_open,
2336 .read = seq_read,
2337 .llseek = seq_lseek,
2338 .release = seq_release_private,
2339};
2340
2341int __init fib_stat_proc_init(void)
2342{
2343 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2344 return -ENOMEM;
2345 return 0;
2346}
2347
2348void __init fib_stat_proc_exit(void)
2349{
2350 proc_net_remove("fib_triestat");
2351}
2352
2353static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2354{
2355 return NULL;
2356}
2357
2358static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2359{
2360 return NULL;
2361}
2362
2363static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2364{
2365 void *v = NULL;
2366
2367 if (ip_fib_main_table)
2368 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2369 return v;
2370}
2371
2372static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2373{
2374 ++*pos;
2375 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2376}
2377
2378static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2379{
2380
2381}
2382
2383/*
2384 * This outputs /proc/net/fib_trie.
2385 *
2386 * It always works in backward compatibility mode.
2387 * The format of the file is not supposed to be changed.
2388 */
2389
2390static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{
2392 char bf[128];
2393
2394 if (v == SEQ_START_TOKEN) {
2395 if (trie_local)
2396 trie_dump_seq(seq, trie_local);
2397
2398 if (trie_main)
2399 trie_dump_seq(seq, trie_main);
2400 }
2401
2402 else {
2403 snprintf(bf, sizeof(bf),
2404 "*\t%08X\t%08X", 200, 400);
2405 seq_printf(seq, "%-127s\n", bf);
2406 }
2407
2408 return 0;
2409}
2410
2411static struct seq_operations fib_trie_seq_ops = {
2412 .start = fib_trie_seq_start,
2413 .next = fib_trie_seq_next,
2414 .stop = fib_trie_seq_stop,
2415 .show = fib_trie_seq_show,
2416};
2417
2418static int fib_trie_seq_open(struct inode *inode, struct file *file)
2419{
2420 struct seq_file *seq;
2421 int rc = -ENOMEM;
2422
2423 rc = seq_open(file, &fib_trie_seq_ops);
2424 if (rc)
2425 goto out_kfree;
2426
2427 seq = file->private_data;
2428out:
2429 return rc;
2430out_kfree:
2431 goto out;
2432}
2433
2434static struct file_operations fib_trie_seq_fops = {
2435 .owner = THIS_MODULE,
2436 .open = fib_trie_seq_open,
2437 .read = seq_read,
2438 .llseek = seq_lseek,
2439 .release = seq_release_private,
2440};
2441
2442int __init fib_proc_init(void)
2443{
2444 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2445 return -ENOMEM;
2446 return 0;
2447}
2448
2449void __init fib_proc_exit(void)
2450{
2451 proc_net_remove("fib_trie");
2452}
2453
2454#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 85bf0d3e294b..cb759484979d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -207,6 +207,7 @@ int sysctl_icmp_ignore_bogus_error_responses;
207 207
208int sysctl_icmp_ratelimit = 1 * HZ; 208int sysctl_icmp_ratelimit = 1 * HZ;
209int sysctl_icmp_ratemask = 0x1818; 209int sysctl_icmp_ratemask = 0x1818;
210int sysctl_icmp_errors_use_inbound_ifaddr;
210 211
211/* 212/*
212 * ICMP control array. This specifies what to do with each ICMP. 213 * ICMP control array. This specifies what to do with each ICMP.
@@ -511,8 +512,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
511 */ 512 */
512 513
513 saddr = iph->daddr; 514 saddr = iph->daddr;
514 if (!(rt->rt_flags & RTCF_LOCAL)) 515 if (!(rt->rt_flags & RTCF_LOCAL)) {
515 saddr = 0; 516 if (sysctl_icmp_errors_use_inbound_ifaddr)
517 saddr = inet_select_addr(skb_in->dev, 0, RT_SCOPE_LINK);
518 else
519 saddr = 0;
520 }
516 521
517 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | 522 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
518 IPTOS_PREC_INTERNETCONTROL) : 523 IPTOS_PREC_INTERNETCONTROL) :
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4e47a2658c7c..af2ec88bbb2f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
184 raw_rcv(last, skb2); 184 raw_rcv(last, skb2);
185 } 185 }
186 last = sk; 186 last = sk;
187 nf_reset(skb);
187 } 188 }
188 } 189 }
189 190
@@ -200,10 +201,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
200{ 201{
201 int ihl = skb->nh.iph->ihl*4; 202 int ihl = skb->nh.iph->ihl*4;
202 203
203#ifdef CONFIG_NETFILTER_DEBUG
204 nf_debug_ip_local_deliver(skb);
205#endif /*CONFIG_NETFILTER_DEBUG*/
206
207 __skb_pull(skb, ihl); 204 __skb_pull(skb, ihl);
208 205
209 /* Free reference early: we don't need it any more, and it may 206 /* Free reference early: we don't need it any more, and it may
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 760dc8238d65..ee07aec215a0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,10 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110
111#ifdef CONFIG_NETFILTER_DEBUG
112 nf_debug_ip_loopback_xmit(newskb);
113#endif
114 nf_reset(newskb); 110 nf_reset(newskb);
115 netif_rx(newskb); 111 netif_rx(newskb);
116 return 0; 112 return 0;
@@ -192,10 +188,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
192 skb = skb2; 188 skb = skb2;
193 } 189 }
194 190
195#ifdef CONFIG_NETFILTER_DEBUG
196 nf_debug_ip_finish_output2(skb);
197#endif /*CONFIG_NETFILTER_DEBUG*/
198
199 nf_reset(skb); 191 nf_reset(skb);
200 192
201 if (hh) { 193 if (hh) {
@@ -415,9 +407,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
415 to->nf_bridge = from->nf_bridge; 407 to->nf_bridge = from->nf_bridge;
416 nf_bridge_get(to->nf_bridge); 408 nf_bridge_get(to->nf_bridge);
417#endif 409#endif
418#ifdef CONFIG_NETFILTER_DEBUG
419 to->nf_debug = from->nf_debug;
420#endif
421#endif 410#endif
422} 411}
423 412
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 47012b93cad2..f8b172f89811 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -360,14 +360,14 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
360 err = copied; 360 err = copied;
361 361
362 /* Reset and regenerate socket error */ 362 /* Reset and regenerate socket error */
363 spin_lock_irq(&sk->sk_error_queue.lock); 363 spin_lock_bh(&sk->sk_error_queue.lock);
364 sk->sk_err = 0; 364 sk->sk_err = 0;
365 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { 365 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
366 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; 366 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
367 spin_unlock_irq(&sk->sk_error_queue.lock); 367 spin_unlock_bh(&sk->sk_error_queue.lock);
368 sk->sk_error_report(sk); 368 sk->sk_error_report(sk);
369 } else 369 } else
370 spin_unlock_irq(&sk->sk_error_queue.lock); 370 spin_unlock_bh(&sk->sk_error_queue.lock);
371 371
372out_free_skb: 372out_free_skb:
373 kfree_skb(skb); 373 kfree_skb(skb);
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 1a23c5263b99..2065944fd9e5 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -236,15 +236,10 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
236 t->props.mode = 1; 236 t->props.mode = 1;
237 t->props.saddr.a4 = x->props.saddr.a4; 237 t->props.saddr.a4 = x->props.saddr.a4;
238 t->props.flags = x->props.flags; 238 t->props.flags = x->props.flags;
239 239
240 t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family); 240 if (xfrm_init_state(t))
241 if (t->type == NULL)
242 goto error;
243
244 if (t->type->init_state(t, NULL))
245 goto error; 241 goto error;
246 242
247 t->km.state = XFRM_STATE_VALID;
248 atomic_set(&t->tunnel_users, 1); 243 atomic_set(&t->tunnel_users, 1);
249out: 244out:
250 return t; 245 return t;
@@ -422,7 +417,7 @@ static void ipcomp_destroy(struct xfrm_state *x)
422 kfree(ipcd); 417 kfree(ipcd);
423} 418}
424 419
425static int ipcomp_init_state(struct xfrm_state *x, void *args) 420static int ipcomp_init_state(struct xfrm_state *x)
426{ 421{
427 int err; 422 int err;
428 struct ipcomp_data *ipcd; 423 struct ipcomp_data *ipcd;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e21c049ec62a..e4f809a93f47 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1350,6 +1350,7 @@ int ip_mr_input(struct sk_buff *skb)
1350 */ 1350 */
1351 read_lock(&mrt_lock); 1351 read_lock(&mrt_lock);
1352 if (mroute_socket) { 1352 if (mroute_socket) {
1353 nf_reset(skb);
1353 raw_rcv(mroute_socket, skb); 1354 raw_rcv(mroute_socket, skb);
1354 read_unlock(&mrt_lock); 1355 read_unlock(&mrt_lock);
1355 return 0; 1356 return 0;
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
index a788461a40c9..30e85de9ffff 100644
--- a/net/ipv4/ipvs/Makefile
+++ b/net/ipv4/ipvs/Makefile
@@ -11,7 +11,7 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
11 11
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ 12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ 13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
14 ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \ 14 ip_vs_est.o ip_vs_proto.o \
15 $(ip_vs_proto-objs-y) 15 $(ip_vs_proto-objs-y)
16 16
17 17
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d9701036e..12a82e91d22a 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -2059,7 +2059,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 dst->addr = src->addr; 2059 dst->addr = src->addr;
2060 dst->port = src->port; 2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark; 2061 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name); 2062 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags; 2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ; 2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask; 2065 dst->netmask = src->netmask;
@@ -2080,6 +2080,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services) 2081 if (count >= get->num_services)
2082 goto out; 2082 goto out;
2083 memset(&entry, 0, sizeof(entry));
2083 ip_vs_copy_service(&entry, svc); 2084 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count], 2085 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) { 2086 &entry, sizeof(entry))) {
@@ -2094,6 +2095,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2095 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services) 2096 if (count >= get->num_services)
2096 goto out; 2097 goto out;
2098 memset(&entry, 0, sizeof(entry));
2097 ip_vs_copy_service(&entry, svc); 2099 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count], 2100 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) { 2101 &entry, sizeof(entry))) {
@@ -2304,12 +2306,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2304 memset(&d, 0, sizeof(d)); 2306 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2307 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER; 2308 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); 2309 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2308 d[0].syncid = ip_vs_master_syncid; 2310 d[0].syncid = ip_vs_master_syncid;
2309 } 2311 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2312 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP; 2313 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); 2314 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2313 d[1].syncid = ip_vs_backup_syncid; 2315 d[1].syncid = ip_vs_backup_syncid;
2314 } 2316 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0) 2317 if (copy_to_user(user, &d, sizeof(d)) != 0)
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
index 253c46252bd5..867d4e9c6594 100644
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -216,9 +216,6 @@ int ip_vs_protocol_init(void)
216#ifdef CONFIG_IP_VS_PROTO_UDP 216#ifdef CONFIG_IP_VS_PROTO_UDP
217 REGISTER_PROTOCOL(&ip_vs_protocol_udp); 217 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
218#endif 218#endif
219#ifdef CONFIG_IP_VS_PROTO_ICMP
220 REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
221#endif
222#ifdef CONFIG_IP_VS_PROTO_AH 219#ifdef CONFIG_IP_VS_PROTO_AH
223 REGISTER_PROTOCOL(&ip_vs_protocol_ah); 220 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
224#endif 221#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
deleted file mode 100644
index 191e94aa1c1f..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_icmp.c
+++ /dev/null
@@ -1,182 +0,0 @@
1/*
2 * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, March 2002
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation;
9 *
10 */
11
12#include <linux/module.h>
13#include <linux/kernel.h>
14#include <linux/icmp.h>
15#include <linux/netfilter.h>
16#include <linux/netfilter_ipv4.h>
17
18#include <net/ip_vs.h>
19
20
21static int icmp_timeouts[1] = { 1*60*HZ };
22
23static char * icmp_state_name_table[1] = { "ICMP" };
24
25static struct ip_vs_conn *
26icmp_conn_in_get(const struct sk_buff *skb,
27 struct ip_vs_protocol *pp,
28 const struct iphdr *iph,
29 unsigned int proto_off,
30 int inverse)
31{
32#if 0
33 struct ip_vs_conn *cp;
34
35 if (likely(!inverse)) {
36 cp = ip_vs_conn_in_get(iph->protocol,
37 iph->saddr, 0,
38 iph->daddr, 0);
39 } else {
40 cp = ip_vs_conn_in_get(iph->protocol,
41 iph->daddr, 0,
42 iph->saddr, 0);
43 }
44
45 return cp;
46
47#else
48 return NULL;
49#endif
50}
51
52static struct ip_vs_conn *
53icmp_conn_out_get(const struct sk_buff *skb,
54 struct ip_vs_protocol *pp,
55 const struct iphdr *iph,
56 unsigned int proto_off,
57 int inverse)
58{
59#if 0
60 struct ip_vs_conn *cp;
61
62 if (likely(!inverse)) {
63 cp = ip_vs_conn_out_get(iph->protocol,
64 iph->saddr, 0,
65 iph->daddr, 0);
66 } else {
67 cp = ip_vs_conn_out_get(IPPROTO_UDP,
68 iph->daddr, 0,
69 iph->saddr, 0);
70 }
71
72 return cp;
73#else
74 return NULL;
75#endif
76}
77
78static int
79icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
80 int *verdict, struct ip_vs_conn **cpp)
81{
82 *verdict = NF_ACCEPT;
83 return 0;
84}
85
86static int
87icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
88{
89 if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
90 if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
91 if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
92 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
93 return 0;
94 }
95 }
96 }
97 return 1;
98}
99
100static void
101icmp_debug_packet(struct ip_vs_protocol *pp,
102 const struct sk_buff *skb,
103 int offset,
104 const char *msg)
105{
106 char buf[256];
107 struct iphdr _iph, *ih;
108
109 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
110 if (ih == NULL)
111 sprintf(buf, "%s TRUNCATED", pp->name);
112 else if (ih->frag_off & __constant_htons(IP_OFFSET))
113 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
114 pp->name, NIPQUAD(ih->saddr),
115 NIPQUAD(ih->daddr));
116 else {
117 struct icmphdr _icmph, *ic;
118
119 ic = skb_header_pointer(skb, offset + ih->ihl*4,
120 sizeof(_icmph), &_icmph);
121 if (ic == NULL)
122 sprintf(buf, "%s TRUNCATED to %u bytes\n",
123 pp->name, skb->len - offset);
124 else
125 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
126 pp->name, NIPQUAD(ih->saddr),
127 NIPQUAD(ih->daddr),
128 ic->type, ic->code);
129 }
130 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
131}
132
133static int
134icmp_state_transition(struct ip_vs_conn *cp, int direction,
135 const struct sk_buff *skb,
136 struct ip_vs_protocol *pp)
137{
138 cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
139 return 1;
140}
141
142static int
143icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
144{
145 int num;
146 char **names;
147
148 num = IP_VS_ICMP_S_LAST;
149 names = icmp_state_name_table;
150 return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
151}
152
153
154static void icmp_init(struct ip_vs_protocol *pp)
155{
156 pp->timeout_table = icmp_timeouts;
157}
158
159static void icmp_exit(struct ip_vs_protocol *pp)
160{
161}
162
163struct ip_vs_protocol ip_vs_protocol_icmp = {
164 .name = "ICMP",
165 .protocol = IPPROTO_ICMP,
166 .dont_defrag = 0,
167 .init = icmp_init,
168 .exit = icmp_exit,
169 .conn_schedule = icmp_conn_schedule,
170 .conn_in_get = icmp_conn_in_get,
171 .conn_out_get = icmp_conn_out_get,
172 .snat_handler = NULL,
173 .dnat_handler = NULL,
174 .csum_check = icmp_csum_check,
175 .state_transition = icmp_state_transition,
176 .register_app = NULL,
177 .unregister_app = NULL,
178 .app_conn_bind = NULL,
179 .debug_packet = icmp_debug_packet,
180 .timeout_change = NULL,
181 .set_state_timeout = icmp_set_state_timeout,
182};
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a32..574d1f509b46 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
839 839
840 ip_vs_sync_state |= state; 840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) { 841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn); 842 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
843 ip_vs_master_syncid = syncid; 843 ip_vs_master_syncid = syncid;
844 } else { 844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); 845 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid; 846 ip_vs_backup_syncid = syncid;
847 } 847 }
848 848
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index de21da00057f..a8512a3fd08a 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,6 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
127 127
128#define IP_VS_XMIT(skb, rt) \ 128#define IP_VS_XMIT(skb, rt) \
129do { \ 129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \ 130 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index 9349686131fc..c9cf8726051d 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -57,7 +58,6 @@ struct multipath_device {
57 58
58static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES]; 59static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
59static DEFINE_SPINLOCK(state_lock); 60static DEFINE_SPINLOCK(state_lock);
60static struct rtable *last_selection = NULL;
61 61
62static int inline __multipath_findslot(void) 62static int inline __multipath_findslot(void)
63{ 63{
@@ -111,11 +111,6 @@ struct notifier_block drr_dev_notifier = {
111 .notifier_call = drr_dev_event, 111 .notifier_call = drr_dev_event,
112}; 112};
113 113
114static void drr_remove(struct rtable *rt)
115{
116 if (last_selection == rt)
117 last_selection = NULL;
118}
119 114
120static void drr_safe_inc(atomic_t *usecount) 115static void drr_safe_inc(atomic_t *usecount)
121{ 116{
@@ -144,14 +139,6 @@ static void drr_select_route(const struct flowi *flp,
144 int devidx = -1; 139 int devidx = -1;
145 int cur_min_devidx = -1; 140 int cur_min_devidx = -1;
146 141
147 /* if necessary and possible utilize the old alternative */
148 if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
149 last_selection != NULL) {
150 result = last_selection;
151 *rp = result;
152 return;
153 }
154
155 /* 1. make sure all alt. nexthops have the same GC related data */ 142 /* 1. make sure all alt. nexthops have the same GC related data */
156 /* 2. determine the new candidate to be returned */ 143 /* 2. determine the new candidate to be returned */
157 result = NULL; 144 result = NULL;
@@ -229,12 +216,10 @@ static void drr_select_route(const struct flowi *flp,
229 } 216 }
230 217
231 *rp = result; 218 *rp = result;
232 last_selection = result;
233} 219}
234 220
235static struct ip_mp_alg_ops drr_ops = { 221static struct ip_mp_alg_ops drr_ops = {
236 .mp_alg_select_route = drr_select_route, 222 .mp_alg_select_route = drr_select_route,
237 .mp_alg_remove = drr_remove,
238}; 223};
239 224
240static int __init drr_init(void) 225static int __init drr_init(void)
@@ -244,7 +229,7 @@ static int __init drr_init(void)
244 if (err) 229 if (err)
245 return err; 230 return err;
246 231
247 err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR); 232 err = multipath_alg_register(&drr_ops, IP_MP_ALG_DRR);
248 if (err) 233 if (err)
249 goto fail; 234 goto fail;
250 235
@@ -263,3 +248,4 @@ static void __exit drr_exit(void)
263 248
264module_init(drr_init); 249module_init(drr_init);
265module_exit(drr_exit); 250module_exit(drr_exit);
251MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
index 805a16e47de5..5249dbe7c559 100644
--- a/net/ipv4/multipath_random.c
+++ b/net/ipv4/multipath_random.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -126,3 +127,4 @@ static void __exit random_exit(void)
126 127
127module_init(random_init); 128module_init(random_init);
128module_exit(random_exit); 129module_exit(random_exit);
130MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
index 554a82568160..b6cd2870478f 100644
--- a/net/ipv4/multipath_rr.c
+++ b/net/ipv4/multipath_rr.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -47,29 +48,12 @@
47#include <net/checksum.h> 48#include <net/checksum.h>
48#include <net/ip_mp_alg.h> 49#include <net/ip_mp_alg.h>
49 50
50#define MULTIPATH_MAX_CANDIDATES 40
51
52static struct rtable* last_used = NULL;
53
54static void rr_remove(struct rtable *rt)
55{
56 if (last_used == rt)
57 last_used = NULL;
58}
59
60static void rr_select_route(const struct flowi *flp, 51static void rr_select_route(const struct flowi *flp,
61 struct rtable *first, struct rtable **rp) 52 struct rtable *first, struct rtable **rp)
62{ 53{
63 struct rtable *nh, *result, *min_use_cand = NULL; 54 struct rtable *nh, *result, *min_use_cand = NULL;
64 int min_use = -1; 55 int min_use = -1;
65 56
66 /* if necessary and possible utilize the old alternative */
67 if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
68 last_used != NULL) {
69 result = last_used;
70 goto out;
71 }
72
73 /* 1. make sure all alt. nexthops have the same GC related data 57 /* 1. make sure all alt. nexthops have the same GC related data
74 * 2. determine the new candidate to be returned 58 * 2. determine the new candidate to be returned
75 */ 59 */
@@ -90,15 +74,12 @@ static void rr_select_route(const struct flowi *flp,
90 if (!result) 74 if (!result)
91 result = first; 75 result = first;
92 76
93out:
94 last_used = result;
95 result->u.dst.__use++; 77 result->u.dst.__use++;
96 *rp = result; 78 *rp = result;
97} 79}
98 80
99static struct ip_mp_alg_ops rr_ops = { 81static struct ip_mp_alg_ops rr_ops = {
100 .mp_alg_select_route = rr_select_route, 82 .mp_alg_select_route = rr_select_route,
101 .mp_alg_remove = rr_remove,
102}; 83};
103 84
104static int __init rr_init(void) 85static int __init rr_init(void)
@@ -113,3 +94,4 @@ static void __exit rr_exit(void)
113 94
114module_init(rr_init); 95module_init(rr_init);
115module_exit(rr_exit); 96module_exit(rr_exit);
97MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
index c3d2ca1a6781..bd7d75b6abe0 100644
--- a/net/ipv4/multipath_wrandom.c
+++ b/net/ipv4/multipath_wrandom.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -342,3 +343,4 @@ static void __exit wrandom_exit(void)
342 343
343module_init(wrandom_init); 344module_init(wrandom_init);
344module_exit(wrandom_exit); 345module_exit(wrandom_exit);
346MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index df79f5ed6a0a..fa1634256680 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex);
60 60
61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
63#include <linux/netfilter_ipv4/lockhelp.h>
64#include <linux/netfilter_ipv4/listhelp.h> 63#include <linux/netfilter_ipv4/listhelp.h>
65 64
66struct arpt_table_info { 65struct arpt_table_info {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 3dbddd062605..a78a320eee08 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -26,7 +26,6 @@
26#include <net/checksum.h> 26#include <net/checksum.h>
27#include <net/udp.h> 27#include <net/udp.h>
28 28
29#include <linux/netfilter_ipv4/lockhelp.h>
30#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 29#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> 30#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
32 31
@@ -42,7 +41,7 @@ static char *conns[] = { "DATA ", "MESG ", "INDEX " };
42 41
43/* This is slow, but it's simple. --RR */ 42/* This is slow, but it's simple. --RR */
44static char amanda_buffer[65536]; 43static char amanda_buffer[65536];
45static DECLARE_LOCK(amanda_buffer_lock); 44static DEFINE_SPINLOCK(amanda_buffer_lock);
46 45
47unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, 46unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
48 enum ip_conntrack_info ctinfo, 47 enum ip_conntrack_info ctinfo,
@@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb,
76 return NF_ACCEPT; 75 return NF_ACCEPT;
77 } 76 }
78 77
79 LOCK_BH(&amanda_buffer_lock); 78 spin_lock_bh(&amanda_buffer_lock);
80 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); 79 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
81 data = amanda_buffer; 80 data = amanda_buffer;
82 data_limit = amanda_buffer + (*pskb)->len - dataoff; 81 data_limit = amanda_buffer + (*pskb)->len - dataoff;
@@ -134,7 +133,7 @@ static int help(struct sk_buff **pskb,
134 } 133 }
135 134
136out: 135out:
137 UNLOCK_BH(&amanda_buffer_lock); 136 spin_unlock_bh(&amanda_buffer_lock);
138 return ret; 137 return ret;
139} 138}
140 139
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 09e824622977..4b78ebeb6635 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -38,10 +38,10 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40 40
41/* This rwlock protects the main hash table, protocol/helper/expected 41/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/ 42 registrations, conntrack timers*/
43#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 43#define ASSERT_READ_LOCK(x)
44#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 44#define ASSERT_WRITE_LOCK(x)
45 45
46#include <linux/netfilter_ipv4/ip_conntrack.h> 46#include <linux/netfilter_ipv4/ip_conntrack.h>
47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -57,7 +57,7 @@
57#define DEBUGP(format, args...) 57#define DEBUGP(format, args...)
58#endif 58#endif
59 59
60DECLARE_RWLOCK(ip_conntrack_lock); 60DEFINE_RWLOCK(ip_conntrack_lock);
61 61
62/* ip_conntrack_standalone needs this */ 62/* ip_conntrack_standalone needs this */
63atomic_t ip_conntrack_count = ATOMIC_INIT(0); 63atomic_t ip_conntrack_count = ATOMIC_INIT(0);
@@ -147,7 +147,7 @@ static void destroy_expect(struct ip_conntrack_expect *exp)
147 147
148static void unlink_expect(struct ip_conntrack_expect *exp) 148static void unlink_expect(struct ip_conntrack_expect *exp)
149{ 149{
150 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 150 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
151 list_del(&exp->list); 151 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */ 152 /* Logically in destroy_expect, but we hold the lock here. */
153 exp->master->expecting--; 153 exp->master->expecting--;
@@ -157,9 +157,9 @@ static void expectation_timed_out(unsigned long ul_expect)
157{ 157{
158 struct ip_conntrack_expect *exp = (void *)ul_expect; 158 struct ip_conntrack_expect *exp = (void *)ul_expect;
159 159
160 WRITE_LOCK(&ip_conntrack_lock); 160 write_lock_bh(&ip_conntrack_lock);
161 unlink_expect(exp); 161 unlink_expect(exp);
162 WRITE_UNLOCK(&ip_conntrack_lock); 162 write_unlock_bh(&ip_conntrack_lock);
163 destroy_expect(exp); 163 destroy_expect(exp);
164} 164}
165 165
@@ -209,7 +209,7 @@ clean_from_lists(struct ip_conntrack *ct)
209 unsigned int ho, hr; 209 unsigned int ho, hr;
210 210
211 DEBUGP("clean_from_lists(%p)\n", ct); 211 DEBUGP("clean_from_lists(%p)\n", ct);
212 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 212 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
213 213
214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -240,7 +240,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
240 if (ip_conntrack_destroyed) 240 if (ip_conntrack_destroyed)
241 ip_conntrack_destroyed(ct); 241 ip_conntrack_destroyed(ct);
242 242
243 WRITE_LOCK(&ip_conntrack_lock); 243 write_lock_bh(&ip_conntrack_lock);
244 /* Expectations will have been removed in clean_from_lists, 244 /* Expectations will have been removed in clean_from_lists,
245 * except TFTP can create an expectation on the first packet, 245 * except TFTP can create an expectation on the first packet,
246 * before connection is in the list, so we need to clean here, 246 * before connection is in the list, so we need to clean here,
@@ -254,7 +254,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
254 } 254 }
255 255
256 CONNTRACK_STAT_INC(delete); 256 CONNTRACK_STAT_INC(delete);
257 WRITE_UNLOCK(&ip_conntrack_lock); 257 write_unlock_bh(&ip_conntrack_lock);
258 258
259 if (ct->master) 259 if (ct->master)
260 ip_conntrack_put(ct->master); 260 ip_conntrack_put(ct->master);
@@ -268,12 +268,12 @@ static void death_by_timeout(unsigned long ul_conntrack)
268{ 268{
269 struct ip_conntrack *ct = (void *)ul_conntrack; 269 struct ip_conntrack *ct = (void *)ul_conntrack;
270 270
271 WRITE_LOCK(&ip_conntrack_lock); 271 write_lock_bh(&ip_conntrack_lock);
272 /* Inside lock so preempt is disabled on module removal path. 272 /* Inside lock so preempt is disabled on module removal path.
273 * Otherwise we can get spurious warnings. */ 273 * Otherwise we can get spurious warnings. */
274 CONNTRACK_STAT_INC(delete_list); 274 CONNTRACK_STAT_INC(delete_list);
275 clean_from_lists(ct); 275 clean_from_lists(ct);
276 WRITE_UNLOCK(&ip_conntrack_lock); 276 write_unlock_bh(&ip_conntrack_lock);
277 ip_conntrack_put(ct); 277 ip_conntrack_put(ct);
278} 278}
279 279
@@ -282,7 +282,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
282 const struct ip_conntrack_tuple *tuple, 282 const struct ip_conntrack_tuple *tuple,
283 const struct ip_conntrack *ignored_conntrack) 283 const struct ip_conntrack *ignored_conntrack)
284{ 284{
285 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 285 ASSERT_READ_LOCK(&ip_conntrack_lock);
286 return tuplehash_to_ctrack(i) != ignored_conntrack 286 return tuplehash_to_ctrack(i) != ignored_conntrack
287 && ip_ct_tuple_equal(tuple, &i->tuple); 287 && ip_ct_tuple_equal(tuple, &i->tuple);
288} 288}
@@ -294,7 +294,7 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
294 struct ip_conntrack_tuple_hash *h; 294 struct ip_conntrack_tuple_hash *h;
295 unsigned int hash = hash_conntrack(tuple); 295 unsigned int hash = hash_conntrack(tuple);
296 296
297 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 297 ASSERT_READ_LOCK(&ip_conntrack_lock);
298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) { 298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { 299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
300 CONNTRACK_STAT_INC(found); 300 CONNTRACK_STAT_INC(found);
@@ -313,11 +313,11 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
313{ 313{
314 struct ip_conntrack_tuple_hash *h; 314 struct ip_conntrack_tuple_hash *h;
315 315
316 READ_LOCK(&ip_conntrack_lock); 316 read_lock_bh(&ip_conntrack_lock);
317 h = __ip_conntrack_find(tuple, ignored_conntrack); 317 h = __ip_conntrack_find(tuple, ignored_conntrack);
318 if (h) 318 if (h)
319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
320 READ_UNLOCK(&ip_conntrack_lock); 320 read_unlock_bh(&ip_conntrack_lock);
321 321
322 return h; 322 return h;
323} 323}
@@ -352,7 +352,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
352 IP_NF_ASSERT(!is_confirmed(ct)); 352 IP_NF_ASSERT(!is_confirmed(ct));
353 DEBUGP("Confirming conntrack %p\n", ct); 353 DEBUGP("Confirming conntrack %p\n", ct);
354 354
355 WRITE_LOCK(&ip_conntrack_lock); 355 write_lock_bh(&ip_conntrack_lock);
356 356
357 /* See if there's one in the list already, including reverse: 357 /* See if there's one in the list already, including reverse:
358 NAT could have grabbed it without realizing, since we're 358 NAT could have grabbed it without realizing, since we're
@@ -380,12 +380,12 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
380 atomic_inc(&ct->ct_general.use); 380 atomic_inc(&ct->ct_general.use);
381 set_bit(IPS_CONFIRMED_BIT, &ct->status); 381 set_bit(IPS_CONFIRMED_BIT, &ct->status);
382 CONNTRACK_STAT_INC(insert); 382 CONNTRACK_STAT_INC(insert);
383 WRITE_UNLOCK(&ip_conntrack_lock); 383 write_unlock_bh(&ip_conntrack_lock);
384 return NF_ACCEPT; 384 return NF_ACCEPT;
385 } 385 }
386 386
387 CONNTRACK_STAT_INC(insert_failed); 387 CONNTRACK_STAT_INC(insert_failed);
388 WRITE_UNLOCK(&ip_conntrack_lock); 388 write_unlock_bh(&ip_conntrack_lock);
389 389
390 return NF_DROP; 390 return NF_DROP;
391} 391}
@@ -398,9 +398,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
398{ 398{
399 struct ip_conntrack_tuple_hash *h; 399 struct ip_conntrack_tuple_hash *h;
400 400
401 READ_LOCK(&ip_conntrack_lock); 401 read_lock_bh(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack); 402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 READ_UNLOCK(&ip_conntrack_lock); 403 read_unlock_bh(&ip_conntrack_lock);
404 404
405 return h != NULL; 405 return h != NULL;
406} 406}
@@ -419,13 +419,13 @@ static int early_drop(struct list_head *chain)
419 struct ip_conntrack *ct = NULL; 419 struct ip_conntrack *ct = NULL;
420 int dropped = 0; 420 int dropped = 0;
421 421
422 READ_LOCK(&ip_conntrack_lock); 422 read_lock_bh(&ip_conntrack_lock);
423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); 423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
424 if (h) { 424 if (h) {
425 ct = tuplehash_to_ctrack(h); 425 ct = tuplehash_to_ctrack(h);
426 atomic_inc(&ct->ct_general.use); 426 atomic_inc(&ct->ct_general.use);
427 } 427 }
428 READ_UNLOCK(&ip_conntrack_lock); 428 read_unlock_bh(&ip_conntrack_lock);
429 429
430 if (!ct) 430 if (!ct)
431 return dropped; 431 return dropped;
@@ -508,7 +508,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
508 conntrack->timeout.data = (unsigned long)conntrack; 508 conntrack->timeout.data = (unsigned long)conntrack;
509 conntrack->timeout.function = death_by_timeout; 509 conntrack->timeout.function = death_by_timeout;
510 510
511 WRITE_LOCK(&ip_conntrack_lock); 511 write_lock_bh(&ip_conntrack_lock);
512 exp = find_expectation(tuple); 512 exp = find_expectation(tuple);
513 513
514 if (exp) { 514 if (exp) {
@@ -532,7 +532,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); 532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
533 533
534 atomic_inc(&ip_conntrack_count); 534 atomic_inc(&ip_conntrack_count);
535 WRITE_UNLOCK(&ip_conntrack_lock); 535 write_unlock_bh(&ip_conntrack_lock);
536 536
537 if (exp) { 537 if (exp) {
538 if (exp->expectfn) 538 if (exp->expectfn)
@@ -723,17 +723,17 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
723{ 723{
724 struct ip_conntrack_expect *i; 724 struct ip_conntrack_expect *i;
725 725
726 WRITE_LOCK(&ip_conntrack_lock); 726 write_lock_bh(&ip_conntrack_lock);
727 /* choose the the oldest expectation to evict */ 727 /* choose the the oldest expectation to evict */
728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { 728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) { 729 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i); 730 unlink_expect(i);
731 WRITE_UNLOCK(&ip_conntrack_lock); 731 write_unlock_bh(&ip_conntrack_lock);
732 destroy_expect(i); 732 destroy_expect(i);
733 return; 733 return;
734 } 734 }
735 } 735 }
736 WRITE_UNLOCK(&ip_conntrack_lock); 736 write_unlock_bh(&ip_conntrack_lock);
737} 737}
738 738
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) 739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
@@ -760,15 +760,11 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
760 exp->master->expecting++; 760 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list); 761 list_add(&exp->list, &ip_conntrack_expect_list);
762 762
763 if (exp->master->helper->timeout) { 763 init_timer(&exp->timeout);
764 init_timer(&exp->timeout); 764 exp->timeout.data = (unsigned long)exp;
765 exp->timeout.data = (unsigned long)exp; 765 exp->timeout.function = expectation_timed_out;
766 exp->timeout.function = expectation_timed_out; 766 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
767 exp->timeout.expires 767 add_timer(&exp->timeout);
768 = jiffies + exp->master->helper->timeout * HZ;
769 add_timer(&exp->timeout);
770 } else
771 exp->timeout.function = NULL;
772 768
773 CONNTRACK_STAT_INC(expect_create); 769 CONNTRACK_STAT_INC(expect_create);
774} 770}
@@ -808,7 +804,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
808 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); 804 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
809 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); 805 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
810 806
811 WRITE_LOCK(&ip_conntrack_lock); 807 write_lock_bh(&ip_conntrack_lock);
812 list_for_each_entry(i, &ip_conntrack_expect_list, list) { 808 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
813 if (expect_matches(i, expect)) { 809 if (expect_matches(i, expect)) {
814 /* Refresh timer: if it's dying, ignore.. */ 810 /* Refresh timer: if it's dying, ignore.. */
@@ -832,7 +828,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
832 ip_conntrack_expect_insert(expect); 828 ip_conntrack_expect_insert(expect);
833 ret = 0; 829 ret = 0;
834out: 830out:
835 WRITE_UNLOCK(&ip_conntrack_lock); 831 write_unlock_bh(&ip_conntrack_lock);
836 return ret; 832 return ret;
837} 833}
838 834
@@ -841,7 +837,7 @@ out:
841void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, 837void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
842 const struct ip_conntrack_tuple *newreply) 838 const struct ip_conntrack_tuple *newreply)
843{ 839{
844 WRITE_LOCK(&ip_conntrack_lock); 840 write_lock_bh(&ip_conntrack_lock);
845 /* Should be unconfirmed, so not in hash table yet */ 841 /* Should be unconfirmed, so not in hash table yet */
846 IP_NF_ASSERT(!is_confirmed(conntrack)); 842 IP_NF_ASSERT(!is_confirmed(conntrack));
847 843
@@ -851,15 +847,15 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
851 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 847 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
852 if (!conntrack->master && conntrack->expecting == 0) 848 if (!conntrack->master && conntrack->expecting == 0)
853 conntrack->helper = ip_ct_find_helper(newreply); 849 conntrack->helper = ip_ct_find_helper(newreply);
854 WRITE_UNLOCK(&ip_conntrack_lock); 850 write_unlock_bh(&ip_conntrack_lock);
855} 851}
856 852
857int ip_conntrack_helper_register(struct ip_conntrack_helper *me) 853int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
858{ 854{
859 BUG_ON(me->timeout == 0); 855 BUG_ON(me->timeout == 0);
860 WRITE_LOCK(&ip_conntrack_lock); 856 write_lock_bh(&ip_conntrack_lock);
861 list_prepend(&helpers, me); 857 list_prepend(&helpers, me);
862 WRITE_UNLOCK(&ip_conntrack_lock); 858 write_unlock_bh(&ip_conntrack_lock);
863 859
864 return 0; 860 return 0;
865} 861}
@@ -878,7 +874,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
878 struct ip_conntrack_expect *exp, *tmp; 874 struct ip_conntrack_expect *exp, *tmp;
879 875
880 /* Need write lock here, to delete helper. */ 876 /* Need write lock here, to delete helper. */
881 WRITE_LOCK(&ip_conntrack_lock); 877 write_lock_bh(&ip_conntrack_lock);
882 LIST_DELETE(&helpers, me); 878 LIST_DELETE(&helpers, me);
883 879
884 /* Get rid of expectations */ 880 /* Get rid of expectations */
@@ -893,7 +889,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
893 for (i = 0; i < ip_conntrack_htable_size; i++) 889 for (i = 0; i < ip_conntrack_htable_size; i++)
894 LIST_FIND_W(&ip_conntrack_hash[i], unhelp, 890 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
895 struct ip_conntrack_tuple_hash *, me); 891 struct ip_conntrack_tuple_hash *, me);
896 WRITE_UNLOCK(&ip_conntrack_lock); 892 write_unlock_bh(&ip_conntrack_lock);
897 893
898 /* Someone could be still looking at the helper in a bh. */ 894 /* Someone could be still looking at the helper in a bh. */
899 synchronize_net(); 895 synchronize_net();
@@ -925,14 +921,14 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
925 ct->timeout.expires = extra_jiffies; 921 ct->timeout.expires = extra_jiffies;
926 ct_add_counters(ct, ctinfo, skb); 922 ct_add_counters(ct, ctinfo, skb);
927 } else { 923 } else {
928 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
929 /* Need del_timer for race avoidance (may already be dying). */ 925 /* Need del_timer for race avoidance (may already be dying). */
930 if (del_timer(&ct->timeout)) { 926 if (del_timer(&ct->timeout)) {
931 ct->timeout.expires = jiffies + extra_jiffies; 927 ct->timeout.expires = jiffies + extra_jiffies;
932 add_timer(&ct->timeout); 928 add_timer(&ct->timeout);
933 } 929 }
934 ct_add_counters(ct, ctinfo, skb); 930 ct_add_counters(ct, ctinfo, skb);
935 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
936 } 932 }
937} 933}
938 934
@@ -940,10 +936,6 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
940struct sk_buff * 936struct sk_buff *
941ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) 937ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
942{ 938{
943#ifdef CONFIG_NETFILTER_DEBUG
944 unsigned int olddebug = skb->nf_debug;
945#endif
946
947 skb_orphan(skb); 939 skb_orphan(skb);
948 940
949 local_bh_disable(); 941 local_bh_disable();
@@ -953,12 +945,7 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
953 if (skb) { 945 if (skb) {
954 ip_send_check(skb->nh.iph); 946 ip_send_check(skb->nh.iph);
955 skb->nfcache |= NFC_ALTERED; 947 skb->nfcache |= NFC_ALTERED;
956#ifdef CONFIG_NETFILTER_DEBUG
957 /* Packet path as if nothing had happened. */
958 skb->nf_debug = olddebug;
959#endif
960 } 948 }
961
962 return skb; 949 return skb;
963} 950}
964 951
@@ -997,7 +984,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
997{ 984{
998 struct ip_conntrack_tuple_hash *h = NULL; 985 struct ip_conntrack_tuple_hash *h = NULL;
999 986
1000 WRITE_LOCK(&ip_conntrack_lock); 987 write_lock_bh(&ip_conntrack_lock);
1001 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { 988 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1002 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, 989 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1003 struct ip_conntrack_tuple_hash *, iter, data); 990 struct ip_conntrack_tuple_hash *, iter, data);
@@ -1009,7 +996,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1009 struct ip_conntrack_tuple_hash *, iter, data); 996 struct ip_conntrack_tuple_hash *, iter, data);
1010 if (h) 997 if (h)
1011 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 998 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1012 WRITE_UNLOCK(&ip_conntrack_lock); 999 write_unlock_bh(&ip_conntrack_lock);
1013 1000
1014 return h; 1001 return h;
1015} 1002}
@@ -1201,14 +1188,14 @@ int __init ip_conntrack_init(void)
1201 } 1188 }
1202 1189
1203 /* Don't NEED lock here, but good form anyway. */ 1190 /* Don't NEED lock here, but good form anyway. */
1204 WRITE_LOCK(&ip_conntrack_lock); 1191 write_lock_bh(&ip_conntrack_lock);
1205 for (i = 0; i < MAX_IP_CT_PROTO; i++) 1192 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1206 ip_ct_protos[i] = &ip_conntrack_generic_protocol; 1193 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1207 /* Sew in builtin protocols. */ 1194 /* Sew in builtin protocols. */
1208 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; 1195 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1209 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; 1196 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1210 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; 1197 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1211 WRITE_UNLOCK(&ip_conntrack_lock); 1198 write_unlock_bh(&ip_conntrack_lock);
1212 1199
1213 for (i = 0; i < ip_conntrack_htable_size; i++) 1200 for (i = 0; i < ip_conntrack_htable_size; i++)
1214 INIT_LIST_HEAD(&ip_conntrack_hash[i]); 1201 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
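The ip_conntrack_core.c hunks above are almost entirely a mechanical lock conversion: the private lockhelp.h primitives (DECLARE_LOCK/LOCK_BH, DECLARE_RWLOCK/READ_LOCK/WRITE_LOCK and the MUST_BE_*_LOCKED assertions) are replaced by the stock spinlock/rwlock API, and every acquisition now uses the _bh variant so that process-context users (the /proc seq_file readers, helper and protocol registration, module init) cannot deadlock on the same CPU against the conntrack softirq path. A minimal sketch of the before/after pattern, with invented names, purely for illustration:

#include <linux/spinlock.h>

/* illustrative only -- names are invented, not taken from the patch */
static DEFINE_RWLOCK(example_lock);          /* was: static DECLARE_RWLOCK(example_lock); */

static void example_update(void)
{
        write_lock_bh(&example_lock);        /* was: WRITE_LOCK(&example_lock);   */
        /* ... modify state shared with the packet-receive softirq ... */
        write_unlock_bh(&example_lock);      /* was: WRITE_UNLOCK(&example_lock); */
}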
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index dd86503aa788..fea6dd2a00b6 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -16,7 +16,6 @@
16#include <net/checksum.h> 16#include <net/checksum.h>
17#include <net/tcp.h> 17#include <net/tcp.h>
18 18
19#include <linux/netfilter_ipv4/lockhelp.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 19#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> 20#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
22#include <linux/moduleparam.h> 21#include <linux/moduleparam.h>
@@ -28,7 +27,7 @@ MODULE_DESCRIPTION("ftp connection tracking helper");
28/* This is slow, but it's simple. --RR */ 27/* This is slow, but it's simple. --RR */
29static char ftp_buffer[65536]; 28static char ftp_buffer[65536];
30 29
31static DECLARE_LOCK(ip_ftp_lock); 30static DEFINE_SPINLOCK(ip_ftp_lock);
32 31
33#define MAX_PORTS 8 32#define MAX_PORTS 8
34static int ports[MAX_PORTS]; 33static int ports[MAX_PORTS];
@@ -319,7 +318,7 @@ static int help(struct sk_buff **pskb,
319 } 318 }
320 datalen = (*pskb)->len - dataoff; 319 datalen = (*pskb)->len - dataoff;
321 320
322 LOCK_BH(&ip_ftp_lock); 321 spin_lock_bh(&ip_ftp_lock);
323 fb_ptr = skb_header_pointer(*pskb, dataoff, 322 fb_ptr = skb_header_pointer(*pskb, dataoff,
324 (*pskb)->len - dataoff, ftp_buffer); 323 (*pskb)->len - dataoff, ftp_buffer);
325 BUG_ON(fb_ptr == NULL); 324 BUG_ON(fb_ptr == NULL);
@@ -442,7 +441,7 @@ out_update_nl:
442 if (ends_in_nl) 441 if (ends_in_nl)
443 update_nl_seq(seq, ct_ftp_info,dir); 442 update_nl_seq(seq, ct_ftp_info,dir);
444 out: 443 out:
445 UNLOCK_BH(&ip_ftp_lock); 444 spin_unlock_bh(&ip_ftp_lock);
446 return ret; 445 return ret;
447} 446}
448 447
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 33cc7348b6ee..cd98772cc332 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -29,7 +29,6 @@
29#include <net/checksum.h> 29#include <net/checksum.h>
30#include <net/tcp.h> 30#include <net/tcp.h>
31 31
32#include <linux/netfilter_ipv4/lockhelp.h>
33#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 32#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
34#include <linux/netfilter_ipv4/ip_conntrack_irc.h> 33#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
35#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
@@ -41,7 +40,7 @@ static int max_dcc_channels = 8;
41static unsigned int dcc_timeout = 300; 40static unsigned int dcc_timeout = 300;
42/* This is slow, but it's simple. --RR */ 41/* This is slow, but it's simple. --RR */
43static char irc_buffer[65536]; 42static char irc_buffer[65536];
44static DECLARE_LOCK(irc_buffer_lock); 43static DEFINE_SPINLOCK(irc_buffer_lock);
45 44
46unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, 45unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
47 enum ip_conntrack_info ctinfo, 46 enum ip_conntrack_info ctinfo,
@@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb,
141 if (dataoff >= (*pskb)->len) 140 if (dataoff >= (*pskb)->len)
142 return NF_ACCEPT; 141 return NF_ACCEPT;
143 142
144 LOCK_BH(&irc_buffer_lock); 143 spin_lock_bh(&irc_buffer_lock);
145 ib_ptr = skb_header_pointer(*pskb, dataoff, 144 ib_ptr = skb_header_pointer(*pskb, dataoff,
146 (*pskb)->len - dataoff, irc_buffer); 145 (*pskb)->len - dataoff, irc_buffer);
147 BUG_ON(ib_ptr == NULL); 146 BUG_ON(ib_ptr == NULL);
@@ -237,7 +236,7 @@ static int help(struct sk_buff **pskb,
237 } /* while data < ... */ 236 } /* while data < ... */
238 237
239 out: 238 out:
240 UNLOCK_BH(&irc_buffer_lock); 239 spin_unlock_bh(&irc_buffer_lock);
241 return ret; 240 return ret;
242} 241}
243 242
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index ff8c34a860ff..31d75390bf12 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/netfilter_ipv4/ip_conntrack.h> 27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29#include <linux/netfilter_ipv4/lockhelp.h>
30 29
31#if 0 30#if 0
32#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) 31#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
@@ -35,7 +34,7 @@
35#endif 34#endif
36 35
37/* Protects conntrack->proto.sctp */ 36/* Protects conntrack->proto.sctp */
38static DECLARE_RWLOCK(sctp_lock); 37static DEFINE_RWLOCK(sctp_lock);
39 38
40/* FIXME: Examine ipfilter's timeouts and conntrack transitions more 39/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
41 closely. They're more complex. --RR 40 closely. They're more complex. --RR
@@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s,
199 DEBUGP(__FUNCTION__); 198 DEBUGP(__FUNCTION__);
200 DEBUGP("\n"); 199 DEBUGP("\n");
201 200
202 READ_LOCK(&sctp_lock); 201 read_lock_bh(&sctp_lock);
203 state = conntrack->proto.sctp.state; 202 state = conntrack->proto.sctp.state;
204 READ_UNLOCK(&sctp_lock); 203 read_unlock_bh(&sctp_lock);
205 204
206 return seq_printf(s, "%s ", sctp_conntrack_names[state]); 205 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
207} 206}
@@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
343 342
344 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; 343 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
345 for_each_sctp_chunk (skb, sch, _sch, offset, count) { 344 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
346 WRITE_LOCK(&sctp_lock); 345 write_lock_bh(&sctp_lock);
347 346
348 /* Special cases of Verification tag check (Sec 8.5.1) */ 347 /* Special cases of Verification tag check (Sec 8.5.1) */
349 if (sch->type == SCTP_CID_INIT) { 348 if (sch->type == SCTP_CID_INIT) {
350 /* Sec 8.5.1 (A) */ 349 /* Sec 8.5.1 (A) */
351 if (sh->vtag != 0) { 350 if (sh->vtag != 0) {
352 WRITE_UNLOCK(&sctp_lock); 351 write_unlock_bh(&sctp_lock);
353 return -1; 352 return -1;
354 } 353 }
355 } else if (sch->type == SCTP_CID_ABORT) { 354 } else if (sch->type == SCTP_CID_ABORT) {
@@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
357 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) 356 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
358 && !(sh->vtag == conntrack->proto.sctp.vtag 357 && !(sh->vtag == conntrack->proto.sctp.vtag
359 [1 - CTINFO2DIR(ctinfo)])) { 358 [1 - CTINFO2DIR(ctinfo)])) {
360 WRITE_UNLOCK(&sctp_lock); 359 write_unlock_bh(&sctp_lock);
361 return -1; 360 return -1;
362 } 361 }
363 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { 362 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
@@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
366 && !(sh->vtag == conntrack->proto.sctp.vtag 365 && !(sh->vtag == conntrack->proto.sctp.vtag
367 [1 - CTINFO2DIR(ctinfo)] 366 [1 - CTINFO2DIR(ctinfo)]
368 && (sch->flags & 1))) { 367 && (sch->flags & 1))) {
369 WRITE_UNLOCK(&sctp_lock); 368 write_unlock_bh(&sctp_lock);
370 return -1; 369 return -1;
371 } 370 }
372 } else if (sch->type == SCTP_CID_COOKIE_ECHO) { 371 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
373 /* Sec 8.5.1 (D) */ 372 /* Sec 8.5.1 (D) */
374 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { 373 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
375 WRITE_UNLOCK(&sctp_lock); 374 write_unlock_bh(&sctp_lock);
376 return -1; 375 return -1;
377 } 376 }
378 } 377 }
@@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
384 if (newconntrack == SCTP_CONNTRACK_MAX) { 383 if (newconntrack == SCTP_CONNTRACK_MAX) {
385 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", 384 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
386 CTINFO2DIR(ctinfo), sch->type, oldsctpstate); 385 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
387 WRITE_UNLOCK(&sctp_lock); 386 write_unlock_bh(&sctp_lock);
388 return -1; 387 return -1;
389 } 388 }
390 389
@@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
396 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), 395 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
397 sizeof(_inithdr), &_inithdr); 396 sizeof(_inithdr), &_inithdr);
398 if (ih == NULL) { 397 if (ih == NULL) {
399 WRITE_UNLOCK(&sctp_lock); 398 write_unlock_bh(&sctp_lock);
400 return -1; 399 return -1;
401 } 400 }
402 DEBUGP("Setting vtag %x for dir %d\n", 401 DEBUGP("Setting vtag %x for dir %d\n",
@@ -405,7 +404,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
405 } 404 }
406 405
407 conntrack->proto.sctp.state = newconntrack; 406 conntrack->proto.sctp.state = newconntrack;
408 WRITE_UNLOCK(&sctp_lock); 407 write_unlock_bh(&sctp_lock);
409 } 408 }
410 409
411 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); 410 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 721ddbf522b4..809dfed766d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -36,7 +36,6 @@
36#include <linux/netfilter_ipv4.h> 36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_conntrack.h> 37#include <linux/netfilter_ipv4/ip_conntrack.h>
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/lockhelp.h>
40 39
41#if 0 40#if 0
42#define DEBUGP printk 41#define DEBUGP printk
@@ -46,7 +45,7 @@
46#endif 45#endif
47 46
48/* Protects conntrack->proto.tcp */ 47/* Protects conntrack->proto.tcp */
49static DECLARE_RWLOCK(tcp_lock); 48static DEFINE_RWLOCK(tcp_lock);
50 49
51/* "Be conservative in what you do, 50/* "Be conservative in what you do,
52 be liberal in what you accept from others." 51 be liberal in what you accept from others."
@@ -330,9 +329,9 @@ static int tcp_print_conntrack(struct seq_file *s,
330{ 329{
331 enum tcp_conntrack state; 330 enum tcp_conntrack state;
332 331
333 READ_LOCK(&tcp_lock); 332 read_lock_bh(&tcp_lock);
334 state = conntrack->proto.tcp.state; 333 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock); 334 read_unlock_bh(&tcp_lock);
336 335
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]); 336 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338} 337}
@@ -738,14 +737,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
738 737
739 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); 738 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
740 739
741 WRITE_LOCK(&tcp_lock); 740 write_lock_bh(&tcp_lock);
742 /* 741 /*
743 * We have to worry for the ack in the reply packet only... 742 * We have to worry for the ack in the reply packet only...
744 */ 743 */
745 if (after(end, conntrack->proto.tcp.seen[dir].td_end)) 744 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
746 conntrack->proto.tcp.seen[dir].td_end = end; 745 conntrack->proto.tcp.seen[dir].td_end = end;
747 conntrack->proto.tcp.last_end = end; 746 conntrack->proto.tcp.last_end = end;
748 WRITE_UNLOCK(&tcp_lock); 747 write_unlock_bh(&tcp_lock);
749 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " 748 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
750 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 749 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
751 sender->td_end, sender->td_maxend, sender->td_maxwin, 750 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -857,7 +856,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
857 sizeof(_tcph), &_tcph); 856 sizeof(_tcph), &_tcph);
858 BUG_ON(th == NULL); 857 BUG_ON(th == NULL);
859 858
860 WRITE_LOCK(&tcp_lock); 859 write_lock_bh(&tcp_lock);
861 old_state = conntrack->proto.tcp.state; 860 old_state = conntrack->proto.tcp.state;
862 dir = CTINFO2DIR(ctinfo); 861 dir = CTINFO2DIR(ctinfo);
863 index = get_conntrack_index(th); 862 index = get_conntrack_index(th);
@@ -879,7 +878,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
879 * that the client cannot but retransmit its SYN and 878 * that the client cannot but retransmit its SYN and
880 * thus initiate a clean new session. 879 * thus initiate a clean new session.
881 */ 880 */
882 WRITE_UNLOCK(&tcp_lock); 881 write_unlock_bh(&tcp_lock);
883 if (LOG_INVALID(IPPROTO_TCP)) 882 if (LOG_INVALID(IPPROTO_TCP))
884 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 883 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
885 "ip_ct_tcp: killing out of sync session "); 884 "ip_ct_tcp: killing out of sync session ");
@@ -894,7 +893,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
894 conntrack->proto.tcp.last_end = 893 conntrack->proto.tcp.last_end =
895 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); 894 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
896 895
897 WRITE_UNLOCK(&tcp_lock); 896 write_unlock_bh(&tcp_lock);
898 if (LOG_INVALID(IPPROTO_TCP)) 897 if (LOG_INVALID(IPPROTO_TCP))
899 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 898 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
900 "ip_ct_tcp: invalid packet ignored "); 899 "ip_ct_tcp: invalid packet ignored ");
@@ -904,7 +903,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
904 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", 903 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
905 dir, get_conntrack_index(th), 904 dir, get_conntrack_index(th),
906 old_state); 905 old_state);
907 WRITE_UNLOCK(&tcp_lock); 906 write_unlock_bh(&tcp_lock);
908 if (LOG_INVALID(IPPROTO_TCP)) 907 if (LOG_INVALID(IPPROTO_TCP))
909 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 908 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
910 "ip_ct_tcp: invalid state "); 909 "ip_ct_tcp: invalid state ");
@@ -918,13 +917,13 @@ static int tcp_packet(struct ip_conntrack *conntrack,
918 conntrack->proto.tcp.seen[dir].td_end)) { 917 conntrack->proto.tcp.seen[dir].td_end)) {
919 /* Attempt to reopen a closed connection. 918 /* Attempt to reopen a closed connection.
920 * Delete this connection and look up again. */ 919 * Delete this connection and look up again. */
921 WRITE_UNLOCK(&tcp_lock); 920 write_unlock_bh(&tcp_lock);
922 if (del_timer(&conntrack->timeout)) 921 if (del_timer(&conntrack->timeout))
923 conntrack->timeout.function((unsigned long) 922 conntrack->timeout.function((unsigned long)
924 conntrack); 923 conntrack);
925 return -NF_REPEAT; 924 return -NF_REPEAT;
926 } else { 925 } else {
927 WRITE_UNLOCK(&tcp_lock); 926 write_unlock_bh(&tcp_lock);
928 if (LOG_INVALID(IPPROTO_TCP)) 927 if (LOG_INVALID(IPPROTO_TCP))
929 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 928 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
930 "ip_ct_tcp: invalid SYN"); 929 "ip_ct_tcp: invalid SYN");
@@ -949,7 +948,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
949 948
950 if (!tcp_in_window(&conntrack->proto.tcp, dir, index, 949 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
951 skb, iph, th)) { 950 skb, iph, th)) {
952 WRITE_UNLOCK(&tcp_lock); 951 write_unlock_bh(&tcp_lock);
953 return -NF_ACCEPT; 952 return -NF_ACCEPT;
954 } 953 }
955 in_window: 954 in_window:
@@ -972,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
972 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans 971 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
973 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans 972 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
974 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; 973 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
975 WRITE_UNLOCK(&tcp_lock); 974 write_unlock_bh(&tcp_lock);
976 975
977 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { 976 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
978 /* If only reply is a RST, we can consider ourselves not to 977 /* If only reply is a RST, we can consider ourselves not to
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 5bc28a224623..8c1eaba098d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,6 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
120 * and moreover root might send raw packets. 120 * and moreover root might send raw packets.
121 * FIXME: Source route IP option packets --RR */ 121 * FIXME: Source route IP option packets --RR */
122 if (hooknum == NF_IP_PRE_ROUTING 122 if (hooknum == NF_IP_PRE_ROUTING
123 && skb->ip_summed != CHECKSUM_UNNECESSARY
123 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, 124 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
124 skb->ip_summed == CHECKSUM_HW ? skb->csum 125 skb->ip_summed == CHECKSUM_HW ? skb->csum
125 : skb_checksum(skb, iph->ihl*4, udplen, 0))) { 126 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
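The single added line in udp_error() above means the software checksum pass is skipped whenever the skb is already marked CHECKSUM_UNNECESSARY, i.e. the checksum was verified by hardware or never needed (loopback); only otherwise does csum_tcpudp_magic() recompute it. A trivial kernel-style sketch of the new gate (hypothetical helper, not part of the patch):

#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>

/* hypothetical helper: should the UDP checksum be verified in software
 * on this hook at all? */
static int udp_csum_needs_check(const struct sk_buff *skb, unsigned int hooknum)
{
        return hooknum == NF_IP_PRE_ROUTING &&
               skb->ip_summed != CHECKSUM_UNNECESSARY;
}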
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 46ca45f74d85..42dc95102873 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -28,8 +28,8 @@
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
119 119
120static void *ct_seq_start(struct seq_file *seq, loff_t *pos) 120static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
121{ 121{
122 READ_LOCK(&ip_conntrack_lock); 122 read_lock_bh(&ip_conntrack_lock);
123 return ct_get_idx(seq, *pos); 123 return ct_get_idx(seq, *pos);
124} 124}
125 125
@@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
131 131
132static void ct_seq_stop(struct seq_file *s, void *v) 132static void ct_seq_stop(struct seq_file *s, void *v)
133{ 133{
134 READ_UNLOCK(&ip_conntrack_lock); 134 read_unlock_bh(&ip_conntrack_lock);
135} 135}
136 136
137static int ct_seq_show(struct seq_file *s, void *v) 137static int ct_seq_show(struct seq_file *s, void *v)
@@ -140,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); 140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
141 struct ip_conntrack_protocol *proto; 141 struct ip_conntrack_protocol *proto;
142 142
143 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 143 ASSERT_READ_LOCK(&ip_conntrack_lock);
144 IP_NF_ASSERT(conntrack); 144 IP_NF_ASSERT(conntrack);
145 145
146 /* we only want to print DIR_ORIGINAL */ 146 /* we only want to print DIR_ORIGINAL */
@@ -239,7 +239,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
239 239
240 /* strange seq_file api calls stop even if we fail, 240 /* strange seq_file api calls stop even if we fail,
241 * thus we need to grab lock since stop unlocks */ 241 * thus we need to grab lock since stop unlocks */
242 READ_LOCK(&ip_conntrack_lock); 242 read_lock_bh(&ip_conntrack_lock);
243 243
244 if (list_empty(e)) 244 if (list_empty(e))
245 return NULL; 245 return NULL;
@@ -256,6 +256,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
256{ 256{
257 struct list_head *e = v; 257 struct list_head *e = v;
258 258
259 ++*pos;
259 e = e->next; 260 e = e->next;
260 261
261 if (e == &ip_conntrack_expect_list) 262 if (e == &ip_conntrack_expect_list)
@@ -266,7 +267,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
266 267
267static void exp_seq_stop(struct seq_file *s, void *v) 268static void exp_seq_stop(struct seq_file *s, void *v)
268{ 269{
269 READ_UNLOCK(&ip_conntrack_lock); 270 read_unlock_bh(&ip_conntrack_lock);
270} 271}
271 272
272static int exp_seq_show(struct seq_file *s, void *v) 273static int exp_seq_show(struct seq_file *s, void *v)
@@ -920,22 +921,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
920{ 921{
921 int ret = 0; 922 int ret = 0;
922 923
923 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
924 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { 925 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
925 ret = -EBUSY; 926 ret = -EBUSY;
926 goto out; 927 goto out;
927 } 928 }
928 ip_ct_protos[proto->proto] = proto; 929 ip_ct_protos[proto->proto] = proto;
929 out: 930 out:
930 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
931 return ret; 932 return ret;
932} 933}
933 934
934void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) 935void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
935{ 936{
936 WRITE_LOCK(&ip_conntrack_lock); 937 write_lock_bh(&ip_conntrack_lock);
937 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; 938 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
938 WRITE_UNLOCK(&ip_conntrack_lock); 939 write_unlock_bh(&ip_conntrack_lock);
939 940
940 /* Somebody could be still looking at the proto in bh. */ 941 /* Somebody could be still looking at the proto in bh. */
941 synchronize_net(); 942 synchronize_net();
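Two smaller changes sit in the ip_conntrack_standalone.c hunks above besides the lock conversion: the ASSERT_READ_LOCK/ASSERT_WRITE_LOCK macros become no-ops now that lockhelp.h is gone, and exp_seq_next() finally advances *pos, which the seq_file core needs to keep the file offset in step with the iterator (without it, reads of the expectation list can repeat entries). A small self-contained sketch of the ->next convention, using a hypothetical list:

#include <linux/seq_file.h>
#include <linux/list.h>

static LIST_HEAD(example_list);              /* hypothetical list being dumped */

/* seq_file ->next callback: advance both the cursor and *pos, return NULL
 * when the end of the list is reached */
static void *example_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct list_head *e = v;

        ++*pos;                              /* the step the patch adds to exp_seq_next() */
        e = e->next;
        return e == &example_list ? NULL : e;
}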
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 9fc6f93af0dd..739b6dde1c82 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,8 +22,8 @@
22#include <linux/udp.h> 22#include <linux/udp.h>
23#include <linux/jhash.h> 23#include <linux/jhash.h>
24 24
25#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 25#define ASSERT_READ_LOCK(x)
26#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 26#define ASSERT_WRITE_LOCK(x)
27 27
28#include <linux/netfilter_ipv4/ip_conntrack.h> 28#include <linux/netfilter_ipv4/ip_conntrack.h>
29#include <linux/netfilter_ipv4/ip_conntrack_core.h> 29#include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -41,7 +41,7 @@
41#define DEBUGP(format, args...) 41#define DEBUGP(format, args...)
42#endif 42#endif
43 43
44DECLARE_RWLOCK(ip_nat_lock); 44DEFINE_RWLOCK(ip_nat_lock);
45 45
46/* Calculated at init based on memory size */ 46/* Calculated at init based on memory size */
47static unsigned int ip_nat_htable_size; 47static unsigned int ip_nat_htable_size;
@@ -65,9 +65,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
65 if (!(conn->status & IPS_NAT_DONE_MASK)) 65 if (!(conn->status & IPS_NAT_DONE_MASK))
66 return; 66 return;
67 67
68 WRITE_LOCK(&ip_nat_lock); 68 write_lock_bh(&ip_nat_lock);
69 list_del(&conn->nat.info.bysource); 69 list_del(&conn->nat.info.bysource);
70 WRITE_UNLOCK(&ip_nat_lock); 70 write_unlock_bh(&ip_nat_lock);
71} 71}
72 72
73/* We do checksum mangling, so if they were wrong before they're still 73/* We do checksum mangling, so if they were wrong before they're still
@@ -142,7 +142,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
142 unsigned int h = hash_by_src(tuple); 142 unsigned int h = hash_by_src(tuple);
143 struct ip_conntrack *ct; 143 struct ip_conntrack *ct;
144 144
145 READ_LOCK(&ip_nat_lock); 145 read_lock_bh(&ip_nat_lock);
146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) { 146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147 if (same_src(ct, tuple)) { 147 if (same_src(ct, tuple)) {
148 /* Copy source part from reply tuple. */ 148 /* Copy source part from reply tuple. */
@@ -151,12 +151,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
151 result->dst = tuple->dst; 151 result->dst = tuple->dst;
152 152
153 if (in_range(result, range)) { 153 if (in_range(result, range)) {
154 READ_UNLOCK(&ip_nat_lock); 154 read_unlock_bh(&ip_nat_lock);
155 return 1; 155 return 1;
156 } 156 }
157 } 157 }
158 } 158 }
159 READ_UNLOCK(&ip_nat_lock); 159 read_unlock_bh(&ip_nat_lock);
160 return 0; 160 return 0;
161} 161}
162 162
@@ -297,9 +297,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
297 unsigned int srchash 297 unsigned int srchash
298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] 298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299 .tuple); 299 .tuple);
300 WRITE_LOCK(&ip_nat_lock); 300 write_lock_bh(&ip_nat_lock);
301 list_add(&info->bysource, &bysource[srchash]); 301 list_add(&info->bysource, &bysource[srchash]);
302 WRITE_UNLOCK(&ip_nat_lock); 302 write_unlock_bh(&ip_nat_lock);
303 } 303 }
304 304
305 /* It's done. */ 305 /* It's done. */
@@ -474,23 +474,23 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
474{ 474{
475 int ret = 0; 475 int ret = 0;
476 476
477 WRITE_LOCK(&ip_nat_lock); 477 write_lock_bh(&ip_nat_lock);
478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { 478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
479 ret = -EBUSY; 479 ret = -EBUSY;
480 goto out; 480 goto out;
481 } 481 }
482 ip_nat_protos[proto->protonum] = proto; 482 ip_nat_protos[proto->protonum] = proto;
483 out: 483 out:
484 WRITE_UNLOCK(&ip_nat_lock); 484 write_unlock_bh(&ip_nat_lock);
485 return ret; 485 return ret;
486} 486}
487 487
488/* Noone stores the protocol anywhere; simply delete it. */ 488/* Noone stores the protocol anywhere; simply delete it. */
489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) 489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
490{ 490{
491 WRITE_LOCK(&ip_nat_lock); 491 write_lock_bh(&ip_nat_lock);
492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; 492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
493 WRITE_UNLOCK(&ip_nat_lock); 493 write_unlock_bh(&ip_nat_lock);
494 494
495 /* Someone could be still looking at the proto in a bh. */ 495 /* Someone could be still looking at the proto in a bh. */
496 synchronize_net(); 496 synchronize_net();
@@ -509,13 +509,13 @@ int __init ip_nat_init(void)
509 return -ENOMEM; 509 return -ENOMEM;
510 510
511 /* Sew in builtin protocols. */ 511 /* Sew in builtin protocols. */
512 WRITE_LOCK(&ip_nat_lock); 512 write_lock_bh(&ip_nat_lock);
513 for (i = 0; i < MAX_IP_NAT_PROTO; i++) 513 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
514 ip_nat_protos[i] = &ip_nat_unknown_protocol; 514 ip_nat_protos[i] = &ip_nat_unknown_protocol;
515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; 515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; 516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; 517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
518 WRITE_UNLOCK(&ip_nat_lock); 518 write_unlock_bh(&ip_nat_lock);
519 519
520 for (i = 0; i < ip_nat_htable_size; i++) { 520 for (i = 0; i < ip_nat_htable_size; i++) {
521 INIT_LIST_HEAD(&bysource[i]); 521 INIT_LIST_HEAD(&bysource[i]);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 1637b96d8c01..158f34f32c04 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -28,8 +28,8 @@
28#include <net/tcp.h> 28#include <net/tcp.h>
29#include <net/udp.h> 29#include <net/udp.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -47,7 +47,7 @@
47#define DUMP_OFFSET(x) 47#define DUMP_OFFSET(x)
48#endif 48#endif
49 49
50static DECLARE_LOCK(ip_nat_seqofs_lock); 50static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
51 51
52/* Setup TCP sequence correction given this change at this sequence */ 52/* Setup TCP sequence correction given this change at this sequence */
53static inline void 53static inline void
@@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq,
70 DEBUGP("ip_nat_resize_packet: Seq_offset before: "); 70 DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
71 DUMP_OFFSET(this_way); 71 DUMP_OFFSET(this_way);
72 72
73 LOCK_BH(&ip_nat_seqofs_lock); 73 spin_lock_bh(&ip_nat_seqofs_lock);
74 74
75 /* SYN adjust. If it's uninitialized, or this is after last 75 /* SYN adjust. If it's uninitialized, or this is after last
76 * correction, record it: we don't handle more than one 76 * correction, record it: we don't handle more than one
@@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq,
82 this_way->offset_before = this_way->offset_after; 82 this_way->offset_before = this_way->offset_after;
83 this_way->offset_after += sizediff; 83 this_way->offset_after += sizediff;
84 } 84 }
85 UNLOCK_BH(&ip_nat_seqofs_lock); 85 spin_unlock_bh(&ip_nat_seqofs_lock);
86 86
87 DEBUGP("ip_nat_resize_packet: Seq_offset after: "); 87 DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
88 DUMP_OFFSET(this_way); 88 DUMP_OFFSET(this_way);
@@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
142 /* Transfer socket to new skb. */ 142 /* Transfer socket to new skb. */
143 if ((*pskb)->sk) 143 if ((*pskb)->sk)
144 skb_set_owner_w(nskb, (*pskb)->sk); 144 skb_set_owner_w(nskb, (*pskb)->sk);
145#ifdef CONFIG_NETFILTER_DEBUG
146 nskb->nf_debug = (*pskb)->nf_debug;
147#endif
148 kfree_skb(*pskb); 145 kfree_skb(*pskb);
149 *pskb = nskb; 146 *pskb = nskb;
150 return 1; 147 return 1;
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 581f097f5a24..60d70fa41a15 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,8 +19,8 @@
19#include <net/route.h> 19#include <net/route.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21 21
22#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 22#define ASSERT_READ_LOCK(x)
23#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 23#define ASSERT_WRITE_LOCK(x)
24 24
25#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
26#include <linux/netfilter_ipv4/ip_nat.h> 26#include <linux/netfilter_ipv4/ip_nat.h>
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 79f56f662b33..bc59d0d6e89e 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -31,8 +31,8 @@
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33 33
34#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 34#define ASSERT_READ_LOCK(x)
35#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 35#define ASSERT_WRITE_LOCK(x)
36 36
37#include <linux/netfilter_ipv4/ip_nat.h> 37#include <linux/netfilter_ipv4/ip_nat.h>
38#include <linux/netfilter_ipv4/ip_nat_rule.h> 38#include <linux/netfilter_ipv4/ip_nat_rule.h>
@@ -373,7 +373,6 @@ static int init_or_cleanup(int init)
373 cleanup_rule_init: 373 cleanup_rule_init:
374 ip_nat_rule_cleanup(); 374 ip_nat_rule_cleanup();
375 cleanup_nothing: 375 cleanup_nothing:
376 MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
377 return ret; 376 return ret;
378} 377}
379 378
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index e5746b674413..eda1fba431a4 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -3,6 +3,7 @@
3 * communicating with userspace via netlink. 3 * communicating with userspace via netlink.
4 * 4 *
5 * (C) 2000-2002 James Morris <jmorris@intercode.com.au> 5 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
6 * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
6 * 7 *
7 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
@@ -17,6 +18,7 @@
17 * 2005-01-10: Added /proc counter for dropped packets; fixed so 18 * 2005-01-10: Added /proc counter for dropped packets; fixed so
18 * packets aren't delivered to user space if they're going 19 * packets aren't delivered to user space if they're going
19 * to be dropped. 20 * to be dropped.
21 * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte)
20 * 22 *
21 */ 23 */
22#include <linux/module.h> 24#include <linux/module.h>
@@ -71,7 +73,15 @@ static DECLARE_MUTEX(ipqnl_sem);
71static void 73static void
72ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) 74ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
73{ 75{
76 /* TCP input path (and probably other bits) assume to be called
77 * from softirq context, not from syscall, like ipq_issue_verdict is
78 * called. TCP input path deadlocks with locks taken from timer
79 * softirq, e.g. We therefore emulate this by local_bh_disable() */
80
81 local_bh_disable();
74 nf_reinject(entry->skb, entry->info, verdict); 82 nf_reinject(entry->skb, entry->info, verdict);
83 local_bh_enable();
84
75 kfree(entry); 85 kfree(entry);
76} 86}
77 87
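The ip_queue change is not a lock conversion but a deadlock fix, as the new comment explains: verdicts arrive through a netlink syscall, so ipq_issue_verdict() runs in process context, yet nf_reinject() feeds the packet back into code (the TCP input path, for one) that takes locks it normally only takes in softirq context. Wrapping the call in local_bh_disable()/local_bh_enable() makes the process-context path look like softirq context to those locks, so a timer softirq on the same CPU cannot interrupt it and deadlock. A generic sketch of the pattern (hypothetical wrapper, not part of the patch):

#include <linux/interrupt.h>

/* hypothetical wrapper: run fn() with bottom halves masked, so any
 * spin_lock() it takes cannot be preempted on this CPU by a softirq
 * that wants the same lock */
static void run_with_bh_disabled(void (*fn)(void *arg), void *arg)
{
        local_bh_disable();
        fn(arg);
        local_bh_enable();
}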
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a54f92b8496..c88dfcd38c56 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex);
67/* Must have mutex */ 67/* Must have mutex */
68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
70#include <linux/netfilter_ipv4/lockhelp.h>
71#include <linux/netfilter_ipv4/listhelp.h> 70#include <linux/netfilter_ipv4/listhelp.h>
72 71
73#if 0 72#if 0
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0f12e3a3dc73..9cde8c61f525 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,7 +29,6 @@
29#include <linux/netfilter_ipv4/ip_tables.h> 29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/lockhelp.h>
33 32
34#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.6"
35 34
@@ -41,6 +40,8 @@
41#define DEBUGP 40#define DEBUGP
42#endif 41#endif
43 42
43#define ASSERT_READ_LOCK(x)
44
44MODULE_LICENSE("GPL"); 45MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); 46MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
46MODULE_DESCRIPTION("iptables target for CLUSTERIP"); 47MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs);
67 68
68/* clusterip_lock protects the clusterip_configs list _AND_ the configurable 69/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
69 * data within all structurses (num_local_nodes, local_nodes[]) */ 70 * data within all structurses (num_local_nodes, local_nodes[]) */
70static DECLARE_RWLOCK(clusterip_lock); 71static DEFINE_RWLOCK(clusterip_lock);
71 72
72#ifdef CONFIG_PROC_FS 73#ifdef CONFIG_PROC_FS
73static struct file_operations clusterip_proc_fops; 74static struct file_operations clusterip_proc_fops;
@@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) {
82static inline void 83static inline void
83clusterip_config_put(struct clusterip_config *c) { 84clusterip_config_put(struct clusterip_config *c) {
84 if (atomic_dec_and_test(&c->refcount)) { 85 if (atomic_dec_and_test(&c->refcount)) {
85 WRITE_LOCK(&clusterip_lock); 86 write_lock_bh(&clusterip_lock);
86 list_del(&c->list); 87 list_del(&c->list);
87 WRITE_UNLOCK(&clusterip_lock); 88 write_unlock_bh(&clusterip_lock);
88 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); 89 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
89 dev_put(c->dev); 90 dev_put(c->dev);
90 kfree(c); 91 kfree(c);
@@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip)
97{ 98{
98 struct list_head *pos; 99 struct list_head *pos;
99 100
100 MUST_BE_READ_LOCKED(&clusterip_lock); 101 ASSERT_READ_LOCK(&clusterip_lock);
101 list_for_each(pos, &clusterip_configs) { 102 list_for_each(pos, &clusterip_configs) {
102 struct clusterip_config *c = list_entry(pos, 103 struct clusterip_config *c = list_entry(pos,
103 struct clusterip_config, list); 104 struct clusterip_config, list);
@@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip)
114{ 115{
115 struct clusterip_config *c; 116 struct clusterip_config *c;
116 117
117 READ_LOCK(&clusterip_lock); 118 read_lock_bh(&clusterip_lock);
118 c = __clusterip_config_find(clusterip); 119 c = __clusterip_config_find(clusterip);
119 if (!c) { 120 if (!c) {
120 READ_UNLOCK(&clusterip_lock); 121 read_unlock_bh(&clusterip_lock);
121 return NULL; 122 return NULL;
122 } 123 }
123 atomic_inc(&c->refcount); 124 atomic_inc(&c->refcount);
124 READ_UNLOCK(&clusterip_lock); 125 read_unlock_bh(&clusterip_lock);
125 126
126 return c; 127 return c;
127} 128}
@@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
160 c->pde->data = c; 161 c->pde->data = c;
161#endif 162#endif
162 163
163 WRITE_LOCK(&clusterip_lock); 164 write_lock_bh(&clusterip_lock);
164 list_add(&c->list, &clusterip_configs); 165 list_add(&c->list, &clusterip_configs);
165 WRITE_UNLOCK(&clusterip_lock); 166 write_unlock_bh(&clusterip_lock);
166 167
167 return c; 168 return c;
168} 169}
@@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
172{ 173{
173 int i; 174 int i;
174 175
175 WRITE_LOCK(&clusterip_lock); 176 write_lock_bh(&clusterip_lock);
176 177
177 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES 178 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
178 || nodenum > CLUSTERIP_MAX_NODES) { 179 || nodenum > CLUSTERIP_MAX_NODES) {
179 WRITE_UNLOCK(&clusterip_lock); 180 write_unlock_bh(&clusterip_lock);
180 return 1; 181 return 1;
181 } 182 }
182 183
183 /* check if we alrady have this number in our array */ 184 /* check if we alrady have this number in our array */
184 for (i = 0; i < c->num_local_nodes; i++) { 185 for (i = 0; i < c->num_local_nodes; i++) {
185 if (c->local_nodes[i] == nodenum) { 186 if (c->local_nodes[i] == nodenum) {
186 WRITE_UNLOCK(&clusterip_lock); 187 write_unlock_bh(&clusterip_lock);
187 return 1; 188 return 1;
188 } 189 }
189 } 190 }
190 191
191 c->local_nodes[c->num_local_nodes++] = nodenum; 192 c->local_nodes[c->num_local_nodes++] = nodenum;
192 193
193 WRITE_UNLOCK(&clusterip_lock); 194 write_unlock_bh(&clusterip_lock);
194 return 0; 195 return 0;
195} 196}
196 197
@@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
199{ 200{
200 int i; 201 int i;
201 202
202 WRITE_LOCK(&clusterip_lock); 203 write_lock_bh(&clusterip_lock);
203 204
204 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { 205 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
205 WRITE_UNLOCK(&clusterip_lock); 206 write_unlock_bh(&clusterip_lock);
206 return 1; 207 return 1;
207 } 208 }
208 209
@@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
211 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); 212 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
212 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); 213 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
213 c->num_local_nodes--; 214 c->num_local_nodes--;
214 WRITE_UNLOCK(&clusterip_lock); 215 write_unlock_bh(&clusterip_lock);
215 return 0; 216 return 0;
216 } 217 }
217 } 218 }
218 219
219 WRITE_UNLOCK(&clusterip_lock); 220 write_unlock_bh(&clusterip_lock);
220 return 1; 221 return 1;
221} 222}
222 223
@@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
286{ 287{
287 int i; 288 int i;
288 289
289 READ_LOCK(&clusterip_lock); 290 read_lock_bh(&clusterip_lock);
290 291
291 if (config->num_local_nodes == 0) { 292 if (config->num_local_nodes == 0) {
292 READ_UNLOCK(&clusterip_lock); 293 read_unlock_bh(&clusterip_lock);
293 return 0; 294 return 0;
294 } 295 }
295 296
296 for (i = 0; i < config->num_local_nodes; i++) { 297 for (i = 0; i < config->num_local_nodes; i++) {
297 if (config->local_nodes[i] == hash) { 298 if (config->local_nodes[i] == hash) {
298 READ_UNLOCK(&clusterip_lock); 299 read_unlock_bh(&clusterip_lock);
299 return 1; 300 return 1;
300 } 301 }
301 } 302 }
302 303
303 READ_UNLOCK(&clusterip_lock); 304 read_unlock_bh(&clusterip_lock);
304 305
305 return 0; 306 return 0;
306} 307}
@@ -338,7 +339,7 @@ target(struct sk_buff **pskb,
338 * error messages (RELATED) and information requests (see below) */ 339 * error messages (RELATED) and information requests (see below) */
339 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 340 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
340 && (ctinfo == IP_CT_RELATED 341 && (ctinfo == IP_CT_RELATED
341 || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) 342 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
342 return IPT_CONTINUE; 343 return IPT_CONTINUE;
343 344
344 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -578,7 +579,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
578 struct clusterip_config *c = pde->data; 579 struct clusterip_config *c = pde->data;
579 unsigned int *nodeidx; 580 unsigned int *nodeidx;
580 581
581 READ_LOCK(&clusterip_lock); 582 read_lock_bh(&clusterip_lock);
582 if (*pos >= c->num_local_nodes) 583 if (*pos >= c->num_local_nodes)
583 return NULL; 584 return NULL;
584 585
@@ -608,7 +609,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v)
608{ 609{
609 kfree(v); 610 kfree(v);
610 611
611 READ_UNLOCK(&clusterip_lock); 612 read_unlock_bh(&clusterip_lock);
612} 613}
613 614
614static int clusterip_seq_show(struct seq_file *s, void *v) 615static int clusterip_seq_show(struct seq_file *s, void *v)
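Besides the clusterip_lock conversion, the CLUSTERIP target picks up a genuine bug fix in its ICMP exception: conntrack info for a RELATED packet travelling in the reply direction is IP_CT_RELATED + IP_CT_IS_REPLY, while the old test compared against IP_CT_IS_REPLY + IP_CT_IS_REPLY, a value no packet ever carries, so reply-direction ICMP errors were never let through. A small illustrative helper (not in the patch) showing the intended predicate:

#include <linux/netfilter_ipv4/ip_conntrack.h>

/* illustrative: true for RELATED traffic in either direction, which is
 * what the CLUSTERIP ICMP exception wants to pass through unmodified */
static int ctinfo_is_related(enum ip_conntrack_info ctinfo)
{
        return ctinfo == IP_CT_RELATED ||
               ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY;
}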
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 57e9f6cf1c36..91e74502c3d3 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
33#endif 33#endif
34 34
35/* Lock protects masq region inside conntrack */ 35/* Lock protects masq region inside conntrack */
36static DECLARE_RWLOCK(masq_lock); 36static DEFINE_RWLOCK(masq_lock);
37 37
38/* FIXME: Multiple targets. --RR */ 38/* FIXME: Multiple targets. --RR */
39static int 39static int
@@ -103,9 +103,9 @@ masquerade_target(struct sk_buff **pskb,
103 return NF_DROP; 103 return NF_DROP;
104 } 104 }
105 105
106 WRITE_LOCK(&masq_lock); 106 write_lock_bh(&masq_lock);
107 ct->nat.masq_index = out->ifindex; 107 ct->nat.masq_index = out->ifindex;
108 WRITE_UNLOCK(&masq_lock); 108 write_unlock_bh(&masq_lock);
109 109
110 /* Transfer from original range. */ 110 /* Transfer from original range. */
111 newrange = ((struct ip_nat_range) 111 newrange = ((struct ip_nat_range)
@@ -122,9 +122,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex)
122{ 122{
123 int ret; 123 int ret;
124 124
125 READ_LOCK(&masq_lock); 125 read_lock_bh(&masq_lock);
126 ret = (i->nat.masq_index == (int)(long)ifindex); 126 ret = (i->nat.masq_index == (int)(long)ifindex);
127 READ_UNLOCK(&masq_lock); 127 read_unlock_bh(&masq_lock);
128 128
129 return ret; 129 return ret;
130} 130}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 266d64979286..915696446020 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
104static void send_reset(struct sk_buff *oldskb, int hook) 104static void send_reset(struct sk_buff *oldskb, int hook)
105{ 105{
106 struct sk_buff *nskb; 106 struct sk_buff *nskb;
107 struct iphdr *iph = oldskb->nh.iph;
107 struct tcphdr _otcph, *oth, *tcph; 108 struct tcphdr _otcph, *oth, *tcph;
108 struct rtable *rt; 109 struct rtable *rt;
109 u_int16_t tmp_port; 110 u_int16_t tmp_port;
110 u_int32_t tmp_addr; 111 u_int32_t tmp_addr;
112 unsigned int tcplen;
111 int needs_ack; 113 int needs_ack;
112 int hh_len; 114 int hh_len;
113 115
@@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
124 if (oth->rst) 126 if (oth->rst)
125 return; 127 return;
126 128
127 /* FIXME: Check checksum --RR */ 129 /* Check checksum */
130 tcplen = oldskb->len - iph->ihl * 4;
131 if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) ||
132 (hook == NF_IP_LOCAL_IN &&
133 oldskb->ip_summed != CHECKSUM_UNNECESSARY)) &&
134 csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
135 oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum :
136 skb_checksum(oldskb, iph->ihl * 4, tcplen, 0)))
137 return;
138
128 if ((rt = route_reverse(oldskb, oth, hook)) == NULL) 139 if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
129 return; 140 return;
130 141
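The ipt_REJECT change closes the long-standing FIXME: before fabricating a TCP RST, send_reset() now verifies the checksum of the original segment (unless hardware has already vouched for it via CHECKSUM_HW or CHECKSUM_UNNECESSARY), so a corrupted or forged packet can no longer provoke a reset. csum_tcpudp_magic() folds the IPv4 pseudo-header into the one's-complement sum that skb_checksum() accumulated over the TCP header and payload; for a valid segment the result folds to zero. A user-space sketch of that arithmetic, with hypothetical names (saddr/daddr are the numeric IPv4 addresses, seg/len cover the TCP header plus payload including the checksum field):

#include <stdint.h>
#include <stddef.h>

/* fold a 32-bit running sum down to 16 bits and complement it */
static uint16_t csum_fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

/* sketch of the check performed in the kernel by csum_tcpudp_magic() plus
 * skb_checksum(): returns 0 when the segment's checksum is correct */
static uint16_t tcp_check(uint32_t saddr, uint32_t daddr,
                          const uint8_t *seg, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        sum += (saddr >> 16) + (saddr & 0xffff);    /* pseudo-header: source      */
        sum += (daddr >> 16) + (daddr & 0xffff);    /* pseudo-header: destination */
        sum += 6 + (uint32_t)len;                   /* protocol (TCP) and length  */

        for (i = 0; i + 1 < len; i += 2)            /* big-endian 16-bit words    */
                sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
        if (len & 1)
                sum += (uint32_t)seg[len - 1] << 8; /* pad a trailing odd byte    */

        return csum_fold(sum);
}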
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6f2cefbe16cd..52a0076302a7 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -56,7 +56,6 @@
56#include <linux/netfilter.h> 56#include <linux/netfilter.h>
57#include <linux/netfilter_ipv4/ip_tables.h> 57#include <linux/netfilter_ipv4/ip_tables.h>
58#include <linux/netfilter_ipv4/ipt_ULOG.h> 58#include <linux/netfilter_ipv4/ipt_ULOG.h>
59#include <linux/netfilter_ipv4/lockhelp.h>
60#include <net/sock.h> 59#include <net/sock.h>
61#include <linux/bitops.h> 60#include <linux/bitops.h>
62 61
@@ -99,8 +98,8 @@ typedef struct {
99 98
100static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ 99static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
101 100
102static struct sock *nflognl; /* our socket */ 101static struct sock *nflognl; /* our socket */
103static DECLARE_LOCK(ulog_lock); /* spinlock */ 102static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
104 103
105/* send one ulog_buff_t to userspace */ 104/* send one ulog_buff_t to userspace */
106static void ulog_send(unsigned int nlgroupnum) 105static void ulog_send(unsigned int nlgroupnum)
@@ -135,9 +134,9 @@ static void ulog_timer(unsigned long data)
135 134
136 /* lock to protect against somebody modifying our structure 135 /* lock to protect against somebody modifying our structure
137 * from ipt_ulog_target at the same time */ 136 * from ipt_ulog_target at the same time */
138 LOCK_BH(&ulog_lock); 137 spin_lock_bh(&ulog_lock);
139 ulog_send(data); 138 ulog_send(data);
140 UNLOCK_BH(&ulog_lock); 139 spin_unlock_bh(&ulog_lock);
141} 140}
142 141
143static struct sk_buff *ulog_alloc_skb(unsigned int size) 142static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -193,7 +192,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
193 192
194 ub = &ulog_buffers[groupnum]; 193 ub = &ulog_buffers[groupnum];
195 194
196 LOCK_BH(&ulog_lock); 195 spin_lock_bh(&ulog_lock);
197 196
198 if (!ub->skb) { 197 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size))) 198 if (!(ub->skb = ulog_alloc_skb(size)))
@@ -278,7 +277,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
278 ulog_send(groupnum); 277 ulog_send(groupnum);
279 } 278 }
280 279
281 UNLOCK_BH(&ulog_lock); 280 spin_unlock_bh(&ulog_lock);
282 281
283 return; 282 return;
284 283
@@ -288,7 +287,7 @@ nlmsg_failure:
288alloc_failure: 287alloc_failure:
289 PRINTR("ipt_ULOG: Error building netlink message\n"); 288 PRINTR("ipt_ULOG: Error building netlink message\n");
290 289
291 UNLOCK_BH(&ulog_lock); 290 spin_unlock_bh(&ulog_lock);
292} 291}
293 292
294static unsigned int ipt_ulog_target(struct sk_buff **pskb, 293static unsigned int ipt_ulog_target(struct sk_buff **pskb,
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index f1937190cd77..564b49bfebcf 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/netfilter_ipv4/ip_tables.h> 38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ipt_hashlimit.h> 39#include <linux/netfilter_ipv4/ipt_hashlimit.h>
40#include <linux/netfilter_ipv4/lockhelp.h>
41 40
42/* FIXME: this is just for IP_NF_ASSERRT */ 41/* FIXME: this is just for IP_NF_ASSERRT */
43#include <linux/netfilter_ipv4/ip_conntrack.h> 42#include <linux/netfilter_ipv4/ip_conntrack.h>
@@ -92,7 +91,7 @@ struct ipt_hashlimit_htable {
92 struct hlist_head hash[0]; /* hashtable itself */ 91 struct hlist_head hash[0]; /* hashtable itself */
93}; 92};
94 93
95static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ 94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
96static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ 95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
97static HLIST_HEAD(hashlimit_htables); 96static HLIST_HEAD(hashlimit_htables);
98static kmem_cache_t *hashlimit_cachep; 97static kmem_cache_t *hashlimit_cachep;
@@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
233 hinfo->timer.function = htable_gc; 232 hinfo->timer.function = htable_gc;
234 add_timer(&hinfo->timer); 233 add_timer(&hinfo->timer);
235 234
236 LOCK_BH(&hashlimit_lock); 235 spin_lock_bh(&hashlimit_lock);
237 hlist_add_head(&hinfo->node, &hashlimit_htables); 236 hlist_add_head(&hinfo->node, &hashlimit_htables);
238 UNLOCK_BH(&hashlimit_lock); 237 spin_unlock_bh(&hashlimit_lock);
239 238
240 return 0; 239 return 0;
241} 240}
@@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
301 struct ipt_hashlimit_htable *hinfo; 300 struct ipt_hashlimit_htable *hinfo;
302 struct hlist_node *pos; 301 struct hlist_node *pos;
303 302
304 LOCK_BH(&hashlimit_lock); 303 spin_lock_bh(&hashlimit_lock);
305 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { 304 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
306 if (!strcmp(name, hinfo->pde->name)) { 305 if (!strcmp(name, hinfo->pde->name)) {
307 atomic_inc(&hinfo->use); 306 atomic_inc(&hinfo->use);
308 UNLOCK_BH(&hashlimit_lock); 307 spin_unlock_bh(&hashlimit_lock);
309 return hinfo; 308 return hinfo;
310 } 309 }
311 } 310 }
312 UNLOCK_BH(&hashlimit_lock); 311 spin_unlock_bh(&hashlimit_lock);
313 312
314 return NULL; 313 return NULL;
315} 314}
@@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
317static void htable_put(struct ipt_hashlimit_htable *hinfo) 316static void htable_put(struct ipt_hashlimit_htable *hinfo)
318{ 317{
319 if (atomic_dec_and_test(&hinfo->use)) { 318 if (atomic_dec_and_test(&hinfo->use)) {
320 LOCK_BH(&hashlimit_lock); 319 spin_lock_bh(&hashlimit_lock);
321 hlist_del(&hinfo->node); 320 hlist_del(&hinfo->node);
322 UNLOCK_BH(&hashlimit_lock); 321 spin_unlock_bh(&hashlimit_lock);
323 htable_destroy(hinfo); 322 htable_destroy(hinfo);
324 } 323 }
325} 324}
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 33fdf364d3d3..3e7dd014de43 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
53 return ret; 53 return ret;
54 } 54 }
55 55
56 READ_LOCK(&ip_conntrack_lock); 56 read_lock_bh(&ip_conntrack_lock);
57 if (!ct->master->helper) { 57 if (!ct->master->helper) {
58 DEBUGP("ipt_helper: master ct %p has no helper\n", 58 DEBUGP("ipt_helper: master ct %p has no helper\n",
59 exp->expectant); 59 exp->expectant);
@@ -69,7 +69,7 @@ match(const struct sk_buff *skb,
69 ret ^= !strncmp(ct->master->helper->name, info->name, 69 ret ^= !strncmp(ct->master->helper->name, info->name,
70 strlen(ct->master->helper->name)); 70 strlen(ct->master->helper->name));
71out_unlock: 71out_unlock:
72 READ_UNLOCK(&ip_conntrack_lock); 72 read_unlock_bh(&ip_conntrack_lock);
73 return ret; 73 return ret;
74} 74}
75 75
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 25ab9fabdcba..2d44b07688af 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -223,7 +223,7 @@ static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned
223 curr_table->table[count].last_seen = 0; 223 curr_table->table[count].last_seen = 0;
224 curr_table->table[count].addr = 0; 224 curr_table->table[count].addr = 0;
225 curr_table->table[count].ttl = 0; 225 curr_table->table[count].ttl = 0;
226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
227 curr_table->table[count].oldest_pkt = 0; 227 curr_table->table[count].oldest_pkt = 0;
228 curr_table->table[count].time_pos = 0; 228 curr_table->table[count].time_pos = 0;
229 curr_table->time_info[count].position = count; 229 curr_table->time_info[count].position = count;
@@ -502,7 +502,7 @@ match(const struct sk_buff *skb,
502 location = time_info[curr_table->time_pos].position; 502 location = time_info[curr_table->time_pos].position;
503 hash_table[r_list[location].hash_entry] = -1; 503 hash_table[r_list[location].hash_entry] = -1;
504 hash_table[hash_result] = location; 504 hash_table[hash_result] = location;
505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
506 r_list[location].time_pos = curr_table->time_pos; 506 r_list[location].time_pos = curr_table->time_pos;
507 r_list[location].addr = addr; 507 r_list[location].addr = addr;
508 r_list[location].ttl = ttl; 508 r_list[location].ttl = ttl;
@@ -631,7 +631,7 @@ match(const struct sk_buff *skb,
631 r_list[location].last_seen = 0; 631 r_list[location].last_seen = 0;
632 r_list[location].addr = 0; 632 r_list[location].addr = 0;
633 r_list[location].ttl = 0; 633 r_list[location].ttl = 0;
634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
635 r_list[location].oldest_pkt = 0; 635 r_list[location].oldest_pkt = 0;
636 ans = !info->invert; 636 ans = !info->invert;
637 } 637 }
@@ -734,10 +734,10 @@ checkentry(const char *tablename,
734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); 734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
735#ifdef DEBUG 735#ifdef DEBUG
736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", 736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
737 sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); 737 sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
738#endif 738#endif
739 739
740 hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); 740 hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
741#ifdef DEBUG 741#ifdef DEBUG
742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); 742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
743#endif 743#endif
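
The ipt_recent fix changes the size passed to memset() and vmalloc() from sizeof(u_int32_t) to sizeof(unsigned long): the last_pkts storage holds unsigned long (jiffies) entries, so on a 64-bit kernel the old size covered only half of each array. A small userspace illustration of that kind of mismatch (hypothetical array, with unsigned int standing in for u_int32_t):

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned long pkts[16];

        memset(pkts, 0xff, sizeof(pkts));
        /* wrong element size: clears only half the bytes on an LP64 system */
        memset(pkts, 0, 16 * sizeof(unsigned int));
        printf("after bad memset:  pkts[15] = %lx\n", pkts[15]);

        /* correct: size the operation by the object itself */
        memset(pkts, 0, sizeof(pkts));
        printf("after good memset: pkts[15] = %lx\n", pkts[15]);
        return 0;
}
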
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5b1ec586bae6..d1835b1bc8c4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -259,7 +259,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
259 return 0; 259 return 0;
260} 260}
261 261
262static int raw_send_hdrinc(struct sock *sk, void *from, int length, 262static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
263 struct rtable *rt, 263 struct rtable *rt,
264 unsigned int flags) 264 unsigned int flags)
265{ 265{
@@ -298,7 +298,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, int length,
298 goto error_fault; 298 goto error_fault;
299 299
300 /* We don't modify invalid header */ 300 /* We don't modify invalid header */
301 if (length >= sizeof(*iph) && iph->ihl * 4 <= length) { 301 if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
302 if (!iph->saddr) 302 if (!iph->saddr)
303 iph->saddr = rt->rt_src; 303 iph->saddr = rt->rt_src;
304 iph->check = 0; 304 iph->check = 0;
@@ -332,7 +332,7 @@ static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
332 u8 __user *type = NULL; 332 u8 __user *type = NULL;
333 u8 __user *code = NULL; 333 u8 __user *code = NULL;
334 int probed = 0; 334 int probed = 0;
335 int i; 335 unsigned int i;
336 336
337 if (!msg->msg_iov) 337 if (!msg->msg_iov)
338 return; 338 return;
@@ -384,7 +384,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
384 int err; 384 int err;
385 385
386 err = -EMSGSIZE; 386 err = -EMSGSIZE;
387 if (len < 0 || len > 0xFFFF) 387 if (len > 0xFFFF)
388 goto out; 388 goto out;
389 389
390 /* 390 /*
@@ -514,7 +514,10 @@ done:
514 kfree(ipc.opt); 514 kfree(ipc.opt);
515 ip_rt_put(rt); 515 ip_rt_put(rt);
516 516
517out: return err < 0 ? err : len; 517out:
518 if (err < 0)
519 return err;
520 return len;
518 521
519do_confirm: 522do_confirm:
520 dst_confirm(&rt->u.dst); 523 dst_confirm(&rt->u.dst);
@@ -610,7 +613,10 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
610 copied = skb->len; 613 copied = skb->len;
611done: 614done:
612 skb_free_datagram(sk, skb); 615 skb_free_datagram(sk, skb);
613out: return err ? err : copied; 616out:
617 if (err)
618 return err;
619 return copied;
614} 620}
615 621
616static int raw_init(struct sock *sk) 622static int raw_init(struct sock *sk)
@@ -691,11 +697,11 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
691 struct sk_buff *skb; 697 struct sk_buff *skb;
692 int amount = 0; 698 int amount = 0;
693 699
694 spin_lock_irq(&sk->sk_receive_queue.lock); 700 spin_lock_bh(&sk->sk_receive_queue.lock);
695 skb = skb_peek(&sk->sk_receive_queue); 701 skb = skb_peek(&sk->sk_receive_queue);
696 if (skb != NULL) 702 if (skb != NULL)
697 amount = skb->len; 703 amount = skb->len;
698 spin_unlock_irq(&sk->sk_receive_queue.lock); 704 spin_unlock_bh(&sk->sk_receive_queue.lock);
699 return put_user(amount, (int __user *)arg); 705 return put_user(amount, (int __user *)arg);
700 } 706 }
701 707
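
In raw.c the payload length becomes size_t, so the explicit len < 0 test in raw_sendmsg() is dead code and is removed, and the header-length check multiplies by 4U so both operands of the comparison stay unsigned. The userspace sketch below (made-up values) shows the surprise that mixed signed/unsigned comparisons can produce, which is what the all-unsigned arithmetic avoids:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
        size_t length = 20;     /* e.g. a minimal IP header length */
        int hdrlen = -4;        /* a bogus signed intermediate value */

        if (hdrlen <= length)
                printf("comparison followed ordinary arithmetic\n");
        else
                printf("-4 was converted to %zu before comparing\n",
                       (size_t)hdrlen);
        return 0;
}
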
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a682d28e247b..80cf633d9f4a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1767,7 +1767,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
1767 struct in_device *in_dev, 1767 struct in_device *in_dev,
1768 u32 daddr, u32 saddr, u32 tos) 1768 u32 daddr, u32 saddr, u32 tos)
1769{ 1769{
1770 struct rtable* rth; 1770 struct rtable* rth = NULL;
1771 int err; 1771 int err;
1772 unsigned hash; 1772 unsigned hash;
1773 1773
@@ -1794,7 +1794,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
1794 u32 daddr, u32 saddr, u32 tos) 1794 u32 daddr, u32 saddr, u32 tos)
1795{ 1795{
1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1797 struct rtable* rth; 1797 struct rtable* rth = NULL;
1798 unsigned char hop, hopcount, lasthop; 1798 unsigned char hop, hopcount, lasthop;
1799 int err = -EINVAL; 1799 int err = -EINVAL;
1800 unsigned int hash; 1800 unsigned int hash;
@@ -2239,7 +2239,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
2239 struct net_device *dev_out, 2239 struct net_device *dev_out,
2240 unsigned flags) 2240 unsigned flags)
2241{ 2241{
2242 struct rtable *rth; 2242 struct rtable *rth = NULL;
2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); 2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2244 unsigned hash; 2244 unsigned hash;
2245 if (err == 0) { 2245 if (err == 0) {
@@ -2267,7 +2267,7 @@ static inline int ip_mkroute_output(struct rtable** rp,
2267 unsigned char hop; 2267 unsigned char hop;
2268 unsigned hash; 2268 unsigned hash;
2269 int err = -EINVAL; 2269 int err = -EINVAL;
2270 struct rtable *rth; 2270 struct rtable *rth = NULL;
2271 2271
2272 if (res->fi && res->fi->fib_nhs > 1) { 2272 if (res->fi && res->fi->fib_nhs > 1) {
2273 unsigned char hopcount = res->fi->fib_nhs; 2273 unsigned char hopcount = res->fi->fib_nhs;
@@ -2581,7 +2581,7 @@ int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2581} 2581}
2582 2582
2583static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 2583static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2584 int nowait) 2584 int nowait, unsigned int flags)
2585{ 2585{
2586 struct rtable *rt = (struct rtable*)skb->dst; 2586 struct rtable *rt = (struct rtable*)skb->dst;
2587 struct rtmsg *r; 2587 struct rtmsg *r;
@@ -2591,9 +2591,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2591#ifdef CONFIG_IP_MROUTE 2591#ifdef CONFIG_IP_MROUTE
2592 struct rtattr *eptr; 2592 struct rtattr *eptr;
2593#endif 2593#endif
2594 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); 2594 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2595 r = NLMSG_DATA(nlh); 2595 r = NLMSG_DATA(nlh);
2596 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2597 r->rtm_family = AF_INET; 2596 r->rtm_family = AF_INET;
2598 r->rtm_dst_len = 32; 2597 r->rtm_dst_len = 32;
2599 r->rtm_src_len = 0; 2598 r->rtm_src_len = 0;
@@ -2744,7 +2743,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2744 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; 2743 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2745 2744
2746 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2745 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2747 RTM_NEWROUTE, 0); 2746 RTM_NEWROUTE, 0, 0);
2748 if (!err) 2747 if (!err)
2749 goto out_free; 2748 goto out_free;
2750 if (err < 0) { 2749 if (err < 0) {
@@ -2781,8 +2780,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2781 continue; 2780 continue;
2782 skb->dst = dst_clone(&rt->u.dst); 2781 skb->dst = dst_clone(&rt->u.dst);
2783 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 2782 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2784 cb->nlh->nlmsg_seq, 2783 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2785 RTM_NEWROUTE, 1) <= 0) { 2784 1, NLM_F_MULTI) <= 0) {
2786 dst_release(xchg(&skb->dst, NULL)); 2785 dst_release(xchg(&skb->dst, NULL));
2787 rcu_read_unlock_bh(); 2786 rcu_read_unlock_bh();
2788 goto done; 2787 goto done;
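
rt_fill_info() now takes the netlink message flags as a parameter and builds the header with NLMSG_NEW() instead of NLMSG_PUT() followed by a manual assignment to nlh->nlmsg_flags; the dump path passes NLM_F_MULTI and single replies pass 0. A rough sketch of the pattern (fields abridged, names illustrative, not the full rt_fill_info() body):

#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>

static int fill_example(struct sk_buff *skb, u32 pid, u32 seq,
                        int event, unsigned int flags)
{
        unsigned char *b = skb->tail;
        struct nlmsghdr *nlh;
        struct rtmsg *r;

        /* NLMSG_NEW() reserves the header and records 'flags' in one step */
        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
        r = NLMSG_DATA(nlh);
        r->rtm_family = AF_INET;
        /* ... fill the remaining rtmsg fields and attributes ... */

        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}
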
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e923d2f021aa..72d014442185 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,10 +169,10 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170} 170}
171 171
172extern struct or_calltable or_ipv4; 172extern struct request_sock_ops tcp_request_sock_ops;
173 173
174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
175 struct open_request *req, 175 struct request_sock *req,
176 struct dst_entry *dst) 176 struct dst_entry *dst)
177{ 177{
178 struct tcp_sock *tp = tcp_sk(sk); 178 struct tcp_sock *tp = tcp_sk(sk);
@@ -182,7 +182,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
182 if (child) 182 if (child)
183 tcp_acceptq_queue(sk, req, child); 183 tcp_acceptq_queue(sk, req, child);
184 else 184 else
185 tcp_openreq_free(req); 185 reqsk_free(req);
186 186
187 return child; 187 return child;
188} 188}
@@ -190,10 +190,12 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
191 struct ip_options *opt) 191 struct ip_options *opt)
192{ 192{
193 struct inet_request_sock *ireq;
194 struct tcp_request_sock *treq;
193 struct tcp_sock *tp = tcp_sk(sk); 195 struct tcp_sock *tp = tcp_sk(sk);
194 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; 196 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
195 struct sock *ret = sk; 197 struct sock *ret = sk;
196 struct open_request *req; 198 struct request_sock *req;
197 int mss; 199 int mss;
198 struct rtable *rt; 200 struct rtable *rt;
199 __u8 rcv_wscale; 201 __u8 rcv_wscale;
@@ -209,19 +211,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
209 211
210 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); 212 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
211 213
212 req = tcp_openreq_alloc();
213 ret = NULL; 214 ret = NULL;
215 req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
214 if (!req) 216 if (!req)
215 goto out; 217 goto out;
216 218
217 req->rcv_isn = htonl(skb->h.th->seq) - 1; 219 ireq = inet_rsk(req);
218 req->snt_isn = cookie; 220 treq = tcp_rsk(req);
221 treq->rcv_isn = htonl(skb->h.th->seq) - 1;
222 treq->snt_isn = cookie;
219 req->mss = mss; 223 req->mss = mss;
220 req->rmt_port = skb->h.th->source; 224 ireq->rmt_port = skb->h.th->source;
221 req->af.v4_req.loc_addr = skb->nh.iph->daddr; 225 ireq->loc_addr = skb->nh.iph->daddr;
222 req->af.v4_req.rmt_addr = skb->nh.iph->saddr; 226 ireq->rmt_addr = skb->nh.iph->saddr;
223 req->class = &or_ipv4; /* for savety */ 227 ireq->opt = NULL;
224 req->af.v4_req.opt = NULL;
225 228
226 /* We throwed the options of the initial SYN away, so we hope 229 /* We throwed the options of the initial SYN away, so we hope
227 * the ACK carries the same options again (see RFC1122 4.2.3.8) 230 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -229,17 +232,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
229 if (opt && opt->optlen) { 232 if (opt && opt->optlen) {
230 int opt_size = sizeof(struct ip_options) + opt->optlen; 233 int opt_size = sizeof(struct ip_options) + opt->optlen;
231 234
232 req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC); 235 ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
233 if (req->af.v4_req.opt) { 236 if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
234 if (ip_options_echo(req->af.v4_req.opt, skb)) { 237 kfree(ireq->opt);
235 kfree(req->af.v4_req.opt); 238 ireq->opt = NULL;
236 req->af.v4_req.opt = NULL;
237 }
238 } 239 }
239 } 240 }
240 241
241 req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; 242 ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
242 req->wscale_ok = req->sack_ok = 0; 243 ireq->wscale_ok = ireq->sack_ok = 0;
243 req->expires = 0UL; 244 req->expires = 0UL;
244 req->retrans = 0; 245 req->retrans = 0;
245 246
@@ -253,15 +254,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
253 struct flowi fl = { .nl_u = { .ip4_u = 254 struct flowi fl = { .nl_u = { .ip4_u =
254 { .daddr = ((opt && opt->srr) ? 255 { .daddr = ((opt && opt->srr) ?
255 opt->faddr : 256 opt->faddr :
256 req->af.v4_req.rmt_addr), 257 ireq->rmt_addr),
257 .saddr = req->af.v4_req.loc_addr, 258 .saddr = ireq->loc_addr,
258 .tos = RT_CONN_FLAGS(sk) } }, 259 .tos = RT_CONN_FLAGS(sk) } },
259 .proto = IPPROTO_TCP, 260 .proto = IPPROTO_TCP,
260 .uli_u = { .ports = 261 .uli_u = { .ports =
261 { .sport = skb->h.th->dest, 262 { .sport = skb->h.th->dest,
262 .dport = skb->h.th->source } } }; 263 .dport = skb->h.th->source } } };
263 if (ip_route_output_key(&rt, &fl)) { 264 if (ip_route_output_key(&rt, &fl)) {
264 tcp_openreq_free(req); 265 reqsk_free(req);
265 goto out; 266 goto out;
266 } 267 }
267 } 268 }
@@ -272,7 +273,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
272 &req->rcv_wnd, &req->window_clamp, 273 &req->rcv_wnd, &req->window_clamp,
273 0, &rcv_wscale); 274 0, &rcv_wscale);
274 /* BTW win scale with syncookies is 0 by definition */ 275 /* BTW win scale with syncookies is 0 by definition */
275 req->rcv_wscale = rcv_wscale; 276 ireq->rcv_wscale = rcv_wscale;
276 277
277 ret = get_cookie_sock(sk, skb, req, &rt->u.dst); 278 ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
278out: return ret; 279out: return ret;
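
The syncookie path is converted from struct open_request to the new request_sock family: reqsk_alloc() returns a struct request_sock, and inet_rsk()/tcp_rsk() give typed views of the same object because an inet_request_sock begins with a request_sock and a tcp_request_sock begins with an inet_request_sock. An abridged sketch of that layering (a guess at the shape only; the real definitions carry more fields and live in the request_sock and TCP headers):

#include <linux/types.h>

struct request_sock_ops;
struct ip_options;

struct request_sock {
        struct request_sock *dl_next;      /* SYN-queue hash chain */
        u16 mss;
        u8 retrans;
        u32 rcv_wnd;
        u32 window_clamp;
        unsigned long expires;
        struct request_sock_ops *rsk_ops;
};

struct inet_request_sock {
        struct request_sock req;           /* must stay first */
        u32 loc_addr;
        u32 rmt_addr;
        u16 rmt_port;
        struct ip_options *opt;
};

struct tcp_request_sock {
        struct inet_request_sock req;      /* must stay first */
        u32 rcv_isn;
        u32 snt_isn;
};

static inline struct inet_request_sock *inet_rsk(struct request_sock *sk)
{
        return (struct inet_request_sock *)sk;
}

static inline struct tcp_request_sock *tcp_rsk(struct request_sock *sk)
{
        return (struct tcp_request_sock *)sk;
}
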
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3aafb298c1c1..e32894532416 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -23,6 +23,7 @@ extern int sysctl_ip_nonlocal_bind;
23extern int sysctl_icmp_echo_ignore_all; 23extern int sysctl_icmp_echo_ignore_all;
24extern int sysctl_icmp_echo_ignore_broadcasts; 24extern int sysctl_icmp_echo_ignore_broadcasts;
25extern int sysctl_icmp_ignore_bogus_error_responses; 25extern int sysctl_icmp_ignore_bogus_error_responses;
26extern int sysctl_icmp_errors_use_inbound_ifaddr;
26 27
27/* From ip_fragment.c */ 28/* From ip_fragment.c */
28extern int sysctl_ipfrag_low_thresh; 29extern int sysctl_ipfrag_low_thresh;
@@ -117,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
117 return 1; 118 return 1;
118} 119}
119 120
121static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
122 void __user *buffer, size_t *lenp, loff_t *ppos)
123{
124 char val[TCP_CA_NAME_MAX];
125 ctl_table tbl = {
126 .data = val,
127 .maxlen = TCP_CA_NAME_MAX,
128 };
129 int ret;
130
131 tcp_get_default_congestion_control(val);
132
133 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
134 if (write && ret == 0)
135 ret = tcp_set_default_congestion_control(val);
136 return ret;
137}
138
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
140 void __user *oldval, size_t __user *oldlenp,
141 void __user *newval, size_t newlen,
142 void **context)
143{
144 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = {
146 .data = val,
147 .maxlen = TCP_CA_NAME_MAX,
148 };
149 int ret;
150
151 tcp_get_default_congestion_control(val);
152 ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
153 context);
154 if (ret == 0 && newval && newlen)
155 ret = tcp_set_default_congestion_control(val);
156 return ret;
157}
158
159
120ctl_table ipv4_table[] = { 160ctl_table ipv4_table[] = {
121 { 161 {
122 .ctl_name = NET_IPV4_TCP_TIMESTAMPS, 162 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -396,6 +436,14 @@ ctl_table ipv4_table[] = {
396 .proc_handler = &proc_dointvec 436 .proc_handler = &proc_dointvec
397 }, 437 },
398 { 438 {
439 .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
440 .procname = "icmp_errors_use_inbound_ifaddr",
441 .data = &sysctl_icmp_errors_use_inbound_ifaddr,
442 .maxlen = sizeof(int),
443 .mode = 0644,
444 .proc_handler = &proc_dointvec
445 },
446 {
399 .ctl_name = NET_IPV4_ROUTE, 447 .ctl_name = NET_IPV4_ROUTE,
400 .procname = "route", 448 .procname = "route",
401 .maxlen = 0, 449 .maxlen = 0,
@@ -603,70 +651,6 @@ ctl_table ipv4_table[] = {
603 .proc_handler = &proc_dointvec, 651 .proc_handler = &proc_dointvec,
604 }, 652 },
605 { 653 {
606 .ctl_name = NET_TCP_WESTWOOD,
607 .procname = "tcp_westwood",
608 .data = &sysctl_tcp_westwood,
609 .maxlen = sizeof(int),
610 .mode = 0644,
611 .proc_handler = &proc_dointvec,
612 },
613 {
614 .ctl_name = NET_TCP_VEGAS,
615 .procname = "tcp_vegas_cong_avoid",
616 .data = &sysctl_tcp_vegas_cong_avoid,
617 .maxlen = sizeof(int),
618 .mode = 0644,
619 .proc_handler = &proc_dointvec,
620 },
621 {
622 .ctl_name = NET_TCP_VEGAS_ALPHA,
623 .procname = "tcp_vegas_alpha",
624 .data = &sysctl_tcp_vegas_alpha,
625 .maxlen = sizeof(int),
626 .mode = 0644,
627 .proc_handler = &proc_dointvec,
628 },
629 {
630 .ctl_name = NET_TCP_VEGAS_BETA,
631 .procname = "tcp_vegas_beta",
632 .data = &sysctl_tcp_vegas_beta,
633 .maxlen = sizeof(int),
634 .mode = 0644,
635 .proc_handler = &proc_dointvec,
636 },
637 {
638 .ctl_name = NET_TCP_VEGAS_GAMMA,
639 .procname = "tcp_vegas_gamma",
640 .data = &sysctl_tcp_vegas_gamma,
641 .maxlen = sizeof(int),
642 .mode = 0644,
643 .proc_handler = &proc_dointvec,
644 },
645 {
646 .ctl_name = NET_TCP_BIC,
647 .procname = "tcp_bic",
648 .data = &sysctl_tcp_bic,
649 .maxlen = sizeof(int),
650 .mode = 0644,
651 .proc_handler = &proc_dointvec,
652 },
653 {
654 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
655 .procname = "tcp_bic_fast_convergence",
656 .data = &sysctl_tcp_bic_fast_convergence,
657 .maxlen = sizeof(int),
658 .mode = 0644,
659 .proc_handler = &proc_dointvec,
660 },
661 {
662 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
663 .procname = "tcp_bic_low_window",
664 .data = &sysctl_tcp_bic_low_window,
665 .maxlen = sizeof(int),
666 .mode = 0644,
667 .proc_handler = &proc_dointvec,
668 },
669 {
670 .ctl_name = NET_TCP_MODERATE_RCVBUF, 654 .ctl_name = NET_TCP_MODERATE_RCVBUF,
671 .procname = "tcp_moderate_rcvbuf", 655 .procname = "tcp_moderate_rcvbuf",
672 .data = &sysctl_tcp_moderate_rcvbuf, 656 .data = &sysctl_tcp_moderate_rcvbuf,
@@ -683,13 +667,14 @@ ctl_table ipv4_table[] = {
683 .proc_handler = &proc_dointvec, 667 .proc_handler = &proc_dointvec,
684 }, 668 },
685 { 669 {
686 .ctl_name = NET_TCP_BIC_BETA, 670 .ctl_name = NET_TCP_CONG_CONTROL,
687 .procname = "tcp_bic_beta", 671 .procname = "tcp_congestion_control",
688 .data = &sysctl_tcp_bic_beta,
689 .maxlen = sizeof(int),
690 .mode = 0644, 672 .mode = 0644,
691 .proc_handler = &proc_dointvec, 673 .maxlen = TCP_CA_NAME_MAX,
674 .proc_handler = &proc_tcp_congestion_control,
675 .strategy = &sysctl_tcp_congestion_control,
692 }, 676 },
677
693 { .ctl_name = 0 } 678 { .ctl_name = 0 }
694}; 679};
695 680
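
With the per-algorithm integer sysctls removed, the default algorithm is now selected through the new string sysctl net.ipv4.tcp_congestion_control, whose proc and sysctl(2) handlers hand the name to tcp_set_default_congestion_control(). A small userspace sketch of switching the default by writing the proc file implied by the table entry; the write needs root and fails if no algorithm of that name is available:

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/ipv4/tcp_congestion_control";
        char cur[16] = "";
        FILE *f = fopen(path, "r+");

        if (!f)
                return perror(path), 1;
        if (fgets(cur, sizeof(cur), f))
                printf("current default: %s", cur);
        rewind(f);                 /* required between the read and the write */
        fputs("bic\n", f);         /* pick one of the newly added algorithms */
        if (fclose(f) != 0)
                perror("write");
        return 0;
}
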
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a037bafcba3c..882436da9a3a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -271,7 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); 272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273 273
274kmem_cache_t *tcp_openreq_cachep;
275kmem_cache_t *tcp_bucket_cachep; 274kmem_cache_t *tcp_bucket_cachep;
276kmem_cache_t *tcp_timewait_cachep; 275kmem_cache_t *tcp_timewait_cachep;
277 276
@@ -317,7 +316,7 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure);
317static __inline__ unsigned int tcp_listen_poll(struct sock *sk, 316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318 poll_table *wait) 317 poll_table *wait)
319{ 318{
320 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0; 319 return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
321} 320}
322 321
323/* 322/*
@@ -463,28 +462,15 @@ int tcp_listen_start(struct sock *sk)
463{ 462{
464 struct inet_sock *inet = inet_sk(sk); 463 struct inet_sock *inet = inet_sk(sk);
465 struct tcp_sock *tp = tcp_sk(sk); 464 struct tcp_sock *tp = tcp_sk(sk);
466 struct tcp_listen_opt *lopt; 465 int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467 if (rc != 0)
468 return rc;
467 469
468 sk->sk_max_ack_backlog = 0; 470 sk->sk_max_ack_backlog = 0;
469 sk->sk_ack_backlog = 0; 471 sk->sk_ack_backlog = 0;
470 tp->accept_queue = tp->accept_queue_tail = NULL;
471 rwlock_init(&tp->syn_wait_lock);
472 tcp_delack_init(tp); 472 tcp_delack_init(tp);
473 473
474 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475 if (!lopt)
476 return -ENOMEM;
477
478 memset(lopt, 0, sizeof(struct tcp_listen_opt));
479 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481 break;
482 get_random_bytes(&lopt->hash_rnd, 4);
483
484 write_lock_bh(&tp->syn_wait_lock);
485 tp->listen_opt = lopt;
486 write_unlock_bh(&tp->syn_wait_lock);
487
488 /* There is race window here: we announce ourselves listening, 474 /* There is race window here: we announce ourselves listening,
489 * but this transition is still not validated by get_port(). 475 * but this transition is still not validated by get_port().
490 * It is OK, because this socket enters to hash table only 476 * It is OK, because this socket enters to hash table only
@@ -501,10 +487,7 @@ int tcp_listen_start(struct sock *sk)
501 } 487 }
502 488
503 sk->sk_state = TCP_CLOSE; 489 sk->sk_state = TCP_CLOSE;
504 write_lock_bh(&tp->syn_wait_lock); 490 reqsk_queue_destroy(&tp->accept_queue);
505 tp->listen_opt = NULL;
506 write_unlock_bh(&tp->syn_wait_lock);
507 kfree(lopt);
508 return -EADDRINUSE; 491 return -EADDRINUSE;
509} 492}
510 493
@@ -516,25 +499,23 @@ int tcp_listen_start(struct sock *sk)
516static void tcp_listen_stop (struct sock *sk) 499static void tcp_listen_stop (struct sock *sk)
517{ 500{
518 struct tcp_sock *tp = tcp_sk(sk); 501 struct tcp_sock *tp = tcp_sk(sk);
519 struct tcp_listen_opt *lopt = tp->listen_opt; 502 struct listen_sock *lopt;
520 struct open_request *acc_req = tp->accept_queue; 503 struct request_sock *acc_req;
521 struct open_request *req; 504 struct request_sock *req;
522 int i; 505 int i;
523 506
524 tcp_delete_keepalive_timer(sk); 507 tcp_delete_keepalive_timer(sk);
525 508
526 /* make all the listen_opt local to us */ 509 /* make all the listen_opt local to us */
527 write_lock_bh(&tp->syn_wait_lock); 510 lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
528 tp->listen_opt = NULL; 511 acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
529 write_unlock_bh(&tp->syn_wait_lock);
530 tp->accept_queue = tp->accept_queue_tail = NULL;
531 512
532 if (lopt->qlen) { 513 if (lopt->qlen) {
533 for (i = 0; i < TCP_SYNQ_HSIZE; i++) { 514 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534 while ((req = lopt->syn_table[i]) != NULL) { 515 while ((req = lopt->syn_table[i]) != NULL) {
535 lopt->syn_table[i] = req->dl_next; 516 lopt->syn_table[i] = req->dl_next;
536 lopt->qlen--; 517 lopt->qlen--;
537 tcp_openreq_free(req); 518 reqsk_free(req);
538 519
539 /* Following specs, it would be better either to send FIN 520 /* Following specs, it would be better either to send FIN
540 * (and enter FIN-WAIT-1, it is normal close) 521 * (and enter FIN-WAIT-1, it is normal close)
@@ -574,7 +555,7 @@ static void tcp_listen_stop (struct sock *sk)
574 sock_put(child); 555 sock_put(child);
575 556
576 sk_acceptq_removed(sk); 557 sk_acceptq_removed(sk);
577 tcp_openreq_fastfree(req); 558 __reqsk_free(req);
578 } 559 }
579 BUG_TRAP(!sk->sk_ack_backlog); 560 BUG_TRAP(!sk->sk_ack_backlog);
580} 561}
@@ -1345,7 +1326,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1345 1326
1346 cleanup_rbuf(sk, copied); 1327 cleanup_rbuf(sk, copied);
1347 1328
1348 if (tp->ucopy.task == user_recv) { 1329 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1349 /* Install new reader */ 1330 /* Install new reader */
1350 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { 1331 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351 user_recv = current; 1332 user_recv = current;
@@ -1868,11 +1849,11 @@ static int wait_for_connect(struct sock *sk, long timeo)
1868 prepare_to_wait_exclusive(sk->sk_sleep, &wait, 1849 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869 TASK_INTERRUPTIBLE); 1850 TASK_INTERRUPTIBLE);
1870 release_sock(sk); 1851 release_sock(sk);
1871 if (!tp->accept_queue) 1852 if (reqsk_queue_empty(&tp->accept_queue))
1872 timeo = schedule_timeout(timeo); 1853 timeo = schedule_timeout(timeo);
1873 lock_sock(sk); 1854 lock_sock(sk);
1874 err = 0; 1855 err = 0;
1875 if (tp->accept_queue) 1856 if (!reqsk_queue_empty(&tp->accept_queue))
1876 break; 1857 break;
1877 err = -EINVAL; 1858 err = -EINVAL;
1878 if (sk->sk_state != TCP_LISTEN) 1859 if (sk->sk_state != TCP_LISTEN)
@@ -1895,7 +1876,6 @@ static int wait_for_connect(struct sock *sk, long timeo)
1895struct sock *tcp_accept(struct sock *sk, int flags, int *err) 1876struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896{ 1877{
1897 struct tcp_sock *tp = tcp_sk(sk); 1878 struct tcp_sock *tp = tcp_sk(sk);
1898 struct open_request *req;
1899 struct sock *newsk; 1879 struct sock *newsk;
1900 int error; 1880 int error;
1901 1881
@@ -1906,37 +1886,31 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1906 */ 1886 */
1907 error = -EINVAL; 1887 error = -EINVAL;
1908 if (sk->sk_state != TCP_LISTEN) 1888 if (sk->sk_state != TCP_LISTEN)
1909 goto out; 1889 goto out_err;
1910 1890
1911 /* Find already established connection */ 1891 /* Find already established connection */
1912 if (!tp->accept_queue) { 1892 if (reqsk_queue_empty(&tp->accept_queue)) {
1913 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 1893 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914 1894
1915 /* If this is a non blocking socket don't sleep */ 1895 /* If this is a non blocking socket don't sleep */
1916 error = -EAGAIN; 1896 error = -EAGAIN;
1917 if (!timeo) 1897 if (!timeo)
1918 goto out; 1898 goto out_err;
1919 1899
1920 error = wait_for_connect(sk, timeo); 1900 error = wait_for_connect(sk, timeo);
1921 if (error) 1901 if (error)
1922 goto out; 1902 goto out_err;
1923 } 1903 }
1924 1904
1925 req = tp->accept_queue; 1905 newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1926 if ((tp->accept_queue = req->dl_next) == NULL)
1927 tp->accept_queue_tail = NULL;
1928
1929 newsk = req->sk;
1930 sk_acceptq_removed(sk);
1931 tcp_openreq_fastfree(req);
1932 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); 1906 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933 release_sock(sk);
1934 return newsk;
1935
1936out: 1907out:
1937 release_sock(sk); 1908 release_sock(sk);
1909 return newsk;
1910out_err:
1911 newsk = NULL;
1938 *err = error; 1912 *err = error;
1939 return NULL; 1913 goto out;
1940} 1914}
1941 1915
1942/* 1916/*
@@ -1953,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1953 return tp->af_specific->setsockopt(sk, level, optname, 1927 return tp->af_specific->setsockopt(sk, level, optname,
1954 optval, optlen); 1928 optval, optlen);
1955 1929
1930 /* This is a string value all the others are int's */
1931 if (optname == TCP_CONGESTION) {
1932 char name[TCP_CA_NAME_MAX];
1933
1934 if (optlen < 1)
1935 return -EINVAL;
1936
1937 val = strncpy_from_user(name, optval,
1938 min(TCP_CA_NAME_MAX-1, optlen));
1939 if (val < 0)
1940 return -EFAULT;
1941 name[val] = 0;
1942
1943 lock_sock(sk);
1944 err = tcp_set_congestion_control(tp, name);
1945 release_sock(sk);
1946 return err;
1947 }
1948
1956 if (optlen < sizeof(int)) 1949 if (optlen < sizeof(int))
1957 return -EINVAL; 1950 return -EINVAL;
1958 1951
@@ -2237,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2237 case TCP_QUICKACK: 2230 case TCP_QUICKACK:
2238 val = !tp->ack.pingpong; 2231 val = !tp->ack.pingpong;
2239 break; 2232 break;
2233
2234 case TCP_CONGESTION:
2235 if (get_user(len, optlen))
2236 return -EFAULT;
2237 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2238 if (put_user(len, optlen))
2239 return -EFAULT;
2240 if (copy_to_user(optval, tp->ca_ops->name, len))
2241 return -EFAULT;
2242 return 0;
2240 default: 2243 default:
2241 return -ENOPROTOOPT; 2244 return -ENOPROTOOPT;
2242 }; 2245 };
@@ -2250,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2250 2253
2251 2254
2252extern void __skb_cb_too_small_for_tcp(int, int); 2255extern void __skb_cb_too_small_for_tcp(int, int);
2253extern void tcpdiag_init(void); 2256extern struct tcp_congestion_ops tcp_reno;
2254 2257
2255static __initdata unsigned long thash_entries; 2258static __initdata unsigned long thash_entries;
2256static int __init set_thash_entries(char *str) 2259static int __init set_thash_entries(char *str)
@@ -2271,13 +2274,6 @@ void __init tcp_init(void)
2271 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), 2274 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272 sizeof(skb->cb)); 2275 sizeof(skb->cb));
2273 2276
2274 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275 sizeof(struct open_request),
2276 0, SLAB_HWCACHE_ALIGN,
2277 NULL, NULL);
2278 if (!tcp_openreq_cachep)
2279 panic("tcp_init: Cannot alloc open_request cache.");
2280
2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", 2277 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282 sizeof(struct tcp_bind_bucket), 2278 sizeof(struct tcp_bind_bucket),
2283 0, SLAB_HWCACHE_ALIGN, 2279 0, SLAB_HWCACHE_ALIGN,
@@ -2338,7 +2334,7 @@ void __init tcp_init(void)
2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); 2334 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339 order++) 2335 order++)
2340 ; 2336 ;
2341 if (order > 4) { 2337 if (order >= 4) {
2342 sysctl_local_port_range[0] = 32768; 2338 sysctl_local_port_range[0] = 32768;
2343 sysctl_local_port_range[1] = 61000; 2339 sysctl_local_port_range[1] = 61000;
2344 sysctl_tcp_max_tw_buckets = 180000; 2340 sysctl_tcp_max_tw_buckets = 180000;
@@ -2366,6 +2362,8 @@ void __init tcp_init(void)
2366 printk(KERN_INFO "TCP: Hash tables configured " 2362 printk(KERN_INFO "TCP: Hash tables configured "
2367 "(established %d bind %d)\n", 2363 "(established %d bind %d)\n",
2368 tcp_ehash_size << 1, tcp_bhash_size); 2364 tcp_ehash_size << 1, tcp_bhash_size);
2365
2366 tcp_register_congestion_control(&tcp_reno);
2369} 2367}
2370 2368
2371EXPORT_SYMBOL(tcp_accept); 2369EXPORT_SYMBOL(tcp_accept);
@@ -2374,7 +2372,6 @@ EXPORT_SYMBOL(tcp_destroy_sock);
2374EXPORT_SYMBOL(tcp_disconnect); 2372EXPORT_SYMBOL(tcp_disconnect);
2375EXPORT_SYMBOL(tcp_getsockopt); 2373EXPORT_SYMBOL(tcp_getsockopt);
2376EXPORT_SYMBOL(tcp_ioctl); 2374EXPORT_SYMBOL(tcp_ioctl);
2377EXPORT_SYMBOL(tcp_openreq_cachep);
2378EXPORT_SYMBOL(tcp_poll); 2375EXPORT_SYMBOL(tcp_poll);
2379EXPORT_SYMBOL(tcp_read_sock); 2376EXPORT_SYMBOL(tcp_read_sock);
2380EXPORT_SYMBOL(tcp_recvmsg); 2377EXPORT_SYMBOL(tcp_recvmsg);
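
Besides moving the listen and accept queues onto the new request_sock infrastructure, tcp_setsockopt()/tcp_getsockopt() gain a string-valued TCP_CONGESTION option so the algorithm can also be chosen per socket. A minimal userspace sketch; TCP_CONGESTION is assumed here to be 13, matching the kernel headers of this series, and the setsockopt() fails with ENOENT if "bic" is not available:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13       /* not yet present in older userspace headers */
#endif

int main(void)
{
        char name[16] = "";
        socklen_t len = sizeof(name);
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return perror("socket"), 1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bic", strlen("bic")) < 0)
                perror("setsockopt(TCP_CONGESTION)");
        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
                printf("socket is using: %s\n", name);
        return 0;
}
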
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000000..ec38d45d6649
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
1/*
2 * Binary Increase Congestion control for TCP
3 *
4 * This is from the implementation of BICTCP in
 5 * Lisong Xu, Khaled Harfoush, and Injong Rhee.
 6 * "Binary Increase Congestion Control for Fast, Long Distance
 7 * Networks" in IEEE INFOCOM 2004
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
10 *
11 * Unless BIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28
29static int fast_convergence = 1;
30static int max_increment = 32;
31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100;
36static int smooth_part = 20;
37
38module_param(fast_convergence, int, 0644);
39MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
40module_param(max_increment, int, 0644);
41MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
42module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative increase");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then go to low utilization mode (seconds)");
50module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644);
53MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
54
55
56/* BIC TCP Parameters */
57struct bictcp {
58 u32 cnt; /* increase cwnd by 1 after ACKs */
59 u32 last_max_cwnd; /* last maximum snd_cwnd */
60 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
71};
72
73static inline void bictcp_reset(struct bictcp *ca)
74{
75 ca->cnt = 0;
76 ca->last_max_cwnd = 0;
77 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0;
79 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87}
88
89static void bictcp_init(struct tcp_sock *tp)
90{
91 bictcp_reset(tcp_ca(tp));
92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh;
94}
95
96/*
97 * Compute congestion window to use.
98 */
99static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
100{
101 if (ca->last_cwnd == cwnd &&
102 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
103 return;
104
105 ca->last_cwnd = cwnd;
106 ca->last_time = tcp_time_stamp;
107
108 if (ca->epoch_start == 0) /* record the beginning of an epoch */
109 ca->epoch_start = tcp_time_stamp;
110
111 /* start off normal */
112 if (cwnd <= low_window) {
113 ca->cnt = cwnd;
114 return;
115 }
116
117 /* binary increase */
118 if (cwnd < ca->last_max_cwnd) {
119 __u32 dist = (ca->last_max_cwnd - cwnd)
120 / BICTCP_B;
121
122 if (dist > max_increment)
123 /* linear increase */
124 ca->cnt = cwnd / max_increment;
125 else if (dist <= 1U)
126 /* binary search increase */
127 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
128 else
129 /* binary search increase */
130 ca->cnt = cwnd / dist;
131 } else {
132		/* slow start and linear increase */
133 if (cwnd < ca->last_max_cwnd + BICTCP_B)
134 /* slow start */
135 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
136 else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
137 /* slow start */
138 ca->cnt = (cwnd * (BICTCP_B-1))
139 / cwnd-ca->last_max_cwnd;
140 else
141 /* linear increase */
142 ca->cnt = cwnd / max_increment;
143 }
144
145 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 ||
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20;
150 }
151
152 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
153 if (ca->cnt == 0) /* cannot be zero */
154 ca->cnt = 1;
155}
156
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
160{
161 struct bictcp *ca = tcp_ca(tp);
162 u32 dist, delay;
163
164 /* No time stamp */
165 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
166 /* Discard delay samples right after fast recovery */
167 tcp_time_stamp < ca->epoch_start + HZ ||
168 /* this delay samples may not be accurate */
169 flag == 0) {
170 ca->last_delay = 0;
171 goto notlow;
172 }
173
174 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
175 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
176 if (delay == 0) /* no previous delay sample */
177 goto notlow;
178
179 /* first time call or link delay decreases */
180 if (ca->delay_min == 0 || ca->delay_min > delay) {
181 ca->delay_min = ca->delay_max = delay;
182 goto notlow;
183 }
184
185 if (ca->delay_max < delay)
186 ca->delay_max = delay;
187
188 /* utilization is low, if avg delay < dist*threshold
189 for checking_period time */
190 dist = ca->delay_max - ca->delay_min;
191 if (dist <= ca->delay_min>>6 ||
192 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
193 goto notlow;
194
195 if (ca->low_utilization_start == 0) {
196 ca->low_utilization = 0;
197 ca->low_utilization_start = tcp_time_stamp;
198 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
199 > low_utilization_period*HZ) {
200 ca->low_utilization = 1;
201 }
202
203 return;
204
205 notlow:
206 ca->low_utilization = 0;
207 ca->low_utilization_start = 0;
208
209}
210
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked)
213{
214 struct bictcp *ca = tcp_ca(tp);
215
216 bictcp_low_utilization(tp, data_acked);
217
218 if (in_flight < tp->snd_cwnd)
219 return;
220
221 if (tp->snd_cwnd <= tp->snd_ssthresh) {
222 /* In "safe" area, increase. */
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 } else {
226 bictcp_update(ca, tp->snd_cwnd);
227
228 /* In dangerous area, increase slowly.
229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
230 */
231 if (tp->snd_cwnd_cnt >= ca->cnt) {
232 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
233 tp->snd_cwnd++;
234 tp->snd_cwnd_cnt = 0;
235 } else
236 tp->snd_cwnd_cnt++;
237 }
238
239}
240
241/*
242 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly
244 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
246{
247 struct bictcp *ca = tcp_ca(tp);
248
249 ca->epoch_start = 0; /* end of epoch */
250
251 /* in case of wrong delay_max*/
252 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
253 ca->delay_max = ca->delay_min
254 + ((ca->delay_max - ca->delay_min)* 90) / 100;
255
256 /* Wmax and fast convergence */
257 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
258 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
259 / (2 * BICTCP_BETA_SCALE);
260 else
261 ca->last_max_cwnd = tp->snd_cwnd;
262
263 ca->loss_cwnd = tp->snd_cwnd;
264
265
266 if (tp->snd_cwnd <= low_window)
267 return max(tp->snd_cwnd >> 1U, 2U);
268 else
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270}
271
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
273{
274 struct bictcp *ca = tcp_ca(tp);
275
276 return max(tp->snd_cwnd, ca->last_max_cwnd);
277}
278
279static u32 bictcp_min_cwnd(struct tcp_sock *tp)
280{
281 return tp->snd_ssthresh;
282}
283
284static void bictcp_state(struct tcp_sock *tp, u8 new_state)
285{
286 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp));
288}
289
290/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16
292 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
294{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
296 struct bictcp *ca = tcp_ca(tp);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt;
299 }
300}
301
302
303static struct tcp_congestion_ops bictcp = {
304 .init = bictcp_init,
305 .ssthresh = bictcp_recalc_ssthresh,
306 .cong_avoid = bictcp_cong_avoid,
307 .set_state = bictcp_state,
308 .undo_cwnd = bictcp_undo_cwnd,
309 .min_cwnd = bictcp_min_cwnd,
310 .pkts_acked = bictcp_acked,
311 .owner = THIS_MODULE,
312 .name = "bic",
313};
314
315static int __init bictcp_register(void)
316{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp);
319}
320
321static void __exit bictcp_unregister(void)
322{
323 tcp_unregister_congestion_control(&bictcp);
324}
325
326module_init(bictcp_register);
327module_exit(bictcp_unregister);
328
329MODULE_AUTHOR("Stephen Hemminger");
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("BIC TCP");
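
In bictcp_cong_avoid(), ca->cnt is the number of ACKed segments that must accumulate in snd_cwnd_cnt before snd_cwnd is raised by one, so setting cnt = cwnd / dist during the binary-search phase grows the window by roughly dist segments per RTT, clamped by max_increment. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned int cwnd = 200, last_max_cwnd = 400;
        unsigned int max_increment = 32;
        unsigned int dist = (last_max_cwnd - cwnd) / 4;   /* BICTCP_B == 4 -> 50 */
        unsigned int cnt;

        if (dist > max_increment)
                cnt = cwnd / max_increment;   /* clamp: 200 / 32 = 6 ACKs per step */
        else
                cnt = cwnd / dist;            /* binary-search growth */

        printf("cnt = %u ACKs per increment, ~%u segments added per RTT\n",
               cnt, cwnd / cnt);
        return 0;
}
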
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000000..4970d10a7785
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,237 @@
1/*
 2 * Pluggable TCP congestion control support and newReno
 3 * congestion control.
 4 * Based on ideas from I/O scheduler support and Web100.
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/mm.h>
12#include <linux/types.h>
13#include <linux/list.h>
14#include <net/tcp.h>
15
16static DEFINE_SPINLOCK(tcp_cong_list_lock);
17static LIST_HEAD(tcp_cong_list);
18
19/* Simple linear search, don't expect many entries! */
20static struct tcp_congestion_ops *tcp_ca_find(const char *name)
21{
22 struct tcp_congestion_ops *e;
23
24 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
25 if (strcmp(e->name, name) == 0)
26 return e;
27 }
28
29 return NULL;
30}
31
32/*
 33 * Attach a new congestion control algorithm to the list
34 * of available options.
35 */
36int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
37{
38 int ret = 0;
39
40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name);
44 return -EINVAL;
45 }
46
47 spin_lock(&tcp_cong_list_lock);
48 if (tcp_ca_find(ca->name)) {
49 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
50 ret = -EEXIST;
51 } else {
52 list_add_rcu(&ca->list, &tcp_cong_list);
53 printk(KERN_INFO "TCP %s registered\n", ca->name);
54 }
55 spin_unlock(&tcp_cong_list_lock);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
60
61/*
62 * Remove congestion control algorithm, called from
63 * the module's remove function. Module ref counts are used
64 * to ensure that this can't be done till all sockets using
65 * that method are closed.
66 */
67void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
68{
69 spin_lock(&tcp_cong_list_lock);
70 list_del_rcu(&ca->list);
71 spin_unlock(&tcp_cong_list_lock);
72}
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74
75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp)
77{
78 struct tcp_congestion_ops *ca;
79
80 if (tp->ca_ops != &tcp_init_congestion_ops)
81 return;
82
83 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca;
87 break;
88 }
89
90 }
91 rcu_read_unlock();
92
93 if (tp->ca_ops->init)
94 tp->ca_ops->init(tp);
95}
96
97/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp)
99{
100 if (tp->ca_ops->release)
101 tp->ca_ops->release(tp);
102 module_put(tp->ca_ops->owner);
103}
104
105/* Used by sysctl to change default congestion control */
106int tcp_set_default_congestion_control(const char *name)
107{
108 struct tcp_congestion_ops *ca;
109 int ret = -ENOENT;
110
111 spin_lock(&tcp_cong_list_lock);
112 ca = tcp_ca_find(name);
113#ifdef CONFIG_KMOD
114 if (!ca) {
115 spin_unlock(&tcp_cong_list_lock);
116
117 request_module("tcp_%s", name);
118 spin_lock(&tcp_cong_list_lock);
119 ca = tcp_ca_find(name);
120 }
121#endif
122
123 if (ca) {
124 list_move(&ca->list, &tcp_cong_list);
125 ret = 0;
126 }
127 spin_unlock(&tcp_cong_list_lock);
128
129 return ret;
130}
131
132/* Get current default congestion control */
133void tcp_get_default_congestion_control(char *name)
134{
135 struct tcp_congestion_ops *ca;
136 /* We will always have reno... */
137 BUG_ON(list_empty(&tcp_cong_list));
138
139 rcu_read_lock();
140 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
141 strncpy(name, ca->name, TCP_CA_NAME_MAX);
142 rcu_read_unlock();
143}
144
145/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
147{
148 struct tcp_congestion_ops *ca;
149 int err = 0;
150
151 rcu_read_lock();
152 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops)
154 goto out;
155
156 if (!ca)
157 err = -ENOENT;
158
159 else if (!try_module_get(ca->owner))
160 err = -EBUSY;
161
162 else {
163 tcp_cleanup_congestion_control(tp);
164 tp->ca_ops = ca;
165 if (tp->ca_ops->init)
166 tp->ca_ops->init(tp);
167 }
168 out:
169 rcu_read_unlock();
170 return err;
171}
172
173/*
174 * TCP Reno congestion control
175 * This is special case used for fallback as well.
176 */
177/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328.
179 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
181 int flag)
182{
183 if (in_flight < tp->snd_cwnd)
184 return;
185
186 if (tp->snd_cwnd <= tp->snd_ssthresh) {
187 /* In "safe" area, increase. */
188 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
189 tp->snd_cwnd++;
190 } else {
191 /* In dangerous area, increase slowly.
192 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
193 */
194 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
195 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
196 tp->snd_cwnd++;
197 tp->snd_cwnd_cnt = 0;
198 } else
199 tp->snd_cwnd_cnt++;
200 }
201}
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203
204/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp)
206{
207 return max(tp->snd_cwnd >> 1U, 2U);
208}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210
211/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
213{
214 return tp->snd_ssthresh/2;
215}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
217
218struct tcp_congestion_ops tcp_reno = {
219 .name = "reno",
220 .owner = THIS_MODULE,
221 .ssthresh = tcp_reno_ssthresh,
222 .cong_avoid = tcp_reno_cong_avoid,
223 .min_cwnd = tcp_reno_min_cwnd,
224};
225
226/* Initial congestion control used (until SYN)
227 * really Reno under another name so we can tell the difference
228 * during tcp_set_default_congestion_control
229 */
230struct tcp_congestion_ops tcp_init_congestion_ops = {
231 .name = "",
232 .owner = THIS_MODULE,
233 .ssthresh = tcp_reno_ssthresh,
234 .cong_avoid = tcp_reno_cong_avoid,
235 .min_cwnd = tcp_reno_min_cwnd,
236};
237EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
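
tcp_cong.c is the registration core that tcp_bic.c and the other new algorithm modules in this series plug into: an algorithm fills in a struct tcp_congestion_ops and registers it, with ssthresh, cong_avoid and min_cwnd as the required ops. A minimal sketch of such a module (hypothetical name, simply reusing the Reno helpers exported above) might look like this; once loaded it could be selected through the tcp_congestion_control sysctl or the TCP_CONGESTION socket option:

#include <linux/config.h>
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_example = {
        .name           = "example",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .min_cwnd       = tcp_reno_min_cwnd,
};

static int __init tcp_example_register(void)
{
        return tcp_register_congestion_control(&tcp_example);
}

static void __exit tcp_example_unregister(void)
{
        tcp_unregister_congestion_control(&tcp_example);
}

module_init(tcp_example_register);
module_exit(tcp_example_unregister);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example congestion control (illustrative sketch)");
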
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 8faa8948f75c..f66945cb158f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,15 +42,8 @@ struct tcpdiag_entry
42 42
43static struct sock *tcpnl; 43static struct sock *tcpnl;
44 44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 45#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \ 46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54 47
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 49 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
61 struct nlmsghdr *nlh; 54 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL; 55 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL; 56 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail; 57 unsigned char *b = skb->tail;
66 58
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); 59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
73 if (ext & (1<<(TCPDIAG_INFO-1))) 65 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); 66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75 67
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) 68 if (ext & (1<<(TCPDIAG_CONG-1))) {
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) 69 size_t len = strlen(tp->ca_ops->name);
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); 70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
79 } 73 }
80 r->tcpdiag_family = sk->sk_family; 74 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state; 75 r->tcpdiag_state = sk->sk_state;
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
166 if (info) 160 if (info)
167 tcp_get_info(sk, info); 161 tcp_get_info(sk, info);
168 162
169 if (vinfo) { 163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
170 if (tcp_is_vegas(tp)) { 164 tp->ca_ops->get_info(tp, ext, skb);
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182 165
183 nlh->nlmsg_len = skb->tail - b; 166 nlh->nlmsg_len = skb->tail - b;
184 return skb->len; 167 return skb->len;
185 168
169rtattr_failure:
186nlmsg_failure: 170nlmsg_failure:
187 skb_trim(skb, b - skb->data); 171 skb_trim(skb, b - skb->data);
188 return -1; 172 return -1;
@@ -455,9 +439,10 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
455} 439}
456 440
457static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, 441static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
458 struct open_request *req, 442 struct request_sock *req,
459 u32 pid, u32 seq) 443 u32 pid, u32 seq)
460{ 444{
445 const struct inet_request_sock *ireq = inet_rsk(req);
461 struct inet_sock *inet = inet_sk(sk); 446 struct inet_sock *inet = inet_sk(sk);
462 unsigned char *b = skb->tail; 447 unsigned char *b = skb->tail;
463 struct tcpdiagmsg *r; 448 struct tcpdiagmsg *r;
@@ -482,9 +467,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
482 tmo = 0; 467 tmo = 0;
483 468
484 r->id.tcpdiag_sport = inet->sport; 469 r->id.tcpdiag_sport = inet->sport;
485 r->id.tcpdiag_dport = req->rmt_port; 470 r->id.tcpdiag_dport = ireq->rmt_port;
486 r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr; 471 r->id.tcpdiag_src[0] = ireq->loc_addr;
487 r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr; 472 r->id.tcpdiag_dst[0] = ireq->rmt_addr;
488 r->tcpdiag_expires = jiffies_to_msecs(tmo), 473 r->tcpdiag_expires = jiffies_to_msecs(tmo),
489 r->tcpdiag_rqueue = 0; 474 r->tcpdiag_rqueue = 0;
490 r->tcpdiag_wqueue = 0; 475 r->tcpdiag_wqueue = 0;
@@ -493,9 +478,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
493#ifdef CONFIG_IP_TCPDIAG_IPV6 478#ifdef CONFIG_IP_TCPDIAG_IPV6
494 if (r->tcpdiag_family == AF_INET6) { 479 if (r->tcpdiag_family == AF_INET6) {
495 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, 480 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
496 &req->af.v6_req.loc_addr); 481 &tcp6_rsk(req)->loc_addr);
497 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, 482 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
498 &req->af.v6_req.rmt_addr); 483 &tcp6_rsk(req)->rmt_addr);
499 } 484 }
500#endif 485#endif
501 nlh->nlmsg_len = skb->tail - b; 486 nlh->nlmsg_len = skb->tail - b;
@@ -513,7 +498,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
513 struct tcpdiag_entry entry; 498 struct tcpdiag_entry entry;
514 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); 499 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
515 struct tcp_sock *tp = tcp_sk(sk); 500 struct tcp_sock *tp = tcp_sk(sk);
516 struct tcp_listen_opt *lopt; 501 struct listen_sock *lopt;
517 struct rtattr *bc = NULL; 502 struct rtattr *bc = NULL;
518 struct inet_sock *inet = inet_sk(sk); 503 struct inet_sock *inet = inet_sk(sk);
519 int j, s_j; 504 int j, s_j;
@@ -528,9 +513,9 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
528 513
529 entry.family = sk->sk_family; 514 entry.family = sk->sk_family;
530 515
531 read_lock_bh(&tp->syn_wait_lock); 516 read_lock_bh(&tp->accept_queue.syn_wait_lock);
532 517
533 lopt = tp->listen_opt; 518 lopt = tp->accept_queue.listen_opt;
534 if (!lopt || !lopt->qlen) 519 if (!lopt || !lopt->qlen)
535 goto out; 520 goto out;
536 521
@@ -541,13 +526,15 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
541 } 526 }
542 527
543 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { 528 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
544 struct open_request *req, *head = lopt->syn_table[j]; 529 struct request_sock *req, *head = lopt->syn_table[j];
545 530
546 reqnum = 0; 531 reqnum = 0;
547 for (req = head; req; reqnum++, req = req->dl_next) { 532 for (req = head; req; reqnum++, req = req->dl_next) {
533 struct inet_request_sock *ireq = inet_rsk(req);
534
548 if (reqnum < s_reqnum) 535 if (reqnum < s_reqnum)
549 continue; 536 continue;
550 if (r->id.tcpdiag_dport != req->rmt_port && 537 if (r->id.tcpdiag_dport != ireq->rmt_port &&
551 r->id.tcpdiag_dport) 538 r->id.tcpdiag_dport)
552 continue; 539 continue;
553 540
@@ -555,16 +542,16 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
555 entry.saddr = 542 entry.saddr =
556#ifdef CONFIG_IP_TCPDIAG_IPV6 543#ifdef CONFIG_IP_TCPDIAG_IPV6
557 (entry.family == AF_INET6) ? 544 (entry.family == AF_INET6) ?
558 req->af.v6_req.loc_addr.s6_addr32 : 545 tcp6_rsk(req)->loc_addr.s6_addr32 :
559#endif 546#endif
560 &req->af.v4_req.loc_addr; 547 &ireq->loc_addr;
561 entry.daddr = 548 entry.daddr =
562#ifdef CONFIG_IP_TCPDIAG_IPV6 549#ifdef CONFIG_IP_TCPDIAG_IPV6
563 (entry.family == AF_INET6) ? 550 (entry.family == AF_INET6) ?
564 req->af.v6_req.rmt_addr.s6_addr32 : 551 tcp6_rsk(req)->rmt_addr.s6_addr32 :
565#endif 552#endif
566 &req->af.v4_req.rmt_addr; 553 &ireq->rmt_addr;
567 entry.dport = ntohs(req->rmt_port); 554 entry.dport = ntohs(ireq->rmt_port);
568 555
569 if (!tcpdiag_bc_run(RTA_DATA(bc), 556 if (!tcpdiag_bc_run(RTA_DATA(bc),
570 RTA_PAYLOAD(bc), &entry)) 557 RTA_PAYLOAD(bc), &entry))
@@ -585,7 +572,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
585 } 572 }
586 573
587out: 574out:
588 read_unlock_bh(&tp->syn_wait_lock); 575 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
589 576
590 return err; 577 return err;
591} 578}
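
The tcp_diag hunks above move from the old open_request layout, where the address and port fields lived in a per-family af.v4_req/af.v6_req union, to the request_sock hierarchy: IPv4 fields are read through inet_rsk(req) and IPv6 fields through tcp6_rsk(req). As a rough orientation only, the sketch below shows the layering those accessors assume; the member list mirrors just the fields the hunks touch (loc_addr, rmt_addr, rmt_port), not the full kernel definitions, and the typedefs are user-space stand-ins.

/* Hypothetical, compilable stand-in for the request_sock layering used above. */
typedef unsigned int   u32;
typedef unsigned short u16;

struct request_sock {
	struct request_sock *dl_next;	/* SYN-queue chain, as walked in tcpdiag_dump_reqs() */
	/* ... retransmit/timer state and an ops pointer in the real struct ... */
};

struct inet_request_sock {
	struct request_sock req;	/* must be the first member for the cast below */
	u32 loc_addr;			/* local IPv4 address */
	u32 rmt_addr;			/* peer IPv4 address */
	u16 rmt_port;			/* peer port */
};

static inline struct inet_request_sock *inet_rsk(struct request_sock *sk)
{
	return (struct inet_request_sock *)sk;	/* valid only because req is first */
}

tcp6_rsk() plays the same role one level further down for the IPv6 address pair.
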
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000000..36c51f8136bf
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
1/*
2 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
3 *
4 * See http://www.icir.org/floyd/hstcp.html
5 *
6 * John Heffner <jheffner@psc.edu>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <net/tcp.h>
12
13
14/* From AIMD tables from RFC 3649 appendix B,
15 * with fixed-point MD scaled <<8.
16 */
17static const struct hstcp_aimd_val {
18 unsigned int cwnd;
19 unsigned int md;
20} hstcp_aimd_vals[] = {
21 { 38, 128, /* 0.50 */ },
22 { 118, 112, /* 0.44 */ },
23 { 221, 104, /* 0.41 */ },
24 { 347, 98, /* 0.38 */ },
25 { 495, 93, /* 0.37 */ },
26 { 663, 89, /* 0.35 */ },
27 { 851, 86, /* 0.34 */ },
28 { 1058, 83, /* 0.33 */ },
29 { 1284, 81, /* 0.32 */ },
30 { 1529, 78, /* 0.31 */ },
31 { 1793, 76, /* 0.30 */ },
32 { 2076, 74, /* 0.29 */ },
33 { 2378, 72, /* 0.28 */ },
34 { 2699, 71, /* 0.28 */ },
35 { 3039, 69, /* 0.27 */ },
36 { 3399, 68, /* 0.27 */ },
37 { 3778, 66, /* 0.26 */ },
38 { 4177, 65, /* 0.26 */ },
39 { 4596, 64, /* 0.25 */ },
40 { 5036, 62, /* 0.25 */ },
41 { 5497, 61, /* 0.24 */ },
42 { 5979, 60, /* 0.24 */ },
43 { 6483, 59, /* 0.23 */ },
44 { 7009, 58, /* 0.23 */ },
45 { 7558, 57, /* 0.22 */ },
46 { 8130, 56, /* 0.22 */ },
47 { 8726, 55, /* 0.22 */ },
48 { 9346, 54, /* 0.21 */ },
49 { 9991, 53, /* 0.21 */ },
50 { 10661, 52, /* 0.21 */ },
51 { 11358, 52, /* 0.20 */ },
52 { 12082, 51, /* 0.20 */ },
53 { 12834, 50, /* 0.20 */ },
54 { 13614, 49, /* 0.19 */ },
55 { 14424, 48, /* 0.19 */ },
56 { 15265, 48, /* 0.19 */ },
57 { 16137, 47, /* 0.19 */ },
58 { 17042, 46, /* 0.18 */ },
59 { 17981, 45, /* 0.18 */ },
60 { 18955, 45, /* 0.18 */ },
61 { 19965, 44, /* 0.17 */ },
62 { 21013, 43, /* 0.17 */ },
63 { 22101, 43, /* 0.17 */ },
64 { 23230, 42, /* 0.17 */ },
65 { 24402, 41, /* 0.16 */ },
66 { 25618, 41, /* 0.16 */ },
67 { 26881, 40, /* 0.16 */ },
68 { 28193, 39, /* 0.16 */ },
69 { 29557, 39, /* 0.15 */ },
70 { 30975, 38, /* 0.15 */ },
71 { 32450, 38, /* 0.15 */ },
72 { 33986, 37, /* 0.15 */ },
73 { 35586, 36, /* 0.14 */ },
74 { 37253, 36, /* 0.14 */ },
75 { 38992, 35, /* 0.14 */ },
76 { 40808, 35, /* 0.14 */ },
77 { 42707, 34, /* 0.13 */ },
78 { 44694, 33, /* 0.13 */ },
79 { 46776, 33, /* 0.13 */ },
80 { 48961, 32, /* 0.13 */ },
81 { 51258, 32, /* 0.13 */ },
82 { 53677, 31, /* 0.12 */ },
83 { 56230, 30, /* 0.12 */ },
84 { 58932, 30, /* 0.12 */ },
85 { 61799, 29, /* 0.12 */ },
86 { 64851, 28, /* 0.11 */ },
87 { 68113, 28, /* 0.11 */ },
88 { 71617, 27, /* 0.11 */ },
89 { 75401, 26, /* 0.10 */ },
90 { 79517, 26, /* 0.10 */ },
91 { 84035, 25, /* 0.10 */ },
92 { 89053, 24, /* 0.10 */ },
93};
94
95#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
96
97struct hstcp {
98 u32 ai;
99};
100
101static void hstcp_init(struct tcp_sock *tp)
102{
103 struct hstcp *ca = tcp_ca(tp);
104
105 ca->ai = 0;
106
107 /* Ensure the MD arithmetic works. This is somewhat pedantic,
108 * since I don't think we will see a cwnd this large. :) */
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110}
111
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
113 u32 in_flight, int good)
114{
115 struct hstcp *ca = tcp_ca(tp);
116
117 if (in_flight < tp->snd_cwnd)
118 return;
119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) {
121 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
122 tp->snd_cwnd++;
123 } else {
124 /* Update AIMD parameters */
125 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
126 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
127 ca->ai < HSTCP_AIMD_MAX - 1)
128 ca->ai++;
129 } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai - 1].cwnd) {
130 while (ca->ai &&
131 tp->snd_cwnd <= hstcp_aimd_vals[ca->ai - 1].cwnd)
132 ca->ai--;
133 }
134
135 /* Do additive increase */
136 if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
137 tp->snd_cwnd_cnt += ca->ai;
138 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
139 tp->snd_cwnd++;
140 tp->snd_cwnd_cnt -= tp->snd_cwnd;
141 }
142 }
143 }
144}
145
146static u32 hstcp_ssthresh(struct tcp_sock *tp)
147{
148 struct hstcp *ca = tcp_ca(tp);
149
150 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
152}
153
154
155static struct tcp_congestion_ops tcp_highspeed = {
156 .init = hstcp_init,
157 .ssthresh = hstcp_ssthresh,
158 .cong_avoid = hstcp_cong_avoid,
159 .min_cwnd = tcp_reno_min_cwnd,
160
161 .owner = THIS_MODULE,
162 .name = "highspeed"
163};
164
165static int __init hstcp_register(void)
166{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed);
169}
170
171static void __exit hstcp_unregister(void)
172{
173 tcp_unregister_congestion_control(&tcp_highspeed);
174}
175
176module_init(hstcp_register);
177module_exit(hstcp_unregister);
178
179MODULE_AUTHOR("John Heffner");
180MODULE_LICENSE("GPL");
181MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000000..40168275acf9
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
1/*
2 * H-TCP congestion control. The algorithm is detailed in:
3 * R.N.Shorten, D.J.Leith:
4 * "H-TCP: TCP for high-speed and long-distance networks"
5 * Proc. PFLDnet, Argonne, 2004.
6 * http://www.hamilton.ie/net/htcp3.pdf
7 */
8
9#include <linux/config.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <net/tcp.h>
13
14#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
15#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
16#define BETA_MAX 102 /* 0.8 with shift << 7 */
17
18static int use_rtt_scaling = 1;
19module_param(use_rtt_scaling, int, 0644);
20MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
21
22static int use_bandwidth_switch = 1;
23module_param(use_bandwidth_switch, int, 0644);
24MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
25
26struct htcp {
27 u16 alpha; /* Fixed point arith, << 7 */
28 u8 beta; /* Fixed point arith, << 7 */
29 u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */
30 u8 ccount; /* Number of RTTs since last congestion event */
31 u8 undo_ccount;
32 u16 packetcount;
33 u32 minRTT;
34 u32 maxRTT;
35 u32 snd_cwnd_cnt2;
36
37 u32 undo_maxRTT;
38 u32 undo_old_maxB;
39
40 /* Bandwidth estimation */
41 u32 minB;
42 u32 maxB;
43 u32 old_maxB;
44 u32 Bi;
45 u32 lasttime;
46};
47
48static inline void htcp_reset(struct htcp *ca)
49{
50 ca->undo_ccount = ca->ccount;
51 ca->undo_maxRTT = ca->maxRTT;
52 ca->undo_old_maxB = ca->old_maxB;
53
54 ca->ccount = 0;
55 ca->snd_cwnd_cnt2 = 0;
56}
57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp)
59{
60 struct htcp *ca = tcp_ca(tp);
61 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65}
66
67static inline void measure_rtt(struct tcp_sock *tp)
68{
69 struct htcp *ca = tcp_ca(tp);
70 u32 srtt = tp->srtt>>3;
71
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */
73 if (ca->minRTT > srtt || !ca->minRTT)
74 ca->minRTT = srtt;
75
76 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
81 ca->maxRTT = srtt;
82 }
83}
84
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
86{
87 struct htcp *ca = tcp_ca(tp);
88 u32 now = tcp_time_stamp;
89
90 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0;
93 ca->lasttime = now;
94 return;
95 }
96
97 ca->packetcount += pkts_acked;
98
99 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
100 && now - ca->lasttime >= ca->minRTT
101 && ca->minRTT > 0) {
102 __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
103 if (ca->ccount <= 3) {
104 /* just after backoff */
105 ca->minB = ca->maxB = ca->Bi = cur_Bi;
106 } else {
107 ca->Bi = (3*ca->Bi + cur_Bi)/4;
108 if (ca->Bi > ca->maxB)
109 ca->maxB = ca->Bi;
110 if (ca->minB > ca->maxB)
111 ca->minB = ca->maxB;
112 }
113 ca->packetcount = 0;
114 ca->lasttime = now;
115 }
116}
117
118static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
119{
120 if (use_bandwidth_switch) {
121 u32 maxB = ca->maxB;
122 u32 old_maxB = ca->old_maxB;
123 ca->old_maxB = ca->maxB;
124
125 if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
126 ca->beta = BETA_MIN;
127 ca->modeswitch = 0;
128 return;
129 }
130 }
131
132 if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
133 ca->beta = (minRTT<<7)/maxRTT;
134 if (ca->beta < BETA_MIN)
135 ca->beta = BETA_MIN;
136 else if (ca->beta > BETA_MAX)
137 ca->beta = BETA_MAX;
138 } else {
139 ca->beta = BETA_MIN;
140 ca->modeswitch = 1;
141 }
142}
143
144static inline void htcp_alpha_update(struct htcp *ca)
145{
146 u32 minRTT = ca->minRTT;
147 u32 factor = 1;
148 u32 diff = ca->ccount * minRTT; /* time since last backoff */
149
150 if (diff > HZ) {
151 diff -= HZ;
152 factor = 1 + (10*diff + ((diff/2)*(diff/2)/HZ))/HZ;
153 }
154
155 if (use_rtt_scaling && minRTT) {
156 u32 scale = (HZ<<3)/(10*minRTT);
157 scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
158 factor = (factor<<3)/scale;
159 if (!factor)
160 factor = 1;
161 }
162
163 ca->alpha = 2*factor*((1<<7)-ca->beta);
164 if (!ca->alpha)
165 ca->alpha = ALPHA_BASE;
166}
167
168/* After we have the rtt data to calculate beta, we'd still prefer to wait one
169 * rtt before we adjust our beta to ensure we are working from consistent
170 * data.
171 *
172 * This function should be called when we hit a congestion event since only at
173 * that point do we really have a sense of maxRTT (the queues en route
174 * were just getting too full).
175 */
176static void htcp_param_update(struct tcp_sock *tp)
177{
178 struct htcp *ca = tcp_ca(tp);
179 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT;
181
182 htcp_beta_update(ca, minRTT, maxRTT);
183 htcp_alpha_update(ca);
184
185 /* add slowly fading memory for maxRTT to accommodate routing changes etc */
186 if (minRTT > 0 && maxRTT > minRTT)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188}
189
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
191{
192 struct htcp *ca = tcp_ca(tp);
193 htcp_param_update(tp);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195}
196
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked)
199{
200 struct htcp *ca = tcp_ca(tp);
201
202 if (in_flight < tp->snd_cwnd)
203 return;
204
205 if (tp->snd_cwnd <= tp->snd_ssthresh) {
206 /* In "safe" area, increase. */
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++;
209 } else {
210 measure_rtt(tp);
211
212 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
214 ca->ccount++;
215 ca->snd_cwnd_cnt2 = 0;
216 htcp_alpha_update(ca);
217 }
218
219 /* In dangerous area, increase slowly.
220 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
221 */
222 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 tp->snd_cwnd_cnt = 0;
226 ca->ccount++;
227 }
228 }
229}
230
231/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp)
233{
234 return tp->snd_ssthresh;
235}
236
237
238static void htcp_init(struct tcp_sock *tp)
239{
240 struct htcp *ca = tcp_ca(tp);
241
242 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN;
245}
246
247static void htcp_state(struct tcp_sock *tp, u8 new_state)
248{
249 switch (new_state) {
250 case TCP_CA_CWR:
251 case TCP_CA_Recovery:
252 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp));
254 break;
255 }
256}
257
258static struct tcp_congestion_ops htcp = {
259 .init = htcp_init,
260 .ssthresh = htcp_recalc_ssthresh,
261 .min_cwnd = htcp_min_cwnd,
262 .cong_avoid = htcp_cong_avoid,
263 .set_state = htcp_state,
264 .undo_cwnd = htcp_cwnd_undo,
265 .pkts_acked = measure_achieved_throughput,
266 .owner = THIS_MODULE,
267 .name = "htcp",
268};
269
270static int __init htcp_register(void)
271{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL;
276 return tcp_register_congestion_control(&htcp);
277}
278
279static void __exit htcp_unregister(void)
280{
281 tcp_unregister_congestion_control(&htcp);
282}
283
284module_init(htcp_register);
285module_exit(htcp_unregister);
286
287MODULE_AUTHOR("Baruch Even");
288MODULE_LICENSE("GPL");
289MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000000..13a66342c304
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
1/*
2 * TCP HYBLA
3 *
4 * TCP-HYBLA Congestion control algorithm, based on:
5 * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
6 * for Heterogeneous Networks",
7 * International Journal on satellite Communications,
8 * September 2004
9 * Daniele Lacamera
10 * root at danielinux.net
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <net/tcp.h>
16
17/* Tcp Hybla structure. */
18struct hybla {
19 u8 hybla_en;
20 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
21 u32 rho; /* Rho parameter, integer part */
22 u32 rho2; /* Rho * Rho, integer part */
23 u32 rho_3ls; /* Rho parameter, <<3 */
24 u32 rho2_7ls; /* Rho^2, <<7 */
25 u32 minrtt; /* Minimum smoothed round trip time value seen */
26};
27
28/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
29 expressed in ms */
30static int rtt0 = 25;
31module_param(rtt0, int, 0644);
32MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33
34
35/* This is called to refresh values for hybla parameters */
36static inline void hybla_recalc_param (struct tcp_sock *tp)
37{
38 struct hybla *ca = tcp_ca(tp);
39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7;
44}
45
46static void hybla_init(struct tcp_sock *tp)
47{
48 struct hybla *ca = tcp_ca(tp);
49
50 ca->rho = 0;
51 ca->rho2 = 0;
52 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1;
56 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535;
58
59 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp);
61
62 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho;
65}
66
67static void hybla_state(struct tcp_sock *tp, u8 ca_state)
68{
69 struct hybla *ca = tcp_ca(tp);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open);
72}
73
74static inline u32 hybla_fraction(u32 odds)
75{
76 static const u32 fractions[] = {
77 128, 139, 152, 165, 181, 197, 215, 234,
78 };
79
80 return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
81}
82
83/* TCP Hybla main routine.
84 * This is the algorithm behavior:
85 * o Recalc Hybla parameters if min_rtt has changed
86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1
88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
90 u32 in_flight, int flag)
91{
92 struct hybla *ca = tcp_ca(tp);
93 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0;
95
96 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt) {
98 hybla_recalc_param(tp);
99 ca->minrtt = tp->srtt;
100 }
101
102 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
104
105 if (in_flight < tp->snd_cwnd)
106 return;
107
108 if (ca->rho == 0)
109 hybla_recalc_param(tp);
110
111 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112
113 if (tp->snd_cwnd < tp->snd_ssthresh) {
114 /*
115 * slow start
116 * INC = 2^RHO - 1
117 * This is done by splitting the rho parameter
118 * into 2 parts: an integer part and a fraction part.
119 * Increment<<7 is estimated by doing:
120 * [2^(int+fract)]<<7
121 * that is equal to:
122 * (2^int) * [(2^fract) <<7]
123 * 2^int is computed directly as 1<<int,
124 * while we use hybla_fraction() to
125 * calculate 2^fract in a <<7 value.
126 */
127 is_slowstart = 1;
128 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
129 - 128;
130 } else {
131 /*
132 * congestion avoidance
133 * INC = RHO^2 / W
134 * as the increment is computed as (rho^2<<7)/window,
135 * it is already <<7 and we can easily count its fractions.
136 */
137 increment = ca->rho2_7ls / tp->snd_cwnd;
138 if (increment < 128)
139 tp->snd_cwnd_cnt++;
140 }
141
142 odd = increment % 128;
143 tp->snd_cwnd += increment >> 7;
144 ca->snd_cwnd_cents += odd;
145
146 /* check when fractions go >= 128 and increase cwnd by 1. */
147 while (ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0;
151 }
152
153 /* clamp down slowstart cwnd to ssthresh value. */
154 if (is_slowstart)
155 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
156
157 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
158}
159
160static struct tcp_congestion_ops tcp_hybla = {
161 .init = hybla_init,
162 .ssthresh = tcp_reno_ssthresh,
163 .min_cwnd = tcp_reno_min_cwnd,
164 .cong_avoid = hybla_cong_avoid,
165 .set_state = hybla_state,
166
167 .owner = THIS_MODULE,
168 .name = "hybla"
169};
170
171static int __init hybla_register(void)
172{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla);
175}
176
177static void __exit hybla_unregister(void)
178{
179 tcp_unregister_congestion_control(&tcp_hybla);
180}
181
182module_init(hybla_register);
183module_exit(hybla_unregister);
184
185MODULE_AUTHOR("Daniele Lacamera");
186MODULE_LICENSE("GPL");
187MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..7bbbbc33eb4b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission 61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found. 62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */ 64 */
66 65
67#include <linux/config.h> 66#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE; 87int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto; 88int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93 90
94int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
95 92
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 93#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 95#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
333 tp->snd_cwnd_stamp = tcp_time_stamp; 318 tp->snd_cwnd_stamp = tcp_time_stamp;
334} 319}
335 320
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */ 321/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{ 323{
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
558 tcp_grow_window(sk, tp, skb); 534 tcp_grow_window(sk, tp, skb);
559} 535}
560 536
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this 537/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were 538 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge 539 * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
606 * To save cycles in the RFC 1323 implementation it was better to break 543 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics 544 * it up into three procedures. -- erics
608 */ 545 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) 546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
610{ 547{
611 long m = mrtt; /* RTT */ 548 long m = mrtt; /* RTT */
612 549
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's 550 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev 551 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation. 552 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
670 tp->rtt_seq = tp->snd_nxt; 604 tp->rtt_seq = tp->snd_nxt;
671 } 605 }
672 606
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3); 607 if (tp->ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt);
674} 609}
675 610
676/* Calculate rto without backoff. This is the second half of Van Jacobson's 611/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
1185 tp->snd_una == tp->high_seq || 1120 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1122 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp)) 1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1124 tcp_ca_event(tp, CA_EVENT_FRTO);
1190 } 1125 }
1191 1126
1192 /* Have to clear retransmission markers here to keep the bookkeeping 1127 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
1252 tcp_set_ca_state(tp, TCP_CA_Loss); 1187 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark; 1188 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp); 1189 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257} 1190}
1258 1191
1259void tcp_clear_retrans(struct tcp_sock *tp) 1192void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1218 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1220 tcp_ca_event(tp, CA_EVENT_LOSS);
1287 } 1221 }
1288 tp->snd_cwnd = 1; 1222 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0; 1223 tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1596} 1530}
1597 1531
1598/* Decrease cwnd each second ack. */ 1532/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp) 1533static void tcp_cwnd_down(struct tcp_sock *tp)
1601{ 1534{
1602 int decr = tp->snd_cwnd_cnt + 1; 1535 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
1608 * in packets we use mss_cache). If sysctl_tcp_westwood is off
1609 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616 1536
1617 tp->snd_cwnd_cnt = decr&1; 1537 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1; 1538 decr >>= 1;
1619 1539
1620 if (decr && tp->snd_cwnd > limit) 1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
1621 tp->snd_cwnd -= decr; 1541 tp->snd_cwnd -= decr;
1622 1542
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{ 1575{
1656 if (tp->prior_ssthresh) { 1576 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp)) 1577 if (tp->ca_ops->undo_cwnd)
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); 1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
1659 else 1579 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661 1581
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1767 1687
1768static inline void tcp_complete_cwr(struct tcp_sock *tp) 1688static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{ 1689{
1770 if (tcp_westwood_cwnd(tp)) 1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp; 1691 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
1775} 1693}
1776 1694
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1946 if (tp->ca_state < TCP_CA_CWR) { 1864 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE)) 1865 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1866 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp); 1868 TCP_ECN_queue_cwr(tp);
1951 } 1869 }
1952 1870
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1963/* Read draft-ietf-tcplw-high-performance before mucking 1881/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Supersedes RFC1323) 1882 * with this code. (Supersedes RFC1323)
1965 */ 1883 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) 1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1967{ 1885{
1968 __u32 seq_rtt; 1886 __u32 seq_rtt;
1969 1887
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1983 * in window is lost... Voila. --ANK (010210) 1901 * in window is lost... Voila. --ANK (010210)
1984 */ 1902 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt); 1904 tcp_rtt_estimator(tp, seq_rtt, usrtt);
1987 tcp_set_rto(tp); 1905 tcp_set_rto(tp);
1988 tp->backoff = 0; 1906 tp->backoff = 0;
1989 tcp_bound_rto(tp); 1907 tcp_bound_rto(tp);
1990} 1908}
1991 1909
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) 1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
1993{ 1911{
1994 /* We don't have a timestamp. Can only use 1912 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine 1913 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
2003 if (flag & FLAG_RETRANS_DATA_ACKED) 1921 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return; 1922 return;
2005 1923
2006 tcp_rtt_estimator(tp, seq_rtt); 1924 tcp_rtt_estimator(tp, seq_rtt, usrtt);
2007 tcp_set_rto(tp); 1925 tcp_set_rto(tp);
2008 tp->backoff = 0; 1926 tp->backoff = 0;
2009 tcp_bound_rto(tp); 1927 tcp_bound_rto(tp);
2010} 1928}
2011 1929
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt) 1931 int flag, s32 seq_rtt, u32 *usrtt)
2014{ 1932{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag); 1935 tcp_ack_saw_tstamp(tp, usrtt, flag);
2018 else if (seq_rtt >= 0) 1936 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag); 1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
2020} 1938}
2021 1939
2022/* 1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
2023 * Compute congestion window to use. 1941 u32 in_flight, int good)
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 * Networks" in InfoComm 2004
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and congestion window is large
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037 /* orignal Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068 /* slow start amd linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083}
2084
2085/* This is Jacobson's slow start and congestion avoidance.
2086 * SIGCOMM '88, p. 328.
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{ 1942{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) { 1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
@@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2024
2416 2025
2417/* Remove acknowledged frames from the retransmission queue. */ 2026/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2027static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2028{
2420 struct tcp_sock *tp = tcp_sk(sk); 2029 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2030 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2031 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2032 int acked = 0;
2424 __s32 seq_rtt = -1; 2033 __s32 seq_rtt = -1;
2034 struct timeval usnow;
2035 u32 pkts_acked = 0;
2036
2037 if (seq_usrtt)
2038 do_gettimeofday(&usnow);
2425 2039
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2040 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2041 skb != sk->sk_send_head) {
@@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2062 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2063 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2064 acked |= FLAG_DATA_ACKED;
2065 ++pkts_acked;
2451 } else { 2066 } else {
2452 acked |= FLAG_SYN_ACKED; 2067 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2068 tp->retrans_stamp = 0;
@@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2076 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2077 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2078 seq_rtt = now - scb->when;
2079 if (seq_usrtt)
2080 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2081 + (usnow.tv_usec - skb->stamp.tv_usec);
2082
2464 if (sacked & TCPCB_SACKED_ACKED) 2083 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2084 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2085 if (sacked & TCPCB_LOST)
@@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2098 }
2480 2099
2481 if (acked&FLAG_ACKED) { 2100 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2101 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2102 tcp_ack_packets_out(sk, tp);
2103
2104 if (tp->ca_ops->pkts_acked)
2105 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2106 }
2485 2107
2486#if FASTRETRANS_DEBUG > 0 2108#if FASTRETRANS_DEBUG > 0
@@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2246 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2247}
2626 2248
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get no information about RTTmin at this time so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
2636 * since in this way we're sure it will be updated in a consistent
2637 * way as soon as possible. It will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coeffients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * A RTT-window has passed. Be careful since if RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was choosen since an estimation on small
2730 * time intervals is better to avoid...
2731 * Obvioulsy on a LAN we reasonably will always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in fast path. In particular it is called when
2771 * header prediction is successfull. In such case infact update is
2772 * straight forward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something is going wrong..even if there could
2858 * be no problems! Infact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2249/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2250static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2251{
@@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2255 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2256 u32 prior_in_flight;
2886 s32 seq_rtt; 2257 s32 seq_rtt;
2258 s32 seq_usrtt = 0;
2887 int prior_packets; 2259 int prior_packets;
2888 2260
2889 /* If the ack is newer than sent or older than previous acks 2261 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2274 */
2903 tcp_update_wl(tp, ack, ack_seq); 2275 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2276 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2277 flag |= FLAG_WIN_UPDATE;
2907 2278
2279 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2280
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2281 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2282 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2283 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2293 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2294 flag |= FLAG_ECE;
2922 2295
2923 tcp_westwood_slow_bw(sk,skb); 2296 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2297 }
2925 2298
2926 /* We passed data and got it acked, remove any soft error 2299 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2308 prior_in_flight = tcp_packets_in_flight(tp);
2936 2309
2937 /* See if we can take anything off of the retransmit queue. */ 2310 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2311 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2312 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2313
2940 if (tp->frto_counter) 2314 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2315 tcp_process_frto(sk, prior_snd_una);
2942 2316
2943 if (tcp_ack_is_dubious(tp, flag)) { 2317 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advance CWND, if state allows this. */ 2318 /* Advance CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2319 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2320 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2321 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2322 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2323 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2324 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2325 }
2955 2326
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2327 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
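The new call sites above fix the hook signature for pluggable congestion control: tcp_ack() reports CA_EVENT_FAST_ACK or CA_EVENT_SLOW_ACK through tcp_ca_event() and grows the window through tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, flag), with the trailing flag telling the module whether the ACK looked dubious. As a rough, illustrative sketch of what lands on the module side with that signature (the real Reno code is added in tcp_cong.c, which is not part of this hunk):

#include <net/tcp.h>

/* Illustration only: a Reno-flavoured cong_avoid callback. */
static void example_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt,
                               u32 in_flight, int good_ack)
{
        if (in_flight < tp->snd_cwnd)
                return;                 /* not cwnd-limited, nothing to grow */

        if (tp->snd_cwnd <= tp->snd_ssthresh) {
                /* slow start: one extra segment per ACK */
                tp->snd_cwnd++;
        } else if (++tp->snd_cwnd_cnt >= tp->snd_cwnd) {
                /* congestion avoidance: roughly one extra segment per RTT */
                tp->snd_cwnd_cnt = 0;
                tp->snd_cwnd++;
        }
        tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
        tp->snd_cwnd_stamp = tcp_time_stamp;
}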
@@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3923
4553 tcp_init_metrics(sk); 3924 tcp_init_metrics(sk);
4554 3925
3926 tcp_init_congestion_control(tp);
3927
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3928 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3929 * packet.
4557 */ 3930 */
@@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4081 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4082 return 1;
4710 4083
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4084 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4085 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4086 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4102 goto discard;
4733 4103
4734 case TCP_SYN_SENT: 4104 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4105 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4106 if (queued >= 0)
4740 return queued; 4107 return queued;
@@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4183 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4184 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4185 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4186 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4187
4821 if (tp->rx_opt.tstamp_ok) 4188 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4189 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4195
4829 tcp_init_metrics(sk); 4196 tcp_init_metrics(sk);
4830 4197
4198 tcp_init_congestion_control(tp);
4199
4831 /* Prevent spurious tcp_cwnd_restart() on 4200 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4201 * first data packet.
4833 */ 4202 */
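With init_westwood() and init_bictcp() gone from both the SYN_SENT and SYN_RECV completion paths, the handshake now funnels into tcp_init_congestion_control(tp), which hands control to the attached module's .init hook (tcp_cong.c itself is not shown in this diff). A module keeps whatever per-connection state it needs in the tcp_ca(tp) scratch area, bounded by TCP_CA_PRIV_SIZE, exactly as tcp_vegas.c does further down. A minimal sketch, using a made-up module name:

#include <net/tcp.h>

/* Hypothetical per-connection state for an illustrative module. */
struct example_ca {
        u32 epoch_start;        /* tcp_time_stamp when this cwnd epoch began */
        u32 epoch_cwnd;         /* cwnd at the start of the epoch */
};

static void example_init(struct tcp_sock *tp)
{
        struct example_ca *ca = tcp_ca(tp);     /* must fit in TCP_CA_PRIV_SIZE */

        ca->epoch_start = tcp_time_stamp;
        ca->epoch_cwnd  = tp->snd_cwnd;
}

As tcp_vegas.c does with BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE), such a module would assert at registration time that its state fits the scratch area.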
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dad98e4a5043..ebf112347a97 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -36,7 +36,7 @@
36 * ACK bit. 36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery. 37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the 38 * Fixed many serious bugs in the
39 * open_request handling and moved 39 * request_sock handling and moved
40 * most of it into the af independent code. 40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes. 41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics. 42 * Added new listen semantics.
@@ -869,21 +869,23 @@ static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
869 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); 869 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
870} 870}
871 871
872static struct open_request *tcp_v4_search_req(struct tcp_sock *tp, 872static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
873 struct open_request ***prevp, 873 struct request_sock ***prevp,
874 __u16 rport, 874 __u16 rport,
875 __u32 raddr, __u32 laddr) 875 __u32 raddr, __u32 laddr)
876{ 876{
877 struct tcp_listen_opt *lopt = tp->listen_opt; 877 struct listen_sock *lopt = tp->accept_queue.listen_opt;
878 struct open_request *req, **prev; 878 struct request_sock *req, **prev;
879 879
880 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; 880 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
881 (req = *prev) != NULL; 881 (req = *prev) != NULL;
882 prev = &req->dl_next) { 882 prev = &req->dl_next) {
883 if (req->rmt_port == rport && 883 const struct inet_request_sock *ireq = inet_rsk(req);
884 req->af.v4_req.rmt_addr == raddr && 884
885 req->af.v4_req.loc_addr == laddr && 885 if (ireq->rmt_port == rport &&
886 TCP_INET_FAMILY(req->class->family)) { 886 ireq->rmt_addr == raddr &&
887 ireq->loc_addr == laddr &&
888 TCP_INET_FAMILY(req->rsk_ops->family)) {
887 BUG_TRAP(!req->sk); 889 BUG_TRAP(!req->sk);
888 *prevp = prev; 890 *prevp = prev;
889 break; 891 break;
@@ -893,21 +895,13 @@ static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
893 return req; 895 return req;
894} 896}
895 897
896static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) 898static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
897{ 899{
898 struct tcp_sock *tp = tcp_sk(sk); 900 struct tcp_sock *tp = tcp_sk(sk);
899 struct tcp_listen_opt *lopt = tp->listen_opt; 901 struct listen_sock *lopt = tp->accept_queue.listen_opt;
900 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd); 902 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
901
902 req->expires = jiffies + TCP_TIMEOUT_INIT;
903 req->retrans = 0;
904 req->sk = NULL;
905 req->dl_next = lopt->syn_table[h];
906
907 write_lock(&tp->syn_wait_lock);
908 lopt->syn_table[h] = req;
909 write_unlock(&tp->syn_wait_lock);
910 903
904 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
911 tcp_synq_added(sk); 905 tcp_synq_added(sk);
912} 906}
913 907
@@ -1050,7 +1044,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1050 } 1044 }
1051 1045
1052 switch (sk->sk_state) { 1046 switch (sk->sk_state) {
1053 struct open_request *req, **prev; 1047 struct request_sock *req, **prev;
1054 case TCP_LISTEN: 1048 case TCP_LISTEN:
1055 if (sock_owned_by_user(sk)) 1049 if (sock_owned_by_user(sk))
1056 goto out; 1050 goto out;
@@ -1065,7 +1059,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1065 */ 1059 */
1066 BUG_TRAP(!req->sk); 1060 BUG_TRAP(!req->sk);
1067 1061
1068 if (seq != req->snt_isn) { 1062 if (seq != tcp_rsk(req)->snt_isn) {
1069 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); 1063 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1070 goto out; 1064 goto out;
1071 } 1065 }
@@ -1254,28 +1248,29 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1254 tcp_tw_put(tw); 1248 tcp_tw_put(tw);
1255} 1249}
1256 1250
1257static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) 1251static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1258{ 1252{
1259 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd, 1253 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1260 req->ts_recent); 1254 req->ts_recent);
1261} 1255}
1262 1256
1263static struct dst_entry* tcp_v4_route_req(struct sock *sk, 1257static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1264 struct open_request *req) 1258 struct request_sock *req)
1265{ 1259{
1266 struct rtable *rt; 1260 struct rtable *rt;
1267 struct ip_options *opt = req->af.v4_req.opt; 1261 const struct inet_request_sock *ireq = inet_rsk(req);
1262 struct ip_options *opt = inet_rsk(req)->opt;
1268 struct flowi fl = { .oif = sk->sk_bound_dev_if, 1263 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1269 .nl_u = { .ip4_u = 1264 .nl_u = { .ip4_u =
1270 { .daddr = ((opt && opt->srr) ? 1265 { .daddr = ((opt && opt->srr) ?
1271 opt->faddr : 1266 opt->faddr :
1272 req->af.v4_req.rmt_addr), 1267 ireq->rmt_addr),
1273 .saddr = req->af.v4_req.loc_addr, 1268 .saddr = ireq->loc_addr,
1274 .tos = RT_CONN_FLAGS(sk) } }, 1269 .tos = RT_CONN_FLAGS(sk) } },
1275 .proto = IPPROTO_TCP, 1270 .proto = IPPROTO_TCP,
1276 .uli_u = { .ports = 1271 .uli_u = { .ports =
1277 { .sport = inet_sk(sk)->sport, 1272 { .sport = inet_sk(sk)->sport,
1278 .dport = req->rmt_port } } }; 1273 .dport = ireq->rmt_port } } };
1279 1274
1280 if (ip_route_output_flow(&rt, &fl, sk, 0)) { 1275 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1281 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 1276 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
@@ -1291,12 +1286,13 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1291 1286
1292/* 1287/*
1293 * Send a SYN-ACK after having received an ACK. 1288 * Send a SYN-ACK after having received an ACK.
1294 * This still operates on a open_request only, not on a big 1289 * This still operates on a request_sock only, not on a big
1295 * socket. 1290 * socket.
1296 */ 1291 */
1297static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, 1292static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1298 struct dst_entry *dst) 1293 struct dst_entry *dst)
1299{ 1294{
1295 const struct inet_request_sock *ireq = inet_rsk(req);
1300 int err = -1; 1296 int err = -1;
1301 struct sk_buff * skb; 1297 struct sk_buff * skb;
1302 1298
@@ -1310,14 +1306,14 @@ static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1310 struct tcphdr *th = skb->h.th; 1306 struct tcphdr *th = skb->h.th;
1311 1307
1312 th->check = tcp_v4_check(th, skb->len, 1308 th->check = tcp_v4_check(th, skb->len,
1313 req->af.v4_req.loc_addr, 1309 ireq->loc_addr,
1314 req->af.v4_req.rmt_addr, 1310 ireq->rmt_addr,
1315 csum_partial((char *)th, skb->len, 1311 csum_partial((char *)th, skb->len,
1316 skb->csum)); 1312 skb->csum));
1317 1313
1318 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, 1314 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1319 req->af.v4_req.rmt_addr, 1315 ireq->rmt_addr,
1320 req->af.v4_req.opt); 1316 ireq->opt);
1321 if (err == NET_XMIT_CN) 1317 if (err == NET_XMIT_CN)
1322 err = 0; 1318 err = 0;
1323 } 1319 }
@@ -1328,12 +1324,12 @@ out:
1328} 1324}
1329 1325
1330/* 1326/*
1331 * IPv4 open_request destructor. 1327 * IPv4 request_sock destructor.
1332 */ 1328 */
1333static void tcp_v4_or_free(struct open_request *req) 1329static void tcp_v4_reqsk_destructor(struct request_sock *req)
1334{ 1330{
1335 if (req->af.v4_req.opt) 1331 if (inet_rsk(req)->opt)
1336 kfree(req->af.v4_req.opt); 1332 kfree(inet_rsk(req)->opt);
1337} 1333}
1338 1334
1339static inline void syn_flood_warning(struct sk_buff *skb) 1335static inline void syn_flood_warning(struct sk_buff *skb)
@@ -1349,7 +1345,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
1349} 1345}
1350 1346
1351/* 1347/*
1352 * Save and compile IPv4 options into the open_request if needed. 1348 * Save and compile IPv4 options into the request_sock if needed.
1353 */ 1349 */
1354static inline struct ip_options *tcp_v4_save_options(struct sock *sk, 1350static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355 struct sk_buff *skb) 1351 struct sk_buff *skb)
@@ -1370,33 +1366,20 @@ static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1370 return dopt; 1366 return dopt;
1371} 1367}
1372 1368
1373/* 1369struct request_sock_ops tcp_request_sock_ops = {
1374 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1375 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1376 * It would be better to replace it with a global counter for all sockets
1377 * but then some measure against one socket starving all other sockets
1378 * would be needed.
1379 *
1380 * It was 128 by default. Experiments with real servers show, that
1381 * it is absolutely not enough even at 100conn/sec. 256 cures most
1382 * of problems. This value is adjusted to 128 for very small machines
1383 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1384 * Further increasing requires to change hash table size.
1385 */
1386int sysctl_max_syn_backlog = 256;
1387
1388struct or_calltable or_ipv4 = {
1389 .family = PF_INET, 1370 .family = PF_INET,
1371 .obj_size = sizeof(struct tcp_request_sock),
1390 .rtx_syn_ack = tcp_v4_send_synack, 1372 .rtx_syn_ack = tcp_v4_send_synack,
1391 .send_ack = tcp_v4_or_send_ack, 1373 .send_ack = tcp_v4_reqsk_send_ack,
1392 .destructor = tcp_v4_or_free, 1374 .destructor = tcp_v4_reqsk_destructor,
1393 .send_reset = tcp_v4_send_reset, 1375 .send_reset = tcp_v4_send_reset,
1394}; 1376};
1395 1377
1396int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1378int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1397{ 1379{
1380 struct inet_request_sock *ireq;
1398 struct tcp_options_received tmp_opt; 1381 struct tcp_options_received tmp_opt;
1399 struct open_request *req; 1382 struct request_sock *req;
1400 __u32 saddr = skb->nh.iph->saddr; 1383 __u32 saddr = skb->nh.iph->saddr;
1401 __u32 daddr = skb->nh.iph->daddr; 1384 __u32 daddr = skb->nh.iph->daddr;
1402 __u32 isn = TCP_SKB_CB(skb)->when; 1385 __u32 isn = TCP_SKB_CB(skb)->when;
@@ -1433,7 +1416,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1433 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) 1416 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1434 goto drop; 1417 goto drop;
1435 1418
1436 req = tcp_openreq_alloc(); 1419 req = reqsk_alloc(&tcp_request_sock_ops);
1437 if (!req) 1420 if (!req)
1438 goto drop; 1421 goto drop;
1439 1422
@@ -1461,10 +1444,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1461 1444
1462 tcp_openreq_init(req, &tmp_opt, skb); 1445 tcp_openreq_init(req, &tmp_opt, skb);
1463 1446
1464 req->af.v4_req.loc_addr = daddr; 1447 ireq = inet_rsk(req);
1465 req->af.v4_req.rmt_addr = saddr; 1448 ireq->loc_addr = daddr;
1466 req->af.v4_req.opt = tcp_v4_save_options(sk, skb); 1449 ireq->rmt_addr = saddr;
1467 req->class = &or_ipv4; 1450 ireq->opt = tcp_v4_save_options(sk, skb);
1468 if (!want_cookie) 1451 if (!want_cookie)
1469 TCP_ECN_create_request(req, skb->h.th); 1452 TCP_ECN_create_request(req, skb->h.th);
1470 1453
@@ -1523,20 +1506,20 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1523 1506
1524 isn = tcp_v4_init_sequence(sk, skb); 1507 isn = tcp_v4_init_sequence(sk, skb);
1525 } 1508 }
1526 req->snt_isn = isn; 1509 tcp_rsk(req)->snt_isn = isn;
1527 1510
1528 if (tcp_v4_send_synack(sk, req, dst)) 1511 if (tcp_v4_send_synack(sk, req, dst))
1529 goto drop_and_free; 1512 goto drop_and_free;
1530 1513
1531 if (want_cookie) { 1514 if (want_cookie) {
1532 tcp_openreq_free(req); 1515 reqsk_free(req);
1533 } else { 1516 } else {
1534 tcp_v4_synq_add(sk, req); 1517 tcp_v4_synq_add(sk, req);
1535 } 1518 }
1536 return 0; 1519 return 0;
1537 1520
1538drop_and_free: 1521drop_and_free:
1539 tcp_openreq_free(req); 1522 reqsk_free(req);
1540drop: 1523drop:
1541 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 1524 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1542 return 0; 1525 return 0;
@@ -1548,9 +1531,10 @@ drop:
1548 * now create the new socket. 1531 * now create the new socket.
1549 */ 1532 */
1550struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, 1533struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1551 struct open_request *req, 1534 struct request_sock *req,
1552 struct dst_entry *dst) 1535 struct dst_entry *dst)
1553{ 1536{
1537 struct inet_request_sock *ireq;
1554 struct inet_sock *newinet; 1538 struct inet_sock *newinet;
1555 struct tcp_sock *newtp; 1539 struct tcp_sock *newtp;
1556 struct sock *newsk; 1540 struct sock *newsk;
@@ -1570,11 +1554,12 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1570 1554
1571 newtp = tcp_sk(newsk); 1555 newtp = tcp_sk(newsk);
1572 newinet = inet_sk(newsk); 1556 newinet = inet_sk(newsk);
1573 newinet->daddr = req->af.v4_req.rmt_addr; 1557 ireq = inet_rsk(req);
1574 newinet->rcv_saddr = req->af.v4_req.loc_addr; 1558 newinet->daddr = ireq->rmt_addr;
1575 newinet->saddr = req->af.v4_req.loc_addr; 1559 newinet->rcv_saddr = ireq->loc_addr;
1576 newinet->opt = req->af.v4_req.opt; 1560 newinet->saddr = ireq->loc_addr;
1577 req->af.v4_req.opt = NULL; 1561 newinet->opt = ireq->opt;
1562 ireq->opt = NULL;
1578 newinet->mc_index = tcp_v4_iif(skb); 1563 newinet->mc_index = tcp_v4_iif(skb);
1579 newinet->mc_ttl = skb->nh.iph->ttl; 1564 newinet->mc_ttl = skb->nh.iph->ttl;
1580 newtp->ext_header_len = 0; 1565 newtp->ext_header_len = 0;
@@ -1605,9 +1590,9 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1605 struct iphdr *iph = skb->nh.iph; 1590 struct iphdr *iph = skb->nh.iph;
1606 struct tcp_sock *tp = tcp_sk(sk); 1591 struct tcp_sock *tp = tcp_sk(sk);
1607 struct sock *nsk; 1592 struct sock *nsk;
1608 struct open_request **prev; 1593 struct request_sock **prev;
1609 /* Find possible connection requests. */ 1594 /* Find possible connection requests. */
1610 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source, 1595 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1611 iph->saddr, iph->daddr); 1596 iph->saddr, iph->daddr);
1612 if (req) 1597 if (req)
1613 return tcp_check_req(sk, skb, req, prev); 1598 return tcp_check_req(sk, skb, req, prev);
@@ -2063,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2063 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache_std = tp->mss_cache = 536;
2064 2049
2065 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops;
2066 2052
2067 sk->sk_state = TCP_CLOSE; 2053 sk->sk_state = TCP_CLOSE;
2068 2054
@@ -2085,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2085 2071
2086 tcp_clear_xmit_timers(sk); 2072 tcp_clear_xmit_timers(sk);
2087 2073
2074 tcp_cleanup_congestion_control(tp);
2075
2088 /* Cleanup up the write buffer. */ 2076 /* Cleanup up the write buffer. */
2089 sk_stream_writequeue_purge(sk); 2077 sk_stream_writequeue_purge(sk);
2090 2078
@@ -2144,13 +2132,13 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2144 ++st->num; 2132 ++st->num;
2145 2133
2146 if (st->state == TCP_SEQ_STATE_OPENREQ) { 2134 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2147 struct open_request *req = cur; 2135 struct request_sock *req = cur;
2148 2136
2149 tp = tcp_sk(st->syn_wait_sk); 2137 tp = tcp_sk(st->syn_wait_sk);
2150 req = req->dl_next; 2138 req = req->dl_next;
2151 while (1) { 2139 while (1) {
2152 while (req) { 2140 while (req) {
2153 if (req->class->family == st->family) { 2141 if (req->rsk_ops->family == st->family) {
2154 cur = req; 2142 cur = req;
2155 goto out; 2143 goto out;
2156 } 2144 }
@@ -2159,17 +2147,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2159 if (++st->sbucket >= TCP_SYNQ_HSIZE) 2147 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2160 break; 2148 break;
2161get_req: 2149get_req:
2162 req = tp->listen_opt->syn_table[st->sbucket]; 2150 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2163 } 2151 }
2164 sk = sk_next(st->syn_wait_sk); 2152 sk = sk_next(st->syn_wait_sk);
2165 st->state = TCP_SEQ_STATE_LISTENING; 2153 st->state = TCP_SEQ_STATE_LISTENING;
2166 read_unlock_bh(&tp->syn_wait_lock); 2154 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2167 } else { 2155 } else {
2168 tp = tcp_sk(sk); 2156 tp = tcp_sk(sk);
2169 read_lock_bh(&tp->syn_wait_lock); 2157 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2170 if (tp->listen_opt && tp->listen_opt->qlen) 2158 if (reqsk_queue_len(&tp->accept_queue))
2171 goto start_req; 2159 goto start_req;
2172 read_unlock_bh(&tp->syn_wait_lock); 2160 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2173 sk = sk_next(sk); 2161 sk = sk_next(sk);
2174 } 2162 }
2175get_sk: 2163get_sk:
@@ -2179,8 +2167,8 @@ get_sk:
2179 goto out; 2167 goto out;
2180 } 2168 }
2181 tp = tcp_sk(sk); 2169 tp = tcp_sk(sk);
2182 read_lock_bh(&tp->syn_wait_lock); 2170 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2183 if (tp->listen_opt && tp->listen_opt->qlen) { 2171 if (reqsk_queue_len(&tp->accept_queue)) {
2184start_req: 2172start_req:
2185 st->uid = sock_i_uid(sk); 2173 st->uid = sock_i_uid(sk);
2186 st->syn_wait_sk = sk; 2174 st->syn_wait_sk = sk;
@@ -2188,7 +2176,7 @@ start_req:
2188 st->sbucket = 0; 2176 st->sbucket = 0;
2189 goto get_req; 2177 goto get_req;
2190 } 2178 }
2191 read_unlock_bh(&tp->syn_wait_lock); 2179 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2192 } 2180 }
2193 if (++st->bucket < TCP_LHTABLE_SIZE) { 2181 if (++st->bucket < TCP_LHTABLE_SIZE) {
2194 sk = sk_head(&tcp_listening_hash[st->bucket]); 2182 sk = sk_head(&tcp_listening_hash[st->bucket]);
@@ -2375,7 +2363,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2375 case TCP_SEQ_STATE_OPENREQ: 2363 case TCP_SEQ_STATE_OPENREQ:
2376 if (v) { 2364 if (v) {
2377 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); 2365 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2378 read_unlock_bh(&tp->syn_wait_lock); 2366 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2379 } 2367 }
2380 case TCP_SEQ_STATE_LISTENING: 2368 case TCP_SEQ_STATE_LISTENING:
2381 if (v != SEQ_START_TOKEN) 2369 if (v != SEQ_START_TOKEN)
@@ -2451,18 +2439,19 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2451 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 2439 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2452} 2440}
2453 2441
2454static void get_openreq4(struct sock *sk, struct open_request *req, 2442static void get_openreq4(struct sock *sk, struct request_sock *req,
2455 char *tmpbuf, int i, int uid) 2443 char *tmpbuf, int i, int uid)
2456{ 2444{
2445 const struct inet_request_sock *ireq = inet_rsk(req);
2457 int ttd = req->expires - jiffies; 2446 int ttd = req->expires - jiffies;
2458 2447
2459 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" 2448 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2460 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", 2449 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2461 i, 2450 i,
2462 req->af.v4_req.loc_addr, 2451 ireq->loc_addr,
2463 ntohs(inet_sk(sk)->sport), 2452 ntohs(inet_sk(sk)->sport),
2464 req->af.v4_req.rmt_addr, 2453 ireq->rmt_addr,
2465 ntohs(req->rmt_port), 2454 ntohs(ireq->rmt_port),
2466 TCP_SYN_RECV, 2455 TCP_SYN_RECV,
2467 0, 0, /* could print option size, but that is af dependent. */ 2456 0, 0, /* could print option size, but that is af dependent. */
2468 1, /* timers active (only the expire timer) */ 2457 1, /* timers active (only the expire timer) */
@@ -2618,6 +2607,7 @@ struct proto tcp_prot = {
2618 .sysctl_rmem = sysctl_tcp_rmem, 2607 .sysctl_rmem = sysctl_tcp_rmem,
2619 .max_header = MAX_TCP_HEADER, 2608 .max_header = MAX_TCP_HEADER,
2620 .obj_size = sizeof(struct tcp_sock), 2609 .obj_size = sizeof(struct tcp_sock),
2610 .rsk_prot = &tcp_request_sock_ops,
2621}; 2611};
2622 2612
2623 2613
@@ -2660,7 +2650,6 @@ EXPORT_SYMBOL(tcp_proc_register);
2660EXPORT_SYMBOL(tcp_proc_unregister); 2650EXPORT_SYMBOL(tcp_proc_unregister);
2661#endif 2651#endif
2662EXPORT_SYMBOL(sysctl_local_port_range); 2652EXPORT_SYMBOL(sysctl_local_port_range);
2663EXPORT_SYMBOL(sysctl_max_syn_backlog);
2664EXPORT_SYMBOL(sysctl_tcp_low_latency); 2653EXPORT_SYMBOL(sysctl_tcp_low_latency);
2665EXPORT_SYMBOL(sysctl_tcp_tw_reuse); 2654EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2666 2655
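The conversion in this file is mechanical but worth spelling out once: IPv4 request fields move from req->af.v4_req.* into the inet_rsk() view of the new struct request_sock, TCP sequence numbers move behind tcp_rsk(), req->class becomes req->rsk_ops, and the old or_calltable becomes a request_sock_ops whose obj_size lets reqsk_alloc() size the object for the owning protocol (hooked up via .rsk_prot in tcp_prot above). A sketch of the accessor style, using only fields the hunks above touch; the wrapper function itself is hypothetical:

#include <net/tcp.h>

/* Illustration only: how converted code reaches request fields. */
static void example_fill_request(struct request_sock *req,
                                 __u32 saddr, __u32 daddr, __u32 isn)
{
        struct inet_request_sock *ireq = inet_rsk(req);

        ireq->loc_addr = daddr;         /* was req->af.v4_req.loc_addr */
        ireq->rmt_addr = saddr;         /* was req->af.v4_req.rmt_addr */
        tcp_rsk(req)->snt_isn = isn;    /* was req->snt_isn            */
}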
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index eea1a17a9ac2..f42a284164b7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -684,7 +684,7 @@ out:
684 * Actually, we could save lots of memory writes here. tp of listening 684 * Actually, we could save lots of memory writes here. tp of listening
685 * socket contains all necessary default parameters. 685 * socket contains all necessary default parameters.
686 */ 686 */
687struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) 687struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
688{ 688{
689 /* allocate the newsk from the same slab of the master sock, 689 /* allocate the newsk from the same slab of the master sock,
690 * if not, at sk_free time we'll try to free it from the wrong 690 * if not, at sk_free time we'll try to free it from the wrong
@@ -692,6 +692,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); 692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
693 693
694 if(newsk != NULL) { 694 if(newsk != NULL) {
695 struct inet_request_sock *ireq = inet_rsk(req);
696 struct tcp_request_sock *treq = tcp_rsk(req);
695 struct tcp_sock *newtp; 697 struct tcp_sock *newtp;
696 struct sk_filter *filter; 698 struct sk_filter *filter;
697 699
@@ -703,7 +705,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
703 tcp_sk(newsk)->bind_hash = NULL; 705 tcp_sk(newsk)->bind_hash = NULL;
704 706
705 /* Clone the TCP header template */ 707 /* Clone the TCP header template */
706 inet_sk(newsk)->dport = req->rmt_port; 708 inet_sk(newsk)->dport = ireq->rmt_port;
707 709
708 sock_lock_init(newsk); 710 sock_lock_init(newsk);
709 bh_lock_sock(newsk); 711 bh_lock_sock(newsk);
@@ -739,14 +741,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
739 /* Now setup tcp_sock */ 741 /* Now setup tcp_sock */
740 newtp = tcp_sk(newsk); 742 newtp = tcp_sk(newsk);
741 newtp->pred_flags = 0; 743 newtp->pred_flags = 0;
742 newtp->rcv_nxt = req->rcv_isn + 1; 744 newtp->rcv_nxt = treq->rcv_isn + 1;
743 newtp->snd_nxt = req->snt_isn + 1; 745 newtp->snd_nxt = treq->snt_isn + 1;
744 newtp->snd_una = req->snt_isn + 1; 746 newtp->snd_una = treq->snt_isn + 1;
745 newtp->snd_sml = req->snt_isn + 1; 747 newtp->snd_sml = treq->snt_isn + 1;
746 748
747 tcp_prequeue_init(newtp); 749 tcp_prequeue_init(newtp);
748 750
749 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn); 751 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
750 752
751 newtp->retransmits = 0; 753 newtp->retransmits = 0;
752 newtp->backoff = 0; 754 newtp->backoff = 0;
@@ -772,13 +774,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
772 newtp->frto_counter = 0; 774 newtp->frto_counter = 0;
773 newtp->frto_highmark = 0; 775 newtp->frto_highmark = 0;
774 776
777 newtp->ca_ops = &tcp_reno;
778
775 tcp_set_ca_state(newtp, TCP_CA_Open); 779 tcp_set_ca_state(newtp, TCP_CA_Open);
776 tcp_init_xmit_timers(newsk); 780 tcp_init_xmit_timers(newsk);
777 skb_queue_head_init(&newtp->out_of_order_queue); 781 skb_queue_head_init(&newtp->out_of_order_queue);
778 newtp->rcv_wup = req->rcv_isn + 1; 782 newtp->rcv_wup = treq->rcv_isn + 1;
779 newtp->write_seq = req->snt_isn + 1; 783 newtp->write_seq = treq->snt_isn + 1;
780 newtp->pushed_seq = newtp->write_seq; 784 newtp->pushed_seq = newtp->write_seq;
781 newtp->copied_seq = req->rcv_isn + 1; 785 newtp->copied_seq = treq->rcv_isn + 1;
782 786
783 newtp->rx_opt.saw_tstamp = 0; 787 newtp->rx_opt.saw_tstamp = 0;
784 788
@@ -788,10 +792,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
788 newtp->probes_out = 0; 792 newtp->probes_out = 0;
789 newtp->rx_opt.num_sacks = 0; 793 newtp->rx_opt.num_sacks = 0;
790 newtp->urg_data = 0; 794 newtp->urg_data = 0;
791 newtp->listen_opt = NULL; 795 /* Deinitialize accept_queue to trap illegal accesses. */
792 newtp->accept_queue = newtp->accept_queue_tail = NULL; 796 memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
793 /* Deinitialize syn_wait_lock to trap illegal accesses. */
794 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
795 797
796 /* Back to base struct sock members. */ 798 /* Back to base struct sock members. */
797 newsk->sk_err = 0; 799 newsk->sk_err = 0;
@@ -808,18 +810,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
808 newsk->sk_socket = NULL; 810 newsk->sk_socket = NULL;
809 newsk->sk_sleep = NULL; 811 newsk->sk_sleep = NULL;
810 812
811 newtp->rx_opt.tstamp_ok = req->tstamp_ok; 813 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
812 if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) { 814 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
813 if (sysctl_tcp_fack) 815 if (sysctl_tcp_fack)
814 newtp->rx_opt.sack_ok |= 2; 816 newtp->rx_opt.sack_ok |= 2;
815 } 817 }
816 newtp->window_clamp = req->window_clamp; 818 newtp->window_clamp = req->window_clamp;
817 newtp->rcv_ssthresh = req->rcv_wnd; 819 newtp->rcv_ssthresh = req->rcv_wnd;
818 newtp->rcv_wnd = req->rcv_wnd; 820 newtp->rcv_wnd = req->rcv_wnd;
819 newtp->rx_opt.wscale_ok = req->wscale_ok; 821 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
820 if (newtp->rx_opt.wscale_ok) { 822 if (newtp->rx_opt.wscale_ok) {
821 newtp->rx_opt.snd_wscale = req->snd_wscale; 823 newtp->rx_opt.snd_wscale = ireq->snd_wscale;
822 newtp->rx_opt.rcv_wscale = req->rcv_wscale; 824 newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
823 } else { 825 } else {
824 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; 826 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
825 newtp->window_clamp = min(newtp->window_clamp, 65535U); 827 newtp->window_clamp = min(newtp->window_clamp, 65535U);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
842 if (newtp->ecn_flags&TCP_ECN_OK) 844 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND); 845 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844 846
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); 847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 } 848 }
849 return newsk; 849 return newsk;
@@ -851,12 +851,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
851 851
852/* 852/*
853 * Process an incoming packet for SYN_RECV sockets represented 853 * Process an incoming packet for SYN_RECV sockets represented
854 * as an open_request. 854 * as a request_sock.
855 */ 855 */
856 856
857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, 857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
858 struct open_request *req, 858 struct request_sock *req,
859 struct open_request **prev) 859 struct request_sock **prev)
860{ 860{
861 struct tcphdr *th = skb->h.th; 861 struct tcphdr *th = skb->h.th;
862 struct tcp_sock *tp = tcp_sk(sk); 862 struct tcp_sock *tp = tcp_sk(sk);
@@ -881,7 +881,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
881 } 881 }
882 882
883 /* Check for pure retransmitted SYN. */ 883 /* Check for pure retransmitted SYN. */
884 if (TCP_SKB_CB(skb)->seq == req->rcv_isn && 884 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
885 flg == TCP_FLAG_SYN && 885 flg == TCP_FLAG_SYN &&
886 !paws_reject) { 886 !paws_reject) {
887 /* 887 /*
@@ -901,7 +901,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
901 * Enforce "SYN-ACK" according to figure 8, figure 6 901 * Enforce "SYN-ACK" according to figure 8, figure 6
902 * of RFC793, fixed by RFC1122. 902 * of RFC793, fixed by RFC1122.
903 */ 903 */
904 req->class->rtx_syn_ack(sk, req, NULL); 904 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
905 return NULL; 905 return NULL;
906 } 906 }
907 907
@@ -959,7 +959,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
959 * Invalid ACK: reset will be sent by listening socket 959 * Invalid ACK: reset will be sent by listening socket
960 */ 960 */
961 if ((flg & TCP_FLAG_ACK) && 961 if ((flg & TCP_FLAG_ACK) &&
962 (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)) 962 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
963 return sk; 963 return sk;
964 964
965 /* Also, it would not be a bad idea to check rcv_tsecr, which 965 /* Also, it would not be a bad idea to check rcv_tsecr, which
@@ -970,10 +970,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
970 /* RFC793: "first check sequence number". */ 970 /* RFC793: "first check sequence number". */
971 971
972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
973 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { 973 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
974 /* Out of window: send ACK and drop. */ 974 /* Out of window: send ACK and drop. */
975 if (!(flg & TCP_FLAG_RST)) 975 if (!(flg & TCP_FLAG_RST))
976 req->class->send_ack(skb, req); 976 req->rsk_ops->send_ack(skb, req);
977 if (paws_reject) 977 if (paws_reject)
978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); 978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
979 return NULL; 979 return NULL;
@@ -981,12 +981,12 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
981 981
982 /* In sequence, PAWS is OK. */ 982 /* In sequence, PAWS is OK. */
983 983
984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) 984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
985 req->ts_recent = tmp_opt.rcv_tsval; 985 req->ts_recent = tmp_opt.rcv_tsval;
986 986
987 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { 987 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
988 /* Truncate SYN, it is out of window starting 988 /* Truncate SYN, it is out of window starting
989 at req->rcv_isn+1. */ 989 at tcp_rsk(req)->rcv_isn + 1. */
990 flg &= ~TCP_FLAG_SYN; 990 flg &= ~TCP_FLAG_SYN;
991 } 991 }
992 992
@@ -1003,8 +1003,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1003 return NULL; 1003 return NULL;
1004 1004
1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ 1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { 1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
1007 req->acked = 1; 1007 inet_rsk(req)->acked = 1;
1008 return NULL; 1008 return NULL;
1009 } 1009 }
1010 1010
@@ -1026,14 +1026,14 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1026 1026
1027 listen_overflow: 1027 listen_overflow:
1028 if (!sysctl_tcp_abort_on_overflow) { 1028 if (!sysctl_tcp_abort_on_overflow) {
1029 req->acked = 1; 1029 inet_rsk(req)->acked = 1;
1030 return NULL; 1030 return NULL;
1031 } 1031 }
1032 1032
1033 embryonic_reset: 1033 embryonic_reset:
1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); 1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
1035 if (!(flg & TCP_FLAG_RST)) 1035 if (!(flg & TCP_FLAG_RST))
1036 req->class->send_reset(skb); 1036 req->rsk_ops->send_reset(skb);
1037 1037
1038 tcp_synq_drop(sk, req, prev); 1038 tcp_synq_drop(sk, req, prev);
1039 return NULL; 1039 return NULL;
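Two details in this file matter for the congestion-control split: the child socket minted from a request starts out on the built-in &tcp_reno ops, and only picks up the configured module when tcp_init_congestion_control() runs at handshake completion (see the tcp_input.c hunks earlier); and the old per-listener listen_opt / syn_wait_lock fields give way to the embedded accept_queue, which is memset in the child to trap stray accesses. For orientation, the kind of callback &tcp_reno has to provide is tiny; a sketch of a conventional Reno ssthresh hook (the real one lives in tcp_cong.c, not shown in this diff):

#include <net/tcp.h>

/* Sketch: classic Reno backoff, halve cwnd but never go below 2 segments. */
static u32 example_reno_ssthresh(struct tcp_sock *tp)
{
        return max(tp->snd_cwnd >> 1U, 2U);
}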
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa24e7ae1f40..0e17c244875c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
@@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 515 * skbs, which it never sent before. --ANK
522 */ 516 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 517 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
518 buff->stamp = skb->stamp;
524 519
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 520 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 521 tp->lost_out -= tcp_skb_pcount(skb);
@@ -1356,8 +1351,9 @@ int tcp_send_synack(struct sock *sk)
1356 * Prepare a SYN-ACK. 1351 * Prepare a SYN-ACK.
1357 */ 1352 */
1358struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, 1353struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1359 struct open_request *req) 1354 struct request_sock *req)
1360{ 1355{
1356 struct inet_request_sock *ireq = inet_rsk(req);
1361 struct tcp_sock *tp = tcp_sk(sk); 1357 struct tcp_sock *tp = tcp_sk(sk);
1362 struct tcphdr *th; 1358 struct tcphdr *th;
1363 int tcp_header_size; 1359 int tcp_header_size;
@@ -1373,47 +1369,47 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1373 skb->dst = dst_clone(dst); 1369 skb->dst = dst_clone(dst);
1374 1370
1375 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + 1371 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1376 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + 1372 (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1377 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + 1373 (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1378 /* SACK_PERM is in the place of NOP NOP of TS */ 1374 /* SACK_PERM is in the place of NOP NOP of TS */
1379 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); 1375 ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1380 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); 1376 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1381 1377
1382 memset(th, 0, sizeof(struct tcphdr)); 1378 memset(th, 0, sizeof(struct tcphdr));
1383 th->syn = 1; 1379 th->syn = 1;
1384 th->ack = 1; 1380 th->ack = 1;
1385 if (dst->dev->features&NETIF_F_TSO) 1381 if (dst->dev->features&NETIF_F_TSO)
1386 req->ecn_ok = 0; 1382 ireq->ecn_ok = 0;
1387 TCP_ECN_make_synack(req, th); 1383 TCP_ECN_make_synack(req, th);
1388 th->source = inet_sk(sk)->sport; 1384 th->source = inet_sk(sk)->sport;
1389 th->dest = req->rmt_port; 1385 th->dest = ireq->rmt_port;
1390 TCP_SKB_CB(skb)->seq = req->snt_isn; 1386 TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
1391 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; 1387 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1392 TCP_SKB_CB(skb)->sacked = 0; 1388 TCP_SKB_CB(skb)->sacked = 0;
1393 skb_shinfo(skb)->tso_segs = 1; 1389 skb_shinfo(skb)->tso_segs = 1;
1394 skb_shinfo(skb)->tso_size = 0; 1390 skb_shinfo(skb)->tso_size = 0;
1395 th->seq = htonl(TCP_SKB_CB(skb)->seq); 1391 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1396 th->ack_seq = htonl(req->rcv_isn + 1); 1392 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
1397 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 1393 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1398 __u8 rcv_wscale; 1394 __u8 rcv_wscale;
1399 /* Set this up on the first call only */ 1395 /* Set this up on the first call only */
1400 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 1396 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1401 /* tcp_full_space because it is guaranteed to be the first packet */ 1397 /* tcp_full_space because it is guaranteed to be the first packet */
1402 tcp_select_initial_window(tcp_full_space(sk), 1398 tcp_select_initial_window(tcp_full_space(sk),
1403 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 1399 dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1404 &req->rcv_wnd, 1400 &req->rcv_wnd,
1405 &req->window_clamp, 1401 &req->window_clamp,
1406 req->wscale_ok, 1402 ireq->wscale_ok,
1407 &rcv_wscale); 1403 &rcv_wscale);
1408 req->rcv_wscale = rcv_wscale; 1404 ireq->rcv_wscale = rcv_wscale;
1409 } 1405 }
1410 1406
1411 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 1407 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1412 th->window = htons(req->rcv_wnd); 1408 th->window = htons(req->rcv_wnd);
1413 1409
1414 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1410 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1415 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok, 1411 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
1416 req->sack_ok, req->wscale_ok, req->rcv_wscale, 1412 ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
1417 TCP_SKB_CB(skb)->when, 1413 TCP_SKB_CB(skb)->when,
1418 req->ts_recent); 1414 req->ts_recent);
1419 1415
@@ -1448,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk)
1448 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1444 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1449 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1445 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1450 tcp_initialize_rcv_mss(sk); 1446 tcp_initialize_rcv_mss(sk);
1451 tcp_ca_init(tp);
1452 1447
1453 tcp_select_initial_window(tcp_full_space(sk), 1448 tcp_select_initial_window(tcp_full_space(sk),
1454 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1449 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1502,7 +1497,6 @@ int tcp_connect(struct sock *sk)
1502 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1497 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1503 tp->snd_nxt = tp->write_seq; 1498 tp->snd_nxt = tp->write_seq;
1504 tp->pushed_seq = tp->write_seq; 1499 tp->pushed_seq = tp->write_seq;
1505 tcp_ca_init(tp);
1506 1500
1507 /* Send it off. */ 1501 /* Send it off. */
1508 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1502 TCP_SKB_CB(buff)->when = tcp_time_stamp;
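The timestamping added above exists solely to feed microsecond RTT samples to modules that ask for them: tcp_transmit_skb() stamps the skb with do_gettimeofday() only when tp->ca_ops->rtt_sample is set, tcp_fragment() copies the stamp so split segments keep it, and tcp_clean_rtx_queue() (see the tcp_ack() hunk earlier) passes the resulting seq_usrtt to the module. On the module side the hook is usually just a min-filter; a sketch with hypothetical private state (Vegas's real version, tcp_vegas_rtt_calc(), appears below):

#include <net/tcp.h>

/* Hypothetical private state; an .init hook would set min_usrtt to 0x7fffffff. */
struct example_rtt {
        u32 min_usrtt;
};

/* Called with a microsecond RTT for newly ACKed data when .rtt_sample is set. */
static void example_rtt_sample(struct tcp_sock *tp, u32 usrtt)
{
        struct example_rtt *ca = tcp_ca(tp);
        u32 vrtt = usrtt + 1;           /* never allow a zero sample */

        ca->min_usrtt = min(ca->min_usrtt, vrtt);
}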
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000000..70e108e15c71
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
1/* Tom Kelly's Scalable TCP
2 *
3 * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
4 *
5 * John Heffner <jheffner@sc.edu>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <net/tcp.h>
11
12/* These factors are derived from the recommended values in the paper:
13 * .01 and 7/8. We use 50 instead of 100 to account for
14 * delayed ack.
15 */
16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3
18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
20 u32 in_flight, int flag)
21{
22 if (in_flight < tp->snd_cwnd)
23 return;
24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) {
26 tp->snd_cwnd++;
27 } else {
28 tp->snd_cwnd_cnt++;
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 tp->snd_cwnd++;
31 tp->snd_cwnd_cnt = 0;
32 }
33 }
34 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
35 tp->snd_cwnd_stamp = tcp_time_stamp;
36}
37
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
39{
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41}
42
43
44static struct tcp_congestion_ops tcp_scalable = {
45 .ssthresh = tcp_scalable_ssthresh,
46 .cong_avoid = tcp_scalable_cong_avoid,
47 .min_cwnd = tcp_reno_min_cwnd,
48
49 .owner = THIS_MODULE,
50 .name = "scalable",
51};
52
53static int __init tcp_scalable_register(void)
54{
55 return tcp_register_congestion_control(&tcp_scalable);
56}
57
58static void __exit tcp_scalable_unregister(void)
59{
60 tcp_unregister_congestion_control(&tcp_scalable);
61}
62
63module_init(tcp_scalable_register);
64module_exit(tcp_scalable_unregister);
65
66MODULE_AUTHOR("John Heffner");
67MODULE_LICENSE("GPL");
68MODULE_DESCRIPTION("Scalable TCP");
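The two constants encode the paper's parameters in integer arithmetic: TCP_SCALABLE_MD_SCALE = 3 turns the backoff into cwnd - (cwnd >> 3), i.e. the 7/8 multiplicative decrease, and TCP_SCALABLE_AI_CNT = 50 caps the additive-increase interval at 50 ACKs per extra segment, half the paper's 100, to allow for delayed ACKs. A throwaway userspace check of the arithmetic (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned int cwnd = 100;

        /* multiplicative decrease: cwnd - cwnd/8 == 7/8 of cwnd (here 88) */
        printf("ssthresh at cwnd=%u -> %u\n", cwnd, cwnd - (cwnd >> 3));
        /* additive increase: one extra segment every min(cwnd, 50) ACKs */
        printf("ACKs per +1 segment -> %u\n", cwnd < 50 ? cwnd : 50);
        return 0;
}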
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 799ebe061e2c..b127b4498565 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -464,11 +464,11 @@ out_unlock:
464static void tcp_synack_timer(struct sock *sk) 464static void tcp_synack_timer(struct sock *sk)
465{ 465{
466 struct tcp_sock *tp = tcp_sk(sk); 466 struct tcp_sock *tp = tcp_sk(sk);
467 struct tcp_listen_opt *lopt = tp->listen_opt; 467 struct listen_sock *lopt = tp->accept_queue.listen_opt;
468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; 468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
469 int thresh = max_retries; 469 int thresh = max_retries;
470 unsigned long now = jiffies; 470 unsigned long now = jiffies;
471 struct open_request **reqp, *req; 471 struct request_sock **reqp, *req;
472 int i, budget; 472 int i, budget;
473 473
474 if (lopt == NULL || lopt->qlen == 0) 474 if (lopt == NULL || lopt->qlen == 0)
@@ -513,8 +513,8 @@ static void tcp_synack_timer(struct sock *sk)
513 while ((req = *reqp) != NULL) { 513 while ((req = *reqp) != NULL) {
514 if (time_after_eq(now, req->expires)) { 514 if (time_after_eq(now, req->expires)) {
515 if ((req->retrans < thresh || 515 if ((req->retrans < thresh ||
516 (req->acked && req->retrans < max_retries)) 516 (inet_rsk(req)->acked && req->retrans < max_retries))
517 && !req->class->rtx_syn_ack(sk, req, NULL)) { 517 && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
518 unsigned long timeo; 518 unsigned long timeo;
519 519
520 if (req->retrans++ == 0) 520 if (req->retrans++ == 0)
@@ -527,13 +527,9 @@ static void tcp_synack_timer(struct sock *sk)
527 } 527 }
528 528
529 /* Drop this request */ 529 /* Drop this request */
530 write_lock(&tp->syn_wait_lock); 530 tcp_synq_unlink(tp, req, reqp);
531 *reqp = req->dl_next; 531 reqsk_queue_removed(&tp->accept_queue, req);
532 write_unlock(&tp->syn_wait_lock); 532 reqsk_free(req);
533 lopt->qlen--;
534 if (req->retrans == 0)
535 lopt->qlen_young--;
536 tcp_openreq_free(req);
537 continue; 533 continue;
538 } 534 }
539 reqp = &req->dl_next; 535 reqp = &req->dl_next;
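The drop path in tcp_synack_timer() loses its open-coded bookkeeping (taking syn_wait_lock to unlink the request, decrementing qlen and qlen_young, freeing via tcp_openreq_free) in favour of three request_sock helpers. Read against the removed lines, the sequence a caller follows is, as a hedged sketch (helper names as they appear in this hunk; the wrapper function is hypothetical):

#include <net/tcp.h>

/* Illustration only: dropping one pending request with the new helpers. */
static void example_drop_request(struct tcp_sock *tp, struct request_sock *req,
                                 struct request_sock **reqp)
{
        tcp_synq_unlink(tp, req, reqp);              /* unlink from the syn table     */
        reqsk_queue_removed(&tp->accept_queue, req); /* qlen/qlen_young accounting    */
        reqsk_free(req);                             /* destructor + free the request */
}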
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000000..9bd443db5193
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 */
33
34#include <linux/config.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h>
39
40#include <net/tcp.h>
41
42/* Default values of the Vegas variables, in fixed-point representation
43 * with V_PARAM_SHIFT bits to the right of the binary point.
44 */
45#define V_PARAM_SHIFT 1
46static int alpha = 1<<V_PARAM_SHIFT;
47static int beta = 3<<V_PARAM_SHIFT;
48static int gamma = 1<<V_PARAM_SHIFT;
49
50module_param(alpha, int, 0644);
51MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
52module_param(beta, int, 0644);
53MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
54module_param(gamma, int, 0644);
55MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
56
57
58/* Vegas variables */
59struct vegas {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now;/* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67};
68
69/* There are several situations when we must "re-start" Vegas:
70 *
71 * o when a connection is established
72 * o after an RTO
73 * o after fast recovery
74 * o when we send a packet and there is no outstanding
75 * unacknowledged data (restarting an idle connection)
76 *
77 * In these circumstances we cannot do a Vegas calculation at the
78 * end of the first RTT, because any calculation we do is using
79 * stale info -- both the saved cwnd and congestion feedback are
80 * stale.
81 *
82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs.
84 */
85static inline void vegas_enable(struct tcp_sock *tp)
86{
87 struct vegas *vegas = tcp_ca(tp);
88
89 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1;
91
92 /* Set the beginning of the next send window. */
93 vegas->beg_snd_nxt = tp->snd_nxt;
94
95 vegas->cntRTT = 0;
96 vegas->minRTT = 0x7fffffff;
97}
98
99/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp)
101{
102 struct vegas *vegas = tcp_ca(tp);
103
104 vegas->doing_vegas_now = 0;
105}
106
107static void tcp_vegas_init(struct tcp_sock *tp)
108{
109 struct vegas *vegas = tcp_ca(tp);
110
111 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp);
113}
114
115/* Do RTT sampling needed for Vegas.
116 * Basically we:
117 * o min-filter RTT samples from within an RTT to get the current
118 * propagation delay + queuing delay (we are min-filtering to try to
119 * avoid the effects of delayed ACKs)
120 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT)
122 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
124{
125 struct vegas *vegas = tcp_ca(tp);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127
128 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT)
130 vegas->baseRTT = vrtt;
131
132 /* Find the min RTT during the last RTT to find
133 * the current prop. delay + queuing delay:
134 */
135 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++;
137}
138
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
140{
141
142 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp);
144 else
145 vegas_disable(tp);
146}
147
148/*
149 * If the connection is idle and we are restarting,
150 * then we don't want to do any Vegas calculations
151 * until we get fresh RTT samples. So when we
152 * restart, we reset our Vegas state to a clean
153 * slate. After we get acks for this flight of
154 * packets, _then_ we can make Vegas calculations
155 * again.
156 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
158{
159 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp);
162}
163
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag)
166{
167 struct vegas *vegas = tcp_ca(tp);
168
169 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
171
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 *
174 * These are so named because they represent the approximate values
175 * of snd_una and snd_nxt at the beginning of the current RTT. More
176 * precisely, they represent the amount of data sent during the RTT.
177 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
178 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
179 * bytes of data have been ACKed during the course of the RTT, giving
180 * an "actual" rate of:
181 *
182 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
183 *
184 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
185 * because delayed ACKs can cover more than one segment, so they
186 * don't line up nicely with the boundaries of RTTs.
187 *
188 * Another unfortunate fact of life is that delayed ACKs delay the
189 * advance of the left edge of our send window, so that the number
190 * of bytes we send in an RTT is often less than our cwnd will allow.
191 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
192 */
193
194 if (after(ack, vegas->beg_snd_nxt)) {
195 /* Do the Vegas once-per-RTT cwnd adjustment. */
196 u32 old_wnd, old_snd_cwnd;
197
198
199 /* Here old_wnd is essentially the window of data that was
200 * sent during the previous RTT, and has all
201 * been acknowledged in the course of the RTT that ended
202 * with the ACK we just received. Likewise, old_snd_cwnd
203 * is the cwnd during the previous RTT.
204 */
205 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
206 tp->mss_cache;
207 old_snd_cwnd = vegas->beg_snd_cwnd;
208
209 /* Save the extent of the current window so we can use this
210 * at the end of the next RTT.
211 */
212 vegas->beg_snd_una = vegas->beg_snd_nxt;
213 vegas->beg_snd_nxt = tp->snd_nxt;
214 vegas->beg_snd_cwnd = tp->snd_cwnd;
215
216 /* Take into account the current RTT sample too, to
217 * decrease the impact of delayed acks. This double counts
218 * this sample since we count it for the next window as well,
219 * but that's not too awful, since we're taking the min,
220 * rather than averaging.
221 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000);
223
224 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got
226 * at least one RTT sample that wasn't from a delayed ACK.
227 * If we only had 2 samples total,
228 * then that means we're getting only 1 ACK per RTT, which
229 * means they're almost certainly delayed ACKs.
230 * If we have 3 samples, we should be OK.
231 */
232
233 if (vegas->cntRTT <= 2) {
234 /* We don't have enough RTT samples to do the Vegas
235 * calculation, so we'll behave like Reno.
236 */
237 if (tp->snd_cwnd > tp->snd_ssthresh)
238 tp->snd_cwnd++;
239 } else {
240 u32 rtt, target_cwnd, diff;
241
242 /* We have enough RTT samples, so, using the Vegas
243 * algorithm, we determine if we should increase or
244 * decrease cwnd, and by how much.
245 */
246
247 /* Pluck out the RTT we are using for the Vegas
248 * calculations. This is the min RTT seen during the
249 * last RTT. Taking the min filters out the effects
250 * of delayed ACKs, at the cost of noticing congestion
251 * a bit later.
252 */
253 rtt = vegas->minRTT;
254
255 /* Calculate the cwnd we should have, if we weren't
256 * going too fast.
257 *
258 * This is:
259 * (actual rate in segments) * baseRTT
260 * We keep it as a fixed point number with
261 * V_PARAM_SHIFT bits to the right of the binary point.
262 */
263 target_cwnd = ((old_wnd * vegas->baseRTT)
264 << V_PARAM_SHIFT) / rtt;
265
266 /* Calculate the difference between the window we had,
267 * and the window we would like to have. This quantity
268 * is the "Diff" from the Arizona Vegas papers.
269 *
270 * Again, this is a fixed point number with
271 * V_PARAM_SHIFT bits to the right of the binary
272 * point.
273 */
274 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
275
276 if (tp->snd_cwnd < tp->snd_ssthresh) {
277 /* Slow start. */
278 if (diff > gamma) {
279 /* Going too fast. Time to slow down
280 * and switch to congestion avoidance.
281 */
282 tp->snd_ssthresh = 2;
283
284 /* Set cwnd to match the actual rate
285 * exactly:
286 * cwnd = (actual rate) * baseRTT
287 * Then we add 1 because the integer
288 * truncation robs us of full link
289 * utilization.
290 */
291 tp->snd_cwnd = min(tp->snd_cwnd,
292 (target_cwnd >>
293 V_PARAM_SHIFT)+1);
294
295 }
296 } else {
297 /* Congestion avoidance. */
298 u32 next_snd_cwnd;
299
300 /* Figure out where we would like cwnd
301 * to be.
302 */
303 if (diff > beta) {
304 /* The old window was too fast, so
305 * we slow down.
306 */
307 next_snd_cwnd = old_snd_cwnd - 1;
308 } else if (diff < alpha) {
309 /* We don't have enough extra packets
310 * in the network, so speed up.
311 */
312 next_snd_cwnd = old_snd_cwnd + 1;
313 } else {
314 /* Sending just as fast as we
315 * should be.
316 */
317 next_snd_cwnd = old_snd_cwnd;
318 }
319
320 /* Adjust cwnd upward or downward, toward the
321 * desired value.
322 */
323 if (next_snd_cwnd > tp->snd_cwnd)
324 tp->snd_cwnd++;
325 else if (next_snd_cwnd < tp->snd_cwnd)
326 tp->snd_cwnd--;
327 }
328 }
329
330 /* Wipe the slate clean for the next RTT. */
331 vegas->cntRTT = 0;
332 vegas->minRTT = 0x7fffffff;
333 }
334
335 /* The following code is executed for every ack we receive,
336 * except for conditions checked in should_advance_cwnd()
337 * before the call to tcp_cong_avoid(). Mainly this means that
338 * we only execute this code if the ack actually acked some
339 * data.
340 */
341
342 /* If we are in slow start, increase our cwnd in response to this ACK.
343 * (If we are not in slow start then we are in congestion avoidance,
344 * and adjust our congestion window only once per RTT. See the code
345 * above.)
346 */
347 if (tp->snd_cwnd <= tp->snd_ssthresh)
348 tp->snd_cwnd++;
349
350 /* to keep cwnd from growing without bound */
351 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
352
353 /* Make sure that we are never so timid as to reduce our cwnd below
354 * 2 MSS.
355 *
356 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
357 */
358 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
359}
360
361/* Extract Vegas info for TCP socket diagnostics reported via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
363 struct sk_buff *skb)
364{
365 const struct vegas *ca = tcp_ca(tp);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
367 struct tcpvegas_info *info;
368
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
370 sizeof(*info)));
371
372 info->tcpv_enabled = ca->doing_vegas_now;
373 info->tcpv_rttcnt = ca->cntRTT;
374 info->tcpv_rtt = ca->baseRTT;
375 info->tcpv_minrtt = ca->minRTT;
376 rtattr_failure: ;
377 }
378}
379
380static struct tcp_congestion_ops tcp_vegas = {
381 .init = tcp_vegas_init,
382 .ssthresh = tcp_reno_ssthresh,
383 .cong_avoid = tcp_vegas_cong_avoid,
384 .min_cwnd = tcp_reno_min_cwnd,
385 .rtt_sample = tcp_vegas_rtt_calc,
386 .set_state = tcp_vegas_state,
387 .cwnd_event = tcp_vegas_cwnd_event,
388 .get_info = tcp_vegas_get_info,
389
390 .owner = THIS_MODULE,
391 .name = "vegas",
392};
393
394static int __init tcp_vegas_register(void)
395{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas);
398 return 0;
399}
400
401static void __exit tcp_vegas_unregister(void)
402{
403 tcp_unregister_congestion_control(&tcp_vegas);
404}
405
406module_init(tcp_vegas_register);
407module_exit(tcp_vegas_unregister);
408
409MODULE_AUTHOR("Stephen Hemminger");
410MODULE_LICENSE("GPL");
411MODULE_DESCRIPTION("TCP Vegas");
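
The once-per-RTT block above works in fixed point, keeping V_PARAM_SHIFT fractional bits in target_cwnd and diff. A minimal standalone sketch of that arithmetic is shown below; the sample values, and the assumption V_PARAM_SHIFT = 1 with alpha = 2 and beta = 6, are illustrative only and not necessarily the module's defaults.

#include <stdio.h>
#include <stdint.h>

/* Illustrative fixed-point parameters; assumed for this example only. */
#define V_PARAM_SHIFT 1
static const uint32_t alpha = 2, beta = 6;

int main(void)
{
	/* Hypothetical per-RTT measurements (segments and time units). */
	uint32_t old_wnd  = 20;   /* segments sent during the previous RTT */
	uint32_t base_rtt = 100;  /* smallest RTT ever observed */
	uint32_t rtt      = 125;  /* min RTT seen during the last RTT */

	/* target_cwnd = (actual rate) * baseRTT, as a fixed-point number
	 * with V_PARAM_SHIFT bits to the right of the binary point. */
	uint32_t target_cwnd = ((old_wnd * base_rtt) << V_PARAM_SHIFT) / rtt;
	uint32_t diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

	printf("target_cwnd = %u (fixed point), diff = %u\n",
	       (unsigned)target_cwnd, (unsigned)diff);

	if (diff < alpha)
		puts("diff < alpha: not enough extra packets in flight, grow cwnd");
	else if (diff > beta)
		puts("diff > beta: going too fast, shrink cwnd");
	else
		puts("alpha <= diff <= beta: cwnd is about right");
	return 0;
}

With these sample numbers, target_cwnd = (20 * 100 << 1) / 125 = 32 and diff = 40 - 32 = 8, which exceeds beta, so the congestion-avoidance branch would reduce cwnd by one.
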
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 000000000000..ef827242c940
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
1/*
2 * TCP Westwood+
3 *
4 * Angelo Dell'Aera: TCP Westwood+ support
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h>
12#include <net/tcp.h>
13
14/* TCP Westwood structure */
15struct westwood {
16 u32 bw_ns_est; /* first bandwidth estimation... not too smoothed 8) */
17 u32 bw_est; /* bandwidth estimate */
18 u32 rtt_win_sx; /* start of the current RTT evaluation window */
19 u32 bk; /* bytes acked during the current window */
20 u32 snd_una; /* used for evaluating the number of acked bytes */
21 u32 cumul_ack; /* bytes newly acked, adjusted for dup/delayed acks */
22 u32 accounted; /* bytes provisionally credited by duplicate acks */
23 u32 rtt; /* latest (smoothed) RTT sample, in jiffies */
24 u32 rtt_min; /* minimum observed RTT */
25};
26
27
28/* TCP Westwood functions and constants */
29#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
30#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31
32/*
33 * @tcp_westwood_init
34 * This function initializes the fields used in TCP Westwood+.
35 * It is called after the initial SYN, so the sequence numbers
36 * are correct, but for new passive connections we have no
37 * information about RTTmin yet, so we simply set it to
38 * TCP_WESTWOOD_INIT_RTT. That value is deliberately over-conservative
39 * so that it gets updated in a consistent way as soon as possible,
40 * which will reasonably happen within the first RTT of the
41 * connection's lifetime.
42 */
43static void tcp_westwood_init(struct tcp_sock *tp)
44{
45 struct westwood *w = tcp_ca(tp);
46
47 w->bk = 0;
48 w->bw_ns_est = 0;
49 w->bw_est = 0;
50 w->accounted = 0;
51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una;
55}
56
57/*
58 * @westwood_do_filter
59 * Low-pass filter. Implemented using constant coefficients.
60 */
61static inline u32 westwood_do_filter(u32 a, u32 b)
62{
63 return (((7 * a) + b) >> 3);
64}
65
66static inline void westwood_filter(struct westwood *w, u32 delta)
67{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
70}
71
72/*
73 * @westwood_pkts_acked
74 * Called after processing a group of packets,
75 * but all Westwood needs is the last sample of srtt.
76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
78{
79 struct westwood *w = tcp_ca(tp);
80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3;
82}
83
84/*
85 * @westwood_update_window
86 * Updates the RTT evaluation window if it is the right moment to
87 * do so; if it is, it runs the filter to evaluate the bandwidth.
88 */
89static void westwood_update_window(struct tcp_sock *tp)
90{
91 struct westwood *w = tcp_ca(tp);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93
94 /*
95 * See if an RTT window has passed.
96 * Be careful: if the RTT is less than
97 * 50ms we don't filter but keep 'building the sample',
98 * since estimating bandwidth over very small
99 * time intervals is best avoided.
100 * Obviously on a LAN we will reasonably always have
101 * right_bound = left_bound + WESTWOOD_RTT_MIN.
102 */
103 if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
104 westwood_filter(w, delta);
105
106 w->bk = 0;
107 w->rtt_win_sx = tcp_time_stamp;
108 }
109}
110
111/*
112 * @westwood_fast_bw
113 * Called when we are in the fast path, in particular when header
114 * prediction is successful. In that case the update is
115 * straightforward and doesn't need any particular care.
116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp)
118{
119 struct westwood *w = tcp_ca(tp);
120
121 westwood_update_window(tp);
122
123 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una;
125 w->rtt_min = min(w->rtt, w->rtt_min);
126}
127
128/*
129 * @westwood_acked_count
130 * This function computes cumul_ack, which is used to update bk in
131 * the case of delayed or partial acks.
132 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp)
134{
135 struct westwood *w = tcp_ca(tp);
136
137 w->cumul_ack = tp->snd_una - w->snd_una;
138
139 /* If cumul_ack is 0 this is a dupack since it's not moving
140 * tp->snd_una.
141 */
142 if (!w->cumul_ack) {
143 w->accounted += tp->mss_cache;
144 w->cumul_ack = tp->mss_cache;
145 }
146
147 if (w->cumul_ack > tp->mss_cache) {
148 /* Partial or delayed ack */
149 if (w->accounted >= w->cumul_ack) {
150 w->accounted -= w->cumul_ack;
151 w->cumul_ack = tp->mss_cache;
152 } else {
153 w->cumul_ack -= w->accounted;
154 w->accounted = 0;
155 }
156 }
157
158 w->snd_una = tp->snd_una;
159
160 return w->cumul_ack;
161}
162
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
164{
165 struct westwood *w = tcp_ca(tp);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167}
168
169/*
170 * TCP Westwood
171 * Here the limit is computed as bandwidth estimate * RTTmin (converted
172 * to packets using mss_cache). The result is clamped to be at least 2,
173 * so this never returns 0.
174 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
176{
177 return westwood_bw_rttmin(tp);
178}
179
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
181{
182 struct westwood *w = tcp_ca(tp);
183
184 switch(event) {
185 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp);
187 break;
188
189 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
191 break;
192
193 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp);
195 break;
196
197 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp);
199 w->bk += westwood_acked_count(tp);
200 w->rtt_min = min(w->rtt, w->rtt_min);
201 break;
202
203 default:
204 /* don't care */
205 break;
206 }
207}
208
209
210/* Extract Westwood info for TCP socket diagnostics reported via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
212 struct sk_buff *skb)
213{
214 const struct westwood *ca = tcp_ca(tp);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
216 struct rtattr *rta;
217 struct tcpvegas_info *info;
218
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0;
223 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
224 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
225 rtattr_failure: ;
226 }
227}
228
229
230static struct tcp_congestion_ops tcp_westwood = {
231 .init = tcp_westwood_init,
232 .ssthresh = tcp_reno_ssthresh,
233 .cong_avoid = tcp_reno_cong_avoid,
234 .min_cwnd = tcp_westwood_cwnd_min,
235 .cwnd_event = tcp_westwood_event,
236 .get_info = tcp_westwood_info,
237 .pkts_acked = tcp_westwood_pkts_acked,
238
239 .owner = THIS_MODULE,
240 .name = "westwood"
241};
242
243static int __init tcp_westwood_register(void)
244{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood);
247}
248
249static void __exit tcp_westwood_unregister(void)
250{
251 tcp_unregister_congestion_control(&tcp_westwood);
252}
253
254module_init(tcp_westwood_register);
255module_exit(tcp_westwood_unregister);
256
257MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
258MODULE_LICENSE("GPL");
259MODULE_DESCRIPTION("TCP Westwood+");
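
Westwood+ estimates bandwidth by low-pass filtering bk/delta at the end of each RTT window and, after loss, floors cwnd at bw_est * rtt_min converted to packets. A minimal standalone sketch of that arithmetic follows; every numeric value in it is a made-up sample chosen only for illustration.

#include <stdio.h>
#include <stdint.h>

/* 7/8 low-pass filter, mirroring westwood_do_filter() above. */
static uint32_t lp_filter(uint32_t a, uint32_t b)
{
	return ((7 * a) + b) >> 3;
}

int main(void)
{
	/* Hypothetical sample: 30000 bytes acked over a 100-tick window. */
	uint32_t bw_ns_est = 250;   /* previous unsmoothed estimate, bytes/tick */
	uint32_t bw_est    = 240;   /* previous smoothed estimate, bytes/tick */
	uint32_t bk = 30000, delta = 100;
	uint32_t rtt_min = 80, mss = 1460;   /* ticks, bytes */

	/* Two filtering stages, as in westwood_filter(). */
	bw_ns_est = lp_filter(bw_ns_est, bk / delta);
	bw_est    = lp_filter(bw_est, bw_ns_est);

	/* Post-loss cwnd floor: bandwidth * RTTmin, converted to packets
	 * and clamped to at least 2, as in westwood_bw_rttmin(). */
	uint32_t cwnd_min = (bw_est * rtt_min) / mss;
	if (cwnd_min < 2)
		cwnd_min = 2;

	printf("bw_est = %u bytes/tick, cwnd floor = %u packets\n",
	       (unsigned)bw_est, (unsigned)cwnd_min);
	return 0;
}

With the sample values, bk/delta = 300 bytes/tick, bw_ns_est becomes 256, bw_est becomes 242, and the cwnd floor works out to 13 packets.
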
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4a6952e3fee9..7c24e64b443f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -738,7 +738,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
738 unsigned long amount; 738 unsigned long amount;
739 739
740 amount = 0; 740 amount = 0;
741 spin_lock_irq(&sk->sk_receive_queue.lock); 741 spin_lock_bh(&sk->sk_receive_queue.lock);
742 skb = skb_peek(&sk->sk_receive_queue); 742 skb = skb_peek(&sk->sk_receive_queue);
743 if (skb != NULL) { 743 if (skb != NULL) {
744 /* 744 /*
@@ -748,7 +748,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
748 */ 748 */
749 amount = skb->len - sizeof(struct udphdr); 749 amount = skb->len - sizeof(struct udphdr);
750 } 750 }
751 spin_unlock_irq(&sk->sk_receive_queue.lock); 751 spin_unlock_bh(&sk->sk_receive_queue.lock);
752 return put_user(amount, (int __user *)arg); 752 return put_user(amount, (int __user *)arg);
753 } 753 }
754 754
@@ -848,12 +848,12 @@ csum_copy_err:
848 /* Clear queue. */ 848 /* Clear queue. */
849 if (flags&MSG_PEEK) { 849 if (flags&MSG_PEEK) {
850 int clear = 0; 850 int clear = 0;
851 spin_lock_irq(&sk->sk_receive_queue.lock); 851 spin_lock_bh(&sk->sk_receive_queue.lock);
852 if (skb == skb_peek(&sk->sk_receive_queue)) { 852 if (skb == skb_peek(&sk->sk_receive_queue)) {
853 __skb_unlink(skb, &sk->sk_receive_queue); 853 __skb_unlink(skb, &sk->sk_receive_queue);
854 clear = 1; 854 clear = 1;
855 } 855 }
856 spin_unlock_irq(&sk->sk_receive_queue.lock); 856 spin_unlock_bh(&sk->sk_receive_queue.lock);
857 if (clear) 857 if (clear)
858 kfree_skb(skb); 858 kfree_skb(skb);
859 } 859 }
@@ -1334,7 +1334,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
1334 struct sk_buff_head *rcvq = &sk->sk_receive_queue; 1334 struct sk_buff_head *rcvq = &sk->sk_receive_queue;
1335 struct sk_buff *skb; 1335 struct sk_buff *skb;
1336 1336
1337 spin_lock_irq(&rcvq->lock); 1337 spin_lock_bh(&rcvq->lock);
1338 while ((skb = skb_peek(rcvq)) != NULL) { 1338 while ((skb = skb_peek(rcvq)) != NULL) {
1339 if (udp_checksum_complete(skb)) { 1339 if (udp_checksum_complete(skb)) {
1340 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1340 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
@@ -1345,7 +1345,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
1345 break; 1345 break;
1346 } 1346 }
1347 } 1347 }
1348 spin_unlock_irq(&rcvq->lock); 1348 spin_unlock_bh(&rcvq->lock);
1349 1349
1350 /* nothing to see, move along */ 1350 /* nothing to see, move along */
1351 if (skb == NULL) 1351 if (skb == NULL)
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index af2392ae5769..66620a95942a 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -33,6 +33,7 @@ static void xfrm4_encap(struct sk_buff *skb)
33 struct dst_entry *dst = skb->dst; 33 struct dst_entry *dst = skb->dst;
34 struct xfrm_state *x = dst->xfrm; 34 struct xfrm_state *x = dst->xfrm;
35 struct iphdr *iph, *top_iph; 35 struct iphdr *iph, *top_iph;
36 int flags;
36 37
37 iph = skb->nh.iph; 38 iph = skb->nh.iph;
38 skb->h.ipiph = iph; 39 skb->h.ipiph = iph;
@@ -51,10 +52,13 @@ static void xfrm4_encap(struct sk_buff *skb)
51 52
52 /* DS disclosed */ 53 /* DS disclosed */
53 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); 54 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
54 if (x->props.flags & XFRM_STATE_NOECN) 55
56 flags = x->props.flags;
57 if (flags & XFRM_STATE_NOECN)
55 IP_ECN_clear(top_iph); 58 IP_ECN_clear(top_iph);
56 59
57 top_iph->frag_off = iph->frag_off & htons(IP_DF); 60 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
61 0 : (iph->frag_off & htons(IP_DF));
58 if (!top_iph->frag_off) 62 if (!top_iph->frag_off)
59 __ip_select_ident(top_iph, dst, 0); 63 __ip_select_ident(top_iph, dst, 0);
60 64
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 223a2e83853f..050611d7a967 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -7,12 +7,20 @@
7 * 7 *
8 */ 8 */
9 9
10#include <net/ip.h>
10#include <net/xfrm.h> 11#include <net/xfrm.h>
11#include <linux/pfkeyv2.h> 12#include <linux/pfkeyv2.h>
12#include <linux/ipsec.h> 13#include <linux/ipsec.h>
13 14
14static struct xfrm_state_afinfo xfrm4_state_afinfo; 15static struct xfrm_state_afinfo xfrm4_state_afinfo;
15 16
17static int xfrm4_init_flags(struct xfrm_state *x)
18{
19 if (ipv4_config.no_pmtu_disc)
20 x->props.flags |= XFRM_STATE_NOPMTUDISC;
21 return 0;
22}
23
16static void 24static void
17__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 25__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
18 struct xfrm_tmpl *tmpl, 26 struct xfrm_tmpl *tmpl,
@@ -109,6 +117,7 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
109static struct xfrm_state_afinfo xfrm4_state_afinfo = { 117static struct xfrm_state_afinfo xfrm4_state_afinfo = {
110 .family = AF_INET, 118 .family = AF_INET,
111 .lock = RW_LOCK_UNLOCKED, 119 .lock = RW_LOCK_UNLOCKED,
120 .init_flags = xfrm4_init_flags,
112 .init_tempsel = __xfrm4_init_tempsel, 121 .init_tempsel = __xfrm4_init_tempsel,
113 .state_lookup = __xfrm4_state_lookup, 122 .state_lookup = __xfrm4_state_lookup,
114 .find_acq = __xfrm4_find_acq, 123 .find_acq = __xfrm4_find_acq,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 413191f585f6..e1fe360ed27a 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -84,7 +84,7 @@ static void ipip_err(struct sk_buff *skb, u32 info)
84 handler->err_handler(skb, &arg); 84 handler->err_handler(skb, &arg);
85} 85}
86 86
87static int ipip_init_state(struct xfrm_state *x, void *args) 87static int ipip_init_state(struct xfrm_state *x)
88{ 88{
89 if (!x->props.mode) 89 if (!x->props.mode)
90 return -EINVAL; 90 return -EINVAL;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7744a2592693..a54d4ef3fd35 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -57,6 +57,7 @@
57#endif 57#endif
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/notifier.h> 59#include <linux/notifier.h>
60#include <linux/string.h>
60 61
61#include <net/sock.h> 62#include <net/sock.h>
62#include <net/snmp.h> 63#include <net/snmp.h>
@@ -131,7 +132,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
131 132
132static int addrconf_ifdown(struct net_device *dev, int how); 133static int addrconf_ifdown(struct net_device *dev, int how);
133 134
134static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags); 135static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
135static void addrconf_dad_timer(unsigned long data); 136static void addrconf_dad_timer(unsigned long data);
136static void addrconf_dad_completed(struct inet6_ifaddr *ifp); 137static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
137static void addrconf_rs_timer(unsigned long data); 138static void addrconf_rs_timer(unsigned long data);
@@ -372,6 +373,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
372 ndev->regen_timer.data = (unsigned long) ndev; 373 ndev->regen_timer.data = (unsigned long) ndev;
373 if ((dev->flags&IFF_LOOPBACK) || 374 if ((dev->flags&IFF_LOOPBACK) ||
374 dev->type == ARPHRD_TUNNEL || 375 dev->type == ARPHRD_TUNNEL ||
376 dev->type == ARPHRD_NONE ||
375 dev->type == ARPHRD_SIT) { 377 dev->type == ARPHRD_SIT) {
376 printk(KERN_INFO 378 printk(KERN_INFO
377 "Disabled Privacy Extensions on device %p(%s)\n", 379 "Disabled Privacy Extensions on device %p(%s)\n",
@@ -491,7 +493,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
491 493
492static struct inet6_ifaddr * 494static struct inet6_ifaddr *
493ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, 495ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
494 int scope, unsigned flags) 496 int scope, u32 flags)
495{ 497{
496 struct inet6_ifaddr *ifa = NULL; 498 struct inet6_ifaddr *ifa = NULL;
497 struct rt6_info *rt; 499 struct rt6_info *rt;
@@ -694,7 +696,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
694 696
695 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { 697 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
696 if (onlink == 0) { 698 if (onlink == 0) {
697 ip6_del_rt(rt, NULL, NULL); 699 ip6_del_rt(rt, NULL, NULL, NULL);
698 rt = NULL; 700 rt = NULL;
699 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { 701 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
700 rt->rt6i_expires = expires; 702 rt->rt6i_expires = expires;
@@ -1319,7 +1321,7 @@ static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
1319 1321
1320static void 1322static void
1321addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, 1323addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
1322 unsigned long expires, unsigned flags) 1324 unsigned long expires, u32 flags)
1323{ 1325{
1324 struct in6_rtmsg rtmsg; 1326 struct in6_rtmsg rtmsg;
1325 1327
@@ -1339,7 +1341,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
1339 if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) 1341 if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
1340 rtmsg.rtmsg_flags |= RTF_NONEXTHOP; 1342 rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
1341 1343
1342 ip6_route_add(&rtmsg, NULL, NULL); 1344 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1343} 1345}
1344 1346
1345/* Create "default" multicast route to the interface */ 1347/* Create "default" multicast route to the interface */
@@ -1356,7 +1358,7 @@ static void addrconf_add_mroute(struct net_device *dev)
1356 rtmsg.rtmsg_ifindex = dev->ifindex; 1358 rtmsg.rtmsg_ifindex = dev->ifindex;
1357 rtmsg.rtmsg_flags = RTF_UP; 1359 rtmsg.rtmsg_flags = RTF_UP;
1358 rtmsg.rtmsg_type = RTMSG_NEWROUTE; 1360 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1359 ip6_route_add(&rtmsg, NULL, NULL); 1361 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1360} 1362}
1361 1363
1362static void sit_route_add(struct net_device *dev) 1364static void sit_route_add(struct net_device *dev)
@@ -1373,7 +1375,7 @@ static void sit_route_add(struct net_device *dev)
1373 rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; 1375 rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP;
1374 rtmsg.rtmsg_ifindex = dev->ifindex; 1376 rtmsg.rtmsg_ifindex = dev->ifindex;
1375 1377
1376 ip6_route_add(&rtmsg, NULL, NULL); 1378 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1377} 1379}
1378 1380
1379static void addrconf_add_lroute(struct net_device *dev) 1381static void addrconf_add_lroute(struct net_device *dev)
@@ -1466,7 +1468,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
1466 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { 1468 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
1467 if (rt->rt6i_flags&RTF_EXPIRES) { 1469 if (rt->rt6i_flags&RTF_EXPIRES) {
1468 if (valid_lft == 0) { 1470 if (valid_lft == 0) {
1469 ip6_del_rt(rt, NULL, NULL); 1471 ip6_del_rt(rt, NULL, NULL, NULL);
1470 rt = NULL; 1472 rt = NULL;
1471 } else { 1473 } else {
1472 rt->rt6i_expires = rt_expires; 1474 rt->rt6i_expires = rt_expires;
@@ -2228,7 +2230,7 @@ out:
2228/* 2230/*
2229 * Duplicate Address Detection 2231 * Duplicate Address Detection
2230 */ 2232 */
2231static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags) 2233static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
2232{ 2234{
2233 struct inet6_dev *idev = ifp->idev; 2235 struct inet6_dev *idev = ifp->idev;
2234 struct net_device *dev = idev->dev; 2236 struct net_device *dev = idev->dev;
@@ -2621,15 +2623,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2621} 2623}
2622 2624
2623static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, 2625static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
2624 u32 pid, u32 seq, int event) 2626 u32 pid, u32 seq, int event, unsigned int flags)
2625{ 2627{
2626 struct ifaddrmsg *ifm; 2628 struct ifaddrmsg *ifm;
2627 struct nlmsghdr *nlh; 2629 struct nlmsghdr *nlh;
2628 struct ifa_cacheinfo ci; 2630 struct ifa_cacheinfo ci;
2629 unsigned char *b = skb->tail; 2631 unsigned char *b = skb->tail;
2630 2632
2631 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 2633 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
2632 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
2633 ifm = NLMSG_DATA(nlh); 2634 ifm = NLMSG_DATA(nlh);
2634 ifm->ifa_family = AF_INET6; 2635 ifm->ifa_family = AF_INET6;
2635 ifm->ifa_prefixlen = ifa->prefix_len; 2636 ifm->ifa_prefixlen = ifa->prefix_len;
@@ -2671,15 +2672,14 @@ rtattr_failure:
2671} 2672}
2672 2673
2673static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, 2674static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
2674 u32 pid, u32 seq, int event) 2675 u32 pid, u32 seq, int event, u16 flags)
2675{ 2676{
2676 struct ifaddrmsg *ifm; 2677 struct ifaddrmsg *ifm;
2677 struct nlmsghdr *nlh; 2678 struct nlmsghdr *nlh;
2678 struct ifa_cacheinfo ci; 2679 struct ifa_cacheinfo ci;
2679 unsigned char *b = skb->tail; 2680 unsigned char *b = skb->tail;
2680 2681
2681 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 2682 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
2682 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
2683 ifm = NLMSG_DATA(nlh); 2683 ifm = NLMSG_DATA(nlh);
2684 ifm->ifa_family = AF_INET6; 2684 ifm->ifa_family = AF_INET6;
2685 ifm->ifa_prefixlen = 128; 2685 ifm->ifa_prefixlen = 128;
@@ -2708,15 +2708,14 @@ rtattr_failure:
2708} 2708}
2709 2709
2710static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, 2710static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
2711 u32 pid, u32 seq, int event) 2711 u32 pid, u32 seq, int event, unsigned int flags)
2712{ 2712{
2713 struct ifaddrmsg *ifm; 2713 struct ifaddrmsg *ifm;
2714 struct nlmsghdr *nlh; 2714 struct nlmsghdr *nlh;
2715 struct ifa_cacheinfo ci; 2715 struct ifa_cacheinfo ci;
2716 unsigned char *b = skb->tail; 2716 unsigned char *b = skb->tail;
2717 2717
2718 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 2718 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
2719 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
2720 ifm = NLMSG_DATA(nlh); 2719 ifm = NLMSG_DATA(nlh);
2721 ifm->ifa_family = AF_INET6; 2720 ifm->ifa_family = AF_INET6;
2722 ifm->ifa_prefixlen = 128; 2721 ifm->ifa_prefixlen = 128;
@@ -2785,7 +2784,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2785 continue; 2784 continue;
2786 if ((err = inet6_fill_ifaddr(skb, ifa, 2785 if ((err = inet6_fill_ifaddr(skb, ifa,
2787 NETLINK_CB(cb->skb).pid, 2786 NETLINK_CB(cb->skb).pid,
2788 cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0) 2787 cb->nlh->nlmsg_seq, RTM_NEWADDR,
2788 NLM_F_MULTI)) <= 0)
2789 goto done; 2789 goto done;
2790 } 2790 }
2791 /* temp addr */ 2791 /* temp addr */
@@ -2796,7 +2796,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2796 continue; 2796 continue;
2797 if ((err = inet6_fill_ifaddr(skb, ifa, 2797 if ((err = inet6_fill_ifaddr(skb, ifa,
2798 NETLINK_CB(cb->skb).pid, 2798 NETLINK_CB(cb->skb).pid,
2799 cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0) 2799 cb->nlh->nlmsg_seq, RTM_NEWADDR,
2800 NLM_F_MULTI)) <= 0)
2800 goto done; 2801 goto done;
2801 } 2802 }
2802#endif 2803#endif
@@ -2809,7 +2810,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2809 continue; 2810 continue;
2810 if ((err = inet6_fill_ifmcaddr(skb, ifmca, 2811 if ((err = inet6_fill_ifmcaddr(skb, ifmca,
2811 NETLINK_CB(cb->skb).pid, 2812 NETLINK_CB(cb->skb).pid,
2812 cb->nlh->nlmsg_seq, RTM_GETMULTICAST)) <= 0) 2813 cb->nlh->nlmsg_seq, RTM_GETMULTICAST,
2814 NLM_F_MULTI)) <= 0)
2813 goto done; 2815 goto done;
2814 } 2816 }
2815 break; 2817 break;
@@ -2821,7 +2823,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2821 continue; 2823 continue;
2822 if ((err = inet6_fill_ifacaddr(skb, ifaca, 2824 if ((err = inet6_fill_ifacaddr(skb, ifaca,
2823 NETLINK_CB(cb->skb).pid, 2825 NETLINK_CB(cb->skb).pid,
2824 cb->nlh->nlmsg_seq, RTM_GETANYCAST)) <= 0) 2826 cb->nlh->nlmsg_seq, RTM_GETANYCAST,
2827 NLM_F_MULTI)) <= 0)
2825 goto done; 2828 goto done;
2826 } 2829 }
2827 break; 2830 break;
@@ -2871,7 +2874,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
2871 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); 2874 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS);
2872 return; 2875 return;
2873 } 2876 }
2874 if (inet6_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { 2877 if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
2875 kfree_skb(skb); 2878 kfree_skb(skb);
2876 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); 2879 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL);
2877 return; 2880 return;
@@ -2906,7 +2909,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
2906} 2909}
2907 2910
2908static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 2911static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
2909 u32 pid, u32 seq, int event) 2912 u32 pid, u32 seq, int event, unsigned int flags)
2910{ 2913{
2911 struct net_device *dev = idev->dev; 2914 struct net_device *dev = idev->dev;
2912 __s32 *array = NULL; 2915 __s32 *array = NULL;
@@ -2917,8 +2920,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
2917 __u32 mtu = dev->mtu; 2920 __u32 mtu = dev->mtu;
2918 struct ifla_cacheinfo ci; 2921 struct ifla_cacheinfo ci;
2919 2922
2920 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); 2923 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2921 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
2922 r = NLMSG_DATA(nlh); 2924 r = NLMSG_DATA(nlh);
2923 r->ifi_family = AF_INET6; 2925 r->ifi_family = AF_INET6;
2924 r->ifi_type = dev->type; 2926 r->ifi_type = dev->type;
@@ -2985,7 +2987,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
2985 if ((idev = in6_dev_get(dev)) == NULL) 2987 if ((idev = in6_dev_get(dev)) == NULL)
2986 continue; 2988 continue;
2987 err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid, 2989 err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid,
2988 cb->nlh->nlmsg_seq, RTM_NEWLINK); 2990 cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI);
2989 in6_dev_put(idev); 2991 in6_dev_put(idev);
2990 if (err <= 0) 2992 if (err <= 0)
2991 break; 2993 break;
@@ -3007,7 +3009,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
3007 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS); 3009 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS);
3008 return; 3010 return;
3009 } 3011 }
3010 if (inet6_fill_ifinfo(skb, idev, 0, 0, event) < 0) { 3012 if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) {
3011 kfree_skb(skb); 3013 kfree_skb(skb);
3012 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL); 3014 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL);
3013 return; 3015 return;
@@ -3017,18 +3019,15 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
3017} 3019}
3018 3020
3019static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, 3021static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
3020 struct prefix_info *pinfo, u32 pid, u32 seq, int event) 3022 struct prefix_info *pinfo, u32 pid, u32 seq,
3023 int event, unsigned int flags)
3021{ 3024{
3022 struct prefixmsg *pmsg; 3025 struct prefixmsg *pmsg;
3023 struct nlmsghdr *nlh; 3026 struct nlmsghdr *nlh;
3024 unsigned char *b = skb->tail; 3027 unsigned char *b = skb->tail;
3025 struct prefix_cacheinfo ci; 3028 struct prefix_cacheinfo ci;
3026 3029
3027 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*pmsg)); 3030 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
3028
3029 if (pid)
3030 nlh->nlmsg_flags |= NLM_F_MULTI;
3031
3032 pmsg = NLMSG_DATA(nlh); 3031 pmsg = NLMSG_DATA(nlh);
3033 pmsg->prefix_family = AF_INET6; 3032 pmsg->prefix_family = AF_INET6;
3034 pmsg->prefix_ifindex = idev->dev->ifindex; 3033 pmsg->prefix_ifindex = idev->dev->ifindex;
@@ -3067,7 +3066,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
3067 netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS); 3066 netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS);
3068 return; 3067 return;
3069 } 3068 }
3070 if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event) < 0) { 3069 if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) {
3071 kfree_skb(skb); 3070 kfree_skb(skb);
3072 netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL); 3071 netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL);
3073 return; 3072 return;
@@ -3096,7 +3095,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
3096 switch (event) { 3095 switch (event) {
3097 case RTM_NEWADDR: 3096 case RTM_NEWADDR:
3098 dst_hold(&ifp->rt->u.dst); 3097 dst_hold(&ifp->rt->u.dst);
3099 if (ip6_ins_rt(ifp->rt, NULL, NULL)) 3098 if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL))
3100 dst_release(&ifp->rt->u.dst); 3099 dst_release(&ifp->rt->u.dst);
3101 if (ifp->idev->cnf.forwarding) 3100 if (ifp->idev->cnf.forwarding)
3102 addrconf_join_anycast(ifp); 3101 addrconf_join_anycast(ifp);
@@ -3106,7 +3105,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
3106 addrconf_leave_anycast(ifp); 3105 addrconf_leave_anycast(ifp);
3107 addrconf_leave_solict(ifp->idev, &ifp->addr); 3106 addrconf_leave_solict(ifp->idev, &ifp->addr);
3108 dst_hold(&ifp->rt->u.dst); 3107 dst_hold(&ifp->rt->u.dst);
3109 if (ip6_del_rt(ifp->rt, NULL, NULL)) 3108 if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
3110 dst_free(&ifp->rt->u.dst); 3109 dst_free(&ifp->rt->u.dst);
3111 else 3110 else
3112 dst_release(&ifp->rt->u.dst); 3111 dst_release(&ifp->rt->u.dst);
@@ -3439,7 +3438,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
3439 * by sysctl and we wouldn't want anyone to change it under our feet 3438 * by sysctl and we wouldn't want anyone to change it under our feet
3440 * (see SIOCSIFNAME). 3439 * (see SIOCSIFNAME).
3441 */ 3440 */
3442 dev_name = net_sysctl_strdup(dev_name); 3441 dev_name = kstrdup(dev_name, GFP_KERNEL);
3443 if (!dev_name) 3442 if (!dev_name)
3444 goto free; 3443 goto free;
3445 3444
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index e3ecf626cbf7..986fdfdccbcd 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -339,7 +339,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
339 xfrm_state_put(x); 339 xfrm_state_put(x);
340} 340}
341 341
342static int ah6_init_state(struct xfrm_state *x, void *args) 342static int ah6_init_state(struct xfrm_state *x)
343{ 343{
344 struct ah_data *ahp = NULL; 344 struct ah_data *ahp = NULL;
345 struct xfrm_algo_desc *aalg_desc; 345 struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 5d22ca3cca2e..6b7294047238 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -337,7 +337,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
337 write_unlock_bh(&idev->lock); 337 write_unlock_bh(&idev->lock);
338 338
339 dst_hold(&rt->u.dst); 339 dst_hold(&rt->u.dst);
340 if (ip6_ins_rt(rt, NULL, NULL)) 340 if (ip6_ins_rt(rt, NULL, NULL, NULL))
341 dst_release(&rt->u.dst); 341 dst_release(&rt->u.dst);
342 342
343 addrconf_join_solict(dev, &aca->aca_addr); 343 addrconf_join_solict(dev, &aca->aca_addr);
@@ -380,7 +380,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
380 addrconf_leave_solict(idev, &aca->aca_addr); 380 addrconf_leave_solict(idev, &aca->aca_addr);
381 381
382 dst_hold(&aca->aca_rt->u.dst); 382 dst_hold(&aca->aca_rt->u.dst);
383 if (ip6_del_rt(aca->aca_rt, NULL, NULL)) 383 if (ip6_del_rt(aca->aca_rt, NULL, NULL, NULL))
384 dst_free(&aca->aca_rt->u.dst); 384 dst_free(&aca->aca_rt->u.dst);
385 else 385 else
386 dst_release(&aca->aca_rt->u.dst); 386 dst_release(&aca->aca_rt->u.dst);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 65b9375df57d..5229365cd8b4 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -353,14 +353,14 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
353 err = copied; 353 err = copied;
354 354
355 /* Reset and regenerate socket error */ 355 /* Reset and regenerate socket error */
356 spin_lock_irq(&sk->sk_error_queue.lock); 356 spin_lock_bh(&sk->sk_error_queue.lock);
357 sk->sk_err = 0; 357 sk->sk_err = 0;
358 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { 358 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
359 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; 359 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
360 spin_unlock_irq(&sk->sk_error_queue.lock); 360 spin_unlock_bh(&sk->sk_error_queue.lock);
361 sk->sk_error_report(sk); 361 sk->sk_error_report(sk);
362 } else { 362 } else {
363 spin_unlock_irq(&sk->sk_error_queue.lock); 363 spin_unlock_bh(&sk->sk_error_queue.lock);
364 } 364 }
365 365
366out_free_skb: 366out_free_skb:
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index be7095d6babe..324db62515a2 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -296,7 +296,7 @@ static void esp6_destroy(struct xfrm_state *x)
296 kfree(esp); 296 kfree(esp);
297} 297}
298 298
299static int esp6_init_state(struct xfrm_state *x, void *args) 299static int esp6_init_state(struct xfrm_state *x)
300{ 300{
301 struct esp_data *esp = NULL; 301 struct esp_data *esp = NULL;
302 302
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 8e0f569b883e..ff3ec9822e36 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -277,8 +277,8 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
277{ 277{
278 struct inet6_dev *idev = NULL; 278 struct inet6_dev *idev = NULL;
279 struct ipv6hdr *hdr = skb->nh.ipv6h; 279 struct ipv6hdr *hdr = skb->nh.ipv6h;
280 struct sock *sk = icmpv6_socket->sk; 280 struct sock *sk;
281 struct ipv6_pinfo *np = inet6_sk(sk); 281 struct ipv6_pinfo *np;
282 struct in6_addr *saddr = NULL; 282 struct in6_addr *saddr = NULL;
283 struct dst_entry *dst; 283 struct dst_entry *dst;
284 struct icmp6hdr tmp_hdr; 284 struct icmp6hdr tmp_hdr;
@@ -358,6 +358,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
358 if (icmpv6_xmit_lock()) 358 if (icmpv6_xmit_lock())
359 return; 359 return;
360 360
361 sk = icmpv6_socket->sk;
362 np = inet6_sk(sk);
363
361 if (!icmpv6_xrlim_allow(sk, type, &fl)) 364 if (!icmpv6_xrlim_allow(sk, type, &fl))
362 goto out; 365 goto out;
363 366
@@ -423,9 +426,9 @@ out:
423 426
424static void icmpv6_echo_reply(struct sk_buff *skb) 427static void icmpv6_echo_reply(struct sk_buff *skb)
425{ 428{
426 struct sock *sk = icmpv6_socket->sk; 429 struct sock *sk;
427 struct inet6_dev *idev; 430 struct inet6_dev *idev;
428 struct ipv6_pinfo *np = inet6_sk(sk); 431 struct ipv6_pinfo *np;
429 struct in6_addr *saddr = NULL; 432 struct in6_addr *saddr = NULL;
430 struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw; 433 struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw;
431 struct icmp6hdr tmp_hdr; 434 struct icmp6hdr tmp_hdr;
@@ -454,6 +457,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
454 if (icmpv6_xmit_lock()) 457 if (icmpv6_xmit_lock())
455 return; 458 return;
456 459
460 sk = icmpv6_socket->sk;
461 np = inet6_sk(sk);
462
457 if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) 463 if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
458 fl.oif = np->mcast_oif; 464 fl.oif = np->mcast_oif;
459 465
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 405740b75abb..1b354aa97934 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -394,7 +394,7 @@ insert_above:
394 */ 394 */
395 395
396static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, 396static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
397 struct nlmsghdr *nlh) 397 struct nlmsghdr *nlh, struct netlink_skb_parms *req)
398{ 398{
399 struct rt6_info *iter = NULL; 399 struct rt6_info *iter = NULL;
400 struct rt6_info **ins; 400 struct rt6_info **ins;
@@ -449,7 +449,7 @@ out:
449 *ins = rt; 449 *ins = rt;
450 rt->rt6i_node = fn; 450 rt->rt6i_node = fn;
451 atomic_inc(&rt->rt6i_ref); 451 atomic_inc(&rt->rt6i_ref);
452 inet6_rt_notify(RTM_NEWROUTE, rt, nlh); 452 inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
453 rt6_stats.fib_rt_entries++; 453 rt6_stats.fib_rt_entries++;
454 454
455 if ((fn->fn_flags & RTN_RTINFO) == 0) { 455 if ((fn->fn_flags & RTN_RTINFO) == 0) {
@@ -479,7 +479,8 @@ void fib6_force_start_gc(void)
479 * with source addr info in sub-trees 479 * with source addr info in sub-trees
480 */ 480 */
481 481
482int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 482int fib6_add(struct fib6_node *root, struct rt6_info *rt,
483 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
483{ 484{
484 struct fib6_node *fn; 485 struct fib6_node *fn;
485 int err = -ENOMEM; 486 int err = -ENOMEM;
@@ -552,7 +553,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh,
552 } 553 }
553#endif 554#endif
554 555
555 err = fib6_add_rt2node(fn, rt, nlh); 556 err = fib6_add_rt2node(fn, rt, nlh, req);
556 557
557 if (err == 0) { 558 if (err == 0) {
558 fib6_start_gc(rt); 559 fib6_start_gc(rt);
@@ -859,7 +860,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
859} 860}
860 861
861static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, 862static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
862 struct nlmsghdr *nlh, void *_rtattr) 863 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
863{ 864{
864 struct fib6_walker_t *w; 865 struct fib6_walker_t *w;
865 struct rt6_info *rt = *rtp; 866 struct rt6_info *rt = *rtp;
@@ -915,11 +916,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
915 if (atomic_read(&rt->rt6i_ref) != 1) BUG(); 916 if (atomic_read(&rt->rt6i_ref) != 1) BUG();
916 } 917 }
917 918
918 inet6_rt_notify(RTM_DELROUTE, rt, nlh); 919 inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
919 rt6_release(rt); 920 rt6_release(rt);
920} 921}
921 922
922int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 923int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
923{ 924{
924 struct fib6_node *fn = rt->rt6i_node; 925 struct fib6_node *fn = rt->rt6i_node;
925 struct rt6_info **rtp; 926 struct rt6_info **rtp;
@@ -944,7 +945,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
944 945
945 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { 946 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
946 if (*rtp == rt) { 947 if (*rtp == rt) {
947 fib6_del_route(fn, rtp, nlh, _rtattr); 948 fib6_del_route(fn, rtp, nlh, _rtattr, req);
948 return 0; 949 return 0;
949 } 950 }
950 } 951 }
@@ -1073,7 +1074,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
1073 res = c->func(rt, c->arg); 1074 res = c->func(rt, c->arg);
1074 if (res < 0) { 1075 if (res < 0) {
1075 w->leaf = rt; 1076 w->leaf = rt;
1076 res = fib6_del(rt, NULL, NULL); 1077 res = fib6_del(rt, NULL, NULL, NULL);
1077 if (res) { 1078 if (res) {
1078#if RT6_DEBUG >= 2 1079#if RT6_DEBUG >= 2
1079 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); 1080 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index a93f6dc51979..0e5f7499debb 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -535,10 +535,12 @@ release:
535 if (err) 535 if (err)
536 goto done; 536 goto done;
537 537
538 /* Do not check for fault */ 538 if (!freq.flr_label) {
539 if (!freq.flr_label) 539 if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
540 copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, 540 &fl->label, sizeof(fl->label))) {
541 &fl->label, sizeof(fl->label)); 541 /* Intentionally ignore fault. */
542 }
543 }
542 544
543 sfl1->fl = fl; 545 sfl1->fl = fl;
544 sfl1->next = np->ipv6_fl_list; 546 sfl1->next = np->ipv6_fl_list;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b78a53586804..06e7cdaeedc5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -484,9 +484,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
484 to->nf_bridge = from->nf_bridge; 484 to->nf_bridge = from->nf_bridge;
485 nf_bridge_get(to->nf_bridge); 485 nf_bridge_get(to->nf_bridge);
486#endif 486#endif
487#ifdef CONFIG_NETFILTER_DEBUG
488 to->nf_debug = from->nf_debug;
489#endif
490#endif 487#endif
491} 488}
492 489
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 3b1c9fa184ae..ba3b0c267f75 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -882,6 +882,7 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
882 t->parms.hop_limit = p->hop_limit; 882 t->parms.hop_limit = p->hop_limit;
883 t->parms.encap_limit = p->encap_limit; 883 t->parms.encap_limit = p->encap_limit;
884 t->parms.flowinfo = p->flowinfo; 884 t->parms.flowinfo = p->flowinfo;
885 t->parms.link = p->link;
885 ip6ip6_tnl_link_config(t); 886 ip6ip6_tnl_link_config(t);
886 return 0; 887 return 0;
887} 888}
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 6cde5310cd76..423feb46ccc0 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -234,14 +234,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
234 t->props.mode = 1; 234 t->props.mode = 1;
235 memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); 235 memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
236 236
237 t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family); 237 if (xfrm_init_state(t))
238 if (t->type == NULL)
239 goto error; 238 goto error;
240 239
241 if (t->type->init_state(t, NULL))
242 goto error;
243
244 t->km.state = XFRM_STATE_VALID;
245 atomic_set(&t->tunnel_users, 1); 240 atomic_set(&t->tunnel_users, 1);
246 241
247out: 242out:
@@ -420,7 +415,7 @@ static void ipcomp6_destroy(struct xfrm_state *x)
420 xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); 415 xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
421} 416}
422 417
423static int ipcomp6_init_state(struct xfrm_state *x, void *args) 418static int ipcomp6_init_state(struct xfrm_state *x)
424{ 419{
425 int err; 420 int err;
426 struct ipcomp_data *ipcd; 421 struct ipcomp_data *ipcd;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 279ab86be662..f3ef4c38d315 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -423,11 +423,12 @@ done:
423 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; 423 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
424 retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, 424 retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
425 &psin6->sin6_addr); 425 &psin6->sin6_addr);
426 if (retv) 426 /* prior join w/ different source is ok */
427 if (retv && retv != -EADDRINUSE)
427 break; 428 break;
428 omode = MCAST_INCLUDE; 429 omode = MCAST_INCLUDE;
429 add = 1; 430 add = 1;
430 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 431 } else /* MCAST_LEAVE_SOURCE_GROUP */ {
431 omode = MCAST_INCLUDE; 432 omode = MCAST_INCLUDE;
432 add = 0; 433 add = 0;
433 } 434 }
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 2f4c91ddc9a3..5ade5a5d1990 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -37,5 +37,4 @@ EXPORT_SYMBOL(in6_dev_finish_destroy);
37EXPORT_SYMBOL(xfrm6_rcv); 37EXPORT_SYMBOL(xfrm6_rcv);
38#endif 38#endif
39EXPORT_SYMBOL(rt6_lookup); 39EXPORT_SYMBOL(rt6_lookup);
40EXPORT_SYMBOL(fl6_sock_lookup);
41EXPORT_SYMBOL(ipv6_push_nfrag_opts); 40EXPORT_SYMBOL(ipv6_push_nfrag_opts);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 393b6e6f50a9..562fcd14fdea 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -188,6 +188,16 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
188 if (!ipv6_addr_is_multicast(addr)) 188 if (!ipv6_addr_is_multicast(addr))
189 return -EINVAL; 189 return -EINVAL;
190 190
191 read_lock_bh(&ipv6_sk_mc_lock);
192 for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) {
193 if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
194 ipv6_addr_equal(&mc_lst->addr, addr)) {
195 read_unlock_bh(&ipv6_sk_mc_lock);
196 return -EADDRINUSE;
197 }
198 }
199 read_unlock_bh(&ipv6_sk_mc_lock);
200
191 mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); 201 mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
192 202
193 if (mc_lst == NULL) 203 if (mc_lst == NULL)
@@ -349,6 +359,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
349 struct ipv6_pinfo *inet6 = inet6_sk(sk); 359 struct ipv6_pinfo *inet6 = inet6_sk(sk);
350 struct ip6_sf_socklist *psl; 360 struct ip6_sf_socklist *psl;
351 int i, j, rv; 361 int i, j, rv;
362 int leavegroup = 0;
352 int err; 363 int err;
353 364
354 if (pgsr->gsr_group.ss_family != AF_INET6 || 365 if (pgsr->gsr_group.ss_family != AF_INET6 ||
@@ -368,6 +379,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
368 379
369 err = -EADDRNOTAVAIL; 380 err = -EADDRNOTAVAIL;
370 381
382 read_lock_bh(&ipv6_sk_mc_lock);
371 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { 383 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
372 if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) 384 if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
373 continue; 385 continue;
@@ -401,6 +413,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
401 if (rv) /* source not found */ 413 if (rv) /* source not found */
402 goto done; 414 goto done;
403 415
416 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
417 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
418 leavegroup = 1;
419 goto done;
420 }
421
404 /* update the interface filter */ 422 /* update the interface filter */
405 ip6_mc_del_src(idev, group, omode, 1, source, 1); 423 ip6_mc_del_src(idev, group, omode, 1, source, 1);
406 424
@@ -453,9 +471,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
453 /* update the interface list */ 471 /* update the interface list */
454 ip6_mc_add_src(idev, group, omode, 1, source, 1); 472 ip6_mc_add_src(idev, group, omode, 1, source, 1);
455done: 473done:
474 read_unlock_bh(&ipv6_sk_mc_lock);
456 read_unlock_bh(&idev->lock); 475 read_unlock_bh(&idev->lock);
457 in6_dev_put(idev); 476 in6_dev_put(idev);
458 dev_put(dev); 477 dev_put(dev);
478 if (leavegroup)
479 return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
459 return err; 480 return err;
460} 481}
461 482
@@ -1280,15 +1301,6 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
1280 return NULL; 1301 return NULL;
1281 1302
1282 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 1303 skb_reserve(skb, LL_RESERVED_SPACE(dev));
1283 if (dev->hard_header) {
1284 unsigned char ha[MAX_ADDR_LEN];
1285
1286 ndisc_mc_map(&mld2_all_mcr, ha, dev, 1);
1287 if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) {
1288 kfree_skb(skb);
1289 return NULL;
1290 }
1291 }
1292 1304
1293 if (ipv6_get_lladdr(dev, &addr_buf)) { 1305 if (ipv6_get_lladdr(dev, &addr_buf)) {
1294 /* <draft-ietf-magma-mld-source-05.txt>: 1306 /* <draft-ietf-magma-mld-source-05.txt>:
@@ -1312,6 +1324,30 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
1312 return skb; 1324 return skb;
1313} 1325}
1314 1326
1327static inline int mld_dev_queue_xmit2(struct sk_buff *skb)
1328{
1329 struct net_device *dev = skb->dev;
1330
1331 if (dev->hard_header) {
1332 unsigned char ha[MAX_ADDR_LEN];
1333 int err;
1334
1335 ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1);
1336 err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len);
1337 if (err < 0) {
1338 kfree_skb(skb);
1339 return err;
1340 }
1341 }
1342 return dev_queue_xmit(skb);
1343}
1344
1345static inline int mld_dev_queue_xmit(struct sk_buff *skb)
1346{
1347 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
1348 mld_dev_queue_xmit2);
1349}
1350
1315static void mld_sendpack(struct sk_buff *skb) 1351static void mld_sendpack(struct sk_buff *skb)
1316{ 1352{
1317 struct ipv6hdr *pip6 = skb->nh.ipv6h; 1353 struct ipv6hdr *pip6 = skb->nh.ipv6h;
@@ -1329,7 +1365,7 @@ static void mld_sendpack(struct sk_buff *skb)
1329 pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, 1365 pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
1330 IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0)); 1366 IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0));
1331 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, 1367 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
1332 dev_queue_xmit); 1368 mld_dev_queue_xmit);
1333 if (!err) { 1369 if (!err) {
1334 ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS); 1370 ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
1335 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); 1371 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
@@ -1635,12 +1671,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1635 } 1671 }
1636 1672
1637 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 1673 skb_reserve(skb, LL_RESERVED_SPACE(dev));
1638 if (dev->hard_header) {
1639 unsigned char ha[MAX_ADDR_LEN];
1640 ndisc_mc_map(snd_addr, ha, dev, 1);
1641 if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0)
1642 goto out;
1643 }
1644 1674
1645 if (ipv6_get_lladdr(dev, &addr_buf)) { 1675 if (ipv6_get_lladdr(dev, &addr_buf)) {
1646 /* <draft-ietf-magma-mld-source-05.txt>: 1676 /* <draft-ietf-magma-mld-source-05.txt>:
@@ -1668,7 +1698,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1668 idev = in6_dev_get(skb->dev); 1698 idev = in6_dev_get(skb->dev);
1669 1699
1670 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, 1700 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
1671 dev_queue_xmit); 1701 mld_dev_queue_xmit);
1672 if (!err) { 1702 if (!err) {
1673 if (type == ICMPV6_MGM_REDUCTION) 1703 if (type == ICMPV6_MGM_REDUCTION)
1674 ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS); 1704 ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
@@ -1682,10 +1712,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1682 if (likely(idev != NULL)) 1712 if (likely(idev != NULL))
1683 in6_dev_put(idev); 1713 in6_dev_put(idev);
1684 return; 1714 return;
1685
1686out:
1687 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1688 kfree_skb(skb);
1689} 1715}
1690 1716
1691static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, 1717static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7c291f4e9edc..7ae72d4c9bd2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -955,7 +955,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
955 struct rt6_info *rt; 955 struct rt6_info *rt;
956 rt = rt6_get_dflt_router(saddr, dev); 956 rt = rt6_get_dflt_router(saddr, dev);
957 if (rt) 957 if (rt)
958 ip6_del_rt(rt, NULL, NULL); 958 ip6_del_rt(rt, NULL, NULL, NULL);
959 } 959 }
960 960
961out: 961out:
@@ -1096,7 +1096,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1096 1096
1097 if (rt && lifetime == 0) { 1097 if (rt && lifetime == 0) {
1098 neigh_clone(neigh); 1098 neigh_clone(neigh);
1099 ip6_del_rt(rt, NULL, NULL); 1099 ip6_del_rt(rt, NULL, NULL, NULL);
1100 rt = NULL; 1100 rt = NULL;
1101 } 1101 }
1102 1102
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c735276fdd5f..73034511c8db 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -71,7 +71,6 @@ static DECLARE_MUTEX(ip6t_mutex);
71/* Must have mutex */ 71/* Must have mutex */
72#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) 72#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
73#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) 73#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
74#include <linux/netfilter_ipv4/lockhelp.h>
75#include <linux/netfilter_ipv4/listhelp.h> 74#include <linux/netfilter_ipv4/listhelp.h>
76 75
77#if 0 76#if 0
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index bfc3d0185d19..c44685e391b7 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -366,8 +366,6 @@ ip6t_log_packet(unsigned int hooknum,
366 const char *level_string, 366 const char *level_string,
367 const char *prefix) 367 const char *prefix)
368{ 368{
369 struct ipv6hdr *ipv6h = skb->nh.ipv6h;
370
371 spin_lock_bh(&log_lock); 369 spin_lock_bh(&log_lock);
372 printk(level_string); 370 printk(level_string);
373 printk("%sIN=%s OUT=%s ", 371 printk("%sIN=%s OUT=%s ",
@@ -377,39 +375,25 @@ ip6t_log_packet(unsigned int hooknum,
377 if (in && !out) { 375 if (in && !out) {
378 /* MAC logging for input chain only. */ 376 /* MAC logging for input chain only. */
379 printk("MAC="); 377 printk("MAC=");
380 if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) { 378 if (skb->dev && skb->dev->hard_header_len &&
381 if (skb->dev->type != ARPHRD_SIT){ 379 skb->mac.raw != skb->nh.raw) {
382 int i; 380 unsigned char *p = skb->mac.raw;
383 unsigned char *p = skb->mac.raw; 381 int i;
384 for (i = 0; i < skb->dev->hard_header_len; i++,p++) 382
385 printk("%02x%c", *p, 383 if (skb->dev->type == ARPHRD_SIT &&
386 i==skb->dev->hard_header_len - 1 384 (p -= ETH_HLEN) < skb->head)
387 ? ' ':':'); 385 p = NULL;
388 } else { 386
389 int i; 387 if (p != NULL)
390 unsigned char *p = skb->mac.raw; 388 for (i = 0; i < skb->dev->hard_header_len; i++)
391 if ( p - (ETH_ALEN*2+2) > skb->head ){ 389 printk("%02x", p[i]);
392 p -= (ETH_ALEN+2); 390 printk(" ");
393 for (i = 0; i < (ETH_ALEN); i++,p++) 391
394 printk("%02x%s", *p, 392 if (skb->dev->type == ARPHRD_SIT) {
395 i == ETH_ALEN-1 ? "->" : ":"); 393 struct iphdr *iph = (struct iphdr *)skb->mac.raw;
396 p -= (ETH_ALEN*2); 394 printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ",
397 for (i = 0; i < (ETH_ALEN); i++,p++) 395 NIPQUAD(iph->saddr),
398 printk("%02x%c", *p, 396 NIPQUAD(iph->daddr));
399 i == ETH_ALEN-1 ? ' ' : ':');
400 }
401
402 if ((skb->dev->addr_len == 4) &&
403 skb->dev->hard_header_len > 20){
404 printk("TUNNEL=");
405 p = skb->mac.raw + 12;
406 for (i = 0; i < 4; i++,p++)
407 printk("%3d%s", *p,
408 i == 3 ? "->" : ".");
409 for (i = 0; i < 4; i++,p++)
410 printk("%3d%c", *p,
411 i == 3 ? ' ' : '.');
412 }
413 } 397 }
414 } else 398 } else
415 printk(" "); 399 printk(" ");
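The rewrite above collapses the two hand-rolled MAC-dump loops into one: the link-layer header is printed as a single unbroken hex string, for ARPHRD_SIT devices the pointer is stepped back ETH_HLEN bytes (and dropped if that would run off skb->head), and the outer IPv4 tunnel addresses are printed with NIPQUAD instead of a byte-by-byte loop. A minimal user-space sketch of the resulting output format (the header bytes and tunnel addresses below are made-up stand-ins for skb->mac.raw and the outer iphdr):

#include <stdio.h>

int main(void)
{
	/* stand-in for skb->mac.raw on an Ethernet device (hard_header_len == 14) */
	unsigned char mac[14] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
	                          0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0x86, 0xdd };
	unsigned char saddr[4] = { 192, 0, 2, 1 };   /* outer IPv4 source (SIT case) */
	unsigned char daddr[4] = { 192, 0, 2, 2 };   /* outer IPv4 destination */
	int i;

	printf("MAC=");
	for (i = 0; i < (int) sizeof(mac); i++)      /* one unbroken hex string */
		printf("%02x", mac[i]);
	printf(" ");

	/* for ARPHRD_SIT the outer header is IPv4; NIPQUAD prints it dotted-quad */
	printf("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u \n",
	       saddr[0], saddr[1], saddr[2], saddr[3],
	       daddr[0], daddr[1], daddr[2], daddr[3]);
	return 0;
}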
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 71407beaf790..c2982efd14af 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -129,13 +129,15 @@ static struct nf_hook_ops ip6t_ops[] = {
129 .hook = ip6t_hook, 129 .hook = ip6t_hook,
130 .pf = PF_INET6, 130 .pf = PF_INET6,
131 .hooknum = NF_IP6_PRE_ROUTING, 131 .hooknum = NF_IP6_PRE_ROUTING,
132 .priority = NF_IP6_PRI_FIRST 132 .priority = NF_IP6_PRI_FIRST,
133 .owner = THIS_MODULE,
133 }, 134 },
134 { 135 {
135 .hook = ip6t_hook, 136 .hook = ip6t_hook,
136 .pf = PF_INET6, 137 .pf = PF_INET6,
137 .hooknum = NF_IP6_LOCAL_OUT, 138 .hooknum = NF_IP6_LOCAL_OUT,
138 .priority = NF_IP6_PRI_FIRST 139 .priority = NF_IP6_PRI_FIRST,
140 .owner = THIS_MODULE,
139 }, 141 },
140}; 142};
141 143
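The only functional change in ip6table_raw.c is that each nf_hook_ops entry now sets .owner = THIS_MODULE, so the netfilter core holds a module reference while the hook is registered and the table cannot be unloaded underneath a packet in flight. A minimal sketch of one such entry as it reads after the patch (kernel-style fragment; the entry name is illustrative, the file itself keeps them in the ip6t_ops[] array, and registration is unchanged):

static struct nf_hook_ops ip6t_raw_pre_routing = {
	.hook     = ip6t_hook,               /* traverse the raw table */
	.pf       = PF_INET6,
	.hooknum  = NF_IP6_PRE_ROUTING,
	.priority = NF_IP6_PRI_FIRST,        /* run before conntrack et al. */
	.owner    = THIS_MODULE,             /* pin the module while the hook is live */
};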
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 617645bc5ed6..e2b848ec9851 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -434,12 +434,12 @@ csum_copy_err:
434 /* Clear queue. */ 434 /* Clear queue. */
435 if (flags&MSG_PEEK) { 435 if (flags&MSG_PEEK) {
436 int clear = 0; 436 int clear = 0;
437 spin_lock_irq(&sk->sk_receive_queue.lock); 437 spin_lock_bh(&sk->sk_receive_queue.lock);
438 if (skb == skb_peek(&sk->sk_receive_queue)) { 438 if (skb == skb_peek(&sk->sk_receive_queue)) {
439 __skb_unlink(skb, &sk->sk_receive_queue); 439 __skb_unlink(skb, &sk->sk_receive_queue);
440 clear = 1; 440 clear = 1;
441 } 441 }
442 spin_unlock_irq(&sk->sk_receive_queue.lock); 442 spin_unlock_bh(&sk->sk_receive_queue.lock);
443 if (clear) 443 if (clear)
444 kfree_skb(skb); 444 kfree_skb(skb);
445 } 445 }
@@ -971,11 +971,11 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
971 struct sk_buff *skb; 971 struct sk_buff *skb;
972 int amount = 0; 972 int amount = 0;
973 973
974 spin_lock_irq(&sk->sk_receive_queue.lock); 974 spin_lock_bh(&sk->sk_receive_queue.lock);
975 skb = skb_peek(&sk->sk_receive_queue); 975 skb = skb_peek(&sk->sk_receive_queue);
976 if (skb != NULL) 976 if (skb != NULL)
977 amount = skb->tail - skb->h.raw; 977 amount = skb->tail - skb->h.raw;
978 spin_unlock_irq(&sk->sk_receive_queue.lock); 978 spin_unlock_bh(&sk->sk_receive_queue.lock);
979 return put_user(amount, (int __user *)arg); 979 return put_user(amount, (int __user *)arg);
980 } 980 }
981 981
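Both rawv6 hunks above relax spin_lock_irq()/spin_unlock_irq() to the _bh variants on sk_receive_queue.lock: the receive queue is only manipulated from process context and from softirq (packet input) context, so disabling bottom halves is enough and hard IRQs can stay enabled. The locking pattern after the change, condensed (kernel-style fragment mirroring the SIOCINQ path above; sk is assumed to be a valid struct sock *):

	struct sk_buff *skb;
	int amount = 0;

	spin_lock_bh(&sk->sk_receive_queue.lock);    /* excludes the softirq input path */
	skb = skb_peek(&sk->sk_receive_queue);       /* peek only, nothing is dequeued */
	if (skb != NULL)
		amount = skb->tail - skb->h.raw;     /* transport payload still queued */
	spin_unlock_bh(&sk->sk_receive_queue.lock);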
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3bf8a0254f81..878789b3122d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -384,12 +384,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
384 be destroyed. 384 be destroyed.
385 */ 385 */
386 386
387int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 387int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
388 void *_rtattr, struct netlink_skb_parms *req)
388{ 389{
389 int err; 390 int err;
390 391
391 write_lock_bh(&rt6_lock); 392 write_lock_bh(&rt6_lock);
392 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr); 393 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
393 write_unlock_bh(&rt6_lock); 394 write_unlock_bh(&rt6_lock);
394 395
395 return err; 396 return err;
@@ -400,7 +401,7 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
400 */ 401 */
401 402
402static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, 403static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
403 struct in6_addr *saddr) 404 struct in6_addr *saddr, struct netlink_skb_parms *req)
404{ 405{
405 int err; 406 int err;
406 struct rt6_info *rt; 407 struct rt6_info *rt;
@@ -432,7 +433,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
432 433
433 dst_hold(&rt->u.dst); 434 dst_hold(&rt->u.dst);
434 435
435 err = ip6_ins_rt(rt, NULL, NULL); 436 err = ip6_ins_rt(rt, NULL, NULL, req);
436 if (err == 0) 437 if (err == 0)
437 return rt; 438 return rt;
438 439
@@ -491,7 +492,8 @@ restart:
491 read_unlock_bh(&rt6_lock); 492 read_unlock_bh(&rt6_lock);
492 493
493 nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr, 494 nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
494 &skb->nh.ipv6h->saddr); 495 &skb->nh.ipv6h->saddr,
496 &NETLINK_CB(skb));
495 497
496 dst_release(&rt->u.dst); 498 dst_release(&rt->u.dst);
497 rt = nrt; 499 rt = nrt;
@@ -551,7 +553,7 @@ restart:
551 dst_hold(&rt->u.dst); 553 dst_hold(&rt->u.dst);
552 read_unlock_bh(&rt6_lock); 554 read_unlock_bh(&rt6_lock);
553 555
554 nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src); 556 nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src, NULL);
555 557
556 dst_release(&rt->u.dst); 558 dst_release(&rt->u.dst);
557 rt = nrt; 559 rt = nrt;
@@ -598,7 +600,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
598 600
599 if (rt) { 601 if (rt) {
600 if (rt->rt6i_flags & RTF_CACHE) 602 if (rt->rt6i_flags & RTF_CACHE)
601 ip6_del_rt(rt, NULL, NULL); 603 ip6_del_rt(rt, NULL, NULL, NULL);
602 else 604 else
603 dst_release(dst); 605 dst_release(dst);
604 } 606 }
@@ -787,7 +789,8 @@ int ipv6_get_hoplimit(struct net_device *dev)
787 * 789 *
788 */ 790 */
789 791
790int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) 792int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
793 void *_rtattr, struct netlink_skb_parms *req)
791{ 794{
792 int err; 795 int err;
793 struct rtmsg *r; 796 struct rtmsg *r;
@@ -974,7 +977,7 @@ install_route:
974 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); 977 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
975 rt->u.dst.dev = dev; 978 rt->u.dst.dev = dev;
976 rt->rt6i_idev = idev; 979 rt->rt6i_idev = idev;
977 return ip6_ins_rt(rt, nlh, _rtattr); 980 return ip6_ins_rt(rt, nlh, _rtattr, req);
978 981
979out: 982out:
980 if (dev) 983 if (dev)
@@ -986,7 +989,7 @@ out:
986 return err; 989 return err;
987} 990}
988 991
989int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 992int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
990{ 993{
991 int err; 994 int err;
992 995
@@ -994,7 +997,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
994 997
995 rt6_reset_dflt_pointer(NULL); 998 rt6_reset_dflt_pointer(NULL);
996 999
997 err = fib6_del(rt, nlh, _rtattr); 1000 err = fib6_del(rt, nlh, _rtattr, req);
998 dst_release(&rt->u.dst); 1001 dst_release(&rt->u.dst);
999 1002
1000 write_unlock_bh(&rt6_lock); 1003 write_unlock_bh(&rt6_lock);
@@ -1002,7 +1005,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
1002 return err; 1005 return err;
1003} 1006}
1004 1007
1005static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) 1008static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1006{ 1009{
1007 struct fib6_node *fn; 1010 struct fib6_node *fn;
1008 struct rt6_info *rt; 1011 struct rt6_info *rt;
@@ -1029,7 +1032,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r
1029 dst_hold(&rt->u.dst); 1032 dst_hold(&rt->u.dst);
1030 read_unlock_bh(&rt6_lock); 1033 read_unlock_bh(&rt6_lock);
1031 1034
1032 return ip6_del_rt(rt, nlh, _rtattr); 1035 return ip6_del_rt(rt, nlh, _rtattr, req);
1033 } 1036 }
1034 } 1037 }
1035 read_unlock_bh(&rt6_lock); 1038 read_unlock_bh(&rt6_lock);
@@ -1136,11 +1139,11 @@ source_ok:
1136 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); 1139 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1137 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); 1140 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1138 1141
1139 if (ip6_ins_rt(nrt, NULL, NULL)) 1142 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1140 goto out; 1143 goto out;
1141 1144
1142 if (rt->rt6i_flags&RTF_CACHE) { 1145 if (rt->rt6i_flags&RTF_CACHE) {
1143 ip6_del_rt(rt, NULL, NULL); 1146 ip6_del_rt(rt, NULL, NULL, NULL);
1144 return; 1147 return;
1145 } 1148 }
1146 1149
@@ -1204,7 +1207,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1204 2. It is gatewayed route or NONEXTHOP route. Action: clone it. 1207 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1205 */ 1208 */
1206 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { 1209 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
1207 nrt = rt6_cow(rt, daddr, saddr); 1210 nrt = rt6_cow(rt, daddr, saddr, NULL);
1208 if (!nrt->u.dst.error) { 1211 if (!nrt->u.dst.error) {
1209 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; 1212 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1210 if (allfrag) 1213 if (allfrag)
@@ -1232,7 +1235,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1232 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; 1235 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1233 if (allfrag) 1236 if (allfrag)
1234 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; 1237 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1235 ip6_ins_rt(nrt, NULL, NULL); 1238 ip6_ins_rt(nrt, NULL, NULL, NULL);
1236 } 1239 }
1237 1240
1238out: 1241out:
@@ -1305,7 +1308,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1305 1308
1306 rtmsg.rtmsg_ifindex = dev->ifindex; 1309 rtmsg.rtmsg_ifindex = dev->ifindex;
1307 1310
1308 ip6_route_add(&rtmsg, NULL, NULL); 1311 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1309 return rt6_get_dflt_router(gwaddr, dev); 1312 return rt6_get_dflt_router(gwaddr, dev);
1310} 1313}
1311 1314
@@ -1323,7 +1326,7 @@ restart:
1323 1326
1324 read_unlock_bh(&rt6_lock); 1327 read_unlock_bh(&rt6_lock);
1325 1328
1326 ip6_del_rt(rt, NULL, NULL); 1329 ip6_del_rt(rt, NULL, NULL, NULL);
1327 1330
1328 goto restart; 1331 goto restart;
1329 } 1332 }
@@ -1349,10 +1352,10 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1349 rtnl_lock(); 1352 rtnl_lock();
1350 switch (cmd) { 1353 switch (cmd) {
1351 case SIOCADDRT: 1354 case SIOCADDRT:
1352 err = ip6_route_add(&rtmsg, NULL, NULL); 1355 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1353 break; 1356 break;
1354 case SIOCDELRT: 1357 case SIOCDELRT:
1355 err = ip6_route_del(&rtmsg, NULL, NULL); 1358 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1356 break; 1359 break;
1357 default: 1360 default:
1358 err = -EINVAL; 1361 err = -EINVAL;
@@ -1546,7 +1549,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1546 1549
1547 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) 1550 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1548 return -EINVAL; 1551 return -EINVAL;
1549 return ip6_route_del(&rtmsg, nlh, arg); 1552 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1550} 1553}
1551 1554
1552int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 1555int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1556,7 +1559,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1556 1559
1557 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) 1560 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1558 return -EINVAL; 1561 return -EINVAL;
1559 return ip6_route_add(&rtmsg, nlh, arg); 1562 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1560} 1563}
1561 1564
1562struct rt6_rtnl_dump_arg 1565struct rt6_rtnl_dump_arg
@@ -1566,11 +1569,9 @@ struct rt6_rtnl_dump_arg
1566}; 1569};
1567 1570
1568static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, 1571static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1569 struct in6_addr *dst, 1572 struct in6_addr *dst, struct in6_addr *src,
1570 struct in6_addr *src, 1573 int iif, int type, u32 pid, u32 seq,
1571 int iif, 1574 int prefix, unsigned int flags)
1572 int type, u32 pid, u32 seq,
1573 struct nlmsghdr *in_nlh, int prefix)
1574{ 1575{
1575 struct rtmsg *rtm; 1576 struct rtmsg *rtm;
1576 struct nlmsghdr *nlh; 1577 struct nlmsghdr *nlh;
@@ -1584,11 +1585,7 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1584 } 1585 }
1585 } 1586 }
1586 1587
1587 if (!pid && in_nlh) { 1588 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1588 pid = in_nlh->nlmsg_pid;
1589 }
1590
1591 nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
1592 rtm = NLMSG_DATA(nlh); 1589 rtm = NLMSG_DATA(nlh);
1593 rtm->rtm_family = AF_INET6; 1590 rtm->rtm_family = AF_INET6;
1594 rtm->rtm_dst_len = rt->rt6i_dst.plen; 1591 rtm->rtm_dst_len = rt->rt6i_dst.plen;
@@ -1674,7 +1671,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1674 1671
1675 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 1672 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1676 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, 1673 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1677 NULL, prefix); 1674 prefix, NLM_F_MULTI);
1678} 1675}
1679 1676
1680static int fib6_dump_node(struct fib6_walker_t *w) 1677static int fib6_dump_node(struct fib6_walker_t *w)
@@ -1822,7 +1819,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1822 &fl.fl6_dst, &fl.fl6_src, 1819 &fl.fl6_dst, &fl.fl6_src,
1823 iif, 1820 iif,
1824 RTM_NEWROUTE, NETLINK_CB(in_skb).pid, 1821 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1825 nlh->nlmsg_seq, nlh, 0); 1822 nlh->nlmsg_seq, 0, 0);
1826 if (err < 0) { 1823 if (err < 0) {
1827 err = -EMSGSIZE; 1824 err = -EMSGSIZE;
1828 goto out_free; 1825 goto out_free;
@@ -1838,17 +1835,25 @@ out_free:
1838 goto out; 1835 goto out;
1839} 1836}
1840 1837
1841void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh) 1838void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1839 struct netlink_skb_parms *req)
1842{ 1840{
1843 struct sk_buff *skb; 1841 struct sk_buff *skb;
1844 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); 1842 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1843 u32 pid = current->pid;
1844 u32 seq = 0;
1845 1845
1846 if (req)
1847 pid = req->pid;
1848 if (nlh)
1849 seq = nlh->nlmsg_seq;
1850
1846 skb = alloc_skb(size, gfp_any()); 1851 skb = alloc_skb(size, gfp_any());
1847 if (!skb) { 1852 if (!skb) {
1848 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); 1853 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
1849 return; 1854 return;
1850 } 1855 }
1851 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) { 1856 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1852 kfree_skb(skb); 1857 kfree_skb(skb);
1853 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); 1858 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
1854 return; 1859 return;
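The common thread in this route.c hunk is a new struct netlink_skb_parms *req parameter pushed from the rtnetlink entry points (which pass &NETLINK_CB(skb)) through ip6_route_add()/ip6_route_del()/ip6_ins_rt()/ip6_del_rt() and fib6_add()/fib6_del() down to inet6_rt_notify(), so route change notifications can echo the requesting socket's pid and the request's sequence number instead of the previous pid 0 / seq 0; kernel-internal callers simply pass NULL. Condensed, the notify side now picks its addressing like this (fragment mirroring the code above):

	u32 pid = current->pid;          /* default: whoever is making the change */
	u32 seq = 0;

	if (req)                         /* change arrived via rtnetlink */
		pid = req->pid;          /* echo the requester's netlink pid */
	if (nlh)
		seq = nlh->nlmsg_seq;    /* echo the request's sequence number */

	/* rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) and broadcast */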
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0f69e800a0ad..9dac7fdf4726 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -65,7 +65,7 @@
65#include <linux/seq_file.h> 65#include <linux/seq_file.h>
66 66
67static void tcp_v6_send_reset(struct sk_buff *skb); 67static void tcp_v6_send_reset(struct sk_buff *skb);
68static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req); 68static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
69static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 69static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len,
70 struct sk_buff *skb); 70 struct sk_buff *skb);
71 71
@@ -394,24 +394,26 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
394 return c & (TCP_SYNQ_HSIZE - 1); 394 return c & (TCP_SYNQ_HSIZE - 1);
395} 395}
396 396
397static struct open_request *tcp_v6_search_req(struct tcp_sock *tp, 397static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp,
398 struct open_request ***prevp, 398 struct request_sock ***prevp,
399 __u16 rport, 399 __u16 rport,
400 struct in6_addr *raddr, 400 struct in6_addr *raddr,
401 struct in6_addr *laddr, 401 struct in6_addr *laddr,
402 int iif) 402 int iif)
403{ 403{
404 struct tcp_listen_opt *lopt = tp->listen_opt; 404 struct listen_sock *lopt = tp->accept_queue.listen_opt;
405 struct open_request *req, **prev; 405 struct request_sock *req, **prev;
406 406
407 for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; 407 for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
408 (req = *prev) != NULL; 408 (req = *prev) != NULL;
409 prev = &req->dl_next) { 409 prev = &req->dl_next) {
410 if (req->rmt_port == rport && 410 const struct tcp6_request_sock *treq = tcp6_rsk(req);
411 req->class->family == AF_INET6 && 411
412 ipv6_addr_equal(&req->af.v6_req.rmt_addr, raddr) && 412 if (inet_rsk(req)->rmt_port == rport &&
413 ipv6_addr_equal(&req->af.v6_req.loc_addr, laddr) && 413 req->rsk_ops->family == AF_INET6 &&
414 (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) { 414 ipv6_addr_equal(&treq->rmt_addr, raddr) &&
415 ipv6_addr_equal(&treq->loc_addr, laddr) &&
416 (!treq->iif || treq->iif == iif)) {
415 BUG_TRAP(req->sk == NULL); 417 BUG_TRAP(req->sk == NULL);
416 *prevp = prev; 418 *prevp = prev;
417 return req; 419 return req;
@@ -906,9 +908,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
906 908
907 icmpv6_err_convert(type, code, &err); 909 icmpv6_err_convert(type, code, &err);
908 910
909 /* Might be for an open_request */ 911 /* Might be for an request_sock */
910 switch (sk->sk_state) { 912 switch (sk->sk_state) {
911 struct open_request *req, **prev; 913 struct request_sock *req, **prev;
912 case TCP_LISTEN: 914 case TCP_LISTEN:
913 if (sock_owned_by_user(sk)) 915 if (sock_owned_by_user(sk))
914 goto out; 916 goto out;
@@ -923,7 +925,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
923 */ 925 */
924 BUG_TRAP(req->sk == NULL); 926 BUG_TRAP(req->sk == NULL);
925 927
926 if (seq != req->snt_isn) { 928 if (seq != tcp_rsk(req)->snt_isn) {
927 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); 929 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
928 goto out; 930 goto out;
929 } 931 }
@@ -957,9 +959,10 @@ out:
957} 959}
958 960
959 961
960static int tcp_v6_send_synack(struct sock *sk, struct open_request *req, 962static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
961 struct dst_entry *dst) 963 struct dst_entry *dst)
962{ 964{
965 struct tcp6_request_sock *treq = tcp6_rsk(req);
963 struct ipv6_pinfo *np = inet6_sk(sk); 966 struct ipv6_pinfo *np = inet6_sk(sk);
964 struct sk_buff * skb; 967 struct sk_buff * skb;
965 struct ipv6_txoptions *opt = NULL; 968 struct ipv6_txoptions *opt = NULL;
@@ -969,19 +972,19 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
969 972
970 memset(&fl, 0, sizeof(fl)); 973 memset(&fl, 0, sizeof(fl));
971 fl.proto = IPPROTO_TCP; 974 fl.proto = IPPROTO_TCP;
972 ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); 975 ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
973 ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr); 976 ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
974 fl.fl6_flowlabel = 0; 977 fl.fl6_flowlabel = 0;
975 fl.oif = req->af.v6_req.iif; 978 fl.oif = treq->iif;
976 fl.fl_ip_dport = req->rmt_port; 979 fl.fl_ip_dport = inet_rsk(req)->rmt_port;
977 fl.fl_ip_sport = inet_sk(sk)->sport; 980 fl.fl_ip_sport = inet_sk(sk)->sport;
978 981
979 if (dst == NULL) { 982 if (dst == NULL) {
980 opt = np->opt; 983 opt = np->opt;
981 if (opt == NULL && 984 if (opt == NULL &&
982 np->rxopt.bits.srcrt == 2 && 985 np->rxopt.bits.srcrt == 2 &&
983 req->af.v6_req.pktopts) { 986 treq->pktopts) {
984 struct sk_buff *pktopts = req->af.v6_req.pktopts; 987 struct sk_buff *pktopts = treq->pktopts;
985 struct inet6_skb_parm *rxopt = IP6CB(pktopts); 988 struct inet6_skb_parm *rxopt = IP6CB(pktopts);
986 if (rxopt->srcrt) 989 if (rxopt->srcrt)
987 opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); 990 opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt));
@@ -1008,10 +1011,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
1008 struct tcphdr *th = skb->h.th; 1011 struct tcphdr *th = skb->h.th;
1009 1012
1010 th->check = tcp_v6_check(th, skb->len, 1013 th->check = tcp_v6_check(th, skb->len,
1011 &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, 1014 &treq->loc_addr, &treq->rmt_addr,
1012 csum_partial((char *)th, skb->len, skb->csum)); 1015 csum_partial((char *)th, skb->len, skb->csum));
1013 1016
1014 ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); 1017 ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
1015 err = ip6_xmit(sk, skb, &fl, opt, 0); 1018 err = ip6_xmit(sk, skb, &fl, opt, 0);
1016 if (err == NET_XMIT_CN) 1019 if (err == NET_XMIT_CN)
1017 err = 0; 1020 err = 0;
@@ -1024,17 +1027,18 @@ done:
1024 return err; 1027 return err;
1025} 1028}
1026 1029
1027static void tcp_v6_or_free(struct open_request *req) 1030static void tcp_v6_reqsk_destructor(struct request_sock *req)
1028{ 1031{
1029 if (req->af.v6_req.pktopts) 1032 if (tcp6_rsk(req)->pktopts)
1030 kfree_skb(req->af.v6_req.pktopts); 1033 kfree_skb(tcp6_rsk(req)->pktopts);
1031} 1034}
1032 1035
1033static struct or_calltable or_ipv6 = { 1036static struct request_sock_ops tcp6_request_sock_ops = {
1034 .family = AF_INET6, 1037 .family = AF_INET6,
1038 .obj_size = sizeof(struct tcp6_request_sock),
1035 .rtx_syn_ack = tcp_v6_send_synack, 1039 .rtx_syn_ack = tcp_v6_send_synack,
1036 .send_ack = tcp_v6_or_send_ack, 1040 .send_ack = tcp_v6_reqsk_send_ack,
1037 .destructor = tcp_v6_or_free, 1041 .destructor = tcp_v6_reqsk_destructor,
1038 .send_reset = tcp_v6_send_reset 1042 .send_reset = tcp_v6_send_reset
1039}; 1043};
1040 1044
@@ -1219,15 +1223,15 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
1219 tcp_tw_put(tw); 1223 tcp_tw_put(tw);
1220} 1224}
1221 1225
1222static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req) 1226static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1223{ 1227{
1224 tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); 1228 tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent);
1225} 1229}
1226 1230
1227 1231
1228static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) 1232static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1229{ 1233{
1230 struct open_request *req, **prev; 1234 struct request_sock *req, **prev;
1231 struct tcphdr *th = skb->h.th; 1235 struct tcphdr *th = skb->h.th;
1232 struct tcp_sock *tp = tcp_sk(sk); 1236 struct tcp_sock *tp = tcp_sk(sk);
1233 struct sock *nsk; 1237 struct sock *nsk;
@@ -1260,21 +1264,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1260 return sk; 1264 return sk;
1261} 1265}
1262 1266
1263static void tcp_v6_synq_add(struct sock *sk, struct open_request *req) 1267static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
1264{ 1268{
1265 struct tcp_sock *tp = tcp_sk(sk); 1269 struct tcp_sock *tp = tcp_sk(sk);
1266 struct tcp_listen_opt *lopt = tp->listen_opt; 1270 struct listen_sock *lopt = tp->accept_queue.listen_opt;
1267 u32 h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port, lopt->hash_rnd); 1271 u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
1268
1269 req->sk = NULL;
1270 req->expires = jiffies + TCP_TIMEOUT_INIT;
1271 req->retrans = 0;
1272 req->dl_next = lopt->syn_table[h];
1273
1274 write_lock(&tp->syn_wait_lock);
1275 lopt->syn_table[h] = req;
1276 write_unlock(&tp->syn_wait_lock);
1277 1272
1273 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
1278 tcp_synq_added(sk); 1274 tcp_synq_added(sk);
1279} 1275}
1280 1276
@@ -1284,10 +1280,11 @@ static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
1284 */ 1280 */
1285static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) 1281static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1286{ 1282{
1283 struct tcp6_request_sock *treq;
1287 struct ipv6_pinfo *np = inet6_sk(sk); 1284 struct ipv6_pinfo *np = inet6_sk(sk);
1288 struct tcp_options_received tmp_opt; 1285 struct tcp_options_received tmp_opt;
1289 struct tcp_sock *tp = tcp_sk(sk); 1286 struct tcp_sock *tp = tcp_sk(sk);
1290 struct open_request *req = NULL; 1287 struct request_sock *req = NULL;
1291 __u32 isn = TCP_SKB_CB(skb)->when; 1288 __u32 isn = TCP_SKB_CB(skb)->when;
1292 1289
1293 if (skb->protocol == htons(ETH_P_IP)) 1290 if (skb->protocol == htons(ETH_P_IP))
@@ -1308,7 +1305,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1308 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) 1305 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1309 goto drop; 1306 goto drop;
1310 1307
1311 req = tcp_openreq_alloc(); 1308 req = reqsk_alloc(&tcp6_request_sock_ops);
1312 if (req == NULL) 1309 if (req == NULL)
1313 goto drop; 1310 goto drop;
1314 1311
@@ -1321,28 +1318,28 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1321 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 1318 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1322 tcp_openreq_init(req, &tmp_opt, skb); 1319 tcp_openreq_init(req, &tmp_opt, skb);
1323 1320
1324 req->class = &or_ipv6; 1321 treq = tcp6_rsk(req);
1325 ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); 1322 ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
1326 ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); 1323 ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
1327 TCP_ECN_create_request(req, skb->h.th); 1324 TCP_ECN_create_request(req, skb->h.th);
1328 req->af.v6_req.pktopts = NULL; 1325 treq->pktopts = NULL;
1329 if (ipv6_opt_accepted(sk, skb) || 1326 if (ipv6_opt_accepted(sk, skb) ||
1330 np->rxopt.bits.rxinfo || 1327 np->rxopt.bits.rxinfo ||
1331 np->rxopt.bits.rxhlim) { 1328 np->rxopt.bits.rxhlim) {
1332 atomic_inc(&skb->users); 1329 atomic_inc(&skb->users);
1333 req->af.v6_req.pktopts = skb; 1330 treq->pktopts = skb;
1334 } 1331 }
1335 req->af.v6_req.iif = sk->sk_bound_dev_if; 1332 treq->iif = sk->sk_bound_dev_if;
1336 1333
1337 /* So that link locals have meaning */ 1334 /* So that link locals have meaning */
1338 if (!sk->sk_bound_dev_if && 1335 if (!sk->sk_bound_dev_if &&
1339 ipv6_addr_type(&req->af.v6_req.rmt_addr) & IPV6_ADDR_LINKLOCAL) 1336 ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
1340 req->af.v6_req.iif = tcp_v6_iif(skb); 1337 treq->iif = tcp_v6_iif(skb);
1341 1338
1342 if (isn == 0) 1339 if (isn == 0)
1343 isn = tcp_v6_init_sequence(sk,skb); 1340 isn = tcp_v6_init_sequence(sk,skb);
1344 1341
1345 req->snt_isn = isn; 1342 tcp_rsk(req)->snt_isn = isn;
1346 1343
1347 if (tcp_v6_send_synack(sk, req, NULL)) 1344 if (tcp_v6_send_synack(sk, req, NULL))
1348 goto drop; 1345 goto drop;
@@ -1353,16 +1350,17 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1353 1350
1354drop: 1351drop:
1355 if (req) 1352 if (req)
1356 tcp_openreq_free(req); 1353 reqsk_free(req);
1357 1354
1358 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 1355 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1359 return 0; /* don't send reset */ 1356 return 0; /* don't send reset */
1360} 1357}
1361 1358
1362static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, 1359static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1363 struct open_request *req, 1360 struct request_sock *req,
1364 struct dst_entry *dst) 1361 struct dst_entry *dst)
1365{ 1362{
1363 struct tcp6_request_sock *treq = tcp6_rsk(req);
1366 struct ipv6_pinfo *newnp, *np = inet6_sk(sk); 1364 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
1367 struct tcp6_sock *newtcp6sk; 1365 struct tcp6_sock *newtcp6sk;
1368 struct inet_sock *newinet; 1366 struct inet_sock *newinet;
@@ -1426,10 +1424,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1426 goto out_overflow; 1424 goto out_overflow;
1427 1425
1428 if (np->rxopt.bits.srcrt == 2 && 1426 if (np->rxopt.bits.srcrt == 2 &&
1429 opt == NULL && req->af.v6_req.pktopts) { 1427 opt == NULL && treq->pktopts) {
1430 struct inet6_skb_parm *rxopt = IP6CB(req->af.v6_req.pktopts); 1428 struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts);
1431 if (rxopt->srcrt) 1429 if (rxopt->srcrt)
1432 opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt)); 1430 opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr *)(treq->pktopts->nh.raw + rxopt->srcrt));
1433 } 1431 }
1434 1432
1435 if (dst == NULL) { 1433 if (dst == NULL) {
@@ -1438,16 +1436,16 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1438 1436
1439 memset(&fl, 0, sizeof(fl)); 1437 memset(&fl, 0, sizeof(fl));
1440 fl.proto = IPPROTO_TCP; 1438 fl.proto = IPPROTO_TCP;
1441 ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); 1439 ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
1442 if (opt && opt->srcrt) { 1440 if (opt && opt->srcrt) {
1443 struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; 1441 struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
1444 ipv6_addr_copy(&final, &fl.fl6_dst); 1442 ipv6_addr_copy(&final, &fl.fl6_dst);
1445 ipv6_addr_copy(&fl.fl6_dst, rt0->addr); 1443 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
1446 final_p = &final; 1444 final_p = &final;
1447 } 1445 }
1448 ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr); 1446 ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
1449 fl.oif = sk->sk_bound_dev_if; 1447 fl.oif = sk->sk_bound_dev_if;
1450 fl.fl_ip_dport = req->rmt_port; 1448 fl.fl_ip_dport = inet_rsk(req)->rmt_port;
1451 fl.fl_ip_sport = inet_sk(sk)->sport; 1449 fl.fl_ip_sport = inet_sk(sk)->sport;
1452 1450
1453 if (ip6_dst_lookup(sk, &dst, &fl)) 1451 if (ip6_dst_lookup(sk, &dst, &fl))
@@ -1482,10 +1480,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1482 1480
1483 memcpy(newnp, np, sizeof(struct ipv6_pinfo)); 1481 memcpy(newnp, np, sizeof(struct ipv6_pinfo));
1484 1482
1485 ipv6_addr_copy(&newnp->daddr, &req->af.v6_req.rmt_addr); 1483 ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr);
1486 ipv6_addr_copy(&newnp->saddr, &req->af.v6_req.loc_addr); 1484 ipv6_addr_copy(&newnp->saddr, &treq->loc_addr);
1487 ipv6_addr_copy(&newnp->rcv_saddr, &req->af.v6_req.loc_addr); 1485 ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr);
1488 newsk->sk_bound_dev_if = req->af.v6_req.iif; 1486 newsk->sk_bound_dev_if = treq->iif;
1489 1487
1490 /* Now IPv6 options... 1488 /* Now IPv6 options...
1491 1489
@@ -1498,11 +1496,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1498 1496
1499 /* Clone pktoptions received with SYN */ 1497 /* Clone pktoptions received with SYN */
1500 newnp->pktoptions = NULL; 1498 newnp->pktoptions = NULL;
1501 if (req->af.v6_req.pktopts) { 1499 if (treq->pktopts != NULL) {
1502 newnp->pktoptions = skb_clone(req->af.v6_req.pktopts, 1500 newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
1503 GFP_ATOMIC); 1501 kfree_skb(treq->pktopts);
1504 kfree_skb(req->af.v6_req.pktopts); 1502 treq->pktopts = NULL;
1505 req->af.v6_req.pktopts = NULL;
1506 if (newnp->pktoptions) 1503 if (newnp->pktoptions)
1507 skb_set_owner_r(newnp->pktoptions, newsk); 1504 skb_set_owner_r(newnp->pktoptions, newsk);
1508 } 1505 }
@@ -2028,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
2028 sk->sk_state = TCP_CLOSE; 2025 sk->sk_state = TCP_CLOSE;
2029 2026
2030 tp->af_specific = &ipv6_specific; 2027 tp->af_specific = &ipv6_specific;
2031 2028 tp->ca_ops = &tcp_init_congestion_ops;
2032 sk->sk_write_space = sk_stream_write_space; 2029 sk->sk_write_space = sk_stream_write_space;
2033 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2034 2031
@@ -2050,7 +2047,7 @@ static int tcp_v6_destroy_sock(struct sock *sk)
2050 2047
2051/* Proc filesystem TCPv6 sock list dumping. */ 2048/* Proc filesystem TCPv6 sock list dumping. */
2052static void get_openreq6(struct seq_file *seq, 2049static void get_openreq6(struct seq_file *seq,
2053 struct sock *sk, struct open_request *req, int i, int uid) 2050 struct sock *sk, struct request_sock *req, int i, int uid)
2054{ 2051{
2055 struct in6_addr *dest, *src; 2052 struct in6_addr *dest, *src;
2056 int ttd = req->expires - jiffies; 2053 int ttd = req->expires - jiffies;
@@ -2058,8 +2055,8 @@ static void get_openreq6(struct seq_file *seq,
2058 if (ttd < 0) 2055 if (ttd < 0)
2059 ttd = 0; 2056 ttd = 0;
2060 2057
2061 src = &req->af.v6_req.loc_addr; 2058 src = &tcp6_rsk(req)->loc_addr;
2062 dest = &req->af.v6_req.rmt_addr; 2059 dest = &tcp6_rsk(req)->rmt_addr;
2063 seq_printf(seq, 2060 seq_printf(seq,
2064 "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " 2061 "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
2065 "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", 2062 "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -2069,7 +2066,7 @@ static void get_openreq6(struct seq_file *seq,
2069 ntohs(inet_sk(sk)->sport), 2066 ntohs(inet_sk(sk)->sport),
2070 dest->s6_addr32[0], dest->s6_addr32[1], 2067 dest->s6_addr32[0], dest->s6_addr32[1],
2071 dest->s6_addr32[2], dest->s6_addr32[3], 2068 dest->s6_addr32[2], dest->s6_addr32[3],
2072 ntohs(req->rmt_port), 2069 ntohs(inet_rsk(req)->rmt_port),
2073 TCP_SYN_RECV, 2070 TCP_SYN_RECV,
2074 0,0, /* could print option size, but that is af dependent. */ 2071 0,0, /* could print option size, but that is af dependent. */
2075 1, /* timers active (only the expire timer) */ 2072 1, /* timers active (only the expire timer) */
@@ -2239,6 +2236,7 @@ struct proto tcpv6_prot = {
2239 .sysctl_rmem = sysctl_tcp_rmem, 2236 .sysctl_rmem = sysctl_tcp_rmem,
2240 .max_header = MAX_TCP_HEADER, 2237 .max_header = MAX_TCP_HEADER,
2241 .obj_size = sizeof(struct tcp6_sock), 2238 .obj_size = sizeof(struct tcp6_sock),
2239 .rsk_prot = &tcp6_request_sock_ops,
2242}; 2240};
2243 2241
2244static struct inet6_protocol tcpv6_protocol = { 2242static struct inet6_protocol tcpv6_protocol = {
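The tcp_ipv6.c changes are the IPv6 half of the open_request -> request_sock conversion: generic connection-request fields (such as rmt_port) move into struct inet_request_sock, the TCP sequence numbers into struct tcp_request_sock, and the IPv6-specific addresses, interface index and pktopts into struct tcp6_request_sock, all reached through the inet_rsk()/tcp_rsk()/tcp6_rsk() accessors; allocation goes through reqsk_alloc(&tcp6_request_sock_ops) and SYN-queue insertion through reqsk_queue_hash_req(). A rough sketch of the nesting these accessors assume (layout simplified from the real definitions in the request_sock/tcp headers):

struct request_sock {                        /* protocol-independent core */
	struct request_sock      *dl_next;
	struct sock              *sk;
	struct request_sock_ops  *rsk_ops;       /* family, obj_size, send_ack, ... */
	/* ... timers, window state ... */
};

struct inet_request_sock {                   /* AF_INET/AF_INET6 common part */
	struct request_sock  req;
	u16                  rmt_port;           /* was open_request.rmt_port */
	/* ... */
};

struct tcp_request_sock {                    /* TCP sequence-number state */
	struct inet_request_sock  req;
	u32                       rcv_isn;
	u32                       snt_isn;
};

struct tcp6_request_sock {                   /* IPv6-only tail */
	struct tcp_request_sock  req;
	struct in6_addr          loc_addr;
	struct in6_addr          rmt_addr;
	struct sk_buff          *pktopts;
	int                      iif;
};

/* each accessor is just a downcast of the embedded struct request_sock */
static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *req)
{
	return (struct tcp6_request_sock *)req;
}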
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e251d0ba4f39..eff050ac7049 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -300,12 +300,12 @@ csum_copy_err:
300 /* Clear queue. */ 300 /* Clear queue. */
301 if (flags&MSG_PEEK) { 301 if (flags&MSG_PEEK) {
302 int clear = 0; 302 int clear = 0;
303 spin_lock_irq(&sk->sk_receive_queue.lock); 303 spin_lock_bh(&sk->sk_receive_queue.lock);
304 if (skb == skb_peek(&sk->sk_receive_queue)) { 304 if (skb == skb_peek(&sk->sk_receive_queue)) {
305 __skb_unlink(skb, &sk->sk_receive_queue); 305 __skb_unlink(skb, &sk->sk_receive_queue);
306 clear = 1; 306 clear = 1;
307 } 307 }
308 spin_unlock_irq(&sk->sk_receive_queue.lock); 308 spin_unlock_bh(&sk->sk_receive_queue.lock);
309 if (clear) 309 if (clear)
310 kfree_skb(skb); 310 kfree_skb(skb);
311 } 311 }
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index ffcadd68b951..60c26c87277e 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -466,7 +466,7 @@ static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
466 return; 466 return;
467} 467}
468 468
469static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args) 469static int xfrm6_tunnel_init_state(struct xfrm_state *x)
470{ 470{
471 if (!x->props.mode) 471 if (!x->props.mode)
472 return -EINVAL; 472 return -EINVAL;
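xfrm_type.init_state() drops its unused second argument here; the matching af_key.c hunk below replaces the open-coded xfrm_get_type() plus x->type->init_state() sequence in pfkey_msg2xfrm_state() with a single call to the new xfrm_init_state() helper, which also takes over marking the state XFRM_STATE_VALID on success. Condensed, the caller side now looks like this (fragment mirroring the af_key.c change; error unwinding trimmed):

	int err;

	err = xfrm_init_state(x);        /* resolve x->type and run ->init_state(x) */
	if (err)
		goto out;                /* the existing error path frees the state */

	x->km.seq = hdr->sadb_msg_seq;   /* no explicit km.state assignment needed anymore */
	return x;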
diff --git a/net/key/af_key.c b/net/key/af_key.c
index ce980aa94ed8..4879743b945a 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -656,13 +656,18 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
656 sa->sadb_sa_exttype = SADB_EXT_SA; 656 sa->sadb_sa_exttype = SADB_EXT_SA;
657 sa->sadb_sa_spi = x->id.spi; 657 sa->sadb_sa_spi = x->id.spi;
658 sa->sadb_sa_replay = x->props.replay_window; 658 sa->sadb_sa_replay = x->props.replay_window;
659 sa->sadb_sa_state = SADB_SASTATE_DYING; 659 switch (x->km.state) {
660 if (x->km.state == XFRM_STATE_VALID && !x->km.dying) 660 case XFRM_STATE_VALID:
661 sa->sadb_sa_state = SADB_SASTATE_MATURE; 661 sa->sadb_sa_state = x->km.dying ?
662 else if (x->km.state == XFRM_STATE_ACQ) 662 SADB_SASTATE_DYING : SADB_SASTATE_MATURE;
663 break;
664 case XFRM_STATE_ACQ:
663 sa->sadb_sa_state = SADB_SASTATE_LARVAL; 665 sa->sadb_sa_state = SADB_SASTATE_LARVAL;
664 else if (x->km.state == XFRM_STATE_EXPIRED) 666 break;
667 default:
665 sa->sadb_sa_state = SADB_SASTATE_DEAD; 668 sa->sadb_sa_state = SADB_SASTATE_DEAD;
669 break;
670 }
666 sa->sadb_sa_auth = 0; 671 sa->sadb_sa_auth = 0;
667 if (x->aalg) { 672 if (x->aalg) {
668 struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); 673 struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
@@ -685,6 +690,8 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
685 sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; 690 sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
686 if (x->props.flags & XFRM_STATE_DECAP_DSCP) 691 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
687 sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; 692 sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
693 if (x->props.flags & XFRM_STATE_NOPMTUDISC)
694 sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
688 695
689 /* hard time */ 696 /* hard time */
690 if (hsc & 2) { 697 if (hsc & 2) {
@@ -969,6 +976,8 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
969 x->props.flags |= XFRM_STATE_NOECN; 976 x->props.flags |= XFRM_STATE_NOECN;
970 if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) 977 if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP)
971 x->props.flags |= XFRM_STATE_DECAP_DSCP; 978 x->props.flags |= XFRM_STATE_DECAP_DSCP;
979 if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
980 x->props.flags |= XFRM_STATE_NOPMTUDISC;
972 981
973 lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1]; 982 lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
974 if (lifetime != NULL) { 983 if (lifetime != NULL) {
@@ -1091,17 +1100,11 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
1091 } 1100 }
1092 } 1101 }
1093 1102
1094 x->type = xfrm_get_type(proto, x->props.family); 1103 err = xfrm_init_state(x);
1095 if (x->type == NULL) { 1104 if (err)
1096 err = -ENOPROTOOPT;
1097 goto out;
1098 }
1099 if (x->type->init_state(x, NULL)) {
1100 err = -EINVAL;
1101 goto out; 1105 goto out;
1102 } 1106
1103 x->km.seq = hdr->sadb_msg_seq; 1107 x->km.seq = hdr->sadb_msg_seq;
1104 x->km.state = XFRM_STATE_VALID;
1105 return x; 1108 return x;
1106 1109
1107out: 1110out:
@@ -1240,13 +1243,78 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
1240 return 0; 1243 return 0;
1241} 1244}
1242 1245
1246static inline int event2poltype(int event)
1247{
1248 switch (event) {
1249 case XFRM_MSG_DELPOLICY:
1250 return SADB_X_SPDDELETE;
1251 case XFRM_MSG_NEWPOLICY:
1252 return SADB_X_SPDADD;
1253 case XFRM_MSG_UPDPOLICY:
1254 return SADB_X_SPDUPDATE;
1255 case XFRM_MSG_POLEXPIRE:
1256 // return SADB_X_SPDEXPIRE;
1257 default:
1258 printk("pfkey: Unknown policy event %d\n", event);
1259 break;
1260 }
1261
1262 return 0;
1263}
1264
1265static inline int event2keytype(int event)
1266{
1267 switch (event) {
1268 case XFRM_MSG_DELSA:
1269 return SADB_DELETE;
1270 case XFRM_MSG_NEWSA:
1271 return SADB_ADD;
1272 case XFRM_MSG_UPDSA:
1273 return SADB_UPDATE;
1274 case XFRM_MSG_EXPIRE:
1275 return SADB_EXPIRE;
1276 default:
1277 printk("pfkey: Unknown SA event %d\n", event);
1278 break;
1279 }
1280
1281 return 0;
1282}
1283
1284/* ADD/UPD/DEL */
1285static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
1286{
1287 struct sk_buff *skb;
1288 struct sadb_msg *hdr;
1289 int hsc = 3;
1290
1291 if (c->event == XFRM_MSG_DELSA)
1292 hsc = 0;
1293
1294 skb = pfkey_xfrm_state2msg(x, 0, hsc);
1295
1296 if (IS_ERR(skb))
1297 return PTR_ERR(skb);
1298
1299 hdr = (struct sadb_msg *) skb->data;
1300 hdr->sadb_msg_version = PF_KEY_V2;
1301 hdr->sadb_msg_type = event2keytype(c->event);
1302 hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
1303 hdr->sadb_msg_errno = 0;
1304 hdr->sadb_msg_reserved = 0;
1305 hdr->sadb_msg_seq = c->seq;
1306 hdr->sadb_msg_pid = c->pid;
1307
1308 pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
1309
1310 return 0;
1311}
1243 1312
1244static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1313static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1245{ 1314{
1246 struct sk_buff *out_skb;
1247 struct sadb_msg *out_hdr;
1248 struct xfrm_state *x; 1315 struct xfrm_state *x;
1249 int err; 1316 int err;
1317 struct km_event c;
1250 1318
1251 xfrm_probe_algs(); 1319 xfrm_probe_algs();
1252 1320
@@ -1254,6 +1322,7 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
1254 if (IS_ERR(x)) 1322 if (IS_ERR(x))
1255 return PTR_ERR(x); 1323 return PTR_ERR(x);
1256 1324
1325 xfrm_state_hold(x);
1257 if (hdr->sadb_msg_type == SADB_ADD) 1326 if (hdr->sadb_msg_type == SADB_ADD)
1258 err = xfrm_state_add(x); 1327 err = xfrm_state_add(x);
1259 else 1328 else
@@ -1262,30 +1331,26 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
1262 if (err < 0) { 1331 if (err < 0) {
1263 x->km.state = XFRM_STATE_DEAD; 1332 x->km.state = XFRM_STATE_DEAD;
1264 xfrm_state_put(x); 1333 xfrm_state_put(x);
1265 return err; 1334 goto out;
1266 } 1335 }
1267 1336
1268 out_skb = pfkey_xfrm_state2msg(x, 0, 3); 1337 if (hdr->sadb_msg_type == SADB_ADD)
1269 if (IS_ERR(out_skb)) 1338 c.event = XFRM_MSG_NEWSA;
1270 return PTR_ERR(out_skb); /* XXX Should we return 0 here ? */ 1339 else
1271 1340 c.event = XFRM_MSG_UPDSA;
1272 out_hdr = (struct sadb_msg *) out_skb->data; 1341 c.seq = hdr->sadb_msg_seq;
1273 out_hdr->sadb_msg_version = hdr->sadb_msg_version; 1342 c.pid = hdr->sadb_msg_pid;
1274 out_hdr->sadb_msg_type = hdr->sadb_msg_type; 1343 km_state_notify(x, &c);
1275 out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); 1344out:
1276 out_hdr->sadb_msg_errno = 0; 1345 xfrm_state_put(x);
1277 out_hdr->sadb_msg_reserved = 0; 1346 return err;
1278 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
1279 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
1280
1281 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
1282
1283 return 0;
1284} 1347}
1285 1348
1286static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1349static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1287{ 1350{
1288 struct xfrm_state *x; 1351 struct xfrm_state *x;
1352 struct km_event c;
1353 int err;
1289 1354
1290 if (!ext_hdrs[SADB_EXT_SA-1] || 1355 if (!ext_hdrs[SADB_EXT_SA-1] ||
1291 !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 1356 !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
@@ -1301,13 +1366,19 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
1301 return -EPERM; 1366 return -EPERM;
1302 } 1367 }
1303 1368
1304 xfrm_state_delete(x); 1369 err = xfrm_state_delete(x);
1305 xfrm_state_put(x); 1370 if (err < 0) {
1371 xfrm_state_put(x);
1372 return err;
1373 }
1306 1374
1307 pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, 1375 c.seq = hdr->sadb_msg_seq;
1308 BROADCAST_ALL, sk); 1376 c.pid = hdr->sadb_msg_pid;
1377 c.event = XFRM_MSG_DELSA;
1378 km_state_notify(x, &c);
1379 xfrm_state_put(x);
1309 1380
1310 return 0; 1381 return err;
1311} 1382}
1312 1383
1313static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1384static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
@@ -1445,28 +1516,42 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg
1445 return 0; 1516 return 0;
1446} 1517}
1447 1518
1519static int key_notify_sa_flush(struct km_event *c)
1520{
1521 struct sk_buff *skb;
1522 struct sadb_msg *hdr;
1523
1524 skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
1525 if (!skb)
1526 return -ENOBUFS;
1527 hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
1528 hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto);
1529 hdr->sadb_msg_seq = c->seq;
1530 hdr->sadb_msg_pid = c->pid;
1531 hdr->sadb_msg_version = PF_KEY_V2;
1532 hdr->sadb_msg_errno = (uint8_t) 0;
1533 hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
1534
1535 pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
1536
1537 return 0;
1538}
1539
1448static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1540static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1449{ 1541{
1450 unsigned proto; 1542 unsigned proto;
1451 struct sk_buff *skb_out; 1543 struct km_event c;
1452 struct sadb_msg *hdr_out;
1453 1544
1454 proto = pfkey_satype2proto(hdr->sadb_msg_satype); 1545 proto = pfkey_satype2proto(hdr->sadb_msg_satype);
1455 if (proto == 0) 1546 if (proto == 0)
1456 return -EINVAL; 1547 return -EINVAL;
1457 1548
1458 skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
1459 if (!skb_out)
1460 return -ENOBUFS;
1461
1462 xfrm_state_flush(proto); 1549 xfrm_state_flush(proto);
1463 1550 c.data.proto = proto;
1464 hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg)); 1551 c.seq = hdr->sadb_msg_seq;
1465 pfkey_hdr_dup(hdr_out, hdr); 1552 c.pid = hdr->sadb_msg_pid;
1466 hdr_out->sadb_msg_errno = (uint8_t) 0; 1553 c.event = XFRM_MSG_FLUSHSA;
1467 hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); 1554 km_state_notify(NULL, &c);
1468
1469 pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
1470 1555
1471 return 0; 1556 return 0;
1472} 1557}
@@ -1859,6 +1944,35 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
1859 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); 1944 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
1860} 1945}
1861 1946
1947static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
1948{
1949 struct sk_buff *out_skb;
1950 struct sadb_msg *out_hdr;
1951 int err;
1952
1953 out_skb = pfkey_xfrm_policy2msg_prep(xp);
1954 if (IS_ERR(out_skb)) {
1955 err = PTR_ERR(out_skb);
1956 goto out;
1957 }
1958 pfkey_xfrm_policy2msg(out_skb, xp, dir);
1959
1960 out_hdr = (struct sadb_msg *) out_skb->data;
1961 out_hdr->sadb_msg_version = PF_KEY_V2;
1962
1963 if (c->data.byid && c->event == XFRM_MSG_DELPOLICY)
1964 out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
1965 else
1966 out_hdr->sadb_msg_type = event2poltype(c->event);
1967 out_hdr->sadb_msg_errno = 0;
1968 out_hdr->sadb_msg_seq = c->seq;
1969 out_hdr->sadb_msg_pid = c->pid;
1970 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
1971out:
1972 return 0;
1973
1974}
1975
1862static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1976static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1863{ 1977{
1864 int err; 1978 int err;
@@ -1866,8 +1980,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
1866 struct sadb_address *sa; 1980 struct sadb_address *sa;
1867 struct sadb_x_policy *pol; 1981 struct sadb_x_policy *pol;
1868 struct xfrm_policy *xp; 1982 struct xfrm_policy *xp;
1869 struct sk_buff *out_skb; 1983 struct km_event c;
1870 struct sadb_msg *out_hdr;
1871 1984
1872 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 1985 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
1873 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 1986 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -1935,31 +2048,23 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
1935 (err = parse_ipsecrequests(xp, pol)) < 0) 2048 (err = parse_ipsecrequests(xp, pol)) < 0)
1936 goto out; 2049 goto out;
1937 2050
1938 out_skb = pfkey_xfrm_policy2msg_prep(xp);
1939 if (IS_ERR(out_skb)) {
1940 err = PTR_ERR(out_skb);
1941 goto out;
1942 }
1943
1944 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, 2051 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
1945 hdr->sadb_msg_type != SADB_X_SPDUPDATE); 2052 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
1946 if (err) { 2053 if (err) {
1947 kfree_skb(out_skb); 2054 kfree(xp);
1948 goto out; 2055 return err;
1949 } 2056 }
1950 2057
1951 pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1); 2058 if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
2059 c.event = XFRM_MSG_UPDPOLICY;
2060 else
2061 c.event = XFRM_MSG_NEWPOLICY;
1952 2062
1953 xfrm_pol_put(xp); 2063 c.seq = hdr->sadb_msg_seq;
2064 c.pid = hdr->sadb_msg_pid;
1954 2065
1955 out_hdr = (struct sadb_msg *) out_skb->data; 2066 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
1956 out_hdr->sadb_msg_version = hdr->sadb_msg_version; 2067 xfrm_pol_put(xp);
1957 out_hdr->sadb_msg_type = hdr->sadb_msg_type;
1958 out_hdr->sadb_msg_satype = 0;
1959 out_hdr->sadb_msg_errno = 0;
1960 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
1961 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
1962 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
1963 return 0; 2068 return 0;
1964 2069
1965out: 2070out:
@@ -1973,9 +2078,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
1973 struct sadb_address *sa; 2078 struct sadb_address *sa;
1974 struct sadb_x_policy *pol; 2079 struct sadb_x_policy *pol;
1975 struct xfrm_policy *xp; 2080 struct xfrm_policy *xp;
1976 struct sk_buff *out_skb;
1977 struct sadb_msg *out_hdr;
1978 struct xfrm_selector sel; 2081 struct xfrm_selector sel;
2082 struct km_event c;
1979 2083
1980 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 2084 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
1981 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 2085 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2010,25 +2114,40 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
2010 2114
2011 err = 0; 2115 err = 0;
2012 2116
2117 c.seq = hdr->sadb_msg_seq;
2118 c.pid = hdr->sadb_msg_pid;
2119 c.event = XFRM_MSG_DELPOLICY;
2120 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
2121
2122 xfrm_pol_put(xp);
2123 return err;
2124}
2125
2126static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, struct sadb_msg *hdr, int dir)
2127{
2128 int err;
2129 struct sk_buff *out_skb;
2130 struct sadb_msg *out_hdr;
2131 err = 0;
2132
2013 out_skb = pfkey_xfrm_policy2msg_prep(xp); 2133 out_skb = pfkey_xfrm_policy2msg_prep(xp);
2014 if (IS_ERR(out_skb)) { 2134 if (IS_ERR(out_skb)) {
2015 err = PTR_ERR(out_skb); 2135 err = PTR_ERR(out_skb);
2016 goto out; 2136 goto out;
2017 } 2137 }
2018 pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1); 2138 pfkey_xfrm_policy2msg(out_skb, xp, dir);
2019 2139
2020 out_hdr = (struct sadb_msg *) out_skb->data; 2140 out_hdr = (struct sadb_msg *) out_skb->data;
2021 out_hdr->sadb_msg_version = hdr->sadb_msg_version; 2141 out_hdr->sadb_msg_version = hdr->sadb_msg_version;
2022 out_hdr->sadb_msg_type = SADB_X_SPDDELETE; 2142 out_hdr->sadb_msg_type = hdr->sadb_msg_type;
2023 out_hdr->sadb_msg_satype = 0; 2143 out_hdr->sadb_msg_satype = 0;
2024 out_hdr->sadb_msg_errno = 0; 2144 out_hdr->sadb_msg_errno = 0;
2025 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; 2145 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
2026 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; 2146 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
2027 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk); 2147 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk);
2028 err = 0; 2148 err = 0;
2029 2149
2030out: 2150out:
2031 xfrm_pol_put(xp);
2032 return err; 2151 return err;
2033} 2152}
2034 2153
@@ -2037,8 +2156,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2037 int err; 2156 int err;
2038 struct sadb_x_policy *pol; 2157 struct sadb_x_policy *pol;
2039 struct xfrm_policy *xp; 2158 struct xfrm_policy *xp;
2040 struct sk_buff *out_skb; 2159 struct km_event c;
2041 struct sadb_msg *out_hdr;
2042 2160
2043 if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) 2161 if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
2044 return -EINVAL; 2162 return -EINVAL;
@@ -2050,24 +2168,16 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2050 2168
2051 err = 0; 2169 err = 0;
2052 2170
2053 out_skb = pfkey_xfrm_policy2msg_prep(xp); 2171 c.seq = hdr->sadb_msg_seq;
2054 if (IS_ERR(out_skb)) { 2172 c.pid = hdr->sadb_msg_pid;
2055 err = PTR_ERR(out_skb); 2173 if (hdr->sadb_msg_type == SADB_X_SPDDELETE2) {
2056 goto out; 2174 c.data.byid = 1;
2175 c.event = XFRM_MSG_DELPOLICY;
2176 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
2177 } else {
2178 err = key_pol_get_resp(sk, xp, hdr, pol->sadb_x_policy_dir-1);
2057 } 2179 }
2058 pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
2059 2180
2060 out_hdr = (struct sadb_msg *) out_skb->data;
2061 out_hdr->sadb_msg_version = hdr->sadb_msg_version;
2062 out_hdr->sadb_msg_type = hdr->sadb_msg_type;
2063 out_hdr->sadb_msg_satype = 0;
2064 out_hdr->sadb_msg_errno = 0;
2065 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
2066 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
2067 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
2068 err = 0;
2069
2070out:
2071 xfrm_pol_put(xp); 2181 xfrm_pol_put(xp);
2072 return err; 2182 return err;
2073} 2183}
@@ -2102,22 +2212,34 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
2102 return xfrm_policy_walk(dump_sp, &data); 2212 return xfrm_policy_walk(dump_sp, &data);
2103} 2213}
2104 2214
2105static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 2215static int key_notify_policy_flush(struct km_event *c)
2106{ 2216{
2107 struct sk_buff *skb_out; 2217 struct sk_buff *skb_out;
2108 struct sadb_msg *hdr_out; 2218 struct sadb_msg *hdr;
2109 2219
2110 skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); 2220 skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
2111 if (!skb_out) 2221 if (!skb_out)
2112 return -ENOBUFS; 2222 return -ENOBUFS;
2223 hdr = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
2224 hdr->sadb_msg_seq = c->seq;
2225 hdr->sadb_msg_pid = c->pid;
2226 hdr->sadb_msg_version = PF_KEY_V2;
2227 hdr->sadb_msg_errno = (uint8_t) 0;
2228 hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
2229 pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL);
2230 return 0;
2113 2231
2114 xfrm_policy_flush(); 2232}
2115 2233
2116 hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg)); 2234static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
2117 pfkey_hdr_dup(hdr_out, hdr); 2235{
2118 hdr_out->sadb_msg_errno = (uint8_t) 0; 2236 struct km_event c;
2119 hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); 2237
2120 pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL); 2238 xfrm_policy_flush();
2239 c.event = XFRM_MSG_FLUSHPOLICY;
2240 c.pid = hdr->sadb_msg_pid;
2241 c.seq = hdr->sadb_msg_seq;
2242 km_policy_notify(NULL, 0, &c);
2121 2243
2122 return 0; 2244 return 0;
2123} 2245}
@@ -2317,11 +2439,23 @@ static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
2317 } 2439 }
2318} 2440}
2319 2441
2320static int pfkey_send_notify(struct xfrm_state *x, int hard) 2442static int key_notify_policy_expire(struct xfrm_policy *xp, struct km_event *c)
2443{
2444 return 0;
2445}
2446
2447static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
2321{ 2448{
2322 struct sk_buff *out_skb; 2449 struct sk_buff *out_skb;
2323 struct sadb_msg *out_hdr; 2450 struct sadb_msg *out_hdr;
2324 int hsc = (hard ? 2 : 1); 2451 int hard;
2452 int hsc;
2453
2454 hard = c->data.hard;
2455 if (hard)
2456 hsc = 2;
2457 else
2458 hsc = 1;
2325 2459
2326 out_skb = pfkey_xfrm_state2msg(x, 0, hsc); 2460 out_skb = pfkey_xfrm_state2msg(x, 0, hsc);
2327 if (IS_ERR(out_skb)) 2461 if (IS_ERR(out_skb))
@@ -2340,6 +2474,44 @@ static int pfkey_send_notify(struct xfrm_state *x, int hard)
2340 return 0; 2474 return 0;
2341} 2475}
2342 2476
2477static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
2478{
2479 switch (c->event) {
2480 case XFRM_MSG_EXPIRE:
2481 return key_notify_sa_expire(x, c);
2482 case XFRM_MSG_DELSA:
2483 case XFRM_MSG_NEWSA:
2484 case XFRM_MSG_UPDSA:
2485 return key_notify_sa(x, c);
2486 case XFRM_MSG_FLUSHSA:
2487 return key_notify_sa_flush(c);
2488 default:
2489 printk("pfkey: Unknown SA event %d\n", c->event);
2490 break;
2491 }
2492
2493 return 0;
2494}
2495
2496static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
2497{
2498 switch (c->event) {
2499 case XFRM_MSG_POLEXPIRE:
2500 return key_notify_policy_expire(xp, c);
2501 case XFRM_MSG_DELPOLICY:
2502 case XFRM_MSG_NEWPOLICY:
2503 case XFRM_MSG_UPDPOLICY:
2504 return key_notify_policy(xp, dir, c);
2505 case XFRM_MSG_FLUSHPOLICY:
2506 return key_notify_policy_flush(c);
2507 default:
2508 printk("pfkey: Unknown policy event %d\n", c->event);
2509 break;
2510 }
2511
2512 return 0;
2513}
2514
2343static u32 get_acqseq(void) 2515static u32 get_acqseq(void)
2344{ 2516{
2345 u32 res; 2517 u32 res;
@@ -2856,6 +3028,7 @@ static struct xfrm_mgr pfkeyv2_mgr =
2856 .acquire = pfkey_send_acquire, 3028 .acquire = pfkey_send_acquire,
2857 .compile_policy = pfkey_compile_policy, 3029 .compile_policy = pfkey_compile_policy,
2858 .new_mapping = pfkey_send_new_mapping, 3030 .new_mapping = pfkey_send_new_mapping,
3031 .notify_policy = pfkey_send_policy_notify,
2859}; 3032};
2860 3033
2861static void __exit ipsec_pfkey_exit(void) 3034static void __exit ipsec_pfkey_exit(void)
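The af_key hunks above convert PF_KEY from direct broadcasts to the shared km_event notifier: callers fill a struct km_event and hand it to km_policy_notify()/pfkey_send_policy_notify(), which dispatch on c->event. A minimal, hedged sketch of how a caller is expected to drive that path, using only the km_event fields visible in this patch (event, pid, seq, data.byid, data.hard); the helper name report_policy_delete is illustrative and not part of the patch:

    /* Hedged sketch: fill a km_event and let km_policy_notify() fan the
     * notification out to every registered key manager (PF_KEY, netlink).
     * Field names are taken from the hunks above; the helper is hypothetical.
     */
    static void report_policy_delete(struct xfrm_policy *xp, int dir,
                                     const struct sadb_msg *hdr)
    {
            struct km_event c;

            c.data.byid = 0;                  /* deleted by selector, not by id */
            c.seq       = hdr->sadb_msg_seq;  /* echo the requester's sequence  */
            c.pid       = hdr->sadb_msg_pid;  /* ... and its pid                */
            c.event     = XFRM_MSG_DELPOLICY; /* routed to key_notify_policy()  */
            km_policy_notify(xp, dir, &c);
    }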
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e41ce458c2a9..fc456a7aaec3 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -315,8 +315,8 @@ err:
315static void netlink_remove(struct sock *sk) 315static void netlink_remove(struct sock *sk)
316{ 316{
317 netlink_table_grab(); 317 netlink_table_grab();
318 nl_table[sk->sk_protocol].hash.entries--; 318 if (sk_del_node_init(sk))
319 sk_del_node_init(sk); 319 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 320 if (nlk_sk(sk)->groups)
321 __sk_del_bind_node(sk); 321 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 322 netlink_table_ungrab();
@@ -429,7 +429,12 @@ retry:
429 err = netlink_insert(sk, pid); 429 err = netlink_insert(sk, pid);
430 if (err == -EADDRINUSE) 430 if (err == -EADDRINUSE)
431 goto retry; 431 goto retry;
432 return 0; 432
433 /* If 2 threads race to autobind, that is fine. */
434 if (err == -EBUSY)
435 err = 0;
436
437 return err;
433} 438}
434 439
435static inline int netlink_capable(struct socket *sock, unsigned int flag) 440static inline int netlink_capable(struct socket *sock, unsigned int flag)
@@ -1095,8 +1100,7 @@ static int netlink_dump(struct sock *sk)
1095 return 0; 1100 return 0;
1096 } 1101 }
1097 1102
1098 nlh = __nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLMSG_DONE, sizeof(int)); 1103 nlh = NLMSG_NEW_ANSWER(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
1099 nlh->nlmsg_flags |= NLM_F_MULTI;
1100 memcpy(NLMSG_DATA(nlh), &len, sizeof(len)); 1104 memcpy(NLMSG_DATA(nlh), &len, sizeof(len));
1101 skb_queue_tail(&sk->sk_receive_queue, skb); 1105 skb_queue_tail(&sk->sk_receive_queue, skb);
1102 sk->sk_data_ready(sk, skb->len); 1106 sk->sk_data_ready(sk, skb->len);
@@ -1107,6 +1111,9 @@ static int netlink_dump(struct sock *sk)
1107 1111
1108 netlink_destroy_callback(cb); 1112 netlink_destroy_callback(cb);
1109 return 0; 1113 return 0;
1114
1115nlmsg_failure:
1116 return -ENOBUFS;
1110} 1117}
1111 1118
1112int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, 1119int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
@@ -1178,7 +1185,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
1178 } 1185 }
1179 1186
1180 rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 1187 rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1181 NLMSG_ERROR, sizeof(struct nlmsgerr)); 1188 NLMSG_ERROR, sizeof(struct nlmsgerr), 0);
1182 errmsg = NLMSG_DATA(rep); 1189 errmsg = NLMSG_DATA(rep);
1183 errmsg->error = err; 1190 errmsg->error = err;
1184 memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr)); 1191 memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr));
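Two related interface changes show up in this file and recur throughout the net/sched hunks below: __nlmsg_put() now takes the message flags directly, and open-coded NLMSG_PUT() plus a manual nlmsg_flags assignment is replaced by NLMSG_NEW() (netlink_dump() additionally switches to NLMSG_NEW_ANSWER() and gains an nlmsg_failure label). A hedged reconstruction of the macro relationship this implies; the exact definitions live in include/linux/netlink.h and are not part of this diff:

    /* Assumed shape of the helpers, inferred from their users in this patch:
     * NLMSG_NEW() jumps to a local 'nlmsg_failure' label when the skb has no
     * room, which is exactly the label the netlink_dump() hunk adds.
     */
    #define NLMSG_NEW(skb, pid, seq, type, len, flags)                  \
    ({      if (skb_tailroom(skb) < (int)NLMSG_SPACE(len))              \
                    goto nlmsg_failure;                                 \
            __nlmsg_put(skb, pid, seq, type, len, flags); })

    #define NLMSG_PUT(skb, pid, seq, type, len)                         \
            NLMSG_NEW(skb, pid, seq, type, len, 0)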
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a17..dada34a77b21 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
138 138
139 _debug("### End Work"); 139 _debug("### End Work");
140 140
141 try_to_freeze(PF_FREEZE); 141 try_to_freeze();
142 142
143 /* discard pending signals */ 143 /* discard pending signals */
144 rxrpc_discard_my_signals(); 144 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d9228..1aadd026d354 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
107 107
108 _debug("### End Inbound Calls"); 108 _debug("### End Inbound Calls");
109 109
110 try_to_freeze(PF_FREEZE); 110 try_to_freeze();
111 111
112 /* discard pending signals */ 112 /* discard pending signals */
113 rxrpc_discard_my_signals(); 113 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290bb..3ac81cdd1211 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
90 complete_and_exit(&krxtimod_dead, 0); 90 complete_and_exit(&krxtimod_dead, 0);
91 } 91 }
92 92
93 try_to_freeze(PF_FREEZE); 93 try_to_freeze();
94 94
95 /* discard pending signals */ 95 /* discard pending signals */
96 rxrpc_discard_my_signals(); 96 rxrpc_discard_my_signals();
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b0941186f867..7bac249258e3 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -405,7 +405,7 @@ config NET_EMATCH_STACK
405 ---help--- 405 ---help---
406 Size of the local stack variable used while evaluating the tree of 406 Size of the local stack variable used while evaluating the tree of
407 ematches. Limits the depth of the tree, i.e. the number of 407 ematches. Limits the depth of the tree, i.e. the number of
408 encapsulated precedences. Every level requires 4 bytes of addtional 408 encapsulated precedences. Every level requires 4 bytes of additional
409 stack space. 409 stack space.
410 410
411config NET_EMATCH_CMP 411config NET_EMATCH_CMP
@@ -449,6 +449,19 @@ config NET_EMATCH_META
449 To compile this code as a module, choose M here: the 449 To compile this code as a module, choose M here: the
450 module will be called em_meta. 450 module will be called em_meta.
451 451
452config NET_EMATCH_TEXT
453 tristate "Textsearch"
454 depends on NET_EMATCH
455 select TEXTSEARCH
456 select TEXTSEARCH_KMP
457 select TEXTSEARCH_FSM
458 ---help---
 459	  Say Y here if you want to be able to classify packets based on
460 textsearch comparisons.
461
462 To compile this code as a module, choose M here: the
463 module will be called em_text.
464
452config NET_CLS_ACT 465config NET_CLS_ACT
453 bool "Packet ACTION" 466 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS 467 depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba8..8f58cecd6266 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o 40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o 41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o 42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
43obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index cafcb084098d..9594206e6035 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -428,15 +428,15 @@ errout:
428 428
429static int 429static int
430tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, 430tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
431 unsigned flags, int event, int bind, int ref) 431 u16 flags, int event, int bind, int ref)
432{ 432{
433 struct tcamsg *t; 433 struct tcamsg *t;
434 struct nlmsghdr *nlh; 434 struct nlmsghdr *nlh;
435 unsigned char *b = skb->tail; 435 unsigned char *b = skb->tail;
436 struct rtattr *x; 436 struct rtattr *x;
437 437
438 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); 438 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
439 nlh->nlmsg_flags = flags; 439
440 t = NLMSG_DATA(nlh); 440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC; 441 t->tca_family = AF_UNSPEC;
442 442
@@ -669,7 +669,7 @@ err:
669} 669}
670 670
671static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, 671static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
672 unsigned flags) 672 u16 flags)
673{ 673{
674 struct tcamsg *t; 674 struct tcamsg *t;
675 struct nlmsghdr *nlh; 675 struct nlmsghdr *nlh;
@@ -684,8 +684,7 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
684 684
685 b = (unsigned char *)skb->tail; 685 b = (unsigned char *)skb->tail;
686 686
687 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); 687 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
688 nlh->nlmsg_flags = flags;
689 t = NLMSG_DATA(nlh); 688 t = NLMSG_DATA(nlh);
690 t->tca_family = AF_UNSPEC; 689 t->tca_family = AF_UNSPEC;
691 690
@@ -881,7 +880,7 @@ static int __init tc_action_init(void)
881 link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; 880 link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action;
882 } 881 }
883 882
884 printk("TC classifier action (bugs to netdev@oss.sgi.com cc " 883 printk("TC classifier action (bugs to netdev@vger.kernel.org cc "
885 "hadi@cyberus.ca)\n"); 884 "hadi@cyberus.ca)\n");
886 return 0; 885 return 0;
887} 886}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 56e66c3fe0fa..1616bf5c9627 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -322,14 +322,13 @@ errout:
322 322
323static int 323static int
324tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, 324tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
325 u32 pid, u32 seq, unsigned flags, int event) 325 u32 pid, u32 seq, u16 flags, int event)
326{ 326{
327 struct tcmsg *tcm; 327 struct tcmsg *tcm;
328 struct nlmsghdr *nlh; 328 struct nlmsghdr *nlh;
329 unsigned char *b = skb->tail; 329 unsigned char *b = skb->tail;
330 330
331 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
332 nlh->nlmsg_flags = flags;
333 tcm = NLMSG_DATA(nlh); 332 tcm = NLMSG_DATA(nlh);
334 tcm->tcm_family = AF_UNSPEC; 333 tcm->tcm_family = AF_UNSPEC;
335 tcm->tcm_ifindex = tp->q->dev->ifindex; 334 tcm->tcm_ifindex = tp->q->dev->ifindex;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 0d2d4415f334..dfb300bb6baa 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -261,6 +261,9 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh,
261 rta = (struct rtattr *) b; 261 rta = (struct rtattr *) b;
262 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 262 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
263 263
264 if (f->res.classid)
265 RTA_PUT(skb, TCA_BASIC_CLASSID, sizeof(u32), &f->res.classid);
266
264 if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || 267 if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
265 tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) 268 tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
266 goto rtattr_failure; 269 goto rtattr_failure;
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index f1eeaf65cee5..48bb23c2a35a 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -32,7 +32,7 @@
32 * +-----------+ +-----------+ 32 * +-----------+ +-----------+
33 * | | 33 * | |
34 * ---> meta_ops[INT][INDEV](...) | 34 * ---> meta_ops[INT][INDEV](...) |
35 * | | 35 * | |
36 * ----------- | 36 * ----------- |
37 * V V 37 * V V
38 * +-----------+ +-----------+ 38 * +-----------+ +-----------+
@@ -70,6 +70,7 @@
70#include <net/dst.h> 70#include <net/dst.h>
71#include <net/route.h> 71#include <net/route.h>
72#include <net/pkt_cls.h> 72#include <net/pkt_cls.h>
73#include <net/sock.h>
73 74
74struct meta_obj 75struct meta_obj
75{ 76{
@@ -284,6 +285,214 @@ META_COLLECTOR(int_rtiif)
284} 285}
285 286
286/************************************************************************** 287/**************************************************************************
288 * Socket Attributes
289 **************************************************************************/
290
291#define SKIP_NONLOCAL(skb) \
292 if (unlikely(skb->sk == NULL)) { \
293 *err = -1; \
294 return; \
295 }
296
297META_COLLECTOR(int_sk_family)
298{
299 SKIP_NONLOCAL(skb);
300 dst->value = skb->sk->sk_family;
301}
302
303META_COLLECTOR(int_sk_state)
304{
305 SKIP_NONLOCAL(skb);
306 dst->value = skb->sk->sk_state;
307}
308
309META_COLLECTOR(int_sk_reuse)
310{
311 SKIP_NONLOCAL(skb);
312 dst->value = skb->sk->sk_reuse;
313}
314
315META_COLLECTOR(int_sk_bound_if)
316{
317 SKIP_NONLOCAL(skb);
318 /* No error if bound_dev_if is 0, legal userspace check */
319 dst->value = skb->sk->sk_bound_dev_if;
320}
321
322META_COLLECTOR(var_sk_bound_if)
323{
324 SKIP_NONLOCAL(skb);
325
326 if (skb->sk->sk_bound_dev_if == 0) {
327 dst->value = (unsigned long) "any";
328 dst->len = 3;
329 } else {
330 struct net_device *dev;
331
332 dev = dev_get_by_index(skb->sk->sk_bound_dev_if);
333 *err = var_dev(dev, dst);
334 if (dev)
335 dev_put(dev);
336 }
337}
338
339META_COLLECTOR(int_sk_refcnt)
340{
341 SKIP_NONLOCAL(skb);
342 dst->value = atomic_read(&skb->sk->sk_refcnt);
343}
344
345META_COLLECTOR(int_sk_rcvbuf)
346{
347 SKIP_NONLOCAL(skb);
348 dst->value = skb->sk->sk_rcvbuf;
349}
350
351META_COLLECTOR(int_sk_shutdown)
352{
353 SKIP_NONLOCAL(skb);
354 dst->value = skb->sk->sk_shutdown;
355}
356
357META_COLLECTOR(int_sk_proto)
358{
359 SKIP_NONLOCAL(skb);
360 dst->value = skb->sk->sk_protocol;
361}
362
363META_COLLECTOR(int_sk_type)
364{
365 SKIP_NONLOCAL(skb);
366 dst->value = skb->sk->sk_type;
367}
368
369META_COLLECTOR(int_sk_rmem_alloc)
370{
371 SKIP_NONLOCAL(skb);
372 dst->value = atomic_read(&skb->sk->sk_rmem_alloc);
373}
374
375META_COLLECTOR(int_sk_wmem_alloc)
376{
377 SKIP_NONLOCAL(skb);
378 dst->value = atomic_read(&skb->sk->sk_wmem_alloc);
379}
380
381META_COLLECTOR(int_sk_omem_alloc)
382{
383 SKIP_NONLOCAL(skb);
384 dst->value = atomic_read(&skb->sk->sk_omem_alloc);
385}
386
387META_COLLECTOR(int_sk_rcv_qlen)
388{
389 SKIP_NONLOCAL(skb);
390 dst->value = skb->sk->sk_receive_queue.qlen;
391}
392
393META_COLLECTOR(int_sk_snd_qlen)
394{
395 SKIP_NONLOCAL(skb);
396 dst->value = skb->sk->sk_write_queue.qlen;
397}
398
399META_COLLECTOR(int_sk_wmem_queued)
400{
401 SKIP_NONLOCAL(skb);
402 dst->value = skb->sk->sk_wmem_queued;
403}
404
405META_COLLECTOR(int_sk_fwd_alloc)
406{
407 SKIP_NONLOCAL(skb);
408 dst->value = skb->sk->sk_forward_alloc;
409}
410
411META_COLLECTOR(int_sk_sndbuf)
412{
413 SKIP_NONLOCAL(skb);
414 dst->value = skb->sk->sk_sndbuf;
415}
416
417META_COLLECTOR(int_sk_alloc)
418{
419 SKIP_NONLOCAL(skb);
420 dst->value = skb->sk->sk_allocation;
421}
422
423META_COLLECTOR(int_sk_route_caps)
424{
425 SKIP_NONLOCAL(skb);
426 dst->value = skb->sk->sk_route_caps;
427}
428
429META_COLLECTOR(int_sk_hashent)
430{
431 SKIP_NONLOCAL(skb);
432 dst->value = skb->sk->sk_hashent;
433}
434
435META_COLLECTOR(int_sk_lingertime)
436{
437 SKIP_NONLOCAL(skb);
438 dst->value = skb->sk->sk_lingertime / HZ;
439}
440
441META_COLLECTOR(int_sk_err_qlen)
442{
443 SKIP_NONLOCAL(skb);
444 dst->value = skb->sk->sk_error_queue.qlen;
445}
446
447META_COLLECTOR(int_sk_ack_bl)
448{
449 SKIP_NONLOCAL(skb);
450 dst->value = skb->sk->sk_ack_backlog;
451}
452
453META_COLLECTOR(int_sk_max_ack_bl)
454{
455 SKIP_NONLOCAL(skb);
456 dst->value = skb->sk->sk_max_ack_backlog;
457}
458
459META_COLLECTOR(int_sk_prio)
460{
461 SKIP_NONLOCAL(skb);
462 dst->value = skb->sk->sk_priority;
463}
464
465META_COLLECTOR(int_sk_rcvlowat)
466{
467 SKIP_NONLOCAL(skb);
468 dst->value = skb->sk->sk_rcvlowat;
469}
470
471META_COLLECTOR(int_sk_rcvtimeo)
472{
473 SKIP_NONLOCAL(skb);
474 dst->value = skb->sk->sk_rcvtimeo / HZ;
475}
476
477META_COLLECTOR(int_sk_sndtimeo)
478{
479 SKIP_NONLOCAL(skb);
480 dst->value = skb->sk->sk_sndtimeo / HZ;
481}
482
483META_COLLECTOR(int_sk_sendmsg_off)
484{
485 SKIP_NONLOCAL(skb);
486 dst->value = skb->sk->sk_sndmsg_off;
487}
488
489META_COLLECTOR(int_sk_write_pend)
490{
491 SKIP_NONLOCAL(skb);
492 dst->value = skb->sk->sk_write_pending;
493}
494
495/**************************************************************************
287 * Meta value collectors assignment table 496 * Meta value collectors assignment table
288 **************************************************************************/ 497 **************************************************************************/
289 498
@@ -293,41 +502,75 @@ struct meta_ops
293 struct meta_value *, struct meta_obj *, int *); 502 struct meta_value *, struct meta_obj *, int *);
294}; 503};
295 504
505#define META_ID(name) TCF_META_ID_##name
506#define META_FUNC(name) { .get = meta_##name }
507
296/* Meta value operations table listing all meta value collectors and 508/* Meta value operations table listing all meta value collectors and
297 * assigns them to a type and meta id. */ 509 * assigns them to a type and meta id. */
298static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { 510static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
299 [TCF_META_TYPE_VAR] = { 511 [TCF_META_TYPE_VAR] = {
300 [TCF_META_ID_DEV] = { .get = meta_var_dev }, 512 [META_ID(DEV)] = META_FUNC(var_dev),
301 [TCF_META_ID_INDEV] = { .get = meta_var_indev }, 513 [META_ID(INDEV)] = META_FUNC(var_indev),
302 [TCF_META_ID_REALDEV] = { .get = meta_var_realdev } 514 [META_ID(REALDEV)] = META_FUNC(var_realdev),
515 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
303 }, 516 },
304 [TCF_META_TYPE_INT] = { 517 [TCF_META_TYPE_INT] = {
305 [TCF_META_ID_RANDOM] = { .get = meta_int_random }, 518 [META_ID(RANDOM)] = META_FUNC(int_random),
306 [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 }, 519 [META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0),
307 [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 }, 520 [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
308 [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 }, 521 [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
309 [TCF_META_ID_DEV] = { .get = meta_int_dev }, 522 [META_ID(DEV)] = META_FUNC(int_dev),
310 [TCF_META_ID_INDEV] = { .get = meta_int_indev }, 523 [META_ID(INDEV)] = META_FUNC(int_indev),
311 [TCF_META_ID_REALDEV] = { .get = meta_int_realdev }, 524 [META_ID(REALDEV)] = META_FUNC(int_realdev),
312 [TCF_META_ID_PRIORITY] = { .get = meta_int_priority }, 525 [META_ID(PRIORITY)] = META_FUNC(int_priority),
313 [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol }, 526 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
314 [TCF_META_ID_SECURITY] = { .get = meta_int_security }, 527 [META_ID(SECURITY)] = META_FUNC(int_security),
315 [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype }, 528 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
316 [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen }, 529 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
317 [TCF_META_ID_DATALEN] = { .get = meta_int_datalen }, 530 [META_ID(DATALEN)] = META_FUNC(int_datalen),
318 [TCF_META_ID_MACLEN] = { .get = meta_int_maclen }, 531 [META_ID(MACLEN)] = META_FUNC(int_maclen),
319#ifdef CONFIG_NETFILTER 532#ifdef CONFIG_NETFILTER
320 [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark }, 533 [META_ID(NFMARK)] = META_FUNC(int_nfmark),
321#endif 534#endif
322 [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex }, 535 [META_ID(TCINDEX)] = META_FUNC(int_tcindex),
323#ifdef CONFIG_NET_CLS_ACT 536#ifdef CONFIG_NET_CLS_ACT
324 [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd }, 537 [META_ID(TCVERDICT)] = META_FUNC(int_tcverd),
325 [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid }, 538 [META_ID(TCCLASSID)] = META_FUNC(int_tcclassid),
326#endif 539#endif
327#ifdef CONFIG_NET_CLS_ROUTE 540#ifdef CONFIG_NET_CLS_ROUTE
328 [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid }, 541 [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
329#endif 542#endif
330 [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif } 543 [META_ID(RTIIF)] = META_FUNC(int_rtiif),
544 [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
545 [META_ID(SK_STATE)] = META_FUNC(int_sk_state),
546 [META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse),
547 [META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if),
548 [META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt),
549 [META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf),
550 [META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf),
551 [META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown),
552 [META_ID(SK_PROTO)] = META_FUNC(int_sk_proto),
553 [META_ID(SK_TYPE)] = META_FUNC(int_sk_type),
554 [META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc),
555 [META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc),
556 [META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc),
557 [META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued),
558 [META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen),
559 [META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen),
560 [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
561 [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
562 [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
563 [META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps),
564 [META_ID(SK_HASHENT)] = META_FUNC(int_sk_hashent),
565 [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
566 [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
567 [META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
568 [META_ID(SK_PRIO)] = META_FUNC(int_sk_prio),
569 [META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat),
570 [META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo),
571 [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
572 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
573 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
331 } 574 }
332}; 575};
333 576
@@ -396,9 +639,9 @@ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
396 /* Let gcc optimize it, the unlikely is not really based on 639 /* Let gcc optimize it, the unlikely is not really based on
397 * some numbers but jump free code for mismatches seems 640 * some numbers but jump free code for mismatches seems
398 * more logical. */ 641 * more logical. */
399 if (unlikely(a == b)) 642 if (unlikely(a->value == b->value))
400 return 0; 643 return 0;
401 else if (a < b) 644 else if (a->value < b->value)
402 return -1; 645 return -1;
403 else 646 else
404 return 1; 647 return 1;
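The new socket collectors above all follow one pattern: bail out through SKIP_NONLOCAL() when the skb carries no local socket, otherwise copy a single sk field into dst->value for the comparator (whose pointer-vs-value bug is fixed in the last hunk). The META_COLLECTOR() macro itself is defined elsewhere in em_meta.c and is not shown here; judging from how skb, dst and err are used, each entry presumably expands to roughly the following (signature assumed, for illustration only):

    /* Hedged sketch of one expanded collector; the parameter list is an
     * assumption based on the existing em_meta.c collectors, not this diff.
     */
    static void meta_int_sk_rcvbuf(struct sk_buff *skb, struct tcf_pkt_info *info,
                                   struct meta_value *v, struct meta_obj *dst,
                                   int *err)
    {
            if (unlikely(skb->sk == NULL)) {   /* SKIP_NONLOCAL(skb) */
                    *err = -1;                 /* no local socket -> no match */
                    return;
            }
            dst->value = skb->sk->sk_rcvbuf;   /* value handed to the comparator */
    }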
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000000..873840d8d072
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
1/*
2 * net/sched/em_text.c Textsearch ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/textsearch.h>
20#include <linux/tc_ematch/tc_em_text.h>
21#include <net/pkt_cls.h>
22
23struct text_match
24{
25 u16 from_offset;
26 u16 to_offset;
27 u8 from_layer;
28 u8 to_layer;
29 struct ts_config *config;
30};
31
32#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
33
34static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
35 struct tcf_pkt_info *info)
36{
37 struct text_match *tm = EM_TEXT_PRIV(m);
38 int from, to;
39 struct ts_state state;
40
41 from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
42 from += tm->from_offset;
43
44 to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
45 to += tm->to_offset;
46
47 return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
48}
49
50static int em_text_change(struct tcf_proto *tp, void *data, int len,
51 struct tcf_ematch *m)
52{
53 struct text_match *tm;
54 struct tcf_em_text *conf = data;
55 struct ts_config *ts_conf;
56 int flags = 0;
57
58 printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset,
59 conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len);
60
61 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
62 return -EINVAL;
63
64 if (conf->from_layer > conf->to_layer)
65 return -EINVAL;
66
67 if (conf->from_layer == conf->to_layer &&
68 conf->from_offset > conf->to_offset)
69 return -EINVAL;
70
71retry:
72 ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
73 conf->pattern_len, GFP_KERNEL, flags);
74
75 if (flags & TS_AUTOLOAD)
76 rtnl_lock();
77
78 if (IS_ERR(ts_conf)) {
79 if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
80 rtnl_unlock();
81 flags |= TS_AUTOLOAD;
82 goto retry;
83 } else
84 return PTR_ERR(ts_conf);
85 } else if (flags & TS_AUTOLOAD) {
86 textsearch_destroy(ts_conf);
87 return -EAGAIN;
88 }
89
90 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
91 if (tm == NULL) {
92 textsearch_destroy(ts_conf);
93 return -ENOBUFS;
94 }
95
96 tm->from_offset = conf->from_offset;
97 tm->to_offset = conf->to_offset;
98 tm->from_layer = conf->from_layer;
99 tm->to_layer = conf->to_layer;
100 tm->config = ts_conf;
101
102 m->datalen = sizeof(*tm);
103 m->data = (unsigned long) tm;
104
105 return 0;
106}
107
108static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
109{
110 textsearch_destroy(EM_TEXT_PRIV(m)->config);
111}
112
113static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
114{
115 struct text_match *tm = EM_TEXT_PRIV(m);
116 struct tcf_em_text conf;
117
118 strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
119 conf.from_offset = tm->from_offset;
120 conf.to_offset = tm->to_offset;
121 conf.from_layer = tm->from_layer;
122 conf.to_layer = tm->to_layer;
123 conf.pattern_len = textsearch_get_pattern_len(tm->config);
124 conf.pad = 0;
125
126 RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
127 RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
128 return 0;
129
130rtattr_failure:
131 return -1;
132}
133
134static struct tcf_ematch_ops em_text_ops = {
135 .kind = TCF_EM_TEXT,
136 .change = em_text_change,
137 .match = em_text_match,
138 .destroy = em_text_destroy,
139 .dump = em_text_dump,
140 .owner = THIS_MODULE,
141 .link = LIST_HEAD_INIT(em_text_ops.link)
142};
143
144static int __init init_em_text(void)
145{
146 return tcf_em_register(&em_text_ops);
147}
148
149static void __exit exit_em_text(void)
150{
151 tcf_em_unregister(&em_text_ops);
152}
153
154MODULE_LICENSE("GPL");
155
156module_init(init_em_text);
157module_exit(exit_em_text);
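em_text_change() and em_text_match() above are the only users of the textsearch API in this patch: textsearch_prepare() compiles a pattern for a named algorithm, skb_find_text() scans a byte range of the skb, textsearch_destroy() releases the config. A self-contained, hedged sketch of the same three calls outside the ematch framework; the "kmp" algorithm choice, the pattern and the helper name are illustrative:

    /* Hedged sketch, assuming the textsearch calls behave as used above:
     * returns nonzero if the pattern occurs anywhere in the skb data.
     */
    static int skb_contains_get(struct sk_buff *skb)
    {
            struct ts_config *ts;
            struct ts_state state;
            unsigned int pos;

            ts = textsearch_prepare("kmp", "GET /", 5, GFP_KERNEL, TS_AUTOLOAD);
            if (IS_ERR(ts))
                    return 0;                     /* algorithm not available */

            pos = skb_find_text(skb, 0, skb->len, ts, &state);
            textsearch_destroy(ts);

            return pos != UINT_MAX;               /* UINT_MAX means "not found" */
    }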
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 07977f8f2679..97c1c75d5c78 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -760,15 +760,14 @@ graft:
760} 760}
761 761
762static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, 762static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
763 u32 pid, u32 seq, unsigned flags, int event) 763 u32 pid, u32 seq, u16 flags, int event)
764{ 764{
765 struct tcmsg *tcm; 765 struct tcmsg *tcm;
766 struct nlmsghdr *nlh; 766 struct nlmsghdr *nlh;
767 unsigned char *b = skb->tail; 767 unsigned char *b = skb->tail;
768 struct gnet_dump d; 768 struct gnet_dump d;
769 769
770 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 770 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
771 nlh->nlmsg_flags = flags;
772 tcm = NLMSG_DATA(nlh); 771 tcm = NLMSG_DATA(nlh);
773 tcm->tcm_family = AF_UNSPEC; 772 tcm->tcm_family = AF_UNSPEC;
774 tcm->tcm_ifindex = q->dev->ifindex; 773 tcm->tcm_ifindex = q->dev->ifindex;
@@ -997,7 +996,7 @@ out:
997 996
998static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, 997static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
999 unsigned long cl, 998 unsigned long cl,
1000 u32 pid, u32 seq, unsigned flags, int event) 999 u32 pid, u32 seq, u16 flags, int event)
1001{ 1000{
1002 struct tcmsg *tcm; 1001 struct tcmsg *tcm;
1003 struct nlmsghdr *nlh; 1002 struct nlmsghdr *nlh;
@@ -1005,8 +1004,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1005 struct gnet_dump d; 1004 struct gnet_dump d;
1006 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; 1005 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1007 1006
1008 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 1007 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1009 nlh->nlmsg_flags = flags;
1010 tcm = NLMSG_DATA(nlh); 1008 tcm = NLMSG_DATA(nlh);
1011 tcm->tcm_family = AF_UNSPEC; 1009 tcm->tcm_family = AF_UNSPEC;
1012 tcm->tcm_ifindex = q->dev->ifindex; 1010 tcm->tcm_ifindex = q->dev->ifindex;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 8a3db9d95bab..13e0e7b3856b 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -18,7 +18,7 @@
18#include <asm/byteorder.h> 18#include <asm/byteorder.h>
19 19
20 20
21#if 1 /* control */ 21#if 0 /* control */
22#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) 22#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
23#else 23#else
24#define DPRINTK(format,args...) 24#define DPRINTK(format,args...)
@@ -31,7 +31,7 @@
31#endif 31#endif
32 32
33 33
34#define PRIV(sch) qdisc_priv(sch) 34#define PRIV(sch) ((struct dsmark_qdisc_data *) qdisc_priv(sch))
35 35
36 36
37/* 37/*
@@ -55,145 +55,163 @@
55struct dsmark_qdisc_data { 55struct dsmark_qdisc_data {
56 struct Qdisc *q; 56 struct Qdisc *q;
57 struct tcf_proto *filter_list; 57 struct tcf_proto *filter_list;
58 __u8 *mask; /* "owns" the array */ 58 u8 *mask; /* "owns" the array */
59 __u8 *value; 59 u8 *value;
60 __u16 indices; 60 u16 indices;
61 __u32 default_index; /* index range is 0...0xffff */ 61 u32 default_index; /* index range is 0...0xffff */
62 int set_tc_index; 62 int set_tc_index;
63}; 63};
64 64
65static inline int dsmark_valid_indices(u16 indices)
66{
67 while (indices != 1) {
68 if (indices & 1)
69 return 0;
70 indices >>= 1;
71 }
72
73 return 1;
74}
65 75
66/* ------------------------- Class/flow operations ------------------------- */ 76static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
77{
78 return (index <= p->indices && index > 0);
79}
67 80
81/* ------------------------- Class/flow operations ------------------------- */
68 82
69static int dsmark_graft(struct Qdisc *sch,unsigned long arg, 83static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
70 struct Qdisc *new,struct Qdisc **old) 84 struct Qdisc *new, struct Qdisc **old)
71{ 85{
72 struct dsmark_qdisc_data *p = PRIV(sch); 86 struct dsmark_qdisc_data *p = PRIV(sch);
73 87
74 DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, 88 DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
75 old); 89 sch, p, new, old);
76 if (!new) 90
77 new = &noop_qdisc; 91 if (new == NULL) {
92 new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
93 if (new == NULL)
94 new = &noop_qdisc;
95 }
96
78 sch_tree_lock(sch); 97 sch_tree_lock(sch);
79 *old = xchg(&p->q,new); 98 *old = xchg(&p->q, new);
80 if (*old) 99 qdisc_reset(*old);
81 qdisc_reset(*old);
82 sch->q.qlen = 0; 100 sch->q.qlen = 0;
83 sch_tree_unlock(sch); /* @@@ move up ? */ 101 sch_tree_unlock(sch);
102
84 return 0; 103 return 0;
85} 104}
86 105
87
88static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) 106static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
89{ 107{
90 struct dsmark_qdisc_data *p = PRIV(sch); 108 return PRIV(sch)->q;
91
92 return p->q;
93} 109}
94 110
95 111static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
96static unsigned long dsmark_get(struct Qdisc *sch,u32 classid)
97{ 112{
98 struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch); 113 DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
114 sch, PRIV(sch), classid);
99 115
100 DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); 116 return TC_H_MIN(classid) + 1;
101 return TC_H_MIN(classid)+1;
102} 117}
103 118
104
105static unsigned long dsmark_bind_filter(struct Qdisc *sch, 119static unsigned long dsmark_bind_filter(struct Qdisc *sch,
106 unsigned long parent, u32 classid) 120 unsigned long parent, u32 classid)
107{ 121{
108 return dsmark_get(sch,classid); 122 return dsmark_get(sch, classid);
109} 123}
110 124
111
112static void dsmark_put(struct Qdisc *sch, unsigned long cl) 125static void dsmark_put(struct Qdisc *sch, unsigned long cl)
113{ 126{
114} 127}
115 128
116
117static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, 129static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
118 struct rtattr **tca, unsigned long *arg) 130 struct rtattr **tca, unsigned long *arg)
119{ 131{
120 struct dsmark_qdisc_data *p = PRIV(sch); 132 struct dsmark_qdisc_data *p = PRIV(sch);
121 struct rtattr *opt = tca[TCA_OPTIONS-1]; 133 struct rtattr *opt = tca[TCA_OPTIONS-1];
122 struct rtattr *tb[TCA_DSMARK_MAX]; 134 struct rtattr *tb[TCA_DSMARK_MAX];
135 int err = -EINVAL;
136 u8 mask = 0;
123 137
124 DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," 138 DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
125 "arg 0x%lx\n",sch,p,classid,parent,*arg); 139 "arg 0x%lx\n", sch, p, classid, parent, *arg);
126 if (*arg > p->indices) 140
127 return -ENOENT; 141 if (!dsmark_valid_index(p, *arg)) {
128 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) 142 err = -ENOENT;
129 return -EINVAL; 143 goto rtattr_failure;
130 if (tb[TCA_DSMARK_MASK-1]) {
131 if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1]))
132 return -EINVAL;
133 p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]);
134 }
135 if (tb[TCA_DSMARK_VALUE-1]) {
136 if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1]))
137 return -EINVAL;
138 p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]);
139 } 144 }
140 return 0;
141}
142 145
146 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
147 goto rtattr_failure;
148
149 if (tb[TCA_DSMARK_MASK-1])
150 mask = RTA_GET_U8(tb[TCA_DSMARK_MASK-1]);
151
152 if (tb[TCA_DSMARK_VALUE-1])
153 p->value[*arg-1] = RTA_GET_U8(tb[TCA_DSMARK_VALUE-1]);
154
155 if (tb[TCA_DSMARK_MASK-1])
156 p->mask[*arg-1] = mask;
143 157
144static int dsmark_delete(struct Qdisc *sch,unsigned long arg) 158 err = 0;
159
160rtattr_failure:
161 return err;
162}
163
164static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
145{ 165{
146 struct dsmark_qdisc_data *p = PRIV(sch); 166 struct dsmark_qdisc_data *p = PRIV(sch);
147 167
148 if (!arg || arg > p->indices) 168 if (!dsmark_valid_index(p, arg))
149 return -EINVAL; 169 return -EINVAL;
170
150 p->mask[arg-1] = 0xff; 171 p->mask[arg-1] = 0xff;
151 p->value[arg-1] = 0; 172 p->value[arg-1] = 0;
173
152 return 0; 174 return 0;
153} 175}
154 176
155
156static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) 177static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
157{ 178{
158 struct dsmark_qdisc_data *p = PRIV(sch); 179 struct dsmark_qdisc_data *p = PRIV(sch);
159 int i; 180 int i;
160 181
161 DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); 182 DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
183
162 if (walker->stop) 184 if (walker->stop)
163 return; 185 return;
186
164 for (i = 0; i < p->indices; i++) { 187 for (i = 0; i < p->indices; i++) {
165 if (p->mask[i] == 0xff && !p->value[i]) 188 if (p->mask[i] == 0xff && !p->value[i])
166 continue; 189 goto ignore;
167 if (walker->count >= walker->skip) { 190 if (walker->count >= walker->skip) {
168 if (walker->fn(sch, i+1, walker) < 0) { 191 if (walker->fn(sch, i+1, walker) < 0) {
169 walker->stop = 1; 192 walker->stop = 1;
170 break; 193 break;
171 } 194 }
172 } 195 }
173 walker->count++; 196ignore:
197 walker->count++;
174 } 198 }
175} 199}
176 200
177
178static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) 201static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl)
179{ 202{
180 struct dsmark_qdisc_data *p = PRIV(sch); 203 return &PRIV(sch)->filter_list;
181
182 return &p->filter_list;
183} 204}
184 205
185
186/* --------------------------- Qdisc operations ---------------------------- */ 206/* --------------------------- Qdisc operations ---------------------------- */
187 207
188
189static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) 208static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
190{ 209{
191 struct dsmark_qdisc_data *p = PRIV(sch); 210 struct dsmark_qdisc_data *p = PRIV(sch);
192 struct tcf_result res; 211 int err;
193 int result; 212
194 int ret = NET_XMIT_POLICED; 213 D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
195 214
196 D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
197 if (p->set_tc_index) { 215 if (p->set_tc_index) {
198 /* FIXME: Safe with non-linear skbs? --RR */ 216 /* FIXME: Safe with non-linear skbs? --RR */
199 switch (skb->protocol) { 217 switch (skb->protocol) {
@@ -210,17 +228,21 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
210 break; 228 break;
211 }; 229 };
212 } 230 }
213 result = TC_POLICE_OK; /* be nice to gcc */ 231
214 if (TC_H_MAJ(skb->priority) == sch->handle) { 232 if (TC_H_MAJ(skb->priority) == sch->handle)
215 skb->tc_index = TC_H_MIN(skb->priority); 233 skb->tc_index = TC_H_MIN(skb->priority);
216 } else { 234 else {
217 result = tc_classify(skb,p->filter_list,&res); 235 struct tcf_result res;
218 D2PRINTK("result %d class 0x%04x\n",result,res.classid); 236 int result = tc_classify(skb, p->filter_list, &res);
237
238 D2PRINTK("result %d class 0x%04x\n", result, res.classid);
239
219 switch (result) { 240 switch (result) {
220#ifdef CONFIG_NET_CLS_POLICE 241#ifdef CONFIG_NET_CLS_POLICE
221 case TC_POLICE_SHOT: 242 case TC_POLICE_SHOT:
222 kfree_skb(skb); 243 kfree_skb(skb);
223 break; 244 sch->qstats.drops++;
245 return NET_XMIT_POLICED;
224#if 0 246#if 0
225 case TC_POLICE_RECLASSIFY: 247 case TC_POLICE_RECLASSIFY:
226 /* FIXME: what to do here ??? */ 248 /* FIXME: what to do here ??? */
@@ -237,43 +259,45 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
237 break; 259 break;
238 }; 260 };
239 } 261 }
240 if (
241#ifdef CONFIG_NET_CLS_POLICE
242 result == TC_POLICE_SHOT ||
243#endif
244 262
245 ((ret = p->q->enqueue(skb,p->q)) != 0)) { 263 err = p->q->enqueue(skb,p->q);
264 if (err != NET_XMIT_SUCCESS) {
246 sch->qstats.drops++; 265 sch->qstats.drops++;
247 return ret; 266 return err;
248 } 267 }
268
249 sch->bstats.bytes += skb->len; 269 sch->bstats.bytes += skb->len;
250 sch->bstats.packets++; 270 sch->bstats.packets++;
251 sch->q.qlen++; 271 sch->q.qlen++;
252 return ret;
253}
254 272
273 return NET_XMIT_SUCCESS;
274}
255 275
256static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) 276static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
257{ 277{
258 struct dsmark_qdisc_data *p = PRIV(sch); 278 struct dsmark_qdisc_data *p = PRIV(sch);
259 struct sk_buff *skb; 279 struct sk_buff *skb;
260 int index; 280 u32 index;
281
282 D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);
261 283
262 D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p);
263 skb = p->q->ops->dequeue(p->q); 284 skb = p->q->ops->dequeue(p->q);
264 if (!skb) 285 if (skb == NULL)
265 return NULL; 286 return NULL;
287
266 sch->q.qlen--; 288 sch->q.qlen--;
267 index = skb->tc_index & (p->indices-1); 289
268 D2PRINTK("index %d->%d\n",skb->tc_index,index); 290 index = skb->tc_index & (p->indices - 1);
291 D2PRINTK("index %d->%d\n", skb->tc_index, index);
292
269 switch (skb->protocol) { 293 switch (skb->protocol) {
270 case __constant_htons(ETH_P_IP): 294 case __constant_htons(ETH_P_IP):
271 ipv4_change_dsfield(skb->nh.iph, 295 ipv4_change_dsfield(skb->nh.iph, p->mask[index],
272 p->mask[index],p->value[index]); 296 p->value[index]);
273 break; 297 break;
274 case __constant_htons(ETH_P_IPV6): 298 case __constant_htons(ETH_P_IPV6):
275 ipv6_change_dsfield(skb->nh.ipv6h, 299 ipv6_change_dsfield(skb->nh.ipv6h, p->mask[index],
276 p->mask[index],p->value[index]); 300 p->value[index]);
277 break; 301 break;
278 default: 302 default:
279 /* 303 /*
@@ -287,152 +311,162 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
287 htons(skb->protocol)); 311 htons(skb->protocol));
288 break; 312 break;
289 }; 313 };
314
290 return skb; 315 return skb;
291} 316}
292 317
293
294static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) 318static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch)
295{ 319{
296 int ret;
297 struct dsmark_qdisc_data *p = PRIV(sch); 320 struct dsmark_qdisc_data *p = PRIV(sch);
321 int err;
298 322
299 D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); 323 D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
300 if ((ret = p->q->ops->requeue(skb, p->q)) == 0) { 324
301 sch->q.qlen++; 325 err = p->q->ops->requeue(skb, p->q);
302 sch->qstats.requeues++; 326 if (err != NET_XMIT_SUCCESS) {
303 return 0; 327 sch->qstats.drops++;
328 return err;
304 } 329 }
305 sch->qstats.drops++;
306 return ret;
307}
308 330
331 sch->q.qlen++;
332 sch->qstats.requeues++;
333
334 return NET_XMIT_SUCCESS;
335}
309 336
310static unsigned int dsmark_drop(struct Qdisc *sch) 337static unsigned int dsmark_drop(struct Qdisc *sch)
311{ 338{
312 struct dsmark_qdisc_data *p = PRIV(sch); 339 struct dsmark_qdisc_data *p = PRIV(sch);
313 unsigned int len; 340 unsigned int len;
314 341
315 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); 342 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
316 if (!p->q->ops->drop) 343
317 return 0; 344 if (p->q->ops->drop == NULL)
318 if (!(len = p->q->ops->drop(p->q)))
319 return 0; 345 return 0;
320 sch->q.qlen--; 346
347 len = p->q->ops->drop(p->q);
348 if (len)
349 sch->q.qlen--;
350
321 return len; 351 return len;
322} 352}
323 353
324 354static int dsmark_init(struct Qdisc *sch, struct rtattr *opt)
325static int dsmark_init(struct Qdisc *sch,struct rtattr *opt)
326{ 355{
327 struct dsmark_qdisc_data *p = PRIV(sch); 356 struct dsmark_qdisc_data *p = PRIV(sch);
328 struct rtattr *tb[TCA_DSMARK_MAX]; 357 struct rtattr *tb[TCA_DSMARK_MAX];
329 __u16 tmp; 358 int err = -EINVAL;
330 359 u32 default_index = NO_DEFAULT_INDEX;
331 DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); 360 u16 indices;
332 if (!opt || 361 u8 *mask;
333 rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 || 362
334 !tb[TCA_DSMARK_INDICES-1] || 363 DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
335 RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) 364
336 return -EINVAL; 365 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt) < 0)
337 p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); 366 goto errout;
338 if (!p->indices) 367
339 return -EINVAL; 368 indices = RTA_GET_U16(tb[TCA_DSMARK_INDICES-1]);
340 for (tmp = p->indices; tmp != 1; tmp >>= 1) { 369 if (!indices || !dsmark_valid_indices(indices))
341 if (tmp & 1) 370 goto errout;
342 return -EINVAL; 371
343 } 372 if (tb[TCA_DSMARK_DEFAULT_INDEX-1])
344 p->default_index = NO_DEFAULT_INDEX; 373 default_index = RTA_GET_U16(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
345 if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) { 374
346 if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16)) 375 mask = kmalloc(indices * 2, GFP_KERNEL);
347 return -EINVAL; 376 if (mask == NULL) {
348 p->default_index = 377 err = -ENOMEM;
349 *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]); 378 goto errout;
350 } 379 }
351 p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1]; 380
352 p->mask = kmalloc(p->indices*2,GFP_KERNEL); 381 p->mask = mask;
353 if (!p->mask) 382 memset(p->mask, 0xff, indices);
354 return -ENOMEM; 383
355 p->value = p->mask+p->indices; 384 p->value = p->mask + indices;
356 memset(p->mask,0xff,p->indices); 385 memset(p->value, 0, indices);
357 memset(p->value,0,p->indices); 386
358 if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) 387 p->indices = indices;
388 p->default_index = default_index;
389 p->set_tc_index = RTA_GET_FLAG(tb[TCA_DSMARK_SET_TC_INDEX-1]);
390
391 p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
392 if (p->q == NULL)
359 p->q = &noop_qdisc; 393 p->q = &noop_qdisc;
360 DPRINTK("dsmark_init: qdisc %p\n",&p->q);
361 return 0;
362}
363 394
395 DPRINTK("dsmark_init: qdisc %p\n", p->q);
396
397 err = 0;
398errout:
399rtattr_failure:
400 return err;
401}
364 402
365static void dsmark_reset(struct Qdisc *sch) 403static void dsmark_reset(struct Qdisc *sch)
366{ 404{
367 struct dsmark_qdisc_data *p = PRIV(sch); 405 struct dsmark_qdisc_data *p = PRIV(sch);
368 406
369 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); 407 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
370 qdisc_reset(p->q); 408 qdisc_reset(p->q);
371 sch->q.qlen = 0; 409 sch->q.qlen = 0;
372} 410}
373 411
374
375static void dsmark_destroy(struct Qdisc *sch) 412static void dsmark_destroy(struct Qdisc *sch)
376{ 413{
377 struct dsmark_qdisc_data *p = PRIV(sch); 414 struct dsmark_qdisc_data *p = PRIV(sch);
378 struct tcf_proto *tp; 415 struct tcf_proto *tp;
379 416
380 DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p); 417 DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
418
381 while (p->filter_list) { 419 while (p->filter_list) {
382 tp = p->filter_list; 420 tp = p->filter_list;
383 p->filter_list = tp->next; 421 p->filter_list = tp->next;
384 tcf_destroy(tp); 422 tcf_destroy(tp);
385 } 423 }
424
386 qdisc_destroy(p->q); 425 qdisc_destroy(p->q);
387 kfree(p->mask); 426 kfree(p->mask);
388} 427}
389 428
390
391static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, 429static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
392 struct sk_buff *skb, struct tcmsg *tcm) 430 struct sk_buff *skb, struct tcmsg *tcm)
393{ 431{
394 struct dsmark_qdisc_data *p = PRIV(sch); 432 struct dsmark_qdisc_data *p = PRIV(sch);
395 unsigned char *b = skb->tail; 433 struct rtattr *opts = NULL;
396 struct rtattr *rta; 434
435 DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl);
397 436
398 DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl); 437 if (!dsmark_valid_index(p, cl))
399 if (!cl || cl > p->indices)
400 return -EINVAL; 438 return -EINVAL;
401 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1); 439
402 rta = (struct rtattr *) b; 440 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1);
403 RTA_PUT(skb,TCA_OPTIONS,0,NULL); 441
404 RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]); 442 opts = RTA_NEST(skb, TCA_OPTIONS);
405 RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]); 443 RTA_PUT_U8(skb,TCA_DSMARK_MASK, p->mask[cl-1]);
406 rta->rta_len = skb->tail-b; 444 RTA_PUT_U8(skb,TCA_DSMARK_VALUE, p->value[cl-1]);
407 return skb->len; 445
446 return RTA_NEST_END(skb, opts);
408 447
409rtattr_failure: 448rtattr_failure:
410 skb_trim(skb,b-skb->data); 449 return RTA_NEST_CANCEL(skb, opts);
411 return -1;
412} 450}
413 451
414static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) 452static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
415{ 453{
416 struct dsmark_qdisc_data *p = PRIV(sch); 454 struct dsmark_qdisc_data *p = PRIV(sch);
417 unsigned char *b = skb->tail; 455 struct rtattr *opts = NULL;
418 struct rtattr *rta;
419 456
420 rta = (struct rtattr *) b; 457 opts = RTA_NEST(skb, TCA_OPTIONS);
421 RTA_PUT(skb,TCA_OPTIONS,0,NULL); 458 RTA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);
422 RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices); 459
423 if (p->default_index != NO_DEFAULT_INDEX) { 460 if (p->default_index != NO_DEFAULT_INDEX)
424 __u16 tmp = p->default_index; 461 RTA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
425 462
426 RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp);
427 }
428 if (p->set_tc_index) 463 if (p->set_tc_index)
429 RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL); 464 RTA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
430 rta->rta_len = skb->tail-b; 465
431 return skb->len; 466 return RTA_NEST_END(skb, opts);
432 467
433rtattr_failure: 468rtattr_failure:
434 skb_trim(skb,b-skb->data); 469 return RTA_NEST_CANCEL(skb, opts);
435 return -1;
436} 470}
437 471
438static struct Qdisc_class_ops dsmark_class_ops = { 472static struct Qdisc_class_ops dsmark_class_ops = {
@@ -470,10 +504,13 @@ static int __init dsmark_module_init(void)
470{ 504{
471 return register_qdisc(&dsmark_qdisc_ops); 505 return register_qdisc(&dsmark_qdisc_ops);
472} 506}
507
473static void __exit dsmark_module_exit(void) 508static void __exit dsmark_module_exit(void)
474{ 509{
475 unregister_qdisc(&dsmark_qdisc_ops); 510 unregister_qdisc(&dsmark_qdisc_ops);
476} 511}
512
477module_init(dsmark_module_init) 513module_init(dsmark_module_init)
478module_exit(dsmark_module_exit) 514module_exit(dsmark_module_exit)
515
479MODULE_LICENSE("GPL"); 516MODULE_LICENSE("GPL");
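dsmark_valid_indices() above accepts only powers of two, because dsmark_dequeue() reduces skb->tc_index with the mask (p->indices - 1); for a power-of-two table that AND is a cheap modulo. A hedged, equivalent compact form of the same test (not part of the patch):

    /* Equivalent to the shift loop in dsmark_valid_indices(): true exactly
     * for 1, 2, 4, 8, ... i.e. values with a single bit set.
     */
    static inline int dsmark_indices_ok(u16 indices)
    {
            return indices != 0 && (indices & (indices - 1)) == 0;
    }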
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 4888305c96da..033083bf0e74 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -11,131 +11,38 @@
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h> 14#include <linux/types.h>
18#include <linux/kernel.h> 15#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h> 16#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h> 17#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <net/ip.h>
33#include <net/route.h>
34#include <linux/skbuff.h> 18#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h> 19#include <net/pkt_sched.h>
37 20
38/* 1 band FIFO pseudo-"scheduler" */ 21/* 1 band FIFO pseudo-"scheduler" */
39 22
40struct fifo_sched_data 23struct fifo_sched_data
41{ 24{
42 unsigned limit; 25 u32 limit;
43}; 26};
44 27
45static int 28static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
46bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
47{ 29{
48 struct fifo_sched_data *q = qdisc_priv(sch); 30 struct fifo_sched_data *q = qdisc_priv(sch);
49 31
50 if (sch->qstats.backlog + skb->len <= q->limit) { 32 if (likely(sch->qstats.backlog + skb->len <= q->limit))
51 __skb_queue_tail(&sch->q, skb); 33 return qdisc_enqueue_tail(skb, sch);
52 sch->qstats.backlog += skb->len;
53 sch->bstats.bytes += skb->len;
54 sch->bstats.packets++;
55 return 0;
56 }
57 sch->qstats.drops++;
58#ifdef CONFIG_NET_CLS_POLICE
59 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
60#endif
61 kfree_skb(skb);
62 return NET_XMIT_DROP;
63}
64
65static int
66bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
67{
68 __skb_queue_head(&sch->q, skb);
69 sch->qstats.backlog += skb->len;
70 sch->qstats.requeues++;
71 return 0;
72}
73
74static struct sk_buff *
75bfifo_dequeue(struct Qdisc* sch)
76{
77 struct sk_buff *skb;
78 34
79 skb = __skb_dequeue(&sch->q); 35 return qdisc_reshape_fail(skb, sch);
80 if (skb)
81 sch->qstats.backlog -= skb->len;
82 return skb;
83} 36}
84 37
85static unsigned int 38static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
86fifo_drop(struct Qdisc* sch)
87{
88 struct sk_buff *skb;
89
90 skb = __skb_dequeue_tail(&sch->q);
91 if (skb) {
92 unsigned int len = skb->len;
93 sch->qstats.backlog -= len;
94 kfree_skb(skb);
95 return len;
96 }
97 return 0;
98}
99
100static void
101fifo_reset(struct Qdisc* sch)
102{
103 skb_queue_purge(&sch->q);
104 sch->qstats.backlog = 0;
105}
106
107static int
108pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
109{ 39{
110 struct fifo_sched_data *q = qdisc_priv(sch); 40 struct fifo_sched_data *q = qdisc_priv(sch);
111 41
112 if (sch->q.qlen < q->limit) { 42 if (likely(skb_queue_len(&sch->q) < q->limit))
113 __skb_queue_tail(&sch->q, skb); 43 return qdisc_enqueue_tail(skb, sch);
114 sch->bstats.bytes += skb->len;
115 sch->bstats.packets++;
116 return 0;
117 }
118 sch->qstats.drops++;
119#ifdef CONFIG_NET_CLS_POLICE
120 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
121#endif
122 kfree_skb(skb);
123 return NET_XMIT_DROP;
124}
125
126static int
127pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
128{
129 __skb_queue_head(&sch->q, skb);
130 sch->qstats.requeues++;
131 return 0;
132}
133
134 44
135static struct sk_buff * 45 return qdisc_reshape_fail(skb, sch);
136pfifo_dequeue(struct Qdisc* sch)
137{
138 return __skb_dequeue(&sch->q);
139} 46}
140 47
141static int fifo_init(struct Qdisc *sch, struct rtattr *opt) 48static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
@@ -143,66 +50,59 @@ static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
143 struct fifo_sched_data *q = qdisc_priv(sch); 50 struct fifo_sched_data *q = qdisc_priv(sch);
144 51
145 if (opt == NULL) { 52 if (opt == NULL) {
146 unsigned int limit = sch->dev->tx_queue_len ? : 1; 53 u32 limit = sch->dev->tx_queue_len ? : 1;
147 54
148 if (sch->ops == &bfifo_qdisc_ops) 55 if (sch->ops == &bfifo_qdisc_ops)
149 q->limit = limit*sch->dev->mtu; 56 limit *= sch->dev->mtu;
150 else 57
151 q->limit = limit; 58 q->limit = limit;
152 } else { 59 } else {
153 struct tc_fifo_qopt *ctl = RTA_DATA(opt); 60 struct tc_fifo_qopt *ctl = RTA_DATA(opt);
154 if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) 61
62 if (RTA_PAYLOAD(opt) < sizeof(*ctl))
155 return -EINVAL; 63 return -EINVAL;
64
156 q->limit = ctl->limit; 65 q->limit = ctl->limit;
157 } 66 }
67
158 return 0; 68 return 0;
159} 69}
160 70
161static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) 71static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
162{ 72{
163 struct fifo_sched_data *q = qdisc_priv(sch); 73 struct fifo_sched_data *q = qdisc_priv(sch);
164 unsigned char *b = skb->tail; 74 struct tc_fifo_qopt opt = { .limit = q->limit };
165 struct tc_fifo_qopt opt;
166 75
167 opt.limit = q->limit;
168 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); 76 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
169
170 return skb->len; 77 return skb->len;
171 78
172rtattr_failure: 79rtattr_failure:
173 skb_trim(skb, b - skb->data);
174 return -1; 80 return -1;
175} 81}
176 82
177struct Qdisc_ops pfifo_qdisc_ops = { 83struct Qdisc_ops pfifo_qdisc_ops = {
178 .next = NULL,
179 .cl_ops = NULL,
180 .id = "pfifo", 84 .id = "pfifo",
181 .priv_size = sizeof(struct fifo_sched_data), 85 .priv_size = sizeof(struct fifo_sched_data),
182 .enqueue = pfifo_enqueue, 86 .enqueue = pfifo_enqueue,
183 .dequeue = pfifo_dequeue, 87 .dequeue = qdisc_dequeue_head,
184 .requeue = pfifo_requeue, 88 .requeue = qdisc_requeue,
185 .drop = fifo_drop, 89 .drop = qdisc_queue_drop,
186 .init = fifo_init, 90 .init = fifo_init,
187 .reset = fifo_reset, 91 .reset = qdisc_reset_queue,
188 .destroy = NULL,
189 .change = fifo_init, 92 .change = fifo_init,
190 .dump = fifo_dump, 93 .dump = fifo_dump,
191 .owner = THIS_MODULE, 94 .owner = THIS_MODULE,
192}; 95};
193 96
194struct Qdisc_ops bfifo_qdisc_ops = { 97struct Qdisc_ops bfifo_qdisc_ops = {
195 .next = NULL,
196 .cl_ops = NULL,
197 .id = "bfifo", 98 .id = "bfifo",
198 .priv_size = sizeof(struct fifo_sched_data), 99 .priv_size = sizeof(struct fifo_sched_data),
199 .enqueue = bfifo_enqueue, 100 .enqueue = bfifo_enqueue,
200 .dequeue = bfifo_dequeue, 101 .dequeue = qdisc_dequeue_head,
201 .requeue = bfifo_requeue, 102 .requeue = qdisc_requeue,
202 .drop = fifo_drop, 103 .drop = qdisc_queue_drop,
203 .init = fifo_init, 104 .init = fifo_init,
204 .reset = fifo_reset, 105 .reset = qdisc_reset_queue,
205 .destroy = NULL,
206 .change = fifo_init, 106 .change = fifo_init,
207 .dump = fifo_dump, 107 .dump = fifo_dump,
208 .owner = THIS_MODULE, 108 .owner = THIS_MODULE,
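The fifo rewrite above drops its open-coded queue handling in favour of generic helpers (qdisc_enqueue_tail, qdisc_dequeue_head, qdisc_requeue, qdisc_queue_drop, qdisc_reset_queue, qdisc_reshape_fail) that live in include/net/sch_generic.h and are not shown in this diff. A hedged sketch of what the tail-enqueue pair presumably does, reconstructed from the removed bfifo_enqueue() body it replaces:

    /* Assumed bodies, inferred from the open-coded code deleted above:
     * queue the skb, account backlog and byte/packet stats, report success.
     */
    static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
                                           struct sk_buff_head *list)
    {
            __skb_queue_tail(list, skb);
            sch->qstats.backlog += skb->len;
            sch->bstats.bytes   += skb->len;
            sch->bstats.packets++;
            return NET_XMIT_SUCCESS;
    }

    static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
    {
            return __qdisc_enqueue_tail(skb, sch, &sch->q);
    }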
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 87e48a4e1051..7683b34dc6a9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -243,31 +243,27 @@ static void dev_watchdog_down(struct net_device *dev)
243 cheaper. 243 cheaper.
244 */ 244 */
245 245
246static int 246static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
247noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
248{ 247{
249 kfree_skb(skb); 248 kfree_skb(skb);
250 return NET_XMIT_CN; 249 return NET_XMIT_CN;
251} 250}
252 251
253static struct sk_buff * 252static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
254noop_dequeue(struct Qdisc * qdisc)
255{ 253{
256 return NULL; 254 return NULL;
257} 255}
258 256
259static int 257static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
260noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
261{ 258{
262 if (net_ratelimit()) 259 if (net_ratelimit())
263 printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); 260 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
261 skb->dev->name);
264 kfree_skb(skb); 262 kfree_skb(skb);
265 return NET_XMIT_CN; 263 return NET_XMIT_CN;
266} 264}
267 265
268struct Qdisc_ops noop_qdisc_ops = { 266struct Qdisc_ops noop_qdisc_ops = {
269 .next = NULL,
270 .cl_ops = NULL,
271 .id = "noop", 267 .id = "noop",
272 .priv_size = 0, 268 .priv_size = 0,
273 .enqueue = noop_enqueue, 269 .enqueue = noop_enqueue,
@@ -285,8 +281,6 @@ struct Qdisc noop_qdisc = {
285}; 281};
286 282
287static struct Qdisc_ops noqueue_qdisc_ops = { 283static struct Qdisc_ops noqueue_qdisc_ops = {
288 .next = NULL,
289 .cl_ops = NULL,
290 .id = "noqueue", 284 .id = "noqueue",
291 .priv_size = 0, 285 .priv_size = 0,
292 .enqueue = noop_enqueue, 286 .enqueue = noop_enqueue,
@@ -311,97 +305,87 @@ static const u8 prio2band[TC_PRIO_MAX+1] =
311 generic prio+fifo combination. 305 generic prio+fifo combination.
312 */ 306 */
313 307
314static int 308#define PFIFO_FAST_BANDS 3
315pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) 309
310static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
311 struct Qdisc *qdisc)
316{ 312{
317 struct sk_buff_head *list = qdisc_priv(qdisc); 313 struct sk_buff_head *list = qdisc_priv(qdisc);
314 return list + prio2band[skb->priority & TC_PRIO_MAX];
315}
318 316
319 list += prio2band[skb->priority&TC_PRIO_MAX]; 317static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
318{
319 struct sk_buff_head *list = prio2list(skb, qdisc);
320 320
321 if (list->qlen < qdisc->dev->tx_queue_len) { 321 if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
322 __skb_queue_tail(list, skb);
323 qdisc->q.qlen++; 322 qdisc->q.qlen++;
324 qdisc->bstats.bytes += skb->len; 323 return __qdisc_enqueue_tail(skb, qdisc, list);
325 qdisc->bstats.packets++;
326 return 0;
327 } 324 }
328 qdisc->qstats.drops++; 325
329 kfree_skb(skb); 326 return qdisc_drop(skb, qdisc);
330 return NET_XMIT_DROP;
331} 327}
332 328
333static struct sk_buff * 329static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
334pfifo_fast_dequeue(struct Qdisc* qdisc)
335{ 330{
336 int prio; 331 int prio;
337 struct sk_buff_head *list = qdisc_priv(qdisc); 332 struct sk_buff_head *list = qdisc_priv(qdisc);
338 struct sk_buff *skb;
339 333
340 for (prio = 0; prio < 3; prio++, list++) { 334 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
341 skb = __skb_dequeue(list); 335 struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
342 if (skb) { 336 if (skb) {
343 qdisc->q.qlen--; 337 qdisc->q.qlen--;
344 return skb; 338 return skb;
345 } 339 }
346 } 340 }
341
347 return NULL; 342 return NULL;
348} 343}
349 344
350static int 345static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
351pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
352{ 346{
353 struct sk_buff_head *list = qdisc_priv(qdisc);
354
355 list += prio2band[skb->priority&TC_PRIO_MAX];
356
357 __skb_queue_head(list, skb);
358 qdisc->q.qlen++; 347 qdisc->q.qlen++;
359 qdisc->qstats.requeues++; 348 return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
360 return 0;
361} 349}
362 350
363static void 351static void pfifo_fast_reset(struct Qdisc* qdisc)
364pfifo_fast_reset(struct Qdisc* qdisc)
365{ 352{
366 int prio; 353 int prio;
367 struct sk_buff_head *list = qdisc_priv(qdisc); 354 struct sk_buff_head *list = qdisc_priv(qdisc);
368 355
369 for (prio=0; prio < 3; prio++) 356 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
370 skb_queue_purge(list+prio); 357 __qdisc_reset_queue(qdisc, list + prio);
358
359 qdisc->qstats.backlog = 0;
371 qdisc->q.qlen = 0; 360 qdisc->q.qlen = 0;
372} 361}
373 362
374static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) 363static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
375{ 364{
376 unsigned char *b = skb->tail; 365 struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
377 struct tc_prio_qopt opt;
378 366
379 opt.bands = 3;
380 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); 367 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
381 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); 368 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
382 return skb->len; 369 return skb->len;
383 370
384rtattr_failure: 371rtattr_failure:
385 skb_trim(skb, b - skb->data);
386 return -1; 372 return -1;
387} 373}
388 374
389static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) 375static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
390{ 376{
391 int i; 377 int prio;
392 struct sk_buff_head *list = qdisc_priv(qdisc); 378 struct sk_buff_head *list = qdisc_priv(qdisc);
393 379
394 for (i=0; i<3; i++) 380 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
395 skb_queue_head_init(list+i); 381 skb_queue_head_init(list + prio);
396 382
397 return 0; 383 return 0;
398} 384}
399 385
400static struct Qdisc_ops pfifo_fast_ops = { 386static struct Qdisc_ops pfifo_fast_ops = {
401 .next = NULL,
402 .cl_ops = NULL,
403 .id = "pfifo_fast", 387 .id = "pfifo_fast",
404 .priv_size = 3 * sizeof(struct sk_buff_head), 388 .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
405 .enqueue = pfifo_fast_enqueue, 389 .enqueue = pfifo_fast_enqueue,
406 .dequeue = pfifo_fast_dequeue, 390 .dequeue = pfifo_fast_dequeue,
407 .requeue = pfifo_fast_requeue, 391 .requeue = pfifo_fast_requeue,
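The pfifo_fast rework factors the band lookup into prio2list(): skb->priority is masked with TC_PRIO_MAX and used to index the prio2band table, selecting one of PFIFO_FAST_BANDS (3) FIFOs, and dequeue drains band 0 before band 1 before band 2. A standalone sketch of that mapping and service order; the table values follow the familiar default mapping but are shown here only for illustration:

    #include <stdio.h>

    #define TC_PRIO_MAX       15
    #define PFIFO_FAST_BANDS   3

    /* Illustrative priority-to-band table; band 0 is served first. */
    static const unsigned char prio2band[TC_PRIO_MAX + 1] = {
        1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
    };

    /* Mirrors prio2list(): pick the band for a packet priority. */
    static int band_for_priority(unsigned int priority)
    {
        return prio2band[priority & TC_PRIO_MAX];
    }

    int main(void)
    {
        int qlen[PFIFO_FAST_BANDS] = { 0, 0, 0 };
        unsigned int priorities[] = { 0, 6, 1, 4 };

        for (size_t i = 0; i < sizeof(priorities) / sizeof(priorities[0]); i++)
            qlen[band_for_priority(priorities[i])]++;

        /* Service order mimics pfifo_fast_dequeue(): band 0, then 1, then 2. */
        for (int band = 0; band < PFIFO_FAST_BANDS; band++)
            printf("band %d holds %d packet(s)\n", band, qlen[band]);
        return 0;
    }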
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 663843d97a92..7ae6aa772dab 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -191,10 +191,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
191 asoc->last_cwr_tsn = asoc->ctsn_ack_point; 191 asoc->last_cwr_tsn = asoc->ctsn_ack_point;
192 asoc->unack_data = 0; 192 asoc->unack_data = 0;
193 193
194 SCTP_DEBUG_PRINTK("myctsnap for %s INIT as 0x%x.\n",
195 asoc->ep->debug_name,
196 asoc->ctsn_ack_point);
197
198 /* ADDIP Section 4.1 Asconf Chunk Procedures 194 /* ADDIP Section 4.1 Asconf Chunk Procedures
199 * 195 *
200 * When an endpoint has an ASCONF signaled change to be sent to the 196 * When an endpoint has an ASCONF signaled change to be sent to the
@@ -211,6 +207,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
211 207
212 /* Make an empty list of remote transport addresses. */ 208 /* Make an empty list of remote transport addresses. */
213 INIT_LIST_HEAD(&asoc->peer.transport_addr_list); 209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
210 asoc->peer.transport_count = 0;
214 211
215 /* RFC 2960 5.1 Normal Establishment of an Association 212 /* RFC 2960 5.1 Normal Establishment of an Association
216 * 213 *
@@ -288,6 +285,7 @@ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
288 285
289 asoc->base.malloced = 1; 286 asoc->base.malloced = 1;
290 SCTP_DBG_OBJCNT_INC(assoc); 287 SCTP_DBG_OBJCNT_INC(assoc);
288 SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc);
291 289
292 return asoc; 290 return asoc;
293 291
@@ -356,6 +354,8 @@ void sctp_association_free(struct sctp_association *asoc)
356 sctp_transport_free(transport); 354 sctp_transport_free(transport);
357 } 355 }
358 356
357 asoc->peer.transport_count = 0;
358
359 /* Free any cached ASCONF_ACK chunk. */ 359 /* Free any cached ASCONF_ACK chunk. */
360 if (asoc->addip_last_asconf_ack) 360 if (asoc->addip_last_asconf_ack)
361 sctp_chunk_free(asoc->addip_last_asconf_ack); 361 sctp_chunk_free(asoc->addip_last_asconf_ack);
@@ -400,7 +400,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
400 /* If the primary path is changing, assume that the 400 /* If the primary path is changing, assume that the
401 * user wants to use this new path. 401 * user wants to use this new path.
402 */ 402 */
403 if (transport->active) 403 if (transport->state != SCTP_INACTIVE)
404 asoc->peer.active_path = transport; 404 asoc->peer.active_path = transport;
405 405
406 /* 406 /*
@@ -428,10 +428,58 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
428 transport->cacc.next_tsn_at_change = asoc->next_tsn; 428 transport->cacc.next_tsn_at_change = asoc->next_tsn;
429} 429}
430 430
431/* Remove a transport from an association. */
432void sctp_assoc_rm_peer(struct sctp_association *asoc,
433 struct sctp_transport *peer)
434{
435 struct list_head *pos;
436 struct sctp_transport *transport;
437
438 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_rm_peer:association %p addr: ",
439 " port: %d\n",
440 asoc,
441 (&peer->ipaddr),
442 peer->ipaddr.v4.sin_port);
443
444 /* If we are to remove the current retran_path, update it
445 * to the next peer before removing this peer from the list.
446 */
447 if (asoc->peer.retran_path == peer)
448 sctp_assoc_update_retran_path(asoc);
449
450 /* Remove this peer from the list. */
451 list_del(&peer->transports);
452
453 /* Get the first transport of asoc. */
454 pos = asoc->peer.transport_addr_list.next;
455 transport = list_entry(pos, struct sctp_transport, transports);
456
457 /* Update any entries that match the peer to be deleted. */
458 if (asoc->peer.primary_path == peer)
459 sctp_assoc_set_primary(asoc, transport);
460 if (asoc->peer.active_path == peer)
461 asoc->peer.active_path = transport;
462 if (asoc->peer.last_data_from == peer)
463 asoc->peer.last_data_from = transport;
464
465 /* If we remove the transport an INIT was last sent to, set it to
466 * NULL. Combined with the update of the retran path above, this
467 * will cause the next INIT to be sent to the next available
468 * transport, maintaining the cycle.
469 */
470 if (asoc->init_last_sent_to == peer)
471 asoc->init_last_sent_to = NULL;
472
473 asoc->peer.transport_count--;
474
475 sctp_transport_free(peer);
476}
477
431/* Add a transport address to an association. */ 478/* Add a transport address to an association. */
432struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, 479struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
433 const union sctp_addr *addr, 480 const union sctp_addr *addr,
434 int gfp) 481 const int gfp,
482 const int peer_state)
435{ 483{
436 struct sctp_transport *peer; 484 struct sctp_transport *peer;
437 struct sctp_sock *sp; 485 struct sctp_sock *sp;
@@ -442,14 +490,25 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
442 /* AF_INET and AF_INET6 share common port field. */ 490 /* AF_INET and AF_INET6 share common port field. */
443 port = addr->v4.sin_port; 491 port = addr->v4.sin_port;
444 492
493 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_add_peer:association %p addr: ",
494 " port: %d state:%s\n",
495 asoc,
496 addr,
497 addr->v4.sin_port,
498 peer_state == SCTP_UNKNOWN?"UNKNOWN":"ACTIVE");
499
445 /* Set the port if it has not been set yet. */ 500 /* Set the port if it has not been set yet. */
446 if (0 == asoc->peer.port) 501 if (0 == asoc->peer.port)
447 asoc->peer.port = port; 502 asoc->peer.port = port;
448 503
449 /* Check to see if this is a duplicate. */ 504 /* Check to see if this is a duplicate. */
450 peer = sctp_assoc_lookup_paddr(asoc, addr); 505 peer = sctp_assoc_lookup_paddr(asoc, addr);
451 if (peer) 506 if (peer) {
507 if (peer_state == SCTP_ACTIVE &&
508 peer->state == SCTP_UNKNOWN)
509 peer->state = SCTP_ACTIVE;
452 return peer; 510 return peer;
511 }
453 512
454 peer = sctp_transport_new(addr, gfp); 513 peer = sctp_transport_new(addr, gfp);
455 if (!peer) 514 if (!peer)
@@ -516,8 +575,12 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
516 /* Set the transport's RTO.initial value */ 575 /* Set the transport's RTO.initial value */
517 peer->rto = asoc->rto_initial; 576 peer->rto = asoc->rto_initial;
518 577
578 /* Set the peer's active state. */
579 peer->state = peer_state;
580
519 /* Attach the remote transport to our asoc. */ 581 /* Attach the remote transport to our asoc. */
520 list_add_tail(&peer->transports, &asoc->peer.transport_addr_list); 582 list_add_tail(&peer->transports, &asoc->peer.transport_addr_list);
583 asoc->peer.transport_count++;
521 584
522 /* If we do not yet have a primary path, set one. */ 585 /* If we do not yet have a primary path, set one. */
523 if (!asoc->peer.primary_path) { 586 if (!asoc->peer.primary_path) {
@@ -525,8 +588,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
525 asoc->peer.retran_path = peer; 588 asoc->peer.retran_path = peer;
526 } 589 }
527 590
528 if (asoc->peer.active_path == asoc->peer.retran_path) 591 if (asoc->peer.active_path == asoc->peer.retran_path) {
529 asoc->peer.retran_path = peer; 592 asoc->peer.retran_path = peer;
593 }
530 594
531 return peer; 595 return peer;
532} 596}
@@ -537,37 +601,16 @@ void sctp_assoc_del_peer(struct sctp_association *asoc,
537{ 601{
538 struct list_head *pos; 602 struct list_head *pos;
539 struct list_head *temp; 603 struct list_head *temp;
540 struct sctp_transport *peer = NULL;
541 struct sctp_transport *transport; 604 struct sctp_transport *transport;
542 605
543 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { 606 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
544 transport = list_entry(pos, struct sctp_transport, transports); 607 transport = list_entry(pos, struct sctp_transport, transports);
545 if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { 608 if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) {
546 peer = transport; 609 /* Do book keeping for removing the peer and free it. */
547 list_del(pos); 610 sctp_assoc_rm_peer(asoc, transport);
548 break; 611 break;
549 } 612 }
550 } 613 }
551
552 /* The address we want delete is not in the association. */
553 if (!peer)
554 return;
555
556 /* Get the first transport of asoc. */
557 pos = asoc->peer.transport_addr_list.next;
558 transport = list_entry(pos, struct sctp_transport, transports);
559
560 /* Update any entries that match the peer to be deleted. */
561 if (asoc->peer.primary_path == peer)
562 sctp_assoc_set_primary(asoc, transport);
563 if (asoc->peer.active_path == peer)
564 asoc->peer.active_path = transport;
565 if (asoc->peer.retran_path == peer)
566 asoc->peer.retran_path = transport;
567 if (asoc->peer.last_data_from == peer)
568 asoc->peer.last_data_from = transport;
569
570 sctp_transport_free(peer);
571} 614}
572 615
573/* Lookup a transport by address. */ 616/* Lookup a transport by address. */
@@ -608,12 +651,12 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
608 /* Record the transition on the transport. */ 651 /* Record the transition on the transport. */
609 switch (command) { 652 switch (command) {
610 case SCTP_TRANSPORT_UP: 653 case SCTP_TRANSPORT_UP:
611 transport->active = SCTP_ACTIVE; 654 transport->state = SCTP_ACTIVE;
612 spc_state = SCTP_ADDR_AVAILABLE; 655 spc_state = SCTP_ADDR_AVAILABLE;
613 break; 656 break;
614 657
615 case SCTP_TRANSPORT_DOWN: 658 case SCTP_TRANSPORT_DOWN:
616 transport->active = SCTP_INACTIVE; 659 transport->state = SCTP_INACTIVE;
617 spc_state = SCTP_ADDR_UNREACHABLE; 660 spc_state = SCTP_ADDR_UNREACHABLE;
618 break; 661 break;
619 662
@@ -643,7 +686,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
643 list_for_each(pos, &asoc->peer.transport_addr_list) { 686 list_for_each(pos, &asoc->peer.transport_addr_list) {
644 t = list_entry(pos, struct sctp_transport, transports); 687 t = list_entry(pos, struct sctp_transport, transports);
645 688
646 if (!t->active) 689 if (t->state == SCTP_INACTIVE)
647 continue; 690 continue;
648 if (!first || t->last_time_heard > first->last_time_heard) { 691 if (!first || t->last_time_heard > first->last_time_heard) {
649 second = first; 692 second = first;
@@ -663,7 +706,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
663 * [If the primary is active but not most recent, bump the most 706 * [If the primary is active but not most recent, bump the most
664 * recently used transport.] 707 * recently used transport.]
665 */ 708 */
666 if (asoc->peer.primary_path->active && 709 if (asoc->peer.primary_path->state != SCTP_INACTIVE &&
667 first != asoc->peer.primary_path) { 710 first != asoc->peer.primary_path) {
668 second = first; 711 second = first;
669 first = asoc->peer.primary_path; 712 first = asoc->peer.primary_path;
@@ -958,7 +1001,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
958 transports); 1001 transports);
959 if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr)) 1002 if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr))
960 sctp_assoc_add_peer(asoc, &trans->ipaddr, 1003 sctp_assoc_add_peer(asoc, &trans->ipaddr,
961 GFP_ATOMIC); 1004 GFP_ATOMIC, SCTP_ACTIVE);
962 } 1005 }
963 1006
964 asoc->ctsn_ack_point = asoc->next_tsn - 1; 1007 asoc->ctsn_ack_point = asoc->next_tsn - 1;
@@ -998,7 +1041,7 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
998 1041
999 /* Try to find an active transport. */ 1042 /* Try to find an active transport. */
1000 1043
1001 if (t->active) { 1044 if (t->state != SCTP_INACTIVE) {
1002 break; 1045 break;
1003 } else { 1046 } else {
1004 /* Keep track of the next transport in case 1047 /* Keep track of the next transport in case
@@ -1019,6 +1062,40 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
1019 } 1062 }
1020 1063
1021 asoc->peer.retran_path = t; 1064 asoc->peer.retran_path = t;
1065
1066 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
1067 " %p addr: ",
1068 " port: %d\n",
1069 asoc,
1070 (&t->ipaddr),
1071 t->ipaddr.v4.sin_port);
1072}
1073
1074/* Choose the transport for sending an INIT packet. */
1075struct sctp_transport *sctp_assoc_choose_init_transport(
1076 struct sctp_association *asoc)
1077{
1078 struct sctp_transport *t;
1079
1080 /* Use the retran path. If the last INIT was sent over the
1081 * retran path, update the retran path and use it.
1082 */
1083 if (!asoc->init_last_sent_to) {
1084 t = asoc->peer.active_path;
1085 } else {
1086 if (asoc->init_last_sent_to == asoc->peer.retran_path)
1087 sctp_assoc_update_retran_path(asoc);
1088 t = asoc->peer.retran_path;
1089 }
1090
1091 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_choose_init_transport:association"
1092 " %p addr: ",
1093 " port: %d\n",
1094 asoc,
1095 (&t->ipaddr),
1096 t->ipaddr.v4.sin_port);
1097
1098 return t;
1022} 1099}
1023 1100
1024/* Choose the transport for sending a SHUTDOWN packet. */ 1101/* Choose the transport for sending a SHUTDOWN packet. */
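The new sctp_assoc_choose_init_transport() rotates retransmitted INITs across peer addresses: the first INIT goes out on the active path, and once the retran path has carried an INIT it is advanced before the next retry. Combined with clearing init_last_sent_to in sctp_assoc_rm_peer(), every remaining transport gets a turn. A reduced sketch of the rotation, with stand-in types instead of the kernel structures and advance_retran_path() standing in for sctp_assoc_update_retran_path():

    struct transport;

    struct assoc_paths {
        struct transport *active_path;
        struct transport *retran_path;
        struct transport *init_last_sent_to;
    };

    static struct transport *choose_init_transport(struct assoc_paths *a,
            struct transport *(*advance_retran_path)(struct assoc_paths *))
    {
        struct transport *t;

        if (!a->init_last_sent_to) {
            /* No INIT sent yet: start on the active path. */
            t = a->active_path;
        } else {
            /* If the last INIT already used the retransmission path,
             * advance it so the retry is sent to the next peer address. */
            if (a->init_last_sent_to == a->retran_path)
                a->retran_path = advance_retran_path(a);
            t = a->retran_path;
        }

        /* Record the choice so the next retry can rotate again. */
        a->init_last_sent_to = t;
        return t;
    }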
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 334f61773e6d..2ec0320fac3b 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -134,7 +134,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
134 ep->last_key = ep->current_key = 0; 134 ep->last_key = ep->current_key = 0;
135 ep->key_changed_at = jiffies; 135 ep->key_changed_at = jiffies;
136 136
137 ep->debug_name = "unnamedEndpoint";
138 return ep; 137 return ep;
139} 138}
140 139
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b719a77d66b4..339f7acfdb64 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -178,6 +178,37 @@ int sctp_rcv(struct sk_buff *skb)
178 178
179 asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); 179 asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
180 180
181 if (!asoc)
182 ep = __sctp_rcv_lookup_endpoint(&dest);
183
184 /* Retrieve the common input handling substructure. */
185 rcvr = asoc ? &asoc->base : &ep->base;
186 sk = rcvr->sk;
187
188 /*
189 * If a frame arrives on an interface and the receiving socket is
190 * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
191 */
192 if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb)))
193 {
194 sock_put(sk);
195 if (asoc) {
196 sctp_association_put(asoc);
197 asoc = NULL;
198 } else {
199 sctp_endpoint_put(ep);
200 ep = NULL;
201 }
202 sk = sctp_get_ctl_sock();
203 ep = sctp_sk(sk)->ep;
204 sctp_endpoint_hold(ep);
205 sock_hold(sk);
206 rcvr = &ep->base;
207 }
208
209 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
210 goto discard_release;
211
181 /* 212 /*
182 * RFC 2960, 8.4 - Handle "Out of the blue" Packets. 213 * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
183 * An SCTP packet is called an "out of the blue" (OOTB) 214 * An SCTP packet is called an "out of the blue" (OOTB)
@@ -187,22 +218,12 @@ int sctp_rcv(struct sk_buff *skb)
187 * packet belongs. 218 * packet belongs.
188 */ 219 */
189 if (!asoc) { 220 if (!asoc) {
190 ep = __sctp_rcv_lookup_endpoint(&dest);
191 if (sctp_rcv_ootb(skb)) { 221 if (sctp_rcv_ootb(skb)) {
192 SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES); 222 SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES);
193 goto discard_release; 223 goto discard_release;
194 } 224 }
195 } 225 }
196 226
197 /* Retrieve the common input handling substructure. */
198 rcvr = asoc ? &asoc->base : &ep->base;
199 sk = rcvr->sk;
200
201 if ((sk) && (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)) {
202 goto discard_release;
203 }
204
205
206 /* SCTP seems to always need a timestamp right now (FIXME) */ 227 /* SCTP seems to always need a timestamp right now (FIXME) */
207 if (skb->stamp.tv_sec == 0) { 228 if (skb->stamp.tv_sec == 0) {
208 do_gettimeofday(&skb->stamp); 229 do_gettimeofday(&skb->stamp);
@@ -265,13 +286,11 @@ discard_it:
265 286
266discard_release: 287discard_release:
267 /* Release any structures we may be holding. */ 288 /* Release any structures we may be holding. */
268 if (asoc) { 289 sock_put(sk);
269 sock_put(asoc->base.sk); 290 if (asoc)
270 sctp_association_put(asoc); 291 sctp_association_put(asoc);
271 } else { 292 else
272 sock_put(ep->base.sk);
273 sctp_endpoint_put(ep); 293 sctp_endpoint_put(ep);
274 }
275 294
276 goto discard_it; 295 goto discard_it;
277} 296}
@@ -334,7 +353,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
334 353
335 sctp_do_sm(SCTP_EVENT_T_OTHER, 354 sctp_do_sm(SCTP_EVENT_T_OTHER,
336 SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), 355 SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
337 asoc->state, asoc->ep, asoc, NULL, 356 asoc->state, asoc->ep, asoc, t,
338 GFP_ATOMIC); 357 GFP_ATOMIC);
339 358
340} 359}
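The sctp_rcv() change adds two early checks before normal processing: a packet arriving on an interface other than the one a SO_BINDTODEVICE socket is bound to is redirected to the control socket and handled as out-of-the-blue, and a socket whose receive buffer is already full discards the packet. A reduced model of those two predicates, with illustrative field names rather than the kernel structures:

    #include <stdbool.h>

    struct rx_check {
        int bound_dev_if;   /* 0 means the socket is not device-bound */
        int rcv_alloc;      /* bytes currently charged to the socket */
        int rcvbuf;         /* receive buffer limit */
    };

    /* True when a device-bound socket sees a frame from another interface. */
    static bool wrong_device(const struct rx_check *s, int in_ifindex)
    {
        return s->bound_dev_if && s->bound_dev_if != in_ifindex;
    }

    /* True when the socket cannot accept more receive memory. */
    static bool rcvbuf_full(const struct rx_check *s)
    {
        return s->rcv_alloc >= s->rcvbuf;
    }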
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c9d9ea064734..c7e42d125b9c 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -812,26 +812,23 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
812 if (addr->sa.sa_family != AF_INET6) 812 if (addr->sa.sa_family != AF_INET6)
813 af = sctp_get_af_specific(addr->sa.sa_family); 813 af = sctp_get_af_specific(addr->sa.sa_family);
814 else { 814 else {
815 struct sock *sk;
816 int type = ipv6_addr_type(&addr->v6.sin6_addr); 815 int type = ipv6_addr_type(&addr->v6.sin6_addr);
817 sk = sctp_opt2sk(opt); 816 struct net_device *dev;
817
818 if (type & IPV6_ADDR_LINKLOCAL) { 818 if (type & IPV6_ADDR_LINKLOCAL) {
819 /* Note: Behavior similar to af_inet6.c: 819 if (!addr->v6.sin6_scope_id)
820 * 1) Overrides previous bound_dev_if 820 return 0;
821 * 2) Destructive even if bind isn't successful. 821 dev = dev_get_by_index(addr->v6.sin6_scope_id);
822 */ 822 if (!dev)
823
824 if (addr->v6.sin6_scope_id)
825 sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
826 if (!sk->sk_bound_dev_if)
827 return 0; 823 return 0;
824 dev_put(dev);
828 } 825 }
829 af = opt->pf->af; 826 af = opt->pf->af;
830 } 827 }
831 return af->available(addr, opt); 828 return af->available(addr, opt);
832} 829}
833 830
834/* Verify that the provided sockaddr looks bindable. Common verification, 831/* Verify that the provided sockaddr looks sendable. Common verification,
835 * has already been taken care of. 832 * has already been taken care of.
836 */ 833 */
837static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) 834static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
@@ -842,19 +839,16 @@ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
842 if (addr->sa.sa_family != AF_INET6) 839 if (addr->sa.sa_family != AF_INET6)
843 af = sctp_get_af_specific(addr->sa.sa_family); 840 af = sctp_get_af_specific(addr->sa.sa_family);
844 else { 841 else {
845 struct sock *sk;
846 int type = ipv6_addr_type(&addr->v6.sin6_addr); 842 int type = ipv6_addr_type(&addr->v6.sin6_addr);
847 sk = sctp_opt2sk(opt); 843 struct net_device *dev;
844
848 if (type & IPV6_ADDR_LINKLOCAL) { 845 if (type & IPV6_ADDR_LINKLOCAL) {
849 /* Note: Behavior similar to af_inet6.c: 846 if (!addr->v6.sin6_scope_id)
850 * 1) Overrides previous bound_dev_if 847 return 0;
851 * 2) Destructive even if bind isn't successful. 848 dev = dev_get_by_index(addr->v6.sin6_scope_id);
852 */ 849 if (!dev)
853
854 if (addr->v6.sin6_scope_id)
855 sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
856 if (!sk->sk_bound_dev_if)
857 return 0; 850 return 0;
851 dev_put(dev);
858 } 852 }
859 af = opt->pf->af; 853 af = opt->pf->af;
860 } 854 }
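Both IPv6 verify hooks now stop overwriting sk_bound_dev_if and instead only validate a link-local address: it must carry a scope id, and that scope id must name an existing interface. A small sketch of the check, where lookup_ifindex() is a stand-in for the dev_get_by_index()/dev_put() pair:

    #include <stdbool.h>

    static bool linklocal_scope_ok(bool is_linklocal, unsigned int scope_id,
                                   bool (*lookup_ifindex)(unsigned int))
    {
        if (!is_linklocal)
            return true;        /* global addresses need no scope id */
        if (!scope_id)
            return false;       /* link-local without a scope id: reject */
        return lookup_ifindex(scope_id);
    }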
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 1b2d4adc4ddb..4eb81a1407b7 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -682,9 +682,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
682 682
683 if (!new_transport) { 683 if (!new_transport) {
684 new_transport = asoc->peer.active_path; 684 new_transport = asoc->peer.active_path;
685 } else if (!new_transport->active) { 685 } else if (new_transport->state == SCTP_INACTIVE) {
686 /* If the chunk is Heartbeat or Heartbeat Ack, 686 /* If the chunk is Heartbeat or Heartbeat Ack,
687 * send it to chunk->transport, even if it's 687 * send it to chunk->transport, even if it's
688 * inactive. 688 * inactive.
689 * 689 *
690 * 3.3.6 Heartbeat Acknowledgement: 690 * 3.3.6 Heartbeat Acknowledgement:
@@ -840,7 +840,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
840 * Otherwise, we want to use the active path. 840 * Otherwise, we want to use the active path.
841 */ 841 */
842 new_transport = chunk->transport; 842 new_transport = chunk->transport;
843 if (!new_transport || !new_transport->active) 843 if (!new_transport ||
844 new_transport->state == SCTP_INACTIVE)
844 new_transport = asoc->peer.active_path; 845 new_transport = asoc->peer.active_path;
845 846
846 /* Change packets if necessary. */ 847 /* Change packets if necessary. */
@@ -1454,7 +1455,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1454 /* Mark the destination transport address as 1455 /* Mark the destination transport address as
1455 * active if it is not so marked. 1456 * active if it is not so marked.
1456 */ 1457 */
1457 if (!transport->active) { 1458 if (transport->state == SCTP_INACTIVE) {
1458 sctp_assoc_control_transport( 1459 sctp_assoc_control_transport(
1459 transport->asoc, 1460 transport->asoc,
1460 transport, 1461 transport,
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index e42fd8c2916b..98d49ec9b74b 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -132,14 +132,25 @@ void sctp_snmp_proc_exit(void)
132static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) 132static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
133{ 133{
134 struct list_head *pos; 134 struct list_head *pos;
135 struct sctp_association *asoc;
135 struct sctp_sockaddr_entry *laddr; 136 struct sctp_sockaddr_entry *laddr;
136 union sctp_addr *addr; 137 struct sctp_transport *peer;
138 union sctp_addr *addr, *primary = NULL;
137 struct sctp_af *af; 139 struct sctp_af *af;
138 140
141 if (epb->type == SCTP_EP_TYPE_ASSOCIATION) {
142 asoc = sctp_assoc(epb);
143 peer = asoc->peer.primary_path;
144 primary = &peer->saddr;
145 }
146
139 list_for_each(pos, &epb->bind_addr.address_list) { 147 list_for_each(pos, &epb->bind_addr.address_list) {
140 laddr = list_entry(pos, struct sctp_sockaddr_entry, list); 148 laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
141 addr = (union sctp_addr *)&laddr->a; 149 addr = (union sctp_addr *)&laddr->a;
142 af = sctp_get_af_specific(addr->sa.sa_family); 150 af = sctp_get_af_specific(addr->sa.sa_family);
151 if (primary && af->cmp_addr(addr, primary)) {
152 seq_printf(seq, "*");
153 }
143 af->seq_dump_addr(seq, addr); 154 af->seq_dump_addr(seq, addr);
144 } 155 }
145} 156}
@@ -149,17 +160,54 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
149{ 160{
150 struct list_head *pos; 161 struct list_head *pos;
151 struct sctp_transport *transport; 162 struct sctp_transport *transport;
152 union sctp_addr *addr; 163 union sctp_addr *addr, *primary;
153 struct sctp_af *af; 164 struct sctp_af *af;
154 165
166 primary = &(assoc->peer.primary_addr);
155 list_for_each(pos, &assoc->peer.transport_addr_list) { 167 list_for_each(pos, &assoc->peer.transport_addr_list) {
156 transport = list_entry(pos, struct sctp_transport, transports); 168 transport = list_entry(pos, struct sctp_transport, transports);
157 addr = (union sctp_addr *)&transport->ipaddr; 169 addr = (union sctp_addr *)&transport->ipaddr;
158 af = sctp_get_af_specific(addr->sa.sa_family); 170 af = sctp_get_af_specific(addr->sa.sa_family);
171 if (af->cmp_addr(addr, primary)) {
172 seq_printf(seq, "*");
173 }
159 af->seq_dump_addr(seq, addr); 174 af->seq_dump_addr(seq, addr);
160 } 175 }
161} 176}
162 177
178static void * sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
179{
180 if (*pos > sctp_ep_hashsize)
181 return NULL;
182
183 if (*pos < 0)
184 *pos = 0;
185
186 if (*pos == 0)
187 seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n");
188
189 ++*pos;
190
191 return (void *)pos;
192}
193
194static void sctp_eps_seq_stop(struct seq_file *seq, void *v)
195{
196 return;
197}
198
199
200static void * sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos)
201{
202 if (*pos > sctp_ep_hashsize)
203 return NULL;
204
205 ++*pos;
206
207 return pos;
208}
209
210
163/* Display sctp endpoints (/proc/net/sctp/eps). */ 211/* Display sctp endpoints (/proc/net/sctp/eps). */
164static int sctp_eps_seq_show(struct seq_file *seq, void *v) 212static int sctp_eps_seq_show(struct seq_file *seq, void *v)
165{ 213{
@@ -167,38 +215,50 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v)
167 struct sctp_ep_common *epb; 215 struct sctp_ep_common *epb;
168 struct sctp_endpoint *ep; 216 struct sctp_endpoint *ep;
169 struct sock *sk; 217 struct sock *sk;
170 int hash; 218 int hash = *(int *)v;
171 219
172 seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT LADDRS\n"); 220 if (hash > sctp_ep_hashsize)
173 for (hash = 0; hash < sctp_ep_hashsize; hash++) { 221 return -ENOMEM;
174 head = &sctp_ep_hashtable[hash]; 222
175 read_lock(&head->lock); 223 head = &sctp_ep_hashtable[hash-1];
176 for (epb = head->chain; epb; epb = epb->next) { 224 sctp_local_bh_disable();
177 ep = sctp_ep(epb); 225 read_lock(&head->lock);
178 sk = epb->sk; 226 for (epb = head->chain; epb; epb = epb->next) {
179 seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d ", ep, sk, 227 ep = sctp_ep(epb);
180 sctp_sk(sk)->type, sk->sk_state, hash, 228 sk = epb->sk;
181 epb->bind_addr.port); 229 seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk,
182 sctp_seq_dump_local_addrs(seq, epb); 230 sctp_sk(sk)->type, sk->sk_state, hash-1,
183 seq_printf(seq, "\n"); 231 epb->bind_addr.port,
184 } 232 sock_i_uid(sk), sock_i_ino(sk));
185 read_unlock(&head->lock); 233
234 sctp_seq_dump_local_addrs(seq, epb);
235 seq_printf(seq, "\n");
186 } 236 }
237 read_unlock(&head->lock);
238 sctp_local_bh_enable();
187 239
188 return 0; 240 return 0;
189} 241}
190 242
243static struct seq_operations sctp_eps_ops = {
244 .start = sctp_eps_seq_start,
245 .next = sctp_eps_seq_next,
246 .stop = sctp_eps_seq_stop,
247 .show = sctp_eps_seq_show,
248};
249
250
191/* Initialize the seq file operations for 'eps' object. */ 251/* Initialize the seq file operations for 'eps' object. */
192static int sctp_eps_seq_open(struct inode *inode, struct file *file) 252static int sctp_eps_seq_open(struct inode *inode, struct file *file)
193{ 253{
194 return single_open(file, sctp_eps_seq_show, NULL); 254 return seq_open(file, &sctp_eps_ops);
195} 255}
196 256
197static struct file_operations sctp_eps_seq_fops = { 257static struct file_operations sctp_eps_seq_fops = {
198 .open = sctp_eps_seq_open, 258 .open = sctp_eps_seq_open,
199 .read = seq_read, 259 .read = seq_read,
200 .llseek = seq_lseek, 260 .llseek = seq_lseek,
201 .release = single_release, 261 .release = seq_release,
202}; 262};
203 263
204/* Set up the proc fs entry for 'eps' object. */ 264/* Set up the proc fs entry for 'eps' object. */
@@ -221,6 +281,40 @@ void sctp_eps_proc_exit(void)
221 remove_proc_entry("eps", proc_net_sctp); 281 remove_proc_entry("eps", proc_net_sctp);
222} 282}
223 283
284
285static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
286{
287 if (*pos > sctp_assoc_hashsize)
288 return NULL;
289
290 if (*pos < 0)
291 *pos = 0;
292
293 if (*pos == 0)
294 seq_printf(seq, " ASSOC SOCK STY SST ST HBKT ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
295 "RPORT LADDRS <-> RADDRS\n");
296
297 ++*pos;
298
299 return (void *)pos;
300}
301
302static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
303{
304 return;
305}
306
307
308static void * sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
309{
310 if (*pos > sctp_assoc_hashsize)
311 return NULL;
312
313 ++*pos;
314
315 return pos;
316}
317
224/* Display sctp associations (/proc/net/sctp/assocs). */ 318/* Display sctp associations (/proc/net/sctp/assocs). */
225static int sctp_assocs_seq_show(struct seq_file *seq, void *v) 319static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
226{ 320{
@@ -228,43 +322,57 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
228 struct sctp_ep_common *epb; 322 struct sctp_ep_common *epb;
229 struct sctp_association *assoc; 323 struct sctp_association *assoc;
230 struct sock *sk; 324 struct sock *sk;
231 int hash; 325 int hash = *(int *)v;
232 326
233 seq_printf(seq, " ASSOC SOCK STY SST ST HBKT LPORT RPORT " 327 if (hash > sctp_assoc_hashsize)
234 "LADDRS <-> RADDRS\n"); 328 return -ENOMEM;
235 for (hash = 0; hash < sctp_assoc_hashsize; hash++) { 329
236 head = &sctp_assoc_hashtable[hash]; 330 head = &sctp_assoc_hashtable[hash-1];
237 read_lock(&head->lock); 331 sctp_local_bh_disable();
238 for (epb = head->chain; epb; epb = epb->next) { 332 read_lock(&head->lock);
239 assoc = sctp_assoc(epb); 333 for (epb = head->chain; epb; epb = epb->next) {
240 sk = epb->sk; 334 assoc = sctp_assoc(epb);
241 seq_printf(seq, 335 sk = epb->sk;
242 "%8p %8p %-3d %-3d %-2d %-4d %-5d %-5d ", 336 seq_printf(seq,
243 assoc, sk, sctp_sk(sk)->type, sk->sk_state, 337 "%8p %8p %-3d %-3d %-2d %-4d %4d %8d %8d %7d %5lu %-5d %5d ",
244 assoc->state, hash, epb->bind_addr.port, 338 assoc, sk, sctp_sk(sk)->type, sk->sk_state,
245 assoc->peer.port); 339 assoc->state, hash-1, assoc->assoc_id,
246 sctp_seq_dump_local_addrs(seq, epb); 340 (sk->sk_rcvbuf - assoc->rwnd),
247 seq_printf(seq, "<-> "); 341 assoc->sndbuf_used,
248 sctp_seq_dump_remote_addrs(seq, assoc); 342 sock_i_uid(sk), sock_i_ino(sk),
249 seq_printf(seq, "\n"); 343 epb->bind_addr.port,
250 } 344 assoc->peer.port);
251 read_unlock(&head->lock); 345
346 seq_printf(seq, " ");
347 sctp_seq_dump_local_addrs(seq, epb);
348 seq_printf(seq, "<-> ");
349 sctp_seq_dump_remote_addrs(seq, assoc);
350 seq_printf(seq, "\n");
252 } 351 }
352 read_unlock(&head->lock);
353 sctp_local_bh_enable();
253 354
254 return 0; 355 return 0;
255} 356}
256 357
358static struct seq_operations sctp_assoc_ops = {
359 .start = sctp_assocs_seq_start,
360 .next = sctp_assocs_seq_next,
361 .stop = sctp_assocs_seq_stop,
362 .show = sctp_assocs_seq_show,
363};
364
257/* Initialize the seq file operations for 'assocs' object. */ 365/* Initialize the seq file operations for 'assocs' object. */
258static int sctp_assocs_seq_open(struct inode *inode, struct file *file) 366static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
259{ 367{
260 return single_open(file, sctp_assocs_seq_show, NULL); 368 return seq_open(file, &sctp_assoc_ops);
261} 369}
262 370
263static struct file_operations sctp_assocs_seq_fops = { 371static struct file_operations sctp_assocs_seq_fops = {
264 .open = sctp_assocs_seq_open, 372 .open = sctp_assocs_seq_open,
265 .read = seq_read, 373 .read = seq_read,
266 .llseek = seq_lseek, 374 .llseek = seq_lseek,
267 .release = single_release, 375 .release = seq_release,
268}; 376};
269 377
270/* Set up the proc fs entry for 'assocs' object. */ 378/* Set up the proc fs entry for 'assocs' object. */
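Both /proc/net/sctp files move from single_open(), which rendered the whole hash table in one show() call, to a seq_operations iterator where the position walks the hash buckets: start() prints the header at position 0, each show() call dumps one bucket (index pos-1) under that bucket's lock, and next() advances to the following bucket. A userspace analog of that one-bucket-per-step pattern (plain arrays stand in for the endpoint hash chains):

    #include <stdio.h>

    #define NBUCKETS 8

    static int buckets[NBUCKETS][3] = {
        {1}, {2, 3}, {0}, {4}, {0}, {5, 6, 7}, {0}, {8}
    };

    /* start(): print the header once, then hand out the first position. */
    static long seq_start(long *pos)
    {
        if (*pos == 0)
            printf(" BUCKET ENTRIES\n");
        return (*pos)++ < NBUCKETS ? *pos : -1;
    }

    /* next(): advance to the following bucket, or signal the end. */
    static long seq_next(long *pos)
    {
        return (*pos)++ < NBUCKETS ? *pos : -1;
    }

    /* show(): dump a single bucket; the iterator value is index + 1. */
    static void seq_show(long v)
    {
        long hash = v - 1;
        printf(" %6ld", hash);
        for (int i = 0; i < 3 && buckets[hash][i]; i++)
            printf(" %d", buckets[hash][i]);
        printf("\n");
    }

    int main(void)
    {
        long pos = 0;
        for (long v = seq_start(&pos); v != -1; v = seq_next(&pos))
            seq_show(v);
        return 0;
    }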
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2e1f9c3556f5..5135e1a25d25 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -378,10 +378,13 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
378{ 378{
379 int ret = inet_addr_type(addr->v4.sin_addr.s_addr); 379 int ret = inet_addr_type(addr->v4.sin_addr.s_addr);
380 380
381 /* FIXME: ip_nonlocal_bind sysctl support. */
382 381
383 if (addr->v4.sin_addr.s_addr != INADDR_ANY && ret != RTN_LOCAL) 382 if (addr->v4.sin_addr.s_addr != INADDR_ANY &&
383 ret != RTN_LOCAL &&
384 !sp->inet.freebind &&
385 !sysctl_ip_nonlocal_bind)
384 return 0; 386 return 0;
387
385 return 1; 388 return 1;
386} 389}
387 390
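The sctp_v4_available() change brings SCTP in line with IP_FREEBIND and the net.ipv4.ip_nonlocal_bind sysctl: a v4 address is bindable if it is the wildcard, is locally configured, or non-local binds are permitted per-socket or system-wide. The whole test reduces to a predicate like the following sketch (plain booleans replace the kernel's flags and route-type lookup):

    #include <stdbool.h>

    static bool v4_addr_available(bool is_any, bool is_local,
                                  bool sk_freebind, bool sysctl_nonlocal_bind)
    {
        if (is_any || is_local)
            return true;                                  /* always acceptable */
        return sk_freebind || sysctl_nonlocal_bind;       /* non-local bind allowed? */
    }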
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 33ac8bf47b0e..5baed9bb7de5 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1830,7 +1830,7 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1830 * be a a better choice than any of the embedded addresses. 1830 * be a a better choice than any of the embedded addresses.
1831 */ 1831 */
1832 if (peer_addr) 1832 if (peer_addr)
1833 if(!sctp_assoc_add_peer(asoc, peer_addr, gfp)) 1833 if(!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
1834 goto nomem; 1834 goto nomem;
1835 1835
1836 /* Process the initialization parameters. */ 1836 /* Process the initialization parameters. */
@@ -1841,6 +1841,14 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1841 goto clean_up; 1841 goto clean_up;
1842 } 1842 }
1843 1843
1844 /* Walk list of transports, removing transports in the UNKNOWN state. */
1845 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
1846 transport = list_entry(pos, struct sctp_transport, transports);
1847 if (transport->state == SCTP_UNKNOWN) {
1848 sctp_assoc_rm_peer(asoc, transport);
1849 }
1850 }
1851
1844 /* The fixed INIT headers are always in network byte 1852 /* The fixed INIT headers are always in network byte
1845 * order. 1853 * order.
1846 */ 1854 */
@@ -1906,7 +1914,8 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1906 * stream sequence number shall be set to 0. 1914 * stream sequence number shall be set to 0.
1907 */ 1915 */
1908 1916
1909 /* Allocate storage for the negotiated streams if it is not a temporary * association. 1917 /* Allocate storage for the negotiated streams if it is not a temporary
1918 * association.
1910 */ 1919 */
1911 if (!asoc->temp) { 1920 if (!asoc->temp) {
1912 int assoc_id; 1921 int assoc_id;
@@ -1952,6 +1961,9 @@ clean_up:
1952 list_del_init(pos); 1961 list_del_init(pos);
1953 sctp_transport_free(transport); 1962 sctp_transport_free(transport);
1954 } 1963 }
1964
1965 asoc->peer.transport_count = 0;
1966
1955nomem: 1967nomem:
1956 return 0; 1968 return 0;
1957} 1969}
@@ -1995,7 +2007,7 @@ static int sctp_process_param(struct sctp_association *asoc,
1995 af->from_addr_param(&addr, param.addr, asoc->peer.port, 0); 2007 af->from_addr_param(&addr, param.addr, asoc->peer.port, 0);
1996 scope = sctp_scope(peer_addr); 2008 scope = sctp_scope(peer_addr);
1997 if (sctp_in_scope(&addr, scope)) 2009 if (sctp_in_scope(&addr, scope))
1998 if (!sctp_assoc_add_peer(asoc, &addr, gfp)) 2010 if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_ACTIVE))
1999 return 0; 2011 return 0;
2000 break; 2012 break;
2001 2013
@@ -2396,7 +2408,7 @@ static __u16 sctp_process_asconf_param(struct sctp_association *asoc,
2396 * Due to Resource Shortage'. 2408 * Due to Resource Shortage'.
2397 */ 2409 */
2398 2410
2399 peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC); 2411 peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_ACTIVE);
2400 if (!peer) 2412 if (!peer)
2401 return SCTP_ERROR_RSRC_LOW; 2413 return SCTP_ERROR_RSRC_LOW;
2402 2414
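After the INIT parameters have been processed, the new walk in sctp_process_init() drops any transport still in the UNKNOWN state, since the peer did not confirm that address. A minimal sketch of such a safe removal pass over a singly linked list (the kernel uses list_for_each_safe() and sctp_assoc_rm_peer() instead):

    #include <stdlib.h>

    enum t_state { T_UNKNOWN, T_ACTIVE, T_INACTIVE };

    struct transport {
        enum t_state state;
        struct transport *next;
    };

    /* Remove and free every transport left in the UNKNOWN state. */
    static void prune_unknown(struct transport **head)
    {
        struct transport **link = head;

        while (*link) {
            if ((*link)->state == T_UNKNOWN) {
                struct transport *dead = *link;
                *link = dead->next;      /* unlink before freeing */
                free(dead);
            } else {
                link = &(*link)->next;
            }
        }
    }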
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f65fa441952f..778639db125a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -414,11 +414,13 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
414 */ 414 */
415 asoc->overall_error_count++; 415 asoc->overall_error_count++;
416 416
417 if (transport->active && 417 if (transport->state != SCTP_INACTIVE &&
418 (transport->error_count++ >= transport->max_retrans)) { 418 (transport->error_count++ >= transport->max_retrans)) {
419 SCTP_DEBUG_PRINTK("transport_strike: transport " 419 SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
420 "IP:%d.%d.%d.%d failed.\n", 420 " transport IP: port:%d failed.\n",
421 NIPQUAD(transport->ipaddr.v4.sin_addr)); 421 asoc,
422 (&transport->ipaddr),
423 transport->ipaddr.v4.sin_port);
422 sctp_assoc_control_transport(asoc, transport, 424 sctp_assoc_control_transport(asoc, transport,
423 SCTP_TRANSPORT_DOWN, 425 SCTP_TRANSPORT_DOWN,
424 SCTP_FAILED_THRESHOLD); 426 SCTP_FAILED_THRESHOLD);
@@ -593,7 +595,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
593 /* Mark the destination transport address as active if it is not so 595 /* Mark the destination transport address as active if it is not so
594 * marked. 596 * marked.
595 */ 597 */
596 if (!t->active) 598 if (t->state == SCTP_INACTIVE)
597 sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, 599 sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
598 SCTP_HEARTBEAT_SUCCESS); 600 SCTP_HEARTBEAT_SUCCESS);
599 601
@@ -665,8 +667,11 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
665 667
666 asoc->state = state; 668 asoc->state = state;
667 669
670 SCTP_DEBUG_PRINTK("sctp_cmd_new_state: asoc %p[%s]\n",
671 asoc, sctp_state_tbl[state]);
672
668 if (sctp_style(sk, TCP)) { 673 if (sctp_style(sk, TCP)) {
669 /* Change the sk->sk_state of a TCP-style socket that has 674 /* Change the sk->sk_state of a TCP-style socket that has
670 * sucessfully completed a connect() call. 675 * sucessfully completed a connect() call.
671 */ 676 */
672 if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) 677 if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
@@ -678,6 +683,16 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
678 sk->sk_shutdown |= RCV_SHUTDOWN; 683 sk->sk_shutdown |= RCV_SHUTDOWN;
679 } 684 }
680 685
686 if (sctp_state(asoc, COOKIE_WAIT)) {
687 /* Reset init timeouts since they may have been
688 * increased due to timer expirations.
689 */
690 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
691 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT];
692 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
693 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE];
694 }
695
681 if (sctp_state(asoc, ESTABLISHED) || 696 if (sctp_state(asoc, ESTABLISHED) ||
682 sctp_state(asoc, CLOSED) || 697 sctp_state(asoc, CLOSED) ||
683 sctp_state(asoc, SHUTDOWN_RECEIVED)) { 698 sctp_state(asoc, SHUTDOWN_RECEIVED)) {
@@ -1120,10 +1135,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1120 * to be executed only during failed attempts of 1135 * to be executed only during failed attempts of
1121 * association establishment. 1136 * association establishment.
1122 */ 1137 */
1123 if ((asoc->peer.retran_path != 1138 if ((asoc->peer.retran_path !=
1124 asoc->peer.primary_path) && 1139 asoc->peer.primary_path) &&
1125 (asoc->counters[SCTP_COUNTER_INIT_ERROR] > 0)) { 1140 (asoc->init_err_counter > 0)) {
1126 sctp_add_cmd_sf(commands, 1141 sctp_add_cmd_sf(commands,
1127 SCTP_CMD_FORCE_PRIM_RETRAN, 1142 SCTP_CMD_FORCE_PRIM_RETRAN,
1128 SCTP_NULL()); 1143 SCTP_NULL());
1129 } 1144 }
@@ -1237,18 +1252,67 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1237 sctp_association_put(asoc); 1252 sctp_association_put(asoc);
1238 break; 1253 break;
1239 1254
1255 case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
1256 chunk = cmd->obj.ptr;
1257 t = sctp_assoc_choose_init_transport(asoc);
1258 asoc->init_last_sent_to = t;
1259 chunk->transport = t;
1260 t->init_sent_count++;
1261 break;
1262
1240 case SCTP_CMD_INIT_RESTART: 1263 case SCTP_CMD_INIT_RESTART:
1241 /* Do the needed accounting and updates 1264 /* Do the needed accounting and updates
1242 * associated with restarting an initialization 1265 * associated with restarting an initialization
1243 * timer. 1266 * timer. Only multiply the timeout by two if
1267 * all transports have been tried at the current
1268 * timeout.
1269 */
1270 t = asoc->init_last_sent_to;
1271 asoc->init_err_counter++;
1272
1273 if (t->init_sent_count > (asoc->init_cycle + 1)) {
1274 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] *= 2;
1275 if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] >
1276 asoc->max_init_timeo) {
1277 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
1278 asoc->max_init_timeo;
1279 }
1280 asoc->init_cycle++;
1281 SCTP_DEBUG_PRINTK(
1282 "T1 INIT Timeout adjustment"
1283 " init_err_counter: %d"
1284 " cycle: %d"
1285 " timeout: %d\n",
1286 asoc->init_err_counter,
1287 asoc->init_cycle,
1288 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]);
1289 }
1290
1291 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
1292 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
1293 break;
1294
1295 case SCTP_CMD_COOKIEECHO_RESTART:
1296 /* Do the needed accounting and updates
1297 * associated with restarting an initialization
1298 * timer. Only multiply the timeout by two if
1299 * all transports have been tried at the current
1300 * timeout.
1244 */ 1301 */
1245 asoc->counters[SCTP_COUNTER_INIT_ERROR]++; 1302 asoc->init_err_counter++;
1246 asoc->timeouts[cmd->obj.to] *= 2; 1303
1247 if (asoc->timeouts[cmd->obj.to] > 1304 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] *= 2;
1305 if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] >
1248 asoc->max_init_timeo) { 1306 asoc->max_init_timeo) {
1249 asoc->timeouts[cmd->obj.to] = 1307 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
1250 asoc->max_init_timeo; 1308 asoc->max_init_timeo;
1251 } 1309 }
1310 SCTP_DEBUG_PRINTK(
1311 "T1 COOKIE Timeout adjustment"
1312 " init_err_counter: %d"
1313 " timeout: %d\n",
1314 asoc->init_err_counter,
1315 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]);
1252 1316
1253 /* If we've sent any data bundled with 1317 /* If we've sent any data bundled with
1254 * COOKIE-ECHO we need to resend. 1318 * COOKIE-ECHO we need to resend.
@@ -1261,7 +1325,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1261 1325
1262 sctp_add_cmd_sf(commands, 1326 sctp_add_cmd_sf(commands,
1263 SCTP_CMD_TIMER_RESTART, 1327 SCTP_CMD_TIMER_RESTART,
1264 SCTP_TO(cmd->obj.to)); 1328 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
1265 break; 1329 break;
1266 1330
1267 case SCTP_CMD_INIT_FAILED: 1331 case SCTP_CMD_INIT_FAILED:
@@ -1273,12 +1337,13 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1273 subtype, chunk, cmd->obj.u32); 1337 subtype, chunk, cmd->obj.u32);
1274 break; 1338 break;
1275 1339
1276 case SCTP_CMD_COUNTER_INC: 1340 case SCTP_CMD_INIT_COUNTER_INC:
1277 asoc->counters[cmd->obj.counter]++; 1341 asoc->init_err_counter++;
1278 break; 1342 break;
1279 1343
1280 case SCTP_CMD_COUNTER_RESET: 1344 case SCTP_CMD_INIT_COUNTER_RESET:
1281 asoc->counters[cmd->obj.counter] = 0; 1345 asoc->init_err_counter = 0;
1346 asoc->init_cycle = 0;
1282 break; 1347 break;
1283 1348
1284 case SCTP_CMD_REPORT_DUP: 1349 case SCTP_CMD_REPORT_DUP:
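The SCTP_CMD_INIT_RESTART handler above only doubles the T1-INIT timeout, capped at max_init_timeo, once the chosen transport has carried more INITs than completed cycles, i.e. after every peer address has been tried at the current timeout. A compact model of that backoff rule, with plain ints standing in for the association fields:

    struct init_backoff {
        int timeout;            /* current T1-INIT timeout */
        int max_init_timeo;     /* upper bound for the timeout */
        int init_cycle;         /* completed passes over all transports */
        int init_err_counter;   /* failed INIT attempts so far */
    };

    static void init_restart(struct init_backoff *b, int transport_init_sent_count)
    {
        b->init_err_counter++;

        /* Double the timeout only after a full cycle over all transports. */
        if (transport_init_sent_count > b->init_cycle + 1) {
            b->timeout *= 2;
            if (b->timeout > b->max_init_timeo)
                b->timeout = b->max_init_timeo;
            b->init_cycle++;
        }
    }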
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8e01b8f09ac2..86073df418f5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
92 sctp_cmd_seq_t *commands); 92 sctp_cmd_seq_t *commands);
93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); 93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
94 94
95static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
96 __u16 error,
97 const struct sctp_association *asoc,
98 struct sctp_transport *transport);
99
100static sctp_disposition_t sctp_sf_violation_chunklen(
101 const struct sctp_endpoint *ep,
102 const struct sctp_association *asoc,
103 const sctp_subtype_t type,
104 void *arg,
105 sctp_cmd_seq_t *commands);
95 106
96/* Small helper function that checks if the chunk length 107/* Small helper function that checks if the chunk length
97 * is of the appropriate length. The 'required_length' argument 108 * is of the appropriate length. The 'required_length' argument
@@ -533,6 +544,9 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
533 sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT, 544 sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
534 SCTP_PEER_INIT(initchunk)); 545 SCTP_PEER_INIT(initchunk));
535 546
547 /* Reset init error count upon receipt of INIT-ACK. */
548 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
549
536 /* 5.1 C) "A" shall stop the T1-init timer and leave 550 /* 5.1 C) "A" shall stop the T1-init timer and leave
537 * COOKIE-WAIT state. "A" shall then ... start the T1-cookie 551 * COOKIE-WAIT state. "A" shall then ... start the T1-cookie
538 * timer, and enter the COOKIE-ECHOED state. 552 * timer, and enter the COOKIE-ECHOED state.
@@ -775,8 +789,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep,
775 * from the COOKIE-ECHOED state to the COOKIE-WAIT 789 * from the COOKIE-ECHOED state to the COOKIE-WAIT
776 * state is performed. 790 * state is performed.
777 */ 791 */
778 sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_RESET, 792 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
779 SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
780 793
781 /* RFC 2960 5.1 Normal Establishment of an Association 794 /* RFC 2960 5.1 Normal Establishment of an Association
782 * 795 *
@@ -1019,10 +1032,22 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
1019 link = sctp_assoc_lookup_paddr(asoc, &from_addr); 1032 link = sctp_assoc_lookup_paddr(asoc, &from_addr);
1020 1033
1021 /* This should never happen, but lets log it if so. */ 1034 /* This should never happen, but lets log it if so. */
1022 if (!link) { 1035 if (unlikely(!link)) {
1023 printk(KERN_WARNING 1036 if (from_addr.sa.sa_family == AF_INET6) {
1024 "%s: Could not find address %d.%d.%d.%d\n", 1037 printk(KERN_WARNING
1025 __FUNCTION__, NIPQUAD(from_addr.v4.sin_addr)); 1038 "%s association %p could not find address "
1039 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
1040 __FUNCTION__,
1041 asoc,
1042 NIP6(from_addr.v6.sin6_addr));
1043 } else {
1044 printk(KERN_WARNING
1045 "%s association %p could not find address "
1046 "%u.%u.%u.%u\n",
1047 __FUNCTION__,
1048 asoc,
1049 NIPQUAD(from_addr.v4.sin_addr.s_addr));
1050 }
1026 return SCTP_DISPOSITION_DISCARD; 1051 return SCTP_DISPOSITION_DISCARD;
1027 } 1052 }
1028 1053
@@ -2095,9 +2120,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
2095 sctp_errhdr_t *err; 2120 sctp_errhdr_t *err;
2096 struct sctp_chunk *reply; 2121 struct sctp_chunk *reply;
2097 struct sctp_bind_addr *bp; 2122 struct sctp_bind_addr *bp;
2098 int attempts; 2123 int attempts = asoc->init_err_counter + 1;
2099
2100 attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
2101 2124
2102 if (attempts >= asoc->max_init_attempts) { 2125 if (attempts >= asoc->max_init_attempts) {
2103 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 2126 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -2157,8 +2180,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
2157 /* Cast away the const modifier, as we want to just 2180 /* Cast away the const modifier, as we want to just
2158 * rerun it through as a sideffect. 2181 * rerun it through as a sideffect.
2159 */ 2182 */
2160 sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_INC, 2183 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());
2161 SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
2162 2184
2163 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, 2185 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
2164 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); 2186 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
@@ -2281,8 +2303,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep,
2281 if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) 2303 if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
2282 error = ((sctp_errhdr_t *)chunk->skb->data)->cause; 2304 error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
2283 2305
2284 sctp_stop_t1_and_abort(commands, error); 2306 return sctp_stop_t1_and_abort(commands, error, asoc, chunk->transport);
2285 return SCTP_DISPOSITION_ABORT;
2286} 2307}
2287 2308
2288/* 2309/*
@@ -2294,8 +2315,8 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep
2294 void *arg, 2315 void *arg,
2295 sctp_cmd_seq_t *commands) 2316 sctp_cmd_seq_t *commands)
2296{ 2317{
2297 sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR); 2318 return sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR, asoc,
2298 return SCTP_DISPOSITION_ABORT; 2319 (struct sctp_transport *)arg);
2299} 2320}
2300 2321
2301/* 2322/*
@@ -2318,8 +2339,12 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
2318 * 2339 *
2319 * This is common code called by several sctp_sf_*_abort() functions above. 2340 * This is common code called by several sctp_sf_*_abort() functions above.
2320 */ 2341 */
2321void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error) 2342static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
2343 __u16 error,
2344 const struct sctp_association *asoc,
2345 struct sctp_transport *transport)
2322{ 2346{
2347 SCTP_DEBUG_PRINTK("ABORT received (INIT).\n");
2323 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, 2348 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
2324 SCTP_STATE(SCTP_STATE_CLOSED)); 2349 SCTP_STATE(SCTP_STATE_CLOSED));
2325 SCTP_INC_STATS(SCTP_MIB_ABORTEDS); 2350 SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
@@ -2328,6 +2353,7 @@ void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error)
2328 /* CMD_INIT_FAILED will DELETE_TCB. */ 2353 /* CMD_INIT_FAILED will DELETE_TCB. */
2329 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 2354 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
2330 SCTP_U32(error)); 2355 SCTP_U32(error));
2356 return SCTP_DISPOSITION_ABORT;
2331} 2357}
2332 2358
2333/* 2359/*
@@ -3672,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
3672 * 3698 *
3673 * Generate an ABORT chunk and terminate the association. 3699 * Generate an ABORT chunk and terminate the association.
3674 */ 3700 */
3675sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, 3701static sctp_disposition_t sctp_sf_violation_chunklen(
3702 const struct sctp_endpoint *ep,
3676 const struct sctp_association *asoc, 3703 const struct sctp_association *asoc,
3677 const sctp_subtype_t type, 3704 const sctp_subtype_t type,
3678 void *arg, 3705 void *arg,
@@ -3805,6 +3832,10 @@ sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
3805 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, 3832 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC,
3806 SCTP_ASOC((struct sctp_association *) asoc)); 3833 SCTP_ASOC((struct sctp_association *) asoc));
3807 3834
3835 /* Choose transport for INIT. */
3836 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
3837 SCTP_CHUNK(repl));
3838
3808 /* After sending the INIT, "A" starts the T1-init timer and 3839 /* After sending the INIT, "A" starts the T1-init timer and
3809 * enters the COOKIE-WAIT state. 3840 * enters the COOKIE-WAIT state.
3810 */ 3841 */
@@ -4589,7 +4620,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4589} 4620}
4590 4621
4591/* 4622/*
4592 * sctp_sf_t1_timer_expire 4623 * sctp_sf_t1_init_timer_expire
4593 * 4624 *
4594 * Section: 4 Note: 2 4625 * Section: 4 Note: 2
4595 * Verification Tag: 4626 * Verification Tag:
@@ -4603,7 +4634,59 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4603 * endpoint MUST abort the initialization process and report the 4634 * endpoint MUST abort the initialization process and report the
4604 * error to SCTP user. 4635 * error to SCTP user.
4605 * 4636 *
4606 * 3) If the T1-cookie timer expires, the endpoint MUST retransmit 4637 * Outputs
4638 * (timers, events)
4639 *
4640 */
4641sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep,
4642 const struct sctp_association *asoc,
4643 const sctp_subtype_t type,
4644 void *arg,
4645 sctp_cmd_seq_t *commands)
4646{
4647 struct sctp_chunk *repl = NULL;
4648 struct sctp_bind_addr *bp;
4649 int attempts = asoc->init_err_counter + 1;
4650
4651 SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n");
4652
4653 if (attempts < asoc->max_init_attempts) {
4654 bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
4655 repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
4656 if (!repl)
4657 return SCTP_DISPOSITION_NOMEM;
4658
4659 /* Choose transport for INIT. */
4660 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
4661 SCTP_CHUNK(repl));
4662
4663 /* Issue a sideeffect to do the needed accounting. */
4664 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
4665 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
4666
4667 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
4668 } else {
4669 SCTP_DEBUG_PRINTK("Giving up on INIT, attempts: %d"
4670 " max_init_attempts: %d\n",
4671 attempts, asoc->max_init_attempts);
4672 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
4673 SCTP_U32(SCTP_ERROR_NO_ERROR));
4674 return SCTP_DISPOSITION_DELETE_TCB;
4675 }
4676
4677 return SCTP_DISPOSITION_CONSUME;
4678}
4679
4680/*
4681 * sctp_sf_t1_cookie_timer_expire
4682 *
4683 * Section: 4 Note: 2
4684 * Verification Tag:
4685 * Inputs
4686 * (endpoint, asoc)
4687 *
4688 * RFC 2960 Section 4 Notes
4689 * 3) If the T1-cookie timer expires, the endpoint MUST retransmit
4607 * COOKIE ECHO and re-start the T1-cookie timer without changing 4690 * COOKIE ECHO and re-start the T1-cookie timer without changing
4608 * state. This MUST be repeated up to 'Max.Init.Retransmits' times. 4691 * state. This MUST be repeated up to 'Max.Init.Retransmits' times.
4609 * After that, the endpoint MUST abort the initialization process and 4692 * After that, the endpoint MUST abort the initialization process and
@@ -4613,46 +4696,26 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4613 * (timers, events) 4696 * (timers, events)
4614 * 4697 *
4615 */ 4698 */
4616sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep, 4699sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep,
4617 const struct sctp_association *asoc, 4700 const struct sctp_association *asoc,
4618 const sctp_subtype_t type, 4701 const sctp_subtype_t type,
4619 void *arg, 4702 void *arg,
4620 sctp_cmd_seq_t *commands) 4703 sctp_cmd_seq_t *commands)
4621{ 4704{
4622 struct sctp_chunk *repl; 4705 struct sctp_chunk *repl = NULL;
4623 struct sctp_bind_addr *bp; 4706 int attempts = asoc->init_err_counter + 1;
4624 sctp_event_timeout_t timer = (sctp_event_timeout_t) arg;
4625 int timeout;
4626 int attempts;
4627 4707
4628 timeout = asoc->timeouts[timer]; 4708 SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n");
4629 attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
4630 repl = NULL;
4631
4632 SCTP_DEBUG_PRINTK("Timer T1 expired.\n");
4633 4709
4634 if (attempts < asoc->max_init_attempts) { 4710 if (attempts < asoc->max_init_attempts) {
4635 switch (timer) { 4711 repl = sctp_make_cookie_echo(asoc, NULL);
4636 case SCTP_EVENT_TIMEOUT_T1_INIT:
4637 bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
4638 repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
4639 break;
4640
4641 case SCTP_EVENT_TIMEOUT_T1_COOKIE:
4642 repl = sctp_make_cookie_echo(asoc, NULL);
4643 break;
4644
4645 default:
4646 BUG();
4647 break;
4648 };
4649
4650 if (!repl) 4712 if (!repl)
4651 goto nomem; 4713 return SCTP_DISPOSITION_NOMEM;
4652 4714
4653 /* Issue a sideeffect to do the needed accounting. */ 4715 /* Issue a sideeffect to do the needed accounting. */
4654 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART, 4716 sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
4655 SCTP_TO(timer)); 4717 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
4718
4656 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); 4719 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
4657 } else { 4720 } else {
4658 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 4721 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -4661,9 +4724,6 @@ sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep,
4661 } 4724 }
4662 4725
4663 return SCTP_DISPOSITION_CONSUME; 4726 return SCTP_DISPOSITION_CONSUME;
4664
4665nomem:
4666 return SCTP_DISPOSITION_NOMEM;
4667} 4727}
4668 4728
4669/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN 4729/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
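The two T1 handlers above share one retry policy: while the error count is still below the association's limit, the INIT or COOKIE ECHO is rebuilt and the T1 timer restarted; otherwise initialization is abandoned and the TCB deleted. Below is a minimal userspace-style sketch of that decision, assuming the common default of eight attempts (RFC 2960 'Max.Init.Retransmits'); the names are illustrative, not kernel symbols.

/* Illustrative only: the retry policy shared by sctp_sf_t1_init_timer_expire()
 * and sctp_sf_t1_cookie_timer_expire(), lifted out as plain C.
 */
#include <stdio.h>

enum t1_action { T1_RETRANSMIT, T1_GIVE_UP };

static enum t1_action t1_timer_policy(int init_err_counter, int max_init_attempts)
{
        int attempts = init_err_counter + 1;

        if (attempts < max_init_attempts)
                return T1_RETRANSMIT;   /* rebuild chunk, restart the T1 timer */
        return T1_GIVE_UP;              /* SCTP_CMD_INIT_FAILED, delete the TCB */
}

int main(void)
{
        /* With eight attempts allowed, the seventh error still retransmits. */
        printf("%d\n", t1_timer_policy(6, 8) == T1_RETRANSMIT);
        printf("%d\n", t1_timer_policy(7, 8) == T1_GIVE_UP);
        return 0;
}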
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 8967846f69e8..75ef10408764 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -783,7 +783,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
783 /* SCTP_STATE_COOKIE_WAIT */ \ 783 /* SCTP_STATE_COOKIE_WAIT */ \
784 {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ 784 {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
785 /* SCTP_STATE_COOKIE_ECHOED */ \ 785 /* SCTP_STATE_COOKIE_ECHOED */ \
786 {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ 786 {.fn = sctp_sf_t1_cookie_timer_expire, \
787 .name = "sctp_sf_t1_cookie_timer_expire"}, \
787 /* SCTP_STATE_ESTABLISHED */ \ 788 /* SCTP_STATE_ESTABLISHED */ \
788 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 789 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
789 /* SCTP_STATE_SHUTDOWN_PENDING */ \ 790 /* SCTP_STATE_SHUTDOWN_PENDING */ \
@@ -802,7 +803,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
802 /* SCTP_STATE_CLOSED */ \ 803 /* SCTP_STATE_CLOSED */ \
803 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 804 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
804 /* SCTP_STATE_COOKIE_WAIT */ \ 805 /* SCTP_STATE_COOKIE_WAIT */ \
805 {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ 806 {.fn = sctp_sf_t1_init_timer_expire, \
807 .name = "sctp_sf_t1_init_timer_expire"}, \
806 /* SCTP_STATE_COOKIE_ECHOED */ \ 808 /* SCTP_STATE_COOKIE_ECHOED */ \
807 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 809 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
808 /* SCTP_STATE_ESTABLISHED */ \ 810 /* SCTP_STATE_ESTABLISHED */ \
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0b338eca6dc0..aad55dc3792b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -262,18 +262,18 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
262 * sockaddr_in6 [RFC 2553]), 262 * sockaddr_in6 [RFC 2553]),
263 * addr_len - the size of the address structure. 263 * addr_len - the size of the address structure.
264 */ 264 */
265SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) 265SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
266{ 266{
267 int retval = 0; 267 int retval = 0;
268 268
269 sctp_lock_sock(sk); 269 sctp_lock_sock(sk);
270 270
271 SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, uaddr: %p, addr_len: %d)\n", 271 SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, addr: %p, addr_len: %d)\n",
272 sk, uaddr, addr_len); 272 sk, addr, addr_len);
273 273
274 /* Disallow binding twice. */ 274 /* Disallow binding twice. */
275 if (!sctp_sk(sk)->ep->base.bind_addr.port) 275 if (!sctp_sk(sk)->ep->base.bind_addr.port)
276 retval = sctp_do_bind(sk, (union sctp_addr *)uaddr, 276 retval = sctp_do_bind(sk, (union sctp_addr *)addr,
277 addr_len); 277 addr_len);
278 else 278 else
279 retval = -EINVAL; 279 retval = -EINVAL;
@@ -318,23 +318,27 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
318 unsigned short snum; 318 unsigned short snum;
319 int ret = 0; 319 int ret = 0;
320 320
321 SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d)\n",
322 sk, addr, len);
323
324 /* Common sockaddr verification. */ 321 /* Common sockaddr verification. */
325 af = sctp_sockaddr_af(sp, addr, len); 322 af = sctp_sockaddr_af(sp, addr, len);
326 if (!af) 323 if (!af) {
324 SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d) EINVAL\n",
325 sk, addr, len);
327 return -EINVAL; 326 return -EINVAL;
327 }
328
329 snum = ntohs(addr->v4.sin_port);
330
331 SCTP_DEBUG_PRINTK_IPADDR("sctp_do_bind(sk: %p, new addr: ",
332 ", port: %d, new port: %d, len: %d)\n",
333 sk,
334 addr,
335 bp->port, snum,
336 len);
328 337
329 /* PF specific bind() address verification. */ 338 /* PF specific bind() address verification. */
330 if (!sp->pf->bind_verify(sp, addr)) 339 if (!sp->pf->bind_verify(sp, addr))
331 return -EADDRNOTAVAIL; 340 return -EADDRNOTAVAIL;
332 341
333 snum= ntohs(addr->v4.sin_port);
334
335 SCTP_DEBUG_PRINTK("sctp_do_bind: port: %d, new port: %d\n",
336 bp->port, snum);
337
338 /* We must either be unbound, or bind to the same port. */ 342 /* We must either be unbound, or bind to the same port. */
339 if (bp->port && (snum != bp->port)) { 343 if (bp->port && (snum != bp->port)) {
340 SCTP_DEBUG_PRINTK("sctp_do_bind:" 344 SCTP_DEBUG_PRINTK("sctp_do_bind:"
@@ -816,7 +820,8 @@ out:
816 * 820 *
817 * Basically do nothing but copying the addresses from user to kernel 821 * Basically do nothing but copying the addresses from user to kernel
818 * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk. 822 * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk.
819 * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace. 823 * This is used for tunneling the sctp_bindx() request through sctp_setsockopt()
824 * from userspace.
820 * 825 *
821 * We don't use copy_from_user() for optimization: we first do the 826 * We don't use copy_from_user() for optimization: we first do the
822 * sanity checks (buffer size -fast- and access check-healthy 827 * sanity checks (buffer size -fast- and access check-healthy
@@ -913,6 +918,243 @@ out:
913 return err; 918 return err;
914} 919}
915 920
921/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
922 *
923 * Common routine for handling connect() and sctp_connectx().
924 * Connect will come in with just a single address.
925 */
926static int __sctp_connect(struct sock* sk,
927 struct sockaddr *kaddrs,
928 int addrs_size)
929{
930 struct sctp_sock *sp;
931 struct sctp_endpoint *ep;
932 struct sctp_association *asoc = NULL;
933 struct sctp_association *asoc2;
934 struct sctp_transport *transport;
935 union sctp_addr to;
936 struct sctp_af *af;
937 sctp_scope_t scope;
938 long timeo;
939 int err = 0;
940 int addrcnt = 0;
941 int walk_size = 0;
942 struct sockaddr *sa_addr;
943 void *addr_buf;
944
945 sp = sctp_sk(sk);
946 ep = sp->ep;
947
948 /* connect() cannot be done on a socket that is already in ESTABLISHED
949 * state - UDP-style peeled off socket or a TCP-style socket that
950 * is already connected.
951 * It cannot be done even on a TCP-style listening socket.
952 */
953 if (sctp_sstate(sk, ESTABLISHED) ||
954 (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
955 err = -EISCONN;
956 goto out_free;
957 }
958
959 /* Walk through the addrs buffer and count the number of addresses. */
960 addr_buf = kaddrs;
961 while (walk_size < addrs_size) {
962 sa_addr = (struct sockaddr *)addr_buf;
963 af = sctp_get_af_specific(sa_addr->sa_family);
964
965 /* If the address family is not supported or if this address
966 * causes the address buffer to overflow return EINVAL.
967 */
968 if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
969 err = -EINVAL;
970 goto out_free;
971 }
972
973 err = sctp_verify_addr(sk, (union sctp_addr *)sa_addr,
974 af->sockaddr_len);
975 if (err)
976 goto out_free;
977
978 memcpy(&to, sa_addr, af->sockaddr_len);
979 to.v4.sin_port = ntohs(to.v4.sin_port);
980
981 /* Check if there already is a matching association on the
982 * endpoint (other than the one created here).
983 */
984 asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
985 if (asoc2 && asoc2 != asoc) {
986 if (asoc2->state >= SCTP_STATE_ESTABLISHED)
987 err = -EISCONN;
988 else
989 err = -EALREADY;
990 goto out_free;
991 }
992
993 /* If we could not find a matching association on the endpoint,
994 * make sure that there is no peeled-off association matching
995 * the peer address even on another socket.
996 */
997 if (sctp_endpoint_is_peeled_off(ep, &to)) {
998 err = -EADDRNOTAVAIL;
999 goto out_free;
1000 }
1001
1002 if (!asoc) {
1003 /* If a bind() or sctp_bindx() is not called prior to
1004 * an sctp_connectx() call, the system picks an
1005 * ephemeral port and will choose an address set
1006 * equivalent to binding with a wildcard address.
1007 */
1008 if (!ep->base.bind_addr.port) {
1009 if (sctp_autobind(sk)) {
1010 err = -EAGAIN;
1011 goto out_free;
1012 }
1013 }
1014
1015 scope = sctp_scope(&to);
1016 asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
1017 if (!asoc) {
1018 err = -ENOMEM;
1019 goto out_free;
1020 }
1021 }
1022
1023 /* Prime the peer's transport structures. */
1024 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
1025 SCTP_UNKNOWN);
1026 if (!transport) {
1027 err = -ENOMEM;
1028 goto out_free;
1029 }
1030
1031 addrcnt++;
1032 addr_buf += af->sockaddr_len;
1033 walk_size += af->sockaddr_len;
1034 }
1035
1036 err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
1037 if (err < 0) {
1038 goto out_free;
1039 }
1040
1041 err = sctp_primitive_ASSOCIATE(asoc, NULL);
1042 if (err < 0) {
1043 goto out_free;
1044 }
1045
1046 /* Initialize sk's dport and daddr for getpeername() */
1047 inet_sk(sk)->dport = htons(asoc->peer.port);
1048 af = sctp_get_af_specific(to.sa.sa_family);
1049 af->to_sk_daddr(&to, sk);
1050
1051 timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
1052 err = sctp_wait_for_connect(asoc, &timeo);
1053
1054 /* Don't free association on exit. */
1055 asoc = NULL;
1056
1057out_free:
1058
1059 SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p"
1060 " kaddrs: %p err: %d\n",
1061 asoc, kaddrs, err);
1062 if (asoc)
1063 sctp_association_free(asoc);
1064 return err;
1065}
1066
1067/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
1068 *
1069 * API 8.9
1070 * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt);
1071 *
1072 * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
1073 * If the sd is an IPv6 socket, the addresses passed can either be IPv4
1074 * or IPv6 addresses.
1075 *
1076 * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
1077 * Section 3.1.2 for this usage.
1078 *
1079 * addrs is a pointer to an array of one or more socket addresses. Each
1080 * address is contained in its appropriate structure (i.e. struct
1081 * sockaddr_in or struct sockaddr_in6) the family of the address type
 1082 * sockaddr_in or struct sockaddr_in6) the family of the address type
 1083 * must be used to distinguish the address length (note that this
1083 * representation is termed a "packed array" of addresses). The caller
1084 * specifies the number of addresses in the array with addrcnt.
1085 *
1086 * On success, sctp_connectx() returns 0. On failure, sctp_connectx() returns
1087 * -1, and sets errno to the appropriate error code.
1088 *
1089 * For SCTP, the port given in each socket address must be the same, or
1090 * sctp_connectx() will fail, setting errno to EINVAL.
1091 *
1092 * An application can use sctp_connectx to initiate an association with
1093 * an endpoint that is multi-homed. Much like sctp_bindx() this call
1094 * allows a caller to specify multiple addresses at which a peer can be
1095 * reached. The way the SCTP stack uses the list of addresses to set up
 1096 * the association is implementation dependent. This function only
1097 * specifies that the stack will try to make use of all the addresses in
1098 * the list when needed.
1099 *
1100 * Note that the list of addresses passed in is only used for setting up
1101 * the association. It does not necessarily equal the set of addresses
1102 * the peer uses for the resulting association. If the caller wants to
1103 * find out the set of peer addresses, it must use sctp_getpaddrs() to
1104 * retrieve them after the association has been set up.
1105 *
1106 * Basically do nothing but copying the addresses from user to kernel
 1107 * land and invoking __sctp_connect(). This is used for tunneling
1108 * the sctp_connectx() request through sctp_setsockopt() from userspace.
1109 *
1110 * We don't use copy_from_user() for optimization: we first do the
1111 * sanity checks (buffer size -fast- and access check-healthy
1112 * pointer); if all of those succeed, then we can alloc the memory
1113 * (expensive operation) needed to copy the data to kernel. Then we do
1114 * the copying without checking the user space area
1115 * (__copy_from_user()).
1116 *
1117 * On exit there is no need to do sockfd_put(), sys_setsockopt() does
1118 * it.
1119 *
1120 * sk The sk of the socket
1121 * addrs The pointer to the addresses in user land
1122 * addrssize Size of the addrs buffer
1123 *
1124 * Returns 0 if ok, <0 errno code on error.
1125 */
1126SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk,
1127 struct sockaddr __user *addrs,
1128 int addrs_size)
1129{
1130 int err = 0;
1131 struct sockaddr *kaddrs;
1132
1133 SCTP_DEBUG_PRINTK("%s - sk %p addrs %p addrs_size %d\n",
1134 __FUNCTION__, sk, addrs, addrs_size);
1135
1136 if (unlikely(addrs_size <= 0))
1137 return -EINVAL;
1138
1139 /* Check the user passed a healthy pointer. */
1140 if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
1141 return -EFAULT;
1142
1143 /* Alloc space for the address array in kernel memory. */
1144 kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL);
1145 if (unlikely(!kaddrs))
1146 return -ENOMEM;
1147
1148 if (__copy_from_user(kaddrs, addrs, addrs_size)) {
1149 err = -EFAULT;
1150 } else {
1151 err = __sctp_connect(sk, kaddrs, addrs_size);
1152 }
1153
1154 kfree(kaddrs);
1155 return err;
1156}
1157
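For reference, this is roughly how a userspace sctp_connectx() wrapper could feed the kernel path above: total up the packed sockaddr array and hand it to setsockopt() with SCTP_SOCKOPT_CONNECTX, where optlen is the size of the address buffer. A hedged sketch only; IPPROTO_SCTP as the level and the fallback option value are assumptions, not taken from this patch.

/* Hypothetical userspace wrapper for tunneling a connectx request through
 * setsockopt(), mirroring the kernel-side sctp_setsockopt_connectx() above.
 */
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef SCTP_SOCKOPT_CONNECTX
#define SCTP_SOCKOPT_CONNECTX 110       /* illustrative value only */
#endif

static int my_sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt)
{
        char *addr = (char *)addrs;
        int len = 0, i;

        /* addrs is already a "packed array": each entry occupies exactly its
         * family-specific length, so just total them up for optlen. */
        for (i = 0; i < addrcnt; i++) {
                struct sockaddr *sa = (struct sockaddr *)(addr + len);

                if (sa->sa_family == AF_INET)
                        len += sizeof(struct sockaddr_in);
                else if (sa->sa_family == AF_INET6)
                        len += sizeof(struct sockaddr_in6);
                else
                        return -1;
        }
        /* optlen carries the total size of the address buffer. */
        return setsockopt(sd, IPPROTO_SCTP, SCTP_SOCKOPT_CONNECTX, addrs, len);
}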
916/* API 3.1.4 close() - UDP Style Syntax 1158/* API 3.1.4 close() - UDP Style Syntax
917 * Applications use close() to perform graceful shutdown (as described in 1159 * Applications use close() to perform graceful shutdown (as described in
918 * Section 10.1 of [SCTP]) on ALL the associations currently represented 1160 * Section 10.1 of [SCTP]) on ALL the associations currently represented
@@ -1095,7 +1337,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
1095 sp = sctp_sk(sk); 1337 sp = sctp_sk(sk);
1096 ep = sp->ep; 1338 ep = sp->ep;
1097 1339
1098 SCTP_DEBUG_PRINTK("Using endpoint: %s.\n", ep->debug_name); 1340 SCTP_DEBUG_PRINTK("Using endpoint: %p.\n", ep);
1099 1341
1100 /* We cannot send a message over a TCP-style listening socket. */ 1342 /* We cannot send a message over a TCP-style listening socket. */
1101 if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) { 1343 if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
@@ -1306,7 +1548,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
1306 } 1548 }
1307 1549
1308 /* Prime the peer's transport structures. */ 1550 /* Prime the peer's transport structures. */
1309 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL); 1551 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
1310 if (!transport) { 1552 if (!transport) {
1311 err = -ENOMEM; 1553 err = -ENOMEM;
1312 goto out_free; 1554 goto out_free;
@@ -2208,6 +2450,12 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
2208 optlen, SCTP_BINDX_REM_ADDR); 2450 optlen, SCTP_BINDX_REM_ADDR);
2209 break; 2451 break;
2210 2452
2453 case SCTP_SOCKOPT_CONNECTX:
2454 /* 'optlen' is the size of the addresses buffer. */
2455 retval = sctp_setsockopt_connectx(sk, (struct sockaddr __user *)optval,
2456 optlen);
2457 break;
2458
2211 case SCTP_DISABLE_FRAGMENTS: 2459 case SCTP_DISABLE_FRAGMENTS:
2212 retval = sctp_setsockopt_disable_fragments(sk, optval, optlen); 2460 retval = sctp_setsockopt_disable_fragments(sk, optval, optlen);
2213 break; 2461 break;
@@ -2283,112 +2531,29 @@ out_nounlock:
2283 * 2531 *
2284 * len: the size of the address. 2532 * len: the size of the address.
2285 */ 2533 */
2286SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *uaddr, 2534SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *addr,
2287 int addr_len) 2535 int addr_len)
2288{ 2536{
2289 struct sctp_sock *sp;
2290 struct sctp_endpoint *ep;
2291 struct sctp_association *asoc;
2292 struct sctp_transport *transport;
2293 union sctp_addr to;
2294 struct sctp_af *af;
2295 sctp_scope_t scope;
2296 long timeo;
2297 int err = 0; 2537 int err = 0;
2538 struct sctp_af *af;
2298 2539
2299 sctp_lock_sock(sk); 2540 sctp_lock_sock(sk);
2300 2541
2301 SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d)\n", 2542 SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d\n",
2302 __FUNCTION__, sk, uaddr, addr_len); 2543 __FUNCTION__, sk, addr, addr_len);
2303
2304 sp = sctp_sk(sk);
2305 ep = sp->ep;
2306
2307 /* connect() cannot be done on a socket that is already in ESTABLISHED
2308 * state - UDP-style peeled off socket or a TCP-style socket that
2309 * is already connected.
2310 * It cannot be done even on a TCP-style listening socket.
2311 */
2312 if (sctp_sstate(sk, ESTABLISHED) ||
2313 (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
2314 err = -EISCONN;
2315 goto out_unlock;
2316 }
2317
2318 err = sctp_verify_addr(sk, (union sctp_addr *)uaddr, addr_len);
2319 if (err)
2320 goto out_unlock;
2321 2544
2322 if (addr_len > sizeof(to)) 2545 /* Validate addr_len before calling common connect/connectx routine. */
2323 addr_len = sizeof(to); 2546 af = sctp_get_af_specific(addr->sa_family);
2324 memcpy(&to, uaddr, addr_len); 2547 if (!af || addr_len < af->sockaddr_len) {
2325 to.v4.sin_port = ntohs(to.v4.sin_port); 2548 err = -EINVAL;
2326 2549 } else {
2327 asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); 2550 /* Pass correct addr len to common routine (so it knows there
2328 if (asoc) { 2551 * is only one address being passed.
2329 if (asoc->state >= SCTP_STATE_ESTABLISHED) 2552 */
2330 err = -EISCONN; 2553 err = __sctp_connect(sk, addr, af->sockaddr_len);
2331 else
2332 err = -EALREADY;
2333 goto out_unlock;
2334 }
2335
2336 /* If we could not find a matching association on the endpoint,
2337 * make sure that there is no peeled-off association matching the
2338 * peer address even on another socket.
2339 */
2340 if (sctp_endpoint_is_peeled_off(ep, &to)) {
2341 err = -EADDRNOTAVAIL;
2342 goto out_unlock;
2343 }
2344
2345 /* If a bind() or sctp_bindx() is not called prior to a connect()
2346 * call, the system picks an ephemeral port and will choose an address
2347 * set equivalent to binding with a wildcard address.
2348 */
2349 if (!ep->base.bind_addr.port) {
2350 if (sctp_autobind(sk)) {
2351 err = -EAGAIN;
2352 goto out_unlock;
2353 }
2354 }
2355
2356 scope = sctp_scope(&to);
2357 asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
2358 if (!asoc) {
2359 err = -ENOMEM;
2360 goto out_unlock;
2361 }
2362
2363 /* Prime the peer's transport structures. */
2364 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL);
2365 if (!transport) {
2366 sctp_association_free(asoc);
2367 goto out_unlock;
2368 }
2369 err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
2370 if (err < 0) {
2371 sctp_association_free(asoc);
2372 goto out_unlock;
2373 }
2374
2375 err = sctp_primitive_ASSOCIATE(asoc, NULL);
2376 if (err < 0) {
2377 sctp_association_free(asoc);
2378 goto out_unlock;
2379 } 2554 }
2380 2555
2381 /* Initialize sk's dport and daddr for getpeername() */
2382 inet_sk(sk)->dport = htons(asoc->peer.port);
2383 af = sctp_get_af_specific(to.sa.sa_family);
2384 af->to_sk_daddr(&to, sk);
2385
2386 timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
2387 err = sctp_wait_for_connect(asoc, &timeo);
2388
2389out_unlock:
2390 sctp_release_sock(sk); 2556 sctp_release_sock(sk);
2391
2392 return err; 2557 return err;
2393} 2558}
2394 2559
@@ -2677,12 +2842,15 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
2677 /* Map ipv4 address into v4-mapped-on-v6 address. */ 2842 /* Map ipv4 address into v4-mapped-on-v6 address. */
2678 sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), 2843 sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
2679 (union sctp_addr *)&status.sstat_primary.spinfo_address); 2844 (union sctp_addr *)&status.sstat_primary.spinfo_address);
2680 status.sstat_primary.spinfo_state = transport->active; 2845 status.sstat_primary.spinfo_state = transport->state;
2681 status.sstat_primary.spinfo_cwnd = transport->cwnd; 2846 status.sstat_primary.spinfo_cwnd = transport->cwnd;
2682 status.sstat_primary.spinfo_srtt = transport->srtt; 2847 status.sstat_primary.spinfo_srtt = transport->srtt;
2683 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); 2848 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
2684 status.sstat_primary.spinfo_mtu = transport->pmtu; 2849 status.sstat_primary.spinfo_mtu = transport->pmtu;
2685 2850
2851 if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
2852 status.sstat_primary.spinfo_state = SCTP_ACTIVE;
2853
2686 if (put_user(len, optlen)) { 2854 if (put_user(len, optlen)) {
2687 retval = -EFAULT; 2855 retval = -EFAULT;
2688 goto out; 2856 goto out;
@@ -2733,12 +2901,15 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
2733 return -EINVAL; 2901 return -EINVAL;
2734 2902
2735 pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); 2903 pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
2736 pinfo.spinfo_state = transport->active; 2904 pinfo.spinfo_state = transport->state;
2737 pinfo.spinfo_cwnd = transport->cwnd; 2905 pinfo.spinfo_cwnd = transport->cwnd;
2738 pinfo.spinfo_srtt = transport->srtt; 2906 pinfo.spinfo_srtt = transport->srtt;
2739 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); 2907 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
2740 pinfo.spinfo_mtu = transport->pmtu; 2908 pinfo.spinfo_mtu = transport->pmtu;
2741 2909
2910 if (pinfo.spinfo_state == SCTP_UNKNOWN)
2911 pinfo.spinfo_state = SCTP_ACTIVE;
2912
2742 if (put_user(len, optlen)) { 2913 if (put_user(len, optlen)) {
2743 retval = -EFAULT; 2914 retval = -EFAULT;
2744 goto out; 2915 goto out;
@@ -3591,7 +3762,8 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
3591 int retval = 0; 3762 int retval = 0;
3592 int len; 3763 int len;
3593 3764
3594 SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p, ...)\n", sk); 3765 SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p... optname: %d)\n",
3766 sk, optname);
3595 3767
3596 /* I can hardly begin to describe how wrong this is. This is 3768 /* I can hardly begin to describe how wrong this is. This is
3597 * so broken as to be worse than useless. The API draft 3769 * so broken as to be worse than useless. The API draft
@@ -4368,15 +4540,11 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
 4368 * However, this function was correct in any case. 8) 4540
4369 */ 4541 */
4370 if (flags & MSG_PEEK) { 4542 if (flags & MSG_PEEK) {
4371 unsigned long cpu_flags; 4543 spin_lock_bh(&sk->sk_receive_queue.lock);
4372
4373 sctp_spin_lock_irqsave(&sk->sk_receive_queue.lock,
4374 cpu_flags);
4375 skb = skb_peek(&sk->sk_receive_queue); 4544 skb = skb_peek(&sk->sk_receive_queue);
4376 if (skb) 4545 if (skb)
4377 atomic_inc(&skb->users); 4546 atomic_inc(&skb->users);
4378 sctp_spin_unlock_irqrestore(&sk->sk_receive_queue.lock, 4547 spin_unlock_bh(&sk->sk_receive_queue.lock);
4379 cpu_flags);
4380 } else { 4548 } else {
4381 skb = skb_dequeue(&sk->sk_receive_queue); 4549 skb = skb_dequeue(&sk->sk_receive_queue);
4382 } 4550 }
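The MSG_PEEK branch keeps its meaning after the locking change: a peeked datagram stays on the receive queue, so the next read returns the same data. Quick userspace demonstration over a UNIX datagram socketpair (not SCTP-specific).

/* MSG_PEEK leaves the message queued; a second recv() consumes it. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>

int main(void)
{
        int sv[2];
        char buf[16];

        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
                return 1;
        send(sv[0], "hello", 5, 0);

        ssize_t n1 = recv(sv[1], buf, sizeof(buf), MSG_PEEK); /* leaves it queued */
        ssize_t n2 = recv(sv[1], buf, sizeof(buf), 0);        /* consumes it */
        printf("peeked %zd, then read %zd bytes\n", n1, n2);
        return 0;
}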
@@ -4600,8 +4768,7 @@ out:
4600 return err; 4768 return err;
4601 4769
4602do_error: 4770do_error:
4603 if (asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1 >= 4771 if (asoc->init_err_counter + 1 >= asoc->max_init_attempts)
4604 asoc->max_init_attempts)
4605 err = -ETIMEDOUT; 4772 err = -ETIMEDOUT;
4606 else 4773 else
4607 err = -ECONNREFUSED; 4774 err = -ECONNREFUSED;
@@ -4686,6 +4853,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
4686 struct sctp_endpoint *newep = newsp->ep; 4853 struct sctp_endpoint *newep = newsp->ep;
4687 struct sk_buff *skb, *tmp; 4854 struct sk_buff *skb, *tmp;
4688 struct sctp_ulpevent *event; 4855 struct sctp_ulpevent *event;
4856 int flags = 0;
4689 4857
4690 /* Migrate socket buffer sizes and all the socket level options to the 4858 /* Migrate socket buffer sizes and all the socket level options to the
4691 * new socket. 4859 * new socket.
@@ -4707,6 +4875,17 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
4707 sctp_sk(newsk)->bind_hash = pp; 4875 sctp_sk(newsk)->bind_hash = pp;
4708 inet_sk(newsk)->num = inet_sk(oldsk)->num; 4876 inet_sk(newsk)->num = inet_sk(oldsk)->num;
4709 4877
4878 /* Copy the bind_addr list from the original endpoint to the new
4879 * endpoint so that we can handle restarts properly
4880 */
4881 if (assoc->peer.ipv4_address)
4882 flags |= SCTP_ADDR4_PEERSUPP;
4883 if (assoc->peer.ipv6_address)
4884 flags |= SCTP_ADDR6_PEERSUPP;
4885 sctp_bind_addr_copy(&newsp->ep->base.bind_addr,
4886 &oldsp->ep->base.bind_addr,
4887 SCTP_SCOPE_GLOBAL, GFP_KERNEL, flags);
4888
4710 /* Move any messages in the old socket's receive queue that are for the 4889 /* Move any messages in the old socket's receive queue that are for the
4711 * peeled off association to the new socket's receive queue. 4890 * peeled off association to the new socket's receive queue.
4712 */ 4891 */
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index f30882e1e96a..0ec0fde6e6c5 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -83,7 +83,9 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
83 peer->last_time_used = jiffies; 83 peer->last_time_used = jiffies;
84 peer->last_time_ecne_reduced = jiffies; 84 peer->last_time_ecne_reduced = jiffies;
85 85
86 peer->active = SCTP_ACTIVE; 86 peer->init_sent_count = 0;
87
88 peer->state = SCTP_ACTIVE;
87 peer->hb_allowed = 0; 89 peer->hb_allowed = 0;
88 90
89 /* Initialize the default path max_retrans. */ 91 /* Initialize the default path max_retrans. */
diff --git a/net/socket.c b/net/socket.c
index cec0cb38b9ce..6f2a17881972 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -81,6 +81,7 @@
81#include <linux/syscalls.h> 81#include <linux/syscalls.h>
82#include <linux/compat.h> 82#include <linux/compat.h>
83#include <linux/kmod.h> 83#include <linux/kmod.h>
84#include <linux/audit.h>
84 85
85#ifdef CONFIG_NET_RADIO 86#ifdef CONFIG_NET_RADIO
86#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ 87#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
@@ -226,7 +227,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
226 return 0; 227 return 0;
227 if(copy_from_user(kaddr,uaddr,ulen)) 228 if(copy_from_user(kaddr,uaddr,ulen))
228 return -EFAULT; 229 return -EFAULT;
229 return 0; 230 return audit_sockaddr(ulen, kaddr);
230} 231}
231 232
232/** 233/**
@@ -382,9 +383,8 @@ int sock_map_fd(struct socket *sock)
382 goto out; 383 goto out;
383 } 384 }
384 385
385 sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino); 386 this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
386 this.name = name; 387 this.name = name;
387 this.len = strlen(name);
388 this.hash = SOCK_INODE(sock)->i_ino; 388 this.hash = SOCK_INODE(sock)->i_ino;
389 389
390 file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this); 390 file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
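The sock_map_fd() change relies on sprintf() returning the number of characters it wrote, which makes the separate strlen() redundant. A tiny standalone demonstration:

/* sprintf() reports the length it produced, so strlen() adds nothing. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char name[32];
        unsigned long ino = 12345;
        int len = sprintf(name, "[%lu]", ino);

        printf("len=%d strlen=%zu\n", len, strlen(name));  /* both print 7 */
        return 0;
}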
@@ -1906,7 +1906,11 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
1906 /* copy_from_user should be SMP safe. */ 1906 /* copy_from_user should be SMP safe. */
1907 if (copy_from_user(a, args, nargs[call])) 1907 if (copy_from_user(a, args, nargs[call]))
1908 return -EFAULT; 1908 return -EFAULT;
1909 1909
1910 err = audit_socketcall(nargs[call]/sizeof(unsigned long), a);
1911 if (err)
1912 return err;
1913
1910 a0=a[0]; 1914 a0=a[0];
1911 a1=a[1]; 1915 a1=a[1];
1912 1916
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 9bcec9b927b9..505e2d4b3d62 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -66,10 +66,10 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
66 u32 flavor = pseudoflavor_to_flavor(pseudoflavor); 66 u32 flavor = pseudoflavor_to_flavor(pseudoflavor);
67 67
68 if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor])) 68 if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor]))
69 return NULL; 69 return ERR_PTR(-EINVAL);
70 auth = ops->create(clnt, pseudoflavor); 70 auth = ops->create(clnt, pseudoflavor);
71 if (!auth) 71 if (IS_ERR(auth))
72 return NULL; 72 return auth;
73 if (clnt->cl_auth) 73 if (clnt->cl_auth)
74 rpcauth_destroy(clnt->cl_auth); 74 rpcauth_destroy(clnt->cl_auth);
75 clnt->cl_auth = auth; 75 clnt->cl_auth = auth;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a33b627cbef4..2f7b867161d2 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -660,14 +660,16 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
660{ 660{
661 struct gss_auth *gss_auth; 661 struct gss_auth *gss_auth;
662 struct rpc_auth * auth; 662 struct rpc_auth * auth;
663 int err = -ENOMEM; /* XXX? */
663 664
664 dprintk("RPC: creating GSS authenticator for client %p\n",clnt); 665 dprintk("RPC: creating GSS authenticator for client %p\n",clnt);
665 666
666 if (!try_module_get(THIS_MODULE)) 667 if (!try_module_get(THIS_MODULE))
667 return NULL; 668 return ERR_PTR(err);
668 if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) 669 if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
669 goto out_dec; 670 goto out_dec;
670 gss_auth->client = clnt; 671 gss_auth->client = clnt;
672 err = -EINVAL;
671 gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); 673 gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
672 if (!gss_auth->mech) { 674 if (!gss_auth->mech) {
673 printk(KERN_WARNING "%s: Pseudoflavor %d not found!", 675 printk(KERN_WARNING "%s: Pseudoflavor %d not found!",
@@ -675,9 +677,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
675 goto err_free; 677 goto err_free;
676 } 678 }
677 gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); 679 gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
678 /* FIXME: Will go away once privacy support is merged in */ 680 if (gss_auth->service == 0)
679 if (gss_auth->service == RPC_GSS_SVC_PRIVACY) 681 goto err_put_mech;
680 gss_auth->service = RPC_GSS_SVC_INTEGRITY;
681 INIT_LIST_HEAD(&gss_auth->upcalls); 682 INIT_LIST_HEAD(&gss_auth->upcalls);
682 spin_lock_init(&gss_auth->lock); 683 spin_lock_init(&gss_auth->lock);
683 auth = &gss_auth->rpc_auth; 684 auth = &gss_auth->rpc_auth;
@@ -687,15 +688,18 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
687 auth->au_flavor = flavor; 688 auth->au_flavor = flavor;
688 atomic_set(&auth->au_count, 1); 689 atomic_set(&auth->au_count, 1);
689 690
690 if (rpcauth_init_credcache(auth, GSS_CRED_EXPIRE) < 0) 691 err = rpcauth_init_credcache(auth, GSS_CRED_EXPIRE);
692 if (err)
691 goto err_put_mech; 693 goto err_put_mech;
692 694
693 snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", 695 snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s",
694 clnt->cl_pathname, 696 clnt->cl_pathname,
695 gss_auth->mech->gm_name); 697 gss_auth->mech->gm_name);
696 gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); 698 gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
697 if (IS_ERR(gss_auth->dentry)) 699 if (IS_ERR(gss_auth->dentry)) {
700 err = PTR_ERR(gss_auth->dentry);
698 goto err_put_mech; 701 goto err_put_mech;
702 }
699 703
700 return auth; 704 return auth;
701err_put_mech: 705err_put_mech:
@@ -704,7 +708,7 @@ err_free:
704 kfree(gss_auth); 708 kfree(gss_auth);
705out_dec: 709out_dec:
706 module_put(THIS_MODULE); 710 module_put(THIS_MODULE);
707 return NULL; 711 return ERR_PTR(err);
708} 712}
709 713
710static void 714static void
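rpcauth_create() and gss_create() now return ERR_PTR(-errno) instead of NULL, so callers (see the rpc_new_client change below) switch from NULL checks to IS_ERR()/PTR_ERR() and learn why authentication setup failed. A small self-contained sketch of that convention, with stand-in helpers mirroring the kernel macros; the values and the create_auth() function are illustrative only.

/* The ERR_PTR convention: errors travel inside the pointer value itself. */
#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *create_auth(int flavor_ok)
{
        if (!flavor_ok)
                return ERR_PTR(-EINVAL);  /* was: return NULL */
        return (void *)0x1000;            /* stand-in for a real auth object */
}

int main(void)
{
        void *auth = create_auth(0);

        if (IS_ERR(auth))                 /* was: if (!auth) */
                printf("auth failed: %ld\n", PTR_ERR(auth));
        return 0;
}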
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 02bc029d46fe..f17e6153b688 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,12 +97,13 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
97 * made to sleep too long. 97 * made to sleep too long.
98 */ 98 */
99struct rpc_clnt * 99struct rpc_clnt *
100rpc_create_client(struct rpc_xprt *xprt, char *servname, 100rpc_new_client(struct rpc_xprt *xprt, char *servname,
101 struct rpc_program *program, u32 vers, 101 struct rpc_program *program, u32 vers,
102 rpc_authflavor_t flavor) 102 rpc_authflavor_t flavor)
103{ 103{
104 struct rpc_version *version; 104 struct rpc_version *version;
105 struct rpc_clnt *clnt = NULL; 105 struct rpc_clnt *clnt = NULL;
106 struct rpc_auth *auth;
106 int err; 107 int err;
107 int len; 108 int len;
108 109
@@ -157,10 +158,11 @@ rpc_create_client(struct rpc_xprt *xprt, char *servname,
157 if (err < 0) 158 if (err < 0)
158 goto out_no_path; 159 goto out_no_path;
159 160
160 err = -ENOMEM; 161 auth = rpcauth_create(flavor, clnt);
161 if (!rpcauth_create(flavor, clnt)) { 162 if (IS_ERR(auth)) {
162 printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", 163 printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
163 flavor); 164 flavor);
165 err = PTR_ERR(auth);
164 goto out_no_auth; 166 goto out_no_auth;
165 } 167 }
166 168
@@ -178,6 +180,37 @@ out_no_path:
178 kfree(clnt->cl_server); 180 kfree(clnt->cl_server);
179 kfree(clnt); 181 kfree(clnt);
180out_err: 182out_err:
183 xprt_destroy(xprt);
184 return ERR_PTR(err);
185}
186
187/**
188 * Create an RPC client
189 * @xprt - pointer to xprt struct
190 * @servname - name of server
191 * @info - rpc_program
192 * @version - rpc_program version
193 * @authflavor - rpc_auth flavour to use
194 *
195 * Creates an RPC client structure, then pings the server in order to
196 * determine if it is up, and if it supports this program and version.
197 *
198 * This function should never be called by asynchronous tasks such as
199 * the portmapper.
200 */
201struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
202 struct rpc_program *info, u32 version, rpc_authflavor_t authflavor)
203{
204 struct rpc_clnt *clnt;
205 int err;
206
207 clnt = rpc_new_client(xprt, servname, info, version, authflavor);
208 if (IS_ERR(clnt))
209 return clnt;
210 err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
211 if (err == 0)
212 return clnt;
213 rpc_shutdown_client(clnt);
181 return ERR_PTR(err); 214 return ERR_PTR(err);
182} 215}
183 216
@@ -208,6 +241,8 @@ rpc_clone_client(struct rpc_clnt *clnt)
208 rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); 241 rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
209 if (new->cl_auth) 242 if (new->cl_auth)
210 atomic_inc(&new->cl_auth->au_count); 243 atomic_inc(&new->cl_auth->au_count);
244 new->cl_pmap = &new->cl_pmap_default;
245 rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
211 return new; 246 return new;
212out_no_clnt: 247out_no_clnt:
213 printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__); 248 printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
@@ -296,6 +331,44 @@ rpc_release_client(struct rpc_clnt *clnt)
296 rpc_destroy_client(clnt); 331 rpc_destroy_client(clnt);
297} 332}
298 333
334/**
335 * rpc_bind_new_program - bind a new RPC program to an existing client
336 * @old - old rpc_client
337 * @program - rpc program to set
338 * @vers - rpc program version
339 *
340 * Clones the rpc client and sets up a new RPC program. This is mainly
341 * of use for enabling different RPC programs to share the same transport.
342 * The Sun NFSv2/v3 ACL protocol can do this.
343 */
344struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
345 struct rpc_program *program,
346 int vers)
347{
348 struct rpc_clnt *clnt;
349 struct rpc_version *version;
350 int err;
351
352 BUG_ON(vers >= program->nrvers || !program->version[vers]);
353 version = program->version[vers];
354 clnt = rpc_clone_client(old);
355 if (IS_ERR(clnt))
356 goto out;
357 clnt->cl_procinfo = version->procs;
358 clnt->cl_maxproc = version->nrprocs;
359 clnt->cl_protname = program->name;
360 clnt->cl_prog = program->number;
361 clnt->cl_vers = version->number;
362 clnt->cl_stats = program->stats;
363 err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
364 if (err != 0) {
365 rpc_shutdown_client(clnt);
366 clnt = ERR_PTR(err);
367 }
368out:
369 return clnt;
370}
371
299/* 372/*
300 * Default callback for async RPC calls 373 * Default callback for async RPC calls
301 */ 374 */
@@ -305,38 +378,41 @@ rpc_default_callback(struct rpc_task *task)
305} 378}
306 379
307/* 380/*
308 * Export the signal mask handling for aysnchronous code that 381 * Export the signal mask handling for synchronous code that
309 * sleeps on RPC calls 382 * sleeps on RPC calls
310 */ 383 */
384#define RPC_INTR_SIGNALS (sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGKILL))
311 385
386static void rpc_save_sigmask(sigset_t *oldset, int intr)
387{
388 unsigned long sigallow = 0;
389 sigset_t sigmask;
390
391 /* Block all signals except those listed in sigallow */
392 if (intr)
393 sigallow |= RPC_INTR_SIGNALS;
394 siginitsetinv(&sigmask, sigallow);
395 sigprocmask(SIG_BLOCK, &sigmask, oldset);
396}
397
398static inline void rpc_task_sigmask(struct rpc_task *task, sigset_t *oldset)
399{
400 rpc_save_sigmask(oldset, !RPC_TASK_UNINTERRUPTIBLE(task));
401}
402
403static inline void rpc_restore_sigmask(sigset_t *oldset)
404{
405 sigprocmask(SIG_SETMASK, oldset, NULL);
406}
407
312void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset) 408void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
313{ 409{
314 unsigned long sigallow = sigmask(SIGKILL); 410 rpc_save_sigmask(oldset, clnt->cl_intr);
315 unsigned long irqflags;
316
317 /* Turn off various signals */
318 if (clnt->cl_intr) {
319 struct k_sigaction *action = current->sighand->action;
320 if (action[SIGINT-1].sa.sa_handler == SIG_DFL)
321 sigallow |= sigmask(SIGINT);
322 if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL)
323 sigallow |= sigmask(SIGQUIT);
324 }
325 spin_lock_irqsave(&current->sighand->siglock, irqflags);
326 *oldset = current->blocked;
327 siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]);
328 recalc_sigpending();
329 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
330} 411}
331 412
332void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset) 413void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
333{ 414{
334 unsigned long irqflags; 415 rpc_restore_sigmask(oldset);
335
336 spin_lock_irqsave(&current->sighand->siglock, irqflags);
337 current->blocked = *oldset;
338 recalc_sigpending();
339 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
340} 416}
341 417
342/* 418/*
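The new rpc_save_sigmask()/rpc_restore_sigmask() pair blocks every signal except SIGKILL (plus SIGINT and SIGQUIT when the call is interruptible) for the duration of the call, then restores the caller's mask. A plain-POSIX analogue of the same idea, a sketch rather than the kernel helpers themselves:

/* Block everything but the "interrupt" set around a blocking call. */
#include <signal.h>
#include <unistd.h>

static void blocking_call_with_mask(int interruptible)
{
        sigset_t mask, oldset;

        sigfillset(&mask);              /* start from "block everything"   */
        if (interruptible) {            /* ...then re-allow the intr set   */
                sigdelset(&mask, SIGINT);
                sigdelset(&mask, SIGQUIT);
        }
        /* SIGKILL cannot be blocked anyway, so it is always allowed. */
        sigprocmask(SIG_BLOCK, &mask, &oldset);

        sleep(1);                       /* stand-in for the blocking RPC   */

        sigprocmask(SIG_SETMASK, &oldset, NULL);
}

int main(void)
{
        blocking_call_with_mask(1);
        return 0;
}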
@@ -354,26 +430,26 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
354 430
355 BUG_ON(flags & RPC_TASK_ASYNC); 431 BUG_ON(flags & RPC_TASK_ASYNC);
356 432
357 rpc_clnt_sigmask(clnt, &oldset);
358
359 status = -ENOMEM; 433 status = -ENOMEM;
360 task = rpc_new_task(clnt, NULL, flags); 434 task = rpc_new_task(clnt, NULL, flags);
361 if (task == NULL) 435 if (task == NULL)
362 goto out; 436 goto out;
363 437
438 /* Mask signals on RPC calls _and_ GSS_AUTH upcalls */
439 rpc_task_sigmask(task, &oldset);
440
364 rpc_call_setup(task, msg, 0); 441 rpc_call_setup(task, msg, 0);
365 442
366 /* Set up the call info struct and execute the task */ 443 /* Set up the call info struct and execute the task */
367 if (task->tk_status == 0) 444 if (task->tk_status == 0) {
368 status = rpc_execute(task); 445 status = rpc_execute(task);
369 else { 446 } else {
370 status = task->tk_status; 447 status = task->tk_status;
371 rpc_release_task(task); 448 rpc_release_task(task);
372 } 449 }
373 450
451 rpc_restore_sigmask(&oldset);
374out: 452out:
375 rpc_clnt_sigunmask(clnt, &oldset);
376
377 return status; 453 return status;
378} 454}
379 455
@@ -394,8 +470,6 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
394 470
395 flags |= RPC_TASK_ASYNC; 471 flags |= RPC_TASK_ASYNC;
396 472
397 rpc_clnt_sigmask(clnt, &oldset);
398
399 /* Create/initialize a new RPC task */ 473 /* Create/initialize a new RPC task */
400 if (!callback) 474 if (!callback)
401 callback = rpc_default_callback; 475 callback = rpc_default_callback;
@@ -404,6 +478,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
404 goto out; 478 goto out;
405 task->tk_calldata = data; 479 task->tk_calldata = data;
406 480
481 /* Mask signals on GSS_AUTH upcalls */
482 rpc_task_sigmask(task, &oldset);
483
407 rpc_call_setup(task, msg, 0); 484 rpc_call_setup(task, msg, 0);
408 485
409 /* Set up the call info struct and execute the task */ 486 /* Set up the call info struct and execute the task */
@@ -413,9 +490,8 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
413 else 490 else
414 rpc_release_task(task); 491 rpc_release_task(task);
415 492
493 rpc_restore_sigmask(&oldset);
416out: 494out:
417 rpc_clnt_sigunmask(clnt, &oldset);
418
419 return status; 495 return status;
420} 496}
421 497
@@ -593,7 +669,7 @@ call_allocate(struct rpc_task *task)
593 return; 669 return;
594 printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); 670 printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task);
595 671
596 if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) { 672 if (RPC_IS_ASYNC(task) || !signalled()) {
597 xprt_release(task); 673 xprt_release(task);
598 task->tk_action = call_reserve; 674 task->tk_action = call_reserve;
599 rpc_delay(task, HZ>>4); 675 rpc_delay(task, HZ>>4);
@@ -957,7 +1033,9 @@ call_header(struct rpc_task *task)
957 *p++ = htonl(clnt->cl_prog); /* program number */ 1033 *p++ = htonl(clnt->cl_prog); /* program number */
958 *p++ = htonl(clnt->cl_vers); /* program version */ 1034 *p++ = htonl(clnt->cl_vers); /* program version */
959 *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */ 1035 *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */
960 return rpcauth_marshcred(task, p); 1036 p = rpcauth_marshcred(task, p);
1037 req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
1038 return p;
961} 1039}
962 1040
963/* 1041/*
@@ -986,10 +1064,11 @@ call_verify(struct rpc_task *task)
986 case RPC_AUTH_ERROR: 1064 case RPC_AUTH_ERROR:
987 break; 1065 break;
988 case RPC_MISMATCH: 1066 case RPC_MISMATCH:
989 printk(KERN_WARNING "%s: RPC call version mismatch!\n", __FUNCTION__); 1067 dprintk("%s: RPC call version mismatch!\n", __FUNCTION__);
990 goto out_eio; 1068 error = -EPROTONOSUPPORT;
1069 goto out_err;
991 default: 1070 default:
992 printk(KERN_WARNING "%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n); 1071 dprintk("%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
993 goto out_eio; 1072 goto out_eio;
994 } 1073 }
995 if (--len < 0) 1074 if (--len < 0)
@@ -1040,23 +1119,26 @@ call_verify(struct rpc_task *task)
1040 case RPC_SUCCESS: 1119 case RPC_SUCCESS:
1041 return p; 1120 return p;
1042 case RPC_PROG_UNAVAIL: 1121 case RPC_PROG_UNAVAIL:
1043 printk(KERN_WARNING "RPC: call_verify: program %u is unsupported by server %s\n", 1122 dprintk("RPC: call_verify: program %u is unsupported by server %s\n",
1044 (unsigned int)task->tk_client->cl_prog, 1123 (unsigned int)task->tk_client->cl_prog,
1045 task->tk_client->cl_server); 1124 task->tk_client->cl_server);
1046 goto out_eio; 1125 error = -EPFNOSUPPORT;
1126 goto out_err;
1047 case RPC_PROG_MISMATCH: 1127 case RPC_PROG_MISMATCH:
1048 printk(KERN_WARNING "RPC: call_verify: program %u, version %u unsupported by server %s\n", 1128 dprintk("RPC: call_verify: program %u, version %u unsupported by server %s\n",
1049 (unsigned int)task->tk_client->cl_prog, 1129 (unsigned int)task->tk_client->cl_prog,
1050 (unsigned int)task->tk_client->cl_vers, 1130 (unsigned int)task->tk_client->cl_vers,
1051 task->tk_client->cl_server); 1131 task->tk_client->cl_server);
1052 goto out_eio; 1132 error = -EPROTONOSUPPORT;
1133 goto out_err;
1053 case RPC_PROC_UNAVAIL: 1134 case RPC_PROC_UNAVAIL:
1054 printk(KERN_WARNING "RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n", 1135 dprintk("RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
1055 task->tk_msg.rpc_proc, 1136 task->tk_msg.rpc_proc,
1056 task->tk_client->cl_prog, 1137 task->tk_client->cl_prog,
1057 task->tk_client->cl_vers, 1138 task->tk_client->cl_vers,
1058 task->tk_client->cl_server); 1139 task->tk_client->cl_server);
1059 goto out_eio; 1140 error = -EOPNOTSUPP;
1141 goto out_err;
1060 case RPC_GARBAGE_ARGS: 1142 case RPC_GARBAGE_ARGS:
1061 dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__); 1143 dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__);
1062 break; /* retry */ 1144 break; /* retry */
@@ -1069,7 +1151,7 @@ out_retry:
1069 task->tk_client->cl_stats->rpcgarbage++; 1151 task->tk_client->cl_stats->rpcgarbage++;
1070 if (task->tk_garb_retry) { 1152 if (task->tk_garb_retry) {
1071 task->tk_garb_retry--; 1153 task->tk_garb_retry--;
1072 dprintk(KERN_WARNING "RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid); 1154 dprintk("RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
1073 task->tk_action = call_bind; 1155 task->tk_action = call_bind;
1074 return NULL; 1156 return NULL;
1075 } 1157 }
@@ -1083,3 +1165,30 @@ out_overflow:
1083 printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__); 1165 printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__);
1084 goto out_retry; 1166 goto out_retry;
1085} 1167}
1168
1169static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj)
1170{
1171 return 0;
1172}
1173
1174static int rpcproc_decode_null(void *rqstp, u32 *data, void *obj)
1175{
1176 return 0;
1177}
1178
1179static struct rpc_procinfo rpcproc_null = {
1180 .p_encode = rpcproc_encode_null,
1181 .p_decode = rpcproc_decode_null,
1182};
1183
1184int rpc_ping(struct rpc_clnt *clnt, int flags)
1185{
1186 struct rpc_message msg = {
1187 .rpc_proc = &rpcproc_null,
1188 };
1189 int err;
1190 msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
1191 err = rpc_call_sync(clnt, &msg, flags);
1192 put_rpccred(msg.rpc_cred);
1193 return err;
1194}
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index d0b1d2c34a4d..4e81f2766923 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -53,6 +53,9 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
53 task->tk_pid, clnt->cl_server, 53 task->tk_pid, clnt->cl_server,
54 map->pm_prog, map->pm_vers, map->pm_prot); 54 map->pm_prog, map->pm_vers, map->pm_prot);
55 55
56 /* Autobind on cloned rpc clients is discouraged */
57 BUG_ON(clnt->cl_parent != clnt);
58
56 spin_lock(&pmap_lock); 59 spin_lock(&pmap_lock);
57 if (map->pm_binding) { 60 if (map->pm_binding) {
58 rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL); 61 rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL);
@@ -207,12 +210,10 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
207 xprt->addr.sin_port = htons(RPC_PMAP_PORT); 210 xprt->addr.sin_port = htons(RPC_PMAP_PORT);
208 211
209 /* printk("pmap: create clnt\n"); */ 212 /* printk("pmap: create clnt\n"); */
210 clnt = rpc_create_client(xprt, hostname, 213 clnt = rpc_new_client(xprt, hostname,
211 &pmap_program, RPC_PMAP_VERSION, 214 &pmap_program, RPC_PMAP_VERSION,
212 RPC_AUTH_UNIX); 215 RPC_AUTH_UNIX);
213 if (IS_ERR(clnt)) { 216 if (!IS_ERR(clnt)) {
214 xprt_destroy(xprt);
215 } else {
216 clnt->cl_softrtry = 1; 217 clnt->cl_softrtry = 1;
217 clnt->cl_chatty = 1; 218 clnt->cl_chatty = 1;
218 clnt->cl_oneshot = 1; 219 clnt->cl_oneshot = 1;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index c06614d0e31d..2d9eb7fbd521 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -290,7 +290,7 @@ static void rpc_make_runnable(struct rpc_task *task)
290 return; 290 return;
291 } 291 }
292 } else 292 } else
293 wake_up(&task->u.tk_wait.waitq); 293 wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
294} 294}
295 295
296/* 296/*
@@ -555,6 +555,38 @@ __rpc_atrun(struct rpc_task *task)
555} 555}
556 556
557/* 557/*
558 * Helper that calls task->tk_exit if it exists and then returns
559 * true if we should exit __rpc_execute.
560 */
561static inline int __rpc_do_exit(struct rpc_task *task)
562{
563 if (task->tk_exit != NULL) {
564 lock_kernel();
565 task->tk_exit(task);
566 unlock_kernel();
567 /* If tk_action is non-null, we should restart the call */
568 if (task->tk_action != NULL) {
569 if (!RPC_ASSASSINATED(task)) {
570 /* Release RPC slot and buffer memory */
571 xprt_release(task);
572 rpc_free(task);
573 return 0;
574 }
575 printk(KERN_ERR "RPC: dead task tried to walk away.\n");
576 }
577 }
578 return 1;
579}
580
581static int rpc_wait_bit_interruptible(void *word)
582{
583 if (signal_pending(current))
584 return -ERESTARTSYS;
585 schedule();
586 return 0;
587}
588
589/*
558 * This is the RPC `scheduler' (or rather, the finite state machine). 590 * This is the RPC `scheduler' (or rather, the finite state machine).
559 */ 591 */
560static int __rpc_execute(struct rpc_task *task) 592static int __rpc_execute(struct rpc_task *task)
@@ -566,8 +598,7 @@ static int __rpc_execute(struct rpc_task *task)
566 598
567 BUG_ON(RPC_IS_QUEUED(task)); 599 BUG_ON(RPC_IS_QUEUED(task));
568 600
569 restarted: 601 for (;;) {
570 while (1) {
571 /* 602 /*
572 * Garbage collection of pending timers... 603 * Garbage collection of pending timers...
573 */ 604 */
@@ -600,11 +631,12 @@ static int __rpc_execute(struct rpc_task *task)
600 * by someone else. 631 * by someone else.
601 */ 632 */
602 if (!RPC_IS_QUEUED(task)) { 633 if (!RPC_IS_QUEUED(task)) {
603 if (!task->tk_action) 634 if (task->tk_action != NULL) {
635 lock_kernel();
636 task->tk_action(task);
637 unlock_kernel();
638 } else if (__rpc_do_exit(task))
604 break; 639 break;
605 lock_kernel();
606 task->tk_action(task);
607 unlock_kernel();
608 } 640 }
609 641
610 /* 642 /*
@@ -624,44 +656,26 @@ static int __rpc_execute(struct rpc_task *task)
624 656
625 /* sync task: sleep here */ 657 /* sync task: sleep here */
626 dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); 658 dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
627 if (RPC_TASK_UNINTERRUPTIBLE(task)) { 659 /* Note: Caller should be using rpc_clnt_sigmask() */
628 __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task)); 660 status = out_of_line_wait_on_bit(&task->tk_runstate,
629 } else { 661 RPC_TASK_QUEUED, rpc_wait_bit_interruptible,
630 __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status); 662 TASK_INTERRUPTIBLE);
663 if (status == -ERESTARTSYS) {
631 /* 664 /*
632 * When a sync task receives a signal, it exits with 665 * When a sync task receives a signal, it exits with
633 * -ERESTARTSYS. In order to catch any callbacks that 666 * -ERESTARTSYS. In order to catch any callbacks that
634 * clean up after sleeping on some queue, we don't 667 * clean up after sleeping on some queue, we don't
635 * break the loop here, but go around once more. 668 * break the loop here, but go around once more.
636 */ 669 */
637 if (status == -ERESTARTSYS) { 670 dprintk("RPC: %4d got signal\n", task->tk_pid);
638 dprintk("RPC: %4d got signal\n", task->tk_pid); 671 task->tk_flags |= RPC_TASK_KILLED;
639 task->tk_flags |= RPC_TASK_KILLED; 672 rpc_exit(task, -ERESTARTSYS);
640 rpc_exit(task, -ERESTARTSYS); 673 rpc_wake_up_task(task);
641 rpc_wake_up_task(task);
642 }
643 } 674 }
644 rpc_set_running(task); 675 rpc_set_running(task);
645 dprintk("RPC: %4d sync task resuming\n", task->tk_pid); 676 dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
646 } 677 }
647 678
648 if (task->tk_exit) {
649 lock_kernel();
650 task->tk_exit(task);
651 unlock_kernel();
652 /* If tk_action is non-null, the user wants us to restart */
653 if (task->tk_action) {
654 if (!RPC_ASSASSINATED(task)) {
655 /* Release RPC slot and buffer memory */
656 if (task->tk_rqstp)
657 xprt_release(task);
658 rpc_free(task);
659 goto restarted;
660 }
661 printk(KERN_ERR "RPC: dead task tries to walk away.\n");
662 }
663 }
664
665 dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status); 679 dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status);
666 status = task->tk_status; 680 status = task->tk_status;
667 681
@@ -759,8 +773,6 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
759 773
760 /* Initialize workqueue for async tasks */ 774 /* Initialize workqueue for async tasks */
761 task->tk_workqueue = rpciod_workqueue; 775 task->tk_workqueue = rpciod_workqueue;
762 if (!RPC_IS_ASYNC(task))
763 init_waitqueue_head(&task->u.tk_wait.waitq);
764 776
765 if (clnt) { 777 if (clnt) {
766 atomic_inc(&clnt->cl_users); 778 atomic_inc(&clnt->cl_users);
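The sched.c hunks above drop the per-task waitqueue for synchronous tasks: __rpc_execute() now sleeps on the RPC_TASK_QUEUED bit of tk_runstate via out_of_line_wait_on_bit(), with rpc_wait_bit_interruptible() as the wake-up action, so a pending signal surfaces as -ERESTARTSYS and the task is marked RPC_TASK_KILLED. The following is a rough userspace-only sketch of that wait-on-bit contract (the fake "signal" flag, counters and names are illustrative, not the kernel implementation):

#include <stdio.h>

/* Stand-ins for the kernel's task state: one "queued" bit and a fake
 * pending-signal flag.  In the kernel these are tk_runstate and
 * signal_pending(current). */
static int task_queued = 1;
static int signal_pending_flag = 0;
static int wakeups_left = 3;

/* Analogue of rpc_wait_bit_interruptible(): abort with an error if a
 * signal is pending, otherwise "sleep" once and return 0 so the bit
 * gets re-tested. */
static int wait_bit_interruptible(void)
{
	if (signal_pending_flag)
		return -1;		/* stands in for -ERESTARTSYS */
	if (--wakeups_left == 0)	/* pretend someone cleared the bit */
		task_queued = 0;
	return 0;
}

/* Analogue of out_of_line_wait_on_bit(): keep calling the action until
 * either the bit clears (status 0) or the action asks to abort. */
static int wait_on_bit(int (*action)(void))
{
	int ret = 0;

	while (task_queued && ret == 0)
		ret = action();
	return ret;
}

int main(void)
{
	int status = wait_on_bit(wait_bit_interruptible);

	printf("wait finished, status=%d, queued=%d\n", status, task_queued);
	return 0;
}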
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d4f26bf9e732..62a073495276 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,7 +41,9 @@ EXPORT_SYMBOL(rpc_release_task);
41 41
42/* RPC client functions */ 42/* RPC client functions */
43EXPORT_SYMBOL(rpc_create_client); 43EXPORT_SYMBOL(rpc_create_client);
44EXPORT_SYMBOL(rpc_new_client);
44EXPORT_SYMBOL(rpc_clone_client); 45EXPORT_SYMBOL(rpc_clone_client);
46EXPORT_SYMBOL(rpc_bind_new_program);
45EXPORT_SYMBOL(rpc_destroy_client); 47EXPORT_SYMBOL(rpc_destroy_client);
46EXPORT_SYMBOL(rpc_shutdown_client); 48EXPORT_SYMBOL(rpc_shutdown_client);
47EXPORT_SYMBOL(rpc_release_client); 49EXPORT_SYMBOL(rpc_release_client);
@@ -61,7 +63,6 @@ EXPORT_SYMBOL(rpc_mkpipe);
61 63
62/* Client transport */ 64/* Client transport */
63EXPORT_SYMBOL(xprt_create_proto); 65EXPORT_SYMBOL(xprt_create_proto);
64EXPORT_SYMBOL(xprt_destroy);
65EXPORT_SYMBOL(xprt_set_timeout); 66EXPORT_SYMBOL(xprt_set_timeout);
66EXPORT_SYMBOL(xprt_udp_slot_table_entries); 67EXPORT_SYMBOL(xprt_udp_slot_table_entries);
67EXPORT_SYMBOL(xprt_tcp_slot_table_entries); 68EXPORT_SYMBOL(xprt_tcp_slot_table_entries);
@@ -129,6 +130,10 @@ EXPORT_SYMBOL(xdr_encode_netobj);
129EXPORT_SYMBOL(xdr_encode_pages); 130EXPORT_SYMBOL(xdr_encode_pages);
130EXPORT_SYMBOL(xdr_inline_pages); 131EXPORT_SYMBOL(xdr_inline_pages);
131EXPORT_SYMBOL(xdr_shift_buf); 132EXPORT_SYMBOL(xdr_shift_buf);
133EXPORT_SYMBOL(xdr_encode_word);
134EXPORT_SYMBOL(xdr_decode_word);
135EXPORT_SYMBOL(xdr_encode_array2);
136EXPORT_SYMBOL(xdr_decode_array2);
132EXPORT_SYMBOL(xdr_buf_from_iov); 137EXPORT_SYMBOL(xdr_buf_from_iov);
133EXPORT_SYMBOL(xdr_buf_subsegment); 138EXPORT_SYMBOL(xdr_buf_subsegment);
134EXPORT_SYMBOL(xdr_buf_read_netobj); 139EXPORT_SYMBOL(xdr_buf_read_netobj);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bb2d99f33315..e9bd91265f70 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -35,20 +35,24 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
35 if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) 35 if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL)))
36 return NULL; 36 return NULL;
37 memset(serv, 0, sizeof(*serv)); 37 memset(serv, 0, sizeof(*serv));
38 serv->sv_name = prog->pg_name;
38 serv->sv_program = prog; 39 serv->sv_program = prog;
39 serv->sv_nrthreads = 1; 40 serv->sv_nrthreads = 1;
40 serv->sv_stats = prog->pg_stats; 41 serv->sv_stats = prog->pg_stats;
41 serv->sv_bufsz = bufsize? bufsize : 4096; 42 serv->sv_bufsz = bufsize? bufsize : 4096;
42 prog->pg_lovers = prog->pg_nvers-1;
43 xdrsize = 0; 43 xdrsize = 0;
44 for (vers=0; vers<prog->pg_nvers ; vers++) 44 while (prog) {
45 if (prog->pg_vers[vers]) { 45 prog->pg_lovers = prog->pg_nvers-1;
46 prog->pg_hivers = vers; 46 for (vers=0; vers<prog->pg_nvers ; vers++)
47 if (prog->pg_lovers > vers) 47 if (prog->pg_vers[vers]) {
48 prog->pg_lovers = vers; 48 prog->pg_hivers = vers;
49 if (prog->pg_vers[vers]->vs_xdrsize > xdrsize) 49 if (prog->pg_lovers > vers)
50 xdrsize = prog->pg_vers[vers]->vs_xdrsize; 50 prog->pg_lovers = vers;
51 } 51 if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
52 xdrsize = prog->pg_vers[vers]->vs_xdrsize;
53 }
54 prog = prog->pg_next;
55 }
52 serv->sv_xdrsize = xdrsize; 56 serv->sv_xdrsize = xdrsize;
53 INIT_LIST_HEAD(&serv->sv_threads); 57 INIT_LIST_HEAD(&serv->sv_threads);
54 INIT_LIST_HEAD(&serv->sv_sockets); 58 INIT_LIST_HEAD(&serv->sv_sockets);
@@ -56,8 +60,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
56 INIT_LIST_HEAD(&serv->sv_permsocks); 60 INIT_LIST_HEAD(&serv->sv_permsocks);
57 spin_lock_init(&serv->sv_lock); 61 spin_lock_init(&serv->sv_lock);
58 62
59 serv->sv_name = prog->pg_name;
60
61 /* Remove any stale portmap registrations */ 63 /* Remove any stale portmap registrations */
62 svc_register(serv, 0, 0); 64 svc_register(serv, 0, 0);
63 65
@@ -281,6 +283,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
281 rqstp->rq_res.len = 0; 283 rqstp->rq_res.len = 0;
282 rqstp->rq_res.page_base = 0; 284 rqstp->rq_res.page_base = 0;
283 rqstp->rq_res.page_len = 0; 285 rqstp->rq_res.page_len = 0;
286 rqstp->rq_res.buflen = PAGE_SIZE;
284 rqstp->rq_res.tail[0].iov_len = 0; 287 rqstp->rq_res.tail[0].iov_len = 0;
285 /* tcp needs a space for the record length... */ 288 /* tcp needs a space for the record length... */
286 if (rqstp->rq_prot == IPPROTO_TCP) 289 if (rqstp->rq_prot == IPPROTO_TCP)
@@ -338,7 +341,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
338 goto sendit; 341 goto sendit;
339 } 342 }
340 343
341 if (prog != progp->pg_prog) 344 for (progp = serv->sv_program; progp; progp = progp->pg_next)
345 if (prog == progp->pg_prog)
346 break;
347 if (progp == NULL)
342 goto err_bad_prog; 348 goto err_bad_prog;
343 349
344 if (vers >= progp->pg_nvers || 350 if (vers >= progp->pg_nvers ||
@@ -451,11 +457,7 @@ err_bad_auth:
451 goto sendit; 457 goto sendit;
452 458
453err_bad_prog: 459err_bad_prog:
454#ifdef RPC_PARANOIA 460 dprintk("svc: unknown program %d\n", prog);
455 if (prog != 100227 || progp->pg_prog != 100003)
456 printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog);
457 /* else it is just a Solaris client seeing if ACLs are supported */
458#endif
459 serv->sv_stats->rpcbadfmt++; 461 serv->sv_stats->rpcbadfmt++;
460 svc_putu32(resv, rpc_prog_unavail); 462 svc_putu32(resv, rpc_prog_unavail);
461 goto sendit; 463 goto sendit;
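With the svc.c change above, svc_create() walks the pg_next chain so a single service can register several RPC programs, and svc_process() looks the incoming program number up in that list instead of comparing against one hard-wired program. A minimal userspace sketch of that lookup-by-program-number pattern follows (the struct layout is illustrative; the sample numbers 100003/100227 are the nfsd and ACL program numbers mentioned in the removed RPC_PARANOIA comment):

#include <stdio.h>
#include <stddef.h>

/* Toy version of a singly linked program list keyed by program number. */
struct prog {
	unsigned int	pg_prog;	/* RPC program number */
	const char	*pg_name;
	struct prog	*pg_next;
};

static struct prog nfsacl = { 100227, "nfsacl", NULL };
static struct prog nfsd   = { 100003, "nfsd", &nfsacl };

/* Same shape as the new svc_process() loop: scan the chain, stop at the
 * first matching program number; NULL means "program unavailable". */
static struct prog *find_program(struct prog *head, unsigned int prog)
{
	struct prog *p;

	for (p = head; p; p = p->pg_next)
		if (p->pg_prog == prog)
			break;
	return p;
}

int main(void)
{
	struct prog *p = find_program(&nfsd, 100227);

	printf("100227 -> %s\n", p ? p->pg_name : "rpc_prog_unavail");
	p = find_program(&nfsd, 100005);
	printf("100005 -> %s\n", p ? p->pg_name : "rpc_prog_unavail");
	return 0;
}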
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2b99b4028d31..d6baf6fdf8a9 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -8,6 +8,7 @@
8#include <linux/err.h> 8#include <linux/err.h>
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/string.h>
11 12
12#define RPCDBG_FACILITY RPCDBG_AUTH 13#define RPCDBG_FACILITY RPCDBG_AUTH
13 14
@@ -20,14 +21,6 @@
20 */ 21 */
21 22
22 23
23static char *strdup(char *s)
24{
25 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
26 if (rv)
27 strcpy(rv, s);
28 return rv;
29}
30
31struct unix_domain { 24struct unix_domain {
32 struct auth_domain h; 25 struct auth_domain h;
33 int addr_changes; 26 int addr_changes;
@@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name)
55 if (new == NULL) 48 if (new == NULL)
56 return NULL; 49 return NULL;
57 cache_init(&new->h.h); 50 cache_init(&new->h.h);
58 new->h.name = strdup(name); 51 new->h.name = kstrdup(name, GFP_KERNEL);
59 new->h.flavour = RPC_AUTH_UNIX; 52 new->h.flavour = RPC_AUTH_UNIX;
60 new->addr_changes = 0; 53 new->addr_changes = 0;
61 new->h.h.expiry_time = NEVER; 54 new->h.h.expiry_time = NEVER;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc96..56db8f13e6cb 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1185 arg->page_len = (pages-2)*PAGE_SIZE; 1185 arg->page_len = (pages-2)*PAGE_SIZE;
1186 arg->len = (pages-1)*PAGE_SIZE; 1186 arg->len = (pages-1)*PAGE_SIZE;
1187 arg->tail[0].iov_len = 0; 1187 arg->tail[0].iov_len = 0;
1188 1188
1189 try_to_freeze(PF_FREEZE); 1189 try_to_freeze();
1190 if (signalled()) 1190 if (signalled())
1191 return -EINTR; 1191 return -EINTR;
1192 1192
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1227 1227
1228 schedule_timeout(timeout); 1228 schedule_timeout(timeout);
1229 1229
1230 try_to_freeze(PF_FREEZE); 1230 try_to_freeze();
1231 1231
1232 spin_lock_bh(&serv->sv_lock); 1232 spin_lock_bh(&serv->sv_lock);
1233 remove_wait_queue(&rqstp->rq_wait, &wait); 1233 remove_wait_queue(&rqstp->rq_wait, &wait);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 67b9f035ba86..8a4d9c106af1 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,21 +176,23 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
176 xdr->buflen += len; 176 xdr->buflen += len;
177} 177}
178 178
179void 179ssize_t
180xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, 180xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
181 skb_reader_t *desc, 181 skb_reader_t *desc,
182 skb_read_actor_t copy_actor) 182 skb_read_actor_t copy_actor)
183{ 183{
184 struct page **ppage = xdr->pages; 184 struct page **ppage = xdr->pages;
185 unsigned int len, pglen = xdr->page_len; 185 unsigned int len, pglen = xdr->page_len;
186 ssize_t copied = 0;
186 int ret; 187 int ret;
187 188
188 len = xdr->head[0].iov_len; 189 len = xdr->head[0].iov_len;
189 if (base < len) { 190 if (base < len) {
190 len -= base; 191 len -= base;
191 ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); 192 ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
193 copied += ret;
192 if (ret != len || !desc->count) 194 if (ret != len || !desc->count)
193 return; 195 goto out;
194 base = 0; 196 base = 0;
195 } else 197 } else
196 base -= len; 198 base -= len;
@@ -210,6 +212,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
210 do { 212 do {
211 char *kaddr; 213 char *kaddr;
212 214
215 /* ACL likes to be lazy in allocating pages - ACLs
216 * are small by default but can get huge. */
217 if (unlikely(*ppage == NULL)) {
218 *ppage = alloc_page(GFP_ATOMIC);
219 if (unlikely(*ppage == NULL)) {
220 if (copied == 0)
221 copied = -ENOMEM;
222 goto out;
223 }
224 }
225
213 len = PAGE_CACHE_SIZE; 226 len = PAGE_CACHE_SIZE;
214 kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); 227 kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
215 if (base) { 228 if (base) {
@@ -225,14 +238,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
225 } 238 }
226 flush_dcache_page(*ppage); 239 flush_dcache_page(*ppage);
227 kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); 240 kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
241 copied += ret;
228 if (ret != len || !desc->count) 242 if (ret != len || !desc->count)
229 return; 243 goto out;
230 ppage++; 244 ppage++;
231 } while ((pglen -= len) != 0); 245 } while ((pglen -= len) != 0);
232copy_tail: 246copy_tail:
233 len = xdr->tail[0].iov_len; 247 len = xdr->tail[0].iov_len;
234 if (base < len) 248 if (base < len)
235 copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); 249 copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
250out:
251 return copied;
236} 252}
237 253
238 254
@@ -616,12 +632,24 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
616void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p) 632void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p)
617{ 633{
618 struct kvec *iov = buf->head; 634 struct kvec *iov = buf->head;
635 int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
619 636
637 BUG_ON(scratch_len < 0);
620 xdr->buf = buf; 638 xdr->buf = buf;
621 xdr->iov = iov; 639 xdr->iov = iov;
622 xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len); 640 xdr->p = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
623 buf->len = iov->iov_len = (char *)p - (char *)iov->iov_base; 641 xdr->end = (uint32_t *)((char *)iov->iov_base + scratch_len);
624 xdr->p = p; 642 BUG_ON(iov->iov_len > scratch_len);
643
644 if (p != xdr->p && p != NULL) {
645 size_t len;
646
647 BUG_ON(p < xdr->p || p > xdr->end);
648 len = (char *)p - (char *)xdr->p;
649 xdr->p = p;
650 buf->len += len;
651 iov->iov_len += len;
652 }
625} 653}
626EXPORT_SYMBOL(xdr_init_encode); 654EXPORT_SYMBOL(xdr_init_encode);
627 655
@@ -859,8 +887,34 @@ out:
859 return status; 887 return status;
860} 888}
861 889
862static int 890/* obj is assumed to point to allocated memory of size at least len: */
863read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) 891int
892write_bytes_to_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
893{
894 struct xdr_buf subbuf;
895 int this_len;
896 int status;
897
898 status = xdr_buf_subsegment(buf, &subbuf, base, len);
899 if (status)
900 goto out;
901 this_len = min(len, (int)subbuf.head[0].iov_len);
902 memcpy(subbuf.head[0].iov_base, obj, this_len);
903 len -= this_len;
904 obj += this_len;
905 this_len = min(len, (int)subbuf.page_len);
906 if (this_len)
907 _copy_to_pages(subbuf.pages, subbuf.page_base, obj, this_len);
908 len -= this_len;
909 obj += this_len;
910 this_len = min(len, (int)subbuf.tail[0].iov_len);
911 memcpy(subbuf.tail[0].iov_base, obj, this_len);
912out:
913 return status;
914}
915
916int
917xdr_decode_word(struct xdr_buf *buf, int base, u32 *obj)
864{ 918{
865 u32 raw; 919 u32 raw;
866 int status; 920 int status;
@@ -872,6 +926,14 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
872 return 0; 926 return 0;
873} 927}
874 928
929int
930xdr_encode_word(struct xdr_buf *buf, int base, u32 obj)
931{
932 u32 raw = htonl(obj);
933
934 return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
935}
936
875/* If the netobj starting offset bytes from the start of xdr_buf is contained 937/* If the netobj starting offset bytes from the start of xdr_buf is contained
876 * entirely in the head or the tail, set object to point to it; otherwise 938 * entirely in the head or the tail, set object to point to it; otherwise
877 * try to find space for it at the end of the tail, copy it there, and 939 * try to find space for it at the end of the tail, copy it there, and
@@ -882,7 +944,7 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
882 u32 tail_offset = buf->head[0].iov_len + buf->page_len; 944 u32 tail_offset = buf->head[0].iov_len + buf->page_len;
883 u32 obj_end_offset; 945 u32 obj_end_offset;
884 946
885 if (read_u32_from_xdr_buf(buf, offset, &obj->len)) 947 if (xdr_decode_word(buf, offset, &obj->len))
886 goto out; 948 goto out;
887 obj_end_offset = offset + 4 + obj->len; 949 obj_end_offset = offset + 4 + obj->len;
888 950
@@ -915,3 +977,219 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
915out: 977out:
916 return -1; 978 return -1;
917} 979}
980
981/* Returns 0 on success, or else a negative error code. */
982static int
983xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
984 struct xdr_array2_desc *desc, int encode)
985{
986 char *elem = NULL, *c;
987 unsigned int copied = 0, todo, avail_here;
988 struct page **ppages = NULL;
989 int err;
990
991 if (encode) {
992 if (xdr_encode_word(buf, base, desc->array_len) != 0)
993 return -EINVAL;
994 } else {
995 if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
996 (unsigned long) base + 4 + desc->array_len *
997 desc->elem_size > buf->len)
998 return -EINVAL;
999 }
1000 base += 4;
1001
1002 if (!desc->xcode)
1003 return 0;
1004
1005 todo = desc->array_len * desc->elem_size;
1006
1007 /* process head */
1008 if (todo && base < buf->head->iov_len) {
1009 c = buf->head->iov_base + base;
1010 avail_here = min_t(unsigned int, todo,
1011 buf->head->iov_len - base);
1012 todo -= avail_here;
1013
1014 while (avail_here >= desc->elem_size) {
1015 err = desc->xcode(desc, c);
1016 if (err)
1017 goto out;
1018 c += desc->elem_size;
1019 avail_here -= desc->elem_size;
1020 }
1021 if (avail_here) {
1022 if (!elem) {
1023 elem = kmalloc(desc->elem_size, GFP_KERNEL);
1024 err = -ENOMEM;
1025 if (!elem)
1026 goto out;
1027 }
1028 if (encode) {
1029 err = desc->xcode(desc, elem);
1030 if (err)
1031 goto out;
1032 memcpy(c, elem, avail_here);
1033 } else
1034 memcpy(elem, c, avail_here);
1035 copied = avail_here;
1036 }
1037 base = buf->head->iov_len; /* align to start of pages */
1038 }
1039
1040 /* process pages array */
1041 base -= buf->head->iov_len;
1042 if (todo && base < buf->page_len) {
1043 unsigned int avail_page;
1044
1045 avail_here = min(todo, buf->page_len - base);
1046 todo -= avail_here;
1047
1048 base += buf->page_base;
1049 ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
1050 base &= ~PAGE_CACHE_MASK;
1051 avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
1052 avail_here);
1053 c = kmap(*ppages) + base;
1054
1055 while (avail_here) {
1056 avail_here -= avail_page;
1057 if (copied || avail_page < desc->elem_size) {
1058 unsigned int l = min(avail_page,
1059 desc->elem_size - copied);
1060 if (!elem) {
1061 elem = kmalloc(desc->elem_size,
1062 GFP_KERNEL);
1063 err = -ENOMEM;
1064 if (!elem)
1065 goto out;
1066 }
1067 if (encode) {
1068 if (!copied) {
1069 err = desc->xcode(desc, elem);
1070 if (err)
1071 goto out;
1072 }
1073 memcpy(c, elem + copied, l);
1074 copied += l;
1075 if (copied == desc->elem_size)
1076 copied = 0;
1077 } else {
1078 memcpy(elem + copied, c, l);
1079 copied += l;
1080 if (copied == desc->elem_size) {
1081 err = desc->xcode(desc, elem);
1082 if (err)
1083 goto out;
1084 copied = 0;
1085 }
1086 }
1087 avail_page -= l;
1088 c += l;
1089 }
1090 while (avail_page >= desc->elem_size) {
1091 err = desc->xcode(desc, c);
1092 if (err)
1093 goto out;
1094 c += desc->elem_size;
1095 avail_page -= desc->elem_size;
1096 }
1097 if (avail_page) {
1098 unsigned int l = min(avail_page,
1099 desc->elem_size - copied);
1100 if (!elem) {
1101 elem = kmalloc(desc->elem_size,
1102 GFP_KERNEL);
1103 err = -ENOMEM;
1104 if (!elem)
1105 goto out;
1106 }
1107 if (encode) {
1108 if (!copied) {
1109 err = desc->xcode(desc, elem);
1110 if (err)
1111 goto out;
1112 }
1113 memcpy(c, elem + copied, l);
1114 copied += l;
1115 if (copied == desc->elem_size)
1116 copied = 0;
1117 } else {
1118 memcpy(elem + copied, c, l);
1119 copied += l;
1120 if (copied == desc->elem_size) {
1121 err = desc->xcode(desc, elem);
1122 if (err)
1123 goto out;
1124 copied = 0;
1125 }
1126 }
1127 }
1128 if (avail_here) {
1129 kunmap(*ppages);
1130 ppages++;
1131 c = kmap(*ppages);
1132 }
1133
1134 avail_page = min(avail_here,
1135 (unsigned int) PAGE_CACHE_SIZE);
1136 }
1137 base = buf->page_len; /* align to start of tail */
1138 }
1139
1140 /* process tail */
1141 base -= buf->page_len;
1142 if (todo) {
1143 c = buf->tail->iov_base + base;
1144 if (copied) {
1145 unsigned int l = desc->elem_size - copied;
1146
1147 if (encode)
1148 memcpy(c, elem + copied, l);
1149 else {
1150 memcpy(elem + copied, c, l);
1151 err = desc->xcode(desc, elem);
1152 if (err)
1153 goto out;
1154 }
1155 todo -= l;
1156 c += l;
1157 }
1158 while (todo) {
1159 err = desc->xcode(desc, c);
1160 if (err)
1161 goto out;
1162 c += desc->elem_size;
1163 todo -= desc->elem_size;
1164 }
1165 }
1166 err = 0;
1167
1168out:
1169 if (elem)
1170 kfree(elem);
1171 if (ppages)
1172 kunmap(*ppages);
1173 return err;
1174}
1175
1176int
1177xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
1178 struct xdr_array2_desc *desc)
1179{
1180 if (base >= buf->len)
1181 return -EINVAL;
1182
1183 return xdr_xcode_array2(buf, base, desc, 0);
1184}
1185
1186int
1187xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
1188 struct xdr_array2_desc *desc)
1189{
1190 if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
1191 buf->head->iov_len + buf->page_len + buf->tail->iov_len)
1192 return -EINVAL;
1193
1194 return xdr_xcode_array2(buf, base, desc, 1);
1195}
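The xdr.c hunks above export xdr_encode_word()/xdr_decode_word(), thin helpers that write or read a 32-bit big-endian XDR word at a byte offset inside an xdr_buf (via write_bytes_to_xdr_buf() and the existing subsegment machinery). The sketch below is a plain userspace illustration of that on-the-wire word convention over a flat buffer; it is not the kernel helpers themselves, and the flat buffer merely stands in for the head/pages/tail layout of a real struct xdr_buf:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Store a 32-bit value big-endian (network order) at a byte offset. */
static int encode_word(unsigned char *buf, size_t buflen, size_t base, uint32_t val)
{
	uint32_t raw = htonl(val);		/* XDR words are big-endian */

	if (base + sizeof(raw) > buflen)
		return -1;
	memcpy(buf + base, &raw, sizeof(raw));
	return 0;
}

/* Read it back and convert to host order. */
static int decode_word(const unsigned char *buf, size_t buflen, size_t base, uint32_t *val)
{
	uint32_t raw;

	if (base + sizeof(raw) > buflen)
		return -1;
	memcpy(&raw, buf + base, sizeof(raw));
	*val = ntohl(raw);
	return 0;
}

int main(void)
{
	unsigned char buf[16] = { 0 };
	uint32_t out = 0;

	encode_word(buf, sizeof(buf), 4, 100003);	/* e.g. an RPC program number */
	decode_word(buf, sizeof(buf), 4, &out);
	printf("decoded %u\n", out);
	return 0;
}

xdr_xcode_array2() builds on the same word helpers: the array length is encoded or decoded as one such word at the base offset, and the fixed-size elements that follow are then walked across head, pages and tail.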
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c74a6bb94074..269f217918a3 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -569,8 +569,11 @@ void xprt_connect(struct rpc_task *task)
569 if (xprt->sock != NULL) 569 if (xprt->sock != NULL)
570 schedule_delayed_work(&xprt->sock_connect, 570 schedule_delayed_work(&xprt->sock_connect,
571 RPC_REESTABLISH_TIMEOUT); 571 RPC_REESTABLISH_TIMEOUT);
572 else 572 else {
573 schedule_work(&xprt->sock_connect); 573 schedule_work(&xprt->sock_connect);
574 if (!RPC_IS_ASYNC(task))
575 flush_scheduled_work();
576 }
574 } 577 }
575 return; 578 return;
576 out_write: 579 out_write:
@@ -725,7 +728,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
725 goto no_checksum; 728 goto no_checksum;
726 729
727 desc.csum = csum_partial(skb->data, desc.offset, skb->csum); 730 desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
728 xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits); 731 if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
732 return -1;
729 if (desc.offset != skb->len) { 733 if (desc.offset != skb->len) {
730 unsigned int csum2; 734 unsigned int csum2;
731 csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); 735 csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
@@ -737,7 +741,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
737 return -1; 741 return -1;
738 return 0; 742 return 0;
739no_checksum: 743no_checksum:
740 xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits); 744 if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
745 return -1;
741 if (desc.count) 746 if (desc.count)
742 return -1; 747 return -1;
743 return 0; 748 return 0;
@@ -821,10 +826,15 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
821{ 826{
822 if (len > desc->count) 827 if (len > desc->count)
823 len = desc->count; 828 len = desc->count;
824 if (skb_copy_bits(desc->skb, desc->offset, p, len)) 829 if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
830 dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
831 len, desc->count);
825 return 0; 832 return 0;
833 }
826 desc->offset += len; 834 desc->offset += len;
827 desc->count -= len; 835 desc->count -= len;
836 dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
837 len, desc->count);
828 return len; 838 return len;
829} 839}
830 840
@@ -863,6 +873,8 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
863static void 873static void
864tcp_check_recm(struct rpc_xprt *xprt) 874tcp_check_recm(struct rpc_xprt *xprt)
865{ 875{
876 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
877 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
866 if (xprt->tcp_offset == xprt->tcp_reclen) { 878 if (xprt->tcp_offset == xprt->tcp_reclen) {
867 xprt->tcp_flags |= XPRT_COPY_RECM; 879 xprt->tcp_flags |= XPRT_COPY_RECM;
868 xprt->tcp_offset = 0; 880 xprt->tcp_offset = 0;
@@ -907,6 +919,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
907 struct rpc_rqst *req; 919 struct rpc_rqst *req;
908 struct xdr_buf *rcvbuf; 920 struct xdr_buf *rcvbuf;
909 size_t len; 921 size_t len;
922 ssize_t r;
910 923
911 /* Find and lock the request corresponding to this xid */ 924 /* Find and lock the request corresponding to this xid */
912 spin_lock(&xprt->sock_lock); 925 spin_lock(&xprt->sock_lock);
@@ -927,15 +940,40 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
927 len = xprt->tcp_reclen - xprt->tcp_offset; 940 len = xprt->tcp_reclen - xprt->tcp_offset;
928 memcpy(&my_desc, desc, sizeof(my_desc)); 941 memcpy(&my_desc, desc, sizeof(my_desc));
929 my_desc.count = len; 942 my_desc.count = len;
930 xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, 943 r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
931 &my_desc, tcp_copy_data); 944 &my_desc, tcp_copy_data);
932 desc->count -= len; 945 desc->count -= r;
933 desc->offset += len; 946 desc->offset += r;
934 } else 947 } else
935 xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, 948 r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
936 desc, tcp_copy_data); 949 desc, tcp_copy_data);
937 xprt->tcp_copied += len; 950
938 xprt->tcp_offset += len; 951 if (r > 0) {
952 xprt->tcp_copied += r;
953 xprt->tcp_offset += r;
954 }
955 if (r != len) {
956 /* Error when copying to the receive buffer,
957 * usually because we weren't able to allocate
958 * additional buffer pages. All we can do now
959 * is turn off XPRT_COPY_DATA, so the request
960 * will not receive any additional updates,
961 * and time out.
962 * Any remaining data from this record will
963 * be discarded.
964 */
965 xprt->tcp_flags &= ~XPRT_COPY_DATA;
966 dprintk("RPC: XID %08x truncated request\n",
967 ntohl(xprt->tcp_xid));
968 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
969 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
970 goto out;
971 }
972
973 dprintk("RPC: XID %08x read %Zd bytes\n",
974 ntohl(xprt->tcp_xid), r);
975 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
976 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
939 977
940 if (xprt->tcp_copied == req->rq_private_buf.buflen) 978 if (xprt->tcp_copied == req->rq_private_buf.buflen)
941 xprt->tcp_flags &= ~XPRT_COPY_DATA; 979 xprt->tcp_flags &= ~XPRT_COPY_DATA;
@@ -944,6 +982,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
944 xprt->tcp_flags &= ~XPRT_COPY_DATA; 982 xprt->tcp_flags &= ~XPRT_COPY_DATA;
945 } 983 }
946 984
985out:
947 if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { 986 if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
948 dprintk("RPC: %4d received reply complete\n", 987 dprintk("RPC: %4d received reply complete\n",
949 req->rq_task->tk_pid); 988 req->rq_task->tk_pid);
@@ -967,6 +1006,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
967 desc->count -= len; 1006 desc->count -= len;
968 desc->offset += len; 1007 desc->offset += len;
969 xprt->tcp_offset += len; 1008 xprt->tcp_offset += len;
1009 dprintk("RPC: discarded %Zu bytes\n", len);
970 tcp_check_recm(xprt); 1010 tcp_check_recm(xprt);
971} 1011}
972 1012
@@ -1064,8 +1104,7 @@ tcp_state_change(struct sock *sk)
1064 case TCP_SYN_RECV: 1104 case TCP_SYN_RECV:
1065 break; 1105 break;
1066 default: 1106 default:
1067 if (xprt_test_and_clear_connected(xprt)) 1107 xprt_disconnect(xprt);
1068 rpc_wake_up_status(&xprt->pending, -ENOTCONN);
1069 break; 1108 break;
1070 } 1109 }
1071 out: 1110 out:
@@ -1203,6 +1242,8 @@ xprt_transmit(struct rpc_task *task)
1203 list_add_tail(&req->rq_list, &xprt->recv); 1242 list_add_tail(&req->rq_list, &xprt->recv);
1204 spin_unlock_bh(&xprt->sock_lock); 1243 spin_unlock_bh(&xprt->sock_lock);
1205 xprt_reset_majortimeo(req); 1244 xprt_reset_majortimeo(req);
1245 /* Turn off autodisconnect */
1246 del_singleshot_timer_sync(&xprt->timer);
1206 } 1247 }
1207 } else if (!req->rq_bytes_sent) 1248 } else if (!req->rq_bytes_sent)
1208 return; 1249 return;
@@ -1333,8 +1374,6 @@ xprt_reserve(struct rpc_task *task)
1333 spin_lock(&xprt->xprt_lock); 1374 spin_lock(&xprt->xprt_lock);
1334 do_xprt_reserve(task); 1375 do_xprt_reserve(task);
1335 spin_unlock(&xprt->xprt_lock); 1376 spin_unlock(&xprt->xprt_lock);
1336 if (task->tk_rqstp)
1337 del_timer_sync(&xprt->timer);
1338 } 1377 }
1339} 1378}
1340 1379
@@ -1649,6 +1688,10 @@ xprt_shutdown(struct rpc_xprt *xprt)
1649 rpc_wake_up(&xprt->backlog); 1688 rpc_wake_up(&xprt->backlog);
1650 wake_up(&xprt->cong_wait); 1689 wake_up(&xprt->cong_wait);
1651 del_timer_sync(&xprt->timer); 1690 del_timer_sync(&xprt->timer);
1691
1692 /* synchronously wait for connect worker to finish */
1693 cancel_delayed_work(&xprt->sock_connect);
1694 flush_scheduled_work();
1652} 1695}
1653 1696
1654/* 1697/*
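The xprt.c hunks above make the TCP receive path honour the new ssize_t return of xdr_partial_copy_from_skb(): offsets are advanced only by the number of bytes actually copied, and a short or negative result clears XPRT_COPY_DATA so the truncated request stops receiving updates and simply times out. A small userspace sketch of that "advance by what was really copied" bookkeeping (buffer sizes and names are made up for illustration):

#include <stdio.h>
#include <string.h>

/* Copy up to want bytes into dst, but pretend the destination only has
 * room bytes available — the analogue of running out of receive pages.
 * Returns the number of bytes copied, like the reworked
 * xdr_partial_copy_from_skb(). */
static long partial_copy(char *dst, size_t room, const char *src, size_t want)
{
	size_t n = want < room ? want : room;

	memcpy(dst, src, n);
	return (long)n;
}

int main(void)
{
	const char record[] = "0123456789abcdef";	/* 16-byte "record" */
	char rcvbuf[10];
	size_t offset = 0, copied = 0, reclen = sizeof(record) - 1;
	int copy_data = 1;

	long r = partial_copy(rcvbuf, sizeof(rcvbuf), record, reclen);
	if (r > 0) {
		copied += r;
		offset += r;
	}
	if ((size_t)r != reclen) {
		/* short copy: stop expecting more data, let it time out */
		copy_data = 0;
		printf("truncated: copied=%zu of %zu\n", copied, reclen);
	}
	printf("offset=%zu copy_data=%d\n", offset, copy_data);
	return 0;
}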
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 2a24b243b841..04bec047fa9a 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -29,6 +29,10 @@
29 * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN 29 * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN
30 * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to 30 * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to
31 * x25_proc.c, using seq_file 31 * x25_proc.c, using seq_file
32 * 2005-04-02 Shaun Pereira Selective sub address matching
33 * with call user data
34 * 2005-04-15 Shaun Pereira Fast select with no restriction on
35 * response
32 */ 36 */
33 37
34#include <linux/config.h> 38#include <linux/config.h>
@@ -219,7 +223,8 @@ static void x25_insert_socket(struct sock *sk)
219 * Note: if a listening socket has cud set it must only get calls 223 * Note: if a listening socket has cud set it must only get calls
220 * with matching cud. 224 * with matching cud.
221 */ 225 */
222static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata) 226static struct sock *x25_find_listener(struct x25_address *addr,
227 struct sk_buff *skb)
223{ 228{
224 struct sock *s; 229 struct sock *s;
225 struct sock *next_best; 230 struct sock *next_best;
@@ -230,22 +235,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu
230 235
231 sk_for_each(s, node, &x25_list) 236 sk_for_each(s, node, &x25_list)
232 if ((!strcmp(addr->x25_addr, 237 if ((!strcmp(addr->x25_addr,
233 x25_sk(s)->source_addr.x25_addr) || 238 x25_sk(s)->source_addr.x25_addr) ||
234 !strcmp(addr->x25_addr, 239 !strcmp(addr->x25_addr,
235 null_x25_address.x25_addr)) && 240 null_x25_address.x25_addr)) &&
236 s->sk_state == TCP_LISTEN) { 241 s->sk_state == TCP_LISTEN) {
237
238 /* 242 /*
239 * Found a listening socket, now check the incoming 243 * Found a listening socket, now check the incoming
240 * call user data vs this sockets call user data 244 * call user data vs this sockets call user data
241 */ 245 */
242 if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) { 246 if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) {
243 sock_hold(s); 247 if((memcmp(x25_sk(s)->calluserdata.cuddata,
244 goto found; 248 skb->data,
245 } 249 x25_sk(s)->cudmatchlength)) == 0) {
246 if (x25_sk(s)->calluserdata.cudlength == 0) { 250 sock_hold(s);
251 goto found;
252 }
253 } else
247 next_best = s; 254 next_best = s;
248 }
249 } 255 }
250 if (next_best) { 256 if (next_best) {
251 s = next_best; 257 s = next_best;
@@ -497,6 +503,9 @@ static int x25_create(struct socket *sock, int protocol)
497 x25->t23 = sysctl_x25_clear_request_timeout; 503 x25->t23 = sysctl_x25_clear_request_timeout;
498 x25->t2 = sysctl_x25_ack_holdback_timeout; 504 x25->t2 = sysctl_x25_ack_holdback_timeout;
499 x25->state = X25_STATE_0; 505 x25->state = X25_STATE_0;
506 x25->cudmatchlength = 0;
507 x25->accptapprv = X25_DENY_ACCPT_APPRV; /* normally no cud */
508 /* on call accept */
500 509
501 x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; 510 x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE;
502 x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; 511 x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -545,6 +554,8 @@ static struct sock *x25_make_new(struct sock *osk)
545 x25->t2 = ox25->t2; 554 x25->t2 = ox25->t2;
546 x25->facilities = ox25->facilities; 555 x25->facilities = ox25->facilities;
547 x25->qbitincl = ox25->qbitincl; 556 x25->qbitincl = ox25->qbitincl;
557 x25->cudmatchlength = ox25->cudmatchlength;
558 x25->accptapprv = ox25->accptapprv;
548 559
549 x25_init_timers(sk); 560 x25_init_timers(sk);
550out: 561out:
@@ -822,7 +833,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
822 struct x25_sock *makex25; 833 struct x25_sock *makex25;
823 struct x25_address source_addr, dest_addr; 834 struct x25_address source_addr, dest_addr;
824 struct x25_facilities facilities; 835 struct x25_facilities facilities;
825 struct x25_calluserdata calluserdata;
826 int len, rc; 836 int len, rc;
827 837
828 /* 838 /*
@@ -845,19 +855,10 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
845 skb_pull(skb,len); 855 skb_pull(skb,len);
846 856
847 /* 857 /*
848 * Incoming Call User Data.
849 */
850 if (skb->len >= 0) {
851 memcpy(calluserdata.cuddata, skb->data, skb->len);
852 calluserdata.cudlength = skb->len;
853 }
854
855 skb_push(skb,len);
856
857 /*
858 * Find a listener for the particular address/cud pair. 858 * Find a listener for the particular address/cud pair.
859 */ 859 */
860 sk = x25_find_listener(&source_addr,&calluserdata); 860 sk = x25_find_listener(&source_addr,skb);
861 skb_push(skb,len);
861 862
862 /* 863 /*
863 * We can't accept the Call Request. 864 * We can't accept the Call Request.
@@ -900,11 +901,23 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
900 makex25->neighbour = nb; 901 makex25->neighbour = nb;
901 makex25->facilities = facilities; 902 makex25->facilities = facilities;
902 makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; 903 makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
903 makex25->calluserdata = calluserdata; 904 /* ensure no reverse facil on accept */
904 905 makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
905 x25_write_internal(make, X25_CALL_ACCEPTED); 906 makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
907
 908 /* Normally all calls are accepted immediately */
909 if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) {
910 x25_write_internal(make, X25_CALL_ACCEPTED);
911 makex25->state = X25_STATE_3;
912 }
906 913
907 makex25->state = X25_STATE_3; 914 /*
915 * Incoming Call User Data.
916 */
917 if (skb->len >= 0) {
918 memcpy(makex25->calluserdata.cuddata, skb->data, skb->len);
919 makex25->calluserdata.cudlength = skb->len;
920 }
908 921
909 sk->sk_ack_backlog++; 922 sk->sk_ack_backlog++;
910 923
@@ -1288,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1288 if (facilities.throughput < 0x03 || 1301 if (facilities.throughput < 0x03 ||
1289 facilities.throughput > 0xDD) 1302 facilities.throughput > 0xDD)
1290 break; 1303 break;
1291 if (facilities.reverse && facilities.reverse != 1) 1304 if (facilities.reverse &&
1305 (facilities.reverse | 0x81)!= 0x81)
1292 break; 1306 break;
1293 x25->facilities = facilities; 1307 x25->facilities = facilities;
1294 rc = 0; 1308 rc = 0;
@@ -1325,6 +1339,44 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1325 break; 1339 break;
1326 } 1340 }
1327 1341
1342 case SIOCX25SCUDMATCHLEN: {
1343 struct x25_subaddr sub_addr;
1344 rc = -EINVAL;
1345 if(sk->sk_state != TCP_CLOSE)
1346 break;
1347 rc = -EFAULT;
1348 if (copy_from_user(&sub_addr, argp,
1349 sizeof(sub_addr)))
1350 break;
1351 rc = -EINVAL;
1352 if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
1353 break;
1354 x25->cudmatchlength = sub_addr.cudmatchlength;
1355 rc = 0;
1356 break;
1357 }
1358
1359 case SIOCX25CALLACCPTAPPRV: {
1360 rc = -EINVAL;
1361 if (sk->sk_state != TCP_CLOSE)
1362 break;
1363 x25->accptapprv = X25_ALLOW_ACCPT_APPRV;
1364 rc = 0;
1365 break;
1366 }
1367
1368 case SIOCX25SENDCALLACCPT: {
1369 rc = -EINVAL;
1370 if (sk->sk_state != TCP_ESTABLISHED)
1371 break;
1372 if (x25->accptapprv) /* must call accptapprv above */
1373 break;
1374 x25_write_internal(sk, X25_CALL_ACCEPTED);
1375 x25->state = X25_STATE_3;
1376 rc = 0;
1377 break;
1378 }
1379
1328 default: 1380 default:
1329 rc = dev_ioctl(cmd, argp); 1381 rc = dev_ioctl(cmd, argp);
1330 break; 1382 break;
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21bdb95f9a8..54278b962f4c 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -17,6 +17,8 @@
17 * X.25 001 Split from x25_subr.c 17 * X.25 001 Split from x25_subr.c
18 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities 18 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities
19 * negotiation. 19 * negotiation.
20 * apr/14/05 Shaun Pereira - Allow fast select with no restriction
21 * on response.
20 */ 22 */
21 23
22#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb,
43 case X25_FAC_CLASS_A: 45 case X25_FAC_CLASS_A:
44 switch (*p) { 46 switch (*p) {
45 case X25_FAC_REVERSE: 47 case X25_FAC_REVERSE:
46 facilities->reverse = p[1] & 0x01; 48 if((p[1] & 0x81) == 0x81) {
47 *vc_fac_mask |= X25_MASK_REVERSE; 49 facilities->reverse = p[1] & 0x81;
48 break; 50 *vc_fac_mask |= X25_MASK_REVERSE;
51 break;
52 }
53
54 if((p[1] & 0x01) == 0x01) {
55 facilities->reverse = p[1] & 0x01;
56 *vc_fac_mask |= X25_MASK_REVERSE;
57 break;
58 }
59
60 if((p[1] & 0x80) == 0x80) {
61 facilities->reverse = p[1] & 0x80;
62 *vc_fac_mask |= X25_MASK_REVERSE;
63 break;
64 }
65
66 if(p[1] == 0x00) {
67 facilities->reverse
68 = X25_DEFAULT_REVERSE;
69 *vc_fac_mask |= X25_MASK_REVERSE;
70 break;
71 }
72
49 case X25_FAC_THROUGHPUT: 73 case X25_FAC_THROUGHPUT:
50 facilities->throughput = p[1]; 74 facilities->throughput = p[1];
51 *vc_fac_mask |= X25_MASK_THROUGHPUT; 75 *vc_fac_mask |= X25_MASK_THROUGHPUT;
@@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer,
122 146
123 if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) { 147 if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) {
124 *p++ = X25_FAC_REVERSE; 148 *p++ = X25_FAC_REVERSE;
125 *p++ = !!facilities->reverse; 149 *p++ = facilities->reverse;
126 } 150 }
127 151
128 if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) { 152 if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) {
@@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
171 /* 195 /*
172 * They want reverse charging, we won't accept it. 196 * They want reverse charging, we won't accept it.
173 */ 197 */
174 if (theirs.reverse && ours->reverse) { 198 if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) {
175 SOCK_DEBUG(sk, "X.25: rejecting reverse charging request"); 199 SOCK_DEBUG(sk, "X.25: rejecting reverse charging request");
176 return -1; 200 return -1;
177 } 201 }
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 183fea3bba67..7fd872ad0c20 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -19,6 +19,8 @@
19 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities 19 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities
20 * negotiation. 20 * negotiation.
21 * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups 21 * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups
22 * apr/04/15 Shaun Pereira Fast select with no
23 * restriction on response.
22 */ 24 */
23 25
24#include <linux/kernel.h> 26#include <linux/kernel.h>
@@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype)
127 len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN + 129 len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN +
128 X25_MAX_CUD_LEN; 130 X25_MAX_CUD_LEN;
129 break; 131 break;
130 case X25_CALL_ACCEPTED: 132 case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
131 len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; 133 if(x25->facilities.reverse & 0x80) {
134 len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
135 } else {
136 len += 1 + X25_MAX_FAC_LEN;
137 }
132 break; 138 break;
133 case X25_CLEAR_REQUEST: 139 case X25_CLEAR_REQUEST:
134 case X25_RESET_REQUEST: 140 case X25_RESET_REQUEST:
@@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype)
203 x25->vc_facil_mask); 209 x25->vc_facil_mask);
204 dptr = skb_put(skb, len); 210 dptr = skb_put(skb, len);
205 memcpy(dptr, facilities, len); 211 memcpy(dptr, facilities, len);
206 dptr = skb_put(skb, x25->calluserdata.cudlength); 212
207 memcpy(dptr, x25->calluserdata.cuddata, 213 /* fast select with no restriction on response
208 x25->calluserdata.cudlength); 214 allows call user data. Userland must
215 ensure it is ours and not theirs */
216 if(x25->facilities.reverse & 0x80) {
217 dptr = skb_put(skb,
218 x25->calluserdata.cudlength);
219 memcpy(dptr, x25->calluserdata.cuddata,
220 x25->calluserdata.cudlength);
221 }
209 x25->calluserdata.cudlength = 0; 222 x25->calluserdata.cudlength = 0;
210 break; 223 break;
211 224
@@ -354,21 +367,3 @@ void x25_check_rbuf(struct sock *sk)
354 } 367 }
355} 368}
356 369
357/*
358 * Compare 2 calluserdata structures, used to find correct listening sockets
359 * when call user data is used.
360 */
361int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs)
362{
363 int i;
364 if (ours->cudlength != theirs->cudlength)
365 return 0;
366
367 for (i=0;i<ours->cudlength;i++) {
368 if (ours->cuddata[i] != theirs->cuddata[i]) {
369 return 0;
370 }
371 }
372 return 1;
373}
374
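In the X.25 changes above, the "reverse" facility octet is no longer squashed to a single bit: 0x01 still means reverse charging, 0x80 requests fast select with no restriction on response (which is also what gates sending call user data in the X25_CALL_ACCEPTED path of x25_subr.c), and 0x81 carries both. A tiny userspace sketch decoding that Class-A facility octet in the same order as the new x25_parse_facilities() code (the helper name and the 0x00 default passed in main are illustrative):

#include <stdio.h>

#define FAC_REVERSE_CHARGING	0x01
#define FAC_FAST_SELECT		0x80

/* Mirror the new parsing order: keep the whole octet when both bits are
 * set, otherwise keep whichever single bit is present; 0x00 falls back
 * to the caller-supplied default. */
static unsigned char parse_reverse_octet(unsigned char p1, unsigned char dflt)
{
	if ((p1 & 0x81) == 0x81)
		return p1 & 0x81;
	if (p1 & FAC_REVERSE_CHARGING)
		return p1 & FAC_REVERSE_CHARGING;
	if (p1 & FAC_FAST_SELECT)
		return p1 & FAC_FAST_SELECT;
	return dflt;
}

int main(void)
{
	unsigned char vals[] = { 0x00, 0x01, 0x80, 0x81 };
	unsigned int i;

	for (i = 0; i < sizeof(vals); i++) {
		unsigned char r = parse_reverse_octet(vals[i], 0x00);

		printf("octet 0x%02x -> reverse=0x%02x fast-select=%s\n",
		       vals[i], r, (r & FAC_FAST_SELECT) ? "yes" : "no");
	}
	return 0;
}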
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d07f5ce31824..d65ed8684fc1 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -118,7 +118,6 @@ retry:
118 xfrm_policy_put_afinfo(afinfo); 118 xfrm_policy_put_afinfo(afinfo);
119 return type; 119 return type;
120} 120}
121EXPORT_SYMBOL(xfrm_get_type);
122 121
123int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, 122int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
124 unsigned short family) 123 unsigned short family)
@@ -216,8 +215,8 @@ out:
216 215
217expired: 216expired:
218 read_unlock(&xp->lock); 217 read_unlock(&xp->lock);
219 km_policy_expired(xp, dir, 1); 218 if (!xfrm_policy_delete(xp, dir))
220 xfrm_policy_delete(xp, dir); 219 km_policy_expired(xp, dir, 1);
221 xfrm_pol_put(xp); 220 xfrm_pol_put(xp);
222} 221}
223 222
@@ -555,7 +554,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
555 return NULL; 554 return NULL;
556} 555}
557 556
558void xfrm_policy_delete(struct xfrm_policy *pol, int dir) 557int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
559{ 558{
560 write_lock_bh(&xfrm_policy_lock); 559 write_lock_bh(&xfrm_policy_lock);
561 pol = __xfrm_policy_unlink(pol, dir); 560 pol = __xfrm_policy_unlink(pol, dir);
@@ -564,7 +563,9 @@ void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
564 if (dir < XFRM_POLICY_MAX) 563 if (dir < XFRM_POLICY_MAX)
565 atomic_inc(&flow_cache_genid); 564 atomic_inc(&flow_cache_genid);
566 xfrm_policy_kill(pol); 565 xfrm_policy_kill(pol);
566 return 0;
567 } 567 }
568 return -ENOENT;
568} 569}
569 570
570int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) 571int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d11747c2a763..9d206c282cf1 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -50,7 +50,7 @@ static DEFINE_SPINLOCK(xfrm_state_gc_lock);
50 50
51static int xfrm_state_gc_flush_bundles; 51static int xfrm_state_gc_flush_bundles;
52 52
53static void __xfrm_state_delete(struct xfrm_state *x); 53static int __xfrm_state_delete(struct xfrm_state *x);
54 54
55static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family); 55static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
56static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); 56static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
@@ -154,6 +154,7 @@ static void xfrm_timer_handler(unsigned long data)
154 next = tmo; 154 next = tmo;
155 } 155 }
156 156
157 x->km.dying = warn;
157 if (warn) 158 if (warn)
158 km_state_expired(x, 0); 159 km_state_expired(x, 0);
159resched: 160resched:
@@ -169,9 +170,8 @@ expired:
169 next = 2; 170 next = 2;
170 goto resched; 171 goto resched;
171 } 172 }
172 if (x->id.spi != 0) 173 if (!__xfrm_state_delete(x) && x->id.spi)
173 km_state_expired(x, 1); 174 km_state_expired(x, 1);
174 __xfrm_state_delete(x);
175 175
176out: 176out:
177 spin_unlock(&x->lock); 177 spin_unlock(&x->lock);
@@ -215,8 +215,10 @@ void __xfrm_state_destroy(struct xfrm_state *x)
215} 215}
216EXPORT_SYMBOL(__xfrm_state_destroy); 216EXPORT_SYMBOL(__xfrm_state_destroy);
217 217
218static void __xfrm_state_delete(struct xfrm_state *x) 218static int __xfrm_state_delete(struct xfrm_state *x)
219{ 219{
220 int err = -ESRCH;
221
220 if (x->km.state != XFRM_STATE_DEAD) { 222 if (x->km.state != XFRM_STATE_DEAD) {
221 x->km.state = XFRM_STATE_DEAD; 223 x->km.state = XFRM_STATE_DEAD;
222 spin_lock(&xfrm_state_lock); 224 spin_lock(&xfrm_state_lock);
@@ -245,14 +247,21 @@ static void __xfrm_state_delete(struct xfrm_state *x)
245 * is what we are dropping here. 247 * is what we are dropping here.
246 */ 248 */
247 atomic_dec(&x->refcnt); 249 atomic_dec(&x->refcnt);
250 err = 0;
248 } 251 }
252
253 return err;
249} 254}
250 255
251void xfrm_state_delete(struct xfrm_state *x) 256int xfrm_state_delete(struct xfrm_state *x)
252{ 257{
258 int err;
259
253 spin_lock_bh(&x->lock); 260 spin_lock_bh(&x->lock);
254 __xfrm_state_delete(x); 261 err = __xfrm_state_delete(x);
255 spin_unlock_bh(&x->lock); 262 spin_unlock_bh(&x->lock);
263
264 return err;
256} 265}
257EXPORT_SYMBOL(xfrm_state_delete); 266EXPORT_SYMBOL(xfrm_state_delete);
258 267
@@ -557,16 +566,18 @@ int xfrm_state_check_expire(struct xfrm_state *x)
557 566
558 if (x->curlft.bytes >= x->lft.hard_byte_limit || 567 if (x->curlft.bytes >= x->lft.hard_byte_limit ||
559 x->curlft.packets >= x->lft.hard_packet_limit) { 568 x->curlft.packets >= x->lft.hard_packet_limit) {
560 km_state_expired(x, 1); 569 x->km.state = XFRM_STATE_EXPIRED;
561 if (!mod_timer(&x->timer, jiffies + XFRM_ACQ_EXPIRES*HZ)) 570 if (!mod_timer(&x->timer, jiffies))
562 xfrm_state_hold(x); 571 xfrm_state_hold(x);
563 return -EINVAL; 572 return -EINVAL;
564 } 573 }
565 574
566 if (!x->km.dying && 575 if (!x->km.dying &&
567 (x->curlft.bytes >= x->lft.soft_byte_limit || 576 (x->curlft.bytes >= x->lft.soft_byte_limit ||
568 x->curlft.packets >= x->lft.soft_packet_limit)) 577 x->curlft.packets >= x->lft.soft_packet_limit)) {
578 x->km.dying = 1;
569 km_state_expired(x, 0); 579 km_state_expired(x, 0);
580 }
570 return 0; 581 return 0;
571} 582}
572EXPORT_SYMBOL(xfrm_state_check_expire); 583EXPORT_SYMBOL(xfrm_state_check_expire);
@@ -796,34 +807,56 @@ EXPORT_SYMBOL(xfrm_replay_advance);
796static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list); 807static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list);
797static DEFINE_RWLOCK(xfrm_km_lock); 808static DEFINE_RWLOCK(xfrm_km_lock);
798 809
799static void km_state_expired(struct xfrm_state *x, int hard) 810void km_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
800{ 811{
801 struct xfrm_mgr *km; 812 struct xfrm_mgr *km;
802 813
803 if (hard) 814 read_lock(&xfrm_km_lock);
804 x->km.state = XFRM_STATE_EXPIRED; 815 list_for_each_entry(km, &xfrm_km_list, list)
805 else 816 if (km->notify_policy)
806 x->km.dying = 1; 817 km->notify_policy(xp, dir, c);
818 read_unlock(&xfrm_km_lock);
819}
807 820
821void km_state_notify(struct xfrm_state *x, struct km_event *c)
822{
823 struct xfrm_mgr *km;
808 read_lock(&xfrm_km_lock); 824 read_lock(&xfrm_km_lock);
809 list_for_each_entry(km, &xfrm_km_list, list) 825 list_for_each_entry(km, &xfrm_km_list, list)
810 km->notify(x, hard); 826 if (km->notify)
827 km->notify(x, c);
811 read_unlock(&xfrm_km_lock); 828 read_unlock(&xfrm_km_lock);
829}
830
831EXPORT_SYMBOL(km_policy_notify);
832EXPORT_SYMBOL(km_state_notify);
833
834static void km_state_expired(struct xfrm_state *x, int hard)
835{
836 struct km_event c;
837
838 c.data.hard = hard;
839 c.event = XFRM_MSG_EXPIRE;
840 km_state_notify(x, &c);
812 841
813 if (hard) 842 if (hard)
814 wake_up(&km_waitq); 843 wake_up(&km_waitq);
815} 844}
816 845
846/*
847 * We send to all registered managers regardless of failure
848 * We are happy with one success
849*/
817static int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) 850static int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
818{ 851{
819 int err = -EINVAL; 852 int err = -EINVAL, acqret;
820 struct xfrm_mgr *km; 853 struct xfrm_mgr *km;
821 854
822 read_lock(&xfrm_km_lock); 855 read_lock(&xfrm_km_lock);
823 list_for_each_entry(km, &xfrm_km_list, list) { 856 list_for_each_entry(km, &xfrm_km_list, list) {
824 err = km->acquire(x, t, pol, XFRM_POLICY_OUT); 857 acqret = km->acquire(x, t, pol, XFRM_POLICY_OUT);
825 if (!err) 858 if (!acqret)
826 break; 859 err = acqret;
827 } 860 }
828 read_unlock(&xfrm_km_lock); 861 read_unlock(&xfrm_km_lock);
829 return err; 862 return err;
@@ -848,13 +881,11 @@ EXPORT_SYMBOL(km_new_mapping);
848 881
849void km_policy_expired(struct xfrm_policy *pol, int dir, int hard) 882void km_policy_expired(struct xfrm_policy *pol, int dir, int hard)
850{ 883{
851 struct xfrm_mgr *km; 884 struct km_event c;
852 885
853 read_lock(&xfrm_km_lock); 886 c.data.hard = hard;
854 list_for_each_entry(km, &xfrm_km_list, list) 887 c.event = XFRM_MSG_POLEXPIRE;
855 if (km->notify_policy) 888 km_policy_notify(pol, dir, &c);
856 km->notify_policy(pol, dir, hard);
857 read_unlock(&xfrm_km_lock);
858 889
859 if (hard) 890 if (hard)
860 wake_up(&km_waitq); 891 wake_up(&km_waitq);
@@ -1024,6 +1055,43 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
1024} 1055}
1025 1056
1026EXPORT_SYMBOL(xfrm_state_mtu); 1057EXPORT_SYMBOL(xfrm_state_mtu);
1058
1059int xfrm_init_state(struct xfrm_state *x)
1060{
1061 struct xfrm_state_afinfo *afinfo;
1062 int family = x->props.family;
1063 int err;
1064
1065 err = -EAFNOSUPPORT;
1066 afinfo = xfrm_state_get_afinfo(family);
1067 if (!afinfo)
1068 goto error;
1069
1070 err = 0;
1071 if (afinfo->init_flags)
1072 err = afinfo->init_flags(x);
1073
1074 xfrm_state_put_afinfo(afinfo);
1075
1076 if (err)
1077 goto error;
1078
1079 err = -EPROTONOSUPPORT;
1080 x->type = xfrm_get_type(x->id.proto, family);
1081 if (x->type == NULL)
1082 goto error;
1083
1084 err = x->type->init_state(x);
1085 if (err)
1086 goto error;
1087
1088 x->km.state = XFRM_STATE_VALID;
1089
1090error:
1091 return err;
1092}
1093
1094EXPORT_SYMBOL(xfrm_init_state);
1027 1095
1028void __init xfrm_state_init(void) 1096void __init xfrm_state_init(void)
1029{ 1097{
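The xfrm_state.c hunks above introduce km_state_notify()/km_policy_notify(): rather than calling each key manager's expire hook directly, the caller fills in a struct km_event (event type plus per-event data such as the hard/soft flag) and every registered manager that provides the matching callback receives it. A compact userspace sketch of that broadcast-to-registered-managers pattern (struct layout, field names and the event number are illustrative only):

#include <stdio.h>

struct km_event {
	int	event;		/* e.g. an XFRM_MSG_* value */
	int	hard;		/* expire events: hard or soft */
};

struct manager {
	const char	*name;
	void		(*notify)(const char *who, const struct km_event *c);
	struct manager	*next;
};

static void print_notify(const char *who, const struct km_event *c)
{
	printf("%s: event=%d hard=%d\n", who, c->event, c->hard);
}

/* Walk every registered manager; managers without a callback are simply
 * skipped, exactly like the "if (km->notify)" test in the hunk above. */
static void state_notify(struct manager *list, const struct km_event *c)
{
	struct manager *km;

	for (km = list; km; km = km->next)
		if (km->notify)
			km->notify(km->name, c);
}

int main(void)
{
	struct manager pfkey   = { "pfkey",   print_notify, NULL };
	struct manager netlink = { "netlink", print_notify, &pfkey };
	struct km_event c = { 16 /* pretend: an expire event */, 1 };

	state_notify(&netlink, &c);
	return 0;
}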
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 97509011c274..ecade4893a13 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -249,17 +249,10 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
249 if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1]))) 249 if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1])))
250 goto error; 250 goto error;
251 251
252 err = -ENOENT; 252 err = xfrm_init_state(x);
253 x->type = xfrm_get_type(x->id.proto, x->props.family);
254 if (x->type == NULL)
255 goto error;
256
257 err = x->type->init_state(x, NULL);
258 if (err) 253 if (err)
259 goto error; 254 goto error;
260 255
261 x->curlft.add_time = (unsigned long) xtime.tv_sec;
262 x->km.state = XFRM_STATE_VALID;
263 x->km.seq = p->seq; 256 x->km.seq = p->seq;
264 257
265 return x; 258 return x;
@@ -277,6 +270,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
277 struct xfrm_usersa_info *p = NLMSG_DATA(nlh); 270 struct xfrm_usersa_info *p = NLMSG_DATA(nlh);
278 struct xfrm_state *x; 271 struct xfrm_state *x;
279 int err; 272 int err;
273 struct km_event c;
280 274
281 err = verify_newsa_info(p, (struct rtattr **) xfrma); 275 err = verify_newsa_info(p, (struct rtattr **) xfrma);
282 if (err) 276 if (err)
@@ -286,6 +280,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
286 if (!x) 280 if (!x)
287 return err; 281 return err;
288 282
283 xfrm_state_hold(x);
289 if (nlh->nlmsg_type == XFRM_MSG_NEWSA) 284 if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
290 err = xfrm_state_add(x); 285 err = xfrm_state_add(x);
291 else 286 else
@@ -294,14 +289,24 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
294 if (err < 0) { 289 if (err < 0) {
295 x->km.state = XFRM_STATE_DEAD; 290 x->km.state = XFRM_STATE_DEAD;
296 xfrm_state_put(x); 291 xfrm_state_put(x);
292 goto out;
297 } 293 }
298 294
295 c.seq = nlh->nlmsg_seq;
296 c.pid = nlh->nlmsg_pid;
297 c.event = nlh->nlmsg_type;
298
299 km_state_notify(x, &c);
300out:
301 xfrm_state_put(x);
299 return err; 302 return err;
300} 303}
301 304
302static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) 305static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
303{ 306{
304 struct xfrm_state *x; 307 struct xfrm_state *x;
308 int err;
309 struct km_event c;
305 struct xfrm_usersa_id *p = NLMSG_DATA(nlh); 310 struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
306 311
307 x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family); 312 x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
@@ -313,10 +318,19 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
313 return -EPERM; 318 return -EPERM;
314 } 319 }
315 320
316 xfrm_state_delete(x); 321 err = xfrm_state_delete(x);
322 if (err < 0) {
323 xfrm_state_put(x);
324 return err;
325 }
326
327 c.seq = nlh->nlmsg_seq;
328 c.pid = nlh->nlmsg_pid;
329 c.event = nlh->nlmsg_type;
330 km_state_notify(x, &c);
317 xfrm_state_put(x); 331 xfrm_state_put(x);
318 332
319 return 0; 333 return err;
320} 334}
321 335
322static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) 336static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
@@ -681,6 +695,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
681{ 695{
682 struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh); 696 struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh);
683 struct xfrm_policy *xp; 697 struct xfrm_policy *xp;
698 struct km_event c;
684 int err; 699 int err;
685 int excl; 700 int excl;
686 701
@@ -692,6 +707,10 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
692 if (!xp) 707 if (!xp)
693 return err; 708 return err;
694 709
 710 /* shouldn't excl be based on nlh flags??
 711 * Aha! this is anti-netlink really, i.e. more pfkey derived;
 712 * in netlink excl is a flag and you wouldn't need
 713 * a type XFRM_MSG_UPDPOLICY - JHS */
695 excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY; 714 excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
696 err = xfrm_policy_insert(p->dir, xp, excl); 715 err = xfrm_policy_insert(p->dir, xp, excl);
697 if (err) { 716 if (err) {
@@ -699,6 +718,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
699 return err; 718 return err;
700 } 719 }
701 720
721 c.event = nlh->nlmsg_type;
722 c.seq = nlh->nlmsg_seq;
723 c.pid = nlh->nlmsg_pid;
724 km_policy_notify(xp, p->dir, &c);
725
702 xfrm_pol_put(xp); 726 xfrm_pol_put(xp);
703 727
704 return 0; 728 return 0;
@@ -816,6 +840,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
816 struct xfrm_policy *xp; 840 struct xfrm_policy *xp;
817 struct xfrm_userpolicy_id *p; 841 struct xfrm_userpolicy_id *p;
818 int err; 842 int err;
843 struct km_event c;
819 int delete; 844 int delete;
820 845
821 p = NLMSG_DATA(nlh); 846 p = NLMSG_DATA(nlh);
@@ -843,6 +868,12 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
843 NETLINK_CB(skb).pid, 868 NETLINK_CB(skb).pid,
844 MSG_DONTWAIT); 869 MSG_DONTWAIT);
845 } 870 }
871 } else {
872 c.data.byid = p->index;
873 c.event = nlh->nlmsg_type;
874 c.seq = nlh->nlmsg_seq;
875 c.pid = nlh->nlmsg_pid;
876 km_policy_notify(xp, p->dir, &c);
846 } 877 }
847 878
848 xfrm_pol_put(xp); 879 xfrm_pol_put(xp);
@@ -852,15 +883,28 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
852 883
853static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) 884static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
854{ 885{
886 struct km_event c;
855 struct xfrm_usersa_flush *p = NLMSG_DATA(nlh); 887 struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
856 888
857 xfrm_state_flush(p->proto); 889 xfrm_state_flush(p->proto);
890 c.data.proto = p->proto;
891 c.event = nlh->nlmsg_type;
892 c.seq = nlh->nlmsg_seq;
893 c.pid = nlh->nlmsg_pid;
894 km_state_notify(NULL, &c);
895
858 return 0; 896 return 0;
859} 897}
860 898
861static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) 899static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
862{ 900{
901 struct km_event c;
902
863 xfrm_policy_flush(); 903 xfrm_policy_flush();
904 c.event = nlh->nlmsg_type;
905 c.seq = nlh->nlmsg_seq;
906 c.pid = nlh->nlmsg_pid;
907 km_policy_notify(NULL, 0, &c);
864 return 0; 908 return 0;
865} 909}
866 910
@@ -1069,15 +1113,16 @@ nlmsg_failure:
1069 return -1; 1113 return -1;
1070} 1114}
1071 1115
1072static int xfrm_send_state_notify(struct xfrm_state *x, int hard) 1116static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
1073{ 1117{
1074 struct sk_buff *skb; 1118 struct sk_buff *skb;
1119 int len = NLMSG_LENGTH(sizeof(struct xfrm_user_expire));
1075 1120
1076 skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC); 1121 skb = alloc_skb(len, GFP_ATOMIC);
1077 if (skb == NULL) 1122 if (skb == NULL)
1078 return -ENOMEM; 1123 return -ENOMEM;
1079 1124
1080 if (build_expire(skb, x, hard) < 0) 1125 if (build_expire(skb, x, c->data.hard) < 0)
1081 BUG(); 1126 BUG();
1082 1127
1083 NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; 1128 NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
@@ -1085,6 +1130,131 @@ static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
 }
 
+static int xfrm_notify_sa_flush(struct km_event *c)
+{
+	struct xfrm_usersa_flush *p;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush));
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq,
+			XFRM_MSG_FLUSHSA, sizeof(*p));
+	nlh->nlmsg_flags = 0;
+
+	p = NLMSG_DATA(nlh);
+	p->proto = c->data.proto;
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int inline xfrm_sa_len(struct xfrm_state *x)
+{
+	int l = 0;
+	if (x->aalg)
+		l += RTA_SPACE(sizeof(*x->aalg) + (x->aalg->alg_key_len+7)/8);
+	if (x->ealg)
+		l += RTA_SPACE(sizeof(*x->ealg) + (x->ealg->alg_key_len+7)/8);
+	if (x->calg)
+		l += RTA_SPACE(sizeof(*x->calg));
+	if (x->encap)
+		l += RTA_SPACE(sizeof(*x->encap));
+
+	return l;
+}
+
+static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
+{
+	struct xfrm_usersa_info *p;
+	struct xfrm_usersa_id *id;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = xfrm_sa_len(x);
+	int headlen;
+
+	headlen = sizeof(*p);
+	if (c->event == XFRM_MSG_DELSA) {
+		len += RTA_SPACE(headlen);
+		headlen = sizeof(*id);
+	}
+	len += NLMSG_SPACE(headlen);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
+	nlh->nlmsg_flags = 0;
+
+	p = NLMSG_DATA(nlh);
+	if (c->event == XFRM_MSG_DELSA) {
+		id = NLMSG_DATA(nlh);
+		memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
+		id->spi = x->id.spi;
+		id->family = x->props.family;
+		id->proto = x->id.proto;
+
+		p = RTA_DATA(__RTA_PUT(skb, XFRMA_SA, sizeof(*p)));
+	}
+
+	copy_to_user_state(x, p);
+
+	if (x->aalg)
+		RTA_PUT(skb, XFRMA_ALG_AUTH,
+			sizeof(*(x->aalg))+(x->aalg->alg_key_len+7)/8, x->aalg);
+	if (x->ealg)
+		RTA_PUT(skb, XFRMA_ALG_CRYPT,
+			sizeof(*(x->ealg))+(x->ealg->alg_key_len+7)/8, x->ealg);
+	if (x->calg)
+		RTA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
+
+	if (x->encap)
+		RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+rtattr_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_send_state_notify(struct xfrm_state *x, struct km_event *c)
+{
+
+	switch (c->event) {
+	case XFRM_MSG_EXPIRE:
+		return xfrm_exp_state_notify(x, c);
+	case XFRM_MSG_DELSA:
+	case XFRM_MSG_UPDSA:
+	case XFRM_MSG_NEWSA:
+		return xfrm_notify_sa(x, c);
+	case XFRM_MSG_FLUSHSA:
+		return xfrm_notify_sa_flush(c);
+	default:
+		printk("xfrm_user: Unknown SA event %d\n", c->event);
+		break;
+	}
+
+	return 0;
+
+}
+
 static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 			 struct xfrm_tmpl *xt, struct xfrm_policy *xp,
 			 int dir)
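[Editor's note] The SA notify paths above broadcast XFRM_MSG_NEWSA/UPDSA/DELSA/FLUSHSA messages to the XFRMGRP_SA netlink group. A minimal userspace listener, sketched below under the assumption that NETLINK_XFRM and the XFRMGRP_*/XFRM_MSG_* constants of this kernel generation are available from <linux/netlink.h> and <linux/xfrm.h> (later kernels replaced the XFRMGRP_* bitmask groups with XFRMNLGRP_* and NETLINK_ADD_MEMBERSHIP), could receive these broadcasts like this; it is an illustration, not part of the patch:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/xfrm.h>

	int main(void)
	{
		struct sockaddr_nl snl;
		struct nlmsghdr *nlh;
		char buf[8192];
		int fd, len;

		fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
		if (fd < 0)
			return 1;

		memset(&snl, 0, sizeof(snl));
		snl.nl_family = AF_NETLINK;
		snl.nl_groups = XFRMGRP_SA;	/* group used by netlink_broadcast() above */
		if (bind(fd, (struct sockaddr *)&snl, sizeof(snl)) < 0)
			return 1;

		while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
			/* walk every netlink message in the datagram */
			for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
			     nlh = NLMSG_NEXT(nlh, len)) {
				switch (nlh->nlmsg_type) {
				case XFRM_MSG_NEWSA:
				case XFRM_MSG_UPDSA:
				case XFRM_MSG_DELSA:
				case XFRM_MSG_FLUSHSA:
					printf("SA event, nlmsg_type=%u\n",
					       nlh->nlmsg_type);
					break;
				default:
					break;
				}
			}
		}
		close(fd);
		return 0;
	}
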
@@ -1218,7 +1388,7 @@ nlmsg_failure:
 	return -1;
 }
 
-static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
+static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
 	struct sk_buff *skb;
 	size_t len;
@@ -1229,7 +1399,7 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
 	if (skb == NULL)
 		return -ENOMEM;
 
-	if (build_polexpire(skb, xp, dir, hard) < 0)
+	if (build_polexpire(skb, xp, dir, c->data.hard) < 0)
 		BUG();
 
 	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
@@ -1237,6 +1407,103 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
 }
 
+static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+	struct xfrm_userpolicy_info *p;
+	struct xfrm_userpolicy_id *id;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	int headlen;
+
+	headlen = sizeof(*p);
+	if (c->event == XFRM_MSG_DELPOLICY) {
+		len += RTA_SPACE(headlen);
+		headlen = sizeof(*id);
+	}
+	len += NLMSG_SPACE(headlen);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
+
+	p = NLMSG_DATA(nlh);
+	if (c->event == XFRM_MSG_DELPOLICY) {
+		id = NLMSG_DATA(nlh);
+		memset(id, 0, sizeof(*id));
+		id->dir = dir;
+		if (c->data.byid)
+			id->index = xp->index;
+		else
+			memcpy(&id->sel, &xp->selector, sizeof(id->sel));
+
+		p = RTA_DATA(__RTA_PUT(skb, XFRMA_POLICY, sizeof(*p)));
+	}
+
+	nlh->nlmsg_flags = 0;
+
+	copy_to_user_policy(xp, p, dir);
+	if (copy_to_user_tmpl(xp, skb) < 0)
+		goto nlmsg_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+rtattr_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_notify_policy_flush(struct km_event *c)
+{
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = NLMSG_LENGTH(0);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, XFRM_MSG_FLUSHPOLICY, 0);
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+
+	switch (c->event) {
+	case XFRM_MSG_NEWPOLICY:
+	case XFRM_MSG_UPDPOLICY:
+	case XFRM_MSG_DELPOLICY:
+		return xfrm_notify_policy(xp, dir, c);
+	case XFRM_MSG_FLUSHPOLICY:
+		return xfrm_notify_policy_flush(c);
+	case XFRM_MSG_POLEXPIRE:
+		return xfrm_exp_policy_notify(xp, dir, c);
+	default:
+		printk("xfrm_user: Unknown Policy event %d\n", c->event);
+	}
+
+	return 0;
+
+}
+
 static struct xfrm_mgr netlink_mgr = {
 	.id		= "netlink",
 	.notify		= xfrm_send_state_notify,
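[Editor's note] For XFRM_MSG_DELPOLICY, xfrm_notify_policy() above puts struct xfrm_userpolicy_id in the fixed part of the message and nests the full policy in an XFRMA_POLICY attribute. As a hedged sketch only (not part of the patch), a userspace decoder for that layout, assuming the struct and macro definitions from <linux/xfrm.h> and <linux/rtnetlink.h>, might look like this:

	#include <stdio.h>
	#include <linux/netlink.h>
	#include <linux/rtnetlink.h>
	#include <linux/xfrm.h>

	/* Decode one XFRM_MSG_DELPOLICY notification: fixed header first,
	 * then walk the attributes looking for the nested policy info. */
	static void dump_delpolicy(struct nlmsghdr *nlh)
	{
		struct xfrm_userpolicy_id *id = NLMSG_DATA(nlh);
		int attrlen = NLMSG_PAYLOAD(nlh, sizeof(*id));
		struct rtattr *rta = (struct rtattr *)((char *)NLMSG_DATA(nlh) +
						       NLMSG_ALIGN(sizeof(*id)));

		printf("DELPOLICY dir=%u index=%u\n",
		       (unsigned)id->dir, (unsigned)id->index);

		for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
			if (rta->rta_type == XFRMA_POLICY) {
				struct xfrm_userpolicy_info *p = RTA_DATA(rta);

				printf("  policy index=%u action=%u\n",
				       (unsigned)p->index, (unsigned)p->action);
			}
		}
	}
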