aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/fddi.c4
-rw-r--r--net/802/tr.c22
-rw-r--r--net/8021q/Kconfig19
-rw-r--r--net/8021q/vlan.c8
-rw-r--r--net/Kconfig472
-rw-r--r--net/appletalk/aarp.c7
-rw-r--r--net/atm/Kconfig74
-rw-r--r--net/atm/br2684.c3
-rw-r--r--net/atm/svc.c4
-rw-r--r--net/ax25/af_ax25.c27
-rw-r--r--net/ax25/ax25_route.c12
-rw-r--r--net/ax25/ax25_uid.c83
-rw-r--r--net/bluetooth/cmtp/core.c6
-rw-r--r--net/bluetooth/hci_core.c2
-rw-r--r--net/bluetooth/hci_event.c4
-rw-r--r--net/bluetooth/hidp/core.c5
-rw-r--r--net/bluetooth/lib.c25
-rw-r--r--net/bluetooth/rfcomm/core.c4
-rw-r--r--net/bluetooth/rfcomm/sock.c7
-rw-r--r--net/bluetooth/rfcomm/tty.c2
-rw-r--r--net/bridge/Kconfig31
-rw-r--r--net/bridge/br_netfilter.c2
-rw-r--r--net/bridge/netfilter/Kconfig2
-rw-r--r--net/bridge/netfilter/ebt_log.c6
-rw-r--r--net/bridge/netfilter/ebtables.c21
-rw-r--r--net/compat.c9
-rw-r--r--net/core/Makefile3
-rw-r--r--net/core/dev.c142
-rw-r--r--net/core/dst.c15
-rw-r--r--net/core/filter.c104
-rw-r--r--net/core/neighbour.c9
-rw-r--r--net/core/netpoll.c133
-rw-r--r--net/core/pktgen.c31
-rw-r--r--net/core/rtnetlink.c2
-rw-r--r--net/core/skbuff.c180
-rw-r--r--net/core/sock.c24
-rw-r--r--net/core/sysctl_net_core.c61
-rw-r--r--net/core/utils.c37
-rw-r--r--net/core/wireless.c1
-rw-r--r--net/decnet/Kconfig23
-rw-r--r--net/decnet/af_decnet.c21
-rw-r--r--net/decnet/dn_fib.c3
-rw-r--r--net/decnet/dn_neigh.c2
-rw-r--r--net/decnet/dn_nsp_out.c3
-rw-r--r--net/econet/Kconfig36
-rw-r--r--net/ethernet/eth.c9
-rw-r--r--net/ipv4/Kconfig180
-rw-r--r--net/ipv4/Makefile12
-rw-r--r--net/ipv4/af_inet.c13
-rw-r--r--net/ipv4/devinet.c2
-rw-r--r--net/ipv4/fib_semantics.c9
-rw-r--r--net/ipv4/fib_trie.c954
-rw-r--r--net/ipv4/icmp.c18
-rw-r--r--net/ipv4/igmp.c96
-rw-r--r--net/ipv4/inetpeer.c11
-rw-r--r--net/ipv4/ip_fragment.c8
-rw-r--r--net/ipv4/ip_gre.c21
-rw-r--r--net/ipv4/ip_input.c6
-rw-r--r--net/ipv4/ip_output.c19
-rw-r--r--net/ipv4/ip_sockglue.c9
-rw-r--r--net/ipv4/ipcomp.c2
-rw-r--r--net/ipv4/ipconfig.c4
-rw-r--r--net/ipv4/ipip.c56
-rw-r--r--net/ipv4/ipmr.c16
-rw-r--r--net/ipv4/ipvs/Kconfig4
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c31
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c17
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c4
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c8
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c50
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c14
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c8
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c9
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c8
-rw-r--r--net/ipv4/netfilter/ip_nat_amanda.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_ftp.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_irc.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c7
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c3
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c3
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_tftp.c4
-rw-r--r--net/ipv4/netfilter/ip_queue.c7
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c9
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c17
-rw-r--r--net/ipv4/netfilter/ipt_TCPMSS.c7
-rw-r--r--net/ipv4/route.c145
-rw-r--r--net/ipv4/sysctl_net_ipv4.c114
-rw-r--r--net/ipv4/tcp.c87
-rw-r--r--net/ipv4/tcp_bic.c331
-rw-r--r--net/ipv4/tcp_cong.c237
-rw-r--r--net/ipv4/tcp_diag.c34
-rw-r--r--net/ipv4/tcp_highspeed.c181
-rw-r--r--net/ipv4/tcp_htcp.c289
-rw-r--r--net/ipv4/tcp_hybla.c187
-rw-r--r--net/ipv4/tcp_input.c824
-rw-r--r--net/ipv4/tcp_ipv4.c28
-rw-r--r--net/ipv4/tcp_minisocks.c4
-rw-r--r--net/ipv4/tcp_output.c584
-rw-r--r--net/ipv4/tcp_scalable.c68
-rw-r--r--net/ipv4/tcp_timer.c5
-rw-r--r--net/ipv4/tcp_vegas.c411
-rw-r--r--net/ipv4/tcp_westwood.c259
-rw-r--r--net/ipv4/udp.c34
-rw-r--r--net/ipv4/utils.c59
-rw-r--r--net/ipv4/xfrm4_tunnel.c3
-rw-r--r--net/ipv6/Kconfig23
-rw-r--r--net/ipv6/addrconf.c22
-rw-r--r--net/ipv6/af_inet6.c4
-rw-r--r--net/ipv6/ip6_flowlabel.c1
-rw-r--r--net/ipv6/ip6_input.c9
-rw-r--r--net/ipv6/ip6_output.c8
-rw-r--r--net/ipv6/ip6_tunnel.c38
-rw-r--r--net/ipv6/ipcomp6.c2
-rw-r--r--net/ipv6/ipv6_sockglue.c3
-rw-r--r--net/ipv6/mcast.c29
-rw-r--r--net/ipv6/netfilter/ip6_queue.c9
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c11
-rw-r--r--net/ipv6/raw.c2
-rw-r--r--net/ipv6/sit.c21
-rw-r--r--net/ipv6/tcp_ipv6.c13
-rw-r--r--net/ipx/Kconfig33
-rw-r--r--net/irda/irlap.c3
-rw-r--r--net/irda/irlap_event.c14
-rw-r--r--net/irda/irlap_frame.c8
-rw-r--r--net/irda/irttp.c2
-rw-r--r--net/lapb/Kconfig22
-rw-r--r--net/llc/llc_c_ev.c2
-rw-r--r--net/netlink/af_netlink.c18
-rw-r--r--net/netrom/af_netrom.c31
-rw-r--r--net/packet/Kconfig26
-rw-r--r--net/packet/af_packet.c6
-rw-r--r--net/rose/af_rose.c27
-rw-r--r--net/rose/rose_route.c6
-rw-r--r--net/rxrpc/krxiod.c2
-rw-r--r--net/rxrpc/krxsecd.c2
-rw-r--r--net/rxrpc/krxtimod.c2
-rw-r--r--net/sched/Kconfig50
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c10
-rw-r--r--net/sched/cls_api.c2
-rw-r--r--net/sched/cls_rsvp.h1
-rw-r--r--net/sched/em_meta.c68
-rw-r--r--net/sched/em_text.c154
-rw-r--r--net/sched/sch_api.c65
-rw-r--r--net/sched/sch_blackhole.c54
-rw-r--r--net/sched/sch_cbq.c3
-rw-r--r--net/sched/sch_generic.c43
-rw-r--r--net/sched/sch_red.c2
-rw-r--r--net/sctp/associola.c15
-rw-r--r--net/sctp/bind_addr.c16
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/endpointola.c19
-rw-r--r--net/sctp/input.c71
-rw-r--r--net/sctp/inqueue.c18
-rw-r--r--net/sctp/ipv6.c7
-rw-r--r--net/sctp/objcnt.c6
-rw-r--r--net/sctp/output.c22
-rw-r--r--net/sctp/outqueue.c50
-rw-r--r--net/sctp/proc.c1
-rw-r--r--net/sctp/protocol.c7
-rw-r--r--net/sctp/sm_make_chunk.c27
-rw-r--r--net/sctp/sm_sideeffect.c13
-rw-r--r--net/sctp/sm_statefuns.c16
-rw-r--r--net/sctp/socket.c2
-rw-r--r--net/sctp/ssnmap.c3
-rw-r--r--net/sctp/sysctl.c13
-rw-r--r--net/sctp/transport.c6
-rw-r--r--net/sctp/ulpevent.c19
-rw-r--r--net/sctp/ulpqueue.c9
-rw-r--r--net/socket.c3
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c2
-rw-r--r--net/sunrpc/sunrpc_syms.c1
-rw-r--r--net/sunrpc/svcauth_unix.c11
-rw-r--r--net/sunrpc/svcsock.c8
-rw-r--r--net/sunrpc/xdr.c1
-rw-r--r--net/sunrpc/xprt.c6
-rw-r--r--net/unix/Kconfig21
-rw-r--r--net/unix/af_unix.c4
-rw-r--r--net/wanrouter/Kconfig29
-rw-r--r--net/wanrouter/wanmain.c6
-rw-r--r--net/x25/Kconfig36
-rw-r--r--net/x25/af_x25.c110
-rw-r--r--net/x25/x25_facilities.c34
-rw-r--r--net/x25/x25_subr.c41
-rw-r--r--net/xfrm/Kconfig15
-rw-r--r--net/xfrm/xfrm_user.c3
187 files changed, 5560 insertions, 3213 deletions
diff --git a/net/802/fddi.c b/net/802/fddi.c
index ebcf4830d6..5ce24c4bb8 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff *skb)
122 * the proper pointer to the start of packet data (skb->data). 122 * the proper pointer to the start of packet data (skb->data).
123 */ 123 */
124 124
125unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev) 125__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
126{ 126{
127 struct fddihdr *fddi = (struct fddihdr *)skb->data; 127 struct fddihdr *fddi = (struct fddihdr *)skb->data;
128 unsigned short type; 128 __be16 type;
129 129
130 /* 130 /*
131 * Set mac.raw field to point to FC byte, set data field to point 131 * Set mac.raw field to point to FC byte, set data field to point
diff --git a/net/802/tr.c b/net/802/tr.c
index a755e880f4..1bb7dc1b85 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -251,10 +251,11 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *
251 unsigned int hash; 251 unsigned int hash;
252 struct rif_cache *entry; 252 struct rif_cache *entry;
253 unsigned char *olddata; 253 unsigned char *olddata;
254 unsigned long flags;
254 static const unsigned char mcast_func_addr[] 255 static const unsigned char mcast_func_addr[]
255 = {0xC0,0x00,0x00,0x04,0x00,0x00}; 256 = {0xC0,0x00,0x00,0x04,0x00,0x00};
256 257
257 spin_lock_bh(&rif_lock); 258 spin_lock_irqsave(&rif_lock, flags);
258 259
259 /* 260 /*
260 * Broadcasts are single route as stated in RFC 1042 261 * Broadcasts are single route as stated in RFC 1042
@@ -323,7 +324,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
323 else 324 else
324 slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8); 325 slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8);
325 olddata = skb->data; 326 olddata = skb->data;
326 spin_unlock_bh(&rif_lock); 327 spin_unlock_irqrestore(&rif_lock, flags);
327 328
328 skb_pull(skb, slack); 329 skb_pull(skb, slack);
329 memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack); 330 memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack);
@@ -337,10 +338,11 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
337static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev) 338static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
338{ 339{
339 unsigned int hash, rii_p = 0; 340 unsigned int hash, rii_p = 0;
341 unsigned long flags;
340 struct rif_cache *entry; 342 struct rif_cache *entry;
341 343
342 344
343 spin_lock_bh(&rif_lock); 345 spin_lock_irqsave(&rif_lock, flags);
344 346
345 /* 347 /*
346 * Firstly see if the entry exists 348 * Firstly see if the entry exists
@@ -378,7 +380,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
378 if(!entry) 380 if(!entry)
379 { 381 {
380 printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n"); 382 printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n");
381 spin_unlock_bh(&rif_lock); 383 spin_unlock_irqrestore(&rif_lock, flags);
382 return; 384 return;
383 } 385 }
384 386
@@ -420,7 +422,7 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
420 } 422 }
421 entry->last_used=jiffies; 423 entry->last_used=jiffies;
422 } 424 }
423 spin_unlock_bh(&rif_lock); 425 spin_unlock_irqrestore(&rif_lock, flags);
424} 426}
425 427
426/* 428/*
@@ -430,9 +432,9 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
430static void rif_check_expire(unsigned long dummy) 432static void rif_check_expire(unsigned long dummy)
431{ 433{
432 int i; 434 int i;
433 unsigned long next_interval = jiffies + sysctl_tr_rif_timeout/2; 435 unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2;
434 436
435 spin_lock_bh(&rif_lock); 437 spin_lock_irqsave(&rif_lock, flags);
436 438
437 for(i =0; i < RIF_TABLE_SIZE; i++) { 439 for(i =0; i < RIF_TABLE_SIZE; i++) {
438 struct rif_cache *entry, **pentry; 440 struct rif_cache *entry, **pentry;
@@ -454,7 +456,7 @@ static void rif_check_expire(unsigned long dummy)
454 } 456 }
455 } 457 }
456 458
457 spin_unlock_bh(&rif_lock); 459 spin_unlock_irqrestore(&rif_lock, flags);
458 460
459 mod_timer(&rif_timer, next_interval); 461 mod_timer(&rif_timer, next_interval);
460 462
@@ -485,7 +487,7 @@ static struct rif_cache *rif_get_idx(loff_t pos)
485 487
486static void *rif_seq_start(struct seq_file *seq, loff_t *pos) 488static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
487{ 489{
488 spin_lock_bh(&rif_lock); 490 spin_lock_irq(&rif_lock);
489 491
490 return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN; 492 return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN;
491} 493}
@@ -516,7 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
516 518
517static void rif_seq_stop(struct seq_file *seq, void *v) 519static void rif_seq_stop(struct seq_file *seq, void *v)
518{ 520{
519 spin_unlock_bh(&rif_lock); 521 spin_unlock_irq(&rif_lock);
520} 522}
521 523
522static int rif_seq_show(struct seq_file *seq, void *v) 524static int rif_seq_show(struct seq_file *seq, void *v)
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 0000000000..c4a382e450
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,19 @@
1#
2# Configuration for 802.1Q VLAN support
3#
4
5config VLAN_8021Q
6 tristate "802.1Q VLAN Support"
7 ---help---
8 Select this and you will be able to create 802.1Q VLAN interfaces
9 on your ethernet interfaces. 802.1Q VLAN supports almost
10 everything a regular ethernet interface does, including
11 firewalling, bridging, and of course IP traffic. You will need
12 the 'vconfig' tool from the VLAN project in order to effectively
13 use VLANs. See the VLAN web page for more information:
14 <http://www.candelatech.com/~greear/vlan.html>
15
16 To compile this code as a module, choose M here: the module
17 will be called 8021q.
18
19 If unsure, say N.
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1f6d31670b..91e412b0ab 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -578,6 +578,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
578 if (!vlandev) 578 if (!vlandev)
579 continue; 579 continue;
580 580
581 if (netif_carrier_ok(dev)) {
582 if (!netif_carrier_ok(vlandev))
583 netif_carrier_on(vlandev);
584 } else {
585 if (netif_carrier_ok(vlandev))
586 netif_carrier_off(vlandev);
587 }
588
581 if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) { 589 if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) {
582 vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK) 590 vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK)
583 | flgs; 591 | flgs;
diff --git a/net/Kconfig b/net/Kconfig
index 9251b28e8d..40a31ba86d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -2,7 +2,7 @@
2# Network configuration 2# Network configuration
3# 3#
4 4
5menu "Networking support" 5menu "Networking"
6 6
7config NET 7config NET
8 bool "Networking support" 8 bool "Networking support"
@@ -10,7 +10,9 @@ config NET
10 Unless you really know what you are doing, you should say Y here. 10 Unless you really know what you are doing, you should say Y here.
11 The reason is that some programs need kernel networking support even 11 The reason is that some programs need kernel networking support even
12 when running on a stand-alone machine that isn't connected to any 12 when running on a stand-alone machine that isn't connected to any
13 other computer. If you are upgrading from an older kernel, you 13 other computer.
14
15 If you are upgrading from an older kernel, you
14 should consider updating your networking tools too because changes 16 should consider updating your networking tools too because changes
15 in the kernel and the tools often go hand in hand. The tools are 17 in the kernel and the tools often go hand in hand. The tools are
16 contained in the package net-tools, the location and version number 18 contained in the package net-tools, the location and version number
@@ -20,57 +22,14 @@ config NET
20 recommended to read the NET-HOWTO, available from 22 recommended to read the NET-HOWTO, available from
21 <http://www.tldp.org/docs.html#howto>. 23 <http://www.tldp.org/docs.html#howto>.
22 24
23menu "Networking options" 25# Make sure that all config symbols are dependent on NET
24 depends on NET 26if NET
25
26config PACKET
27 tristate "Packet socket"
28 ---help---
29 The Packet protocol is used by applications which communicate
30 directly with network devices without an intermediate network
31 protocol implemented in the kernel, e.g. tcpdump. If you want them
32 to work, choose Y.
33 27
34 To compile this driver as a module, choose M here: the module will 28menu "Networking options"
35 be called af_packet.
36
37 If unsure, say Y.
38
39config PACKET_MMAP
40 bool "Packet socket: mmapped IO"
41 depends on PACKET
42 help
43 If you say Y here, the Packet protocol driver will use an IO
44 mechanism that results in faster communication.
45
46 If unsure, say N.
47
48config UNIX
49 tristate "Unix domain sockets"
50 ---help---
51 If you say Y here, you will include support for Unix domain sockets;
52 sockets are the standard Unix mechanism for establishing and
53 accessing network connections. Many commonly used programs such as
54 the X Window system and syslog use these sockets even if your
55 machine is not connected to any network. Unless you are working on
56 an embedded system or something similar, you therefore definitely
57 want to say Y here.
58
59 To compile this driver as a module, choose M here: the module will be
60 called unix. Note that several important services won't work
61 correctly if you say M here and then neglect to load the module.
62
63 Say Y unless you know what you are doing.
64
65config NET_KEY
66 tristate "PF_KEY sockets"
67 select XFRM
68 ---help---
69 PF_KEYv2 socket family, compatible to KAME ones.
70 They are required if you are going to use IPsec tools ported
71 from KAME.
72 29
73 Say Y unless you know what you are doing. 30source "net/packet/Kconfig"
31source "net/unix/Kconfig"
32source "net/xfrm/Kconfig"
74 33
75config INET 34config INET
76 bool "TCP/IP networking" 35 bool "TCP/IP networking"
@@ -94,30 +53,12 @@ config INET
94 53
95 Short answer: say Y. 54 Short answer: say Y.
96 55
56if INET
97source "net/ipv4/Kconfig" 57source "net/ipv4/Kconfig"
98
99# IPv6 as module will cause a CRASH if you try to unload it
100config IPV6
101 tristate "The IPv6 protocol"
102 depends on INET
103 default m
104 select CRYPTO if IPV6_PRIVACY
105 select CRYPTO_MD5 if IPV6_PRIVACY
106 ---help---
107 This is complemental support for the IP version 6.
108 You will still be able to do traditional IPv4 networking as well.
109
110 For general information about IPv6, see
111 <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
112 For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
113 For specific information about IPv6 under Linux, read the HOWTO at
114 <http://www.bieringer.de/linux/IPv6/>.
115
116 To compile this protocol support as a module, choose M here: the
117 module will be called ipv6.
118
119source "net/ipv6/Kconfig" 58source "net/ipv6/Kconfig"
120 59
60endif # if INET
61
121menuconfig NETFILTER 62menuconfig NETFILTER
122 bool "Network packet filtering (replaces ipchains)" 63 bool "Network packet filtering (replaces ipchains)"
123 ---help--- 64 ---help---
@@ -206,269 +147,16 @@ source "net/bridge/netfilter/Kconfig"
206 147
207endif 148endif
208 149
209config XFRM
210 bool
211 depends on NET
212
213source "net/xfrm/Kconfig"
214
215source "net/sctp/Kconfig" 150source "net/sctp/Kconfig"
216 151source "net/atm/Kconfig"
217config ATM 152source "net/bridge/Kconfig"
218 tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)" 153source "net/8021q/Kconfig"
219 depends on EXPERIMENTAL
220 ---help---
221 ATM is a high-speed networking technology for Local Area Networks
222 and Wide Area Networks. It uses a fixed packet size and is
223 connection oriented, allowing for the negotiation of minimum
224 bandwidth requirements.
225
226 In order to participate in an ATM network, your Linux box needs an
227 ATM networking card. If you have that, say Y here and to the driver
228 of your ATM card below.
229
230 Note that you need a set of user-space programs to actually make use
231 of ATM. See the file <file:Documentation/networking/atm.txt> for
232 further details.
233
234config ATM_CLIP
235 tristate "Classical IP over ATM (EXPERIMENTAL)"
236 depends on ATM && INET
237 help
238 Classical IP over ATM for PVCs and SVCs, supporting InARP and
239 ATMARP. If you want to communication with other IP hosts on your ATM
240 network, you will typically either say Y here or to "LAN Emulation
241 (LANE)" below.
242
243config ATM_CLIP_NO_ICMP
244 bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
245 depends on ATM_CLIP
246 help
247 Normally, an "ICMP host unreachable" message is sent if a neighbour
248 cannot be reached because there is no VC to it in the kernel's
249 ATMARP table. This may cause problems when ATMARP table entries are
250 briefly removed during revalidation. If you say Y here, packets to
251 such neighbours are silently discarded instead.
252
253config ATM_LANE
254 tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
255 depends on ATM
256 help
257 LAN Emulation emulates services of existing LANs across an ATM
258 network. Besides operating as a normal ATM end station client, Linux
259 LANE client can also act as an proxy client bridging packets between
260 ELAN and Ethernet segments. You need LANE if you want to try MPOA.
261
262config ATM_MPOA
263 tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
264 depends on ATM && INET && ATM_LANE!=n
265 help
266 Multi-Protocol Over ATM allows ATM edge devices such as routers,
267 bridges and ATM attached hosts establish direct ATM VCs across
268 subnetwork boundaries. These shortcut connections bypass routers
269 enhancing overall network performance.
270
271config ATM_BR2684
272 tristate "RFC1483/2684 Bridged protocols"
273 depends on ATM && INET
274 help
275 ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
276 This device will act like an ethernet from the kernels point of view,
277 with the traffic being carried by ATM PVCs (currently 1 PVC/device).
278 This is sometimes used over DSL lines. If in doubt, say N.
279
280config ATM_BR2684_IPFILTER
281 bool "Per-VC IP filter kludge"
282 depends on ATM_BR2684
283 help
284 This is an experimental mechanism for users who need to terminating a
285 large number of IP-only vcc's. Do not enable this unless you are sure
286 you know what you are doing.
287
288config BRIDGE
289 tristate "802.1d Ethernet Bridging"
290 ---help---
291 If you say Y here, then your Linux box will be able to act as an
292 Ethernet bridge, which means that the different Ethernet segments it
293 is connected to will appear as one Ethernet to the participants.
294 Several such bridges can work together to create even larger
295 networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
296 As this is a standard, Linux bridges will cooperate properly with
297 other third party bridge products.
298
299 In order to use the Ethernet bridge, you'll need the bridge
300 configuration tools; see <file:Documentation/networking/bridge.txt>
301 for location. Please read the Bridge mini-HOWTO for more
302 information.
303
304 If you enable iptables support along with the bridge support then you
305 turn your bridge into a bridging IP firewall.
306 iptables will then see the IP packets being bridged, so you need to
307 take this into account when setting up your firewall rules.
308 Enabling arptables support when bridging will let arptables see
309 bridged ARP traffic in the arptables FORWARD chain.
310
311 To compile this code as a module, choose M here: the module
312 will be called bridge.
313
314 If unsure, say N.
315
316config VLAN_8021Q
317 tristate "802.1Q VLAN Support"
318 ---help---
319 Select this and you will be able to create 802.1Q VLAN interfaces
320 on your ethernet interfaces. 802.1Q VLAN supports almost
321 everything a regular ethernet interface does, including
322 firewalling, bridging, and of course IP traffic. You will need
323 the 'vconfig' tool from the VLAN project in order to effectively
324 use VLANs. See the VLAN web page for more information:
325 <http://www.candelatech.com/~greear/vlan.html>
326
327 To compile this code as a module, choose M here: the module
328 will be called 8021q.
329
330 If unsure, say N.
331
332config DECNET
333 tristate "DECnet Support"
334 ---help---
335 The DECnet networking protocol was used in many products made by
336 Digital (now Compaq). It provides reliable stream and sequenced
337 packet communications over which run a variety of services similar
338 to those which run over TCP/IP.
339
340 To find some tools to use with the kernel layer support, please
341 look at Patrick Caulfield's web site:
342 <http://linux-decnet.sourceforge.net/>.
343
344 More detailed documentation is available in
345 <file:Documentation/networking/decnet.txt>.
346
347 Be sure to say Y to "/proc file system support" and "Sysctl support"
348 below when using DECnet, since you will need sysctl support to aid
349 in configuration at run time.
350
351 The DECnet code is also available as a module ( = code which can be
352 inserted in and removed from the running kernel whenever you want).
353 The module is called decnet.
354
355source "net/decnet/Kconfig" 154source "net/decnet/Kconfig"
356
357source "net/llc/Kconfig" 155source "net/llc/Kconfig"
358
359config IPX
360 tristate "The IPX protocol"
361 select LLC
362 ---help---
363 This is support for the Novell networking protocol, IPX, commonly
364 used for local networks of Windows machines. You need it if you
365 want to access Novell NetWare file or print servers using the Linux
366 Novell client ncpfs (available from
367 <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
368 within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
369 available from <http://www.tldp.org/docs.html#howto>). In order
370 to do the former, you'll also have to say Y to "NCP file system
371 support", below.
372
373 IPX is similar in scope to IP, while SPX, which runs on top of IPX,
374 is similar to TCP. There is also experimental support for SPX in
375 Linux (see "SPX networking", below).
376
377 To turn your Linux box into a fully featured NetWare file server and
378 IPX router, say Y here and fetch either lwared from
379 <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
380 mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
381 information, read the IPX-HOWTO available from
382 <http://www.tldp.org/docs.html#howto>.
383
384 General information about how to connect Linux, Windows machines and
385 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
386
387 The IPX driver would enlarge your kernel by about 16 KB. To compile
388 this driver as a module, choose M here: the module will be called ipx.
389 Unless you want to integrate your Linux box with a local Novell
390 network, say N.
391
392source "net/ipx/Kconfig" 156source "net/ipx/Kconfig"
393
394config ATALK
395 tristate "Appletalk protocol support"
396 select LLC
397 ---help---
398 AppleTalk is the protocol that Apple computers can use to communicate
399 on a network. If your Linux box is connected to such a network and you
400 wish to connect to it, say Y. You will need to use the netatalk package
401 so that your Linux box can act as a print and file server for Macs as
402 well as access AppleTalk printers. Check out
403 <http://www.zettabyte.net/netatalk/> on the WWW for details.
404 EtherTalk is the name used for AppleTalk over Ethernet and the
405 cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
406 network using serial links. EtherTalk and LocalTalk are fully
407 supported by Linux.
408
409 General information about how to connect Linux, Windows machines and
410 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. The
411 NET-3-HOWTO, available from
412 <http://www.tldp.org/docs.html#howto>, contains valuable
413 information as well.
414
415 To compile this driver as a module, choose M here: the module will be
416 called appletalk. You almost certainly want to compile it as a
417 module so you can restart your AppleTalk stack without rebooting
418 your machine. I hear that the GNU boycott of Apple is over, so
419 even politically correct people are allowed to say Y here.
420
421source "drivers/net/appletalk/Kconfig" 157source "drivers/net/appletalk/Kconfig"
422 158source "net/x25/Kconfig"
423config X25 159source "net/lapb/Kconfig"
424 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
425 depends on EXPERIMENTAL
426 ---help---
427 X.25 is a set of standardized network protocols, similar in scope to
428 frame relay; the one physical line from your box to the X.25 network
429 entry point can carry several logical point-to-point connections
430 (called "virtual circuits") to other computers connected to the X.25
431 network. Governments, banks, and other organizations tend to use it
432 to connect to each other or to form Wide Area Networks (WANs). Many
433 countries have public X.25 networks. X.25 consists of two
434 protocols: the higher level Packet Layer Protocol (PLP) (say Y here
435 if you want that) and the lower level data link layer protocol LAPB
436 (say Y to "LAPB Data Link Driver" below if you want that).
437
438 You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
439 <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
440 Information about X.25 for Linux is contained in the files
441 <file:Documentation/networking/x25.txt> and
442 <file:Documentation/networking/x25-iface.txt>.
443
444 One connects to an X.25 network either with a dedicated network card
445 using the X.21 protocol (not yet supported by Linux) or one can do
446 X.25 over a standard telephone line using an ordinary modem (say Y
447 to "X.25 async driver" below) or over Ethernet using an ordinary
448 Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
449 Driver" and "LAPB over Ethernet driver" below).
450
451 To compile this driver as a module, choose M here: the module
452 will be called x25. If unsure, say N.
453
454config LAPB
455 tristate "LAPB Data Link Driver (EXPERIMENTAL)"
456 depends on EXPERIMENTAL
457 ---help---
458 Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
459 the lower) part of the X.25 protocol. It offers a reliable
460 connection service to exchange data frames with one other host, and
461 it is used to transport higher level protocols (mostly X.25 Packet
462 Layer, the higher part of X.25, but others are possible as well).
463 Usually, LAPB is used with specialized X.21 network cards, but Linux
464 currently supports LAPB only over Ethernet connections. If you want
465 to use LAPB connections over Ethernet, say Y here and to "LAPB over
466 Ethernet driver" below. Read
467 <file:Documentation/networking/lapb-module.txt> for technical
468 details.
469
470 To compile this driver as a module, choose M here: the
471 module will be called lapb. If unsure, say N.
472 160
473config NET_DIVERT 161config NET_DIVERT
474 bool "Frame Diverter (EXPERIMENTAL)" 162 bool "Frame Diverter (EXPERIMENTAL)"
@@ -496,107 +184,10 @@ config NET_DIVERT
496 184
497 If unsure, say N. 185 If unsure, say N.
498 186
499config ECONET 187source "net/econet/Kconfig"
500 tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)" 188source "net/wanrouter/Kconfig"
501 depends on EXPERIMENTAL && INET
502 ---help---
503 Econet is a fairly old and slow networking protocol mainly used by
504 Acorn computers to access file and print servers. It uses native
505 Econet network cards. AUN is an implementation of the higher level
506 parts of Econet that runs over ordinary Ethernet connections, on
507 top of the UDP packet protocol, which in turn runs on top of the
508 Internet protocol IP.
509
510 If you say Y here, you can choose with the next two options whether
511 to send Econet/AUN traffic over a UDP Ethernet connection or over
512 a native Econet network card.
513
514 To compile this driver as a module, choose M here: the module
515 will be called econet.
516
517config ECONET_AUNUDP
518 bool "AUN over UDP"
519 depends on ECONET
520 help
521 Say Y here if you want to send Econet/AUN traffic over a UDP
522 connection (UDP is a packet based protocol that runs on top of the
523 Internet protocol IP) using an ordinary Ethernet network card.
524
525config ECONET_NATIVE
526 bool "Native Econet"
527 depends on ECONET
528 help
529 Say Y here if you have a native Econet network card installed in
530 your computer.
531
532config WAN_ROUTER
533 tristate "WAN router"
534 depends on EXPERIMENTAL
535 ---help---
536 Wide Area Networks (WANs), such as X.25, frame relay and leased
537 lines, are used to interconnect Local Area Networks (LANs) over vast
538 distances with data transfer rates significantly higher than those
539 achievable with commonly used asynchronous modem connections.
540 Usually, a quite expensive external device called a `WAN router' is
541 needed to connect to a WAN.
542
543 As an alternative, WAN routing can be built into the Linux kernel.
544 With relatively inexpensive WAN interface cards available on the
545 market, a perfectly usable router can be built for less than half
546 the price of an external router. If you have one of those cards and
547 wish to use your Linux box as a WAN router, say Y here and also to
548 the WAN driver for your card, below. You will then need the
549 wan-tools package which is available from <ftp://ftp.sangoma.com/>.
550 Read <file:Documentation/networking/wan-router.txt> for more
551 information.
552
553 To compile WAN routing support as a module, choose M here: the
554 module will be called wanrouter.
555
556 If unsure, say N.
557
558menu "QoS and/or fair queueing"
559
560config NET_SCHED
561 bool "QoS and/or fair queueing"
562 ---help---
563 When the kernel has several packets to send out over a network
564 device, it has to decide which ones to send first, which ones to
565 delay, and which ones to drop. This is the job of the packet
566 scheduler, and several different algorithms for how to do this
567 "fairly" have been proposed.
568
569 If you say N here, you will get the standard packet scheduler, which
570 is a FIFO (first come, first served). If you say Y here, you will be
571 able to choose from among several alternative algorithms which can
572 then be attached to different network devices. This is useful for
573 example if some of your network devices are real time devices that
574 need a certain minimum data flow rate, or if you need to limit the
575 maximum data flow rate for traffic which matches specified criteria.
576 This code is considered to be experimental.
577
578 To administer these schedulers, you'll need the user-level utilities
579 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
580 That package also contains some documentation; for more, check out
581 <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
582
583 This Quality of Service (QoS) support will enable you to use
584 Differentiated Services (diffserv) and Resource Reservation Protocol
585 (RSVP) on your Linux router if you also say Y to "QoS support",
586 "Packet classifier API" and to some classifiers below. Documentation
587 and software is at <http://diffserv.sourceforge.net/>.
588
589 If you say Y here and to "/proc file system" below, you will be able
590 to read status information about packet schedulers from the file
591 /proc/net/psched.
592
593 The available schedulers are listed in the following questions; you
594 can say Y to as many as you like. If unsure, say N now.
595
596source "net/sched/Kconfig" 189source "net/sched/Kconfig"
597 190
598endmenu
599
600menu "Network testing" 191menu "Network testing"
601 192
602config NET_PKTGEN 193config NET_PKTGEN
@@ -618,29 +209,10 @@ endmenu
618 209
619endmenu 210endmenu
620 211
621config NETPOLL
622 def_bool NETCONSOLE
623
624config NETPOLL_RX
625 bool "Netpoll support for trapping incoming packets"
626 default n
627 depends on NETPOLL
628
629config NETPOLL_TRAP
630 bool "Netpoll traffic trapping"
631 default n
632 depends on NETPOLL
633
634config NET_POLL_CONTROLLER
635 def_bool NETPOLL
636
637source "net/ax25/Kconfig" 212source "net/ax25/Kconfig"
638
639source "net/irda/Kconfig" 213source "net/irda/Kconfig"
640
641source "net/bluetooth/Kconfig" 214source "net/bluetooth/Kconfig"
642 215
643source "drivers/net/Kconfig" 216endif # if NET
644 217endmenu # Networking
645endmenu
646 218
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 10d0404610..c34614ea5f 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -35,6 +35,7 @@
35#include <net/datalink.h> 35#include <net/datalink.h>
36#include <net/psnap.h> 36#include <net/psnap.h>
37#include <linux/atalk.h> 37#include <linux/atalk.h>
38#include <linux/delay.h>
38#include <linux/init.h> 39#include <linux/init.h>
39#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
40#include <linux/seq_file.h> 41#include <linux/seq_file.h>
@@ -462,8 +463,7 @@ void aarp_probe_network(struct atalk_iface *atif)
462 aarp_send_probe(atif->dev, &atif->address); 463 aarp_send_probe(atif->dev, &atif->address);
463 464
464 /* Defer 1/10th */ 465 /* Defer 1/10th */
465 current->state = TASK_INTERRUPTIBLE; 466 msleep(100);
466 schedule_timeout(HZ / 10);
467 467
468 if (atif->status & ATIF_PROBE_FAIL) 468 if (atif->status & ATIF_PROBE_FAIL)
469 break; 469 break;
@@ -510,9 +510,8 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
510 aarp_send_probe(atif->dev, sa); 510 aarp_send_probe(atif->dev, sa);
511 511
512 /* Defer 1/10th */ 512 /* Defer 1/10th */
513 current->state = TASK_INTERRUPTIBLE;
514 write_unlock_bh(&aarp_lock); 513 write_unlock_bh(&aarp_lock);
515 schedule_timeout(HZ / 10); 514 msleep(100);
516 write_lock_bh(&aarp_lock); 515 write_lock_bh(&aarp_lock);
517 516
518 if (entry->status & ATIF_PROBE_FAIL) 517 if (entry->status & ATIF_PROBE_FAIL)
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 0000000000..21ff276b2d
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,74 @@
1#
2# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)
3#
4
5config ATM
6 tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 ATM is a high-speed networking technology for Local Area Networks
10 and Wide Area Networks. It uses a fixed packet size and is
11 connection oriented, allowing for the negotiation of minimum
12 bandwidth requirements.
13
14 In order to participate in an ATM network, your Linux box needs an
15 ATM networking card. If you have that, say Y here and to the driver
16 of your ATM card below.
17
18 Note that you need a set of user-space programs to actually make use
19 of ATM. See the file <file:Documentation/networking/atm.txt> for
20 further details.
21
22config ATM_CLIP
23 tristate "Classical IP over ATM (EXPERIMENTAL)"
24 depends on ATM && INET
25 help
26 Classical IP over ATM for PVCs and SVCs, supporting InARP and
27 ATMARP. If you want to communication with other IP hosts on your ATM
28 network, you will typically either say Y here or to "LAN Emulation
29 (LANE)" below.
30
31config ATM_CLIP_NO_ICMP
32 bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
33 depends on ATM_CLIP
34 help
35 Normally, an "ICMP host unreachable" message is sent if a neighbour
36 cannot be reached because there is no VC to it in the kernel's
37 ATMARP table. This may cause problems when ATMARP table entries are
38 briefly removed during revalidation. If you say Y here, packets to
39 such neighbours are silently discarded instead.
40
41config ATM_LANE
42 tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
43 depends on ATM
44 help
45 LAN Emulation emulates services of existing LANs across an ATM
46 network. Besides operating as a normal ATM end station client, Linux
47 LANE client can also act as an proxy client bridging packets between
48 ELAN and Ethernet segments. You need LANE if you want to try MPOA.
49
50config ATM_MPOA
51 tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
52 depends on ATM && INET && ATM_LANE!=n
53 help
54 Multi-Protocol Over ATM allows ATM edge devices such as routers,
55 bridges and ATM attached hosts establish direct ATM VCs across
56 subnetwork boundaries. These shortcut connections bypass routers
57 enhancing overall network performance.
58
59config ATM_BR2684
60 tristate "RFC1483/2684 Bridged protocols"
61 depends on ATM && INET
62 help
63 ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483)
64 This device will act like an ethernet from the kernels point of view,
65 with the traffic being carried by ATM PVCs (currently 1 PVC/device).
66 This is sometimes used over DSL lines. If in doubt, say N.
67
68config ATM_BR2684_IPFILTER
69 bool "Per-VC IP filter kludge"
70 depends on ATM_BR2684
71 help
72 This is an experimental mechanism for users who need to terminate a
73 large number of IP-only vcc's. Do not enable this unless you are sure
74 you know what you are doing.
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index e6954cf145..289956c4dd 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -289,8 +289,7 @@ xmit will add the additional header part in that case */
289 * This is similar to eth_type_trans, which cannot be used because of 289 * This is similar to eth_type_trans, which cannot be used because of
290 * our dev->hard_header_len 290 * our dev->hard_header_len
291 */ 291 */
292static inline unsigned short br_type_trans(struct sk_buff *skb, 292static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
293 struct net_device *dev)
294{ 293{
295 struct ethhdr *eth; 294 struct ethhdr *eth;
296 unsigned char *rawp; 295 unsigned char *rawp;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 02f5374a51..08e46052a3 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -118,10 +118,6 @@ static int svc_bind(struct socket *sock,struct sockaddr *sockaddr,
118 goto out; 118 goto out;
119 } 119 }
120 vcc = ATM_SD(sock); 120 vcc = ATM_SD(sock);
121 if (test_bit(ATM_VF_SESSION, &vcc->flags)) {
122 error = -EINVAL;
123 goto out;
124 }
125 addr = (struct sockaddr_atmsvc *) sockaddr; 121 addr = (struct sockaddr_atmsvc *) sockaddr;
126 if (addr->sas_family != AF_ATMSVC) { 122 if (addr->sas_family != AF_ATMSVC) {
127 error = -EAFNOSUPPORT; 123 error = -EAFNOSUPPORT;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 707097deac..a5c94f1154 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -875,12 +875,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
875 sk->sk_sndbuf = osk->sk_sndbuf; 875 sk->sk_sndbuf = osk->sk_sndbuf;
876 sk->sk_state = TCP_ESTABLISHED; 876 sk->sk_state = TCP_ESTABLISHED;
877 sk->sk_sleep = osk->sk_sleep; 877 sk->sk_sleep = osk->sk_sleep;
878 878 sock_copy_flags(sk, osk);
879 if (sock_flag(osk, SOCK_DBG))
880 sock_set_flag(sk, SOCK_DBG);
881
882 if (sock_flag(osk, SOCK_ZAPPED))
883 sock_set_flag(sk, SOCK_ZAPPED);
884 879
885 oax25 = ax25_sk(osk); 880 oax25 = ax25_sk(osk);
886 881
@@ -1007,7 +1002,8 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1007 struct sock *sk = sock->sk; 1002 struct sock *sk = sock->sk;
1008 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; 1003 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
1009 ax25_dev *ax25_dev = NULL; 1004 ax25_dev *ax25_dev = NULL;
1010 ax25_address *call; 1005 ax25_uid_assoc *user;
1006 ax25_address call;
1011 ax25_cb *ax25; 1007 ax25_cb *ax25;
1012 int err = 0; 1008 int err = 0;
1013 1009
@@ -1026,9 +1022,15 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1026 if (addr->fsa_ax25.sax25_family != AF_AX25) 1022 if (addr->fsa_ax25.sax25_family != AF_AX25)
1027 return -EINVAL; 1023 return -EINVAL;
1028 1024
1029 call = ax25_findbyuid(current->euid); 1025 user = ax25_findbyuid(current->euid);
1030 if (call == NULL && ax25_uid_policy && !capable(CAP_NET_ADMIN)) { 1026 if (user) {
1031 return -EACCES; 1027 call = user->call;
1028 ax25_uid_put(user);
1029 } else {
1030 if (ax25_uid_policy && !capable(CAP_NET_ADMIN))
1031 return -EACCES;
1032
1033 call = addr->fsa_ax25.sax25_call;
1032 } 1034 }
1033 1035
1034 lock_sock(sk); 1036 lock_sock(sk);
@@ -1039,10 +1041,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1039 goto out; 1041 goto out;
1040 } 1042 }
1041 1043
1042 if (call == NULL) 1044 ax25->source_addr = call;
1043 ax25->source_addr = addr->fsa_ax25.sax25_call;
1044 else
1045 ax25->source_addr = *call;
1046 1045
1047 /* 1046 /*
1048 * User already set interface with SO_BINDTODEVICE 1047 * User already set interface with SO_BINDTODEVICE
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 44b99b1ff9..c288526da4 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -422,8 +422,8 @@ static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
422 */ 422 */
423int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) 423int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
424{ 424{
425 ax25_uid_assoc *user;
425 ax25_route *ax25_rt; 426 ax25_route *ax25_rt;
426 ax25_address *call;
427 int err; 427 int err;
428 428
429 if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) 429 if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
@@ -434,16 +434,18 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
434 goto put; 434 goto put;
435 } 435 }
436 436
437 if ((call = ax25_findbyuid(current->euid)) == NULL) { 437 user = ax25_findbyuid(current->euid);
438 if (user) {
439 ax25->source_addr = user->call;
440 ax25_uid_put(user);
441 } else {
438 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { 442 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
439 err = -EPERM; 443 err = -EPERM;
440 goto put; 444 goto put;
441 } 445 }
442 call = (ax25_address *)ax25->ax25_dev->dev->dev_addr; 446 ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
443 } 447 }
444 448
445 ax25->source_addr = *call;
446
447 if (ax25_rt->digipeat != NULL) { 449 if (ax25_rt->digipeat != NULL) {
448 if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { 450 if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
449 err = -ENOMEM; 451 err = -ENOMEM;
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index cea6b7d197..a8b3822f3e 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -28,6 +28,7 @@
28#include <linux/fcntl.h> 28#include <linux/fcntl.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/interrupt.h> 30#include <linux/interrupt.h>
31#include <linux/list.h>
31#include <linux/notifier.h> 32#include <linux/notifier.h>
32#include <linux/proc_fs.h> 33#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 34#include <linux/seq_file.h>
@@ -41,38 +42,41 @@
41 * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines. 42 * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines.
42 */ 43 */
43 44
44static ax25_uid_assoc *ax25_uid_list; 45HLIST_HEAD(ax25_uid_list);
45static DEFINE_RWLOCK(ax25_uid_lock); 46static DEFINE_RWLOCK(ax25_uid_lock);
46 47
47int ax25_uid_policy = 0; 48int ax25_uid_policy = 0;
48 49
49ax25_address *ax25_findbyuid(uid_t uid) 50ax25_uid_assoc *ax25_findbyuid(uid_t uid)
50{ 51{
51 ax25_uid_assoc *ax25_uid; 52 ax25_uid_assoc *ax25_uid, *res = NULL;
52 ax25_address *res = NULL; 53 struct hlist_node *node;
53 54
54 read_lock(&ax25_uid_lock); 55 read_lock(&ax25_uid_lock);
55 for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { 56 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
56 if (ax25_uid->uid == uid) { 57 if (ax25_uid->uid == uid) {
57 res = &ax25_uid->call; 58 ax25_uid_hold(ax25_uid);
59 res = ax25_uid;
58 break; 60 break;
59 } 61 }
60 } 62 }
61 read_unlock(&ax25_uid_lock); 63 read_unlock(&ax25_uid_lock);
62 64
63 return NULL; 65 return res;
64} 66}
65 67
66int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) 68int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
67{ 69{
68 ax25_uid_assoc *s, *ax25_uid; 70 ax25_uid_assoc *ax25_uid;
71 struct hlist_node *node;
72 ax25_uid_assoc *user;
69 unsigned long res; 73 unsigned long res;
70 74
71 switch (cmd) { 75 switch (cmd) {
72 case SIOCAX25GETUID: 76 case SIOCAX25GETUID:
73 res = -ENOENT; 77 res = -ENOENT;
74 read_lock(&ax25_uid_lock); 78 read_lock(&ax25_uid_lock);
75 for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { 79 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
76 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { 80 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
77 res = ax25_uid->uid; 81 res = ax25_uid->uid;
78 break; 82 break;
@@ -85,19 +89,22 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
85 case SIOCAX25ADDUID: 89 case SIOCAX25ADDUID:
86 if (!capable(CAP_NET_ADMIN)) 90 if (!capable(CAP_NET_ADMIN))
87 return -EPERM; 91 return -EPERM;
88 if (ax25_findbyuid(sax->sax25_uid)) 92 user = ax25_findbyuid(sax->sax25_uid);
93 if (user) {
94 ax25_uid_put(user);
89 return -EEXIST; 95 return -EEXIST;
96 }
90 if (sax->sax25_uid == 0) 97 if (sax->sax25_uid == 0)
91 return -EINVAL; 98 return -EINVAL;
92 if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL) 99 if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL)
93 return -ENOMEM; 100 return -ENOMEM;
94 101
102 atomic_set(&ax25_uid->refcount, 1);
95 ax25_uid->uid = sax->sax25_uid; 103 ax25_uid->uid = sax->sax25_uid;
96 ax25_uid->call = sax->sax25_call; 104 ax25_uid->call = sax->sax25_call;
97 105
98 write_lock(&ax25_uid_lock); 106 write_lock(&ax25_uid_lock);
99 ax25_uid->next = ax25_uid_list; 107 hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list);
100 ax25_uid_list = ax25_uid;
101 write_unlock(&ax25_uid_lock); 108 write_unlock(&ax25_uid_lock);
102 109
103 return 0; 110 return 0;
@@ -106,34 +113,21 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
106 if (!capable(CAP_NET_ADMIN)) 113 if (!capable(CAP_NET_ADMIN))
107 return -EPERM; 114 return -EPERM;
108 115
116 ax25_uid = NULL;
109 write_lock(&ax25_uid_lock); 117 write_lock(&ax25_uid_lock);
110 for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { 118 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
111 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { 119 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0)
112 break; 120 break;
113 }
114 } 121 }
115 if (ax25_uid == NULL) { 122 if (ax25_uid == NULL) {
116 write_unlock(&ax25_uid_lock); 123 write_unlock(&ax25_uid_lock);
117 return -ENOENT; 124 return -ENOENT;
118 } 125 }
119 if ((s = ax25_uid_list) == ax25_uid) { 126 hlist_del_init(&ax25_uid->uid_node);
120 ax25_uid_list = s->next; 127 ax25_uid_put(ax25_uid);
121 write_unlock(&ax25_uid_lock);
122 kfree(ax25_uid);
123 return 0;
124 }
125 while (s != NULL && s->next != NULL) {
126 if (s->next == ax25_uid) {
127 s->next = ax25_uid->next;
128 write_unlock(&ax25_uid_lock);
129 kfree(ax25_uid);
130 return 0;
131 }
132 s = s->next;
133 }
134 write_unlock(&ax25_uid_lock); 128 write_unlock(&ax25_uid_lock);
135 129
136 return -ENOENT; 130 return 0;
137 131
138 default: 132 default:
139 return -EINVAL; 133 return -EINVAL;
@@ -147,13 +141,11 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
147static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos) 141static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
148{ 142{
149 struct ax25_uid_assoc *pt; 143 struct ax25_uid_assoc *pt;
150 int i = 1; 144 struct hlist_node *node;
145 int i = 0;
151 146
152 read_lock(&ax25_uid_lock); 147 read_lock(&ax25_uid_lock);
153 if (*pos == 0) 148 ax25_uid_for_each(pt, node, &ax25_uid_list) {
154 return SEQ_START_TOKEN;
155
156 for (pt = ax25_uid_list; pt != NULL; pt = pt->next) {
157 if (i == *pos) 149 if (i == *pos)
158 return pt; 150 return pt;
159 ++i; 151 ++i;
@@ -164,8 +156,9 @@ static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
164static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos) 156static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
165{ 157{
166 ++*pos; 158 ++*pos;
167 return (v == SEQ_START_TOKEN) ? ax25_uid_list : 159
168 ((struct ax25_uid_assoc *) v)->next; 160 return hlist_entry(((ax25_uid_assoc *)v)->uid_node.next,
161 ax25_uid_assoc, uid_node);
169} 162}
170 163
171static void ax25_uid_seq_stop(struct seq_file *seq, void *v) 164static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
@@ -179,7 +172,6 @@ static int ax25_uid_seq_show(struct seq_file *seq, void *v)
179 seq_printf(seq, "Policy: %d\n", ax25_uid_policy); 172 seq_printf(seq, "Policy: %d\n", ax25_uid_policy);
180 else { 173 else {
181 struct ax25_uid_assoc *pt = v; 174 struct ax25_uid_assoc *pt = v;
182
183 175
184 seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call)); 176 seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call));
185 } 177 }
@@ -213,16 +205,13 @@ struct file_operations ax25_uid_fops = {
213 */ 205 */
214void __exit ax25_uid_free(void) 206void __exit ax25_uid_free(void)
215{ 207{
216 ax25_uid_assoc *s, *ax25_uid; 208 ax25_uid_assoc *ax25_uid;
209 struct hlist_node *node;
217 210
218 write_lock(&ax25_uid_lock); 211 write_lock(&ax25_uid_lock);
219 ax25_uid = ax25_uid_list; 212 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
220 while (ax25_uid != NULL) { 213 hlist_del_init(&ax25_uid->uid_node);
221 s = ax25_uid; 214 ax25_uid_put(ax25_uid);
222 ax25_uid = ax25_uid->next;
223
224 kfree(s);
225 } 215 }
226 ax25_uid_list = NULL;
227 write_unlock(&ax25_uid_lock); 216 write_unlock(&ax25_uid_lock);
228} 217}
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 2e341de3e7..901eff7ebe 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in
213 return kernel_sendmsg(sock, &msg, &iv, 1, len); 213 return kernel_sendmsg(sock, &msg, &iv, 1, len);
214} 214}
215 215
216static int cmtp_process_transmit(struct cmtp_session *session) 216static void cmtp_process_transmit(struct cmtp_session *session)
217{ 217{
218 struct sk_buff *skb, *nskb; 218 struct sk_buff *skb, *nskb;
219 unsigned char *hdr; 219 unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session)
223 223
224 if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) { 224 if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
225 BT_ERR("Can't allocate memory for new frame"); 225 BT_ERR("Can't allocate memory for new frame");
226 return -ENOMEM; 226 return;
227 } 227 }
228 228
229 while ((skb = skb_dequeue(&session->transmit))) { 229 while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session)
275 cmtp_send_frame(session, nskb->data, nskb->len); 275 cmtp_send_frame(session, nskb->data, nskb->len);
276 276
277 kfree_skb(nskb); 277 kfree_skb(nskb);
278
279 return skb_queue_len(&session->transmit);
280} 278}
281 279
282static int cmtp_session(void *arg) 280static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index fb5524365b..ffa26c10bf 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -299,7 +299,6 @@ struct hci_dev *hci_dev_get(int index)
299 read_unlock(&hci_dev_list_lock); 299 read_unlock(&hci_dev_list_lock);
300 return hdev; 300 return hdev;
301} 301}
302EXPORT_SYMBOL(hci_dev_get);
303 302
304/* ---- Inquiry support ---- */ 303/* ---- Inquiry support ---- */
305static void inquiry_cache_flush(struct hci_dev *hdev) 304static void inquiry_cache_flush(struct hci_dev *hdev)
@@ -1042,7 +1041,6 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p
1042 1041
1043 return 0; 1042 return 0;
1044} 1043}
1045EXPORT_SYMBOL(hci_send_cmd);
1046 1044
1047/* Get data from the previously sent command */ 1045/* Get data from the previously sent command */
1048void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf) 1046void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c4b592b4ef..46367bd129 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1035,9 +1035,11 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
1035 ev->type = type; 1035 ev->type = type;
1036 memcpy(ev->data, data, dlen); 1036 memcpy(ev->data, data, dlen);
1037 1037
1038 bt_cb(skb)->incoming = 1;
1039 do_gettimeofday(&skb->stamp);
1040
1038 skb->pkt_type = HCI_EVENT_PKT; 1041 skb->pkt_type = HCI_EVENT_PKT;
1039 skb->dev = (void *) hdev; 1042 skb->dev = (void *) hdev;
1040 hci_send_to_sock(hdev, skb); 1043 hci_send_to_sock(hdev, skb);
1041 kfree_skb(skb); 1044 kfree_skb(skb);
1042} 1045}
1043EXPORT_SYMBOL(hci_si_event);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index affbc55462..de8af5f423 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
428 return kernel_sendmsg(sock, &msg, &iv, 1, len); 428 return kernel_sendmsg(sock, &msg, &iv, 1, len);
429} 429}
430 430
431static int hidp_process_transmit(struct hidp_session *session) 431static void hidp_process_transmit(struct hidp_session *session)
432{ 432{
433 struct sk_buff *skb; 433 struct sk_buff *skb;
434 434
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session)
453 hidp_set_timer(session); 453 hidp_set_timer(session);
454 kfree_skb(skb); 454 kfree_skb(skb);
455 } 455 }
456
457 return skb_queue_len(&session->ctrl_transmit) +
458 skb_queue_len(&session->intr_transmit);
459} 456}
460 457
461static int hidp_session(void *arg) 458static int hidp_session(void *arg)
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 9efb0a0936..ee6a669799 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -34,31 +34,6 @@
34 34
35#include <net/bluetooth/bluetooth.h> 35#include <net/bluetooth/bluetooth.h>
36 36
37void bt_dump(char *pref, __u8 *buf, int count)
38{
39 char *ptr;
40 char line[100];
41 unsigned int i;
42
43 printk(KERN_INFO "%s: dump, len %d\n", pref, count);
44
45 ptr = line;
46 *ptr = 0;
47 for (i = 0; i < count; i++) {
48 ptr += sprintf(ptr, " %2.2X", buf[i]);
49
50 if (i && !((i + 1) % 20)) {
51 printk(KERN_INFO "%s:%s\n", pref, line);
52 ptr = line;
53 *ptr = 0;
54 }
55 }
56
57 if (line[0])
58 printk(KERN_INFO "%s:%s\n", pref, line);
59}
60EXPORT_SYMBOL(bt_dump);
61
62void baswap(bdaddr_t *dst, bdaddr_t *src) 37void baswap(bdaddr_t *dst, bdaddr_t *src)
63{ 38{
64 unsigned char *d = (unsigned char *) dst; 39 unsigned char *d = (unsigned char *) dst;
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index e9e6fda66f..27bf5047cd 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -389,8 +389,6 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
389 rfcomm_dlc_unlock(d); 389 rfcomm_dlc_unlock(d);
390 390
391 skb_queue_purge(&d->tx_queue); 391 skb_queue_purge(&d->tx_queue);
392 rfcomm_session_put(s);
393
394 rfcomm_dlc_unlink(d); 392 rfcomm_dlc_unlink(d);
395 } 393 }
396 394
@@ -600,8 +598,6 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
600 goto failed; 598 goto failed;
601 } 599 }
602 600
603 rfcomm_session_hold(s);
604
605 s->initiator = 1; 601 s->initiator = 1;
606 602
607 bacpy(&addr.l2_bdaddr, dst); 603 bacpy(&addr.l2_bdaddr, dst);
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index f3f6355a27..63a123c5c4 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
590 for (;;) { 590 for (;;) {
591 set_current_state(TASK_INTERRUPTIBLE); 591 set_current_state(TASK_INTERRUPTIBLE);
592 592
593 if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || 593 if (!skb_queue_empty(&sk->sk_receive_queue) ||
594 signal_pending(current) || !timeo) 594 sk->sk_err ||
595 (sk->sk_shutdown & RCV_SHUTDOWN) ||
596 signal_pending(current) ||
597 !timeo)
595 break; 598 break;
596 599
597 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); 600 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6d689200bc..6304590fd3 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
781 781
782 BT_DBG("tty %p dev %p", tty, dev); 782 BT_DBG("tty %p dev %p", tty, dev);
783 783
784 if (skb_queue_len(&dlc->tx_queue)) 784 if (!skb_queue_empty(&dlc->tx_queue))
785 return dlc->mtu; 785 return dlc->mtu;
786 786
787 return 0; 787 return 0;
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 0000000000..db23d59746
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,31 @@
1#
2# 802.1d Ethernet Bridging
3#
4
5config BRIDGE
6 tristate "802.1d Ethernet Bridging"
7 ---help---
8 If you say Y here, then your Linux box will be able to act as an
9 Ethernet bridge, which means that the different Ethernet segments it
10 is connected to will appear as one Ethernet to the participants.
11 Several such bridges can work together to create even larger
12 networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
13 As this is a standard, Linux bridges will cooperate properly with
14 other third party bridge products.
15
16 In order to use the Ethernet bridge, you'll need the bridge
17 configuration tools; see <file:Documentation/networking/bridge.txt>
18 for location. Please read the Bridge mini-HOWTO for more
19 information.
20
21 If you enable iptables support along with the bridge support then you
22 turn your bridge into a bridging IP firewall.
23 iptables will then see the IP packets being bridged, so you need to
24 take this into account when setting up your firewall rules.
25 Enabling arptables support when bridging will let arptables see
26 bridged ARP traffic in the arptables FORWARD chain.
27
28 To compile this code as a module, choose M here: the module
29 will be called bridge.
30
31 If unsure, say N.
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 03ae4eddda..2d52fee63a 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -844,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
844 * doesn't use the bridge parent of the indev by using 844 * doesn't use the bridge parent of the indev by using
845 * the BRNF_DONT_TAKE_PARENT mask. */ 845 * the BRNF_DONT_TAKE_PARENT mask. */
846 if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) { 846 if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
847 nf_bridge->mask &= BRNF_DONT_TAKE_PARENT; 847 nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
848 nf_bridge->physindev = (struct net_device *)in; 848 nf_bridge->physindev = (struct net_device *)in;
849 } 849 }
850#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) 850#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 68ccef507b..c70b3be230 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -138,7 +138,7 @@ config BRIDGE_EBT_VLAN
138# 138#
139config BRIDGE_EBT_ARPREPLY 139config BRIDGE_EBT_ARPREPLY
140 tristate "ebt: arp reply target support" 140 tristate "ebt: arp reply target support"
141 depends on BRIDGE_NF_EBTABLES 141 depends on BRIDGE_NF_EBTABLES && INET
142 help 142 help
143 This option adds the arp reply target, which allows 143 This option adds the arp reply target, which allows
144 automatically sending arp replies to arp requests. 144 automatically sending arp replies to arp requests.
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e4ae34b889..662975be3d 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
61{ 61{
62 struct ebt_log_info *info = (struct ebt_log_info *)data; 62 struct ebt_log_info *info = (struct ebt_log_info *)data;
63 char level_string[4] = "< >"; 63 char level_string[4] = "< >";
64 union {struct iphdr iph; struct tcpudphdr ports;
65 struct arphdr arph; struct arppayload arpp;} u;
66 64
67 level_string[1] = '0' + info->loglevel; 65 level_string[1] = '0' + info->loglevel;
68 spin_lock_bh(&ebt_log_lock); 66 spin_lock_bh(&ebt_log_lock);
@@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
88 } 86 }
89 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", 87 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
90 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); 88 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
91 printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos, 89 printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
92 ih->protocol); 90 ih->protocol);
93 if (ih->protocol == IPPROTO_TCP || 91 if (ih->protocol == IPPROTO_TCP ||
94 ih->protocol == IPPROTO_UDP) { 92 ih->protocol == IPPROTO_UDP) {
@@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
127 ah->ar_pln == sizeof(uint32_t)) { 125 ah->ar_pln == sizeof(uint32_t)) {
128 struct arppayload _arpp, *ap; 126 struct arppayload _arpp, *ap;
129 127
130 ap = skb_header_pointer(skb, sizeof(u.arph), 128 ap = skb_header_pointer(skb, sizeof(_arph),
131 sizeof(_arpp), &_arpp); 129 sizeof(_arpp), &_arpp);
132 if (ap == NULL) { 130 if (ap == NULL) {
133 printk(" INCOMPLETE ARP payload"); 131 printk(" INCOMPLETE ARP payload");
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 18ebc66476..c4540144f0 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -859,8 +859,7 @@ static int translate_table(struct ebt_replace *repl,
859 if (repl->valid_hooks & (1 << i)) 859 if (repl->valid_hooks & (1 << i))
860 if (check_chainloops(newinfo->hook_entry[i], 860 if (check_chainloops(newinfo->hook_entry[i],
861 cl_s, udc_cnt, i, newinfo->entries)) { 861 cl_s, udc_cnt, i, newinfo->entries)) {
862 if (cl_s) 862 vfree(cl_s);
863 vfree(cl_s);
864 return -EINVAL; 863 return -EINVAL;
865 } 864 }
866 865
@@ -883,8 +882,7 @@ static int translate_table(struct ebt_replace *repl,
883 EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, 882 EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
884 ebt_cleanup_entry, &i); 883 ebt_cleanup_entry, &i);
885 } 884 }
886 if (cl_s) 885 vfree(cl_s);
887 vfree(cl_s);
888 return ret; 886 return ret;
889} 887}
890 888
@@ -1030,8 +1028,7 @@ static int do_replace(void __user *user, unsigned int len)
1030 } 1028 }
1031 vfree(table); 1029 vfree(table);
1032 1030
1033 if (counterstmp) 1031 vfree(counterstmp);
1034 vfree(counterstmp);
1035 return ret; 1032 return ret;
1036 1033
1037free_unlock: 1034free_unlock:
@@ -1040,8 +1037,7 @@ free_iterate:
1040 EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, 1037 EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
1041 ebt_cleanup_entry, NULL); 1038 ebt_cleanup_entry, NULL);
1042free_counterstmp: 1039free_counterstmp:
1043 if (counterstmp) 1040 vfree(counterstmp);
1044 vfree(counterstmp);
1045 /* can be initialized in translate_table() */ 1041 /* can be initialized in translate_table() */
1046 if (newinfo->chainstack) { 1042 if (newinfo->chainstack) {
1047 for (i = 0; i < num_possible_cpus(); i++) 1043 for (i = 0; i < num_possible_cpus(); i++)
@@ -1049,11 +1045,9 @@ free_counterstmp:
1049 vfree(newinfo->chainstack); 1045 vfree(newinfo->chainstack);
1050 } 1046 }
1051free_entries: 1047free_entries:
1052 if (newinfo->entries) 1048 vfree(newinfo->entries);
1053 vfree(newinfo->entries);
1054free_newinfo: 1049free_newinfo:
1055 if (newinfo) 1050 vfree(newinfo);
1056 vfree(newinfo);
1057 return ret; 1051 return ret;
1058} 1052}
1059 1053
@@ -1213,8 +1207,7 @@ void ebt_unregister_table(struct ebt_table *table)
1213 down(&ebt_mutex); 1207 down(&ebt_mutex);
1214 LIST_DELETE(&ebt_tables, table); 1208 LIST_DELETE(&ebt_tables, table);
1215 up(&ebt_mutex); 1209 up(&ebt_mutex);
1216 if (table->private->entries) 1210 vfree(table->private->entries);
1217 vfree(table->private->entries);
1218 if (table->private->chainstack) { 1211 if (table->private->chainstack) {
1219 for (i = 0; i < num_possible_cpus(); i++) 1212 for (i = 0; i < num_possible_cpus(); i++)
1220 vfree(table->private->chainstack[i]); 1213 vfree(table->private->chainstack[i]);
diff --git a/net/compat.c b/net/compat.c
index be5d936dc4..d99ab96958 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -91,20 +91,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
91 } else 91 } else
92 kern_msg->msg_name = NULL; 92 kern_msg->msg_name = NULL;
93 93
94 if(kern_msg->msg_iovlen > UIO_FASTIOV) {
95 kern_iov = kmalloc(kern_msg->msg_iovlen * sizeof(struct iovec),
96 GFP_KERNEL);
97 if(!kern_iov)
98 return -ENOMEM;
99 }
100
101 tot_len = iov_from_user_compat_to_kern(kern_iov, 94 tot_len = iov_from_user_compat_to_kern(kern_iov,
102 (struct compat_iovec __user *)kern_msg->msg_iov, 95 (struct compat_iovec __user *)kern_msg->msg_iov,
103 kern_msg->msg_iovlen); 96 kern_msg->msg_iovlen);
104 if(tot_len >= 0) 97 if(tot_len >= 0)
105 kern_msg->msg_iov = kern_iov; 98 kern_msg->msg_iov = kern_iov;
106 else if(kern_msg->msg_iovlen > UIO_FASTIOV)
107 kfree(kern_iov);
108 99
109 return tot_len; 100 return tot_len;
110} 101}
diff --git a/net/core/Makefile b/net/core/Makefile
index 5e0c56b7f6..f5f5e58943 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -7,9 +7,10 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
7 7
8obj-$(CONFIG_SYSCTL) += sysctl_net_core.o 8obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
9 9
10obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \ 10obj-y += dev.o ethtool.o dev_mcast.o dst.o \
11 neighbour.o rtnetlink.o utils.o link_watch.o filter.o 11 neighbour.o rtnetlink.o utils.o link_watch.o filter.o
12 12
13obj-$(CONFIG_XFRM) += flow.o
13obj-$(CONFIG_SYSFS) += net-sysfs.o 14obj-$(CONFIG_SYSFS) += net-sysfs.o
14obj-$(CONFIG_NETFILTER) += netfilter.o 15obj-$(CONFIG_NETFILTER) += netfilter.o
15obj-$(CONFIG_NET_DIVERT) += dv.o 16obj-$(CONFIG_NET_DIVERT) += dv.o
diff --git a/net/core/dev.c b/net/core/dev.c
index ab935778ce..faf59b02c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
115#endif /* CONFIG_NET_RADIO */ 115#endif /* CONFIG_NET_RADIO */
116#include <asm/current.h> 116#include <asm/current.h>
117 117
118/* This define, if set, will randomly drop a packet when congestion
119 * is more than moderate. It helps fairness in the multi-interface
120 * case when one of them is a hog, but it kills performance for the
121 * single interface case so it is off now by default.
122 */
123#undef RAND_LIE
124
125/* Setting this will sample the queue lengths and thus congestion
126 * via a timer instead of as each packet is received.
127 */
128#undef OFFLINE_SAMPLE
129
130/* 118/*
131 * The list of packet types we will receive (as opposed to discard) 119 * The list of packet types we will receive (as opposed to discard)
132 * and the routines to invoke. 120 * and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
159static struct list_head ptype_base[16]; /* 16 way hashed list */ 147static struct list_head ptype_base[16]; /* 16 way hashed list */
160static struct list_head ptype_all; /* Taps */ 148static struct list_head ptype_all; /* Taps */
161 149
162#ifdef OFFLINE_SAMPLE
163static void sample_queue(unsigned long dummy);
164static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
165#endif
166
167/* 150/*
168 * The @dev_base list is protected by @dev_base_lock and the rtnl 151 * The @dev_base list is protected by @dev_base_lock and the rtnl
169 * semaphore. 152 * semaphore.
@@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain;
215 * Device drivers call our routines to queue packets here. We empty the 198 * Device drivers call our routines to queue packets here. We empty the
216 * queue in the local softnet handler. 199 * queue in the local softnet handler.
217 */ 200 */
218DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; 201DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
219 202
220#ifdef CONFIG_SYSFS 203#ifdef CONFIG_SYSFS
221extern int netdev_sysfs_init(void); 204extern int netdev_sysfs_init(void);
@@ -918,8 +901,7 @@ int dev_close(struct net_device *dev)
918 smp_mb__after_clear_bit(); /* Commit netif_running(). */ 901 smp_mb__after_clear_bit(); /* Commit netif_running(). */
919 while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { 902 while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
920 /* No hurry. */ 903 /* No hurry. */
921 current->state = TASK_INTERRUPTIBLE; 904 msleep(1);
922 schedule_timeout(1);
923 } 905 }
924 906
925 /* 907 /*
@@ -1144,7 +1126,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1144extern void skb_release_data(struct sk_buff *); 1126extern void skb_release_data(struct sk_buff *);
1145 1127
1146/* Keep head the same: replace data */ 1128/* Keep head the same: replace data */
1147int __skb_linearize(struct sk_buff *skb, int gfp_mask) 1129int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
1148{ 1130{
1149 unsigned int size; 1131 unsigned int size;
1150 u8 *data; 1132 u8 *data;
@@ -1363,71 +1345,13 @@ out:
1363 Receiver routines 1345 Receiver routines
1364 =======================================================================*/ 1346 =======================================================================*/
1365 1347
1366int netdev_max_backlog = 300; 1348int netdev_max_backlog = 1000;
1349int netdev_budget = 300;
1367int weight_p = 64; /* old backlog weight */ 1350int weight_p = 64; /* old backlog weight */
1368/* These numbers are selected based on intuition and some
1369 * experimentatiom, if you have more scientific way of doing this
1370 * please go ahead and fix things.
1371 */
1372int no_cong_thresh = 10;
1373int no_cong = 20;
1374int lo_cong = 100;
1375int mod_cong = 290;
1376 1351
1377DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 1352DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1378 1353
1379 1354
1380static void get_sample_stats(int cpu)
1381{
1382#ifdef RAND_LIE
1383 unsigned long rd;
1384 int rq;
1385#endif
1386 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1387 int blog = sd->input_pkt_queue.qlen;
1388 int avg_blog = sd->avg_blog;
1389
1390 avg_blog = (avg_blog >> 1) + (blog >> 1);
1391
1392 if (avg_blog > mod_cong) {
1393 /* Above moderate congestion levels. */
1394 sd->cng_level = NET_RX_CN_HIGH;
1395#ifdef RAND_LIE
1396 rd = net_random();
1397 rq = rd % netdev_max_backlog;
1398 if (rq < avg_blog) /* unlucky bastard */
1399 sd->cng_level = NET_RX_DROP;
1400#endif
1401 } else if (avg_blog > lo_cong) {
1402 sd->cng_level = NET_RX_CN_MOD;
1403#ifdef RAND_LIE
1404 rd = net_random();
1405 rq = rd % netdev_max_backlog;
1406 if (rq < avg_blog) /* unlucky bastard */
1407 sd->cng_level = NET_RX_CN_HIGH;
1408#endif
1409 } else if (avg_blog > no_cong)
1410 sd->cng_level = NET_RX_CN_LOW;
1411 else /* no congestion */
1412 sd->cng_level = NET_RX_SUCCESS;
1413
1414 sd->avg_blog = avg_blog;
1415}
1416
1417#ifdef OFFLINE_SAMPLE
1418static void sample_queue(unsigned long dummy)
1419{
1420/* 10 ms 0r 1ms -- i don't care -- JHS */
1421 int next_tick = 1;
1422 int cpu = smp_processor_id();
1423
1424 get_sample_stats(cpu);
1425 next_tick += jiffies;
1426 mod_timer(&samp_timer, next_tick);
1427}
1428#endif
1429
1430
1431/** 1355/**
1432 * netif_rx - post buffer to the network code 1356 * netif_rx - post buffer to the network code
1433 * @skb: buffer to post 1357 * @skb: buffer to post
@@ -1448,7 +1372,6 @@ static void sample_queue(unsigned long dummy)
1448 1372
1449int netif_rx(struct sk_buff *skb) 1373int netif_rx(struct sk_buff *skb)
1450{ 1374{
1451 int this_cpu;
1452 struct softnet_data *queue; 1375 struct softnet_data *queue;
1453 unsigned long flags; 1376 unsigned long flags;
1454 1377
@@ -1464,38 +1387,22 @@ int netif_rx(struct sk_buff *skb)
1464 * short when CPU is congested, but is still operating. 1387 * short when CPU is congested, but is still operating.
1465 */ 1388 */
1466 local_irq_save(flags); 1389 local_irq_save(flags);
1467 this_cpu = smp_processor_id();
1468 queue = &__get_cpu_var(softnet_data); 1390 queue = &__get_cpu_var(softnet_data);
1469 1391
1470 __get_cpu_var(netdev_rx_stat).total++; 1392 __get_cpu_var(netdev_rx_stat).total++;
1471 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { 1393 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1472 if (queue->input_pkt_queue.qlen) { 1394 if (queue->input_pkt_queue.qlen) {
1473 if (queue->throttle)
1474 goto drop;
1475
1476enqueue: 1395enqueue:
1477 dev_hold(skb->dev); 1396 dev_hold(skb->dev);
1478 __skb_queue_tail(&queue->input_pkt_queue, skb); 1397 __skb_queue_tail(&queue->input_pkt_queue, skb);
1479#ifndef OFFLINE_SAMPLE
1480 get_sample_stats(this_cpu);
1481#endif
1482 local_irq_restore(flags); 1398 local_irq_restore(flags);
1483 return queue->cng_level; 1399 return NET_RX_SUCCESS;
1484 } 1400 }
1485 1401
1486 if (queue->throttle)
1487 queue->throttle = 0;
1488
1489 netif_rx_schedule(&queue->backlog_dev); 1402 netif_rx_schedule(&queue->backlog_dev);
1490 goto enqueue; 1403 goto enqueue;
1491 } 1404 }
1492 1405
1493 if (!queue->throttle) {
1494 queue->throttle = 1;
1495 __get_cpu_var(netdev_rx_stat).throttled++;
1496 }
1497
1498drop:
1499 __get_cpu_var(netdev_rx_stat).dropped++; 1406 __get_cpu_var(netdev_rx_stat).dropped++;
1500 local_irq_restore(flags); 1407 local_irq_restore(flags);
1501 1408
@@ -1780,8 +1687,6 @@ job_done:
1780 smp_mb__before_clear_bit(); 1687 smp_mb__before_clear_bit();
1781 netif_poll_enable(backlog_dev); 1688 netif_poll_enable(backlog_dev);
1782 1689
1783 if (queue->throttle)
1784 queue->throttle = 0;
1785 local_irq_enable(); 1690 local_irq_enable();
1786 return 0; 1691 return 0;
1787} 1692}
@@ -1790,9 +1695,9 @@ static void net_rx_action(struct softirq_action *h)
1790{ 1695{
1791 struct softnet_data *queue = &__get_cpu_var(softnet_data); 1696 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1792 unsigned long start_time = jiffies; 1697 unsigned long start_time = jiffies;
1793 int budget = netdev_max_backlog; 1698 int budget = netdev_budget;
1699 void *have;
1794 1700
1795
1796 local_irq_disable(); 1701 local_irq_disable();
1797 1702
1798 while (!list_empty(&queue->poll_list)) { 1703 while (!list_empty(&queue->poll_list)) {
@@ -1805,10 +1710,10 @@ static void net_rx_action(struct softirq_action *h)
1805 1710
1806 dev = list_entry(queue->poll_list.next, 1711 dev = list_entry(queue->poll_list.next,
1807 struct net_device, poll_list); 1712 struct net_device, poll_list);
1808 netpoll_poll_lock(dev); 1713 have = netpoll_poll_lock(dev);
1809 1714
1810 if (dev->quota <= 0 || dev->poll(dev, &budget)) { 1715 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1811 netpoll_poll_unlock(dev); 1716 netpoll_poll_unlock(have);
1812 local_irq_disable(); 1717 local_irq_disable();
1813 list_del(&dev->poll_list); 1718 list_del(&dev->poll_list);
1814 list_add_tail(&dev->poll_list, &queue->poll_list); 1719 list_add_tail(&dev->poll_list, &queue->poll_list);
@@ -1817,7 +1722,7 @@ static void net_rx_action(struct softirq_action *h)
1817 else 1722 else
1818 dev->quota = dev->weight; 1723 dev->quota = dev->weight;
1819 } else { 1724 } else {
1820 netpoll_poll_unlock(dev); 1725 netpoll_poll_unlock(have);
1821 dev_put(dev); 1726 dev_put(dev);
1822 local_irq_disable(); 1727 local_irq_disable();
1823 } 1728 }
@@ -2055,15 +1960,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
2055 struct netif_rx_stats *s = v; 1960 struct netif_rx_stats *s = v;
2056 1961
2057 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 1962 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2058 s->total, s->dropped, s->time_squeeze, s->throttled, 1963 s->total, s->dropped, s->time_squeeze, 0,
2059 s->fastroute_hit, s->fastroute_success, s->fastroute_defer, 1964 0, 0, 0, 0, /* was fastroute */
2060 s->fastroute_deferred_out, 1965 s->cpu_collision );
2061#if 0
2062 s->fastroute_latency_reduction
2063#else
2064 s->cpu_collision
2065#endif
2066 );
2067 return 0; 1966 return 0;
2068} 1967}
2069 1968
@@ -2190,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
2190{ 2089{
2191 unsigned short old_flags = dev->flags; 2090 unsigned short old_flags = dev->flags;
2192 2091
2193 dev->flags |= IFF_PROMISC;
2194 if ((dev->promiscuity += inc) == 0) 2092 if ((dev->promiscuity += inc) == 0)
2195 dev->flags &= ~IFF_PROMISC; 2093 dev->flags &= ~IFF_PROMISC;
2196 if (dev->flags ^ old_flags) { 2094 else
2095 dev->flags |= IFF_PROMISC;
2096 if (dev->flags != old_flags) {
2197 dev_mc_upload(dev); 2097 dev_mc_upload(dev);
2198 printk(KERN_INFO "device %s %s promiscuous mode\n", 2098 printk(KERN_INFO "device %s %s promiscuous mode\n",
2199 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 2099 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
@@ -3305,9 +3205,6 @@ static int __init net_dev_init(void)
3305 3205
3306 queue = &per_cpu(softnet_data, i); 3206 queue = &per_cpu(softnet_data, i);
3307 skb_queue_head_init(&queue->input_pkt_queue); 3207 skb_queue_head_init(&queue->input_pkt_queue);
3308 queue->throttle = 0;
3309 queue->cng_level = 0;
3310 queue->avg_blog = 10; /* arbitrary non-zero */
3311 queue->completion_queue = NULL; 3208 queue->completion_queue = NULL;
3312 INIT_LIST_HEAD(&queue->poll_list); 3209 INIT_LIST_HEAD(&queue->poll_list);
3313 set_bit(__LINK_STATE_START, &queue->backlog_dev.state); 3210 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3316,11 +3213,6 @@ static int __init net_dev_init(void)
3316 atomic_set(&queue->backlog_dev.refcnt, 1); 3213 atomic_set(&queue->backlog_dev.refcnt, 1);
3317 } 3214 }
3318 3215
3319#ifdef OFFLINE_SAMPLE
3320 samp_timer.expires = jiffies + (10 * HZ);
3321 add_timer(&samp_timer);
3322#endif
3323
3324 dev_boot_phase = 0; 3216 dev_boot_phase = 0;
3325 3217
3326 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); 3218 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
diff --git a/net/core/dst.c b/net/core/dst.c
index fc434ade52..334790da9f 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,6 +45,7 @@ static struct timer_list dst_gc_timer =
45static void dst_run_gc(unsigned long dummy) 45static void dst_run_gc(unsigned long dummy)
46{ 46{
47 int delayed = 0; 47 int delayed = 0;
48 int work_performed;
48 struct dst_entry * dst, **dstp; 49 struct dst_entry * dst, **dstp;
49 50
50 if (!spin_trylock(&dst_lock)) { 51 if (!spin_trylock(&dst_lock)) {
@@ -52,9 +53,9 @@ static void dst_run_gc(unsigned long dummy)
52 return; 53 return;
53 } 54 }
54 55
55
56 del_timer(&dst_gc_timer); 56 del_timer(&dst_gc_timer);
57 dstp = &dst_garbage_list; 57 dstp = &dst_garbage_list;
58 work_performed = 0;
58 while ((dst = *dstp) != NULL) { 59 while ((dst = *dstp) != NULL) {
59 if (atomic_read(&dst->__refcnt)) { 60 if (atomic_read(&dst->__refcnt)) {
60 dstp = &dst->next; 61 dstp = &dst->next;
@@ -62,6 +63,7 @@ static void dst_run_gc(unsigned long dummy)
62 continue; 63 continue;
63 } 64 }
64 *dstp = dst->next; 65 *dstp = dst->next;
66 work_performed = 1;
65 67
66 dst = dst_destroy(dst); 68 dst = dst_destroy(dst);
67 if (dst) { 69 if (dst) {
@@ -86,9 +88,14 @@ static void dst_run_gc(unsigned long dummy)
86 dst_gc_timer_inc = DST_GC_MAX; 88 dst_gc_timer_inc = DST_GC_MAX;
87 goto out; 89 goto out;
88 } 90 }
89 if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) 91 if (!work_performed) {
90 dst_gc_timer_expires = DST_GC_MAX; 92 if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
91 dst_gc_timer_inc += DST_GC_INC; 93 dst_gc_timer_expires = DST_GC_MAX;
94 dst_gc_timer_inc += DST_GC_INC;
95 } else {
96 dst_gc_timer_inc = DST_GC_INC;
97 dst_gc_timer_expires = DST_GC_MIN;
98 }
92 dst_gc_timer.expires = jiffies + dst_gc_timer_expires; 99 dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
93#if RT_CACHE_DEBUG >= 2 100#if RT_CACHE_DEBUG >= 2
94 printk("dst_total: %d/%d %ld\n", 101 printk("dst_total: %d/%d %ld\n",
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ac..cd91a24f97 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
36#include <linux/filter.h> 36#include <linux/filter.h>
37 37
38/* No hurry in this branch */ 38/* No hurry in this branch */
39static u8 *load_pointer(struct sk_buff *skb, int k) 39static void *__load_pointer(struct sk_buff *skb, int k)
40{ 40{
41 u8 *ptr = NULL; 41 u8 *ptr = NULL;
42 42
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
50 return NULL; 50 return NULL;
51} 51}
52 52
53static inline void *load_pointer(struct sk_buff *skb, int k,
54 unsigned int size, void *buffer)
55{
56 if (k >= 0)
57 return skb_header_pointer(skb, k, size, buffer);
58 else {
59 if (k >= SKF_AD_OFF)
60 return NULL;
61 return __load_pointer(skb, k);
62 }
63}
64
53/** 65/**
54 * sk_run_filter - run a filter on a socket 66 * sk_run_filter - run a filter on a socket
55 * @skb: buffer to run the filter on 67 * @skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
64 76
65int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) 77int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
66{ 78{
67 unsigned char *data = skb->data;
68 /* len is UNSIGNED. Byte wide insns relies only on implicit
69 type casts to prevent reading arbitrary memory locations.
70 */
71 unsigned int len = skb->len-skb->data_len;
72 struct sock_filter *fentry; /* We walk down these */ 79 struct sock_filter *fentry; /* We walk down these */
80 void *ptr;
73 u32 A = 0; /* Accumulator */ 81 u32 A = 0; /* Accumulator */
74 u32 X = 0; /* Index Register */ 82 u32 X = 0; /* Index Register */
75 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ 83 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
84 u32 tmp;
76 int k; 85 int k;
77 int pc; 86 int pc;
78 87
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
168 case BPF_LD|BPF_W|BPF_ABS: 177 case BPF_LD|BPF_W|BPF_ABS:
169 k = fentry->k; 178 k = fentry->k;
170 load_w: 179 load_w:
171 if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { 180 ptr = load_pointer(skb, k, 4, &tmp);
172 A = ntohl(*(u32*)&data[k]); 181 if (ptr != NULL) {
182 A = ntohl(*(u32 *)ptr);
173 continue; 183 continue;
174 } 184 }
175 if (k < 0) {
176 u8 *ptr;
177
178 if (k >= SKF_AD_OFF)
179 break;
180 ptr = load_pointer(skb, k);
181 if (ptr) {
182 A = ntohl(*(u32*)ptr);
183 continue;
184 }
185 } else {
186 u32 _tmp, *p;
187 p = skb_header_pointer(skb, k, 4, &_tmp);
188 if (p != NULL) {
189 A = ntohl(*p);
190 continue;
191 }
192 }
193 return 0; 185 return 0;
194 case BPF_LD|BPF_H|BPF_ABS: 186 case BPF_LD|BPF_H|BPF_ABS:
195 k = fentry->k; 187 k = fentry->k;
196 load_h: 188 load_h:
197 if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { 189 ptr = load_pointer(skb, k, 2, &tmp);
198 A = ntohs(*(u16*)&data[k]); 190 if (ptr != NULL) {
191 A = ntohs(*(u16 *)ptr);
199 continue; 192 continue;
200 } 193 }
201 if (k < 0) {
202 u8 *ptr;
203
204 if (k >= SKF_AD_OFF)
205 break;
206 ptr = load_pointer(skb, k);
207 if (ptr) {
208 A = ntohs(*(u16*)ptr);
209 continue;
210 }
211 } else {
212 u16 _tmp, *p;
213 p = skb_header_pointer(skb, k, 2, &_tmp);
214 if (p != NULL) {
215 A = ntohs(*p);
216 continue;
217 }
218 }
219 return 0; 194 return 0;
220 case BPF_LD|BPF_B|BPF_ABS: 195 case BPF_LD|BPF_B|BPF_ABS:
221 k = fentry->k; 196 k = fentry->k;
222load_b: 197load_b:
223 if (k >= 0 && (unsigned int)k < len) { 198 ptr = load_pointer(skb, k, 1, &tmp);
224 A = data[k]; 199 if (ptr != NULL) {
200 A = *(u8 *)ptr;
225 continue; 201 continue;
226 } 202 }
227 if (k < 0) {
228 u8 *ptr;
229
230 if (k >= SKF_AD_OFF)
231 break;
232 ptr = load_pointer(skb, k);
233 if (ptr) {
234 A = *ptr;
235 continue;
236 }
237 } else {
238 u8 _tmp, *p;
239 p = skb_header_pointer(skb, k, 1, &_tmp);
240 if (p != NULL) {
241 A = *p;
242 continue;
243 }
244 }
245 return 0; 203 return 0;
246 case BPF_LD|BPF_W|BPF_LEN: 204 case BPF_LD|BPF_W|BPF_LEN:
247 A = len; 205 A = skb->len;
248 continue; 206 continue;
249 case BPF_LDX|BPF_W|BPF_LEN: 207 case BPF_LDX|BPF_W|BPF_LEN:
250 X = len; 208 X = skb->len;
251 continue; 209 continue;
252 case BPF_LD|BPF_W|BPF_IND: 210 case BPF_LD|BPF_W|BPF_IND:
253 k = X + fentry->k; 211 k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
259 k = X + fentry->k; 217 k = X + fentry->k;
260 goto load_b; 218 goto load_b;
261 case BPF_LDX|BPF_B|BPF_MSH: 219 case BPF_LDX|BPF_B|BPF_MSH:
262 if (fentry->k >= len) 220 ptr = load_pointer(skb, fentry->k, 1, &tmp);
263 return 0; 221 if (ptr != NULL) {
264 X = (data[fentry->k] & 0xf) << 2; 222 X = (*(u8 *)ptr & 0xf) << 2;
265 continue; 223 continue;
224 }
225 return 0;
266 case BPF_LD|BPF_IMM: 226 case BPF_LD|BPF_IMM:
267 A = fentry->k; 227 A = fentry->k;
268 continue; 228 continue;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f6bdcad47d..1beb782ac4 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -32,6 +32,7 @@
32#include <net/sock.h> 32#include <net/sock.h>
33#include <linux/rtnetlink.h> 33#include <linux/rtnetlink.h>
34#include <linux/random.h> 34#include <linux/random.h>
35#include <linux/string.h>
35 36
36#define NEIGH_DEBUG 1 37#define NEIGH_DEBUG 1
37 38
@@ -1597,6 +1598,8 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
1597 1598
1598 read_lock_bh(&tbl->lock); 1599 read_lock_bh(&tbl->lock);
1599 ndtmsg->ndtm_family = tbl->family; 1600 ndtmsg->ndtm_family = tbl->family;
1601 ndtmsg->ndtm_pad1 = 0;
1602 ndtmsg->ndtm_pad2 = 0;
1600 1603
1601 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); 1604 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
1602 RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); 1605 RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
@@ -1682,6 +1685,8 @@ static int neightbl_fill_param_info(struct neigh_table *tbl,
1682 1685
1683 read_lock_bh(&tbl->lock); 1686 read_lock_bh(&tbl->lock);
1684 ndtmsg->ndtm_family = tbl->family; 1687 ndtmsg->ndtm_family = tbl->family;
1688 ndtmsg->ndtm_pad1 = 0;
1689 ndtmsg->ndtm_pad2 = 0;
1685 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); 1690 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
1686 1691
1687 if (neightbl_fill_parms(skb, parms) < 0) 1692 if (neightbl_fill_parms(skb, parms) < 0)
@@ -1871,6 +1876,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
1871 struct ndmsg *ndm = NLMSG_DATA(nlh); 1876 struct ndmsg *ndm = NLMSG_DATA(nlh);
1872 1877
1873 ndm->ndm_family = n->ops->family; 1878 ndm->ndm_family = n->ops->family;
1879 ndm->ndm_pad1 = 0;
1880 ndm->ndm_pad2 = 0;
1874 ndm->ndm_flags = n->flags; 1881 ndm->ndm_flags = n->flags;
1875 ndm->ndm_type = n->type; 1882 ndm->ndm_type = n->type;
1876 ndm->ndm_ifindex = n->dev->ifindex; 1883 ndm->ndm_ifindex = n->dev->ifindex;
@@ -2592,7 +2599,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
2592 t->neigh_vars[17].extra1 = dev; 2599 t->neigh_vars[17].extra1 = dev;
2593 } 2600 }
2594 2601
2595 dev_name = net_sysctl_strdup(dev_name_source); 2602 dev_name = kstrdup(dev_name_source, GFP_KERNEL);
2596 if (!dev_name) { 2603 if (!dev_name) {
2597 err = -ENOBUFS; 2604 err = -ENOBUFS;
2598 goto free; 2605 goto free;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a119696d55..a1a9a7abff 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -33,6 +33,7 @@
33#define MAX_UDP_CHUNK 1460 33#define MAX_UDP_CHUNK 1460
34#define MAX_SKBS 32 34#define MAX_SKBS 32
35#define MAX_QUEUE_DEPTH (MAX_SKBS / 2) 35#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
36#define MAX_RETRIES 20000
36 37
37static DEFINE_SPINLOCK(skb_list_lock); 38static DEFINE_SPINLOCK(skb_list_lock);
38static int nr_skbs; 39static int nr_skbs;
@@ -130,19 +131,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
130 */ 131 */
131static void poll_napi(struct netpoll *np) 132static void poll_napi(struct netpoll *np)
132{ 133{
134 struct netpoll_info *npinfo = np->dev->npinfo;
133 int budget = 16; 135 int budget = 16;
134 136
135 if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && 137 if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
136 np->poll_owner != smp_processor_id() && 138 npinfo->poll_owner != smp_processor_id() &&
137 spin_trylock(&np->poll_lock)) { 139 spin_trylock(&npinfo->poll_lock)) {
138 np->rx_flags |= NETPOLL_RX_DROP; 140 npinfo->rx_flags |= NETPOLL_RX_DROP;
139 atomic_inc(&trapped); 141 atomic_inc(&trapped);
140 142
141 np->dev->poll(np->dev, &budget); 143 np->dev->poll(np->dev, &budget);
142 144
143 atomic_dec(&trapped); 145 atomic_dec(&trapped);
144 np->rx_flags &= ~NETPOLL_RX_DROP; 146 npinfo->rx_flags &= ~NETPOLL_RX_DROP;
145 spin_unlock(&np->poll_lock); 147 spin_unlock(&npinfo->poll_lock);
146 } 148 }
147} 149}
148 150
@@ -245,16 +247,18 @@ repeat:
245static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) 247static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
246{ 248{
247 int status; 249 int status;
250 struct netpoll_info *npinfo;
248 251
249repeat: 252 if (!np || !np->dev || !netif_running(np->dev)) {
250 if(!np || !np->dev || !netif_running(np->dev)) {
251 __kfree_skb(skb); 253 __kfree_skb(skb);
252 return; 254 return;
253 } 255 }
254 256
257 npinfo = np->dev->npinfo;
258
255 /* avoid recursion */ 259 /* avoid recursion */
256 if(np->poll_owner == smp_processor_id() || 260 if (npinfo->poll_owner == smp_processor_id() ||
257 np->dev->xmit_lock_owner == smp_processor_id()) { 261 np->dev->xmit_lock_owner == smp_processor_id()) {
258 if (np->drop) 262 if (np->drop)
259 np->drop(skb); 263 np->drop(skb);
260 else 264 else
@@ -262,30 +266,37 @@ repeat:
262 return; 266 return;
263 } 267 }
264 268
265 spin_lock(&np->dev->xmit_lock); 269 do {
266 np->dev->xmit_lock_owner = smp_processor_id(); 270 npinfo->tries--;
271 spin_lock(&np->dev->xmit_lock);
272 np->dev->xmit_lock_owner = smp_processor_id();
267 273
268 /* 274 /*
269 * network drivers do not expect to be called if the queue is 275 * network drivers do not expect to be called if the queue is
270 * stopped. 276 * stopped.
271 */ 277 */
272 if (netif_queue_stopped(np->dev)) { 278 if (netif_queue_stopped(np->dev)) {
279 np->dev->xmit_lock_owner = -1;
280 spin_unlock(&np->dev->xmit_lock);
281 netpoll_poll(np);
282 udelay(50);
283 continue;
284 }
285
286 status = np->dev->hard_start_xmit(skb, np->dev);
273 np->dev->xmit_lock_owner = -1; 287 np->dev->xmit_lock_owner = -1;
274 spin_unlock(&np->dev->xmit_lock); 288 spin_unlock(&np->dev->xmit_lock);
275 289
276 netpoll_poll(np); 290 /* success */
277 goto repeat; 291 if(!status) {
278 } 292 npinfo->tries = MAX_RETRIES; /* reset */
279 293 return;
280 status = np->dev->hard_start_xmit(skb, np->dev); 294 }
281 np->dev->xmit_lock_owner = -1;
282 spin_unlock(&np->dev->xmit_lock);
283 295
284 /* transmit busy */ 296 /* transmit busy */
285 if(status) {
286 netpoll_poll(np); 297 netpoll_poll(np);
287 goto repeat; 298 udelay(50);
288 } 299 } while (npinfo->tries > 0);
289} 300}
290 301
291void netpoll_send_udp(struct netpoll *np, const char *msg, int len) 302void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -341,14 +352,18 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
341 352
342static void arp_reply(struct sk_buff *skb) 353static void arp_reply(struct sk_buff *skb)
343{ 354{
355 struct netpoll_info *npinfo = skb->dev->npinfo;
344 struct arphdr *arp; 356 struct arphdr *arp;
345 unsigned char *arp_ptr; 357 unsigned char *arp_ptr;
346 int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; 358 int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
347 u32 sip, tip; 359 u32 sip, tip;
348 struct sk_buff *send_skb; 360 struct sk_buff *send_skb;
349 struct netpoll *np = skb->dev->np; 361 struct netpoll *np = NULL;
350 362
351 if (!np) return; 363 if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
364 np = npinfo->rx_np;
365 if (!np)
366 return;
352 367
353 /* No arp on this interface */ 368 /* No arp on this interface */
354 if (skb->dev->flags & IFF_NOARP) 369 if (skb->dev->flags & IFF_NOARP)
@@ -429,9 +444,9 @@ int __netpoll_rx(struct sk_buff *skb)
429 int proto, len, ulen; 444 int proto, len, ulen;
430 struct iphdr *iph; 445 struct iphdr *iph;
431 struct udphdr *uh; 446 struct udphdr *uh;
432 struct netpoll *np = skb->dev->np; 447 struct netpoll *np = skb->dev->npinfo->rx_np;
433 448
434 if (!np->rx_hook) 449 if (!np)
435 goto out; 450 goto out;
436 if (skb->dev->type != ARPHRD_ETHER) 451 if (skb->dev->type != ARPHRD_ETHER)
437 goto out; 452 goto out;
@@ -611,9 +626,8 @@ int netpoll_setup(struct netpoll *np)
611{ 626{
612 struct net_device *ndev = NULL; 627 struct net_device *ndev = NULL;
613 struct in_device *in_dev; 628 struct in_device *in_dev;
614 629 struct netpoll_info *npinfo;
615 np->poll_lock = SPIN_LOCK_UNLOCKED; 630 unsigned long flags;
616 np->poll_owner = -1;
617 631
618 if (np->dev_name) 632 if (np->dev_name)
619 ndev = dev_get_by_name(np->dev_name); 633 ndev = dev_get_by_name(np->dev_name);
@@ -624,7 +638,19 @@ int netpoll_setup(struct netpoll *np)
624 } 638 }
625 639
626 np->dev = ndev; 640 np->dev = ndev;
627 ndev->np = np; 641 if (!ndev->npinfo) {
642 npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
643 if (!npinfo)
644 goto release;
645
646 npinfo->rx_flags = 0;
647 npinfo->rx_np = NULL;
648 npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
649 npinfo->poll_owner = -1;
650 npinfo->tries = MAX_RETRIES;
651 npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
652 } else
653 npinfo = ndev->npinfo;
628 654
629 if (!ndev->poll_controller) { 655 if (!ndev->poll_controller) {
630 printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", 656 printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
@@ -692,13 +718,27 @@ int netpoll_setup(struct netpoll *np)
692 np->name, HIPQUAD(np->local_ip)); 718 np->name, HIPQUAD(np->local_ip));
693 } 719 }
694 720
695 if(np->rx_hook) 721 if (np->rx_hook) {
696 np->rx_flags = NETPOLL_RX_ENABLED; 722 spin_lock_irqsave(&npinfo->rx_lock, flags);
723 npinfo->rx_flags |= NETPOLL_RX_ENABLED;
724 npinfo->rx_np = np;
725 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
726 }
727
728 /* fill up the skb queue */
729 refill_skbs();
730
731 /* last thing to do is link it to the net device structure */
732 ndev->npinfo = npinfo;
733
734 /* avoid racing with NAPI reading npinfo */
735 synchronize_rcu();
697 736
698 return 0; 737 return 0;
699 738
700 release: 739 release:
701 ndev->np = NULL; 740 if (!ndev->npinfo)
741 kfree(npinfo);
702 np->dev = NULL; 742 np->dev = NULL;
703 dev_put(ndev); 743 dev_put(ndev);
704 return -1; 744 return -1;
@@ -706,9 +746,20 @@ int netpoll_setup(struct netpoll *np)
706 746
707void netpoll_cleanup(struct netpoll *np) 747void netpoll_cleanup(struct netpoll *np)
708{ 748{
709 if (np->dev) 749 struct netpoll_info *npinfo;
710 np->dev->np = NULL; 750 unsigned long flags;
711 dev_put(np->dev); 751
752 if (np->dev) {
753 npinfo = np->dev->npinfo;
754 if (npinfo && npinfo->rx_np == np) {
755 spin_lock_irqsave(&npinfo->rx_lock, flags);
756 npinfo->rx_np = NULL;
757 npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
758 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
759 }
760 dev_put(np->dev);
761 }
762
712 np->dev = NULL; 763 np->dev = NULL;
713} 764}
714 765
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06bc79..8eb083b604 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
151#include <asm/timex.h> 151#include <asm/timex.h>
152 152
153 153
154#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" 154#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
155 155
156/* #define PG_DEBUG(a) a */ 156/* #define PG_DEBUG(a) a */
157#define PG_DEBUG(a) 157#define PG_DEBUG(a)
@@ -363,7 +363,7 @@ struct pktgen_thread {
363 * All Rights Reserved. 363 * All Rights Reserved.
364 * 364 *
365 */ 365 */
366inline static s64 divremdi3(s64 x, s64 y, int type) 366static inline s64 divremdi3(s64 x, s64 y, int type)
367{ 367{
368 u64 a = (x < 0) ? -x : x; 368 u64 a = (x < 0) ? -x : x;
369 u64 b = (y < 0) ? -y : y; 369 u64 b = (y < 0) ? -y : y;
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1921 struct iphdr *iph; 1921 struct iphdr *iph;
1922 struct pktgen_hdr *pgh = NULL; 1922 struct pktgen_hdr *pgh = NULL;
1923 1923
1924 /* Update any of the values, used when we're incrementing various
1925 * fields.
1926 */
1927 mod_cur_headers(pkt_dev);
1928
1924 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 1929 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
1925 if (!skb) { 1930 if (!skb) {
1926 sprintf(pkt_dev->result, "No memory"); 1931 sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1934 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); 1939 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
1935 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 1940 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
1936 1941
1937 /* Update any of the values, used when we're incrementing various
1938 * fields.
1939 */
1940 mod_cur_headers(pkt_dev);
1941
1942 memcpy(eth, pkt_dev->hh, 12); 1942 memcpy(eth, pkt_dev->hh, 12);
1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP); 1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP);
1944 1944
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2192 int datalen; 2192 int datalen;
2193 struct ipv6hdr *iph; 2193 struct ipv6hdr *iph;
2194 struct pktgen_hdr *pgh = NULL; 2194 struct pktgen_hdr *pgh = NULL;
2195 2195
2196 /* Update any of the values, used when we're incrementing various
2197 * fields.
2198 */
2199 mod_cur_headers(pkt_dev);
2200
2196 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 2201 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
2197 if (!skb) { 2202 if (!skb) {
2198 sprintf(pkt_dev->result, "No memory"); 2203 sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2206 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); 2211 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
2207 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 2212 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
2208 2213
2209
2210 /* Update any of the values, used when we're incrementing various
2211 * fields.
2212 */
2213 mod_cur_headers(pkt_dev);
2214
2215
2216 memcpy(eth, pkt_dev->hh, 12); 2214 memcpy(eth, pkt_dev->hh, 12);
2217 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6); 2215 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
2218 2216
2219
2220 datalen = pkt_dev->cur_pkt_size-14- 2217 datalen = pkt_dev->cur_pkt_size-14-
2221 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ 2218 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
2222 2219
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e013d836a7..4b1bb30e63 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -126,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
126 rta->rta_type = attrtype; 126 rta->rta_type = attrtype;
127 rta->rta_len = size; 127 rta->rta_len = size;
128 memcpy(RTA_DATA(rta), data, attrlen); 128 memcpy(RTA_DATA(rta), data, attrlen);
129 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
129} 130}
130 131
131size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) 132size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
@@ -188,6 +189,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
188 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags); 189 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
189 r = NLMSG_DATA(nlh); 190 r = NLMSG_DATA(nlh);
190 r->ifi_family = AF_UNSPEC; 191 r->ifi_family = AF_UNSPEC;
192 r->__ifi_pad = 0;
191 r->ifi_type = dev->type; 193 r->ifi_type = dev->type;
192 r->ifi_index = dev->ifindex; 194 r->ifi_index = dev->ifindex;
193 r->ifi_flags = dev_get_flags(dev); 195 r->ifi_flags = dev_get_flags(dev);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6d68c03bc0..7eab867ede 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
129 * Buffers may only be allocated from interrupts using a @gfp_mask of 129 * Buffers may only be allocated from interrupts using a @gfp_mask of
130 * %GFP_ATOMIC. 130 * %GFP_ATOMIC.
131 */ 131 */
132struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) 132struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
133{ 133{
134 struct sk_buff *skb; 134 struct sk_buff *skb;
135 u8 *data; 135 u8 *data;
@@ -182,7 +182,8 @@ nodata:
182 * %GFP_ATOMIC. 182 * %GFP_ATOMIC.
183 */ 183 */
184struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, 184struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
185 unsigned int size, int gfp_mask) 185 unsigned int size,
186 unsigned int __nocast gfp_mask)
186{ 187{
187 struct sk_buff *skb; 188 struct sk_buff *skb;
188 u8 *data; 189 u8 *data;
@@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb)
322 * %GFP_ATOMIC. 323 * %GFP_ATOMIC.
323 */ 324 */
324 325
325struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) 326struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
326{ 327{
327 struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 328 struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
328 329
@@ -357,7 +358,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
357 C(ip_summed); 358 C(ip_summed);
358 C(priority); 359 C(priority);
359 C(protocol); 360 C(protocol);
360 C(security);
361 n->destructor = NULL; 361 n->destructor = NULL;
362#ifdef CONFIG_NETFILTER 362#ifdef CONFIG_NETFILTER
363 C(nfmark); 363 C(nfmark);
@@ -377,8 +377,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
377 C(tc_index); 377 C(tc_index);
378#ifdef CONFIG_NET_CLS_ACT 378#ifdef CONFIG_NET_CLS_ACT
379 n->tc_verd = SET_TC_VERD(skb->tc_verd,0); 379 n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
380 n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd); 380 n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
381 n->tc_verd = CLR_TC_MUNGED(skb->tc_verd); 381 n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
382 C(input_dev); 382 C(input_dev);
383 C(tc_classid); 383 C(tc_classid);
384#endif 384#endif
@@ -422,7 +422,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
422 new->pkt_type = old->pkt_type; 422 new->pkt_type = old->pkt_type;
423 new->stamp = old->stamp; 423 new->stamp = old->stamp;
424 new->destructor = NULL; 424 new->destructor = NULL;
425 new->security = old->security;
426#ifdef CONFIG_NETFILTER 425#ifdef CONFIG_NETFILTER
427 new->nfmark = old->nfmark; 426 new->nfmark = old->nfmark;
428 new->nfcache = old->nfcache; 427 new->nfcache = old->nfcache;
@@ -462,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
462 * header is going to be modified. Use pskb_copy() instead. 461 * header is going to be modified. Use pskb_copy() instead.
463 */ 462 */
464 463
465struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) 464struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
466{ 465{
467 int headerlen = skb->data - skb->head; 466 int headerlen = skb->data - skb->head;
468 /* 467 /*
@@ -501,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
501 * The returned buffer has a reference count of 1. 500 * The returned buffer has a reference count of 1.
502 */ 501 */
503 502
504struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) 503struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
505{ 504{
506 /* 505 /*
507 * Allocate the copy buffer 506 * Allocate the copy buffer
@@ -559,7 +558,8 @@ out:
559 * reloaded after call to this function. 558 * reloaded after call to this function.
560 */ 559 */
561 560
562int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) 561int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
562 unsigned int __nocast gfp_mask)
563{ 563{
564 int i; 564 int i;
565 u8 *data; 565 u8 *data;
@@ -649,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
649 * only by netfilter in the cases when checksum is recalculated? --ANK 649 * only by netfilter in the cases when checksum is recalculated? --ANK
650 */ 650 */
651struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 651struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
652 int newheadroom, int newtailroom, int gfp_mask) 652 int newheadroom, int newtailroom,
653 unsigned int __nocast gfp_mask)
653{ 654{
654 /* 655 /*
655 * Allocate the copy buffer 656 * Allocate the copy buffer
@@ -1500,6 +1501,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
1500 skb_split_no_header(skb, skb1, len, pos); 1501 skb_split_no_header(skb, skb1, len, pos);
1501} 1502}
1502 1503
1504/**
1505 * skb_prepare_seq_read - Prepare a sequential read of skb data
1506 * @skb: the buffer to read
1507 * @from: lower offset of data to be read
1508 * @to: upper offset of data to be read
1509 * @st: state variable
1510 *
1511 * Initializes the specified state variable. Must be called before
1512 * invoking skb_seq_read() for the first time.
1513 */
1514void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
1515 unsigned int to, struct skb_seq_state *st)
1516{
1517 st->lower_offset = from;
1518 st->upper_offset = to;
1519 st->root_skb = st->cur_skb = skb;
1520 st->frag_idx = st->stepped_offset = 0;
1521 st->frag_data = NULL;
1522}
1523
1524/**
1525 * skb_seq_read - Sequentially read skb data
1526 * @consumed: number of bytes consumed by the caller so far
1527 * @data: destination pointer for data to be returned
1528 * @st: state variable
1529 *
1530 * Reads a block of skb data at &consumed relative to the
1531 * lower offset specified to skb_prepare_seq_read(). Assigns
1532 * the head of the data block to &data and returns the length
1533 * of the block or 0 if the end of the skb data or the upper
1534 * offset has been reached.
1535 *
1536 * The caller is not required to consume all of the data
1537 * returned, i.e. &consumed is typically set to the number
1538 * of bytes already consumed and the next call to
1539 * skb_seq_read() will return the remaining part of the block.
1540 *
1541 * Note: The size of each block of data returned can be arbitrary,
1542 * this limitation is the cost for zero-copy sequential
1543 * reads of potentially non-linear data.
1544 *
1545 * Note: Fragment lists within fragments are not implemented
1546 * at the moment, state->root_skb could be replaced with
1547 * a stack for this purpose.
1548 */
1549unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
1550 struct skb_seq_state *st)
1551{
1552 unsigned int block_limit, abs_offset = consumed + st->lower_offset;
1553 skb_frag_t *frag;
1554
1555 if (unlikely(abs_offset >= st->upper_offset))
1556 return 0;
1557
1558next_skb:
1559 block_limit = skb_headlen(st->cur_skb);
1560
1561 if (abs_offset < block_limit) {
1562 *data = st->cur_skb->data + abs_offset;
1563 return block_limit - abs_offset;
1564 }
1565
1566 if (st->frag_idx == 0 && !st->frag_data)
1567 st->stepped_offset += skb_headlen(st->cur_skb);
1568
1569 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
1570 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
1571 block_limit = frag->size + st->stepped_offset;
1572
1573 if (abs_offset < block_limit) {
1574 if (!st->frag_data)
1575 st->frag_data = kmap_skb_frag(frag);
1576
1577 *data = (u8 *) st->frag_data + frag->page_offset +
1578 (abs_offset - st->stepped_offset);
1579
1580 return block_limit - abs_offset;
1581 }
1582
1583 if (st->frag_data) {
1584 kunmap_skb_frag(st->frag_data);
1585 st->frag_data = NULL;
1586 }
1587
1588 st->frag_idx++;
1589 st->stepped_offset += frag->size;
1590 }
1591
1592 if (st->cur_skb->next) {
1593 st->cur_skb = st->cur_skb->next;
1594 st->frag_idx = 0;
1595 goto next_skb;
1596 } else if (st->root_skb == st->cur_skb &&
1597 skb_shinfo(st->root_skb)->frag_list) {
1598 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
1599 goto next_skb;
1600 }
1601
1602 return 0;
1603}
1604
1605/**
1606 * skb_abort_seq_read - Abort a sequential read of skb data
1607 * @st: state variable
1608 *
1609 * Must be called if the sequential read was stopped before
1610 * skb_seq_read() returned 0.
1611 */
1612void skb_abort_seq_read(struct skb_seq_state *st)
1613{
1614 if (st->frag_data)
1615 kunmap_skb_frag(st->frag_data);
1616}
1617
1618#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
1619
1620static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
1621 struct ts_config *conf,
1622 struct ts_state *state)
1623{
1624 return skb_seq_read(offset, text, TS_SKB_CB(state));
1625}
1626
1627static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
1628{
1629 skb_abort_seq_read(TS_SKB_CB(state));
1630}
1631
1632/**
1633 * skb_find_text - Find a text pattern in skb data
1634 * @skb: the buffer to look in
1635 * @from: search offset
1636 * @to: search limit
1637 * @config: textsearch configuration
1638 * @state: uninitialized textsearch state variable
1639 *
1640 * Finds a pattern in the skb data according to the specified
1641 * textsearch configuration. Use textsearch_next() to retrieve
1642 * subsequent occurrences of the pattern. Returns the offset
1643 * to the first occurrence or UINT_MAX if no match was found.
1644 */
1645unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
1646 unsigned int to, struct ts_config *config,
1647 struct ts_state *state)
1648{
1649 config->get_next_block = skb_ts_get_next_block;
1650 config->finish = skb_ts_finish;
1651
1652 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
1653
1654 return textsearch_find(config, state);
1655}
1656
1503void __init skb_init(void) 1657void __init skb_init(void)
1504{ 1658{
1505 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 1659 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1538,3 +1692,7 @@ EXPORT_SYMBOL(skb_queue_tail);
1538EXPORT_SYMBOL(skb_unlink); 1692EXPORT_SYMBOL(skb_unlink);
1539EXPORT_SYMBOL(skb_append); 1693EXPORT_SYMBOL(skb_append);
1540EXPORT_SYMBOL(skb_split); 1694EXPORT_SYMBOL(skb_split);
1695EXPORT_SYMBOL(skb_prepare_seq_read);
1696EXPORT_SYMBOL(skb_seq_read);
1697EXPORT_SYMBOL(skb_abort_seq_read);
1698EXPORT_SYMBOL(skb_find_text);
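
The three exports above make the sequential-read helpers available to modules such as the textsearch glue. As a rough userspace analogue of what skb_seq_read() hands back -- one contiguous block per call while walking a chain of fragments -- the sketch below may help; the names (struct buf, seq_state, seq_read) are hypothetical and this is not the kernel API.

#include <stdio.h>
#include <string.h>

/* hypothetical analogue of a fragmented sk_buff: a chain of buffers */
struct buf {
    const char *data;
    size_t len;
    struct buf *next;
};

/* iterator state, loosely mirroring struct skb_seq_state */
struct seq_state {
    struct buf *cur;
    size_t consumed_in_cur;
};

static void seq_prepare(struct seq_state *st, struct buf *head)
{
    st->cur = head;
    st->consumed_in_cur = 0;
}

/* Return a pointer to the next contiguous block and its length,
 * or 0 when the chain is exhausted. */
static size_t seq_read(struct seq_state *st, const char **data)
{
    while (st->cur) {
        if (st->consumed_in_cur < st->cur->len) {
            size_t n = st->cur->len - st->consumed_in_cur;
            *data = st->cur->data + st->consumed_in_cur;
            st->consumed_in_cur = st->cur->len;   /* whole block consumed */
            return n;
        }
        st->cur = st->cur->next;                  /* step to the next fragment */
        st->consumed_in_cur = 0;
    }
    return 0;
}

int main(void)
{
    struct buf b2 = { "world", 5, NULL };
    struct buf b1 = { "hello ", 6, &b2 };
    struct seq_state st;
    const char *p;
    size_t n;

    seq_prepare(&st, &b1);
    while ((n = seq_read(&st, &p)) != 0)
        printf("block of %zu bytes: %.*s\n", n, (int)n, p);
    return 0;
}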
diff --git a/net/core/sock.c b/net/core/sock.c
index a6ec3ada7f..12f6d9a2a5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -206,13 +206,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
206 */ 206 */
207 207
208#ifdef SO_DONTLINGER /* Compatibility item... */ 208#ifdef SO_DONTLINGER /* Compatibility item... */
209 switch (optname) { 209 if (optname == SO_DONTLINGER) {
210 case SO_DONTLINGER: 210 lock_sock(sk);
211 sock_reset_flag(sk, SOCK_LINGER); 211 sock_reset_flag(sk, SOCK_LINGER);
212 return 0; 212 release_sock(sk);
213 return 0;
213 } 214 }
214#endif 215#endif
215 216
216 if(optlen<sizeof(int)) 217 if(optlen<sizeof(int))
217 return(-EINVAL); 218 return(-EINVAL);
218 219
@@ -622,7 +623,8 @@ lenout:
622 * @prot: struct proto associated with this new sock instance 623 * @prot: struct proto associated with this new sock instance
623 * @zero_it: if we should zero the newly allocated sock 624 * @zero_it: if we should zero the newly allocated sock
624 */ 625 */
625struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it) 626struct sock *sk_alloc(int family, unsigned int __nocast priority,
627 struct proto *prot, int zero_it)
626{ 628{
627 struct sock *sk = NULL; 629 struct sock *sk = NULL;
628 kmem_cache_t *slab = prot->slab; 630 kmem_cache_t *slab = prot->slab;
@@ -750,7 +752,8 @@ unsigned long sock_i_ino(struct sock *sk)
750/* 752/*
751 * Allocate a skb from the socket's send buffer. 753 * Allocate a skb from the socket's send buffer.
752 */ 754 */
753struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) 755struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
756 unsigned int __nocast priority)
754{ 757{
755 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 758 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
756 struct sk_buff * skb = alloc_skb(size, priority); 759 struct sk_buff * skb = alloc_skb(size, priority);
@@ -765,7 +768,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
765/* 768/*
766 * Allocate a skb from the socket's receive buffer. 769 * Allocate a skb from the socket's receive buffer.
767 */ 770 */
768struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) 771struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
772 unsigned int __nocast priority)
769{ 773{
770 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { 774 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
771 struct sk_buff *skb = alloc_skb(size, priority); 775 struct sk_buff *skb = alloc_skb(size, priority);
@@ -780,7 +784,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
780/* 784/*
781 * Allocate a memory block from the socket's option memory buffer. 785 * Allocate a memory block from the socket's option memory buffer.
782 */ 786 */
783void *sock_kmalloc(struct sock *sk, int size, int priority) 787void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
784{ 788{
785 if ((unsigned)size <= sysctl_optmem_max && 789 if ((unsigned)size <= sysctl_optmem_max &&
786 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 790 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8be646cb1..8f817ad9f5 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,12 +13,8 @@
13#ifdef CONFIG_SYSCTL 13#ifdef CONFIG_SYSCTL
14 14
15extern int netdev_max_backlog; 15extern int netdev_max_backlog;
16extern int netdev_budget;
16extern int weight_p; 17extern int weight_p;
17extern int no_cong_thresh;
18extern int no_cong;
19extern int lo_cong;
20extern int mod_cong;
21extern int netdev_fastroute;
22extern int net_msg_cost; 18extern int net_msg_cost;
23extern int net_msg_burst; 19extern int net_msg_burst;
24 20
@@ -35,19 +31,6 @@ extern int sysctl_somaxconn;
35extern char sysctl_divert_version[]; 31extern char sysctl_divert_version[];
36#endif /* CONFIG_NET_DIVERT */ 32#endif /* CONFIG_NET_DIVERT */
37 33
38/*
39 * This strdup() is used for creating copies of network
40 * device names to be handed over to sysctl.
41 */
42
43char *net_sysctl_strdup(const char *s)
44{
45 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
46 if (rv)
47 strcpy(rv, s);
48 return rv;
49}
50
51ctl_table core_table[] = { 34ctl_table core_table[] = {
52#ifdef CONFIG_NET 35#ifdef CONFIG_NET
53 { 36 {
@@ -99,38 +82,6 @@ ctl_table core_table[] = {
99 .proc_handler = &proc_dointvec 82 .proc_handler = &proc_dointvec
100 }, 83 },
101 { 84 {
102 .ctl_name = NET_CORE_NO_CONG_THRESH,
103 .procname = "no_cong_thresh",
104 .data = &no_cong_thresh,
105 .maxlen = sizeof(int),
106 .mode = 0644,
107 .proc_handler = &proc_dointvec
108 },
109 {
110 .ctl_name = NET_CORE_NO_CONG,
111 .procname = "no_cong",
112 .data = &no_cong,
113 .maxlen = sizeof(int),
114 .mode = 0644,
115 .proc_handler = &proc_dointvec
116 },
117 {
118 .ctl_name = NET_CORE_LO_CONG,
119 .procname = "lo_cong",
120 .data = &lo_cong,
121 .maxlen = sizeof(int),
122 .mode = 0644,
123 .proc_handler = &proc_dointvec
124 },
125 {
126 .ctl_name = NET_CORE_MOD_CONG,
127 .procname = "mod_cong",
128 .data = &mod_cong,
129 .maxlen = sizeof(int),
130 .mode = 0644,
131 .proc_handler = &proc_dointvec
132 },
133 {
134 .ctl_name = NET_CORE_MSG_COST, 85 .ctl_name = NET_CORE_MSG_COST,
135 .procname = "message_cost", 86 .procname = "message_cost",
136 .data = &net_msg_cost, 87 .data = &net_msg_cost,
@@ -174,9 +125,15 @@ ctl_table core_table[] = {
174 .mode = 0644, 125 .mode = 0644,
175 .proc_handler = &proc_dointvec 126 .proc_handler = &proc_dointvec
176 }, 127 },
128 {
129 .ctl_name = NET_CORE_BUDGET,
130 .procname = "netdev_budget",
131 .data = &netdev_budget,
132 .maxlen = sizeof(int),
133 .mode = 0644,
134 .proc_handler = &proc_dointvec
135 },
177 { .ctl_name = 0 } 136 { .ctl_name = 0 }
178}; 137};
179 138
180EXPORT_SYMBOL(net_sysctl_strdup);
181
182#endif 139#endif
diff --git a/net/core/utils.c b/net/core/utils.c
index e11a8654f3..88eb8b68e2 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -23,10 +23,10 @@
23#include <linux/percpu.h> 23#include <linux/percpu.h>
24#include <linux/init.h> 24#include <linux/init.h>
25 25
26#include <asm/byteorder.h>
26#include <asm/system.h> 27#include <asm/system.h>
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
29
30/* 30/*
31 This is a maximally equidistributed combined Tausworthe generator 31 This is a maximally equidistributed combined Tausworthe generator
32 based on code from GNU Scientific Library 1.5 (30 Jun 2004) 32 based on code from GNU Scientific Library 1.5 (30 Jun 2004)
@@ -153,3 +153,38 @@ int net_ratelimit(void)
153EXPORT_SYMBOL(net_random); 153EXPORT_SYMBOL(net_random);
154EXPORT_SYMBOL(net_ratelimit); 154EXPORT_SYMBOL(net_ratelimit);
155EXPORT_SYMBOL(net_srandom); 155EXPORT_SYMBOL(net_srandom);
156
157/*
158 * Convert an ASCII string to binary IP.
159 * This is outside of net/ipv4/ because various code that uses IP addresses
160 * is otherwise not dependent on the TCP/IP stack.
161 */
162
163__u32 in_aton(const char *str)
164{
165 unsigned long l;
166 unsigned int val;
167 int i;
168
169 l = 0;
170 for (i = 0; i < 4; i++)
171 {
172 l <<= 8;
173 if (*str != '\0')
174 {
175 val = 0;
176 while (*str != '\0' && *str != '.')
177 {
178 val *= 10;
179 val += *str - '0';
180 str++;
181 }
182 l |= val;
183 if (*str != '\0')
184 str++;
185 }
186 }
187 return(htonl(l));
188}
189
190EXPORT_SYMBOL(in_aton);
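
For reference, the parsing loop of the newly exported in_aton() restated as a self-contained userspace program (assuming the standard htonl()/ntohl() from <arpa/inet.h>); like the original it performs no validation of the input string, and the name my_in_aton is only for the sketch.

#include <stdio.h>
#include <arpa/inet.h>

/* Same digit-by-digit parse as the kernel helper: four passes,
 * shifting the accumulator left one byte per pass. */
static unsigned int my_in_aton(const char *str)
{
    unsigned long l = 0;
    unsigned int val;
    int i;

    for (i = 0; i < 4; i++) {
        l <<= 8;
        if (*str != '\0') {
            val = 0;
            while (*str != '\0' && *str != '.') {
                val = val * 10 + (*str - '0');
                str++;
            }
            l |= val;
            if (*str != '\0')
                str++;          /* skip the dot */
        }
    }
    return htonl(l);
}

int main(void)
{
    /* prints 7f000001, i.e. 127.0.0.1 */
    printf("%08x\n", ntohl(my_in_aton("127.0.0.1")));
    return 0;
}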
diff --git a/net/core/wireless.c b/net/core/wireless.c
index b2fe378dfb..3ff5639c0b 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -1102,6 +1102,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
1102 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); 1102 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
1103 r = NLMSG_DATA(nlh); 1103 r = NLMSG_DATA(nlh);
1104 r->ifi_family = AF_UNSPEC; 1104 r->ifi_family = AF_UNSPEC;
1105 r->__ifi_pad = 0;
1105 r->ifi_type = dev->type; 1106 r->ifi_type = dev->type;
1106 r->ifi_index = dev->ifindex; 1107 r->ifi_index = dev->ifindex;
1107 r->ifi_flags = dev->flags; 1108 r->ifi_flags = dev->flags;
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index 2101da542b..92f2ec46fd 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -1,6 +1,29 @@
1# 1#
2# DECnet configuration 2# DECnet configuration
3# 3#
4config DECNET
5 tristate "DECnet Support"
6 ---help---
7 The DECnet networking protocol was used in many products made by
8 Digital (now Compaq). It provides reliable stream and sequenced
9 packet communications over which run a variety of services similar
10 to those which run over TCP/IP.
11
12 To find some tools to use with the kernel layer support, please
13 look at Patrick Caulfield's web site:
14 <http://linux-decnet.sourceforge.net/>.
15
16 More detailed documentation is available in
17 <file:Documentation/networking/decnet.txt>.
18
19 Be sure to say Y to "/proc file system support" and "Sysctl support"
20 below when using DECnet, since you will need sysctl support to aid
21 in configuration at run time.
22
23 The DECnet code is also available as a module ( = code which can be
24 inserted in and removed from the running kernel whenever you want).
25 The module is called decnet.
26
4config DECNET_ROUTER 27config DECNET_ROUTER
5 bool "DECnet: router support (EXPERIMENTAL)" 28 bool "DECnet: router support (EXPERIMENTAL)"
6 depends on DECNET && EXPERIMENTAL 29 depends on DECNET && EXPERIMENTAL
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 29bb3cd219..acdd18e6ad 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk)
536 * we are double checking that we are not sending too 536 * we are double checking that we are not sending too
537 * many of these keepalive frames. 537 * many of these keepalive frames.
538 */ 538 */
539 if (skb_queue_len(&scp->other_xmit_queue) == 0) 539 if (skb_queue_empty(&scp->other_xmit_queue))
540 dn_nsp_send_link(sk, DN_NOCHANGE, 0); 540 dn_nsp_send_link(sk, DN_NOCHANGE, 0);
541} 541}
542 542
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table
1191 struct dn_scp *scp = DN_SK(sk); 1191 struct dn_scp *scp = DN_SK(sk);
1192 int mask = datagram_poll(file, sock, wait); 1192 int mask = datagram_poll(file, sock, wait);
1193 1193
1194 if (skb_queue_len(&scp->other_receive_queue)) 1194 if (!skb_queue_empty(&scp->other_receive_queue))
1195 mask |= POLLRDBAND; 1195 mask |= POLLRDBAND;
1196 1196
1197 return mask; 1197 return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1214 1214
1215 case SIOCATMARK: 1215 case SIOCATMARK:
1216 lock_sock(sk); 1216 lock_sock(sk);
1217 val = (skb_queue_len(&scp->other_receive_queue) != 0); 1217 val = !skb_queue_empty(&scp->other_receive_queue);
1218 if (scp->state != DN_RUN) 1218 if (scp->state != DN_RUN)
1219 val = -ENOTCONN; 1219 val = -ENOTCONN;
1220 release_sock(sk); 1220 release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int
1630 int len = 0; 1630 int len = 0;
1631 1631
1632 if (flags & MSG_OOB) 1632 if (flags & MSG_OOB)
1633 return skb_queue_len(q) ? 1 : 0; 1633 return !skb_queue_empty(q) ? 1 : 0;
1634 1634
1635 while(skb != (struct sk_buff *)q) { 1635 while(skb != (struct sk_buff *)q) {
1636 struct dn_skb_cb *cb = DN_SKB_CB(skb); 1636 struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
1707 if (sk->sk_err) 1707 if (sk->sk_err)
1708 goto out; 1708 goto out;
1709 1709
1710 if (skb_queue_len(&scp->other_receive_queue)) { 1710 if (!skb_queue_empty(&scp->other_receive_queue)) {
1711 if (!(flags & MSG_OOB)) { 1711 if (!(flags & MSG_OOB)) {
1712 msg->msg_flags |= MSG_OOB; 1712 msg->msg_flags |= MSG_OOB;
1713 if (!scp->other_report) { 1713 if (!scp->other_report) {
@@ -1876,15 +1876,6 @@ static inline unsigned int dn_current_mss(struct sock *sk, int flags)
1876 return mss_now; 1876 return mss_now;
1877} 1877}
1878 1878
1879static int dn_error(struct sock *sk, int flags, int err)
1880{
1881 if (err == -EPIPE)
1882 err = sock_error(sk) ? : -EPIPE;
1883 if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
1884 send_sig(SIGPIPE, current, 0);
1885 return err;
1886}
1887
1888static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, 1879static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1889 struct msghdr *msg, size_t size) 1880 struct msghdr *msg, size_t size)
1890{ 1881{
@@ -2045,7 +2036,7 @@ out:
2045 return sent ? sent : err; 2036 return sent ? sent : err;
2046 2037
2047out_err: 2038out_err:
2048 err = dn_error(sk, flags, err); 2039 err = sk_stream_error(sk, flags, err);
2049 release_sock(sk); 2040 release_sock(sk);
2050 return err; 2041 return err;
2051} 2042}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720..99bc061759 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
551 if (t < s_t) 551 if (t < s_t)
552 continue; 552 continue;
553 if (t > s_t) 553 if (t > s_t)
554 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); 554 memset(&cb->args[1], 0,
555 sizeof(cb->args) - sizeof(cb->args[0]));
555 tb = dn_fib_get_table(t, 0); 556 tb = dn_fib_get_table(t, 0);
556 if (tb == NULL) 557 if (tb == NULL)
557 continue; 558 continue;
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f32dba9e26..8d0cc3cf3e 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -148,12 +148,12 @@ static int dn_neigh_construct(struct neighbour *neigh)
148 148
149 __neigh_parms_put(neigh->parms); 149 __neigh_parms_put(neigh->parms);
150 neigh->parms = neigh_parms_clone(parms); 150 neigh->parms = neigh_parms_clone(parms);
151 rcu_read_unlock();
152 151
153 if (dn_db->use_long) 152 if (dn_db->use_long)
154 neigh->ops = &dn_long_ops; 153 neigh->ops = &dn_long_ops;
155 else 154 else
156 neigh->ops = &dn_short_ops; 155 neigh->ops = &dn_short_ops;
156 rcu_read_unlock();
157 157
158 if (dn->flags & DN_NDFLAG_P3) 158 if (dn->flags & DN_NDFLAG_P3)
159 neigh->ops = &dn_phase3_ops; 159 neigh->ops = &dn_phase3_ops;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 42abbf3f52..8cce1fdbda 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
342 342
343 dn_nsp_output(sk); 343 dn_nsp_output(sk);
344 344
345 if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue)) 345 if (!skb_queue_empty(&scp->data_xmit_queue) ||
346 !skb_queue_empty(&scp->other_xmit_queue))
346 scp->persist = dn_nsp_persist(sk); 347 scp->persist = dn_nsp_persist(sk);
347 348
348 return 0; 349 return 0;
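
The decnet hunks above all make the same substitution: a non-zero length test becomes an explicit emptiness predicate. The toy illustration below shows the predicate form on a circular doubly-linked list (hypothetical names, not the kernel's sk_buff_head API): emptiness falls out of the list structure itself, with no dependence on a separate length counter.

#include <stdio.h>
#include <stdbool.h>

/* toy circular doubly-linked queue head */
struct node { struct node *next, *prev; };

static void queue_init(struct node *head)
{
    head->next = head->prev = head;
}

/* emptiness checked structurally, without consulting a length field */
static bool queue_empty(const struct node *head)
{
    return head->next == head;
}

int main(void)
{
    struct node q;
    queue_init(&q);
    printf("empty? %s\n", queue_empty(&q) ? "yes" : "no");
    return 0;
}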
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
new file mode 100644
index 0000000000..39a2d2975e
--- /dev/null
+++ b/net/econet/Kconfig
@@ -0,0 +1,36 @@
1#
2# Acorn Econet/AUN protocols
3#
4
5config ECONET
6 tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
7 depends on EXPERIMENTAL && INET
8 ---help---
9 Econet is a fairly old and slow networking protocol mainly used by
10 Acorn computers to access file and print servers. It uses native
11 Econet network cards. AUN is an implementation of the higher level
12 parts of Econet that runs over ordinary Ethernet connections, on
13 top of the UDP packet protocol, which in turn runs on top of the
14 Internet protocol IP.
15
16 If you say Y here, you can choose with the next two options whether
17 to send Econet/AUN traffic over a UDP Ethernet connection or over
18 a native Econet network card.
19
20 To compile this driver as a module, choose M here: the module
21 will be called econet.
22
23config ECONET_AUNUDP
24 bool "AUN over UDP"
25 depends on ECONET
26 help
27 Say Y here if you want to send Econet/AUN traffic over a UDP
28 connection (UDP is a packet based protocol that runs on top of the
29 Internet protocol IP) using an ordinary Ethernet network card.
30
31config ECONET_NATIVE
32 bool "Native Econet"
33 depends on ECONET
34 help
35 Say Y here if you have a native Econet network card installed in
36 your computer.
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 6617ea47d3..f6dbfb99b1 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -92,10 +92,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
92 * Set the source hardware address. 92 * Set the source hardware address.
93 */ 93 */
94 94
95 if(saddr) 95 if(!saddr)
96 memcpy(eth->h_source,saddr,dev->addr_len); 96 saddr = dev->dev_addr;
97 else 97 memcpy(eth->h_source,saddr,dev->addr_len);
98 memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
99 98
100 /* 99 /*
101 * Anyway, the loopback-device should never use this function... 100 * Anyway, the loopback-device should never use this function...
@@ -156,7 +155,7 @@ int eth_rebuild_header(struct sk_buff *skb)
156 * This is normal practice and works for any 'now in use' protocol. 155 * This is normal practice and works for any 'now in use' protocol.
157 */ 156 */
158 157
159unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev) 158__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
160{ 159{
161 struct ethhdr *eth; 160 struct ethhdr *eth;
162 unsigned char *rawp; 161 unsigned char *rawp;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c3..0b3d9f1d80 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,35 +1,8 @@
1# 1#
2# IP configuration 2# IP configuration
3# 3#
4choice
5 prompt "Choose IP: FIB lookup"
6 depends on INET
7 default IP_FIB_HASH
8
9config IP_FIB_HASH
10 bool "FIB_HASH"
11 ---help---
12 Current FIB is very proven and good enough for most users.
13
14config IP_FIB_TRIE
15 bool "FIB_TRIE"
16 ---help---
17 Use new experimental LC-trie as FIB lookup algorithm.
18 This improves lookup performance
19
20 LC-trie is described in:
21
22 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 An experimental study of compression methods for dynamic tries
25 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
26 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
27
28endchoice
29
30config IP_MULTICAST 4config IP_MULTICAST
31 bool "IP: multicasting" 5 bool "IP: multicasting"
32 depends on INET
33 help 6 help
34 This is code for addressing several networked computers at once, 7 This is code for addressing several networked computers at once,
35 enlarging your kernel by about 2 KB. You need multicasting if you 8 enlarging your kernel by about 2 KB. You need multicasting if you
@@ -43,7 +16,6 @@ config IP_MULTICAST
43 16
44config IP_ADVANCED_ROUTER 17config IP_ADVANCED_ROUTER
45 bool "IP: advanced router" 18 bool "IP: advanced router"
46 depends on INET
47 ---help--- 19 ---help---
48 If you intend to run your Linux box mostly as a router, i.e. as a 20 If you intend to run your Linux box mostly as a router, i.e. as a
49 computer that forwards and redistributes network packets, say Y; you 21 computer that forwards and redistributes network packets, say Y; you
@@ -79,6 +51,40 @@ config IP_ADVANCED_ROUTER
79 51
80 If unsure, say N here. 52 If unsure, say N here.
81 53
54choice
55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
56 depends on IP_ADVANCED_ROUTER
57 default ASK_IP_FIB_HASH
58
59config ASK_IP_FIB_HASH
60 bool "FIB_HASH"
61 ---help---
62 Current FIB is very proven and good enough for most users.
63
64config IP_FIB_TRIE
65 bool "FIB_TRIE"
66 ---help---
67 Use new experimental LC-trie as FIB lookup algorithm.
68 This improves lookup performance if you have a large
69 number of routes.
70
71 LC-trie is a longest matching prefix lookup algorithm which
72 performs better than FIB_HASH for large routing tables.
73 But, it consumes more memory and is more complex.
74
75 LC-trie is described in:
76
77 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
78 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
79 An experimental study of compression methods for dynamic tries
80 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
81 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
82
83endchoice
84
85config IP_FIB_HASH
86 def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
87
82config IP_MULTIPLE_TABLES 88config IP_MULTIPLE_TABLES
83 bool "IP: policy routing" 89 bool "IP: policy routing"
84 depends on IP_ADVANCED_ROUTER 90 depends on IP_ADVANCED_ROUTER
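
The FIB_TRIE option above selects an LC-trie for route lookup. As background for readers new to prefix lookup, here is a deliberately plain binary-trie longest-prefix match in userspace C; it is only a sketch of the problem being solved and does not attempt the level compression or path compression that the kernel's fib_trie.c implements, and all names are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* One node per key bit; a node may carry a route terminating at that prefix. */
struct trie_node {
    struct trie_node *child[2];
    int has_route;
    int route_id;
};

static struct trie_node *node_new(void)
{
    return calloc(1, sizeof(struct trie_node));
}

static void trie_insert(struct trie_node *root, uint32_t prefix, int plen, int id)
{
    struct trie_node *n = root;
    for (int i = 0; i < plen; i++) {
        int bit = (prefix >> (31 - i)) & 1;
        if (!n->child[bit])
            n->child[bit] = node_new();
        n = n->child[bit];
    }
    n->has_route = 1;
    n->route_id = id;
}

/* Walk bit by bit, remembering the deepest node that carried a route. */
static int trie_lookup(struct trie_node *root, uint32_t addr)
{
    struct trie_node *n = root;
    int best = -1;
    for (int i = 0; i < 32 && n; i++) {
        if (n->has_route)
            best = n->route_id;
        n = n->child[(addr >> (31 - i)) & 1];
    }
    if (n && n->has_route)
        best = n->route_id;
    return best;
}

int main(void)
{
    struct trie_node *root = node_new();
    /* 10.0.0.0/8 -> route 1, 10.1.0.0/16 -> route 2 */
    trie_insert(root, 0x0A000000u, 8, 1);
    trie_insert(root, 0x0A010000u, 16, 2);
    printf("10.1.2.3  -> route %d\n", trie_lookup(root, 0x0A010203u)); /* 2 */
    printf("10.9.9.9  -> route %d\n", trie_lookup(root, 0x0A090909u)); /* 1 */
    printf("192.0.2.1 -> route %d\n", trie_lookup(root, 0xC0000201u)); /* -1 */
    return 0;
}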
@@ -118,7 +124,7 @@ config IP_ROUTE_MULTIPATH
118 124
119config IP_ROUTE_MULTIPATH_CACHED 125config IP_ROUTE_MULTIPATH_CACHED
120 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)" 126 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
121 depends on: IP_ROUTE_MULTIPATH 127 depends on IP_ROUTE_MULTIPATH
122 help 128 help
123 Normally, equal cost multipath routing is not supported by the 129 Normally, equal cost multipath routing is not supported by the
124 routing cache. If you say Y here, alternative routes are cached 130 routing cache. If you say Y here, alternative routes are cached
@@ -171,7 +177,6 @@ config IP_ROUTE_VERBOSE
171 177
172config IP_PNP 178config IP_PNP
173 bool "IP: kernel level autoconfiguration" 179 bool "IP: kernel level autoconfiguration"
174 depends on INET
175 help 180 help
176 This enables automatic configuration of IP addresses of devices and 181 This enables automatic configuration of IP addresses of devices and
177 of the routing table during kernel boot, based on either information 182 of the routing table during kernel boot, based on either information
@@ -230,8 +235,6 @@ config IP_PNP_RARP
230# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 235# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
231config NET_IPIP 236config NET_IPIP
232 tristate "IP: tunneling" 237 tristate "IP: tunneling"
233 depends on INET
234 select INET_TUNNEL
235 ---help--- 238 ---help---
236 Tunneling means encapsulating data of one protocol type within 239 Tunneling means encapsulating data of one protocol type within
237 another protocol and sending it over a channel that understands the 240 another protocol and sending it over a channel that understands the
@@ -248,8 +251,6 @@ config NET_IPIP
248 251
249config NET_IPGRE 252config NET_IPGRE
250 tristate "IP: GRE tunnels over IP" 253 tristate "IP: GRE tunnels over IP"
251 depends on INET
252 select XFRM
253 help 254 help
254 Tunneling means encapsulating data of one protocol type within 255 Tunneling means encapsulating data of one protocol type within
255 another protocol and sending it over a channel that understands the 256 another protocol and sending it over a channel that understands the
@@ -307,7 +308,7 @@ config IP_PIMSM_V2
307 308
308config ARPD 309config ARPD
309 bool "IP: ARP daemon support (EXPERIMENTAL)" 310 bool "IP: ARP daemon support (EXPERIMENTAL)"
310 depends on INET && EXPERIMENTAL 311 depends on EXPERIMENTAL
311 ---help--- 312 ---help---
312 Normally, the kernel maintains an internal cache which maps IP 313 Normally, the kernel maintains an internal cache which maps IP
313 addresses to hardware addresses on the local network, so that 314 addresses to hardware addresses on the local network, so that
@@ -332,7 +333,6 @@ config ARPD
332 333
333config SYN_COOKIES 334config SYN_COOKIES
334 bool "IP: TCP syncookie support (disabled per default)" 335 bool "IP: TCP syncookie support (disabled per default)"
335 depends on INET
336 ---help--- 336 ---help---
337 Normal TCP/IP networking is open to an attack known as "SYN 337 Normal TCP/IP networking is open to an attack known as "SYN
338 flooding". This denial-of-service attack prevents legitimate remote 338 flooding". This denial-of-service attack prevents legitimate remote
@@ -369,7 +369,6 @@ config SYN_COOKIES
369 369
370config INET_AH 370config INET_AH
371 tristate "IP: AH transformation" 371 tristate "IP: AH transformation"
372 depends on INET
373 select XFRM 372 select XFRM
374 select CRYPTO 373 select CRYPTO
375 select CRYPTO_HMAC 374 select CRYPTO_HMAC
@@ -382,7 +381,6 @@ config INET_AH
382 381
383config INET_ESP 382config INET_ESP
384 tristate "IP: ESP transformation" 383 tristate "IP: ESP transformation"
385 depends on INET
386 select XFRM 384 select XFRM
387 select CRYPTO 385 select CRYPTO
388 select CRYPTO_HMAC 386 select CRYPTO_HMAC
@@ -396,7 +394,6 @@ config INET_ESP
396 394
397config INET_IPCOMP 395config INET_IPCOMP
398 tristate "IP: IPComp transformation" 396 tristate "IP: IPComp transformation"
399 depends on INET
400 select XFRM 397 select XFRM
401 select INET_TUNNEL 398 select INET_TUNNEL
402 select CRYPTO 399 select CRYPTO
@@ -409,7 +406,6 @@ config INET_IPCOMP
409 406
410config INET_TUNNEL 407config INET_TUNNEL
411 tristate "IP: tunnel transformation" 408 tristate "IP: tunnel transformation"
412 depends on INET
413 select XFRM 409 select XFRM
414 ---help--- 410 ---help---
415 Support for generic IP tunnel transformation, which is required by 411 Support for generic IP tunnel transformation, which is required by
@@ -419,7 +415,6 @@ config INET_TUNNEL
419 415
420config IP_TCPDIAG 416config IP_TCPDIAG
421 tristate "IP: TCP socket monitoring interface" 417 tristate "IP: TCP socket monitoring interface"
422 depends on INET
423 default y 418 default y
424 ---help--- 419 ---help---
425 Support for TCP socket monitoring interface used by native Linux 420 Support for TCP socket monitoring interface used by native Linux
@@ -433,5 +428,108 @@ config IP_TCPDIAG
433config IP_TCPDIAG_IPV6 428config IP_TCPDIAG_IPV6
434 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) 429 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
435 430
431config TCP_CONG_ADVANCED
432 bool "TCP: advanced congestion control"
433 ---help---
434 Support for selection of various TCP congestion control
435 modules.
436
437 Nearly all users can safely say no here, and a safe default
438 selection will be made (BIC-TCP with new Reno as a fallback).
439
440 If unsure, say N.
441
442# TCP Reno is builtin (required as fallback)
443menu "TCP congestion control"
444 depends on TCP_CONG_ADVANCED
445
446config TCP_CONG_BIC
447 tristate "Binary Increase Congestion (BIC) control"
448 default y
449 ---help---
450 BIC-TCP is a sender-side only change that ensures a linear RTT
451 fairness under large windows while offering both scalability and
452 bounded TCP-friendliness. The protocol combines two schemes
453 called additive increase and binary search increase. When the
454 congestion window is large, additive increase with a large
455 increment ensures linear RTT fairness as well as good
456 scalability. Under small congestion windows, binary search
457 increase provides TCP friendliness.
458 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
459
460config TCP_CONG_WESTWOOD
461 tristate "TCP Westwood+"
462 default m
463 ---help---
464 TCP Westwood+ is a sender-side only modification of the TCP Reno
465 protocol stack that optimizes the performance of TCP congestion
466 control. It is based on end-to-end bandwidth estimation to set
467 congestion window and slow start threshold after a congestion
468 episode. Using this estimation, TCP Westwood+ adaptively sets a
469 slow start threshold and a congestion window which takes into
470 account the bandwidth used at the time congestion is experienced.
471 TCP Westwood+ significantly increases fairness wrt TCP Reno in
472 wired networks and throughput over wireless links.
473
474config TCP_CONG_HTCP
475 tristate "H-TCP"
476 default m
477 ---help---
478 H-TCP is a sender-side only modification of the TCP Reno
479 protocol stack that optimizes the performance of TCP
480 congestion control for high speed network links. It uses a
481 modeswitch to change the alpha and beta parameters of TCP Reno
482 based on network conditions and in a way so as to be fair with
483 other Reno and H-TCP flows.
484
485config TCP_CONG_HSTCP
486 tristate "High Speed TCP"
487 depends on EXPERIMENTAL
488 default n
489 ---help---
490 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
491 A modification to TCP's congestion control mechanism for use
492 with large congestion windows. A table indicates how much to
493 increase the congestion window by when an ACK is received.
494 For more detail see http://www.icir.org/floyd/hstcp.html
495
496config TCP_CONG_HYBLA
497 tristate "TCP-Hybla congestion control algorithm"
498 depends on EXPERIMENTAL
499 default n
500 ---help---
501 TCP-Hybla is a sender-side only change that eliminates penalization of
502 long-RTT, large-bandwidth connections, like when satellite legs are
503 involved, especially when sharing a common bottleneck with normal
504 terrestrial connections.
505
506config TCP_CONG_VEGAS
507 tristate "TCP Vegas"
508 depends on EXPERIMENTAL
509 default n
510 ---help---
511 TCP Vegas is a sender-side only change to TCP that anticipates
512 the onset of congestion by estimating the bandwidth. TCP Vegas
513 adjusts the sending rate by modifying the congestion
514 window. TCP Vegas should provide less packet loss, but it is
515 not as aggressive as TCP Reno.
516
517config TCP_CONG_SCALABLE
518 tristate "Scalable TCP"
519 depends on EXPERIMENTAL
520 default n
521 ---help---
522 Scalable TCP is a sender-side only change to TCP which uses a
523 MIMD congestion control algorithm which has some nice scaling
524 properties, though it is known to have fairness issues.
525 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
526
527endmenu
528
529config TCP_CONG_BIC
530 tristate
531 depends on !TCP_CONG_ADVANCED
532 default y
533
436source "net/ipv4/ipvs/Kconfig" 534source "net/ipv4/ipvs/Kconfig"
437 535
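
The new TCP_CONG_* options above make each congestion-control algorithm a loadable module behind a common interface (the diffstat adds tcp_cong.c for the registration side). The sketch below is only a guess at the general shape of such a pluggable-ops registry, written as plain userspace C; struct cong_ops, cong_register() and the Reno-like numbers are hypothetical and are not the kernel's tcp_congestion_ops API.

#include <stdio.h>
#include <string.h>

/* Hypothetical ops vector: each algorithm supplies the hooks it implements. */
struct cong_ops {
    const char *name;
    unsigned int (*ssthresh)(unsigned int cwnd);
    unsigned int (*cong_avoid)(unsigned int cwnd, int loss);
    struct cong_ops *next;
};

static struct cong_ops *cong_list;

static void cong_register(struct cong_ops *ops)
{
    ops->next = cong_list;          /* prepend to the registry */
    cong_list = ops;
}

static struct cong_ops *cong_find(const char *name)
{
    for (struct cong_ops *o = cong_list; o; o = o->next)
        if (strcmp(o->name, name) == 0)
            return o;
    return NULL;
}

/* A Reno-like fallback: halve on loss, grow by one otherwise. */
static unsigned int reno_ssthresh(unsigned int cwnd)
{
    return cwnd > 2 ? cwnd / 2 : 2;
}

static unsigned int reno_avoid(unsigned int cwnd, int loss)
{
    return loss ? reno_ssthresh(cwnd) : cwnd + 1;
}

static struct cong_ops reno = { "reno", reno_ssthresh, reno_avoid, NULL };

int main(void)
{
    struct cong_ops *o;
    unsigned int cwnd = 10;

    cong_register(&reno);
    o = cong_find("reno");
    cwnd = o->cong_avoid(cwnd, 0);   /* no loss: 11 */
    cwnd = o->cong_avoid(cwnd, 1);   /* loss: back to ssthresh, 5 */
    printf("cwnd ends at %u\n", cwnd);
    return 0;
}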
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1a..55dc6cca1e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -2,10 +2,11 @@
2# Makefile for the Linux TCP/IP (INET) layer. 2# Makefile for the Linux TCP/IP (INET) layer.
3# 3#
4 4
5obj-y := utils.o route.o inetpeer.o protocol.o \ 5obj-y := route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
9 tcp_minisocks.o tcp_cong.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 10 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o 11 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 12
@@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
30obj-$(CONFIG_IP_VS) += ipvs/ 31obj-$(CONFIG_IP_VS) += ipvs/
31obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 32obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
32obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 33obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
34obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
37obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
38obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
39obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
40obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
33 41
34obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 42obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
35 xfrm4_output.o 43 xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e797792..163ae4068b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
1009static int ipv4_proc_init(void); 1009static int ipv4_proc_init(void);
1010extern void ipfrag_init(void); 1010extern void ipfrag_init(void);
1011 1011
1012/*
1013 * IP protocol layer initialiser
1014 */
1015
1016static struct packet_type ip_packet_type = {
1017 .type = __constant_htons(ETH_P_IP),
1018 .func = ip_rcv,
1019};
1020
1012static int __init inet_init(void) 1021static int __init inet_init(void)
1013{ 1022{
1014 struct sk_buff *dummy_skb; 1023 struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
1102 1111
1103 ipfrag_init(); 1112 ipfrag_init();
1104 1113
1114 dev_add_pack(&ip_packet_type);
1115
1105 rc = 0; 1116 rc = 0;
1106out: 1117out:
1107 return rc; 1118 return rc;
@@ -1146,7 +1157,7 @@ static int __init ipv4_proc_init(void)
1146#ifdef CONFIG_IP_FIB_TRIE 1157#ifdef CONFIG_IP_FIB_TRIE
1147 if (fib_stat_proc_init()) 1158 if (fib_stat_proc_init())
1148 goto out_fib_stat; 1159 goto out_fib_stat;
1149 #endif 1160#endif
1150 if (ip_misc_proc_init()) 1161 if (ip_misc_proc_init())
1151 goto out_misc; 1162 goto out_misc;
1152out: 1163out:
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 650dcb12d9..d8a10e3dd7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
1471 * by sysctl and we wouldn't want anyone to change it under our feet 1471 * by sysctl and we wouldn't want anyone to change it under our feet
1472 * (see SIOCSIFNAME). 1472 * (see SIOCSIFNAME).
1473 */ 1473 */
1474 dev_name = net_sysctl_strdup(dev_name); 1474 dev_name = kstrdup(dev_name, GFP_KERNEL);
1475 if (!dev_name) 1475 if (!dev_name)
1476 goto free; 1476 goto free;
1477 1477
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c886b28ba9..e278cb9d00 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash, 593 struct hlist_head *new_laddrhash,
594 unsigned int new_size) 594 unsigned int new_size)
595{ 595{
596 struct hlist_head *old_info_hash, *old_laddrhash;
596 unsigned int old_size = fib_hash_size; 597 unsigned int old_size = fib_hash_size;
597 unsigned int i; 598 unsigned int i, bytes;
598 599
599 write_lock(&fib_info_lock); 600 write_lock(&fib_info_lock);
601 old_info_hash = fib_info_hash;
602 old_laddrhash = fib_info_laddrhash;
600 fib_hash_size = new_size; 603 fib_hash_size = new_size;
601 604
602 for (i = 0; i < old_size; i++) { 605 for (i = 0; i < old_size; i++) {
@@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
636 fib_info_laddrhash = new_laddrhash; 639 fib_info_laddrhash = new_laddrhash;
637 640
638 write_unlock(&fib_info_lock); 641 write_unlock(&fib_info_lock);
642
643 bytes = old_size * sizeof(struct hlist_head *);
644 fib_hash_free(old_info_hash, bytes);
645 fib_hash_free(old_laddrhash, bytes);
639} 646}
640 647
641struct fib_info * 648struct fib_info *
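
The fib_semantics.c hunk saves the old hash pointers before publishing the new tables under fib_info_lock and frees them only after the write lock is dropped. Below is a compact userspace sketch of that swap-then-free pattern using a pthread rwlock (compile with -pthread); the names are hypothetical and the rehash step is elided.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static int *table;
static size_t table_size;

/* Build the replacement outside the lock, publish it under the lock,
 * then free the old array once no reader can reach it any more. */
static void table_resize(size_t new_size)
{
    int *new_table = calloc(new_size, sizeof(*new_table));
    int *old_table;

    if (!new_table)
        return;

    pthread_rwlock_wrlock(&table_lock);
    old_table  = table;           /* remember what we are replacing */
    /* ... rehash old entries into new_table here ... */
    table      = new_table;       /* publish the new table */
    table_size = new_size;
    pthread_rwlock_unlock(&table_lock);

    free(old_table);              /* readers take the lock, so this is safe */
}

int main(void)
{
    table_resize(16);
    table_resize(64);
    printf("table size now %zu\n", table_size);
    free(table);
    return 0;
}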
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0671569ee6..45efd5f474 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.323" 46#define VERSION "0.325"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -90,14 +90,14 @@ typedef unsigned int t_key;
90#define T_LEAF 1 90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL 91#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \ 92#define NODE_PARENT(_node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) 93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \ 94#define NODE_SET_PARENT(_node, _ptr) \
95((_node)->_parent = (((unsigned long)(_ptr)) | \ 95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK))) 96 ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \ 97#define NODE_INIT_PARENT(_node, _type) \
98((_node)->_parent = (_type)) 98 ((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \ 99#define NODE_TYPE(_node) \
100((_node)->_parent & NODE_TYPE_MASK) 100 ((_node)->_parent & NODE_TYPE_MASK)
101 101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF)) 102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF) 103#define IS_LEAF(n) (n->_parent & T_LEAF)
@@ -136,6 +136,7 @@ struct trie_use_stats {
136 unsigned int semantic_match_passed; 136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss; 137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit; 138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
139}; 140};
140#endif 141#endif
141 142
@@ -146,7 +147,7 @@ struct trie_stat {
146 unsigned int leaves; 147 unsigned int leaves;
147 unsigned int nullpointers; 148 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS]; 149 unsigned int nodesizes[MAX_CHILDS];
149}; 150};
150 151
151struct trie { 152struct trie {
152 struct node *trie; 153 struct node *trie;
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn); 166static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn); 167static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn); 168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
168static struct tnode *halve(struct trie *t, struct tnode *tn); 169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
169static void tnode_free(struct tnode *tn); 170static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t); 171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); 172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -184,9 +185,9 @@ static void trie_bug(char *err)
184 BUG(); 185 BUG();
185} 186}
186 187
187static inline struct node *tnode_get_child(struct tnode *tn, int i) 188static inline struct node *tnode_get_child(struct tnode *tn, int i)
188{ 189{
189 if (i >= 1<<tn->bits) 190 if (i >= 1<<tn->bits)
190 trie_bug("tnode_get_child"); 191 trie_bug("tnode_get_child");
191 192
192 return tn->child[i]; 193 return tn->child[i];
@@ -201,7 +202,7 @@ static inline int tnode_child_length(struct tnode *tn)
201 _________________________________________________________________ 202 _________________________________________________________________
202 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | 203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203 ---------------------------------------------------------------- 204 ----------------------------------------------------------------
204 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
205 206
206 _________________________________________________________________ 207 _________________________________________________________________
207 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | 208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
@@ -225,25 +226,25 @@ static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
225 226
226static inline int tkey_equals(t_key a, t_key b) 227static inline int tkey_equals(t_key a, t_key b)
227{ 228{
228 return a == b; 229 return a == b;
229} 230}
230 231
231static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) 232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232{ 233{
233 if (bits == 0 || offset >= KEYLENGTH) 234 if (bits == 0 || offset >= KEYLENGTH)
234 return 1; 235 return 1;
235 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 236 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237} 238}
238 239
239static inline int tkey_mismatch(t_key a, int offset, t_key b) 240static inline int tkey_mismatch(t_key a, int offset, t_key b)
240{ 241{
241 t_key diff = a ^ b; 242 t_key diff = a ^ b;
242 int i = offset; 243 int i = offset;
243 244
244 if(!diff) 245 if (!diff)
245 return 0; 246 return 0;
246 while((diff << i) >> (KEYLENGTH-1) == 0) 247 while ((diff << i) >> (KEYLENGTH-1) == 0)
247 i++; 248 i++;
248 return i; 249 return i;
249} 250}
@@ -313,6 +314,7 @@ static void fn_free_alias(struct fib_alias *fa)
313 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into 314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314 n's child array, and will of course be different for each child. 315 n's child array, and will of course be different for each child.
315 316
317
316 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown 318 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317 at this point. 319 at this point.
318 320
@@ -320,7 +322,7 @@ static void fn_free_alias(struct fib_alias *fa)
320 322
321static void check_tnode(struct tnode *tn) 323static void check_tnode(struct tnode *tn)
322{ 324{
323 if(tn && tn->pos+tn->bits > 32) { 325 if (tn && tn->pos+tn->bits > 32) {
324 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); 326 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325 } 327 }
326} 328}
@@ -331,7 +333,7 @@ static int inflate_threshold = 50;
331static struct leaf *leaf_new(void) 333static struct leaf *leaf_new(void)
332{ 334{
333 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); 335 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
334 if(l) { 336 if (l) {
335 NODE_INIT_PARENT(l, T_LEAF); 337 NODE_INIT_PARENT(l, T_LEAF);
336 INIT_HLIST_HEAD(&l->list); 338 INIT_HLIST_HEAD(&l->list);
337 } 339 }
@@ -341,8 +343,10 @@ static struct leaf *leaf_new(void)
341static struct leaf_info *leaf_info_new(int plen) 343static struct leaf_info *leaf_info_new(int plen)
342{ 344{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 345 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen; 346 if (li) {
345 INIT_LIST_HEAD(&li->falh); 347 li->plen = plen;
348 INIT_LIST_HEAD(&li->falh);
349 }
346 return li; 350 return li;
347} 351}
348 352
@@ -356,13 +360,34 @@ static inline void free_leaf_info(struct leaf_info *li)
356 kfree(li); 360 kfree(li);
357} 361}
358 362
363static struct tnode *tnode_alloc(unsigned int size)
364{
365 if (size <= PAGE_SIZE) {
366 return kmalloc(size, GFP_KERNEL);
367 } else {
368 return (struct tnode *)
369 __get_free_pages(GFP_KERNEL, get_order(size));
370 }
371}
372
373static void __tnode_free(struct tnode *tn)
374{
375 unsigned int size = sizeof(struct tnode) +
376 (1<<tn->bits) * sizeof(struct node *);
377
378 if (size <= PAGE_SIZE)
379 kfree(tn);
380 else
381 free_pages((unsigned long)tn, get_order(size));
382}
383
359static struct tnode* tnode_new(t_key key, int pos, int bits) 384static struct tnode* tnode_new(t_key key, int pos, int bits)
360{ 385{
361 int nchildren = 1<<bits; 386 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 387 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL); 388 struct tnode *tn = tnode_alloc(sz);
364 389
365 if(tn) { 390 if (tn) {
366 memset(tn, 0, sz); 391 memset(tn, 0, sz);
367 NODE_INIT_PARENT(tn, T_TNODE); 392 NODE_INIT_PARENT(tn, T_TNODE);
368 tn->pos = pos; 393 tn->pos = pos;
@@ -371,7 +396,8 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
371 tn->full_children = 0; 396 tn->full_children = 0;
372 tn->empty_children = 1<<bits; 397 tn->empty_children = 1<<bits;
373 } 398 }
374 if(trie_debug > 0) 399
400 if (trie_debug > 0)
375 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), 401 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
376 (unsigned int) (sizeof(struct node) * 1<<bits)); 402 (unsigned int) (sizeof(struct node) * 1<<bits));
377 return tn; 403 return tn;
@@ -379,17 +405,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
379 405
380static void tnode_free(struct tnode *tn) 406static void tnode_free(struct tnode *tn)
381{ 407{
382 if(!tn) { 408 if (!tn) {
383 trie_bug("tnode_free\n"); 409 trie_bug("tnode_free\n");
384 } 410 }
385 if(IS_LEAF(tn)) { 411 if (IS_LEAF(tn)) {
386 free_leaf((struct leaf *)tn); 412 free_leaf((struct leaf *)tn);
387 if(trie_debug > 0 ) 413 if (trie_debug > 0 )
388 printk("FL %p \n", tn); 414 printk("FL %p \n", tn);
389 } 415 }
390 else if(IS_TNODE(tn)) { 416 else if (IS_TNODE(tn)) {
391 kfree(tn); 417 __tnode_free(tn);
392 if(trie_debug > 0 ) 418 if (trie_debug > 0 )
393 printk("FT %p \n", tn); 419 printk("FT %p \n", tn);
394 } 420 }
395 else { 421 else {
@@ -404,66 +430,67 @@ static void tnode_free(struct tnode *tn)
404 430
405static inline int tnode_full(struct tnode *tn, struct node *n) 431static inline int tnode_full(struct tnode *tn, struct node *n)
406{ 432{
407 if(n == NULL || IS_LEAF(n)) 433 if (n == NULL || IS_LEAF(n))
408 return 0; 434 return 0;
409 435
410 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 436 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
411} 437}
412 438
413static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) 439static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
414{ 440{
415 tnode_put_child_reorg(tn, i, n, -1); 441 tnode_put_child_reorg(tn, i, n, -1);
416} 442}
417 443
418 /* 444 /*
419 * Add a child at position i overwriting the old value. 445 * Add a child at position i overwriting the old value.
420 * Update the value of full_children and empty_children. 446 * Update the value of full_children and empty_children.
421 */ 447 */
422 448
423static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 449static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
424{ 450{
425 struct node *chi; 451 struct node *chi;
426 int isfull; 452 int isfull;
427 453
428 if(i >= 1<<tn->bits) { 454 if (i >= 1<<tn->bits) {
429 printk("bits=%d, i=%d\n", tn->bits, i); 455 printk("bits=%d, i=%d\n", tn->bits, i);
430 trie_bug("tnode_put_child_reorg bits"); 456 trie_bug("tnode_put_child_reorg bits");
431 } 457 }
432 write_lock_bh(&fib_lock); 458 write_lock_bh(&fib_lock);
433 chi = tn->child[i]; 459 chi = tn->child[i];
434 460
435 /* update emptyChildren */ 461 /* update emptyChildren */
436 if (n == NULL && chi != NULL) 462 if (n == NULL && chi != NULL)
437 tn->empty_children++; 463 tn->empty_children++;
438 else if (n != NULL && chi == NULL) 464 else if (n != NULL && chi == NULL)
439 tn->empty_children--; 465 tn->empty_children--;
440 466
441 /* update fullChildren */ 467 /* update fullChildren */
442 if (wasfull == -1) 468 if (wasfull == -1)
443 wasfull = tnode_full(tn, chi); 469 wasfull = tnode_full(tn, chi);
444 470
445 isfull = tnode_full(tn, n); 471 isfull = tnode_full(tn, n);
446 if (wasfull && !isfull) 472 if (wasfull && !isfull)
447 tn->full_children--; 473 tn->full_children--;
448 474
449 else if (!wasfull && isfull) 475 else if (!wasfull && isfull)
450 tn->full_children++; 476 tn->full_children++;
451 if(n) 477 if (n)
452 NODE_SET_PARENT(n, tn); 478 NODE_SET_PARENT(n, tn);
453 479
454 tn->child[i] = n; 480 tn->child[i] = n;
455 write_unlock_bh(&fib_lock); 481 write_unlock_bh(&fib_lock);
456} 482}
457 483
458static struct node *resize(struct trie *t, struct tnode *tn) 484static struct node *resize(struct trie *t, struct tnode *tn)
459{ 485{
460 int i; 486 int i;
487 int err = 0;
461 488
462 if (!tn) 489 if (!tn)
463 return NULL; 490 return NULL;
464 491
465 if(trie_debug) 492 if (trie_debug)
466 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 493 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 tn, inflate_threshold, halve_threshold); 494 tn, inflate_threshold, halve_threshold);
468 495
469 /* No children */ 496 /* No children */
@@ -480,7 +507,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
480 507
481 /* compress one level */ 508 /* compress one level */
482 struct node *n = tn->child[i]; 509 struct node *n = tn->child[i];
483 if(n) 510 if (n)
484 NODE_INIT_PARENT(n, NODE_TYPE(n)); 511 NODE_INIT_PARENT(n, NODE_TYPE(n));
485 512
486 write_unlock_bh(&fib_lock); 513 write_unlock_bh(&fib_lock);
@@ -489,77 +516,85 @@ static struct node *resize(struct trie *t, struct tnode *tn)
489 } 516 }
490 write_unlock_bh(&fib_lock); 517 write_unlock_bh(&fib_lock);
491 } 518 }
492 /* 519 /*
493 * Double as long as the resulting node has a number of 520 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold. 521 * nonempty nodes that are above the threshold.
495 */ 522 */
496 523
497 /* 524 /*
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of 525 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia 526 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6: 527 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all 528 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'." 529 * children in the *doubled* node is at least 'high'."
503 * 530 *
504 * 'high' in this instance is the variable 'inflate_threshold'. It 531 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with 532 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the 533 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying 534 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we 535 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50. 536 * multiply the left-hand side by 50.
510 * 537 *
511 * The left-hand side may look a bit weird: tnode_child_length(tn) 538 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children 539 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full" 540 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0. 541 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so 542 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here. 543 * we just count them one extra time here.
517 * 544 *
518 * A clearer way to write this would be: 545 * A clearer way to write this would be:
519 * 546 *
520 * to_be_doubled = tn->full_children; 547 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - 548 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 * tn->full_children; 549 * tn->full_children;
523 * 550 *
524 * new_child_length = tnode_child_length(tn) * 2; 551 * new_child_length = tnode_child_length(tn) * 2;
525 * 552 *
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / 553 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * new_child_length; 554 * new_child_length;
528 * if (new_fill_factor >= inflate_threshold) 555 * if (new_fill_factor >= inflate_threshold)
529 * 556 *
530 * ...and so on, tho it would mess up the while() loop. 557 * ...and so on, tho it would mess up the while () loop.
531 * 558 *
532 * anyway, 559 * anyway,
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= 560 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 * inflate_threshold 561 * inflate_threshold
535 * 562 *
536 * avoid a division: 563 * avoid a division:
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >= 564 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length 565 * inflate_threshold * new_child_length
539 * 566 *
540 * expand not_to_be_doubled and to_be_doubled, and shorten: 567 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children + 568 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children ) >= inflate_threshold * new_child_length 569 * tn->full_children ) >= inflate_threshold * new_child_length
543 * 570 *
544 * expand new_child_length: 571 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children + 572 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children ) >= 573 * tn->full_children ) >=
547 * inflate_threshold * tnode_child_length(tn) * 2 574 * inflate_threshold * tnode_child_length(tn) * 2
548 * 575 *
549 * shorten again: 576 * shorten again:
550 * 50 * (tn->full_children + tnode_child_length(tn) - 577 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children ) >= inflate_threshold * 578 * tn->empty_children ) >= inflate_threshold *
552 * tnode_child_length(tn) 579 * tnode_child_length(tn)
553 * 580 *
554 */ 581 */
555 582
556 check_tnode(tn); 583 check_tnode(tn);
557 584
585 err = 0;
558 while ((tn->full_children > 0 && 586 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 587 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) { 588 inflate_threshold * tnode_child_length(tn))) {
561 589
562 tn = inflate(t, tn); 590 tn = inflate(t, tn, &err);
591
592 if (err) {
593#ifdef CONFIG_IP_FIB_TRIE_STATS
594 t->stats.resize_node_skipped++;
595#endif
596 break;
597 }
563 } 598 }
564 599
565 check_tnode(tn); 600 check_tnode(tn);
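
The long comment above derives a division-free form of the inflate test. A standalone restatement of that arithmetic (hypothetical helper, same 50-vs-threshold scaling as in the comment) shows how the check behaves on a dense and on a sparse node:

#include <stdio.h>

/* Decide whether a node should be doubled, using the division-free
 * inequality from the comment:
 *   50 * (full + child_len - empty) >= inflate_threshold * child_len
 * i.e. whether the fill factor of the doubled node would reach the
 * threshold percentage. */
static int should_inflate(int child_len, int empty, int full, int threshold)
{
    return full > 0 &&
           50 * (full + child_len - empty) >= threshold * child_len;
}

int main(void)
{
    /* dense node: 16 slots, 4 empty, 8 "full" children, threshold 50% */
    printf("inflate? %d\n", should_inflate(16, 4, 8, 50));   /* 1 */
    /* sparse node: 16 slots, 12 empty, 1 full child */
    printf("inflate? %d\n", should_inflate(16, 12, 1, 50));  /* 0 */
    return 0;
}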
@@ -568,23 +603,34 @@ static struct node *resize(struct trie *t, struct tnode *tn)
568 * Halve as long as the number of empty children in this 603 * Halve as long as the number of empty children in this
569 * node is above threshold. 604 * node is above threshold.
570 */ 605 */
606
607 err = 0;
571 while (tn->bits > 1 && 608 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) < 609 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn)) 610 halve_threshold * tnode_child_length(tn)) {
611
612 tn = halve(t, tn, &err);
613
614 if (err) {
615#ifdef CONFIG_IP_FIB_TRIE_STATS
616 t->stats.resize_node_skipped++;
617#endif
618 break;
619 }
620 }
621
574 622
575 tn = halve(t, tn);
576
577 /* Only one child remains */ 623 /* Only one child remains */
578 624
579 if (tn->empty_children == tnode_child_length(tn) - 1) 625 if (tn->empty_children == tnode_child_length(tn) - 1)
580 for (i = 0; i < tnode_child_length(tn); i++) { 626 for (i = 0; i < tnode_child_length(tn); i++) {
581 627
582 write_lock_bh(&fib_lock); 628 write_lock_bh(&fib_lock);
583 if (tn->child[i] != NULL) { 629 if (tn->child[i] != NULL) {
584 /* compress one level */ 630 /* compress one level */
585 struct node *n = tn->child[i]; 631 struct node *n = tn->child[i];
586 632
587 if(n) 633 if (n)
588 NODE_INIT_PARENT(n, NODE_TYPE(n)); 634 NODE_INIT_PARENT(n, NODE_TYPE(n));
589 635
590 write_unlock_bh(&fib_lock); 636 write_unlock_bh(&fib_lock);
@@ -597,33 +643,88 @@ static struct node *resize(struct trie *t, struct tnode *tn)
597 return (struct node *) tn; 643 return (struct node *) tn;
598} 644}
599 645
600static struct tnode *inflate(struct trie *t, struct tnode *tn) 646static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
601{ 647{
602 struct tnode *inode; 648 struct tnode *inode;
603 struct tnode *oldtnode = tn; 649 struct tnode *oldtnode = tn;
604 int olen = tnode_child_length(tn); 650 int olen = tnode_child_length(tn);
605 int i; 651 int i;
606 652
607 if(trie_debug) 653 if (trie_debug)
608 printk("In inflate\n"); 654 printk("In inflate\n");
609 655
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 656 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611 657
612 if (!tn) 658 if (!tn) {
613 trie_bug("tnode_new failed"); 659 *err = -ENOMEM;
660 return oldtnode;
661 }
662
663 /*
664 * Preallocate and store tnodes before the actual work so we
665 * don't get into an inconsistent state if memory allocation
 666 * fails. In case of failure we return the old node and the
 667 * inflate of the tnode is ignored.
668 */
669
670 for(i = 0; i < olen; i++) {
671 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
672
673 if (inode &&
674 IS_TNODE(inode) &&
675 inode->pos == oldtnode->pos + oldtnode->bits &&
676 inode->bits > 1) {
677 struct tnode *left, *right;
678
679 t_key m = TKEY_GET_MASK(inode->pos, 1);
680
681 left = tnode_new(inode->key&(~m), inode->pos + 1,
682 inode->bits - 1);
683
684 if (!left) {
685 *err = -ENOMEM;
686 break;
687 }
688
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
692 if (!right) {
693 *err = -ENOMEM;
694 break;
695 }
696
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
699 }
700 }
701
702 if (*err) {
703 int size = tnode_child_length(tn);
704 int j;
705
706 for(j = 0; j < size; j++)
707 if (tn->child[j])
708 tnode_free((struct tnode *)tn->child[j]);
709
710 tnode_free(tn);
711
712 *err = -ENOMEM;
713 return oldtnode;
714 }
614 715
615 for(i = 0; i < olen; i++) { 716 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i); 717 struct node *node = tnode_get_child(oldtnode, i);
617 718
618 /* An empty child */ 719 /* An empty child */
619 if (node == NULL) 720 if (node == NULL)
620 continue; 721 continue;
621 722
622 /* A leaf or an internal node with skipped bits */ 723 /* A leaf or an internal node with skipped bits */
623 724
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 725 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) { 726 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, 727 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
627 1) == 0) 728 1) == 0)
628 put_child(t, tn, 2*i, node); 729 put_child(t, tn, 2*i, node);
629 else 730 else
@@ -646,44 +747,39 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
646 struct tnode *left, *right; 747 struct tnode *left, *right;
647 int size, j; 748 int size, j;
648 749
649 /* We will replace this node 'inode' with two new 750 /* We will replace this node 'inode' with two new
650 * ones, 'left' and 'right', each with half of the 751 * ones, 'left' and 'right', each with half of the
651 * original children. The two new nodes will have 752 * original children. The two new nodes will have
652 * a position one bit further down the key and this 753 * a position one bit further down the key and this
653 * means that the "significant" part of their keys 754 * means that the "significant" part of their keys
654 * (see the discussion near the top of this file) 755 * (see the discussion near the top of this file)
655 * will differ by one bit, which will be "0" in 756 * will differ by one bit, which will be "0" in
656 * left's key and "1" in right's key. Since we are 757 * left's key and "1" in right's key. Since we are
657 * moving the key position by one step, the bit that 758 * moving the key position by one step, the bit that
658 * we are moving away from - the bit at position 759 * we are moving away from - the bit at position
659 * (inode->pos) - is the one that will differ between 760 * (inode->pos) - is the one that will differ between
660 * left and right. So... we synthesize that bit in the 761 * left and right. So... we synthesize that bit in the
661 * two new keys. 762 * two new keys.
662 * The mask 'm' below will be a single "one" bit at 763 * The mask 'm' below will be a single "one" bit at
663 * the position (inode->pos) 764 * the position (inode->pos)
664 */ 765 */
665 766
666 t_key m = TKEY_GET_MASK(inode->pos, 1); 767 /* Use the old key, but set the new significant
667 768 * bit to zero.
668 /* Use the old key, but set the new significant
669 * bit to zero.
670 */ 769 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673 770
674 if(!left) 771 left = (struct tnode *) tnode_get_child(tn, 2*i);
675 trie_bug("tnode_new failed"); 772 put_child(t, tn, 2*i, NULL);
676 773
677 774 if (!left)
678 /* Use the old key, but set the new significant 775 BUG();
679 * bit to one. 776
680 */ 777 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
681 right = tnode_new(inode->key|m, inode->pos + 1, 778 put_child(t, tn, 2*i+1, NULL);
682 inode->bits - 1); 779
780 if (!right)
781 BUG();
683 782
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left); 783 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) { 784 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]); 785 put_child(t, left, j, inode->child[j]);
@@ -699,24 +795,64 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
699 return tn; 795 return tn;
700} 796}
701 797
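The reworked inflate() above is deliberately two-phase: every tnode_new() that can fail runs before any child of the old node is moved, so an -ENOMEM part-way through only frees the half-built replacement and returns oldtnode untouched. A self-contained, userspace-only toy of the same commit-or-rollback idea (all names below are hypothetical and every old slot is assumed populated; nothing here is taken from the patch):

#include <errno.h>
#include <stdlib.h>

/* Split every slot i of the old table into slots 2i and 2i+1. */
int grow_table(int ***tablep, int oldlen)
{
	int **old = *tablep;
	int **grown = calloc(2 * oldlen, sizeof(*grown));
	int i;

	if (!grown)
		return -ENOMEM;

	/* Phase 1: every allocation that can fail, before touching 'old'. */
	for (i = 0; i < 2 * oldlen; i++) {
		grown[i] = malloc(sizeof(int));
		if (!grown[i])
			goto fail;
	}

	/* Phase 2: only copies and frees, which cannot fail. */
	for (i = 0; i < oldlen; i++) {
		*grown[2 * i] = *old[i];	/* "left" half  */
		*grown[2 * i + 1] = *old[i];	/* "right" half */
		free(old[i]);
	}
	free(old);
	*tablep = grown;
	return 0;

fail:
	while (i--)
		free(grown[i]);
	free(grown);
	return -ENOMEM;		/* caller keeps using the old table */
}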
702static struct tnode *halve(struct trie *t, struct tnode *tn) 798static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
703{ 799{
704 struct tnode *oldtnode = tn; 800 struct tnode *oldtnode = tn;
705 struct node *left, *right; 801 struct node *left, *right;
706 int i; 802 int i;
707 int olen = tnode_child_length(tn); 803 int olen = tnode_child_length(tn);
708 804
709 if(trie_debug) printk("In halve\n"); 805 if (trie_debug) printk("In halve\n");
710 806
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 807 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808
809 if (!tn) {
810 *err = -ENOMEM;
811 return oldtnode;
812 }
813
814 /*
815 * Preallocate and store tnodes before the actual work so we
816 * don't get into an inconsistent state if memory allocation
 817 * fails. In case of failure we return the old node and the
 818 * halve of the tnode is ignored.
819 */
820
821 for(i = 0; i < olen; i += 2) {
822 left = tnode_get_child(oldtnode, i);
823 right = tnode_get_child(oldtnode, i+1);
824
825 /* Two nonempty children */
826 if (left && right) {
827 struct tnode *newBinNode =
828 tnode_new(left->key, tn->pos + tn->bits, 1);
829
830 if (!newBinNode) {
831 *err = -ENOMEM;
832 break;
833 }
834 put_child(t, tn, i/2, (struct node *)newBinNode);
835 }
836 }
837
838 if (*err) {
839 int size = tnode_child_length(tn);
840 int j;
841
842 for(j = 0; j < size; j++)
843 if (tn->child[j])
844 tnode_free((struct tnode *)tn->child[j]);
712 845
713 if(!tn) 846 tnode_free(tn);
714 trie_bug("tnode_new failed"); 847
848 *err = -ENOMEM;
849 return oldtnode;
850 }
715 851
716 for(i = 0; i < olen; i += 2) { 852 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i); 853 left = tnode_get_child(oldtnode, i);
718 right = tnode_get_child(oldtnode, i+1); 854 right = tnode_get_child(oldtnode, i+1);
719 855
720 /* At least one of the children is empty */ 856 /* At least one of the children is empty */
721 if (left == NULL) { 857 if (left == NULL) {
722 if (right == NULL) /* Both are empty */ 858 if (right == NULL) /* Both are empty */
@@ -724,14 +860,15 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
724 put_child(t, tn, i/2, right); 860 put_child(t, tn, i/2, right);
725 } else if (right == NULL) 861 } else if (right == NULL)
726 put_child(t, tn, i/2, left); 862 put_child(t, tn, i/2, left);
727 863
728 /* Two nonempty children */ 864 /* Two nonempty children */
729 else { 865 else {
730 struct tnode *newBinNode = 866 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1); 867 (struct tnode *) tnode_get_child(tn, i/2);
868 put_child(t, tn, i/2, NULL);
732 869
733 if(!newBinNode) 870 if (!newBinNode)
734 trie_bug("tnode_new failed"); 871 BUG();
735 872
736 put_child(t, newBinNode, 0, left); 873 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right); 874 put_child(t, newBinNode, 1, right);
@@ -744,7 +881,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
744 881
745static void *trie_init(struct trie *t) 882static void *trie_init(struct trie *t)
746{ 883{
747 if(t) { 884 if (t) {
748 t->size = 0; 885 t->size = 0;
749 t->trie = NULL; 886 t->trie = NULL;
750 t->revision = 0; 887 t->revision = 0;
@@ -761,8 +898,7 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
761 struct leaf_info *li; 898 struct leaf_info *li;
762 899
763 hlist_for_each_entry(li, node, head, hlist) { 900 hlist_for_each_entry(li, node, head, hlist) {
764 901 if (li->plen == plen)
765 if ( li->plen == plen )
766 return li; 902 return li;
767 } 903 }
768 return NULL; 904 return NULL;
@@ -770,35 +906,35 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
770 906
771static inline struct list_head * get_fa_head(struct leaf *l, int plen) 907static inline struct list_head * get_fa_head(struct leaf *l, int plen)
772{ 908{
773 struct list_head *fa_head=NULL; 909 struct list_head *fa_head = NULL;
774 struct leaf_info *li = find_leaf_info(&l->list, plen); 910 struct leaf_info *li = find_leaf_info(&l->list, plen);
775 911
776 if(li) 912 if (li)
777 fa_head = &li->falh; 913 fa_head = &li->falh;
778 914
779 return fa_head; 915 return fa_head;
780} 916}
781 917
782static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 918static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
783{ 919{
784 struct leaf_info *li=NULL, *last=NULL; 920 struct leaf_info *li = NULL, *last = NULL;
785 struct hlist_node *node, *tmp; 921 struct hlist_node *node, *tmp;
786 922
787 write_lock_bh(&fib_lock); 923 write_lock_bh(&fib_lock);
788 924
789 if(hlist_empty(head)) 925 if (hlist_empty(head))
790 hlist_add_head(&new->hlist, head); 926 hlist_add_head(&new->hlist, head);
791 else { 927 else {
792 hlist_for_each_entry_safe(li, node, tmp, head, hlist) { 928 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
793 929
794 if (new->plen > li->plen) 930 if (new->plen > li->plen)
795 break; 931 break;
796 932
797 last = li; 933 last = li;
798 } 934 }
799 if(last) 935 if (last)
800 hlist_add_after(&last->hlist, &new->hlist); 936 hlist_add_after(&last->hlist, &new->hlist);
801 else 937 else
802 hlist_add_before(&new->hlist, &li->hlist); 938 hlist_add_before(&new->hlist, &li->hlist);
803 } 939 }
804 write_unlock_bh(&fib_lock); 940 write_unlock_bh(&fib_lock);
@@ -812,14 +948,14 @@ fib_find_node(struct trie *t, u32 key)
812 struct node *n; 948 struct node *n;
813 949
814 pos = 0; 950 pos = 0;
815 n=t->trie; 951 n = t->trie;
816 952
817 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 953 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
818 tn = (struct tnode *) n; 954 tn = (struct tnode *) n;
819 955
820 check_tnode(tn); 956 check_tnode(tn);
821 957
822 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 958 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
823 pos=tn->pos + tn->bits; 959 pos=tn->pos + tn->bits;
824 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 960 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
825 } 961 }
@@ -842,23 +978,23 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
842 t_key cindex, key; 978 t_key cindex, key;
843 struct tnode *tp = NULL; 979 struct tnode *tp = NULL;
844 980
845 if(!tn) 981 if (!tn)
846 BUG(); 982 BUG();
847 983
848 key = tn->key; 984 key = tn->key;
849 i = 0; 985 i = 0;
850 986
851 while (tn != NULL && NODE_PARENT(tn) != NULL) { 987 while (tn != NULL && NODE_PARENT(tn) != NULL) {
852 988
853 if( i > 10 ) { 989 if (i > 10) {
854 printk("Rebalance tn=%p \n", tn); 990 printk("Rebalance tn=%p \n", tn);
855 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); 991 if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
856 992
857 printk("Rebalance tp=%p \n", tp); 993 printk("Rebalance tp=%p \n", tp);
858 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); 994 if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
859 } 995 }
860 996
861 if( i > 12 ) BUG(); 997 if (i > 12) BUG();
862 i++; 998 i++;
863 999
864 tp = NODE_PARENT(tn); 1000 tp = NODE_PARENT(tn);
@@ -866,63 +1002,63 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
866 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1002 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
867 tn = (struct tnode *) resize (t, (struct tnode *)tn); 1003 tn = (struct tnode *) resize (t, (struct tnode *)tn);
868 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); 1004 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
869 1005
870 if(!NODE_PARENT(tn)) 1006 if (!NODE_PARENT(tn))
871 break; 1007 break;
872 1008
873 tn = NODE_PARENT(tn); 1009 tn = NODE_PARENT(tn);
874 } 1010 }
875 /* Handle last (top) tnode */ 1011 /* Handle last (top) tnode */
876 if (IS_TNODE(tn)) 1012 if (IS_TNODE(tn))
877 tn = (struct tnode*) resize(t, (struct tnode *)tn); 1013 tn = (struct tnode*) resize(t, (struct tnode *)tn);
878 1014
879 return (struct node*) tn; 1015 return (struct node*) tn;
880} 1016}
881 1017
882static struct list_head * 1018static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen) 1019fib_insert_node(struct trie *t, int *err, u32 key, int plen)
884{ 1020{
885 int pos, newpos; 1021 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL; 1022 struct tnode *tp = NULL, *tn = NULL;
887 struct node *n; 1023 struct node *n;
888 struct leaf *l; 1024 struct leaf *l;
889 int missbit; 1025 int missbit;
890 struct list_head *fa_head=NULL; 1026 struct list_head *fa_head = NULL;
891 struct leaf_info *li; 1027 struct leaf_info *li;
892 t_key cindex; 1028 t_key cindex;
893 1029
894 pos = 0; 1030 pos = 0;
895 n=t->trie; 1031 n = t->trie;
896 1032
897 /* If we point to NULL, stop. Either the tree is empty and we should 1033 /* If we point to NULL, stop. Either the tree is empty and we should
 898 * just put a new leaf in it, or we have reached an empty child slot, 1034 * just put a new leaf in it, or we have reached an empty child slot,
899 * and we should just put our new leaf in that. 1035 * and we should just put our new leaf in that.
900 * If we point to a T_TNODE, check if it matches our key. Note that 1036 * If we point to a T_TNODE, check if it matches our key. Note that
901 * a T_TNODE might be skipping any number of bits - its 'pos' need 1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 * not be the parent's 'pos'+'bits'! 1038 * not be the parent's 'pos'+'bits'!
903 * 1039 *
904 * If it does match the current key, get pos/bits from it, extract 1040 * If it does match the current key, get pos/bits from it, extract
905 * the index from our key, push the T_TNODE and walk the tree. 1041 * the index from our key, push the T_TNODE and walk the tree.
906 * 1042 *
907 * If it doesn't, we have to replace it with a new T_TNODE. 1043 * If it doesn't, we have to replace it with a new T_TNODE.
908 * 1044 *
909 * If we point to a T_LEAF, it might or might not have the same key 1045 * If we point to a T_LEAF, it might or might not have the same key
910 * as we do. If it does, just change the value, update the T_LEAF's 1046 * as we do. If it does, just change the value, update the T_LEAF's
911 * value, and return it. 1047 * value, and return it.
912 * If it doesn't, we need to replace it with a T_TNODE. 1048 * If it doesn't, we need to replace it with a T_TNODE.
913 */ 1049 */
914 1050
915 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
916 tn = (struct tnode *) n; 1052 tn = (struct tnode *) n;
917
918 check_tnode(tn);
919 1053
920 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1054 check_tnode(tn);
1055
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
921 tp = tn; 1057 tp = tn;
922 pos=tn->pos + tn->bits; 1058 pos=tn->pos + tn->bits;
923 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 1059 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924 1060
925 if(n && NODE_PARENT(n) != tn) { 1061 if (n && NODE_PARENT(n) != tn) {
926 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1062 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
927 BUG(); 1063 BUG();
928 } 1064 }
@@ -934,23 +1070,24 @@ fib_insert_node(struct trie *t, u32 key, int plen)
934 /* 1070 /*
935 * n ----> NULL, LEAF or TNODE 1071 * n ----> NULL, LEAF or TNODE
936 * 1072 *
937 * tp is n's (parent) ----> NULL or TNODE 1073 * tp is n's (parent) ----> NULL or TNODE
938 */ 1074 */
939 1075
940 if(tp && IS_LEAF(tp)) 1076 if (tp && IS_LEAF(tp))
941 BUG(); 1077 BUG();
942 1078
943 t->revision++;
944 1079
945 /* Case 1: n is a leaf. Compare prefixes */ 1080 /* Case 1: n is a leaf. Compare prefixes */
946 1081
947 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
948 struct leaf *l = ( struct leaf *) n; 1083 struct leaf *l = ( struct leaf *) n;
949 1084
950 li = leaf_info_new(plen); 1085 li = leaf_info_new(plen);
951 1086
952 if(! li) 1087 if (!li) {
953 BUG(); 1088 *err = -ENOMEM;
1089 goto err;
1090 }
954 1091
955 fa_head = &li->falh; 1092 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li); 1093 insert_leaf_info(&l->list, li);
@@ -959,14 +1096,19 @@ fib_insert_node(struct trie *t, u32 key, int plen)
959 t->size++; 1096 t->size++;
960 l = leaf_new(); 1097 l = leaf_new();
961 1098
962 if(! l) 1099 if (!l) {
963 BUG(); 1100 *err = -ENOMEM;
1101 goto err;
1102 }
964 1103
965 l->key = key; 1104 l->key = key;
966 li = leaf_info_new(plen); 1105 li = leaf_info_new(plen);
967 1106
968 if(! li) 1107 if (!li) {
969 BUG(); 1108 tnode_free((struct tnode *) l);
1109 *err = -ENOMEM;
1110 goto err;
1111 }
970 1112
971 fa_head = &li->falh; 1113 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li); 1114 insert_leaf_info(&l->list, li);
@@ -975,8 +1117,8 @@ fib_insert_node(struct trie *t, u32 key, int plen)
975 if (t->trie && n == NULL) { 1117 if (t->trie && n == NULL) {
976 1118
977 NODE_SET_PARENT(l, tp); 1119 NODE_SET_PARENT(l, tp);
978 1120
979 if (!tp) 1121 if (!tp)
980 BUG(); 1122 BUG();
981 1123
982 else { 1124 else {
@@ -986,8 +1128,8 @@ fib_insert_node(struct trie *t, u32 key, int plen)
986 } 1128 }
987 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
988 else { 1130 else {
989 /* 1131 /*
990 * Add a new tnode here 1132 * Add a new tnode here
991 * first tnode need some special handling 1133 * first tnode need some special handling
992 */ 1134 */
993 1135
@@ -995,39 +1137,46 @@ fib_insert_node(struct trie *t, u32 key, int plen)
995 pos=tp->pos+tp->bits; 1137 pos=tp->pos+tp->bits;
996 else 1138 else
997 pos=0; 1139 pos=0;
998 if(n) { 1140 if (n) {
999 newpos = tkey_mismatch(key, pos, n->key); 1141 newpos = tkey_mismatch(key, pos, n->key);
1000 tn = tnode_new(n->key, newpos, 1); 1142 tn = tnode_new(n->key, newpos, 1);
1001 } 1143 }
1002 else { 1144 else {
1003 newpos = 0; 1145 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */ 1146 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 } 1147 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008 1148
1149 if (!tn) {
1150 free_leaf_info(li);
1151 tnode_free((struct tnode *) l);
1152 *err = -ENOMEM;
1153 goto err;
1154 }
1155
1009 NODE_SET_PARENT(tn, tp); 1156 NODE_SET_PARENT(tn, tp);
1010 1157
1011 missbit=tkey_extract_bits(key, newpos, 1); 1158 missbit=tkey_extract_bits(key, newpos, 1);
1012 put_child(t, tn, missbit, (struct node *)l); 1159 put_child(t, tn, missbit, (struct node *)l);
1013 put_child(t, tn, 1-missbit, n); 1160 put_child(t, tn, 1-missbit, n);
1014 1161
1015 if(tp) { 1162 if (tp) {
1016 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1163 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1164 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018 } 1165 }
1019 else { 1166 else {
1020 t->trie = (struct node*) tn; /* First tnode */ 1167 t->trie = (struct node*) tn; /* First tnode */
1021 tp = tn; 1168 tp = tn;
1022 } 1169 }
1023 } 1170 }
1024 if(tp && tp->pos+tp->bits > 32) { 1171 if (tp && tp->pos+tp->bits > 32) {
1025 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1172 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 tp, tp->pos, tp->bits, key, plen); 1173 tp, tp->pos, tp->bits, key, plen);
1027 } 1174 }
1028 /* Rebalance the trie */ 1175 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp); 1176 t->trie = trie_rebalance(t, tp);
1030done:; 1177done:
1178 t->revision++;
1179err:;
1031 return fa_head; 1180 return fa_head;
1032} 1181}
1033 1182
@@ -1037,7 +1186,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1037{ 1186{
1038 struct trie *t = (struct trie *) tb->tb_data; 1187 struct trie *t = (struct trie *) tb->tb_data;
1039 struct fib_alias *fa, *new_fa; 1188 struct fib_alias *fa, *new_fa;
1040 struct list_head *fa_head=NULL; 1189 struct list_head *fa_head = NULL;
1041 struct fib_info *fi; 1190 struct fib_info *fi;
1042 int plen = r->rtm_dst_len; 1191 int plen = r->rtm_dst_len;
1043 int type = r->rtm_type; 1192 int type = r->rtm_type;
@@ -1050,17 +1199,17 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1050 return -EINVAL; 1199 return -EINVAL;
1051 1200
1052 key = 0; 1201 key = 0;
1053 if (rta->rta_dst) 1202 if (rta->rta_dst)
1054 memcpy(&key, rta->rta_dst, 4); 1203 memcpy(&key, rta->rta_dst, 4);
1055 1204
1056 key = ntohl(key); 1205 key = ntohl(key);
1057 1206
1058 if(trie_debug) 1207 if (trie_debug)
1059 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); 1208 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060 1209
1061 mask = ntohl( inet_make_mask(plen) ); 1210 mask = ntohl( inet_make_mask(plen) );
1062 1211
1063 if(key & ~mask) 1212 if (key & ~mask)
1064 return -EINVAL; 1213 return -EINVAL;
1065 1214
1066 key = key & mask; 1215 key = key & mask;
@@ -1069,9 +1218,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1069 goto err; 1218 goto err;
1070 1219
1071 l = fib_find_node(t, key); 1220 l = fib_find_node(t, key);
1072 fa = NULL; 1221 fa = NULL;
1073 1222
1074 if(l) { 1223 if (l) {
1075 fa_head = get_fa_head(l, plen); 1224 fa_head = get_fa_head(l, plen);
1076 fa = fib_find_alias(fa_head, tos, fi->fib_priority); 1225 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077 } 1226 }
@@ -1150,14 +1299,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1150 new_fa->fa_scope = r->rtm_scope; 1299 new_fa->fa_scope = r->rtm_scope;
1151 new_fa->fa_state = 0; 1300 new_fa->fa_state = 0;
1152#if 0 1301#if 0
1153 new_fa->dst = NULL; 1302 new_fa->dst = NULL;
1154#endif 1303#endif
1155 /* 1304 /*
1156 * Insert new entry to the list. 1305 * Insert new entry to the list.
1157 */ 1306 */
1158 1307
1159 if(!fa_head) 1308 if (!fa_head) {
1160 fa_head = fib_insert_node(t, key, plen); 1309 err = 0;
 1310 fa_head = fib_insert_node(t, &err, key, plen);
 1311 if (err)
 1312 goto out_free_new_fa;
1313 }
1161 1314
1162 write_lock_bh(&fib_lock); 1315 write_lock_bh(&fib_lock);
1163 1316
@@ -1170,40 +1323,43 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1323 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded: 1324succeeded:
1172 return 0; 1325 return 0;
1326
1327out_free_new_fa:
1328 kmem_cache_free(fn_alias_kmem, new_fa);
1173out: 1329out:
1174 fib_release_info(fi); 1330 fib_release_info(fi);
1175err:; 1331err:;
1176 return err; 1332 return err;
1177} 1333}
1178 1334
1179static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, 1335static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1180 struct fib_result *res, int *err) 1336 struct fib_result *res)
1181{ 1337{
1182 int i; 1338 int err, i;
1183 t_key mask; 1339 t_key mask;
1184 struct leaf_info *li; 1340 struct leaf_info *li;
1185 struct hlist_head *hhead = &l->list; 1341 struct hlist_head *hhead = &l->list;
1186 struct hlist_node *node; 1342 struct hlist_node *node;
1187 1343
1188 hlist_for_each_entry(li, node, hhead, hlist) { 1344 hlist_for_each_entry(li, node, hhead, hlist) {
1189 1345
1190 i = li->plen; 1346 i = li->plen;
1191 mask = ntohl(inet_make_mask(i)); 1347 mask = ntohl(inet_make_mask(i));
1192 if (l->key != (key & mask)) 1348 if (l->key != (key & mask))
1193 continue; 1349 continue;
1194 1350
1195 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { 1351 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1196 *plen = i; 1352 *plen = i;
1197#ifdef CONFIG_IP_FIB_TRIE_STATS 1353#ifdef CONFIG_IP_FIB_TRIE_STATS
1198 t->stats.semantic_match_passed++; 1354 t->stats.semantic_match_passed++;
1199#endif 1355#endif
1200 return 1; 1356 return err;
1201 } 1357 }
1202#ifdef CONFIG_IP_FIB_TRIE_STATS 1358#ifdef CONFIG_IP_FIB_TRIE_STATS
1203 t->stats.semantic_match_miss++; 1359 t->stats.semantic_match_miss++;
1204#endif 1360#endif
1205 } 1361 }
1206 return 0; 1362 return 1;
1207} 1363}
1208 1364
1209static int 1365static int
@@ -1221,7 +1377,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1221 n = t->trie; 1377 n = t->trie;
1222 1378
1223 read_lock(&fib_lock); 1379 read_lock(&fib_lock);
1224 if(!n) 1380 if (!n)
1225 goto failed; 1381 goto failed;
1226 1382
1227#ifdef CONFIG_IP_FIB_TRIE_STATS 1383#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -1230,19 +1386,19 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1230 1386
1231 /* Just a leaf? */ 1387 /* Just a leaf? */
1232 if (IS_LEAF(n)) { 1388 if (IS_LEAF(n)) {
1233 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) ) 1389 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1234 goto found; 1390 goto found;
1235 goto failed; 1391 goto failed;
1236 } 1392 }
1237 pn = (struct tnode *) n; 1393 pn = (struct tnode *) n;
1238 chopped_off = 0; 1394 chopped_off = 0;
1239 1395
1240 while (pn) { 1396 while (pn) {
1241 1397
1242 pos = pn->pos; 1398 pos = pn->pos;
1243 bits = pn->bits; 1399 bits = pn->bits;
1244 1400
1245 if(!chopped_off) 1401 if (!chopped_off)
1246 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); 1402 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247 1403
1248 n = tnode_get_child(pn, cindex); 1404 n = tnode_get_child(pn, cindex);
@@ -1262,33 +1418,33 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1262 int mp; 1418 int mp;
1263 1419
1264 /* 1420 /*
1265 * It's a tnode, and we can do some extra checks here if we 1421 * It's a tnode, and we can do some extra checks here if we
1266 * like, to avoid descending into a dead-end branch. 1422 * like, to avoid descending into a dead-end branch.
1267 * This tnode is in the parent's child array at index 1423 * This tnode is in the parent's child array at index
1268 * key[p_pos..p_pos+p_bits] but potentially with some bits 1424 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 * chopped off, so in reality the index may be just a 1425 * chopped off, so in reality the index may be just a
1270 * subprefix, padded with zero at the end. 1426 * subprefix, padded with zero at the end.
1271 * We can also take a look at any skipped bits in this 1427 * We can also take a look at any skipped bits in this
1272 * tnode - everything up to p_pos is supposed to be ok, 1428 * tnode - everything up to p_pos is supposed to be ok,
1273 * and the non-chopped bits of the index (se previous 1429 * and the non-chopped bits of the index (se previous
1274 * paragraph) are also guaranteed ok, but the rest is 1430 * paragraph) are also guaranteed ok, but the rest is
1275 * considered unknown. 1431 * considered unknown.
1276 * 1432 *
1277 * The skipped bits are key[pos+bits..cn->pos]. 1433 * The skipped bits are key[pos+bits..cn->pos].
1278 */ 1434 */
1279 1435
1280 /* If current_prefix_length < pos+bits, we are already doing 1436 /* If current_prefix_length < pos+bits, we are already doing
1281 * actual prefix matching, which means everything from 1437 * actual prefix matching, which means everything from
1282 * pos+(bits-chopped_off) onward must be zero along some 1438 * pos+(bits-chopped_off) onward must be zero along some
1283 * branch of this subtree - otherwise there is *no* valid 1439 * branch of this subtree - otherwise there is *no* valid
1284 * prefix present. Here we can only check the skipped 1440 * prefix present. Here we can only check the skipped
1285 * bits. Remember, since we have already indexed into the 1441 * bits. Remember, since we have already indexed into the
1286 * parent's child array, we know that the bits we chopped off 1442 * parent's child array, we know that the bits we chopped off
1287 * *are* zero. 1443 * *are* zero.
1288 */ 1444 */
1289 1445
1290 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ 1446 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291 1447
1292 if (current_prefix_length < pos+bits) { 1448 if (current_prefix_length < pos+bits) {
1293 if (tkey_extract_bits(cn->key, current_prefix_length, 1449 if (tkey_extract_bits(cn->key, current_prefix_length,
1294 cn->pos - current_prefix_length) != 0 || 1450 cn->pos - current_prefix_length) != 0 ||
@@ -1297,13 +1453,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1297 } 1453 }
1298 1454
1299 /* 1455 /*
1300 * If chopped_off=0, the index is fully validated and we 1456 * If chopped_off=0, the index is fully validated and we
1301 * only need to look at the skipped bits for this, the new, 1457 * only need to look at the skipped bits for this, the new,
1302 * tnode. What we actually want to do is to find out if 1458 * tnode. What we actually want to do is to find out if
1303 * these skipped bits match our key perfectly, or if we will 1459 * these skipped bits match our key perfectly, or if we will
1304 * have to count on finding a matching prefix further down, 1460 * have to count on finding a matching prefix further down,
1305 * because if we do, we would like to have some way of 1461 * because if we do, we would like to have some way of
1306 * verifying the existence of such a prefix at this point. 1462 * verifying the existence of such a prefix at this point.
1307 */ 1463 */
1308 1464
1309 /* The only thing we can do at this point is to verify that 1465 /* The only thing we can do at this point is to verify that
@@ -1315,22 +1471,22 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1315 * new tnode's key. 1471 * new tnode's key.
1316 */ 1472 */
1317 1473
1318 /* Note: We aren't very concerned about the piece of the key 1474 /* Note: We aren't very concerned about the piece of the key
1319 * that precedes pn->pos+pn->bits, since these have already been 1475 * that precedes pn->pos+pn->bits, since these have already been
1320 * checked. The bits after cn->pos aren't checked since these are 1476 * checked. The bits after cn->pos aren't checked since these are
1321 * by definition "unknown" at this point. Thus, what we want to 1477 * by definition "unknown" at this point. Thus, what we want to
1322 * see is if we are about to enter the "prefix matching" state, 1478 * see is if we are about to enter the "prefix matching" state,
1323 * and in that case verify that the skipped bits that will prevail 1479 * and in that case verify that the skipped bits that will prevail
1324 * throughout this subtree are zero, as they have to be if we are 1480 * throughout this subtree are zero, as they have to be if we are
1325 * to find a matching prefix. 1481 * to find a matching prefix.
1326 */ 1482 */
1327 1483
1328 node_prefix = MASK_PFX(cn->key, cn->pos); 1484 node_prefix = MASK_PFX(cn->key, cn->pos);
1329 key_prefix = MASK_PFX(key, cn->pos); 1485 key_prefix = MASK_PFX(key, cn->pos);
1330 pref_mismatch = key_prefix^node_prefix; 1486 pref_mismatch = key_prefix^node_prefix;
1331 mp = 0; 1487 mp = 0;
1332 1488
1333 /* In short: If skipped bits in this node do not match the search 1489 /* In short: If skipped bits in this node do not match the search
1334 * key, enter the "prefix matching" state directly. 1490 * key, enter the "prefix matching" state directly.
1335 */ 1491 */
1336 if (pref_mismatch) { 1492 if (pref_mismatch) {
@@ -1339,7 +1495,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1339 pref_mismatch = pref_mismatch <<1; 1495 pref_mismatch = pref_mismatch <<1;
1340 } 1496 }
1341 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); 1497 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342 1498
1343 if (key_prefix != 0) 1499 if (key_prefix != 0)
1344 goto backtrace; 1500 goto backtrace;
1345 1501
@@ -1350,9 +1506,9 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1350 pn = (struct tnode *)n; /* Descend */ 1506 pn = (struct tnode *)n; /* Descend */
1351 chopped_off = 0; 1507 chopped_off = 0;
1352 continue; 1508 continue;
1353 } 1509 }
1354 if (IS_LEAF(n)) { 1510 if (IS_LEAF(n)) {
1355 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) 1511 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1356 goto found; 1512 goto found;
1357 } 1513 }
1358backtrace: 1514backtrace:
@@ -1366,18 +1522,18 @@ backtrace:
1366 /* Decrease current_... with bits chopped off */ 1522 /* Decrease current_... with bits chopped off */
1367 if (current_prefix_length > pn->pos + pn->bits - chopped_off) 1523 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368 current_prefix_length = pn->pos + pn->bits - chopped_off; 1524 current_prefix_length = pn->pos + pn->bits - chopped_off;
1369 1525
1370 /* 1526 /*
1371 * Either we do the actual chop off according or if we have 1527 * Either we do the actual chop off according or if we have
1372 * chopped off all bits in this tnode walk up to our parent. 1528 * chopped off all bits in this tnode walk up to our parent.
1373 */ 1529 */
1374 1530
1375 if(chopped_off <= pn->bits) 1531 if (chopped_off <= pn->bits)
1376 cindex &= ~(1 << (chopped_off-1)); 1532 cindex &= ~(1 << (chopped_off-1));
1377 else { 1533 else {
1378 if( NODE_PARENT(pn) == NULL) 1534 if (NODE_PARENT(pn) == NULL)
1379 goto failed; 1535 goto failed;
1380 1536
1381 /* Get Child's index */ 1537 /* Get Child's index */
1382 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); 1538 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383 pn = NODE_PARENT(pn); 1539 pn = NODE_PARENT(pn);
@@ -1387,10 +1543,10 @@ backtrace:
1387 t->stats.backtrack++; 1543 t->stats.backtrack++;
1388#endif 1544#endif
1389 goto backtrace; 1545 goto backtrace;
1390 } 1546 }
1391 } 1547 }
1392failed: 1548failed:
1393 ret = 1; 1549 ret = 1;
1394found: 1550found:
1395 read_unlock(&fib_lock); 1551 read_unlock(&fib_lock);
1396 return ret; 1552 return ret;
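The backtrace label above retries the same tnode with progressively shorter prefixes by clearing one low-order bit of cindex per pass, and only climbs to the parent once chopped_off exceeds pn->bits. A tiny standalone illustration of just that index-masking step, using made-up values rather than anything from the patch:

#include <stdio.h>

int main(void)
{
	unsigned int cindex = 0xb;	/* 1011: child index built from 4 key bits */
	int chopped_off;

	/* mirrors: cindex &= ~(1 << (chopped_off - 1)) while chopped_off <= bits */
	for (chopped_off = 1; chopped_off <= 4; chopped_off++) {
		cindex &= ~(1u << (chopped_off - 1));
		printf("chopped_off=%d cindex=0x%x\n", chopped_off, cindex);
	}
	/* prints 0xa, 0x8, 0x8, 0x0: ever shorter prefixes within this tnode */
	return 0;
}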
@@ -1403,11 +1559,11 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1403 struct node *n = t->trie; 1559 struct node *n = t->trie;
1404 struct leaf *l; 1560 struct leaf *l;
1405 1561
1406 if(trie_debug) 1562 if (trie_debug)
1407 printk("entering trie_leaf_remove(%p)\n", n); 1563 printk("entering trie_leaf_remove(%p)\n", n);
1408 1564
1409 /* Note that in the case skipped bits, those bits are *not* checked! 1565 /* Note that in the case skipped bits, those bits are *not* checked!
1410 * When we finish this, we will have NULL or a T_LEAF, and the 1566 * When we finish this, we will have NULL or a T_LEAF, and the
1411 * T_LEAF may or may not match our key. 1567 * T_LEAF may or may not match our key.
1412 */ 1568 */
1413 1569
@@ -1416,19 +1572,19 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1416 check_tnode(tn); 1572 check_tnode(tn);
1417 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); 1573 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418 1574
1419 if(n && NODE_PARENT(n) != tn) { 1575 if (n && NODE_PARENT(n) != tn) {
1420 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1576 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421 BUG(); 1577 BUG();
1422 } 1578 }
1423 } 1579 }
1424 l = (struct leaf *) n; 1580 l = (struct leaf *) n;
1425 1581
1426 if(!n || !tkey_equals(l->key, key)) 1582 if (!n || !tkey_equals(l->key, key))
1427 return 0; 1583 return 0;
1428 1584
1429 /* 1585 /*
1430 * Key found. 1586 * Key found.
1431 * Remove the leaf and rebalance the tree 1587 * Remove the leaf and rebalance the tree
1432 */ 1588 */
1433 1589
1434 t->revision++; 1590 t->revision++;
@@ -1437,7 +1593,7 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1437 tp = NODE_PARENT(n); 1593 tp = NODE_PARENT(n);
1438 tnode_free((struct tnode *) n); 1594 tnode_free((struct tnode *) n);
1439 1595
1440 if(tp) { 1596 if (tp) {
1441 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1597 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442 put_child(t, (struct tnode *)tp, cindex, NULL); 1598 put_child(t, (struct tnode *)tp, cindex, NULL);
1443 t->trie = trie_rebalance(t, tp); 1599 t->trie = trie_rebalance(t, tp);
@@ -1460,23 +1616,23 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1460 struct list_head *fa_head; 1616 struct list_head *fa_head;
1461 struct leaf *l; 1617 struct leaf *l;
1462 1618
1463 if (plen > 32) 1619 if (plen > 32)
1464 return -EINVAL; 1620 return -EINVAL;
1465 1621
1466 key = 0; 1622 key = 0;
1467 if (rta->rta_dst) 1623 if (rta->rta_dst)
1468 memcpy(&key, rta->rta_dst, 4); 1624 memcpy(&key, rta->rta_dst, 4);
1469 1625
1470 key = ntohl(key); 1626 key = ntohl(key);
1471 mask = ntohl( inet_make_mask(plen) ); 1627 mask = ntohl( inet_make_mask(plen) );
1472 1628
1473 if(key & ~mask) 1629 if (key & ~mask)
1474 return -EINVAL; 1630 return -EINVAL;
1475 1631
1476 key = key & mask; 1632 key = key & mask;
1477 l = fib_find_node(t, key); 1633 l = fib_find_node(t, key);
1478 1634
1479 if(!l) 1635 if (!l)
1480 return -ESRCH; 1636 return -ESRCH;
1481 1637
1482 fa_head = get_fa_head(l, plen); 1638 fa_head = get_fa_head(l, plen);
@@ -1522,16 +1678,16 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1522 1678
1523 list_del(&fa->fa_list); 1679 list_del(&fa->fa_list);
1524 1680
1525 if(list_empty(fa_head)) { 1681 if (list_empty(fa_head)) {
1526 hlist_del(&li->hlist); 1682 hlist_del(&li->hlist);
1527 kill_li = 1; 1683 kill_li = 1;
1528 } 1684 }
1529 write_unlock_bh(&fib_lock); 1685 write_unlock_bh(&fib_lock);
1530 1686
1531 if(kill_li) 1687 if (kill_li)
1532 free_leaf_info(li); 1688 free_leaf_info(li);
1533 1689
1534 if(hlist_empty(&l->list)) 1690 if (hlist_empty(&l->list))
1535 trie_leaf_remove(t, key); 1691 trie_leaf_remove(t, key);
1536 1692
1537 if (fa->fa_state & FA_S_ACCESSED) 1693 if (fa->fa_state & FA_S_ACCESSED)
@@ -1550,12 +1706,12 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
1550 1706
1551 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1707 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552 struct fib_info *fi = fa->fa_info; 1708 struct fib_info *fi = fa->fa_info;
1553 1709
1554 if (fi && (fi->fib_flags&RTNH_F_DEAD)) { 1710 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555 1711
1556 write_lock_bh(&fib_lock); 1712 write_lock_bh(&fib_lock);
1557 list_del(&fa->fa_list); 1713 list_del(&fa->fa_list);
1558 write_unlock_bh(&fib_lock); 1714 write_unlock_bh(&fib_lock);
1559 1715
1560 fn_free_alias(fa); 1716 fn_free_alias(fa);
1561 found++; 1717 found++;
@@ -1572,14 +1728,14 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
1572 struct leaf_info *li = NULL; 1728 struct leaf_info *li = NULL;
1573 1729
1574 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { 1730 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575 1731
1576 found += trie_flush_list(t, &li->falh); 1732 found += trie_flush_list(t, &li->falh);
1577 1733
1578 if (list_empty(&li->falh)) { 1734 if (list_empty(&li->falh)) {
1579 1735
1580 write_lock_bh(&fib_lock); 1736 write_lock_bh(&fib_lock);
1581 hlist_del(&li->hlist); 1737 hlist_del(&li->hlist);
1582 write_unlock_bh(&fib_lock); 1738 write_unlock_bh(&fib_lock);
1583 1739
1584 free_leaf_info(li); 1740 free_leaf_info(li);
1585 } 1741 }
@@ -1593,8 +1749,8 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1593 struct tnode *p; 1749 struct tnode *p;
1594 int idx; 1750 int idx;
1595 1751
1596 if(c == NULL) { 1752 if (c == NULL) {
1597 if(t->trie == NULL) 1753 if (t->trie == NULL)
1598 return NULL; 1754 return NULL;
1599 1755
1600 if (IS_LEAF(t->trie)) /* trie w. just a leaf */ 1756 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
@@ -1602,33 +1758,34 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1602 1758
1603 p = (struct tnode*) t->trie; /* Start */ 1759 p = (struct tnode*) t->trie; /* Start */
1604 } 1760 }
1605 else 1761 else
1606 p = (struct tnode *) NODE_PARENT(c); 1762 p = (struct tnode *) NODE_PARENT(c);
1763
1607 while (p) { 1764 while (p) {
1608 int pos, last; 1765 int pos, last;
1609 1766
1610 /* Find the next child of the parent */ 1767 /* Find the next child of the parent */
1611 if(c) 1768 if (c)
1612 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); 1769 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613 else 1770 else
1614 pos = 0; 1771 pos = 0;
1615 1772
1616 last = 1 << p->bits; 1773 last = 1 << p->bits;
1617 for(idx = pos; idx < last ; idx++) { 1774 for(idx = pos; idx < last ; idx++) {
1618 if( p->child[idx]) { 1775 if (p->child[idx]) {
1619 1776
1620 /* Descend if tnode */ 1777 /* Descend if tnode */
1621 1778
1622 while (IS_TNODE(p->child[idx])) { 1779 while (IS_TNODE(p->child[idx])) {
1623 p = (struct tnode*) p->child[idx]; 1780 p = (struct tnode*) p->child[idx];
1624 idx = 0; 1781 idx = 0;
1625 1782
1626 /* Rightmost non-NULL branch */ 1783 /* Rightmost non-NULL branch */
1627 if( p && IS_TNODE(p) ) 1784 if (p && IS_TNODE(p))
1628 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++; 1785 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1629 1786
1630 /* Done with this tnode? */ 1787 /* Done with this tnode? */
1631 if( idx >= (1 << p->bits) || p->child[idx] == NULL ) 1788 if (idx >= (1 << p->bits) || p->child[idx] == NULL )
1632 goto up; 1789 goto up;
1633 } 1790 }
1634 return (struct leaf*) p->child[idx]; 1791 return (struct leaf*) p->child[idx];
@@ -1661,7 +1818,7 @@ static int fn_trie_flush(struct fib_table *tb)
1661 if (ll && hlist_empty(&ll->list)) 1818 if (ll && hlist_empty(&ll->list))
1662 trie_leaf_remove(t, ll->key); 1819 trie_leaf_remove(t, ll->key);
1663 1820
1664 if(trie_debug) 1821 if (trie_debug)
1665 printk("trie_flush found=%d\n", found); 1822 printk("trie_flush found=%d\n", found);
1666 return found; 1823 return found;
1667} 1824}
@@ -1684,32 +1841,32 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1684 order = -1; 1841 order = -1;
1685 1842
1686 read_lock(&fib_lock); 1843 read_lock(&fib_lock);
1687 1844
1688 l = fib_find_node(t, 0); 1845 l = fib_find_node(t, 0);
1689 if(!l) 1846 if (!l)
1690 goto out; 1847 goto out;
1691 1848
1692 fa_head = get_fa_head(l, 0); 1849 fa_head = get_fa_head(l, 0);
1693 if(!fa_head) 1850 if (!fa_head)
1694 goto out; 1851 goto out;
1695 1852
1696 if (list_empty(fa_head)) 1853 if (list_empty(fa_head))
1697 goto out; 1854 goto out;
1698 1855
1699 list_for_each_entry(fa, fa_head, fa_list) { 1856 list_for_each_entry(fa, fa_head, fa_list) {
1700 struct fib_info *next_fi = fa->fa_info; 1857 struct fib_info *next_fi = fa->fa_info;
1701 1858
1702 if (fa->fa_scope != res->scope || 1859 if (fa->fa_scope != res->scope ||
1703 fa->fa_type != RTN_UNICAST) 1860 fa->fa_type != RTN_UNICAST)
1704 continue; 1861 continue;
1705 1862
1706 if (next_fi->fib_priority > res->fi->fib_priority) 1863 if (next_fi->fib_priority > res->fi->fib_priority)
1707 break; 1864 break;
1708 if (!next_fi->fib_nh[0].nh_gw || 1865 if (!next_fi->fib_nh[0].nh_gw ||
1709 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1866 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1710 continue; 1867 continue;
1711 fa->fa_state |= FA_S_ACCESSED; 1868 fa->fa_state |= FA_S_ACCESSED;
1712 1869
1713 if (fi == NULL) { 1870 if (fi == NULL) {
1714 if (next_fi != res->fi) 1871 if (next_fi != res->fi)
1715 break; 1872 break;
@@ -1747,10 +1904,10 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1747 } 1904 }
1748 trie_last_dflt = last_idx; 1905 trie_last_dflt = last_idx;
1749 out:; 1906 out:;
1750 read_unlock(&fib_lock); 1907 read_unlock(&fib_lock);
1751} 1908}
1752 1909
1753static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1910static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754 struct sk_buff *skb, struct netlink_callback *cb) 1911 struct sk_buff *skb, struct netlink_callback *cb)
1755{ 1912{
1756 int i, s_i; 1913 int i, s_i;
@@ -1796,7 +1953,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
1796 return skb->len; 1953 return skb->len;
1797} 1954}
1798 1955
1799static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, 1956static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800 struct netlink_callback *cb) 1957 struct netlink_callback *cb)
1801{ 1958{
1802 int h, s_h; 1959 int h, s_h;
@@ -1813,11 +1970,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
1813 sizeof(cb->args) - 3*sizeof(cb->args[0])); 1970 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814 1971
1815 fa_head = get_fa_head(l, plen); 1972 fa_head = get_fa_head(l, plen);
1816 1973
1817 if(!fa_head) 1974 if (!fa_head)
1818 continue; 1975 continue;
1819 1976
1820 if(list_empty(fa_head)) 1977 if (list_empty(fa_head))
1821 continue; 1978 continue;
1822 1979
1823 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { 1980 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
@@ -1893,10 +2050,10 @@ struct fib_table * __init fib_hash_init(int id)
1893 2050
1894 trie_init(t); 2051 trie_init(t);
1895 2052
1896 if (id == RT_TABLE_LOCAL) 2053 if (id == RT_TABLE_LOCAL)
1897 trie_local=t; 2054 trie_local = t;
1898 else if (id == RT_TABLE_MAIN) 2055 else if (id == RT_TABLE_MAIN)
1899 trie_main=t; 2056 trie_main = t;
1900 2057
1901 if (id == RT_TABLE_LOCAL) 2058 if (id == RT_TABLE_LOCAL)
1902 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); 2059 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
@@ -1917,7 +2074,7 @@ static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1917 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0"); 2074 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1918} 2075}
1919 2076
1920static void printnode_seq(struct seq_file *seq, int indent, struct node *n, 2077static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921 int pend, int cindex, int bits) 2078 int pend, int cindex, int bits)
1922{ 2079{
1923 putspace_seq(seq, indent); 2080 putspace_seq(seq, indent);
@@ -1935,12 +2092,12 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1935 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); 2092 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936 2093
1937 if (IS_LEAF(n)) 2094 if (IS_LEAF(n))
1938 seq_printf(seq, "key=%d.%d.%d.%d\n", 2095 seq_printf(seq, "key=%d.%d.%d.%d\n",
1939 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); 2096 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940 else { 2097 else {
1941 int plen=((struct tnode *)n)->pos; 2098 int plen = ((struct tnode *)n)->pos;
1942 t_key prf=MASK_PFX(n->key, plen); 2099 t_key prf=MASK_PFX(n->key, plen);
1943 seq_printf(seq, "key=%d.%d.%d.%d/%d\n", 2100 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); 2101 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945 } 2102 }
1946 if (IS_LEAF(n)) { 2103 if (IS_LEAF(n)) {
@@ -1948,14 +2105,14 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1948 struct fib_alias *fa; 2105 struct fib_alias *fa;
1949 int i; 2106 int i;
1950 for (i=32; i>=0; i--) 2107 for (i=32; i>=0; i--)
1951 if(find_leaf_info(&l->list, i)) { 2108 if (find_leaf_info(&l->list, i)) {
1952 2109
1953 struct list_head *fa_head = get_fa_head(l, i); 2110 struct list_head *fa_head = get_fa_head(l, i);
1954 2111
1955 if(!fa_head) 2112 if (!fa_head)
1956 continue; 2113 continue;
1957 2114
1958 if(list_empty(fa_head)) 2115 if (list_empty(fa_head))
1959 continue; 2116 continue;
1960 2117
1961 putspace_seq(seq, indent+2); 2118 putspace_seq(seq, indent+2);
@@ -1981,7 +2138,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1981 } 2138 }
1982 } 2139 }
1983 else if (IS_TNODE(n)) { 2140 else if (IS_TNODE(n)) {
1984 struct tnode *tn=(struct tnode *)n; 2141 struct tnode *tn = (struct tnode *)n;
1985 putspace_seq(seq, indent); seq_printf(seq, "| "); 2142 putspace_seq(seq, indent); seq_printf(seq, "| ");
1986 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); 2143 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); 2144 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
@@ -1997,7 +2154,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1997 2154
1998static void trie_dump_seq(struct seq_file *seq, struct trie *t) 2155static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999{ 2156{
2000 struct node *n=t->trie; 2157 struct node *n = t->trie;
2001 int cindex=0; 2158 int cindex=0;
2002 int indent=1; 2159 int indent=1;
2003 int pend=0; 2160 int pend=0;
@@ -2009,7 +2166,7 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2009 if (n) { 2166 if (n) {
2010 printnode_seq(seq, indent, n, pend, cindex, 0); 2167 printnode_seq(seq, indent, n, pend, cindex, 0);
2011 if (IS_TNODE(n)) { 2168 if (IS_TNODE(n)) {
2012 struct tnode *tn=(struct tnode *)n; 2169 struct tnode *tn = (struct tnode *)n;
2013 pend = tn->pos+tn->bits; 2170 pend = tn->pos+tn->bits;
2014 putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); 2171 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015 indent += 3; 2172 indent += 3;
@@ -2017,42 +2174,42 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2017 2174
2018 while (tn && cindex < (1 << tn->bits)) { 2175 while (tn && cindex < (1 << tn->bits)) {
2019 if (tn->child[cindex]) { 2176 if (tn->child[cindex]) {
2020 2177
2021 /* Got a child */ 2178 /* Got a child */
2022 2179
2023 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); 2180 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024 if (IS_LEAF(tn->child[cindex])) { 2181 if (IS_LEAF(tn->child[cindex])) {
2025 cindex++; 2182 cindex++;
2026 2183
2027 } 2184 }
2028 else { 2185 else {
2029 /* 2186 /*
2030 * New tnode. Descend one level 2187 * New tnode. Descend one level
2031 */ 2188 */
2032 2189
2033 depth++; 2190 depth++;
2034 n=tn->child[cindex]; 2191 n = tn->child[cindex];
2035 tn=(struct tnode *)n; 2192 tn = (struct tnode *)n;
2036 pend=tn->pos+tn->bits; 2193 pend = tn->pos+tn->bits;
2037 putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); 2194 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038 indent+=3; 2195 indent+=3;
2039 cindex=0; 2196 cindex=0;
2040 } 2197 }
2041 } 2198 }
2042 else 2199 else
2043 cindex++; 2200 cindex++;
2044 2201
2045 /* 2202 /*
2046 * Test if we are done 2203 * Test if we are done
2047 */ 2204 */
2048 2205
2049 while (cindex >= (1 << tn->bits)) { 2206 while (cindex >= (1 << tn->bits)) {
2050 2207
2051 /* 2208 /*
2052 * Move upwards and test for root 2209 * Move upwards and test for root
2053 * pop off all traversed nodes 2210 * pop off all traversed nodes
2054 */ 2211 */
2055 2212
2056 if (NODE_PARENT(tn) == NULL) { 2213 if (NODE_PARENT(tn) == NULL) {
2057 tn = NULL; 2214 tn = NULL;
2058 n = NULL; 2215 n = NULL;
@@ -2062,8 +2219,8 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2062 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); 2219 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063 tn = NODE_PARENT(tn); 2220 tn = NODE_PARENT(tn);
2064 cindex++; 2221 cindex++;
2065 n=(struct node *)tn; 2222 n = (struct node *)tn;
2066 pend=tn->pos+tn->bits; 2223 pend = tn->pos+tn->bits;
2067 indent-=3; 2224 indent-=3;
2068 depth--; 2225 depth--;
2069 } 2226 }
@@ -2081,36 +2238,36 @@ static struct trie_stat *trie_stat_new(void)
2081{ 2238{
2082 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); 2239 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083 int i; 2240 int i;
2084 2241
2085 if(s) { 2242 if (s) {
2086 s->totdepth = 0; 2243 s->totdepth = 0;
2087 s->maxdepth = 0; 2244 s->maxdepth = 0;
2088 s->tnodes = 0; 2245 s->tnodes = 0;
2089 s->leaves = 0; 2246 s->leaves = 0;
2090 s->nullpointers = 0; 2247 s->nullpointers = 0;
2091 2248
2092 for(i=0; i< MAX_CHILDS; i++) 2249 for(i=0; i< MAX_CHILDS; i++)
2093 s->nodesizes[i] = 0; 2250 s->nodesizes[i] = 0;
2094 } 2251 }
2095 return s; 2252 return s;
2096} 2253}
2097 2254
2098static struct trie_stat *trie_collect_stats(struct trie *t) 2255static struct trie_stat *trie_collect_stats(struct trie *t)
2099{ 2256{
2100 struct node *n=t->trie; 2257 struct node *n = t->trie;
2101 struct trie_stat *s = trie_stat_new(); 2258 struct trie_stat *s = trie_stat_new();
2102 int cindex = 0; 2259 int cindex = 0;
2103 int indent = 1; 2260 int indent = 1;
2104 int pend = 0; 2261 int pend = 0;
2105 int depth = 0; 2262 int depth = 0;
2106 2263
2107 read_lock(&fib_lock); 2264 read_lock(&fib_lock);
2108 2265
2109 if (s) { 2266 if (s) {
2110 if (n) { 2267 if (n) {
2111 if (IS_TNODE(n)) { 2268 if (IS_TNODE(n)) {
2112 struct tnode *tn = (struct tnode *)n; 2269 struct tnode *tn = (struct tnode *)n;
2113 pend=tn->pos+tn->bits; 2270 pend = tn->pos+tn->bits;
2114 indent += 3; 2271 indent += 3;
2115 s->nodesizes[tn->bits]++; 2272 s->nodesizes[tn->bits]++;
2116 depth++; 2273 depth++;
@@ -2118,26 +2275,26 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2118 while (tn && cindex < (1 << tn->bits)) { 2275 while (tn && cindex < (1 << tn->bits)) {
2119 if (tn->child[cindex]) { 2276 if (tn->child[cindex]) {
2120 /* Got a child */ 2277 /* Got a child */
2121 2278
2122 if (IS_LEAF(tn->child[cindex])) { 2279 if (IS_LEAF(tn->child[cindex])) {
2123 cindex++; 2280 cindex++;
2124 2281
2125 /* stats */ 2282 /* stats */
2126 if (depth > s->maxdepth) 2283 if (depth > s->maxdepth)
2127 s->maxdepth = depth; 2284 s->maxdepth = depth;
2128 s->totdepth += depth; 2285 s->totdepth += depth;
2129 s->leaves++; 2286 s->leaves++;
2130 } 2287 }
2131 2288
2132 else { 2289 else {
2133 /* 2290 /*
2134 * New tnode. Descend one level 2291 * New tnode. Descend one level
2135 */ 2292 */
2136 2293
2137 s->tnodes++; 2294 s->tnodes++;
2138 s->nodesizes[tn->bits]++; 2295 s->nodesizes[tn->bits]++;
2139 depth++; 2296 depth++;
2140 2297
2141 n = tn->child[cindex]; 2298 n = tn->child[cindex];
2142 tn = (struct tnode *)n; 2299 tn = (struct tnode *)n;
2143 pend = tn->pos+tn->bits; 2300 pend = tn->pos+tn->bits;
@@ -2148,13 +2305,13 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2148 } 2305 }
2149 else { 2306 else {
2150 cindex++; 2307 cindex++;
2151 s->nullpointers++; 2308 s->nullpointers++;
2152 } 2309 }
2153 2310
2154 /* 2311 /*
2155 * Test if we are done 2312 * Test if we are done
2156 */ 2313 */
2157 2314
2158 while (cindex >= (1 << tn->bits)) { 2315 while (cindex >= (1 << tn->bits)) {
2159 2316
2160 /* 2317 /*
@@ -2162,7 +2319,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2162 * pop off all traversed nodes 2319 * pop off all traversed nodes
2163 */ 2320 */
2164 2321
2165 2322
2166 if (NODE_PARENT(tn) == NULL) { 2323 if (NODE_PARENT(tn) == NULL) {
2167 tn = NULL; 2324 tn = NULL;
2168 n = NULL; 2325 n = NULL;
@@ -2171,9 +2328,9 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2171 else { 2328 else {
2172 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); 2329 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173 tn = NODE_PARENT(tn); 2330 tn = NODE_PARENT(tn);
2174 cindex++; 2331 cindex++;
2175 n = (struct node *)tn; 2332 n = (struct node *)tn;
2176 pend=tn->pos+tn->bits; 2333 pend = tn->pos+tn->bits;
2177 indent -= 3; 2334 indent -= 3;
2178 depth--; 2335 depth--;
2179 } 2336 }
@@ -2184,7 +2341,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2184 } 2341 }
2185 } 2342 }
2186 2343
2187 read_unlock(&fib_lock); 2344 read_unlock(&fib_lock);
2188 return s; 2345 return s;
2189} 2346}
2190 2347
@@ -2220,7 +2377,7 @@ static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2220 2377
2221} 2378}
2222 2379
2223/* 2380/*
2224 * This outputs /proc/net/fib_triestats 2381 * This outputs /proc/net/fib_triestats
2225 * 2382 *
2226 * It always works in backward compatibility mode. 2383 * It always works in backward compatibility mode.
@@ -2246,7 +2403,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2246 avdepth=0; 2403 avdepth=0;
2247 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); 2404 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); 2405 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249 2406
2250 seq_printf(seq, "Leaves: %d\n", stat->leaves); 2407 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251 bytes += sizeof(struct leaf) * stat->leaves; 2408 bytes += sizeof(struct leaf) * stat->leaves;
2252 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); 2409 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
@@ -2258,7 +2415,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2258 max--; 2415 max--;
2259 pointers = 0; 2416 pointers = 0;
2260 2417
2261 for (i = 1; i <= max; i++) 2418 for (i = 1; i <= max; i++)
2262 if (stat->nodesizes[i] != 0) { 2419 if (stat->nodesizes[i] != 0) {
2263 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); 2420 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2264 pointers += (1<<i) * stat->nodesizes[i]; 2421 pointers += (1<<i) * stat->nodesizes[i];
@@ -2279,6 +2436,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2436 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2437 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2438 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2439 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2282#ifdef CLEAR_STATS 2440#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats)); 2441 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif 2442#endif
@@ -2288,30 +2446,30 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2288static int fib_triestat_seq_show(struct seq_file *seq, void *v) 2446static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289{ 2447{
2290 char bf[128]; 2448 char bf[128];
2291 2449
2292 if (v == SEQ_START_TOKEN) { 2450 if (v == SEQ_START_TOKEN) {
2293 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", 2451 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 sizeof(struct leaf), sizeof(struct tnode)); 2452 sizeof(struct leaf), sizeof(struct tnode));
2295 if (trie_local) 2453 if (trie_local)
2296 collect_and_show(trie_local, seq); 2454 collect_and_show(trie_local, seq);
2297 2455
2298 if (trie_main) 2456 if (trie_main)
2299 collect_and_show(trie_main, seq); 2457 collect_and_show(trie_main, seq);
2300 } 2458 }
2301 else { 2459 else {
2302 snprintf(bf, sizeof(bf), 2460 snprintf(bf, sizeof(bf),
2303 "*\t%08X\t%08X", 200, 400); 2461 "*\t%08X\t%08X", 200, 400);
2304 2462
2305 seq_printf(seq, "%-127s\n", bf); 2463 seq_printf(seq, "%-127s\n", bf);
2306 } 2464 }
2307 return 0; 2465 return 0;
2308} 2466}
2309 2467
2310static struct seq_operations fib_triestat_seq_ops = { 2468static struct seq_operations fib_triestat_seq_ops = {
2311 .start = fib_triestat_seq_start, 2469 .start = fib_triestat_seq_start,
2312 .next = fib_triestat_seq_next, 2470 .next = fib_triestat_seq_next,
2313 .stop = fib_triestat_seq_stop, 2471 .stop = fib_triestat_seq_stop,
2314 .show = fib_triestat_seq_show, 2472 .show = fib_triestat_seq_show,
2315}; 2473};
2316 2474
2317static int fib_triestat_seq_open(struct inode *inode, struct file *file) 2475static int fib_triestat_seq_open(struct inode *inode, struct file *file)
@@ -2323,7 +2481,7 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2323 if (rc) 2481 if (rc)
2324 goto out_kfree; 2482 goto out_kfree;
2325 2483
2326 seq = file->private_data; 2484 seq = file->private_data;
2327out: 2485out:
2328 return rc; 2486 return rc;
2329out_kfree: 2487out_kfree:
@@ -2331,11 +2489,11 @@ out_kfree:
2331} 2489}
2332 2490
2333static struct file_operations fib_triestat_seq_fops = { 2491static struct file_operations fib_triestat_seq_fops = {
2334 .owner = THIS_MODULE, 2492 .owner = THIS_MODULE,
2335 .open = fib_triestat_seq_open, 2493 .open = fib_triestat_seq_open,
2336 .read = seq_read, 2494 .read = seq_read,
2337 .llseek = seq_lseek, 2495 .llseek = seq_lseek,
2338 .release = seq_release_private, 2496 .release = seq_release_private,
2339}; 2497};
2340 2498
2341int __init fib_stat_proc_init(void) 2499int __init fib_stat_proc_init(void)
@@ -2380,7 +2538,7 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2380 2538
2381} 2539}
2382 2540
2383/* 2541/*
2384 * This outputs /proc/net/fib_trie. 2542 * This outputs /proc/net/fib_trie.
2385 * 2543 *
2386 * It always works in backward compatibility mode. 2544 * It always works in backward compatibility mode.
@@ -2392,10 +2550,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2392 char bf[128]; 2550 char bf[128];
2393 2551
2394 if (v == SEQ_START_TOKEN) { 2552 if (v == SEQ_START_TOKEN) {
2395 if (trie_local) 2553 if (trie_local)
2396 trie_dump_seq(seq, trie_local); 2554 trie_dump_seq(seq, trie_local);
2397 2555
2398 if (trie_main) 2556 if (trie_main)
2399 trie_dump_seq(seq, trie_main); 2557 trie_dump_seq(seq, trie_main);
2400 } 2558 }
2401 2559
@@ -2409,10 +2567,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2409} 2567}
2410 2568
2411static struct seq_operations fib_trie_seq_ops = { 2569static struct seq_operations fib_trie_seq_ops = {
2412 .start = fib_trie_seq_start, 2570 .start = fib_trie_seq_start,
2413 .next = fib_trie_seq_next, 2571 .next = fib_trie_seq_next,
2414 .stop = fib_trie_seq_stop, 2572 .stop = fib_trie_seq_stop,
2415 .show = fib_trie_seq_show, 2573 .show = fib_trie_seq_show,
2416}; 2574};
2417 2575
2418static int fib_trie_seq_open(struct inode *inode, struct file *file) 2576static int fib_trie_seq_open(struct inode *inode, struct file *file)
@@ -2424,7 +2582,7 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file)
2424 if (rc) 2582 if (rc)
2425 goto out_kfree; 2583 goto out_kfree;
2426 2584
2427 seq = file->private_data; 2585 seq = file->private_data;
2428out: 2586out:
2429 return rc; 2587 return rc;
2430out_kfree: 2588out_kfree:
@@ -2432,11 +2590,11 @@ out_kfree:
2432} 2590}
2433 2591
2434static struct file_operations fib_trie_seq_fops = { 2592static struct file_operations fib_trie_seq_fops = {
2435 .owner = THIS_MODULE, 2593 .owner = THIS_MODULE,
2436 .open = fib_trie_seq_open, 2594 .open = fib_trie_seq_open,
2437 .read = seq_read, 2595 .read = seq_read,
2438 .llseek = seq_lseek, 2596 .llseek = seq_lseek,
2439	.release	= seq_release_private,			2597	.release	= seq_release_private,
2440}; 2598};
2441 2599
2442int __init fib_proc_init(void) 2600int __init fib_proc_init(void)
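Note on the fib_triestat output above: collect_and_show() prints the average trie depth as an integer scaled by 100 and split with %d.%02d. A minimal userspace sketch of that fixed-point trick; deriving avdepth from totdepth and leaves is an assumption here, and the names are illustrative only:

#include <stdio.h>

/* Print an average with two decimals using only integer math,
 * mirroring the avdepth / 100 and avdepth % 100 split above. */
static void print_average_depth(unsigned int totdepth, unsigned int leaves)
{
        unsigned int avdepth = leaves ? (totdepth * 100) / leaves : 0;

        printf("Aver depth: %u.%02u\n", avdepth / 100, avdepth % 100);
}

int main(void)
{
        print_average_depth(27, 10);    /* prints "Aver depth: 2.70" */
        return 0;
}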
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cb75948497..badfc58499 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -349,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
349{ 349{
350 struct sk_buff *skb; 350 struct sk_buff *skb;
351 351
352 ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, 352 if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
353 icmp_param->data_len+icmp_param->head_len, 353 icmp_param->data_len+icmp_param->head_len,
354 icmp_param->head_len, 354 icmp_param->head_len,
355 ipc, rt, MSG_DONTWAIT); 355 ipc, rt, MSG_DONTWAIT) < 0)
356 356 ip_flush_pending_frames(icmp_socket->sk);
357 if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { 357 else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
358 struct icmphdr *icmph = skb->h.icmph; 358 struct icmphdr *icmph = skb->h.icmph;
359 unsigned int csum = 0; 359 unsigned int csum = 0;
360 struct sk_buff *skb1; 360 struct sk_buff *skb1;
@@ -936,8 +936,7 @@ int icmp_rcv(struct sk_buff *skb)
936 case CHECKSUM_HW: 936 case CHECKSUM_HW:
937 if (!(u16)csum_fold(skb->csum)) 937 if (!(u16)csum_fold(skb->csum))
938 break; 938 break;
939 NETDEBUG(if (net_ratelimit()) 939 LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
940 printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
941 case CHECKSUM_NONE: 940 case CHECKSUM_NONE:
942 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) 941 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
943 goto error; 942 goto error;
@@ -970,7 +969,8 @@ int icmp_rcv(struct sk_buff *skb)
970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently 969 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
971 * discarded if to broadcast/multicast. 970 * discarded if to broadcast/multicast.
972 */ 971 */
973 if (icmph->type == ICMP_ECHO && 972 if ((icmph->type == ICMP_ECHO ||
973 icmph->type == ICMP_TIMESTAMP) &&
974 sysctl_icmp_echo_ignore_broadcasts) { 974 sysctl_icmp_echo_ignore_broadcasts) {
975 goto error; 975 goto error;
976 } 976 }
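The icmp_push_reply() hunk above now checks the return value of ip_append_data() and flushes the partially queued frames on failure instead of peeking the write queue unconditionally. A rough userspace model of that queue-or-flush pattern; the helper names are made up and stand in for the socket queue, they are not the kernel API:

#include <stdio.h>

/* Illustrative stand-ins for "queue reply data" and "discard partial state". */
static int append_data(int fail) { return fail ? -1 : 0; }
static void flush_pending(void) { puts("flushed partial frames"); }
static void finish_checksum_and_send(void) { puts("checksummed and sent"); }

static void push_reply(int fail)
{
        if (append_data(fail) < 0)
                flush_pending();            /* error: drop what was queued */
        else
                finish_checksum_and_send(); /* success: complete the reply */
}

int main(void)
{
        push_reply(0);
        push_reply(1);
        return 0;
}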
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a..5088f90835 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1615{ 1615{
1616 int err; 1616 int err;
1617 u32 addr = imr->imr_multiaddr.s_addr; 1617 u32 addr = imr->imr_multiaddr.s_addr;
1618 struct ip_mc_socklist *iml, *i; 1618 struct ip_mc_socklist *iml=NULL, *i;
1619 struct in_device *in_dev; 1619 struct in_device *in_dev;
1620 struct inet_sock *inet = inet_sk(sk); 1620 struct inet_sock *inet = inet_sk(sk);
1621 int ifindex;
1621 int count = 0; 1622 int count = 0;
1622 1623
1623 if (!MULTICAST(addr)) 1624 if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1633 goto done; 1634 goto done;
1634 } 1635 }
1635 1636
1636 iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1637
1638 err = -EADDRINUSE; 1637 err = -EADDRINUSE;
1638 ifindex = imr->imr_ifindex;
1639 for (i = inet->mc_list; i; i = i->next) { 1639 for (i = inet->mc_list; i; i = i->next) {
1640 if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { 1640 if (i->multi.imr_multiaddr.s_addr == addr &&
1641 /* New style additions are reference counted */ 1641 i->multi.imr_ifindex == ifindex)
1642 if (imr->imr_address.s_addr == 0) {
1643 i->count++;
1644 err = 0;
1645 }
1646 goto done; 1642 goto done;
1647 }
1648 count++; 1643 count++;
1649 } 1644 }
1650 err = -ENOBUFS; 1645 err = -ENOBUFS;
1651 if (iml == NULL || count >= sysctl_igmp_max_memberships) 1646 if (count >= sysctl_igmp_max_memberships)
1647 goto done;
1648 iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
1649 if (iml == NULL)
1652 goto done; 1650 goto done;
1651
1653 memcpy(&iml->multi, imr, sizeof(*imr)); 1652 memcpy(&iml->multi, imr, sizeof(*imr));
1654 iml->next = inet->mc_list; 1653 iml->next = inet->mc_list;
1655 iml->count = 1;
1656 iml->sflist = NULL; 1654 iml->sflist = NULL;
1657 iml->sfmode = MCAST_EXCLUDE; 1655 iml->sfmode = MCAST_EXCLUDE;
1658 inet->mc_list = iml; 1656 inet->mc_list = iml;
1659 ip_mc_inc_group(in_dev, addr); 1657 ip_mc_inc_group(in_dev, addr);
1660 iml = NULL;
1661 err = 0; 1658 err = 0;
1662
1663done: 1659done:
1664 rtnl_shunlock(); 1660 rtnl_shunlock();
1665 if (iml)
1666 sock_kfree_s(sk, iml, sizeof(*iml));
1667 return err; 1661 return err;
1668} 1662}
1669 1663
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1693{ 1687{
1694 struct inet_sock *inet = inet_sk(sk); 1688 struct inet_sock *inet = inet_sk(sk);
1695 struct ip_mc_socklist *iml, **imlp; 1689 struct ip_mc_socklist *iml, **imlp;
1690 struct in_device *in_dev;
1691 u32 group = imr->imr_multiaddr.s_addr;
1692 u32 ifindex;
1696 1693
1697 rtnl_lock(); 1694 rtnl_lock();
1695 in_dev = ip_mc_find_dev(imr);
1696 if (!in_dev) {
1697 rtnl_unlock();
1698 return -ENODEV;
1699 }
1700 ifindex = imr->imr_ifindex;
1698 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1701 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
1699 if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && 1702 if (iml->multi.imr_multiaddr.s_addr == group &&
1700 iml->multi.imr_address.s_addr==imr->imr_address.s_addr && 1703 iml->multi.imr_ifindex == ifindex) {
1701 (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { 1704 (void) ip_mc_leave_src(sk, iml, in_dev);
1702 struct in_device *in_dev;
1703
1704 in_dev = inetdev_by_index(iml->multi.imr_ifindex);
1705 if (in_dev)
1706 (void) ip_mc_leave_src(sk, iml, in_dev);
1707 if (--iml->count) {
1708 rtnl_unlock();
1709 if (in_dev)
1710 in_dev_put(in_dev);
1711 return 0;
1712 }
1713 1705
1714 *imlp = iml->next; 1706 *imlp = iml->next;
1715 1707
1716 if (in_dev) { 1708 ip_mc_dec_group(in_dev, group);
1717 ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
1718 in_dev_put(in_dev);
1719 }
1720 rtnl_unlock(); 1709 rtnl_unlock();
1721 sock_kfree_s(sk, iml, sizeof(*iml)); 1710 sock_kfree_s(sk, iml, sizeof(*iml));
1722 return 0; 1711 return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1736 struct in_device *in_dev = NULL; 1725 struct in_device *in_dev = NULL;
1737 struct inet_sock *inet = inet_sk(sk); 1726 struct inet_sock *inet = inet_sk(sk);
1738 struct ip_sf_socklist *psl; 1727 struct ip_sf_socklist *psl;
1728 int leavegroup = 0;
1739 int i, j, rv; 1729 int i, j, rv;
1740 1730
1741 if (!MULTICAST(addr)) 1731 if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1755 err = -EADDRNOTAVAIL; 1745 err = -EADDRNOTAVAIL;
1756 1746
1757 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1747 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1758 if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) 1748 if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
1749 && pmc->multi.imr_ifindex == imr.imr_ifindex)
1759 break; 1750 break;
1760 } 1751 }
1761 if (!pmc) /* must have a prior join */ 1752 if (!pmc) { /* must have a prior join */
1753 err = -EINVAL;
1762 goto done; 1754 goto done;
1755 }
1763 /* if a source filter was set, must be the same mode as before */ 1756 /* if a source filter was set, must be the same mode as before */
1764 if (pmc->sflist) { 1757 if (pmc->sflist) {
1765 if (pmc->sfmode != omode) 1758 if (pmc->sfmode != omode) {
1759 err = -EINVAL;
1766 goto done; 1760 goto done;
1761 }
1767 } else if (pmc->sfmode != omode) { 1762 } else if (pmc->sfmode != omode) {
1768 /* allow mode switches for empty-set filters */ 1763 /* allow mode switches for empty-set filters */
1769 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); 1764 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1775 psl = pmc->sflist; 1770 psl = pmc->sflist;
1776 if (!add) { 1771 if (!add) {
1777 if (!psl) 1772 if (!psl)
1778 goto done; 1773 goto done; /* err = -EADDRNOTAVAIL */
1779 rv = !0; 1774 rv = !0;
1780 for (i=0; i<psl->sl_count; i++) { 1775 for (i=0; i<psl->sl_count; i++) {
1781 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, 1776 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1784 break; 1779 break;
1785 } 1780 }
1786 if (rv) /* source not found */ 1781 if (rv) /* source not found */
1782 goto done; /* err = -EADDRNOTAVAIL */
1783
1784 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1785 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
1786 leavegroup = 1;
1787 goto done; 1787 goto done;
1788 }
1788 1789
1789 /* update the interface filter */ 1790 /* update the interface filter */
1790 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 1791 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1842 &mreqs->imr_sourceaddr, 1); 1843 &mreqs->imr_sourceaddr, 1);
1843done: 1844done:
1844 rtnl_shunlock(); 1845 rtnl_shunlock();
1846 if (leavegroup)
1847 return ip_mc_leave_group(sk, &imr);
1845 return err; 1848 return err;
1846} 1849}
1847 1850
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 1851int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{ 1852{
1850 int err; 1853 int err = 0;
1851 struct ip_mreqn imr; 1854 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr; 1855 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc; 1856 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev; 1857 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk); 1858 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl; 1859 struct ip_sf_socklist *newpsl, *psl;
1860 int leavegroup = 0;
1857 1861
1858 if (!MULTICAST(addr)) 1862 if (!MULTICAST(addr))
1859 return -EINVAL; 1863 return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1872 err = -ENODEV; 1876 err = -ENODEV;
1873 goto done; 1877 goto done;
1874 } 1878 }
1875 err = -EADDRNOTAVAIL; 1879
1880 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1881 if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
1882 leavegroup = 1;
1883 goto done;
1884 }
1876 1885
1877 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1886 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1878 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 1887 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1879 pmc->multi.imr_ifindex == imr.imr_ifindex) 1888 pmc->multi.imr_ifindex == imr.imr_ifindex)
1880 break; 1889 break;
1881 } 1890 }
1882 if (!pmc) /* must have a prior join */ 1891 if (!pmc) { /* must have a prior join */
1892 err = -EINVAL;
1883 goto done; 1893 goto done;
1894 }
1884 if (msf->imsf_numsrc) { 1895 if (msf->imsf_numsrc) {
1885 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, 1896 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1886 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); 1897 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1909 0, NULL, 0); 1920 0, NULL, 0);
1910 pmc->sflist = newpsl; 1921 pmc->sflist = newpsl;
1911 pmc->sfmode = msf->imsf_fmode; 1922 pmc->sfmode = msf->imsf_fmode;
1923 err = 0;
1912done: 1924done:
1913 rtnl_shunlock(); 1925 rtnl_shunlock();
1926 if (leavegroup)
1927 err = ip_mc_leave_group(sk, &imr);
1914 return err; 1928 return err;
1915} 1929}
1916 1930
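In the ip_mc_join_group() hunks above, the membership list is now searched by (multicast address, ifindex) and the new ip_mc_socklist entry is allocated only after the duplicate and sysctl_igmp_max_memberships checks pass, removing the old allocate-then-maybe-free dance. A compact userspace model of that reordering; the structure, limit and names are hypothetical, not the kernel types:

#include <stdlib.h>

#define MAX_MEMBERSHIPS 20      /* stand-in for sysctl_igmp_max_memberships */

struct member {
        unsigned int group;     /* multicast address */
        int ifindex;            /* interface index */
        struct member *next;
};

/* Returns 0 on success, -1 on duplicate join, limit reached, or OOM. */
static int join_group(struct member **list, unsigned int group, int ifindex)
{
        struct member *m;
        int count = 0;

        for (m = *list; m; m = m->next) {
                if (m->group == group && m->ifindex == ifindex)
                        return -1;      /* already joined on this interface */
                count++;
        }
        if (count >= MAX_MEMBERSHIPS)
                return -1;              /* too many memberships */

        m = malloc(sizeof(*m));         /* allocate only after the checks */
        if (!m)
                return -1;
        m->group = group;
        m->ifindex = ifindex;
        m->next = *list;
        *list = m;
        return 0;
}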
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 95473953c4..ab18a853d7 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -450,10 +450,13 @@ static void peer_check_expire(unsigned long dummy)
450 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 450 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
451 * interval depending on the total number of entries (more entries, 451 * interval depending on the total number of entries (more entries,
452 * less interval). */ 452 * less interval). */
453 peer_periodic_timer.expires = jiffies 453 if (peer_total >= inet_peer_threshold)
454 + inet_peer_gc_maxtime 454 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
455 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 455 else
456 peer_total / inet_peer_threshold * HZ; 456 peer_periodic_timer.expires = jiffies
457 + inet_peer_gc_maxtime
458 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
459 peer_total / inet_peer_threshold * HZ;
457 add_timer(&peer_periodic_timer); 460 add_timer(&peer_periodic_timer);
458} 461}
459 462
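The inetpeer hunk above clamps the garbage-collection rearm interval to inet_peer_gc_mintime once peer_total reaches inet_peer_threshold, instead of letting the interpolated expression run past that point. A small sketch of the same interval arithmetic in plain C, assuming jiffies/HZ-style units; this is an illustration of the formula, not the kernel code:

/* Next GC delay: mintime when at/over the threshold, otherwise
 * interpolate between maxtime and mintime as the pool fills up. */
static unsigned long next_gc_delay(unsigned long total,
                                   unsigned long threshold,
                                   unsigned long gc_mintime,
                                   unsigned long gc_maxtime,
                                   unsigned long hz)
{
        if (total >= threshold)
                return gc_mintime;

        return gc_maxtime -
               (gc_maxtime - gc_mintime) / hz * total / threshold * hz;
}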
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7f68e27eb4..eb377ae153 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
377 return ip_frag_intern(hash, qp); 377 return ip_frag_intern(hash, qp);
378 378
379out_nomem: 379out_nomem:
380 NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); 380 LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n"));
381 return NULL; 381 return NULL;
382} 382}
383 383
@@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
625 return head; 625 return head;
626 626
627out_nomem: 627out_nomem:
628 NETDEBUG(if (net_ratelimit()) 628 LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing "
629 printk(KERN_ERR 629 "queue %p\n", qp));
630 "IP: queue_glue: no memory for gluing queue %p\n",
631 qp));
632 goto out_fail; 630 goto out_fail;
633out_oversize: 631out_oversize:
634 if (net_ratelimit()) 632 if (net_ratelimit())
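The ip_fragment.c hunks above replace open-coded NETDEBUG(if (net_ratelimit()) printk(...)) blocks with LIMIT_NETDEBUG(printk(...)), which presumably folds the rate-limit test into the macro itself. One plausible shape for such a wrapper, shown with userspace stand-ins rather than the real kernel definition:

#include <stdio.h>

/* Stand-in for net_ratelimit(): allow only the first few messages. */
static int ratelimit(void)
{
        static int budget = 3;
        return budget-- > 0;
}

/* A plausible shape for a rate-limited debug macro (assumption, not the
 * actual kernel macro). */
#define LIMIT_DEBUG(stmt) do { if (ratelimit()) { stmt; } } while (0)

int main(void)
{
        for (int i = 0; i < 10; i++)
                LIMIT_DEBUG(printf("queue_glue: no memory for gluing queue\n"));
        return 0;
}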
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8848355222..f0d5740d7e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -290,7 +290,6 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
290 290
291 dev_hold(dev); 291 dev_hold(dev);
292 ipgre_tunnel_link(nt); 292 ipgre_tunnel_link(nt);
293 /* Do not decrement MOD_USE_COUNT here. */
294 return nt; 293 return nt;
295 294
296failed: 295failed:
@@ -1277,12 +1276,28 @@ err1:
1277 goto out; 1276 goto out;
1278} 1277}
1279 1278
1280static void ipgre_fini(void) 1279static void __exit ipgre_destroy_tunnels(void)
1280{
1281 int prio;
1282
1283 for (prio = 0; prio < 4; prio++) {
1284 int h;
1285 for (h = 0; h < HASH_SIZE; h++) {
1286 struct ip_tunnel *t;
1287 while ((t = tunnels[prio][h]) != NULL)
1288 unregister_netdevice(t->dev);
1289 }
1290 }
1291}
1292
1293static void __exit ipgre_fini(void)
1281{ 1294{
1282 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1295 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1283 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1296 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1284 1297
1285 unregister_netdev(ipgre_fb_tunnel_dev); 1298 rtnl_lock();
1299 ipgre_destroy_tunnels();
1300 rtnl_unlock();
1286} 1301}
1287 1302
1288module_init(ipgre_init); 1303module_init(ipgre_init);
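ipgre_fini() above now walks every chain in tunnels[prio][h] under the RTNL lock and unregisters each device, re-reading the list head each iteration because unregistering a tunnel unlinks it from the chain. A minimal userspace model of that pop-the-head teardown loop; the structures and names are illustrative, not the kernel's:

#include <stdlib.h>

#define NPRIO  4
#define HASHSZ 16

struct tunnel {
        struct tunnel *next;
};

static struct tunnel *tunnels[NPRIO][HASHSZ];

/* Stand-in for unregister_netdevice(): unlinks the head and frees it. */
static void unregister_tunnel(struct tunnel *t, struct tunnel **head)
{
        *head = t->next;
        free(t);
}

static void destroy_tunnels(void)
{
        for (int prio = 0; prio < NPRIO; prio++) {
                for (int h = 0; h < HASHSZ; h++) {
                        struct tunnel *t;

                        /* Re-read the head each time: unregistering
                         * removes the entry from the chain. */
                        while ((t = tunnels[prio][h]) != NULL)
                                unregister_tunnel(t, &tunnels[prio][h]);
                }
        }
}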
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index af2ec88bbb..c703528e0b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -283,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
283{ 283{
284 struct net_device *dev = skb->dev; 284 struct net_device *dev = skb->dev;
285 struct iphdr *iph = skb->nh.iph; 285 struct iphdr *iph = skb->nh.iph;
286 int err;
286 287
287 /* 288 /*
288 * Initialise the virtual path cache for the packet. It describes 289 * Initialise the virtual path cache for the packet. It describes
289 * how the packet travels inside Linux networking. 290 * how the packet travels inside Linux networking.
290 */ 291 */
291 if (skb->dst == NULL) { 292 if (skb->dst == NULL) {
292 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) 293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
294 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
293 goto drop; 296 goto drop;
297 }
294 } 298 }
295 299
296#ifdef CONFIG_NET_CLS_ROUTE 300#ifdef CONFIG_NET_CLS_ROUTE
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ee07aec215..80d13103b2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,7 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110 nf_reset(newskb);
111 netif_rx(newskb); 110 netif_rx(newskb);
112 return 0; 111 return 0;
113} 112}
@@ -188,8 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 187 skb = skb2;
189 } 188 }
190 189
191 nf_reset(skb);
192
193 if (hh) { 190 if (hh) {
194 int hh_alen; 191 int hh_alen;
195 192
@@ -383,7 +380,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
383 to->pkt_type = from->pkt_type; 380 to->pkt_type = from->pkt_type;
384 to->priority = from->priority; 381 to->priority = from->priority;
385 to->protocol = from->protocol; 382 to->protocol = from->protocol;
386 to->security = from->security;
387 dst_release(to->dst); 383 dst_release(to->dst);
388 to->dst = dst_clone(from->dst); 384 to->dst = dst_clone(from->dst);
389 to->dev = from->dev; 385 to->dev = from->dev;
@@ -1323,23 +1319,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1323 ip_rt_put(rt); 1319 ip_rt_put(rt);
1324} 1320}
1325 1321
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void) 1322void __init ip_init(void)
1340{ 1323{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init(); 1324 ip_rt_init();
1344 inet_initpeers(); 1325 inet_initpeers();
1345 1326
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f8b172f898..ff4bd067b3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
677 mreq.imr_address.s_addr = mreqs.imr_interface; 677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0; 678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq); 679 err = ip_mc_join_group(sk, &mreq);
680 if (err) 680 if (err && err != -EADDRINUSE)
681 break; 681 break;
682 omode = MCAST_INCLUDE; 682 omode = MCAST_INCLUDE;
683 add = 1; 683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 684 } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE; 685 omode = MCAST_INCLUDE;
686 add = 0; 686 add = 0;
687 } 687 }
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
754 mreq.imr_address.s_addr = 0; 754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface; 755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq); 756 err = ip_mc_join_group(sk, &mreq);
757 if (err) 757 if (err && err != -EADDRINUSE)
758 break; 758 break;
759 greqs.gsr_interface = mreq.imr_ifindex; 759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE; 760 omode = MCAST_INCLUDE;
@@ -848,6 +848,9 @@ mc_msf_out:
848 848
849 case IP_IPSEC_POLICY: 849 case IP_IPSEC_POLICY:
850 case IP_XFRM_POLICY: 850 case IP_XFRM_POLICY:
851 err = -EPERM;
852 if (!capable(CAP_NET_ADMIN))
853 break;
851 err = xfrm_user_policy(sk, optname, optval, optlen); 854 err = xfrm_user_policy(sk, optname, optval, optlen);
852 break; 855 break;
853 856
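In the ip_setsockopt() hunks above, a join that fails with -EADDRINUSE (the socket is already a member, as in the igmp change earlier) no longer aborts the source-filter path, and the IPsec/XFRM policy options are now gated on CAP_NET_ADMIN. A tiny sketch of the "already joined is fine here" pattern, using hypothetical helpers rather than the real socket code:

#include <errno.h>

/* Hypothetical join helper: 0 on success, -EADDRINUSE if already joined,
 * another negative errno on real failure. */
static int join_group(int already_joined)
{
        return already_joined ? -EADDRINUSE : 0;
}

/* Adding a source filter implies membership, so "already a member"
 * is not an error on this path. */
static int add_source_filter(int already_joined)
{
        int err = join_group(already_joined);

        if (err && err != -EADDRINUSE)
                return err;     /* genuine failure */
        /* ... install the source filter ... */
        return 0;
}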
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 2065944fd9..7ded6e60f4 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -358,7 +358,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
358 int cpu; 358 int cpu;
359 359
360 /* This can be any valid CPU ID so we don't need locking. */ 360 /* This can be any valid CPU ID so we don't need locking. */
361 cpu = smp_processor_id(); 361 cpu = raw_smp_processor_id();
362 362
363 list_for_each_entry(pos, &ipcomp_tfms_list, list) { 363 list_for_each_entry(pos, &ipcomp_tfms_list, list) {
364 struct crypto_tfm *tfm; 364 struct crypto_tfm *tfm;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f2509034ce..d2bf8e1930 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void)
1149 ic_rarp_cleanup(); 1149 ic_rarp_cleanup();
1150#endif 1150#endif
1151 1151
1152 if (!ic_got_reply) 1152 if (!ic_got_reply) {
1153 ic_myaddr = INADDR_NONE;
1153 return -1; 1154 return -1;
1155 }
1154 1156
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", 1157 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP" 1158 ((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 68a78731f7..c05c1df0bb 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -255,7 +255,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
255 255
256 dev_hold(dev); 256 dev_hold(dev);
257 ipip_tunnel_link(nt); 257 ipip_tunnel_link(nt);
258 /* Do not decrement MOD_USE_COUNT here. */
259 return nt; 258 return nt;
260 259
261failed: 260failed:
@@ -273,7 +272,7 @@ static void ipip_tunnel_uninit(struct net_device *dev)
273 dev_put(dev); 272 dev_put(dev);
274} 273}
275 274
276static void ipip_err(struct sk_buff *skb, void *__unused) 275static void ipip_err(struct sk_buff *skb, u32 info)
277{ 276{
278#ifndef I_WISH_WORLD_WERE_PERFECT 277#ifndef I_WISH_WORLD_WERE_PERFECT
279 278
@@ -852,11 +851,39 @@ static int __init ipip_fb_tunnel_init(struct net_device *dev)
852 return 0; 851 return 0;
853} 852}
854 853
854#ifdef CONFIG_INET_TUNNEL
855static struct xfrm_tunnel ipip_handler = { 855static struct xfrm_tunnel ipip_handler = {
856 .handler = ipip_rcv, 856 .handler = ipip_rcv,
857 .err_handler = ipip_err, 857 .err_handler = ipip_err,
858}; 858};
859 859
860static inline int ipip_register(void)
861{
862 return xfrm4_tunnel_register(&ipip_handler);
863}
864
865static inline int ipip_unregister(void)
866{
867 return xfrm4_tunnel_deregister(&ipip_handler);
868}
869#else
870static struct net_protocol ipip_protocol = {
871 .handler = ipip_rcv,
872 .err_handler = ipip_err,
873 .no_policy = 1,
874};
875
876static inline int ipip_register(void)
877{
878 return inet_add_protocol(&ipip_protocol, IPPROTO_IPIP);
879}
880
881static inline int ipip_unregister(void)
882{
883 return inet_del_protocol(&ipip_protocol, IPPROTO_IPIP);
884}
885#endif
886
860static char banner[] __initdata = 887static char banner[] __initdata =
861 KERN_INFO "IPv4 over IPv4 tunneling driver\n"; 888 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
862 889
@@ -866,7 +893,7 @@ static int __init ipip_init(void)
866 893
867 printk(banner); 894 printk(banner);
868 895
869 if (xfrm4_tunnel_register(&ipip_handler) < 0) { 896 if (ipip_register() < 0) {
870 printk(KERN_INFO "ipip init: can't register tunnel\n"); 897 printk(KERN_INFO "ipip init: can't register tunnel\n");
871 return -EAGAIN; 898 return -EAGAIN;
872 } 899 }
@@ -888,16 +915,33 @@ static int __init ipip_init(void)
888 err2: 915 err2:
889 free_netdev(ipip_fb_tunnel_dev); 916 free_netdev(ipip_fb_tunnel_dev);
890 err1: 917 err1:
891 xfrm4_tunnel_deregister(&ipip_handler); 918 ipip_unregister();
892 goto out; 919 goto out;
893} 920}
894 921
922static void __exit ipip_destroy_tunnels(void)
923{
924 int prio;
925
926 for (prio = 1; prio < 4; prio++) {
927 int h;
928 for (h = 0; h < HASH_SIZE; h++) {
929 struct ip_tunnel *t;
930 while ((t = tunnels[prio][h]) != NULL)
931 unregister_netdevice(t->dev);
932 }
933 }
934}
935
895static void __exit ipip_fini(void) 936static void __exit ipip_fini(void)
896{ 937{
897 if (xfrm4_tunnel_deregister(&ipip_handler) < 0) 938 if (ipip_unregister() < 0)
898 printk(KERN_INFO "ipip close: can't deregister tunnel\n"); 939 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
899 940
900 unregister_netdev(ipip_fb_tunnel_dev); 941 rtnl_lock();
942 ipip_destroy_tunnels();
943 unregister_netdevice(ipip_fb_tunnel_dev);
944 rtnl_unlock();
901} 945}
902 946
903module_init(ipip_init); 947module_init(ipip_init);
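The ipip.c hunks above select the tunnel's registration backend at build time: with CONFIG_INET_TUNNEL it registers as an xfrm4 tunnel handler, otherwise as a plain net_protocol, both hidden behind static inline ipip_register()/ipip_unregister() wrappers. A stripped-down sketch of that compile-time backend selection; the backends below are dummies, not the kernel interfaces:

#include <stdio.h>

/* Pick a registration backend at compile time, as the patch does with
 * CONFIG_INET_TUNNEL; both backends here are placeholders. */
#ifdef USE_XFRM_BACKEND
static inline int proto_register(void)   { puts("xfrm tunnel registered");    return 0; }
static inline int proto_unregister(void) { puts("xfrm tunnel unregistered");  return 0; }
#else
static inline int proto_register(void)   { puts("net_protocol registered");   return 0; }
static inline int proto_unregister(void) { puts("net_protocol unregistered"); return 0; }
#endif

int main(void)
{
        if (proto_register() < 0)
                return 1;
        /* ... tunnel in use ... */
        return proto_unregister();
}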
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e4f809a93f..dc806b5784 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
297static void ipmr_destroy_unres(struct mfc_cache *c) 297static void ipmr_destroy_unres(struct mfc_cache *c)
298{ 298{
299 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct nlmsgerr *e;
300 301
301 atomic_dec(&cache_resolve_queue_len); 302 atomic_dec(&cache_resolve_queue_len);
302 303
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
306 nlh->nlmsg_type = NLMSG_ERROR; 307 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 308 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len); 309 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; 310 e = NLMSG_DATA(nlh);
311 e->error = -ETIMEDOUT;
312 memset(&e->msg, 0, sizeof(e->msg));
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 313 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else 314 } else
312 kfree_skb(skb); 315 kfree_skb(skb);
@@ -359,7 +362,7 @@ out:
359 362
360/* Fill oifs list. It is called under write locked mrt_lock. */ 363/* Fill oifs list. It is called under write locked mrt_lock. */
361 364
362static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) 365static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
363{ 366{
364 int vifi; 367 int vifi;
365 368
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 502static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{ 503{
501 struct sk_buff *skb; 504 struct sk_buff *skb;
505 struct nlmsgerr *e;
502 506
503 /* 507 /*
504 * Play the pending entries through our router 508 * Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
515 nlh->nlmsg_type = NLMSG_ERROR; 519 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 520 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len); 521 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; 522 e = NLMSG_DATA(nlh);
523 e->error = -EMSGSIZE;
524 memset(&e->msg, 0, sizeof(e->msg));
519 } 525 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 526 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else 527 } else
@@ -721,7 +727,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
721 if (c != NULL) { 727 if (c != NULL) {
722 write_lock_bh(&mrt_lock); 728 write_lock_bh(&mrt_lock);
723 c->mfc_parent = mfc->mfcc_parent; 729 c->mfc_parent = mfc->mfcc_parent;
724 ipmr_update_threshoulds(c, mfc->mfcc_ttls); 730 ipmr_update_thresholds(c, mfc->mfcc_ttls);
725 if (!mrtsock) 731 if (!mrtsock)
726 c->mfc_flags |= MFC_STATIC; 732 c->mfc_flags |= MFC_STATIC;
727 write_unlock_bh(&mrt_lock); 733 write_unlock_bh(&mrt_lock);
@@ -738,7 +744,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
738 c->mfc_origin=mfc->mfcc_origin.s_addr; 744 c->mfc_origin=mfc->mfcc_origin.s_addr;
739 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; 745 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
740 c->mfc_parent=mfc->mfcc_parent; 746 c->mfc_parent=mfc->mfcc_parent;
741 ipmr_update_threshoulds(c, mfc->mfcc_ttls); 747 ipmr_update_thresholds(c, mfc->mfcc_ttls);
742 if (!mrtsock) 748 if (!mrtsock)
743 c->mfc_flags |= MFC_STATIC; 749 c->mfc_flags |= MFC_STATIC;
744 750
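The ipmr.c hunks above stop poking the error code through a cast and instead fill a struct nlmsgerr pointer, zeroing the embedded request header before the message is unicast back over netlink, presumably so no stale buffer contents reach the listener. A small userspace model of that initialise-before-send pattern, with a simplified stand-in for nlmsgerr:

#include <string.h>

/* Simplified stand-in for struct nlmsgerr: an error code plus the echoed
 * request header. */
struct msg_err {
        int error;
        struct {
                unsigned int len;
                unsigned short type;
                unsigned short flags;
        } msg;
};

static void fill_error(struct msg_err *e, int error)
{
        e->error = error;
        /* Zero the echoed header instead of leaving whatever the buffer
         * previously held, mirroring the added memset in the patch. */
        memset(&e->msg, 0, sizeof(e->msg));
}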
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64..c9820bfc49 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
2# IP Virtual Server configuration 2# IP Virtual Server configuration
3# 3#
4menu "IP: Virtual Server Configuration" 4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER 5 depends on NETFILTER
6 6
7config IP_VS 7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)" 8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER 9 depends on NETFILTER
10 ---help--- 10 ---help---
11 IP Virtual Server support will let you build a high-performance 11 IP Virtual Server support will let you build a high-performance
12 virtual server based on cluster of two or more real servers. This 12 virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5499..d0145a8b15 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{ 548{
549 if (del_timer(&cp->timer)) 549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies); 550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552} 551}
553 552
554 553
@@ -759,12 +758,11 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
759 return 1; 758 return 1;
760} 759}
761 760
762 761/* Called from keventd and must protect itself from softirqs */
763void ip_vs_random_dropentry(void) 762void ip_vs_random_dropentry(void)
764{ 763{
765 int idx; 764 int idx;
766 struct ip_vs_conn *cp; 765 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768 766
769 /* 767 /*
770 * Randomly scan 1/32 of the whole table every second 768 * Randomly scan 1/32 of the whole table every second
@@ -775,7 +773,7 @@ void ip_vs_random_dropentry(void)
775 /* 773 /*
776 * Lock is actually needed in this loop. 774 * Lock is actually needed in this loop.
777 */ 775 */
778 ct_write_lock(hash); 776 ct_write_lock_bh(hash);
779 777
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) 779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -801,23 +799,14 @@ void ip_vs_random_dropentry(void)
801 continue; 799 continue;
802 } 800 }
803 801
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n"); 802 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp); 803 ip_vs_conn_expire_now(cp);
814 if (ct) { 804 if (cp->control) {
815 IP_VS_DBG(4, "del conn template\n"); 805 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct); 806 ip_vs_conn_expire_now(cp->control);
817 } 807 }
818 ct_write_lock(hash);
819 } 808 }
820 ct_write_unlock(hash); 809 ct_write_unlock_bh(hash);
821 } 810 }
822} 811}
823 812
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
829{ 818{
830 int idx; 819 int idx;
831 struct ip_vs_conn *cp; 820 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833 821
834 flush_again: 822 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 823 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
839 ct_write_lock_bh(idx); 827 ct_write_lock_bh(idx);
840 828
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 829 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844 830
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n"); 831 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp); 832 ip_vs_conn_expire_now(cp);
849 if (ct) { 833 if (cp->control) {
850 IP_VS_DBG(4, "del conn template\n"); 834 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct); 835 ip_vs_conn_expire_now(cp->control);
852 } 836 }
853 ct_write_lock(idx);
854 } 837 }
855 ct_write_unlock_bh(idx); 838 ct_write_unlock_bh(idx);
856 } 839 }
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d970103..7d99ede2ef 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
90#endif 90#endif
91 91
92/* 92/*
93 * update_defense_level is called from keventd and from sysctl. 93 * update_defense_level is called from keventd and from sysctl,
94 * so it needs to protect itself from softirqs
94 */ 95 */
95static void update_defense_level(void) 96static void update_defense_level(void)
96{ 97{
@@ -110,6 +111,8 @@ static void update_defense_level(void)
110 111
111 nomem = (availmem < sysctl_ip_vs_amemthresh); 112 nomem = (availmem < sysctl_ip_vs_amemthresh);
112 113
114 local_bh_disable();
115
113 /* drop_entry */ 116 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock); 117 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) { 118 switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
206 if (to_change >= 0) 209 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 210 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock); 211 write_unlock(&__ip_vs_securetcp_lock);
212
213 local_bh_enable();
209} 214}
210 215
211 216
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360 /* Restore the correct value */ 1365 /* Restore the correct value */
1361 *valp = val; 1366 *valp = val;
1362 } else { 1367 } else {
1363 local_bh_disable();
1364 update_defense_level(); 1368 update_defense_level();
1365 local_bh_enable();
1366 } 1369 }
1367 } 1370 }
1368 return rc; 1371 return rc;
@@ -2059,7 +2062,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 dst->addr = src->addr; 2062 dst->addr = src->addr;
2060 dst->port = src->port; 2063 dst->port = src->port;
2061 dst->fwmark = src->fwmark; 2064 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name); 2065 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags; 2066 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ; 2067 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask; 2068 dst->netmask = src->netmask;
@@ -2080,6 +2083,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2083 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services) 2084 if (count >= get->num_services)
2082 goto out; 2085 goto out;
2086 memset(&entry, 0, sizeof(entry));
2083 ip_vs_copy_service(&entry, svc); 2087 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count], 2088 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) { 2089 &entry, sizeof(entry))) {
@@ -2094,6 +2098,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2098 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services) 2099 if (count >= get->num_services)
2096 goto out; 2100 goto out;
2101 memset(&entry, 0, sizeof(entry));
2097 ip_vs_copy_service(&entry, svc); 2102 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count], 2103 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) { 2104 &entry, sizeof(entry))) {
@@ -2304,12 +2309,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2304 memset(&d, 0, sizeof(d)); 2309 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2310 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER; 2311 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); 2312 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2308 d[0].syncid = ip_vs_master_syncid; 2313 d[0].syncid = ip_vs_master_syncid;
2309 } 2314 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2315 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP; 2316 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); 2317 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2313 d[1].syncid = ip_vs_backup_syncid; 2318 d[1].syncid = ip_vs_backup_syncid;
2314 } 2319 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0) 2320 if (copy_to_user(user, &d, sizeof(d)) != 0)
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a..574d1f509b 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
839 839
840 ip_vs_sync_state |= state; 840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) { 841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn); 842 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
843 ip_vs_master_syncid = syncid; 843 ip_vs_master_syncid = syncid;
844 } else { 844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); 845 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid; 846 ip_vs_backup_syncid = syncid;
847 } 847 }
848 848
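Several IPVS hunks above swap strcpy() for strlcpy() when copying scheduler and interface names into fixed-size fields handed to userspace, and zero the ip_vs_service_entry before filling it. strlcpy() is a kernel/BSD routine rather than standard C, so this userspace sketch carries its own minimal version to show the bounded, always-terminated copy:

#include <stdio.h>
#include <string.h>

/* Minimal strlcpy-like helper: copy at most size-1 bytes, always
 * NUL-terminate, and return the length of src (as strlcpy does). */
static size_t copy_bounded(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = len < size - 1 ? len : size - 1;

                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;
}

int main(void)
{
        char sched_name[16];

        copy_bounded(sched_name, "weighted-least-connection-scheduler",
                     sizeof(sched_name));
        printf("%s\n", sched_name);     /* truncated, but still terminated */
        return 0;
}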
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a78a320eee..01e1b58322 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -101,14 +101,13 @@ static int help(struct sk_buff **pskb,
101 if (port == 0 || len > 5) 101 if (port == 0 || len > 5)
102 break; 102 break;
103 103
104 exp = ip_conntrack_expect_alloc(); 104 exp = ip_conntrack_expect_alloc(ct);
105 if (exp == NULL) { 105 if (exp == NULL) {
106 ret = NF_DROP; 106 ret = NF_DROP;
107 goto out; 107 goto out;
108 } 108 }
109 109
110 exp->expectfn = NULL; 110 exp->expectfn = NULL;
111 exp->master = ct;
112 111
113 exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; 112 exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
114 exp->tuple.src.u.tcp.port = 0; 113 exp->tuple.src.u.tcp.port = 0;
@@ -126,10 +125,9 @@ static int help(struct sk_buff **pskb,
126 ret = ip_nat_amanda_hook(pskb, ctinfo, 125 ret = ip_nat_amanda_hook(pskb, ctinfo,
127 tmp - amanda_buffer, 126 tmp - amanda_buffer,
128 len, exp); 127 len, exp);
129 else if (ip_conntrack_expect_related(exp) != 0) { 128 else if (ip_conntrack_expect_related(exp) != 0)
130 ip_conntrack_expect_free(exp);
131 ret = NF_DROP; 129 ret = NF_DROP;
132 } 130 ip_conntrack_expect_put(exp);
133 } 131 }
134 132
135out: 133out:
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 4b78ebeb66..a7f0c821a9 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -137,19 +137,12 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
137 137
138 138
139/* ip_conntrack_expect helper functions */ 139/* ip_conntrack_expect helper functions */
140static void destroy_expect(struct ip_conntrack_expect *exp)
141{
142 ip_conntrack_put(exp->master);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
144 kmem_cache_free(ip_conntrack_expect_cachep, exp);
145 CONNTRACK_STAT_INC(expect_delete);
146}
147
148static void unlink_expect(struct ip_conntrack_expect *exp) 140static void unlink_expect(struct ip_conntrack_expect *exp)
149{ 141{
150 ASSERT_WRITE_LOCK(&ip_conntrack_lock); 142 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
151 list_del(&exp->list); 144 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */ 145 CONNTRACK_STAT_INC(expect_delete);
153 exp->master->expecting--; 146 exp->master->expecting--;
154} 147}
155 148
@@ -160,7 +153,7 @@ static void expectation_timed_out(unsigned long ul_expect)
160 write_lock_bh(&ip_conntrack_lock); 153 write_lock_bh(&ip_conntrack_lock);
161 unlink_expect(exp); 154 unlink_expect(exp);
162 write_unlock_bh(&ip_conntrack_lock); 155 write_unlock_bh(&ip_conntrack_lock);
163 destroy_expect(exp); 156 ip_conntrack_expect_put(exp);
164} 157}
165 158
166/* If an expectation for this connection is found, it gets deleted from 159
@@ -198,7 +191,7 @@ static void remove_expectations(struct ip_conntrack *ct)
198 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { 191 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
199 if (i->master == ct && del_timer(&i->timeout)) { 192 if (i->master == ct && del_timer(&i->timeout)) {
200 unlink_expect(i); 193 unlink_expect(i);
201 destroy_expect(i); 194 ip_conntrack_expect_put(i);
202 } 195 }
203 } 196 }
204} 197}
@@ -517,9 +510,14 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
517 /* Welcome, Mr. Bond. We've been expecting you... */ 510 /* Welcome, Mr. Bond. We've been expecting you... */
518 __set_bit(IPS_EXPECTED_BIT, &conntrack->status); 511 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
519 conntrack->master = exp->master; 512 conntrack->master = exp->master;
520#if CONFIG_IP_NF_CONNTRACK_MARK 513#ifdef CONFIG_IP_NF_CONNTRACK_MARK
521 conntrack->mark = exp->master->mark; 514 conntrack->mark = exp->master->mark;
522#endif 515#endif
516#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
517 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
518 /* this is ugly, but there is no other place where to put it */
519 conntrack->nat.masq_index = exp->master->nat.masq_index;
520#endif
523 nf_conntrack_get(&conntrack->master->ct_general); 521 nf_conntrack_get(&conntrack->master->ct_general);
524 CONNTRACK_STAT_INC(expect_new); 522 CONNTRACK_STAT_INC(expect_new);
525 } else { 523 } else {
@@ -537,7 +535,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
537 if (exp) { 535 if (exp) {
538 if (exp->expectfn) 536 if (exp->expectfn)
539 exp->expectfn(conntrack, exp); 537 exp->expectfn(conntrack, exp);
540 destroy_expect(exp); 538 ip_conntrack_expect_put(exp);
541 } 539 }
542 540
543 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; 541 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
@@ -729,14 +727,14 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) { 727 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i); 728 unlink_expect(i);
731 write_unlock_bh(&ip_conntrack_lock); 729 write_unlock_bh(&ip_conntrack_lock);
732 destroy_expect(i); 730 ip_conntrack_expect_put(i);
733 return; 731 return;
734 } 732 }
735 } 733 }
736 write_unlock_bh(&ip_conntrack_lock); 734 write_unlock_bh(&ip_conntrack_lock);
737} 735}
738 736
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) 737struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
740{ 738{
741 struct ip_conntrack_expect *new; 739 struct ip_conntrack_expect *new;
742 740
@@ -745,18 +743,23 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
745 DEBUGP("expect_related: OOM allocating expect\n"); 743 DEBUGP("expect_related: OOM allocating expect\n");
746 return NULL; 744 return NULL;
747 } 745 }
748 new->master = NULL; 746 new->master = me;
747 atomic_inc(&new->master->ct_general.use);
748 atomic_set(&new->use, 1);
749 return new; 749 return new;
750} 750}
751 751
752void ip_conntrack_expect_free(struct ip_conntrack_expect *expect) 752void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
753{ 753{
754 kmem_cache_free(ip_conntrack_expect_cachep, expect); 754 if (atomic_dec_and_test(&exp->use)) {
755 ip_conntrack_put(exp->master);
756 kmem_cache_free(ip_conntrack_expect_cachep, exp);
757 }
755} 758}
756 759
757static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) 760static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
758{ 761{
759 atomic_inc(&exp->master->ct_general.use); 762 atomic_inc(&exp->use);
760 exp->master->expecting++; 763 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list); 764 list_add(&exp->list, &ip_conntrack_expect_list);
762 765
@@ -778,7 +781,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
778 if (i->master == master) { 781 if (i->master == master) {
779 if (del_timer(&i->timeout)) { 782 if (del_timer(&i->timeout)) {
780 unlink_expect(i); 783 unlink_expect(i);
781 destroy_expect(i); 784 ip_conntrack_expect_put(i);
782 } 785 }
783 break; 786 break;
784 } 787 }
@@ -810,8 +813,6 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
810 /* Refresh timer: if it's dying, ignore.. */ 813 /* Refresh timer: if it's dying, ignore.. */
811 if (refresh_timer(i)) { 814 if (refresh_timer(i)) {
812 ret = 0; 815 ret = 0;
813 /* We don't need the one they've given us. */
814 ip_conntrack_expect_free(expect);
815 goto out; 816 goto out;
816 } 817 }
817 } else if (expect_clash(i, expect)) { 818 } else if (expect_clash(i, expect)) {
@@ -881,7 +882,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
881 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { 882 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
882 if (exp->master->helper == me && del_timer(&exp->timeout)) { 883 if (exp->master->helper == me && del_timer(&exp->timeout)) {
883 unlink_expect(exp); 884 unlink_expect(exp);
884 destroy_expect(exp); 885 ip_conntrack_expect_put(exp);
885 } 886 }
886 } 887 }
887 /* Get rid of expecteds, set helpers to NULL. */ 888 /* Get rid of expecteds, set helpers to NULL. */
@@ -1111,6 +1112,9 @@ void ip_conntrack_cleanup(void)
1111 schedule(); 1112 schedule();
1112 goto i_see_dead_people; 1113 goto i_see_dead_people;
1113 } 1114 }
1115 /* wait until all references to ip_conntrack_untracked are dropped */
1116 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1117 schedule();
1114 1118
1115 kmem_cache_destroy(ip_conntrack_cachep); 1119 kmem_cache_destroy(ip_conntrack_cachep);
1116 kmem_cache_destroy(ip_conntrack_expect_cachep); 1120 kmem_cache_destroy(ip_conntrack_expect_cachep);
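The conntrack changes above put expectations on an explicit reference count: ip_conntrack_expect_alloc() now takes the master connection, pins it, and starts the expectation at one reference, ip_conntrack_expect_insert() takes another, and ip_conntrack_expect_put() replaces both the old _free() and destroy paths. A bare userspace model of that alloc-pins-parent / put-releases-both lifetime; the types and names are illustrative and the counters are plain ints rather than atomics:

#include <stdlib.h>

struct conn {
        int refcnt;
};

struct expect {
        int use;                /* reference count on the expectation */
        struct conn *master;    /* pinned for the expectation's lifetime */
};

static struct expect *expect_alloc(struct conn *master)
{
        struct expect *exp = malloc(sizeof(*exp));

        if (!exp)
                return NULL;
        exp->master = master;
        master->refcnt++;       /* pin the master connection */
        exp->use = 1;           /* caller holds the first reference */
        return exp;
}

static void expect_put(struct expect *exp)
{
        if (--exp->use == 0) {
                exp->master->refcnt--;  /* drop the pin on the master */
                free(exp);
        }
}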
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index fea6dd2a00..7a3b773be3 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -376,7 +376,7 @@ static int help(struct sk_buff **pskb,
376 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); 376 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
377 377
378 /* Allocate expectation which will be inserted */ 378 /* Allocate expectation which will be inserted */
379 exp = ip_conntrack_expect_alloc(); 379 exp = ip_conntrack_expect_alloc(ct);
380 if (exp == NULL) { 380 if (exp == NULL) {
381 ret = NF_DROP; 381 ret = NF_DROP;
382 goto out; 382 goto out;
@@ -403,8 +403,7 @@ static int help(struct sk_buff **pskb,
403 networks, or the packet filter itself). */ 403 networks, or the packet filter itself). */
404 if (!loose) { 404 if (!loose) {
405 ret = NF_ACCEPT; 405 ret = NF_ACCEPT;
406 ip_conntrack_expect_free(exp); 406 goto out_put_expect;
407 goto out_update_nl;
408 } 407 }
409 exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) 408 exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
410 | (array[2] << 8) | array[3]); 409 | (array[2] << 8) | array[3]);
@@ -419,7 +418,6 @@ static int help(struct sk_buff **pskb,
419 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); 418 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
420 419
421 exp->expectfn = NULL; 420 exp->expectfn = NULL;
422 exp->master = ct;
423 421
424 /* Now, NAT might want to mangle the packet, and register the 422 /* Now, NAT might want to mangle the packet, and register the
425 * (possibly changed) expectation itself. */ 423 * (possibly changed) expectation itself. */
@@ -428,13 +426,15 @@ static int help(struct sk_buff **pskb,
428 matchoff, matchlen, exp, &seq); 426 matchoff, matchlen, exp, &seq);
429 else { 427 else {
430 /* Can't expect this? Best to drop packet now. */ 428 /* Can't expect this? Best to drop packet now. */
431 if (ip_conntrack_expect_related(exp) != 0) { 429 if (ip_conntrack_expect_related(exp) != 0)
432 ip_conntrack_expect_free(exp);
433 ret = NF_DROP; 430 ret = NF_DROP;
434 } else 431 else
435 ret = NF_ACCEPT; 432 ret = NF_ACCEPT;
436 } 433 }
437 434
435out_put_expect:
436 ip_conntrack_expect_put(exp);
437
438out_update_nl: 438out_update_nl:
439 /* Now if this ends in \n, update ftp info. Seq may have been 439 /* Now if this ends in \n, update ftp info. Seq may have been
440 * adjusted by NAT code. */ 440 * adjusted by NAT code. */
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index cd98772cc3..4a28f297d5 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -197,7 +197,7 @@ static int help(struct sk_buff **pskb,
197 continue; 197 continue;
198 } 198 }
199 199
200 exp = ip_conntrack_expect_alloc(); 200 exp = ip_conntrack_expect_alloc(ct);
201 if (exp == NULL) { 201 if (exp == NULL) {
202 ret = NF_DROP; 202 ret = NF_DROP;
203 goto out; 203 goto out;
@@ -221,16 +221,14 @@ static int help(struct sk_buff **pskb,
221 { { 0, { 0 } }, 221 { { 0, { 0 } },
222 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); 222 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
223 exp->expectfn = NULL; 223 exp->expectfn = NULL;
224 exp->master = ct;
225 if (ip_nat_irc_hook) 224 if (ip_nat_irc_hook)
226 ret = ip_nat_irc_hook(pskb, ctinfo, 225 ret = ip_nat_irc_hook(pskb, ctinfo,
227 addr_beg_p - ib_ptr, 226 addr_beg_p - ib_ptr,
228 addr_end_p - addr_beg_p, 227 addr_end_p - addr_beg_p,
229 exp); 228 exp);
230 else if (ip_conntrack_expect_related(exp) != 0) { 229 else if (ip_conntrack_expect_related(exp) != 0)
231 ip_conntrack_expect_free(exp);
232 ret = NF_DROP; 230 ret = NF_DROP;
233 } 231 ip_conntrack_expect_put(exp);
234 goto out; 232 goto out;
235 } /* for .. NUM_DCCPROTO */ 233 } /* for .. NUM_DCCPROTO */
236 } /* while data < ... */ 234 } /* while data < ... */
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 42dc951028..61798c46e9 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
432 const struct net_device *out, 432 const struct net_device *out,
433 int (*okfn)(struct sk_buff *)) 433 int (*okfn)(struct sk_buff *))
434{ 434{
435#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
436 /* Previously seen (loopback)? Ignore. Do this before
437 fragment check. */
438 if ((*pskb)->nfct)
439 return NF_ACCEPT;
440#endif
441
435 /* Gather fragments. */ 442 /* Gather fragments. */
436 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 443 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
437 *pskb = ip_ct_gather_frags(*pskb, 444 *pskb = ip_ct_gather_frags(*pskb,
@@ -978,7 +985,7 @@ EXPORT_SYMBOL(ip_ct_refresh_acct);
978EXPORT_SYMBOL(ip_ct_protos); 985EXPORT_SYMBOL(ip_ct_protos);
979EXPORT_SYMBOL(ip_ct_find_proto); 986EXPORT_SYMBOL(ip_ct_find_proto);
980EXPORT_SYMBOL(ip_conntrack_expect_alloc); 987EXPORT_SYMBOL(ip_conntrack_expect_alloc);
981EXPORT_SYMBOL(ip_conntrack_expect_free); 988EXPORT_SYMBOL(ip_conntrack_expect_put);
982EXPORT_SYMBOL(ip_conntrack_expect_related); 989EXPORT_SYMBOL(ip_conntrack_expect_related);
983EXPORT_SYMBOL(ip_conntrack_unexpect_related); 990EXPORT_SYMBOL(ip_conntrack_unexpect_related);
984EXPORT_SYMBOL(ip_conntrack_tuple_taken); 991EXPORT_SYMBOL(ip_conntrack_tuple_taken);
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index 992fac3e36..f8ff170f39 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -65,7 +65,7 @@ static int tftp_help(struct sk_buff **pskb,
65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
67 67
68 exp = ip_conntrack_expect_alloc(); 68 exp = ip_conntrack_expect_alloc(ct);
69 if (exp == NULL) 69 if (exp == NULL)
70 return NF_DROP; 70 return NF_DROP;
71 71
@@ -75,17 +75,15 @@ static int tftp_help(struct sk_buff **pskb,
75 exp->mask.dst.u.udp.port = 0xffff; 75 exp->mask.dst.u.udp.port = 0xffff;
76 exp->mask.dst.protonum = 0xff; 76 exp->mask.dst.protonum = 0xff;
77 exp->expectfn = NULL; 77 exp->expectfn = NULL;
78 exp->master = ct;
79 78
80 DEBUGP("expect: "); 79 DEBUGP("expect: ");
81 DUMP_TUPLE(&exp->tuple); 80 DUMP_TUPLE(&exp->tuple);
82 DUMP_TUPLE(&exp->mask); 81 DUMP_TUPLE(&exp->mask);
83 if (ip_nat_tftp_hook) 82 if (ip_nat_tftp_hook)
84 ret = ip_nat_tftp_hook(pskb, ctinfo, exp); 83 ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
85 else if (ip_conntrack_expect_related(exp) != 0) { 84 else if (ip_conntrack_expect_related(exp) != 0)
86 ip_conntrack_expect_free(exp);
87 ret = NF_DROP; 85 ret = NF_DROP;
88 } 86 ip_conntrack_expect_put(exp);
89 break; 87 break;
90 case TFTP_OPCODE_DATA: 88 case TFTP_OPCODE_DATA:
91 case TFTP_OPCODE_ACK: 89 case TFTP_OPCODE_ACK:
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
index da1f412583..706c8074f4 100644
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -56,10 +56,8 @@ static unsigned int help(struct sk_buff **pskb,
56 break; 56 break;
57 } 57 }
58 58
59 if (port == 0) { 59 if (port == 0)
60 ip_conntrack_expect_free(exp);
61 return NF_DROP; 60 return NF_DROP;
62 }
63 61
64 sprintf(buffer, "%u", port); 62 sprintf(buffer, "%u", port);
65 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, 63 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index c6000e794a..d83757a70d 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -143,10 +143,8 @@ static unsigned int ip_nat_ftp(struct sk_buff **pskb,
143 break; 143 break;
144 } 144 }
145 145
146 if (port == 0) { 146 if (port == 0)
147 ip_conntrack_expect_free(exp);
148 return NF_DROP; 147 return NF_DROP;
149 }
150 148
151 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, 149 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
152 seq)) { 150 seq)) {
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
index 9c1ca3381d..de31942bab 100644
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -65,10 +65,8 @@ static unsigned int help(struct sk_buff **pskb,
65 break; 65 break;
66 } 66 }
67 67
68 if (port == 0) { 68 if (port == 0)
69 ip_conntrack_expect_free(exp);
70 return NF_DROP; 69 return NF_DROP;
71 }
72 70
73 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 71 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
74 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 72 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index a558cf0eee..6596c9ee16 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -35,16 +35,17 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
35 const struct ip_conntrack *conntrack) 35 const struct ip_conntrack *conntrack)
36{ 36{
37 static u_int16_t id; 37 static u_int16_t id;
38 unsigned int range_size 38 unsigned int range_size;
39 = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
40 unsigned int i; 39 unsigned int i;
41 40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 /* If no range specified... */ 42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) 43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF; 44 range_size = 0xFFFF;
45 45
46 for (i = 0; i < range_size; i++, id++) { 46 for (i = 0; i < range_size; i++, id++) {
47 tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); 47 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
48 (id % range_size));
48 if (!ip_nat_used_tuple(tuple, conntrack)) 49 if (!ip_nat_used_tuple(tuple, conntrack))
49 return 1; 50 return 1;
50 } 51 }
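Editor's note: the icmp_unique_tuple() hunk above is a byte-order fix. range->min.icmp.id and range->max.icmp.id are stored in network byte order, so both the range size and the candidate ID must be converted with ntohs()/htons() before any arithmetic is done on them. A standalone illustration of what goes wrong without the conversion (the 1000..1999 range is an assumed example):

/* compile and run on a little-endian host to see the difference */
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	unsigned short min = htons(1000), max = htons(1999);

	/* subtracting byte-swapped values gives a bogus range size */
	printf("without ntohs(): %u\n", (unsigned short)(max - min + 1));
	/* converting first gives the intended 1000-entry range */
	printf("with ntohs():    %u\n", (unsigned)(ntohs(max) - ntohs(min) + 1));
	return 0;
}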
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a91cfceff2..a98e36d2b3 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -40,7 +40,8 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
40 enum ip_nat_manip_type maniptype, 40 enum ip_nat_manip_type maniptype,
41 const struct ip_conntrack *conntrack) 41 const struct ip_conntrack *conntrack)
42{ 42{
43 static u_int16_t port, *portptr; 43 static u_int16_t port;
44 u_int16_t *portptr;
44 unsigned int range_size, min, i; 45 unsigned int range_size, min, i;
45 46
46 if (maniptype == IP_NAT_MANIP_SRC) 47 if (maniptype == IP_NAT_MANIP_SRC)
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index c669e3b5f5..9f66e56256 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -41,7 +41,8 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple,
41 enum ip_nat_manip_type maniptype, 41 enum ip_nat_manip_type maniptype,
42 const struct ip_conntrack *conntrack) 42 const struct ip_conntrack *conntrack)
43{ 43{
44 static u_int16_t port, *portptr; 44 static u_int16_t port;
45 u_int16_t *portptr;
45 unsigned int range_size, min, i; 46 unsigned int range_size, min, i;
46 47
47 if (maniptype == IP_NAT_MANIP_SRC) 48 if (maniptype == IP_NAT_MANIP_SRC)
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index bc59d0d6e8..91d5ea1dbb 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -102,6 +102,10 @@ ip_nat_fn(unsigned int hooknum,
102 return NF_ACCEPT; 102 return NF_ACCEPT;
103 } 103 }
104 104
105 /* Don't try to NAT if this packet is not conntracked */
106 if (ct == &ip_conntrack_untracked)
107 return NF_ACCEPT;
108
105 switch (ctinfo) { 109 switch (ctinfo) {
106 case IP_CT_RELATED: 110 case IP_CT_RELATED:
107 case IP_CT_RELATED+IP_CT_IS_REPLY: 111 case IP_CT_RELATED+IP_CT_IS_REPLY:
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
index 0343e0d646..2215317c76 100644
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -45,10 +45,8 @@ static unsigned int help(struct sk_buff **pskb,
45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; 45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_REPLY; 46 exp->dir = IP_CT_DIR_REPLY;
47 exp->expectfn = ip_nat_follow_master; 47 exp->expectfn = ip_nat_follow_master;
48 if (ip_conntrack_expect_related(exp) != 0) { 48 if (ip_conntrack_expect_related(exp) != 0)
49 ip_conntrack_expect_free(exp);
50 return NF_DROP; 49 return NF_DROP;
51 }
52 return NF_ACCEPT; 50 return NF_ACCEPT;
53} 51}
54 52
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index eda1fba431..c6baa81743 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -214,6 +214,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
214 break; 214 break;
215 215
216 case IPQ_COPY_PACKET: 216 case IPQ_COPY_PACKET:
217 if (entry->skb->ip_summed == CHECKSUM_HW &&
218 (*errp = skb_checksum_help(entry->skb,
219 entry->info->outdev == NULL))) {
220 read_unlock_bh(&queue_lock);
221 return NULL;
222 }
217 if (copy_range == 0 || copy_range > entry->skb->len) 223 if (copy_range == 0 || copy_range > entry->skb->len)
218 data_len = entry->skb->len; 224 data_len = entry->skb->len;
219 else 225 else
@@ -385,6 +391,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
385 if (!skb_ip_make_writable(&e->skb, v->data_len)) 391 if (!skb_ip_make_writable(&e->skb, v->data_len))
386 return -ENOMEM; 392 return -ENOMEM;
387 memcpy(e->skb->data, v->payload, v->data_len); 393 memcpy(e->skb->data, v->payload, v->data_len);
394 e->skb->ip_summed = CHECKSUM_NONE;
388 e->skb->nfcache |= NFC_ALTERED; 395 e->skb->nfcache |= NFC_ALTERED;
389 396
390 /* 397 /*
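Editor's note: the two ip_queue hunks above follow one rule. A CHECKSUM_HW packet must have its checksum completed in software (skb_checksum_help()) before its payload is handed to the userspace queue handler, and once userspace has rewritten the payload the stale checksum state must be dropped by setting ip_summed back to CHECKSUM_NONE. A sketch of that rule, using only names that appear in the patch (kernel context assumed, not a standalone program; the wrapper functions are illustrative):

/* before copying the packet out to userspace */
static int example_prepare_copy(struct sk_buff *skb, int out_is_null)
{
	int err;

	if (skb->ip_summed == CHECKSUM_HW &&
	    (err = skb_checksum_help(skb, out_is_null)) != 0)
		return err;		/* could not finalise the checksum */
	return 0;
}

/* after userspace has replaced the payload via the verdict message */
static void example_after_mangle(struct sk_buff *skb)
{
	skb->ip_summed = CHECKSUM_NONE;	/* old hardware checksum state is stale */
}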
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index dc4362b57c..6706d3a1bc 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -30,7 +30,7 @@
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32 32
33#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.7"
34 34
35#define DEBUG_CLUSTERIP 35#define DEBUG_CLUSTERIP
36 36
@@ -339,7 +339,7 @@ target(struct sk_buff **pskb,
339 * error messages (RELATED) and information requests (see below) */ 339 * error messages (RELATED) and information requests (see below) */
340 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 340 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
341 && (ctinfo == IP_CT_RELATED 341 && (ctinfo == IP_CT_RELATED
342 || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) 342 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
343 return IPT_CONTINUE; 343 return IPT_CONTINUE;
344 344
345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -524,8 +524,9 @@ arp_mangle(unsigned int hook,
524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
525 return NF_ACCEPT; 525 return NF_ACCEPT;
526 526
527 /* we only want to mangle arp replies */ 527 /* we only want to mangle arp requests and replies */
528 if (arp->ar_op != htons(ARPOP_REPLY)) 528 if (arp->ar_op != htons(ARPOP_REPLY)
529 && arp->ar_op != htons(ARPOP_REQUEST))
529 return NF_ACCEPT; 530 return NF_ACCEPT;
530 531
531 payload = (void *)(arp+1); 532 payload = (void *)(arp+1);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ada9911118..94a0ce1c1c 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -61,16 +61,20 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
61 if (!tcph) 61 if (!tcph)
62 return 0; 62 return 0;
63 63
64 if (!(einfo->operation & IPT_ECN_OP_SET_ECE 64 if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
65 || tcph->ece == einfo->proto.tcp.ece) 65 tcph->ece == einfo->proto.tcp.ece) &&
66 && (!(einfo->operation & IPT_ECN_OP_SET_CWR 66 ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
67 || tcph->cwr == einfo->proto.tcp.cwr))) 67 tcph->cwr == einfo->proto.tcp.cwr)))
68 return 1; 68 return 1;
69 69
70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
71 return 0; 71 return 0;
72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; 72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
73 73
74 if ((*pskb)->ip_summed == CHECKSUM_HW &&
75 skb_checksum_help(*pskb, inward))
76 return 0;
77
74 diffs[0] = ((u_int16_t *)tcph)[6]; 78 diffs[0] = ((u_int16_t *)tcph)[6];
75 if (einfo->operation & IPT_ECN_OP_SET_ECE) 79 if (einfo->operation & IPT_ECN_OP_SET_ECE)
76 tcph->ece = einfo->proto.tcp.ece; 80 tcph->ece = einfo->proto.tcp.ece;
@@ -79,13 +83,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
79 diffs[1] = ((u_int16_t *)tcph)[6]; 83 diffs[1] = ((u_int16_t *)tcph)[6];
80 diffs[0] = diffs[0] ^ 0xFFFF; 84 diffs[0] = diffs[0] ^ 0xFFFF;
81 85
82 if ((*pskb)->ip_summed != CHECKSUM_HW) 86 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY)
83 tcph->check = csum_fold(csum_partial((char *)diffs, 87 tcph->check = csum_fold(csum_partial((char *)diffs,
84 sizeof(diffs), 88 sizeof(diffs),
85 tcph->check^0xFFFF)); 89 tcph->check^0xFFFF));
86 else
87 if (skb_checksum_help(*pskb, inward))
88 return 0;
89 (*pskb)->nfcache |= NFC_ALTERED; 90 (*pskb)->nfcache |= NFC_ALTERED;
90 return 1; 91 return 1;
91} 92}
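Editor's note: two things happen in the set_ect_tcp() hunks above. The skb_checksum_help() call moves before the header is touched, so a CHECKSUM_HW packet gets a real checksum computed over the unmodified data, and the TCP checksum is then patched incrementally from the one 16-bit word holding the ECE/CWR bits, using the before and after values saved in diffs[]. The incremental update itself is ordinary RFC 1624 arithmetic; a standalone illustration (the header values are assumed examples):

#include <stdint.h>
#include <stdio.h>

/* patch a 16-bit one's-complement checksum when one header word changes */
static uint16_t csum16_update(uint16_t check, uint16_t old_word, uint16_t new_word)
{
	uint32_t sum = (uint16_t)~check;	/* back to the one's-complement sum */
	sum += (uint16_t)~old_word;		/* remove the old word */
	sum += new_word;			/* add the new word */
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* e.g. setting the ECE bit in the TCP flags word: 0x5010 -> 0x5050 */
	printf("patched check = 0x%04x\n", csum16_update(0x1c46, 0x5010, 0x5050));
	return 0;
}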
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 1049050b2b..7b84a25444 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -61,6 +61,10 @@ ipt_tcpmss_target(struct sk_buff **pskb,
61 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 61 if (!skb_ip_make_writable(pskb, (*pskb)->len))
62 return NF_DROP; 62 return NF_DROP;
63 63
64 if ((*pskb)->ip_summed == CHECKSUM_HW &&
65 skb_checksum_help(*pskb, out == NULL))
66 return NF_DROP;
67
64 iph = (*pskb)->nh.iph; 68 iph = (*pskb)->nh.iph;
65 tcplen = (*pskb)->len - iph->ihl*4; 69 tcplen = (*pskb)->len - iph->ihl*4;
66 70
@@ -186,9 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
186 newmss); 190 newmss);
187 191
188 retmodified: 192 retmodified:
189 /* We never hw checksum SYN packets. */
190 BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
191
192 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; 193 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
193 return IPT_CONTINUE; 194 return IPT_CONTINUE;
194} 195}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f4d53c9198..d675ff80b0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
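Editor's note: the rt_hash_bucket change above is classic lock striping. Rather than embedding a spinlock in every one of the (possibly hundreds of thousands of) hash buckets, a small power-of-two table of locks is shared between them, and rt_hash_lock_addr() picks a lock by masking the bucket index. The buckets shrink to a single chain pointer and the lock table stays a fixed, cache-friendly size. A standalone sketch of the same idea with pthreads (the sizes are assumed examples):

#include <pthread.h>

#define N_BUCKETS	(1 << 17)	/* large hash table */
#define N_LOCKS		256		/* small, fixed lock table */

/* GCC range initializer; one mutex per stripe */
static pthread_mutex_t stripe_locks[N_LOCKS] = {
	[0 ... N_LOCKS - 1] = PTHREAD_MUTEX_INITIALIZER
};

static pthread_mutex_t *bucket_lock(unsigned int bucket)
{
	return &stripe_locks[bucket & (N_LOCKS - 1)];
}

/* usage:
 *	pthread_mutex_lock(bucket_lock(hash));
 *	... walk or modify the chain of bucket `hash` ...
 *	pthread_mutex_unlock(bucket_lock(hash));
 */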
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
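Editor's note: rt_check_expire() now walks a bounded slice of the hash table per timer run, goal = (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets clamped to the table size, and empty chains are skipped before the stripe lock is taken. A standalone worked example of that bucket budget (all values below are assumed defaults, not taken from this patch):

#include <stdio.h>

int main(void)
{
	unsigned long long HZ = 1000;
	unsigned long long gc_interval = 60 * HZ;	/* assumed: 60 s between runs */
	unsigned long long gc_timeout  = 300 * HZ;	/* assumed: 300 s entry timeout */
	unsigned int rt_hash_log = 17;			/* assumed: 128K-bucket table */

	unsigned long long goal = (gc_interval << rt_hash_log) / gc_timeout;
	printf("buckets scanned per run: %llu\n", goal);	/* about 26214, a fifth of the table */
	return 0;
}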
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648 "%u.%u.%u.%u, on dev %s\n", 1686 "%u.%u.%u.%u, on dev %s\n",
1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650 if (dev->hard_header_len) { 1688 if (dev->hard_header_len && skb->mac.raw) {
1651 int i; 1689 int i;
1652 unsigned char *p = skb->mac.raw; 1690 unsigned char *p = skb->mac.raw;
1653 printk(KERN_WARNING "ll header: "); 1691 printk(KERN_WARNING "ll header: ");
@@ -1767,7 +1805,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
1767 struct in_device *in_dev, 1805 struct in_device *in_dev,
1768 u32 daddr, u32 saddr, u32 tos) 1806 u32 daddr, u32 saddr, u32 tos)
1769{ 1807{
1770 struct rtable* rth; 1808 struct rtable* rth = NULL;
1771 int err; 1809 int err;
1772 unsigned hash; 1810 unsigned hash;
1773 1811
@@ -1794,7 +1832,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
1794 u32 daddr, u32 saddr, u32 tos) 1832 u32 daddr, u32 saddr, u32 tos)
1795{ 1833{
1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 1834#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1797 struct rtable* rth; 1835 struct rtable* rth = NULL;
1798 unsigned char hop, hopcount, lasthop; 1836 unsigned char hop, hopcount, lasthop;
1799 int err = -EINVAL; 1837 int err = -EINVAL;
1800 unsigned int hash; 1838 unsigned int hash;
@@ -1909,7 +1947,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1909 */ 1947 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev)) 1949 if (!IN_DEV_FORWARD(in_dev))
1912 goto e_inval; 1950 goto e_hostunreach;
1913 goto no_route; 1951 goto no_route;
1914 } 1952 }
1915 free_res = 1; 1953 free_res = 1;
@@ -1933,7 +1971,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1933 } 1971 }
1934 1972
1935 if (!IN_DEV_FORWARD(in_dev)) 1973 if (!IN_DEV_FORWARD(in_dev))
1936 goto e_inval; 1974 goto e_hostunreach;
1937 if (res.type != RTN_UNICAST) 1975 if (res.type != RTN_UNICAST)
1938 goto martian_destination; 1976 goto martian_destination;
1939 1977
@@ -2025,6 +2063,11 @@ martian_destination:
2025 "%u.%u.%u.%u, dev %s\n", 2063 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif 2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2028e_inval: 2071e_inval:
2029 err = -EINVAL; 2072 err = -EINVAL;
2030 goto done; 2073 goto done;
@@ -2239,7 +2282,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
2239 struct net_device *dev_out, 2282 struct net_device *dev_out,
2240 unsigned flags) 2283 unsigned flags)
2241{ 2284{
2242 struct rtable *rth; 2285 struct rtable *rth = NULL;
2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); 2286 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2244 unsigned hash; 2287 unsigned hash;
2245 if (err == 0) { 2288 if (err == 0) {
@@ -2267,7 +2310,7 @@ static inline int ip_mkroute_output(struct rtable** rp,
2267 unsigned char hop; 2310 unsigned char hop;
2268 unsigned hash; 2311 unsigned hash;
2269 int err = -EINVAL; 2312 int err = -EINVAL;
2270 struct rtable *rth; 2313 struct rtable *rth = NULL;
2271 2314
2272 if (res->fi && res->fi->fib_nhs > 1) { 2315 if (res->fi && res->fi->fib_nhs > 1) {
2273 unsigned char hopcount = res->fi->fib_nhs; 2316 unsigned char hopcount = res->fi->fib_nhs;
@@ -3068,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3068 3111
3069int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3070{ 3113{
3071 int i, order, goal, rc = 0; 3114 int rc = 0;
3072 3115
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3075 3118
3076#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3077 for (order = 0; 3122 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */; 3124 /* NOTHING */;
@@ -3081,6 +3126,7 @@ int __init ip_rt_init(void)
3081 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3084#endif 3130#endif
3085 3131
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3091,36 +3137,19 @@ int __init ip_rt_init(void)
3091 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3093 3139
3094 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3095 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3097 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3098 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3099 3145 (27 - PAGE_SHIFT) :
3100 do { 3146 (29 - PAGE_SHIFT),
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3102 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3103 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3104 rt_hash_mask--; 3150 0);
3105 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3106 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124 3153
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf..e328945324 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
118 return 1; 118 return 1;
119} 119}
120 120
121static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
122 void __user *buffer, size_t *lenp, loff_t *ppos)
123{
124 char val[TCP_CA_NAME_MAX];
125 ctl_table tbl = {
126 .data = val,
127 .maxlen = TCP_CA_NAME_MAX,
128 };
129 int ret;
130
131 tcp_get_default_congestion_control(val);
132
133 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
134 if (write && ret == 0)
135 ret = tcp_set_default_congestion_control(val);
136 return ret;
137}
138
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
140 void __user *oldval, size_t __user *oldlenp,
141 void __user *newval, size_t newlen,
142 void **context)
143{
144 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = {
146 .data = val,
147 .maxlen = TCP_CA_NAME_MAX,
148 };
149 int ret;
150
151 tcp_get_default_congestion_control(val);
152 ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
153 context);
154 if (ret == 0 && newval && newlen)
155 ret = tcp_set_default_congestion_control(val);
156 return ret;
157}
158
159
121ctl_table ipv4_table[] = { 160ctl_table ipv4_table[] = {
122 { 161 {
123 .ctl_name = NET_IPV4_TCP_TIMESTAMPS, 162 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -612,70 +651,6 @@ ctl_table ipv4_table[] = {
612 .proc_handler = &proc_dointvec, 651 .proc_handler = &proc_dointvec,
613 }, 652 },
614 { 653 {
615 .ctl_name = NET_TCP_WESTWOOD,
616 .procname = "tcp_westwood",
617 .data = &sysctl_tcp_westwood,
618 .maxlen = sizeof(int),
619 .mode = 0644,
620 .proc_handler = &proc_dointvec,
621 },
622 {
623 .ctl_name = NET_TCP_VEGAS,
624 .procname = "tcp_vegas_cong_avoid",
625 .data = &sysctl_tcp_vegas_cong_avoid,
626 .maxlen = sizeof(int),
627 .mode = 0644,
628 .proc_handler = &proc_dointvec,
629 },
630 {
631 .ctl_name = NET_TCP_VEGAS_ALPHA,
632 .procname = "tcp_vegas_alpha",
633 .data = &sysctl_tcp_vegas_alpha,
634 .maxlen = sizeof(int),
635 .mode = 0644,
636 .proc_handler = &proc_dointvec,
637 },
638 {
639 .ctl_name = NET_TCP_VEGAS_BETA,
640 .procname = "tcp_vegas_beta",
641 .data = &sysctl_tcp_vegas_beta,
642 .maxlen = sizeof(int),
643 .mode = 0644,
644 .proc_handler = &proc_dointvec,
645 },
646 {
647 .ctl_name = NET_TCP_VEGAS_GAMMA,
648 .procname = "tcp_vegas_gamma",
649 .data = &sysctl_tcp_vegas_gamma,
650 .maxlen = sizeof(int),
651 .mode = 0644,
652 .proc_handler = &proc_dointvec,
653 },
654 {
655 .ctl_name = NET_TCP_BIC,
656 .procname = "tcp_bic",
657 .data = &sysctl_tcp_bic,
658 .maxlen = sizeof(int),
659 .mode = 0644,
660 .proc_handler = &proc_dointvec,
661 },
662 {
663 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
664 .procname = "tcp_bic_fast_convergence",
665 .data = &sysctl_tcp_bic_fast_convergence,
666 .maxlen = sizeof(int),
667 .mode = 0644,
668 .proc_handler = &proc_dointvec,
669 },
670 {
671 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
672 .procname = "tcp_bic_low_window",
673 .data = &sysctl_tcp_bic_low_window,
674 .maxlen = sizeof(int),
675 .mode = 0644,
676 .proc_handler = &proc_dointvec,
677 },
678 {
679 .ctl_name = NET_TCP_MODERATE_RCVBUF, 654 .ctl_name = NET_TCP_MODERATE_RCVBUF,
680 .procname = "tcp_moderate_rcvbuf", 655 .procname = "tcp_moderate_rcvbuf",
681 .data = &sysctl_tcp_moderate_rcvbuf, 656 .data = &sysctl_tcp_moderate_rcvbuf,
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
692 .proc_handler = &proc_dointvec, 667 .proc_handler = &proc_dointvec,
693 }, 668 },
694 { 669 {
695 .ctl_name = NET_TCP_BIC_BETA, 670 .ctl_name = NET_TCP_CONG_CONTROL,
696 .procname = "tcp_bic_beta", 671 .procname = "tcp_congestion_control",
697 .data = &sysctl_tcp_bic_beta,
698 .maxlen = sizeof(int),
699 .mode = 0644, 672 .mode = 0644,
700 .proc_handler = &proc_dointvec, 673 .maxlen = TCP_CA_NAME_MAX,
674 .proc_handler = &proc_tcp_congestion_control,
675 .strategy = &sysctl_tcp_congestion_control,
701 }, 676 },
677
702 { .ctl_name = 0 } 678 { .ctl_name = 0 }
703}; 679};
704 680
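Editor's note: all of the per-algorithm integer sysctls removed above (tcp_westwood, tcp_vegas_*, tcp_bic_*) collapse into one string sysctl, net.ipv4.tcp_congestion_control, whose handlers pass the name to tcp_set_default_congestion_control(). A standalone userspace sketch of reading the new knob (writing a registered name such as "bic" to the same file, as root, switches the system default):

#include <stdio.h>

int main(void)
{
	char name[16] = "";
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_congestion_control", "r");

	if (f && fscanf(f, "%15s", name) == 1)
		printf("default congestion control: %s\n", name);
	if (f)
		fclose(f);
	return 0;
}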
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd..69b1fcf700 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -584,7 +584,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
584 sk_charge_skb(sk, skb); 584 sk_charge_skb(sk, skb);
585 if (!sk->sk_send_head) 585 if (!sk->sk_send_head)
586 sk->sk_send_head = skb; 586 sk->sk_send_head = skb;
587 else if (tp->nonagle&TCP_NAGLE_PUSH) 587 if (tp->nonagle & TCP_NAGLE_PUSH)
588 tp->nonagle &= ~TCP_NAGLE_PUSH; 588 tp->nonagle &= ~TCP_NAGLE_PUSH;
589} 589}
590 590
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
1101 struct sk_buff *skb; 1105 struct sk_buff *skb;
1102 struct tcp_sock *tp = tcp_sk(sk); 1106 struct tcp_sock *tp = tcp_sk(sk);
1103 1107
1104 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); 1108 NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1105 1109
1106 /* RX process wants to run with disabled BHs, though it is not 1110 /* RX process wants to run with disabled BHs, though it is not
1107 * necessary */ 1111 * necessary */
@@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1365 * is not empty. It is more elegant, but eats cycles, 1369 * is not empty. It is more elegant, but eats cycles,
1366 * unfortunately. 1370 * unfortunately.
1367 */ 1371 */
1368 if (skb_queue_len(&tp->ucopy.prequeue)) 1372 if (!skb_queue_empty(&tp->ucopy.prequeue))
1369 goto do_prequeue; 1373 goto do_prequeue;
1370 1374
1371 /* __ Set realtime policy in scheduler __ */ 1375 /* __ Set realtime policy in scheduler __ */
@@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1390 } 1394 }
1391 1395
1392 if (tp->rcv_nxt == tp->copied_seq && 1396 if (tp->rcv_nxt == tp->copied_seq &&
1393 skb_queue_len(&tp->ucopy.prequeue)) { 1397 !skb_queue_empty(&tp->ucopy.prequeue)) {
1394do_prequeue: 1398do_prequeue:
1395 tcp_prequeue_process(sk); 1399 tcp_prequeue_process(sk);
1396 1400
@@ -1472,7 +1476,7 @@ skip_copy:
1472 } while (len > 0); 1476 } while (len > 0);
1473 1477
1474 if (user_recv) { 1478 if (user_recv) {
1475 if (skb_queue_len(&tp->ucopy.prequeue)) { 1479 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1476 int chunk; 1480 int chunk;
1477 1481
1478 tp->ucopy.len = copied > 0 ? len : 0; 1482 tp->ucopy.len = copied > 0 ? len : 0;
@@ -1927,6 +1931,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1927 return tp->af_specific->setsockopt(sk, level, optname, 1931 return tp->af_specific->setsockopt(sk, level, optname,
1928 optval, optlen); 1932 optval, optlen);
1929 1933
1934 /* This is a string value all the others are int's */
1935 if (optname == TCP_CONGESTION) {
1936 char name[TCP_CA_NAME_MAX];
1937
1938 if (optlen < 1)
1939 return -EINVAL;
1940
1941 val = strncpy_from_user(name, optval,
1942 min(TCP_CA_NAME_MAX-1, optlen));
1943 if (val < 0)
1944 return -EFAULT;
1945 name[val] = 0;
1946
1947 lock_sock(sk);
1948 err = tcp_set_congestion_control(tp, name);
1949 release_sock(sk);
1950 return err;
1951 }
1952
1930 if (optlen < sizeof(int)) 1953 if (optlen < sizeof(int))
1931 return -EINVAL; 1954 return -EINVAL;
1932 1955
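Editor's note: the setsockopt() hunk above adds a string-valued TCP socket option. TCP_CONGESTION takes an algorithm name and hands it to tcp_set_congestion_control() for that one socket. A standalone userspace sketch of using it (the fallback define of 13, the mainline value, is only needed where the libc headers predate this option):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	const char *alg = "bic";

	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, alg, strlen(alg)) < 0)
		perror("setsockopt(TCP_CONGESTION)");
	else
		printf("socket now uses %s\n", alg);
	close(fd);
	return 0;
}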
@@ -2109,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2109 2132
2110 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2111 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2112 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2113 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2114 2137
2115 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2159,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2159 2182
2160 switch (optname) { 2183 switch (optname) {
2161 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2162 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2163 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2164 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2165 break; 2188 break;
@@ -2211,6 +2234,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2211 case TCP_QUICKACK: 2234 case TCP_QUICKACK:
2212 val = !tp->ack.pingpong; 2235 val = !tp->ack.pingpong;
2213 break; 2236 break;
2237
2238 case TCP_CONGESTION:
2239 if (get_user(len, optlen))
2240 return -EFAULT;
2241 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 if (put_user(len, optlen))
2243 return -EFAULT;
2244 if (copy_to_user(optval, tp->ca_ops->name, len))
2245 return -EFAULT;
2246 return 0;
2214 default: 2247 default:
2215 return -ENOPROTOOPT; 2248 return -ENOPROTOOPT;
2216 }; 2249 };
@@ -2224,7 +2257,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2224 2257
2225 2258
2226extern void __skb_cb_too_small_for_tcp(int, int); 2259extern void __skb_cb_too_small_for_tcp(int, int);
2227extern void tcpdiag_init(void); 2260extern struct tcp_congestion_ops tcp_reno;
2228 2261
2229static __initdata unsigned long thash_entries; 2262static __initdata unsigned long thash_entries;
2230static int __init set_thash_entries(char *str) 2263static int __init set_thash_entries(char *str)
@@ -2333,6 +2366,8 @@ void __init tcp_init(void)
2333 printk(KERN_INFO "TCP: Hash tables configured " 2366 printk(KERN_INFO "TCP: Hash tables configured "
2334 "(established %d bind %d)\n", 2367 "(established %d bind %d)\n",
2335 tcp_ehash_size << 1, tcp_bhash_size); 2368 tcp_ehash_size << 1, tcp_bhash_size);
2369
2370 tcp_register_congestion_control(&tcp_reno);
2336} 2371}
2337 2372
2338EXPORT_SYMBOL(tcp_accept); 2373EXPORT_SYMBOL(tcp_accept);
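Editor's note: the tcp_sendmsg()/do_tcp_sendpages() hunks above stop filling skbs to exactly one MSS. They fill up to size_goal = tp->xmit_size_goal, which with TSO is several segments worth of data, and the push checks change from skb->len != mss_now to skb->len < mss_now accordingly. A standalone illustration of the kind of super-packet this produces (both numbers are assumed examples, not values from the patch):

#include <stdio.h>

int main(void)
{
	unsigned int mss_now = 1448;		/* assumed MSS */
	unsigned int size_goal = 64 * 1024 - 1;	/* assumed TSO send goal */

	/* illustrative: trim the goal to a whole number of MSS-sized segments */
	size_goal -= size_goal % mss_now;
	printf("each skb carries up to %u bytes (%u segments)\n",
	       size_goal, size_goal / mss_now);
	return 0;
}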
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 0000000000..ec38d45d66
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
1/*
2 * Binary Increase Congestion control for TCP
3 *
4 * This is from the implementation of BICTCP in
 5 * Lisong Xu, Khaled Harfoush, and Injong Rhee.
 6 * "Binary Increase Congestion Control for Fast, Long Distance
 7 * Networks" in IEEE INFOCOM 2004
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
10 *
11 * Unless BIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28
29static int fast_convergence = 1;
30static int max_increment = 32;
31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100;
36static int smooth_part = 20;
37
38module_param(fast_convergence, int, 0644);
39MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
40module_param(max_increment, int, 0644);
41MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
42module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative increase");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "if low utilization is detected for this period, go to low utilization mode (seconds)");
50module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644);
53MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
54
55
56/* BIC TCP Parameters */
57struct bictcp {
58 u32 cnt; /* increase cwnd by 1 after ACKs */
59 u32 last_max_cwnd; /* last maximum snd_cwnd */
60 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
71};
72
73static inline void bictcp_reset(struct bictcp *ca)
74{
75 ca->cnt = 0;
76 ca->last_max_cwnd = 0;
77 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0;
79 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87}
88
89static void bictcp_init(struct tcp_sock *tp)
90{
91 bictcp_reset(tcp_ca(tp));
92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh;
94}
95
96/*
97 * Compute congestion window to use.
98 */
99static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
100{
101 if (ca->last_cwnd == cwnd &&
102 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
103 return;
104
105 ca->last_cwnd = cwnd;
106 ca->last_time = tcp_time_stamp;
107
108 if (ca->epoch_start == 0) /* record the beginning of an epoch */
109 ca->epoch_start = tcp_time_stamp;
110
111 /* start off normal */
112 if (cwnd <= low_window) {
113 ca->cnt = cwnd;
114 return;
115 }
116
117 /* binary increase */
118 if (cwnd < ca->last_max_cwnd) {
119 __u32 dist = (ca->last_max_cwnd - cwnd)
120 / BICTCP_B;
121
122 if (dist > max_increment)
123 /* linear increase */
124 ca->cnt = cwnd / max_increment;
125 else if (dist <= 1U)
126 /* binary search increase */
127 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
128 else
129 /* binary search increase */
130 ca->cnt = cwnd / dist;
131 } else {
132		/* slow start AND linear increase */
133 if (cwnd < ca->last_max_cwnd + BICTCP_B)
134 /* slow start */
135 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
136 else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
137 /* slow start */
138 ca->cnt = (cwnd * (BICTCP_B-1))
139				/ (cwnd - ca->last_max_cwnd);
140 else
141 /* linear increase */
142 ca->cnt = cwnd / max_increment;
143 }
144
145 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 ||
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20;
150 }
151
152 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
153 if (ca->cnt == 0) /* cannot be zero */
154 ca->cnt = 1;
155}
156
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
160{
161 struct bictcp *ca = tcp_ca(tp);
162 u32 dist, delay;
163
164 /* No time stamp */
165 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
166 /* Discard delay samples right after fast recovery */
167 tcp_time_stamp < ca->epoch_start + HZ ||
168 /* this delay samples may not be accurate */
169 flag == 0) {
170 ca->last_delay = 0;
171 goto notlow;
172 }
173
174 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
175 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
176 if (delay == 0) /* no previous delay sample */
177 goto notlow;
178
179 /* first time call or link delay decreases */
180 if (ca->delay_min == 0 || ca->delay_min > delay) {
181 ca->delay_min = ca->delay_max = delay;
182 goto notlow;
183 }
184
185 if (ca->delay_max < delay)
186 ca->delay_max = delay;
187
188 /* utilization is low, if avg delay < dist*threshold
189 for checking_period time */
190 dist = ca->delay_max - ca->delay_min;
191 if (dist <= ca->delay_min>>6 ||
192 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
193 goto notlow;
194
195 if (ca->low_utilization_start == 0) {
196 ca->low_utilization = 0;
197 ca->low_utilization_start = tcp_time_stamp;
198 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
199 > low_utilization_period*HZ) {
200 ca->low_utilization = 1;
201 }
202
203 return;
204
205 notlow:
206 ca->low_utilization = 0;
207 ca->low_utilization_start = 0;
208
209}
210
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked)
213{
214 struct bictcp *ca = tcp_ca(tp);
215
216 bictcp_low_utilization(tp, data_acked);
217
218 if (in_flight < tp->snd_cwnd)
219 return;
220
221 if (tp->snd_cwnd <= tp->snd_ssthresh) {
222 /* In "safe" area, increase. */
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 } else {
226 bictcp_update(ca, tp->snd_cwnd);
227
228 /* In dangerous area, increase slowly.
229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
230 */
231 if (tp->snd_cwnd_cnt >= ca->cnt) {
232 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
233 tp->snd_cwnd++;
234 tp->snd_cwnd_cnt = 0;
235 } else
236 tp->snd_cwnd_cnt++;
237 }
238
239}
240
241/*
242 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly
244 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
246{
247 struct bictcp *ca = tcp_ca(tp);
248
249 ca->epoch_start = 0; /* end of epoch */
250
251 /* in case of wrong delay_max*/
252 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
253 ca->delay_max = ca->delay_min
254 + ((ca->delay_max - ca->delay_min)* 90) / 100;
255
256 /* Wmax and fast convergence */
257 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
258 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
259 / (2 * BICTCP_BETA_SCALE);
260 else
261 ca->last_max_cwnd = tp->snd_cwnd;
262
263 ca->loss_cwnd = tp->snd_cwnd;
264
265
266 if (tp->snd_cwnd <= low_window)
267 return max(tp->snd_cwnd >> 1U, 2U);
268 else
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270}
271
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
273{
274 struct bictcp *ca = tcp_ca(tp);
275
276 return max(tp->snd_cwnd, ca->last_max_cwnd);
277}
278
279static u32 bictcp_min_cwnd(struct tcp_sock *tp)
280{
281 return tp->snd_ssthresh;
282}
283
284static void bictcp_state(struct tcp_sock *tp, u8 new_state)
285{
286 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp));
288}
289
290/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16
292 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
294{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
296 struct bictcp *ca = tcp_ca(tp);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt;
299 }
300}
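
To see what this fixed-point update does, assume ACK_RATIO_SHIFT is 4 (an assumption here; the actual constant is defined earlier in tcp_bic.c and may differ), so delayed_ack stores 16 times the average number of packets covered per ACK and the update is exactly the (15*ratio + sample)/16 filter from the comment. A small standalone sketch, not part of the patch:

    #include <stdio.h>

    /* Models the EWMA in bictcp_acked(): the unsigned wrap-around in
     * "cnt -= avg; delayed_ack += cnt" still yields avg + (sample - avg). */
    int main(void)
    {
        unsigned int shift = 4;                       /* assumed ACK_RATIO_SHIFT */
        unsigned int delayed_ack = 2 << shift;        /* start at 2 pkts per ACK */
        unsigned int samples[] = { 1, 1, 2, 2, 2, 2 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            unsigned int cnt = samples[i];

            cnt -= delayed_ack >> shift;   /* sample minus current average */
            delayed_ack += cnt;            /* (15*avg + sample)/16, scaled <<4 */
            printf("scaled avg = %u\n", delayed_ack);
        }
        return 0;
    }
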
301
302
303static struct tcp_congestion_ops bictcp = {
304 .init = bictcp_init,
305 .ssthresh = bictcp_recalc_ssthresh,
306 .cong_avoid = bictcp_cong_avoid,
307 .set_state = bictcp_state,
308 .undo_cwnd = bictcp_undo_cwnd,
309 .min_cwnd = bictcp_min_cwnd,
310 .pkts_acked = bictcp_acked,
311 .owner = THIS_MODULE,
312 .name = "bic",
313};
314
315static int __init bictcp_register(void)
316{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp);
319}
320
321static void __exit bictcp_unregister(void)
322{
323 tcp_unregister_congestion_control(&bictcp);
324}
325
326module_init(bictcp_register);
327module_exit(bictcp_unregister);
328
329MODULE_AUTHOR("Stephen Hemminger");
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 0000000000..4970d10a77
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,237 @@
1/*
2 * Pluggable TCP congestion control support and NewReno
3 * congestion control.
4 * Based on ideas from I/O scheduler support and Web100.
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/mm.h>
12#include <linux/types.h>
13#include <linux/list.h>
14#include <net/tcp.h>
15
16static DEFINE_SPINLOCK(tcp_cong_list_lock);
17static LIST_HEAD(tcp_cong_list);
18
19/* Simple linear search, don't expect many entries! */
20static struct tcp_congestion_ops *tcp_ca_find(const char *name)
21{
22 struct tcp_congestion_ops *e;
23
24 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
25 if (strcmp(e->name, name) == 0)
26 return e;
27 }
28
29 return NULL;
30}
31
32/*
33 * Attach a new congestion control algorithm to the list
34 * of available options.
35 */
36int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
37{
38 int ret = 0;
39
40	/* all algorithms must implement ssthresh, cong_avoid and min_cwnd ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name);
44 return -EINVAL;
45 }
46
47 spin_lock(&tcp_cong_list_lock);
48 if (tcp_ca_find(ca->name)) {
49 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
50 ret = -EEXIST;
51 } else {
52 list_add_rcu(&ca->list, &tcp_cong_list);
53 printk(KERN_INFO "TCP %s registered\n", ca->name);
54 }
55 spin_unlock(&tcp_cong_list_lock);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
60
61/*
62 * Remove congestion control algorithm, called from
63 * the module's remove function. Module ref counts are used
64 * to ensure that this can't be done till all sockets using
65 * that method are closed.
66 */
67void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
68{
69 spin_lock(&tcp_cong_list_lock);
70 list_del_rcu(&ca->list);
71 spin_unlock(&tcp_cong_list_lock);
72}
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74
75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp)
77{
78 struct tcp_congestion_ops *ca;
79
80 if (tp->ca_ops != &tcp_init_congestion_ops)
81 return;
82
83 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca;
87 break;
88 }
89
90 }
91 rcu_read_unlock();
92
93 if (tp->ca_ops->init)
94 tp->ca_ops->init(tp);
95}
96
97/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp)
99{
100 if (tp->ca_ops->release)
101 tp->ca_ops->release(tp);
102 module_put(tp->ca_ops->owner);
103}
104
105/* Used by sysctl to change default congestion control */
106int tcp_set_default_congestion_control(const char *name)
107{
108 struct tcp_congestion_ops *ca;
109 int ret = -ENOENT;
110
111 spin_lock(&tcp_cong_list_lock);
112 ca = tcp_ca_find(name);
113#ifdef CONFIG_KMOD
114 if (!ca) {
115 spin_unlock(&tcp_cong_list_lock);
116
117 request_module("tcp_%s", name);
118 spin_lock(&tcp_cong_list_lock);
119 ca = tcp_ca_find(name);
120 }
121#endif
122
123 if (ca) {
124 list_move(&ca->list, &tcp_cong_list);
125 ret = 0;
126 }
127 spin_unlock(&tcp_cong_list_lock);
128
129 return ret;
130}
131
132/* Get current default congestion control */
133void tcp_get_default_congestion_control(char *name)
134{
135 struct tcp_congestion_ops *ca;
136 /* We will always have reno... */
137 BUG_ON(list_empty(&tcp_cong_list));
138
139 rcu_read_lock();
140 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
141 strncpy(name, ca->name, TCP_CA_NAME_MAX);
142 rcu_read_unlock();
143}
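
Two details in the two functions above are easy to miss: the head of tcp_cong_list is, by definition, the default algorithm, and setting a new default is nothing more than list_move()-ing the chosen entry to the front (after an optional request_module() when CONFIG_KMOD is set). A user-space model of that move-to-front convention, purely illustrative and with a made-up algorithm array:

    #include <stdio.h>
    #include <string.h>

    #define NALGS 4

    /* Models "the first list entry is the default" from tcp_cong.c. */
    static const char *algs[NALGS] = { "reno", "bic", "htcp", "hybla" };

    static int set_default(const char *name)
    {
        int i;

        for (i = 0; i < NALGS; i++) {
            if (strcmp(algs[i], name) == 0) {
                const char *found = algs[i];

                /* move-to-front, as list_move() does in the kernel */
                memmove(&algs[1], &algs[0], i * sizeof(algs[0]));
                algs[0] = found;
                return 0;
            }
        }
        return -1;   /* stands in for -ENOENT */
    }

    int main(void)
    {
        set_default("htcp");
        printf("default: %s\n", algs[0]);   /* prints "default: htcp" */
        return 0;
    }
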
144
145/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
147{
148 struct tcp_congestion_ops *ca;
149 int err = 0;
150
151 rcu_read_lock();
152 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops)
154 goto out;
155
156 if (!ca)
157 err = -ENOENT;
158
159 else if (!try_module_get(ca->owner))
160 err = -EBUSY;
161
162 else {
163 tcp_cleanup_congestion_control(tp);
164 tp->ca_ops = ca;
165 if (tp->ca_ops->init)
166 tp->ca_ops->init(tp);
167 }
168 out:
169 rcu_read_unlock();
170 return err;
171}
172
173/*
174 * TCP Reno congestion control
175 * This is a special case, used for fallback as well.
176 */
177/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328.
179 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
181 int flag)
182{
183 if (in_flight < tp->snd_cwnd)
184 return;
185
186 if (tp->snd_cwnd <= tp->snd_ssthresh) {
187 /* In "safe" area, increase. */
188 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
189 tp->snd_cwnd++;
190 } else {
191 /* In dangerous area, increase slowly.
192 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
193 */
194 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
195 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
196 tp->snd_cwnd++;
197 tp->snd_cwnd_cnt = 0;
198 } else
199 tp->snd_cwnd_cnt++;
200 }
201}
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
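
The snd_cwnd_cnt counter is what turns "cwnd += 1/cwnd" into integer arithmetic: one extra segment is added only after roughly a full window's worth of such increments has accumulated, which is about once per round trip. A standalone sketch of that accounting, not part of the patch:

    #include <stdio.h>

    /* Models the congestion-avoidance branch of tcp_reno_cong_avoid(). */
    int main(void)
    {
        unsigned int cwnd = 10, cwnd_cnt = 0, clamp = 0xffff;
        unsigned int i;

        for (i = 0; i < 30; i++) {            /* 30 qualifying ACK events */
            if (cwnd_cnt >= cwnd) {
                if (cwnd < clamp)
                    cwnd++;
                cwnd_cnt = 0;
            } else {
                cwnd_cnt++;
            }
        }
        printf("cwnd = %u\n", cwnd);          /* 12: about +1 per cwnd ACKs */
        return 0;
    }
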
203
204/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp)
206{
207 return max(tp->snd_cwnd >> 1U, 2U);
208}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210
211/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
213{
214 return tp->snd_ssthresh/2;
215}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
217
218struct tcp_congestion_ops tcp_reno = {
219 .name = "reno",
220 .owner = THIS_MODULE,
221 .ssthresh = tcp_reno_ssthresh,
222 .cong_avoid = tcp_reno_cong_avoid,
223 .min_cwnd = tcp_reno_min_cwnd,
224};
225
226/* Initial congestion control used (until SYN)
227 * really Reno under another name so we can tell the difference
228 * during tcp_set_default_congestion_control
229 */
230struct tcp_congestion_ops tcp_init_congestion_ops = {
231 .name = "",
232 .owner = THIS_MODULE,
233 .ssthresh = tcp_reno_ssthresh,
234 .cong_avoid = tcp_reno_cong_avoid,
235 .min_cwnd = tcp_reno_min_cwnd,
236};
237EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 634befc079..f66945cb15 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,15 +42,8 @@ struct tcpdiag_entry
42 42
43static struct sock *tcpnl; 43static struct sock *tcpnl;
44 44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 45#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \ 46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54 47
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 49 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
61 struct nlmsghdr *nlh; 54 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL; 55 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL; 56 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail; 57 unsigned char *b = skb->tail;
66 58
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); 59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
73 if (ext & (1<<(TCPDIAG_INFO-1))) 65 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); 66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75 67
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) 68 if (ext & (1<<(TCPDIAG_CONG-1))) {
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) 69 size_t len = strlen(tp->ca_ops->name);
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); 70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
79 } 73 }
80 r->tcpdiag_family = sk->sk_family; 74 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state; 75 r->tcpdiag_state = sk->sk_state;
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
166 if (info) 160 if (info)
167 tcp_get_info(sk, info); 161 tcp_get_info(sk, info);
168 162
169 if (vinfo) { 163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
170 if (tcp_is_vegas(tp)) { 164 tp->ca_ops->get_info(tp, ext, skb);
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182 165
183 nlh->nlmsg_len = skb->tail - b; 166 nlh->nlmsg_len = skb->tail - b;
184 return skb->len; 167 return skb->len;
185 168
169rtattr_failure:
186nlmsg_failure: 170nlmsg_failure:
187 skb_trim(skb, b - skb->data); 171 skb_trim(skb, b - skb->data);
188 return -1; 172 return -1;
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 0000000000..36c51f8136
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
1/*
2 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
3 *
4 * See http://www.icir.org/floyd/hstcp.html
5 *
6 * John Heffner <jheffner@psc.edu>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <net/tcp.h>
12
13
14/* From AIMD tables from RFC 3649 appendix B,
15 * with fixed-point MD scaled <<8.
16 */
17static const struct hstcp_aimd_val {
18 unsigned int cwnd;
19 unsigned int md;
20} hstcp_aimd_vals[] = {
21 { 38, 128, /* 0.50 */ },
22 { 118, 112, /* 0.44 */ },
23 { 221, 104, /* 0.41 */ },
24 { 347, 98, /* 0.38 */ },
25 { 495, 93, /* 0.37 */ },
26 { 663, 89, /* 0.35 */ },
27 { 851, 86, /* 0.34 */ },
28 { 1058, 83, /* 0.33 */ },
29 { 1284, 81, /* 0.32 */ },
30 { 1529, 78, /* 0.31 */ },
31 { 1793, 76, /* 0.30 */ },
32 { 2076, 74, /* 0.29 */ },
33 { 2378, 72, /* 0.28 */ },
34 { 2699, 71, /* 0.28 */ },
35 { 3039, 69, /* 0.27 */ },
36 { 3399, 68, /* 0.27 */ },
37 { 3778, 66, /* 0.26 */ },
38 { 4177, 65, /* 0.26 */ },
39 { 4596, 64, /* 0.25 */ },
40 { 5036, 62, /* 0.25 */ },
41 { 5497, 61, /* 0.24 */ },
42 { 5979, 60, /* 0.24 */ },
43 { 6483, 59, /* 0.23 */ },
44 { 7009, 58, /* 0.23 */ },
45 { 7558, 57, /* 0.22 */ },
46 { 8130, 56, /* 0.22 */ },
47 { 8726, 55, /* 0.22 */ },
48 { 9346, 54, /* 0.21 */ },
49 { 9991, 53, /* 0.21 */ },
50 { 10661, 52, /* 0.21 */ },
51 { 11358, 52, /* 0.20 */ },
52 { 12082, 51, /* 0.20 */ },
53 { 12834, 50, /* 0.20 */ },
54 { 13614, 49, /* 0.19 */ },
55 { 14424, 48, /* 0.19 */ },
56 { 15265, 48, /* 0.19 */ },
57 { 16137, 47, /* 0.19 */ },
58 { 17042, 46, /* 0.18 */ },
59 { 17981, 45, /* 0.18 */ },
60 { 18955, 45, /* 0.18 */ },
61 { 19965, 44, /* 0.17 */ },
62 { 21013, 43, /* 0.17 */ },
63 { 22101, 43, /* 0.17 */ },
64 { 23230, 42, /* 0.17 */ },
65 { 24402, 41, /* 0.16 */ },
66 { 25618, 41, /* 0.16 */ },
67 { 26881, 40, /* 0.16 */ },
68 { 28193, 39, /* 0.16 */ },
69 { 29557, 39, /* 0.15 */ },
70 { 30975, 38, /* 0.15 */ },
71 { 32450, 38, /* 0.15 */ },
72 { 33986, 37, /* 0.15 */ },
73 { 35586, 36, /* 0.14 */ },
74 { 37253, 36, /* 0.14 */ },
75 { 38992, 35, /* 0.14 */ },
76 { 40808, 35, /* 0.14 */ },
77 { 42707, 34, /* 0.13 */ },
78 { 44694, 33, /* 0.13 */ },
79 { 46776, 33, /* 0.13 */ },
80 { 48961, 32, /* 0.13 */ },
81 { 51258, 32, /* 0.13 */ },
82 { 53677, 31, /* 0.12 */ },
83 { 56230, 30, /* 0.12 */ },
84 { 58932, 30, /* 0.12 */ },
85 { 61799, 29, /* 0.12 */ },
86 { 64851, 28, /* 0.11 */ },
87 { 68113, 28, /* 0.11 */ },
88 { 71617, 27, /* 0.11 */ },
89 { 75401, 26, /* 0.10 */ },
90 { 79517, 26, /* 0.10 */ },
91 { 84035, 25, /* 0.10 */ },
92 { 89053, 24, /* 0.10 */ },
93};
94
95#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
96
97struct hstcp {
98 u32 ai;
99};
100
101static void hstcp_init(struct tcp_sock *tp)
102{
103 struct hstcp *ca = tcp_ca(tp);
104
105 ca->ai = 0;
106
107 /* Ensure the MD arithmetic works. This is somewhat pedantic,
108 * since I don't think we will see a cwnd this large. :) */
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110}
111
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
113 u32 in_flight, int good)
114{
115 struct hstcp *ca = tcp_ca(tp);
116
117 if (in_flight < tp->snd_cwnd)
118 return;
119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) {
121 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
122 tp->snd_cwnd++;
123 } else {
124 /* Update AIMD parameters */
125 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
126 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
127			       ca->ai < HSTCP_AIMD_MAX - 1)
128 ca->ai++;
129		} else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
130			while (ca->ai &&
131			       tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
132				ca->ai--;
133 }
134
135 /* Do additive increase */
136 if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
137 tp->snd_cwnd_cnt += ca->ai;
138 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
139 tp->snd_cwnd++;
140 tp->snd_cwnd_cnt -= tp->snd_cwnd;
141 }
142 }
143 }
144}
145
146static u32 hstcp_ssthresh(struct tcp_sock *tp)
147{
148 struct hstcp *ca = tcp_ca(tp);
149
150 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
152}
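
The md column is the multiplicative-decrease factor scaled by 256, so a row such as { 1058, 83 } cuts the window by 1058*83/256, about 343 segments, a decrease of roughly 0.33 as the table comment says; the clamp in hstcp_init() guarantees cwnd*md cannot overflow 32 bits. A standalone check of that arithmetic, reusing three rows from the table above (illustration only):

    #include <stdio.h>

    /* Reproduces the hstcp_ssthresh() arithmetic for a few RFC 3649 rows. */
    int main(void)
    {
        static const struct { unsigned int cwnd, md; } rows[] = {
            { 38, 128 }, { 1058, 83 }, { 89053, 24 },
        };
        unsigned int i;

        for (i = 0; i < sizeof(rows) / sizeof(rows[0]); i++) {
            unsigned int cwnd = rows[i].cwnd;
            unsigned int decr = (cwnd * rows[i].md) >> 8;

            printf("cwnd %6u -> ssthresh %6u (cut by %u)\n",
                   cwnd, cwnd - decr, decr);
        }
        return 0;
    }
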
153
154
155static struct tcp_congestion_ops tcp_highspeed = {
156 .init = hstcp_init,
157 .ssthresh = hstcp_ssthresh,
158 .cong_avoid = hstcp_cong_avoid,
159 .min_cwnd = tcp_reno_min_cwnd,
160
161 .owner = THIS_MODULE,
162 .name = "highspeed"
163};
164
165static int __init hstcp_register(void)
166{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed);
169}
170
171static void __exit hstcp_unregister(void)
172{
173 tcp_unregister_congestion_control(&tcp_highspeed);
174}
175
176module_init(hstcp_register);
177module_exit(hstcp_unregister);
178
179MODULE_AUTHOR("John Heffner");
180MODULE_LICENSE("GPL");
181MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 0000000000..40168275ac
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
1/*
2 * H-TCP congestion control. The algorithm is detailed in:
3 * R.N.Shorten, D.J.Leith:
4 * "H-TCP: TCP for high-speed and long-distance networks"
5 * Proc. PFLDnet, Argonne, 2004.
6 * http://www.hamilton.ie/net/htcp3.pdf
7 */
8
9#include <linux/config.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <net/tcp.h>
13
14#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
15#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
16#define BETA_MAX 102 /* 0.8 with shift << 7 */
17
18static int use_rtt_scaling = 1;
19module_param(use_rtt_scaling, int, 0644);
20MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
21
22static int use_bandwidth_switch = 1;
23module_param(use_bandwidth_switch, int, 0644);
24MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
25
26struct htcp {
27 u16 alpha; /* Fixed point arith, << 7 */
28 u8 beta; /* Fixed point arith, << 7 */
29 u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */
30 u8 ccount; /* Number of RTTs since last congestion event */
31 u8 undo_ccount;
32 u16 packetcount;
33 u32 minRTT;
34 u32 maxRTT;
35 u32 snd_cwnd_cnt2;
36
37 u32 undo_maxRTT;
38 u32 undo_old_maxB;
39
40 /* Bandwidth estimation */
41 u32 minB;
42 u32 maxB;
43 u32 old_maxB;
44 u32 Bi;
45 u32 lasttime;
46};
47
48static inline void htcp_reset(struct htcp *ca)
49{
50 ca->undo_ccount = ca->ccount;
51 ca->undo_maxRTT = ca->maxRTT;
52 ca->undo_old_maxB = ca->old_maxB;
53
54 ca->ccount = 0;
55 ca->snd_cwnd_cnt2 = 0;
56}
57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp)
59{
60 struct htcp *ca = tcp_ca(tp);
61 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65}
66
67static inline void measure_rtt(struct tcp_sock *tp)
68{
69 struct htcp *ca = tcp_ca(tp);
70 u32 srtt = tp->srtt>>3;
71
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */
73 if (ca->minRTT > srtt || !ca->minRTT)
74 ca->minRTT = srtt;
75
76 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
81 ca->maxRTT = srtt;
82 }
83}
84
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
86{
87 struct htcp *ca = tcp_ca(tp);
88 u32 now = tcp_time_stamp;
89
90 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0;
93 ca->lasttime = now;
94 return;
95 }
96
97 ca->packetcount += pkts_acked;
98
99 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
100 && now - ca->lasttime >= ca->minRTT
101 && ca->minRTT > 0) {
102 __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
103 if (ca->ccount <= 3) {
104 /* just after backoff */
105 ca->minB = ca->maxB = ca->Bi = cur_Bi;
106 } else {
107 ca->Bi = (3*ca->Bi + cur_Bi)/4;
108 if (ca->Bi > ca->maxB)
109 ca->maxB = ca->Bi;
110 if (ca->minB > ca->maxB)
111 ca->minB = ca->maxB;
112 }
113 ca->packetcount = 0;
114 ca->lasttime = now;
115 }
116}
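
Once at least a window's worth of packets has been acknowledged and at least one minRTT has elapsed, the achieved rate is computed in packets per second (packetcount * HZ / elapsed jiffies) and folded into Bi with a 3/4 old, 1/4 new moving average; right after a backoff (ccount <= 3) the estimate is simply reseeded. A small numerical sketch of that filter, not part of the patch and assuming HZ = 1000:

    #include <stdio.h>

    /* Models the bandwidth estimator in measure_achieved_throughput(). */
    int main(void)
    {
        const unsigned int HZ = 1000;          /* assumed tick rate */
        unsigned int Bi = 0, maxB = 0, i;
        unsigned int samples[][2] = {          /* { packets acked, elapsed jiffies } */
            { 100, 50 }, { 120, 50 }, { 200, 50 },
        };

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            unsigned int cur_Bi = samples[i][0] * HZ / samples[i][1];

            Bi = i ? (3 * Bi + cur_Bi) / 4 : cur_Bi;   /* first sample reseeds */
            if (Bi > maxB)
                maxB = Bi;
            printf("cur %4u pkt/s, smoothed %4u, max %4u\n", cur_Bi, Bi, maxB);
        }
        return 0;
    }
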
117
118static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
119{
120 if (use_bandwidth_switch) {
121 u32 maxB = ca->maxB;
122 u32 old_maxB = ca->old_maxB;
123 ca->old_maxB = ca->maxB;
124
125 if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
126 ca->beta = BETA_MIN;
127 ca->modeswitch = 0;
128 return;
129 }
130 }
131
132 if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
133 ca->beta = (minRTT<<7)/maxRTT;
134 if (ca->beta < BETA_MIN)
135 ca->beta = BETA_MIN;
136 else if (ca->beta > BETA_MAX)
137 ca->beta = BETA_MAX;
138 } else {
139 ca->beta = BETA_MIN;
140 ca->modeswitch = 1;
141 }
142}
143
144static inline void htcp_alpha_update(struct htcp *ca)
145{
146 u32 minRTT = ca->minRTT;
147 u32 factor = 1;
148 u32 diff = ca->ccount * minRTT; /* time since last backoff */
149
150 if (diff > HZ) {
151 diff -= HZ;
152 factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
153 }
154
155 if (use_rtt_scaling && minRTT) {
156 u32 scale = (HZ<<3)/(10*minRTT);
157 scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
158 factor = (factor<<3)/scale;
159 if (!factor)
160 factor = 1;
161 }
162
163 ca->alpha = 2*factor*((1<<7)-ca->beta);
164 if (!ca->alpha)
165 ca->alpha = ALPHA_BASE;
166}
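
Alpha and beta are both <<7 fixed point, so 128 stands for 1.0: with beta at BETA_MIN (64, i.e. 0.5) and factor 1, the formula gives alpha = 2*1*(128-64) = 128, which is exactly Reno's one segment per RTT, and factor (hence alpha) then grows quadratically with the time since the last backoff. A standalone sketch of that growth, not part of the patch, assuming HZ = 1000 and with the RTT-scaling branch left out:

    #include <stdio.h>

    /* Mirrors htcp_alpha_update() with use_rtt_scaling disabled. */
    int main(void)
    {
        const unsigned int HZ = 1000;   /* assumed tick rate */
        unsigned int beta = 64;         /* BETA_MIN, i.e. 0.5 in <<7 fixed point */
        unsigned int secs;

        for (secs = 0; secs <= 4; secs++) {
            unsigned int diff = secs * HZ;   /* ccount * minRTT, in jiffies */
            unsigned int factor = 1;

            if (diff > HZ) {
                diff -= HZ;
                factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
            }
            /* alpha/128 is the additive increase in segments per RTT */
            printf("%us since backoff: alpha = %u (%u seg/RTT)\n",
                   secs, 2 * factor * (128 - beta),
                   2 * factor * (128 - beta) / 128);
        }
        return 0;
    }
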
167
168/* After we have the rtt data to calculate beta, we'd still prefer to wait one
169 * rtt before we adjust our beta to ensure we are working from a consistent
170 * data.
171 *
172 * This function should be called when we hit a congestion event since only at
173 * that point do we really have a real sense of maxRTT (the queues en route
174 * were just getting too full).
175 */
176static void htcp_param_update(struct tcp_sock *tp)
177{
178 struct htcp *ca = tcp_ca(tp);
179 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT;
181
182 htcp_beta_update(ca, minRTT, maxRTT);
183 htcp_alpha_update(ca);
184
185 /* add slowly fading memory for maxRTT to accommodate routing changes etc */
186 if (minRTT > 0 && maxRTT > minRTT)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188}
189
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
191{
192 struct htcp *ca = tcp_ca(tp);
193 htcp_param_update(tp);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195}
196
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked)
199{
200 struct htcp *ca = tcp_ca(tp);
201
202 if (in_flight < tp->snd_cwnd)
203 return;
204
205 if (tp->snd_cwnd <= tp->snd_ssthresh) {
206 /* In "safe" area, increase. */
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++;
209 } else {
210 measure_rtt(tp);
211
212 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
214 ca->ccount++;
215 ca->snd_cwnd_cnt2 = 0;
216 htcp_alpha_update(ca);
217 }
218
219 /* In dangerous area, increase slowly.
220 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
221 */
222 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 tp->snd_cwnd_cnt = 0;
226 ca->ccount++;
227 }
228 }
229}
230
231/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp)
233{
234 return tp->snd_ssthresh;
235}
236
237
238static void htcp_init(struct tcp_sock *tp)
239{
240 struct htcp *ca = tcp_ca(tp);
241
242 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN;
245}
246
247static void htcp_state(struct tcp_sock *tp, u8 new_state)
248{
249 switch (new_state) {
250 case TCP_CA_CWR:
251 case TCP_CA_Recovery:
252 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp));
254 break;
255 }
256}
257
258static struct tcp_congestion_ops htcp = {
259 .init = htcp_init,
260 .ssthresh = htcp_recalc_ssthresh,
261 .min_cwnd = htcp_min_cwnd,
262 .cong_avoid = htcp_cong_avoid,
263 .set_state = htcp_state,
264 .undo_cwnd = htcp_cwnd_undo,
265 .pkts_acked = measure_achieved_throughput,
266 .owner = THIS_MODULE,
267 .name = "htcp",
268};
269
270static int __init htcp_register(void)
271{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL;
276 return tcp_register_congestion_control(&htcp);
277}
278
279static void __exit htcp_unregister(void)
280{
281 tcp_unregister_congestion_control(&htcp);
282}
283
284module_init(htcp_register);
285module_exit(htcp_unregister);
286
287MODULE_AUTHOR("Baruch Even");
288MODULE_LICENSE("GPL");
289MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 0000000000..13a66342c3
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
1/*
2 * TCP HYBLA
3 *
4 * TCP-HYBLA Congestion control algorithm, based on:
5 * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
6 * for Heterogeneous Networks",
7 * International Journal on satellite Communications,
8 * September 2004
9 * Daniele Lacamera
10 * root at danielinux.net
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <net/tcp.h>
16
17/* Tcp Hybla structure. */
18struct hybla {
19 u8 hybla_en;
20 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
21 u32 rho; /* Rho parameter, integer part */
22 u32 rho2; /* Rho * Rho, integer part */
23 u32 rho_3ls; /* Rho parameter, <<3 */
24 u32 rho2_7ls; /* Rho^2, <<7 */
25 u32 minrtt; /* Minimum smoothed round trip time value seen */
26};
27
28/* Hybla reference round trip time (default = 1/40 sec = 25 ms),
29   expressed in milliseconds */
30static int rtt0 = 25;
31module_param(rtt0, int, 0644);
32MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33
34
35/* This is called to refresh values for hybla parameters */
36static inline void hybla_recalc_param (struct tcp_sock *tp)
37{
38 struct hybla *ca = tcp_ca(tp);
39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7;
44}
45
46static void hybla_init(struct tcp_sock *tp)
47{
48 struct hybla *ca = tcp_ca(tp);
49
50 ca->rho = 0;
51 ca->rho2 = 0;
52 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1;
56 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535;
58
59 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp);
61
62 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho;
65}
66
67static void hybla_state(struct tcp_sock *tp, u8 ca_state)
68{
69 struct hybla *ca = tcp_ca(tp);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open);
72}
73
74static inline u32 hybla_fraction(u32 odds)
75{
76 static const u32 fractions[] = {
77 128, 139, 152, 165, 181, 197, 215, 234,
78 };
79
80 return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
81}
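
The fractions[] table is 2^(i/8) in the same <<7 fixed point (128 * 2^(4/8) is about 181, 128 * 2^(7/8) about 234), so (1 << rho) * hybla_fraction(rho_fractions) reconstructs 2^rho from its integer part and its 3-bit fractional part. A short check of the table against the closed form (illustration only; link with -lm):

    #include <stdio.h>
    #include <math.h>

    /* Compares hybla_fraction()'s table with 128 * 2^(i/8). */
    int main(void)
    {
        static const unsigned int fractions[] = {
            128, 139, 152, 165, 181, 197, 215, 234,
        };
        unsigned int i;

        for (i = 0; i < 8; i++)
            printf("i=%u  table=%3u  128*2^(i/8)=%.1f\n",
                   i, fractions[i], 128.0 * pow(2.0, i / 8.0));
        return 0;
    }
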
82
83/* TCP Hybla main routine.
84 * This is the algorithm behavior:
85 * o Recalc Hybla parameters if min_rtt has changed
86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1
88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
90 u32 in_flight, int flag)
91{
92 struct hybla *ca = tcp_ca(tp);
93 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0;
95
96 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp);
99 ca->minrtt = tp->srtt;
100 }
101
102 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
104
105 if (in_flight < tp->snd_cwnd)
106 return;
107
108 if (ca->rho == 0)
109 hybla_recalc_param(tp);
110
111 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112
113 if (tp->snd_cwnd < tp->snd_ssthresh) {
114 /*
115 * slow start
116 * INC = 2^RHO - 1
117 * This is done by splitting the rho parameter
118 * into 2 parts: an integer part and a fraction part.
119		 * Increment<<7 is estimated by doing:
120		 *	[2^(int+fract)]<<7
121		 * that is equal to:
122		 *	(2^int)	*  [(2^fract) <<7]
123		 * 2^int is computed directly as 1<<int,
124		 * while we use hybla_fraction() to
125 * calculate 2^fract in a <<7 value.
126 */
127 is_slowstart = 1;
128 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
129 - 128;
130 } else {
131 /*
132 * congestion avoidance
133 * INC = RHO^2 / W
134		 * as long as the increment is computed as (rho^2<<7)/window
135		 * it is already <<7 and we can easily count its fractions.
136 */
137 increment = ca->rho2_7ls / tp->snd_cwnd;
138 if (increment < 128)
139 tp->snd_cwnd_cnt++;
140 }
141
142 odd = increment % 128;
143 tp->snd_cwnd += increment >> 7;
144 ca->snd_cwnd_cents += odd;
145
146	/* when the accumulated fraction reaches >= 128, increase cwnd by 1. */
147 while(ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0;
151 }
152
153 /* clamp down slowstart cwnd to ssthresh value. */
154 if (is_slowstart)
155 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
156
157 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
158}
159
160static struct tcp_congestion_ops tcp_hybla = {
161 .init = hybla_init,
162 .ssthresh = tcp_reno_ssthresh,
163 .min_cwnd = tcp_reno_min_cwnd,
164 .cong_avoid = hybla_cong_avoid,
165 .set_state = hybla_state,
166
167 .owner = THIS_MODULE,
168 .name = "hybla"
169};
170
171static int __init hybla_register(void)
172{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla);
175}
176
177static void __exit hybla_unregister(void)
178{
179 tcp_unregister_congestion_control(&tcp_hybla);
180}
181
182module_init(hybla_register);
183module_exit(hybla_unregister);
184
185MODULE_AUTHOR("Daniele Lacamera");
186MODULE_LICENSE("GPL");
187MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630..53a8a5399f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission 61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found. 62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */ 64 */
66 65
67#include <linux/config.h> 66#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE; 87int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto; 88int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93 90
94int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
95 92
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 93#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 95#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
333 tp->snd_cwnd_stamp = tcp_time_stamp; 318 tp->snd_cwnd_stamp = tcp_time_stamp;
334} 319}
335 320
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */ 321/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{ 323{
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
558 tcp_grow_window(sk, tp, skb); 534 tcp_grow_window(sk, tp, skb);
559} 535}
560 536
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this 537/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were 538 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge 539 * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
606 * To save cycles in the RFC 1323 implementation it was better to break 543 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics 544 * it up into three procedures. -- erics
608 */ 545 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) 546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
610{ 547{
611 long m = mrtt; /* RTT */ 548 long m = mrtt; /* RTT */
612 549
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's 550 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev 551 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation. 552 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
670 tp->rtt_seq = tp->snd_nxt; 604 tp->rtt_seq = tp->snd_nxt;
671 } 605 }
672 606
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3); 607 if (tp->ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt);
674} 609}
675 610
676/* Calculate rto without backoff. This is the second half of Van Jacobson's 611/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -805,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
805 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
806 741
807 if (!cwnd) { 742 if (!cwnd) {
808 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
809 cwnd = 2; 744 cwnd = 2;
810 else 745 else
811 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
812 } 747 }
813 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
814} 749}
@@ -979,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
979 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
980 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
981 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
982 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
983 } 918 }
984 919
985 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1142,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1142 (IsFack(tp) || 1077 (IsFack(tp) ||
1143 !before(lost_retrans, 1078 !before(lost_retrans,
1144 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1145 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1146 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1147 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1148 1083
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
1185 tp->snd_una == tp->high_seq || 1120 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1122 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp)) 1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1124 tcp_ca_event(tp, CA_EVENT_FRTO);
1190 } 1125 }
1191 1126
1192 /* Have to clear retransmission markers here to keep the bookkeeping 1127 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
1252 tcp_set_ca_state(tp, TCP_CA_Loss); 1187 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark; 1188 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp); 1189 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257} 1190}
1258 1191
1259void tcp_clear_retrans(struct tcp_sock *tp) 1192void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1218 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1220 tcp_ca_event(tp, CA_EVENT_LOSS);
1287 } 1221 }
1288 tp->snd_cwnd = 1; 1222 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0; 1223 tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1596} 1530}
1597 1531
1598/* Decrease cwnd each second ack. */ 1532/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp) 1533static void tcp_cwnd_down(struct tcp_sock *tp)
1601{ 1534{
1602 int decr = tp->snd_cwnd_cnt + 1; 1535 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
1608 * in packets we use mss_cache). If sysctl_tcp_westwood is off
1609 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616 1536
1617 tp->snd_cwnd_cnt = decr&1; 1537 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1; 1538 decr >>= 1;
1619 1539
1620 if (decr && tp->snd_cwnd > limit) 1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
1621 tp->snd_cwnd -= decr; 1541 tp->snd_cwnd -= decr;
1622 1542
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{ 1575{
1656 if (tp->prior_ssthresh) { 1576 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp)) 1577 if (tp->ca_ops->undo_cwnd)
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); 1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
1659 else 1579 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661 1581
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1767 1687
1768static inline void tcp_complete_cwr(struct tcp_sock *tp) 1688static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{ 1689{
1770 if (tcp_westwood_cwnd(tp)) 1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp; 1691 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
1775} 1693}
1776 1694
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1946 if (tp->ca_state < TCP_CA_CWR) { 1864 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE)) 1865 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1866 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp); 1868 TCP_ECN_queue_cwr(tp);
1951 } 1869 }
1952 1870
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1963/* Read draft-ietf-tcplw-high-performance before mucking 1881/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Superceeds RFC1323) 1882 * with this code. (Superceeds RFC1323)
1965 */ 1883 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) 1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1967{ 1885{
1968 __u32 seq_rtt; 1886 __u32 seq_rtt;
1969 1887
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1983 * in window is lost... Voila. --ANK (010210) 1901 * in window is lost... Voila. --ANK (010210)
1984 */ 1902 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt); 1904 tcp_rtt_estimator(tp, seq_rtt, usrtt);
1987 tcp_set_rto(tp); 1905 tcp_set_rto(tp);
1988 tp->backoff = 0; 1906 tp->backoff = 0;
1989 tcp_bound_rto(tp); 1907 tcp_bound_rto(tp);
1990} 1908}
1991 1909
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) 1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
1993{ 1911{
1994 /* We don't have a timestamp. Can only use 1912 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine 1913 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
2003 if (flag & FLAG_RETRANS_DATA_ACKED) 1921 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return; 1922 return;
2005 1923
2006 tcp_rtt_estimator(tp, seq_rtt); 1924 tcp_rtt_estimator(tp, seq_rtt, usrtt);
2007 tcp_set_rto(tp); 1925 tcp_set_rto(tp);
2008 tp->backoff = 0; 1926 tp->backoff = 0;
2009 tcp_bound_rto(tp); 1927 tcp_bound_rto(tp);
2010} 1928}
2011 1929
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt) 1931 int flag, s32 seq_rtt, u32 *usrtt)
2014{ 1932{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag); 1935 tcp_ack_saw_tstamp(tp, usrtt, flag);
2018 else if (seq_rtt >= 0) 1936 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag); 1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
2020}
2021
2022/*
2023 * Compute congestion window to use.
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 * Networks" in InfoComm 2004
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and congestion window is large
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037 /* orignal Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068 /* slow start amd linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083} 1938}
2084 1939
2085/* This is Jacobson's slow start and congestion avoidance. 1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
2086 * SIGCOMM '88, p. 328. 1941 u32 in_flight, int good)
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{ 1942{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) { 1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
@@ -2348,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2348 } 1957 }
2349} 1958}
2350 1959
2351/* There is one downside to this scheme. Although we keep the
2352 * ACK clock ticking, adjusting packet counters and advancing
2353 * congestion window, we do not liberate socket send buffer
2354 * space.
2355 *
2356 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
2357 * then making a write space wakeup callback is a possible
2358 * future enhancement. WARNING: it is not trivial to make.
2359 */
2360static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2361 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
2362{ 1962{
@@ -2415,13 +2015,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2015
2416 2016
2417/* Remove acknowledged frames from the retransmission queue. */ 2017/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2018static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2019{
2420 struct tcp_sock *tp = tcp_sk(sk); 2020 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2021 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2022 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2023 int acked = 0;
2424 __s32 seq_rtt = -1; 2024 __s32 seq_rtt = -1;
2025 struct timeval usnow;
2026 u32 pkts_acked = 0;
2027
2028 if (seq_usrtt)
2029 do_gettimeofday(&usnow);
2425 2030
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2031 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2032 skb != sk->sk_send_head) {
@@ -2433,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2433 * the other end. 2038 * the other end.
2434 */ 2039 */
2435 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2436 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2437 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2438 now, &seq_rtt); 2044 now, &seq_rtt);
2439 break; 2045 break;
@@ -2448,6 +2054,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2054 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2055 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2056 acked |= FLAG_DATA_ACKED;
2057 ++pkts_acked;
2451 } else { 2058 } else {
2452 acked |= FLAG_SYN_ACKED; 2059 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2060 tp->retrans_stamp = 0;
@@ -2461,6 +2068,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2068 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2069 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2070 seq_rtt = now - scb->when;
2071 if (seq_usrtt)
2072 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2073 + (usnow.tv_usec - skb->stamp.tv_usec);
2074
2464 if (sacked & TCPCB_SACKED_ACKED) 2075 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2076 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2077 if (sacked & TCPCB_LOST)
@@ -2479,8 +2090,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2090 }
2480 2091
2481 if (acked&FLAG_ACKED) { 2092 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2093 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2094 tcp_ack_packets_out(sk, tp);
2095
2096 if (tp->ca_ops->pkts_acked)
2097 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2098 }
2485 2099
2486#if FASTRETRANS_DEBUG > 0 2100#if FASTRETRANS_DEBUG > 0
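The hunk above threads a microsecond RTT sample (seq_usrtt) and a count of fully acked packets into the congestion-control hook. A rough user-space sketch of that bookkeeping follows; the cc_ops table and its fields are simplified stand-ins for illustration, not the kernel's tcp_congestion_ops.

#include <stdio.h>
#include <sys/time.h>

struct cc_ops {
	/* Optional hook: called with the number of packets fully acked. */
	void (*pkts_acked)(void *cc_priv, unsigned int pkts);
	int rtt_sample;          /* does this module want microsecond RTT samples? */
};

/* Microsecond RTT from a stored send timestamp, as in the hunk above. */
static long usec_rtt(const struct timeval *sent, const struct timeval *now)
{
	return (now->tv_sec - sent->tv_sec) * 1000000L +
	       (now->tv_usec - sent->tv_usec);
}

static void demo_pkts_acked(void *cc_priv, unsigned int pkts)
{
	(void)cc_priv;
	printf("congestion module notified: %u packets acked\n", pkts);
}

int main(void)
{
	struct cc_ops ops = { .pkts_acked = demo_pkts_acked, .rtt_sample = 1 };
	struct timeval sent = { 100, 250000 }, now = { 100, 283000 };
	unsigned int pkts_acked = 3;

	if (ops.rtt_sample)
		printf("usec RTT sample: %ld\n", usec_rtt(&sent, &now)); /* 33000 */
	if (ops.pkts_acked)
		ops.pkts_acked(NULL, pkts_acked);
	return 0;
}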
@@ -2624,257 +2238,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2238 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2239}
2626 2240
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get any information about RTTmin at this time, so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
2636 * conservative, since that way we are sure it will be updated in a
2637 * consistent way as soon as possible. That will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coefficients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
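westwood_do_filter() above is a fixed-coefficient exponential filter: new = (7*old + sample)/8. The standalone example below shows how it smooths per-window bandwidth samples; the sample values are arbitrary.

#include <stdio.h>

/* Same constant-coefficient low-pass filter as westwood_do_filter(). */
static unsigned int lp_filter(unsigned int old, unsigned int sample)
{
	return (7 * old + sample) >> 3;
}

int main(void)
{
	unsigned int bw_est = 0;
	unsigned int samples[] = { 800, 820, 790, 1600, 810 }; /* e.g. bytes per tick */

	for (unsigned int i = 0; i < 5; i++) {
		bw_est = lp_filter(bw_est, samples[i]);
		printf("sample %u -> filtered estimate %u\n", samples[i], bw_est);
	}
	/* The single 1600 outlier moves the estimate only by ~1/8 of its excess. */
	return 0;
}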
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * An RTT-window has passed. Be careful: if the RTT is less than
2728 * 50ms we don't filter but continue 'building the sample'.
2729 * This minimum limit was chosen because estimating bandwidth over
2730 * very small time intervals is better avoided.
2731 * Obviously on a LAN we will reasonably always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
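The check above simply asks whether at least max(rtt, ~50ms) has elapsed since the left edge of the current sample window. A tiny model of that decision, with timestamps in the same jiffies-like units and a stand-in constant for TCP_WESTWOOD_RTT_MIN:

#include <stdio.h>

#define RTT_MIN_TICKS 50	/* stand-in for TCP_WESTWOOD_RTT_MIN (~50ms in HZ units) */

/* Returns 1 when the current bandwidth sample window has ended. */
static int new_window(unsigned int rtt_win_sx, unsigned int rtt, unsigned int now)
{
	unsigned int window = rtt > RTT_MIN_TICKS ? rtt : RTT_MIN_TICKS;

	return (rtt_win_sx + window) < now;
}

int main(void)
{
	/* Window opened at tick 1000 with a measured RTT of 30 ticks:
	 * the 50-tick floor applies, so the window closes only after tick 1050. */
	printf("%d\n", new_window(1000, 30, 1040));	/* 0: still in the same window */
	printf("%d\n", new_window(1000, 30, 1051));	/* 1: start a new window */
	return 0;
}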
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in the fast path, in particular when
2771 * header prediction is successful. In that case the update is
2772 * straightforward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something looks like it is going wrong, even if there
2858 * may be no problem at all! In fact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in that case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2241/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2242static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2243{
@@ -2884,6 +2247,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2247 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2248 u32 prior_in_flight;
2886 s32 seq_rtt; 2249 s32 seq_rtt;
2250 s32 seq_usrtt = 0;
2887 int prior_packets; 2251 int prior_packets;
2888 2252
2889 /* If the ack is newer than sent or older than previous acks 2253 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2266,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2266 */
2903 tcp_update_wl(tp, ack, ack_seq); 2267 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2268 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2269 flag |= FLAG_WIN_UPDATE;
2907 2270
2271 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2272
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2273 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2274 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2275 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2285,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2285 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2286 flag |= FLAG_ECE;
2922 2287
2923 tcp_westwood_slow_bw(sk,skb); 2288 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2289 }
2925 2290
2926 /* We passed data and got it acked, remove any soft error 2291 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2300,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2300 prior_in_flight = tcp_packets_in_flight(tp);
2936 2301
2937 /* See if we can take anything off of the retransmit queue. */ 2302 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2303 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2304 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2305
2940 if (tp->frto_counter) 2306 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2307 tcp_process_frto(sk, prior_snd_una);
2942 2308
2943 if (tcp_ack_is_dubious(tp, flag)) { 2309 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advance CWND, if state allows this. */ 2310 /* Advance CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2311 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2312 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2313 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2314 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2315 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2316 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2317 }
2955 2318
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2319 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
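In the hunk above, the Westwood-specific fast/slow ACK calls are replaced by generic CA_EVENT notifications into the pluggable congestion-control ops. A hedged sketch of how such an event dispatch could be modelled outside the kernel; the enum values and the struct are simplified stand-ins, not the kernel definitions.

#include <stdio.h>

enum ca_event { EV_FAST_ACK, EV_SLOW_ACK, EV_TX_START, EV_CWND_RESTART };

struct cong_ops {
	const char *name;
	/* Optional: modules that don't care simply leave this NULL. */
	void (*cwnd_event)(void *priv, enum ca_event ev);
};

static void dispatch_event(const struct cong_ops *ops, void *priv, enum ca_event ev)
{
	if (ops->cwnd_event)
		ops->cwnd_event(priv, ev);
}

static void demo_event(void *priv, enum ca_event ev)
{
	(void)priv;
	printf("congestion module saw event %d\n", ev);
}

int main(void)
{
	struct cong_ops westwood_like = { .name = "demo", .cwnd_event = demo_event };

	/* Fast path (header prediction hit) vs. slow path, as in the hunk above. */
	dispatch_event(&westwood_like, NULL, EV_FAST_ACK);
	dispatch_event(&westwood_like, NULL, EV_SLOW_ACK);
	return 0;
}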
@@ -3439,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
3439 int this_sack; 2802 int this_sack;
3440 2803
3441 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 2804 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
3442 if (skb_queue_len(&tp->out_of_order_queue) == 0) { 2805 if (skb_queue_empty(&tp->out_of_order_queue)) {
3443 tp->rx_opt.num_sacks = 0; 2806 tp->rx_opt.num_sacks = 0;
3444 tp->rx_opt.eff_sacks = tp->rx_opt.dsack; 2807 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
3445 return; 2808 return;
@@ -3572,13 +2935,13 @@ queue_and_out:
3572 if(th->fin) 2935 if(th->fin)
3573 tcp_fin(skb, sk, th); 2936 tcp_fin(skb, sk, th);
3574 2937
3575 if (skb_queue_len(&tp->out_of_order_queue)) { 2938 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3576 tcp_ofo_queue(sk); 2939 tcp_ofo_queue(sk);
3577 2940
3578 /* RFC2581. 4.2. SHOULD send immediate ACK, when 2941 /* RFC2581. 4.2. SHOULD send immediate ACK, when
3579 * gap in queue is filled. 2942 * gap in queue is filled.
3580 */ 2943 */
3581 if (!skb_queue_len(&tp->out_of_order_queue)) 2944 if (skb_queue_empty(&tp->out_of_order_queue))
3582 tp->ack.pingpong = 0; 2945 tp->ack.pingpong = 0;
3583 } 2946 }
3584 2947
@@ -3886,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
3886 * This must not ever occur. */ 3249 * This must not ever occur. */
3887 3250
3888 /* First, purge the out_of_order queue. */ 3251 /* First, purge the out_of_order queue. */
3889 if (skb_queue_len(&tp->out_of_order_queue)) { 3252 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3890 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 3253 NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
3891 skb_queue_len(&tp->out_of_order_queue));
3892 __skb_queue_purge(&tp->out_of_order_queue); 3254 __skb_queue_purge(&tp->out_of_order_queue);
3893 3255
3894 /* Reset SACK state. A conforming SACK implementation will 3256 /* Reset SACK state. A conforming SACK implementation will
@@ -3937,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3937 tp->snd_cwnd_stamp = tcp_time_stamp; 3299 tp->snd_cwnd_stamp = tcp_time_stamp;
3938} 3300}
3939 3301
3302static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3303{
3304 /* If the user specified a specific send buffer setting, do
3305 * not modify it.
3306 */
3307 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3308 return 0;
3309
3310 /* If we are under global TCP memory pressure, do not expand. */
3311 if (tcp_memory_pressure)
3312 return 0;
3313
3314 /* If we are under soft global TCP memory pressure, do not expand. */
3315 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3316 return 0;
3317
3318 /* If we filled the congestion window, do not expand. */
3319 if (tp->packets_out >= tp->snd_cwnd)
3320 return 0;
3321
3322 return 1;
3323}
3940 3324
3941/* When incoming ACK allowed to free some skb from write_queue, 3325/* When incoming ACK allowed to free some skb from write_queue,
3942 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3326 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3948,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
3948{ 3332{
3949 struct tcp_sock *tp = tcp_sk(sk); 3333 struct tcp_sock *tp = tcp_sk(sk);
3950 3334
3951 if (tp->packets_out < tp->snd_cwnd && 3335 if (tcp_should_expand_sndbuf(sk, tp)) {
3952 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3336 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3953 !tcp_memory_pressure &&
3954 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3955 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3956 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3337 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3957 demanded = max_t(unsigned int, tp->snd_cwnd, 3338 demanded = max_t(unsigned int, tp->snd_cwnd,
3958 tp->reordering + 1); 3339 tp->reordering + 1);
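tcp_should_expand_sndbuf() plus the sndmem/demanded computation above together decide how large the send buffer should grow. Below is a user-space model of that sizing rule; the header-overhead constant is an illustrative stand-in for MAX_TCP_HEADER + 16 + sizeof(struct sk_buff).

#include <stdio.h>

#define MAX_HDR_ESTIMATE (128 + 16 + 256)	/* illustrative per-packet overhead */

static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

/* Target send buffer: room for max(cwnd, reordering + 1) full segments plus overhead. */
static unsigned int sndbuf_target(unsigned int mss, unsigned int snd_cwnd,
				  unsigned int reordering)
{
	unsigned int sndmem = mss + MAX_HDR_ESTIMATE;
	unsigned int demanded = max_u(snd_cwnd, reordering + 1);

	return 2 * sndmem * demanded;
}

int main(void)
{
	/* mss 1460, cwnd 20 segments, reordering 3 */
	printf("suggested sndbuf: %u bytes\n", sndbuf_target(1460, 20, 3));
	return 0;
}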
@@ -3975,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
3975 } 3356 }
3976} 3357}
3977 3358
3978static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3359static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3979{ 3360{
3980 struct tcp_sock *tp = tcp_sk(sk); 3361 tcp_push_pending_frames(sk, tp);
3981
3982 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3983 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3984 tcp_write_xmit(sk, tp->nonagle))
3985 tcp_check_probe_timer(sk, tp);
3986}
3987
3988static __inline__ void tcp_data_snd_check(struct sock *sk)
3989{
3990 struct sk_buff *skb = sk->sk_send_head;
3991
3992 if (skb != NULL)
3993 __tcp_data_snd_check(sk, skb);
3994 tcp_check_space(sk); 3362 tcp_check_space(sk);
3995} 3363}
3996 3364
@@ -4284,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4284 */ 3652 */
4285 tcp_ack(sk, skb, 0); 3653 tcp_ack(sk, skb, 0);
4286 __kfree_skb(skb); 3654 __kfree_skb(skb);
4287 tcp_data_snd_check(sk); 3655 tcp_data_snd_check(sk, tp);
4288 return 0; 3656 return 0;
4289 } else { /* Header too small */ 3657 } else { /* Header too small */
4290 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3658 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -4350,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4350 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3718 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4351 /* Well, only one small jumplet in fast path... */ 3719 /* Well, only one small jumplet in fast path... */
4352 tcp_ack(sk, skb, FLAG_DATA); 3720 tcp_ack(sk, skb, FLAG_DATA);
4353 tcp_data_snd_check(sk); 3721 tcp_data_snd_check(sk, tp);
4354 if (!tcp_ack_scheduled(tp)) 3722 if (!tcp_ack_scheduled(tp))
4355 goto no_ack; 3723 goto no_ack;
4356 } 3724 }
@@ -4428,7 +3796,7 @@ step5:
4428 /* step 7: process the segment text */ 3796 /* step 7: process the segment text */
4429 tcp_data_queue(sk, skb); 3797 tcp_data_queue(sk, skb);
4430 3798
4431 tcp_data_snd_check(sk); 3799 tcp_data_snd_check(sk, tp);
4432 tcp_ack_snd_check(sk); 3800 tcp_ack_snd_check(sk);
4433 return 0; 3801 return 0;
4434 3802
@@ -4552,6 +3920,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3920
4553 tcp_init_metrics(sk); 3921 tcp_init_metrics(sk);
4554 3922
3923 tcp_init_congestion_control(tp);
3924
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3925 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3926 * packet.
4557 */ 3927 */
@@ -4708,9 +4078,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4078 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4079 return 1;
4710 4080
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4081 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4082 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4083 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4099,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4099 goto discard;
4733 4100
4734 case TCP_SYN_SENT: 4101 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4102 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4103 if (queued >= 0)
4740 return queued; 4104 return queued;
@@ -4742,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4742 /* Do step6 onward by hand. */ 4106 /* Do step6 onward by hand. */
4743 tcp_urg(sk, skb, th); 4107 tcp_urg(sk, skb, th);
4744 __kfree_skb(skb); 4108 __kfree_skb(skb);
4745 tcp_data_snd_check(sk); 4109 tcp_data_snd_check(sk, tp);
4746 return 0; 4110 return 0;
4747 } 4111 }
4748 4112
@@ -4816,7 +4180,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4180 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4181 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4182 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4183 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4184
4821 if (tp->rx_opt.tstamp_ok) 4185 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4186 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4192,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4192
4829 tcp_init_metrics(sk); 4193 tcp_init_metrics(sk);
4830 4194
4195 tcp_init_congestion_control(tp);
4196
4831 /* Prevent spurious tcp_cwnd_restart() on 4197 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4198 * first data packet.
4833 */ 4199 */
@@ -4931,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4931 4297
4932 /* tcp_data could move socket to TIME-WAIT */ 4298 /* tcp_data could move socket to TIME-WAIT */
4933 if (sk->sk_state != TCP_CLOSE) { 4299 if (sk->sk_state != TCP_CLOSE) {
4934 tcp_data_snd_check(sk); 4300 tcp_data_snd_check(sk, tp);
4935 tcp_ack_snd_check(sk); 4301 tcp_ack_snd_check(sk);
4936 } 4302 }
4937 4303
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2d41d5d6ad..67c670886c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -242,9 +242,14 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
242 tcp_port_rover = rover; 242 tcp_port_rover = rover;
243 spin_unlock(&tcp_portalloc_lock); 243 spin_unlock(&tcp_portalloc_lock);
244 244
245 /* Exhausted local port range during search? */ 245 /* Exhausted local port range during search? It is not
246 * possible for us to be holding one of the bind hash
247 * locks if this test triggers, because if 'remaining'
248 * drops to zero, we broke out of the do/while loop at
249 * the top level, not from the 'break;' statement.
250 */
246 ret = 1; 251 ret = 1;
247 if (remaining <= 0) 252 if (unlikely(remaining <= 0))
248 goto fail; 253 goto fail;
249 254
250 /* OK, here is the one we will use. HEAD is 255 /* OK, here is the one we will use. HEAD is
@@ -1494,12 +1499,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1494 * to destinations, already remembered 1499 * to destinations, already remembered
1495 * to the moment of synflood. 1500 * to the moment of synflood.
1496 */ 1501 */
1497 NETDEBUG(if (net_ratelimit()) \ 1502 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1498 printk(KERN_DEBUG "TCP: drop open " 1503 "request from %u.%u."
1499 "request from %u.%u." 1504 "%u.%u/%u\n",
1500 "%u.%u/%u\n", \ 1505 NIPQUAD(saddr),
1501 NIPQUAD(saddr), 1506 ntohs(skb->h.th->source)));
1502 ntohs(skb->h.th->source)));
1503 dst_release(dst); 1507 dst_release(dst);
1504 goto drop_and_free; 1508 goto drop_and_free;
1505 } 1509 }
@@ -1627,8 +1631,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
1627 skb->nh.iph->daddr, skb->csum)) 1631 skb->nh.iph->daddr, skb->csum))
1628 return 0; 1632 return 0;
1629 1633
1630 NETDEBUG(if (net_ratelimit()) 1634 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1631 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1632 skb->ip_summed = CHECKSUM_NONE; 1635 skb->ip_summed = CHECKSUM_NONE;
1633 } 1636 }
1634 if (skb->len <= 76) { 1637 if (skb->len <= 76) {
@@ -2045,9 +2048,10 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2048 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2049 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2050 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2051 tp->mss_cache = 536;
2049 2052
2050 tp->reordering = sysctl_tcp_reordering; 2053 tp->reordering = sysctl_tcp_reordering;
2054 tp->ca_ops = &tcp_init_congestion_ops;
2051 2055
2052 sk->sk_state = TCP_CLOSE; 2056 sk->sk_state = TCP_CLOSE;
2053 2057
@@ -2070,6 +2074,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2070 2074
2071 tcp_clear_xmit_timers(sk); 2075 tcp_clear_xmit_timers(sk);
2072 2076
2077 tcp_cleanup_congestion_control(tp);
2078
2073 /* Cleanup up the write buffer. */ 2079 /* Cleanup up the write buffer. */
2074 sk_stream_writequeue_purge(sk); 2080 sk_stream_writequeue_purge(sk);
2075 2081
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b3943e7562..f42a284164 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
774 newtp->frto_counter = 0; 774 newtp->frto_counter = 0;
775 newtp->frto_highmark = 0; 775 newtp->frto_highmark = 0;
776 776
777 newtp->ca_ops = &tcp_reno;
778
777 tcp_set_ca_state(newtp, TCP_CA_Open); 779 tcp_set_ca_state(newtp, TCP_CA_Open);
778 tcp_init_xmit_timers(newsk); 780 tcp_init_xmit_timers(newsk);
779 skb_queue_head_init(&newtp->out_of_order_queue); 781 skb_queue_head_init(&newtp->out_of_order_queue);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
842 if (newtp->ecn_flags&TCP_ECN_OK) 844 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND); 845 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844 846
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); 847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 } 848 }
849 return newsk; 849 return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e3..dd30dd137b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -141,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
141 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
142} 141}
143 142
144static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 146
148 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150} 149}
151 150
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
@@ -361,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
361 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
362 356
363 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
365 359
366 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -409,42 +403,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
409 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
410} 404}
411 405
412static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{
427 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429
430 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
444{ 407{
445 struct tcp_sock *tp = tcp_sk(sk); 408 if (skb->len <= mss_now ||
446
447 if (skb->len <= tp->mss_cache_std ||
448 !(sk->sk_route_caps & NETIF_F_TSO)) { 409 !(sk->sk_route_caps & NETIF_F_TSO)) {
449 /* Avoid the costly divide in the normal 410 /* Avoid the costly divide in the normal
450 * non-TSO case. 411 * non-TSO case.
@@ -454,10 +415,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
454 } else { 415 } else {
455 unsigned int factor; 416 unsigned int factor;
456 417
457 factor = skb->len + (tp->mss_cache_std - 1); 418 factor = skb->len + (mss_now - 1);
458 factor /= tp->mss_cache_std; 419 factor /= mss_now;
459 skb_shinfo(skb)->tso_segs = factor; 420 skb_shinfo(skb)->tso_segs = factor;
460 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 421 skb_shinfo(skb)->tso_size = mss_now;
461 } 422 }
462} 423}
463 424
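The tso_segs factor computed just above is a ceiling division of the skb length by the MSS, with the divide skipped for the common single-segment case. A standalone illustration:

#include <stdio.h>

/* tso_segs = ceil(len / mss), avoiding the division in the trivial case. */
static unsigned int tso_factor(unsigned int len, unsigned int mss)
{
	if (len <= mss)
		return 1;			/* avoid the costly divide */
	return (len + mss - 1) / mss;
}

int main(void)
{
	printf("%u\n", tso_factor(1000, 1448));		/* 1 */
	printf("%u\n", tso_factor(64000, 1448));	/* 45 segments */
	return 0;
}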
@@ -466,7 +427,7 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
466 * packet to the list. This won't be called frequently, I hope. 427 * packet to the list. This won't be called frequently, I hope.
467 * Remember, these are still headerless SKBs at this point. 428 * Remember, these are still headerless SKBs at this point.
468 */ 429 */
469static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) 430static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
470{ 431{
471 struct tcp_sock *tp = tcp_sk(sk); 432 struct tcp_sock *tp = tcp_sk(sk);
472 struct sk_buff *buff; 433 struct sk_buff *buff;
@@ -521,6 +482,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 482 * skbs, which it never sent before. --ANK
522 */ 483 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 484 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
485 buff->stamp = skb->stamp;
524 486
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 487 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 488 tp->lost_out -= tcp_skb_pcount(skb);
@@ -528,8 +490,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
528 } 490 }
529 491
530 /* Fix up tso_factor for both original and new SKB. */ 492 /* Fix up tso_factor for both original and new SKB. */
531 tcp_set_skb_tso_segs(sk, skb); 493 tcp_set_skb_tso_segs(sk, skb, mss_now);
532 tcp_set_skb_tso_segs(sk, buff); 494 tcp_set_skb_tso_segs(sk, buff, mss_now);
533 495
534 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 496 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
535 tp->lost_out += tcp_skb_pcount(skb); 497 tp->lost_out += tcp_skb_pcount(skb);
@@ -542,6 +504,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
542 } 504 }
543 505
544 /* Link BUFF into the send queue. */ 506 /* Link BUFF into the send queue. */
507 skb_header_release(buff);
545 __skb_append(skb, buff); 508 __skb_append(skb, buff);
546 509
547 return 0; 510 return 0;
@@ -604,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
604 * factor and mss. 567 * factor and mss.
605 */ 568 */
606 if (tcp_skb_pcount(skb) > 1) 569 if (tcp_skb_pcount(skb) > 1)
607 tcp_set_skb_tso_segs(sk, skb); 570 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
608 571
609 return 0; 572 return 0;
610} 573}
@@ -662,7 +625,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
662 625
663 /* And store cached results */ 626 /* And store cached results */
664 tp->pmtu_cookie = pmtu; 627 tp->pmtu_cookie = pmtu;
665 tp->mss_cache = tp->mss_cache_std = mss_now; 628 tp->mss_cache = mss_now;
666 629
667 return mss_now; 630 return mss_now;
668} 631}
@@ -674,57 +637,315 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
674 * cannot be large. However, taking into account rare use of URG, this 637 * cannot be large. However, taking into account rare use of URG, this
675 * is not a big flaw. 638 * is not a big flaw.
676 */ 639 */
677 640unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
678unsigned int tcp_current_mss(struct sock *sk, int large)
679{ 641{
680 struct tcp_sock *tp = tcp_sk(sk); 642 struct tcp_sock *tp = tcp_sk(sk);
681 struct dst_entry *dst = __sk_dst_get(sk); 643 struct dst_entry *dst = __sk_dst_get(sk);
682 unsigned int do_large, mss_now; 644 u32 mss_now;
645 u16 xmit_size_goal;
646 int doing_tso = 0;
647
648 mss_now = tp->mss_cache;
649
650 if (large_allowed &&
651 (sk->sk_route_caps & NETIF_F_TSO) &&
652 !tp->urg_mode)
653 doing_tso = 1;
683 654
684 mss_now = tp->mss_cache_std;
685 if (dst) { 655 if (dst) {
686 u32 mtu = dst_mtu(dst); 656 u32 mtu = dst_mtu(dst);
687 if (mtu != tp->pmtu_cookie) 657 if (mtu != tp->pmtu_cookie)
688 mss_now = tcp_sync_mss(sk, mtu); 658 mss_now = tcp_sync_mss(sk, mtu);
689 } 659 }
690 660
691 do_large = (large && 661 if (tp->rx_opt.eff_sacks)
692 (sk->sk_route_caps & NETIF_F_TSO) && 662 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
693 !tp->urg_mode); 663 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
694 664
695 if (do_large) { 665 xmit_size_goal = mss_now;
696 unsigned int large_mss, factor, limit;
697 666
698 large_mss = 65535 - tp->af_specific->net_header_len - 667 if (doing_tso) {
668 xmit_size_goal = 65535 -
669 tp->af_specific->net_header_len -
699 tp->ext_header_len - tp->tcp_header_len; 670 tp->ext_header_len - tp->tcp_header_len;
700 671
701 if (tp->max_window && large_mss > (tp->max_window>>1)) 672 if (tp->max_window &&
702 large_mss = max((tp->max_window>>1), 673 (xmit_size_goal > (tp->max_window >> 1)))
703 68U - tp->tcp_header_len); 674 xmit_size_goal = max((tp->max_window >> 1),
675 68U - tp->tcp_header_len);
704 676
705 factor = large_mss / mss_now; 677 xmit_size_goal -= (xmit_size_goal % mss_now);
678 }
679 tp->xmit_size_goal = xmit_size_goal;
706 680
707 /* Always keep large mss multiple of real mss, but 681 return mss_now;
708 * do not exceed 1/tso_win_divisor of the congestion window 682}
709 * so we can keep the ACK clock ticking and minimize 683
710 * bursting. 684/* Congestion window validation. (RFC2861) */
711 */
712 limit = tp->snd_cwnd;
713 if (sysctl_tcp_tso_win_divisor)
714 limit /= sysctl_tcp_tso_win_divisor;
715 limit = max(1U, limit);
716 if (factor > limit)
717 factor = limit;
718 685
719 tp->mss_cache = mss_now * factor; 686static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
687{
688 __u32 packets_out = tp->packets_out;
689
690 if (packets_out >= tp->snd_cwnd) {
691 /* Network is fed fully. */
692 tp->snd_cwnd_used = 0;
693 tp->snd_cwnd_stamp = tcp_time_stamp;
694 } else {
695 /* Network starves. */
696 if (tp->packets_out > tp->snd_cwnd_used)
697 tp->snd_cwnd_used = tp->packets_out;
720 698
721 mss_now = tp->mss_cache; 699 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
700 tcp_cwnd_application_limited(sk);
722 } 701 }
702}
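tcp_cwnd_validate() above tracks whether the sender keeps the network full; when it has been application-limited for a full RTO it calls tcp_cwnd_application_limited(), which is not shown in this hunk. In the spirit of RFC 2861, that path decays an unused window toward what was actually used. The following is only a rough user-space model of that decay; the exact kernel policy may differ in detail.

#include <stdio.h>

/* RFC 2861-style decay: if the application never filled cwnd, pull cwnd
 * halfway back toward the largest window that was actually used. */
static unsigned int decay_unused_cwnd(unsigned int snd_cwnd, unsigned int snd_cwnd_used)
{
	unsigned int win_used = snd_cwnd_used < 2 ? 2 : snd_cwnd_used;

	if (win_used < snd_cwnd)
		snd_cwnd = (snd_cwnd + win_used) / 2;
	return snd_cwnd;
}

int main(void)
{
	/* cwnd grew to 40 segments but the application only ever had 8 in flight. */
	printf("decayed cwnd: %u\n", decay_unused_cwnd(40, 8));	/* 24 */
	return 0;
}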
723 703
724 if (tp->rx_opt.eff_sacks) 704static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
725 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 705{
726 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 706 u32 window, cwnd_len;
727 return mss_now; 707
708 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
709 cwnd_len = mss_now * cwnd;
710 return min(window, cwnd_len);
711}
712
713/* Can at least one segment of SKB be sent right now, according to the
714 * congestion window rules? If so, return how many segments are allowed.
715 */
716static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
717{
718 u32 in_flight, cwnd;
719
720 /* Don't be strict about the congestion window for the final FIN. */
721 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
722 return 1;
723
724 in_flight = tcp_packets_in_flight(tp);
725 cwnd = tp->snd_cwnd;
726 if (in_flight < cwnd)
727 return (cwnd - in_flight);
728
729 return 0;
730}
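Together, tcp_window_allows() and tcp_cwnd_test() above bound a transmission by both the receiver's window and the congestion window. A small numeric model of that combined limit (all values illustrative):

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

int main(void)
{
	unsigned int mss = 1448;
	unsigned int snd_wnd_left = 20000;	/* bytes still allowed by the receiver window */
	unsigned int snd_cwnd = 12, in_flight = 7;

	/* Segments the congestion window still permits (tcp_cwnd_test). */
	unsigned int cwnd_quota = snd_cwnd > in_flight ? snd_cwnd - in_flight : 0;

	/* Byte limit for this send (tcp_window_allows): min of both constraints. */
	unsigned int limit = min_u(snd_wnd_left, cwnd_quota * mss);

	printf("cwnd quota = %u segments, byte limit = %u\n", cwnd_quota, limit);
	/* -> 5 segments, 7240 bytes: the congestion window is the tighter bound here. */
	return 0;
}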
731
732/* This must be invoked the first time we consider transmitting
733 * SKB onto the wire.
734 */
735static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
736{
737 int tso_segs = tcp_skb_pcount(skb);
738
739 if (!tso_segs ||
740 (tso_segs > 1 &&
741 skb_shinfo(skb)->tso_size != mss_now)) {
742 tcp_set_skb_tso_segs(sk, skb, mss_now);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0 if the packet can be sent now without violating Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* The Nagle rule does not apply to frames which sit in the middle of the
780 * write_queue (they have no chance to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
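A compact model of the Nagle decision above, including Minshall's variant (hold back a sub-MSS segment only while an earlier small segment is still unacknowledged). The struct and field names mirror the code but this is a standalone illustration, not the kernel logic verbatim.

#include <stdio.h>

struct nagle_state {
	unsigned int packets_out;	/* anything still in flight? */
	int small_unacked;		/* Minshall: a sub-MSS segment awaits an ACK */
	int nodelay;			/* TCP_NODELAY set */
	int cork;			/* TCP_CORK set */
};

/* Returns 1 if the segment may be sent now, 0 if Nagle holds it back. */
static int nagle_allows(const struct nagle_state *s, unsigned int len, unsigned int mss)
{
	if (len >= mss || s->nodelay)
		return 1;			/* full-sized or Nagle disabled */
	if (s->cork)
		return 0;			/* corked: always coalesce small data */
	/* Classic Nagle + Minshall: hold a small segment only while some
	 * previously sent small segment is still unacknowledged. */
	if (s->packets_out && s->small_unacked)
		return 0;
	return 1;
}

int main(void)
{
	struct nagle_state s = { .packets_out = 3, .small_unacked = 1 };
	printf("send 200-byte segment now? %d\n", nagle_allows(&s, 200, 1448));   /* 0 */
	s.small_unacked = 0;
	printf("and once small data is acked? %d\n", nagle_allows(&s, 200, 1448)); /* 1 */
	return 0;
}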
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb, cur_mss);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 if (skb->len != skb->data_len)
865 return tcp_fragment(sk, skb, len, mss_now);
866
867 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
868 if (unlikely(buff == NULL))
869 return -ENOMEM;
870
871 buff->truesize = nlen;
872 skb->truesize -= nlen;
873
874 /* Correct the sequence numbers. */
875 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
876 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
877 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
878
879 /* PSH and FIN should only be set in the second packet. */
880 flags = TCP_SKB_CB(skb)->flags;
881 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
882 TCP_SKB_CB(buff)->flags = flags;
883
884 /* This packet was never sent out yet, so no SACK bits. */
885 TCP_SKB_CB(buff)->sacked = 0;
886
887 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
888 skb_split(skb, buff, len);
889
890 /* Fix up tso_factor for both original and new SKB. */
891 tcp_set_skb_tso_segs(sk, skb, mss_now);
892 tcp_set_skb_tso_segs(sk, buff, mss_now);
893
894 /* Link BUFF into the send queue. */
895 skb_header_release(buff);
896 __skb_append(skb, buff);
897
898 return 0;
899}
900
901/* Try to defer sending, if possible, in order to minimize the amount
902 * of TSO splitting we do. View it as a kind of TSO Nagle test.
903 *
904 * This algorithm is from John Heffner.
905 */
906static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
907{
908 u32 send_win, cong_win, limit, in_flight;
909
910 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
911 return 0;
912
913 if (tp->ca_state != TCP_CA_Open)
914 return 0;
915
916 in_flight = tcp_packets_in_flight(tp);
917
918 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
919 (tp->snd_cwnd <= in_flight));
920
921 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
922
923 /* From in_flight test above, we know that cwnd > in_flight. */
924 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
925
926 limit = min(send_win, cong_win);
927
928 if (sysctl_tcp_tso_win_divisor) {
929 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
930
931 /* If at least some fraction of a window is available,
932 * just use it.
933 */
934 chunk /= sysctl_tcp_tso_win_divisor;
935 if (limit >= chunk)
936 return 0;
937 } else {
938 /* Different approach, try not to defer past a single
939 * ACK. Receiver should ACK every other full sized
940 * frame, so if we have space for more than 3 frames
941 * then send now.
942 */
943 if (limit > tcp_max_burst(tp) * tp->mss_cache)
944 return 0;
945 }
946
947 /* Ok, it looks like it is advisable to defer. */
948 return 1;
728} 949}
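To make the deferral heuristic above concrete, here is a worked instance of the arithmetic with illustrative numbers (mss 1448, tso_win_divisor 3); it only models the decision, it is not the kernel function.

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

int main(void)
{
	unsigned int mss = 1448, tso_win_divisor = 3;
	unsigned int snd_wnd = 64000;		/* receiver window in bytes */
	unsigned int snd_cwnd = 20, in_flight = 14;
	unsigned int seq_offset = 10000;	/* skb seq - snd_una */

	unsigned int send_win = snd_wnd - seq_offset;			/* 54000 */
	unsigned int cong_win = (snd_cwnd - in_flight) * mss;		/* 6 * 1448 = 8688 */
	unsigned int limit = min_u(send_win, cong_win);			/* 8688 */

	/* "If at least 1/divisor of a window is free, just send now." */
	unsigned int chunk = min_u(snd_wnd, snd_cwnd * mss) / tso_win_divisor; /* 9653 */

	printf("limit=%u chunk=%u -> %s\n", limit, chunk,
	       limit >= chunk ? "send now" : "defer, wait for a bigger burst");
	return 0;
}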
729 950
730/* This routine writes packets to the network. It advances the 951/* This routine writes packets to the network. It advances the
@@ -734,57 +955,142 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
734 * Returns 1, if no segments are in flight and we have queued segments, but 955 * Returns 1, if no segments are in flight and we have queued segments, but
735 * cannot send anything now because of SWS or another problem. 956 * cannot send anything now because of SWS or another problem.
736 */ 957 */
737int tcp_write_xmit(struct sock *sk, int nonagle) 958static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
738{ 959{
739 struct tcp_sock *tp = tcp_sk(sk); 960 struct tcp_sock *tp = tcp_sk(sk);
740 unsigned int mss_now; 961 struct sk_buff *skb;
962 unsigned int tso_segs, sent_pkts;
963 int cwnd_quota;
741 964
742 /* If we are closed, the bytes will have to remain here. 965 /* If we are closed, the bytes will have to remain here.
743 * In time closedown will finish, we empty the write queue and all 966 * In time closedown will finish, we empty the write queue and all
744 * will be happy. 967 * will be happy.
745 */ 968 */
746 if (sk->sk_state != TCP_CLOSE) { 969 if (unlikely(sk->sk_state == TCP_CLOSE))
747 struct sk_buff *skb; 970 return 0;
748 int sent_pkts = 0;
749 971
750 /* Account for SACKS, we may need to fragment due to this. 972 sent_pkts = 0;
751 * It is just like the real MSS changing on us midstream. 973 while ((skb = sk->sk_send_head)) {
752 * We also handle things correctly when the user adds some 974 unsigned int limit;
753 * IP options mid-stream. Silly to do, but cover it.
754 */
755 mss_now = tcp_current_mss(sk, 1);
756
757 while ((skb = sk->sk_send_head) &&
758 tcp_snd_test(sk, skb, mss_now,
759 tcp_skb_is_last(sk, skb) ? nonagle :
760 TCP_NAGLE_PUSH)) {
761 if (skb->len > mss_now) {
762 if (tcp_fragment(sk, skb, mss_now))
763 break;
764 }
765 975
766 TCP_SKB_CB(skb)->when = tcp_time_stamp; 976 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
767 tcp_tso_set_push(skb); 977 BUG_ON(!tso_segs);
768 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) 978
979 cwnd_quota = tcp_cwnd_test(tp, skb);
980 if (!cwnd_quota)
981 break;
982
983 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
984 break;
985
986 if (tso_segs == 1) {
987 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
988 (tcp_skb_is_last(sk, skb) ?
989 nonagle : TCP_NAGLE_PUSH))))
769 break; 990 break;
991 } else {
992 if (tcp_tso_should_defer(sk, tp, skb))
993 break;
994 }
770 995
771 /* Advance the send_head. This one is sent out. 996 limit = mss_now;
772 * This call will increment packets_out. 997 if (tso_segs > 1) {
773 */ 998 limit = tcp_window_allows(tp, skb,
774 update_send_head(sk, tp, skb); 999 mss_now, cwnd_quota);
1000
1001 if (skb->len < limit) {
1002 unsigned int trim = skb->len % mss_now;
775 1003
776 tcp_minshall_update(tp, mss_now, skb); 1004 if (trim)
777 sent_pkts = 1; 1005 limit = skb->len - trim;
1006 }
778 } 1007 }
779 1008
780 if (sent_pkts) { 1009 if (skb->len > limit &&
781 tcp_cwnd_validate(sk, tp); 1010 unlikely(tso_fragment(sk, skb, limit, mss_now)))
782 return 0; 1011 break;
1012
1013 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1014
1015 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
1016 break;
1017
1018 /* Advance the send_head. This one is sent out.
1019 * This call will increment packets_out.
1020 */
1021 update_send_head(sk, tp, skb);
1022
1023 tcp_minshall_update(tp, mss_now, skb);
1024 sent_pkts++;
1025 }
1026
1027 if (likely(sent_pkts)) {
1028 tcp_cwnd_validate(sk, tp);
1029 return 0;
1030 }
1031 return !tp->packets_out && sk->sk_send_head;
1032}
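One detail of the loop above that is easy to miss: when a TSO skb is larger than what the window and congestion window allow, the send limit is first rounded down to a whole number of MSS so that tso_fragment() never creates a trailing runt. A small standalone illustration of that trim:

#include <stdio.h>

/* Round a byte limit down to a whole number of MSS-sized segments,
 * as tcp_write_xmit() does before calling tso_fragment(). */
static unsigned int trim_to_mss(unsigned int len, unsigned int limit, unsigned int mss)
{
	if (len < limit) {
		unsigned int trim = len % mss;
		if (trim)
			limit = len - trim;
	}
	return limit;
}

int main(void)
{
	/* 10000-byte TSO skb, 16 KB allowed by window/cwnd, 1448-byte MSS:
	 * send only the first 6 full segments (8688 bytes) this round. */
	printf("limit = %u\n", trim_to_mss(10000, 16384, 1448));
	return 0;
}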
1033
1034/* Push out any pending frames which were held back due to
1035 * TCP_CORK or attempt at coalescing tiny packets.
1036 * The socket must be locked by the caller.
1037 */
1038void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1039 unsigned int cur_mss, int nonagle)
1040{
1041 struct sk_buff *skb = sk->sk_send_head;
1042
1043 if (skb) {
1044 if (tcp_write_xmit(sk, cur_mss, nonagle))
1045 tcp_check_probe_timer(sk, tp);
1046 }
1047}
1048
1049/* Send _single_ skb sitting at the send head. This function requires
1050 * true push pending frames to setup probe timer etc.
1051 */
1052void tcp_push_one(struct sock *sk, unsigned int mss_now)
1053{
1054 struct tcp_sock *tp = tcp_sk(sk);
1055 struct sk_buff *skb = sk->sk_send_head;
1056 unsigned int tso_segs, cwnd_quota;
1057
1058 BUG_ON(!skb || skb->len < mss_now);
1059
1060 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1061 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1062
1063 if (likely(cwnd_quota)) {
1064 unsigned int limit;
1065
1066 BUG_ON(!tso_segs);
1067
1068 limit = mss_now;
1069 if (tso_segs > 1) {
1070 limit = tcp_window_allows(tp, skb,
1071 mss_now, cwnd_quota);
1072
1073 if (skb->len < limit) {
1074 unsigned int trim = skb->len % mss_now;
1075
1076 if (trim)
1077 limit = skb->len - trim;
1078 }
783 } 1079 }
784 1080
785 return !tp->packets_out && sk->sk_send_head; 1081 if (skb->len > limit &&
1082 unlikely(tso_fragment(sk, skb, limit, mss_now)))
1083 return;
1084
1085 /* Send it out now. */
1086 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1087
1088 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1089 update_send_head(sk, tp, skb);
1090 tcp_cwnd_validate(sk, tp);
1091 return;
1092 }
786 } 1093 }
787 return 0;
788} 1094}
789 1095
790/* This function returns the amount that we can raise the 1096/* This function returns the amount that we can raise the
@@ -1044,7 +1350,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1044 if (sk->sk_route_caps & NETIF_F_TSO) { 1350 if (sk->sk_route_caps & NETIF_F_TSO) {
1045 sk->sk_route_caps &= ~NETIF_F_TSO; 1351 sk->sk_route_caps &= ~NETIF_F_TSO;
1046 sock_set_flag(sk, SOCK_NO_LARGESEND); 1352 sock_set_flag(sk, SOCK_NO_LARGESEND);
1047 tp->mss_cache = tp->mss_cache_std;
1048 } 1353 }
1049 1354
1050 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1355 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1062,15 +1367,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1062 1367
1063 if (skb->len > cur_mss) { 1368 if (skb->len > cur_mss) {
1064 int old_factor = tcp_skb_pcount(skb); 1369 int old_factor = tcp_skb_pcount(skb);
1065 int new_factor; 1370 int diff;
1066 1371
1067 if (tcp_fragment(sk, skb, cur_mss)) 1372 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1068 return -ENOMEM; /* We'll try again later. */ 1373 return -ENOMEM; /* We'll try again later. */
1069 1374
1070 /* New SKB created, account for it. */ 1375 /* New SKB created, account for it. */
1071 new_factor = tcp_skb_pcount(skb); 1376 diff = old_factor - tcp_skb_pcount(skb) -
1072 tp->packets_out -= old_factor - new_factor; 1377 tcp_skb_pcount(skb->next);
1073 tp->packets_out += tcp_skb_pcount(skb->next); 1378 tp->packets_out -= diff;
1379
1380 if (diff > 0) {
1381 tp->fackets_out -= diff;
1382 if ((int)tp->fackets_out < 0)
1383 tp->fackets_out = 0;
1384 }
1074 } 1385 }
1075 1386
1076 /* Collapse two adjacent packets if worthwhile and we can. */ 1387 /* Collapse two adjacent packets if worthwhile and we can. */
@@ -1106,7 +1417,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1106 * is still in somebody's hands, else make a clone. 1417 * is still in somebody's hands, else make a clone.
1107 */ 1418 */
1108 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1419 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1109 tcp_tso_set_push(skb);
1110 1420
1111 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1421 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1112 pskb_copy(skb, GFP_ATOMIC): 1422 pskb_copy(skb, GFP_ATOMIC):
@@ -1290,7 +1600,7 @@ void tcp_send_fin(struct sock *sk)
1290 * was unread data in the receive queue. This behavior is recommended 1600 * was unread data in the receive queue. This behavior is recommended
1291 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM 1601 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1292 */ 1602 */
1293void tcp_send_active_reset(struct sock *sk, int priority) 1603void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
1294{ 1604{
1295 struct tcp_sock *tp = tcp_sk(sk); 1605 struct tcp_sock *tp = tcp_sk(sk);
1296 struct sk_buff *skb; 1606 struct sk_buff *skb;
@@ -1449,7 +1759,6 @@ static inline void tcp_connect_init(struct sock *sk)
1449 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1759 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1450 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1760 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451 tcp_initialize_rcv_mss(sk); 1761 tcp_initialize_rcv_mss(sk);
1452 tcp_ca_init(tp);
1453 1762
1454 tcp_select_initial_window(tcp_full_space(sk), 1763 tcp_select_initial_window(tcp_full_space(sk),
1455 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1764 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1812,6 @@ int tcp_connect(struct sock *sk)
1503 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1812 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1504 tp->snd_nxt = tp->write_seq; 1813 tp->snd_nxt = tp->write_seq;
1505 tp->pushed_seq = tp->write_seq; 1814 tp->pushed_seq = tp->write_seq;
1506 tcp_ca_init(tp);
1507 1815
1508 /* Send it off. */ 1816 /* Send it off. */
1509 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1817 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1670,21 +1978,19 @@ int tcp_write_wakeup(struct sock *sk)
1670 skb->len > mss) { 1978 skb->len > mss) {
1671 seg_size = min(seg_size, mss); 1979 seg_size = min(seg_size, mss);
1672 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1980 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1673 if (tcp_fragment(sk, skb, seg_size)) 1981 if (tcp_fragment(sk, skb, seg_size, mss))
1674 return -1; 1982 return -1;
1675 /* SWS override triggered forced fragmentation. 1983 /* SWS override triggered forced fragmentation.
1676 * Disable TSO, the connection is too sick. */ 1984 * Disable TSO, the connection is too sick. */
1677 if (sk->sk_route_caps & NETIF_F_TSO) { 1985 if (sk->sk_route_caps & NETIF_F_TSO) {
1678 sock_set_flag(sk, SOCK_NO_LARGESEND); 1986 sock_set_flag(sk, SOCK_NO_LARGESEND);
1679 sk->sk_route_caps &= ~NETIF_F_TSO; 1987 sk->sk_route_caps &= ~NETIF_F_TSO;
1680 tp->mss_cache = tp->mss_cache_std;
1681 } 1988 }
1682 } else if (!tcp_skb_pcount(skb)) 1989 } else if (!tcp_skb_pcount(skb))
1683 tcp_set_skb_tso_segs(sk, skb); 1990 tcp_set_skb_tso_segs(sk, skb, mss);
1684 1991
1685 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1992 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1686 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1993 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1687 tcp_tso_set_push(skb);
1688 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 1994 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1689 if (!err) { 1995 if (!err) {
1690 update_send_head(sk, tp, skb); 1996 update_send_head(sk, tp, skb);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 0000000000..70e108e15c
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
1/* Tom Kelly's Scalable TCP
2 *
3 * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
4 *
5 * John Heffner <jheffner@psc.edu>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <net/tcp.h>
11
12/* These factors are derived from the values recommended in the paper:
13 * .01 and 7/8. We use 50 instead of 100 to account for
14 * delayed ack.
15 */
16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3
18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
20 u32 in_flight, int flag)
21{
22 if (in_flight < tp->snd_cwnd)
23 return;
24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) {
26 tp->snd_cwnd++;
27 } else {
28 tp->snd_cwnd_cnt++;
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 tp->snd_cwnd++;
31 tp->snd_cwnd_cnt = 0;
32 }
33 }
34 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
35 tp->snd_cwnd_stamp = tcp_time_stamp;
36}
37
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
39{
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41}
42
43
44static struct tcp_congestion_ops tcp_scalable = {
45 .ssthresh = tcp_scalable_ssthresh,
46 .cong_avoid = tcp_scalable_cong_avoid,
47 .min_cwnd = tcp_reno_min_cwnd,
48
49 .owner = THIS_MODULE,
50 .name = "scalable",
51};
52
53static int __init tcp_scalable_register(void)
54{
55 return tcp_register_congestion_control(&tcp_scalable);
56}
57
58static void __exit tcp_scalable_unregister(void)
59{
60 tcp_unregister_congestion_control(&tcp_scalable);
61}
62
63module_init(tcp_scalable_register);
64module_exit(tcp_scalable_unregister);
65
66MODULE_AUTHOR("John Heffner");
67MODULE_LICENSE("GPL");
68MODULE_DESCRIPTION("Scalable TCP");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b127b44985..0084227438 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
231 } 231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER; 232 tp->ack.pending &= ~TCP_ACK_TIMER;
233 233
234 if (skb_queue_len(&tp->ucopy.prequeue)) { 234 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb; 235 struct sk_buff *skb;
236 236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 237 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
238 skb_queue_len(&tp->ucopy.prequeue));
239 238
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb); 240 sk->sk_backlog_rcv(sk, skb);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 0000000000..9bd443db51
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communications, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 */
33
34#include <linux/config.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h>
39
40#include <net/tcp.h>
41
42/* Default values of the Vegas variables, in fixed-point representation
43 * with V_PARAM_SHIFT bits to the right of the binary point.
44 */
45#define V_PARAM_SHIFT 1
46static int alpha = 1<<V_PARAM_SHIFT;
47static int beta = 3<<V_PARAM_SHIFT;
48static int gamma = 1<<V_PARAM_SHIFT;
49
50module_param(alpha, int, 0644);
51MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
52module_param(beta, int, 0644);
53MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
54module_param(gamma, int, 0644);
55MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
56
57
58/* Vegas variables */
59struct vegas {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now;/* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67};
68
69/* There are several situations when we must "re-start" Vegas:
70 *
71 * o when a connection is established
72 * o after an RTO
73 * o after fast recovery
74 * o when we send a packet and there is no outstanding
75 * unacknowledged data (restarting an idle connection)
76 *
77 * In these circumstances we cannot do a Vegas calculation at the
78 * end of the first RTT, because any calculation we do is using
79 * stale info -- both the saved cwnd and congestion feedback are
80 * stale.
81 *
82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs.
84 */
85static inline void vegas_enable(struct tcp_sock *tp)
86{
87 struct vegas *vegas = tcp_ca(tp);
88
89 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1;
91
92 /* Set the beginning of the next send window. */
93 vegas->beg_snd_nxt = tp->snd_nxt;
94
95 vegas->cntRTT = 0;
96 vegas->minRTT = 0x7fffffff;
97}
98
99/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp)
101{
102 struct vegas *vegas = tcp_ca(tp);
103
104 vegas->doing_vegas_now = 0;
105}
106
107static void tcp_vegas_init(struct tcp_sock *tp)
108{
109 struct vegas *vegas = tcp_ca(tp);
110
111 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp);
113}
114
115/* Do RTT sampling needed for Vegas.
116 * Basically we:
117 * o min-filter RTT samples from within an RTT to get the current
118 * propagation delay + queuing delay (we are min-filtering to try to
119 * avoid the effects of delayed ACKs)
120 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT)
122 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
124{
125 struct vegas *vegas = tcp_ca(tp);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127
128 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT)
130 vegas->baseRTT = vrtt;
131
132 /* Find the min RTT during the last RTT to find
133 * the current prop. delay + queuing delay:
134 */
135 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++;
137}
138
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
140{
141
142 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp);
144 else
145 vegas_disable(tp);
146}
147
148/*
149 * If the connection is idle and we are restarting,
150 * then we don't want to do any Vegas calculations
151 * until we get fresh RTT samples. So when we
152 * restart, we reset our Vegas state to a clean
153 * slate. After we get acks for this flight of
154 * packets, _then_ we can make Vegas calculations
155 * again.
156 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
158{
159 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp);
162}
163
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag)
166{
167 struct vegas *vegas = tcp_ca(tp);
168
169 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
171
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 *
174 * These are so named because they represent the approximate values
175 * of snd_una and snd_nxt at the beginning of the current RTT. More
176 * precisely, they represent the amount of data sent during the RTT.
177 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
178 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
179 * bytes of data have been ACKed during the course of the RTT, giving
180 * an "actual" rate of:
181 *
182 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
183 *
184 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
185 * because delayed ACKs can cover more than one segment, so they
186 * don't line up nicely with the boundaries of RTTs.
187 *
188 * Another unfortunate fact of life is that delayed ACKs delay the
189 * advance of the left edge of our send window, so that the number
190 * of bytes we send in an RTT is often less than our cwnd will allow.
191 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
192 */
193
194 if (after(ack, vegas->beg_snd_nxt)) {
195 /* Do the Vegas once-per-RTT cwnd adjustment. */
196 u32 old_wnd, old_snd_cwnd;
197
198
199 /* Here old_wnd is essentially the window of data that was
200 * sent during the previous RTT, and has all
201 * been acknowledged in the course of the RTT that ended
202 * with the ACK we just received. Likewise, old_snd_cwnd
203 * is the cwnd during the previous RTT.
204 */
205 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
206 tp->mss_cache;
207 old_snd_cwnd = vegas->beg_snd_cwnd;
208
209 /* Save the extent of the current window so we can use this
210 * at the end of the next RTT.
211 */
212 vegas->beg_snd_una = vegas->beg_snd_nxt;
213 vegas->beg_snd_nxt = tp->snd_nxt;
214 vegas->beg_snd_cwnd = tp->snd_cwnd;
215
216 /* Take into account the current RTT sample too, to
217 * decrease the impact of delayed acks. This double counts
218 * this sample since we count it for the next window as well,
219 * but that's not too awful, since we're taking the min,
220 * rather than averaging.
221 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000);
223
224 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got
226 * at least one RTT sample that wasn't from a delayed ACK.
227 * If we only had 2 samples total,
228 * then that means we're getting only 1 ACK per RTT, which
229 * means they're almost certainly delayed ACKs.
230 * If we have 3 samples, we should be OK.
231 */
232
233 if (vegas->cntRTT <= 2) {
234 /* We don't have enough RTT samples to do the Vegas
235 * calculation, so we'll behave like Reno.
236 */
237 if (tp->snd_cwnd > tp->snd_ssthresh)
238 tp->snd_cwnd++;
239 } else {
240 u32 rtt, target_cwnd, diff;
241
242 /* We have enough RTT samples, so, using the Vegas
243 * algorithm, we determine if we should increase or
244 * decrease cwnd, and by how much.
245 */
246
247 /* Pluck out the RTT we are using for the Vegas
248 * calculations. This is the min RTT seen during the
249 * last RTT. Taking the min filters out the effects
250 * of delayed ACKs, at the cost of noticing congestion
251 * a bit later.
252 */
253 rtt = vegas->minRTT;
254
255 /* Calculate the cwnd we should have, if we weren't
256 * going too fast.
257 *
258 * This is:
259 * (actual rate in segments) * baseRTT
260 * We keep it as a fixed point number with
261 * V_PARAM_SHIFT bits to the right of the binary point.
262 */
263 target_cwnd = ((old_wnd * vegas->baseRTT)
264 << V_PARAM_SHIFT) / rtt;
265
266 /* Calculate the difference between the window we had,
267 * and the window we would like to have. This quantity
268 * is the "Diff" from the Arizona Vegas papers.
269 *
270 * Again, this is a fixed point number with
271 * V_PARAM_SHIFT bits to the right of the binary
272 * point.
273 */
274 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
275
276 if (tp->snd_cwnd < tp->snd_ssthresh) {
277 /* Slow start. */
278 if (diff > gamma) {
279 /* Going too fast. Time to slow down
280 * and switch to congestion avoidance.
281 */
282 tp->snd_ssthresh = 2;
283
284 /* Set cwnd to match the actual rate
285 * exactly:
286 * cwnd = (actual rate) * baseRTT
287 * Then we add 1 because the integer
288 * truncation robs us of full link
289 * utilization.
290 */
291 tp->snd_cwnd = min(tp->snd_cwnd,
292 (target_cwnd >>
293 V_PARAM_SHIFT)+1);
294
295 }
296 } else {
297 /* Congestion avoidance. */
298 u32 next_snd_cwnd;
299
300 /* Figure out where we would like cwnd
301 * to be.
302 */
303 if (diff > beta) {
304 /* The old window was too fast, so
305 * we slow down.
306 */
307 next_snd_cwnd = old_snd_cwnd - 1;
308 } else if (diff < alpha) {
309 /* We don't have enough extra packets
310 * in the network, so speed up.
311 */
312 next_snd_cwnd = old_snd_cwnd + 1;
313 } else {
314 /* Sending just as fast as we
315 * should be.
316 */
317 next_snd_cwnd = old_snd_cwnd;
318 }
319
320 /* Adjust cwnd upward or downward, toward the
321 * desired value.
322 */
323 if (next_snd_cwnd > tp->snd_cwnd)
324 tp->snd_cwnd++;
325 else if (next_snd_cwnd < tp->snd_cwnd)
326 tp->snd_cwnd--;
327 }
328 }
329
330 /* Wipe the slate clean for the next RTT. */
331 vegas->cntRTT = 0;
332 vegas->minRTT = 0x7fffffff;
333 }
334
335 /* The following code is executed for every ack we receive,
336 * except for conditions checked in should_advance_cwnd()
337 * before the call to tcp_cong_avoid(). Mainly this means that
338 * we only execute this code if the ack actually acked some
339 * data.
340 */
341
342 /* If we are in slow start, increase our cwnd in response to this ACK.
343 * (If we are not in slow start then we are in congestion avoidance,
344 * and adjust our congestion window only once per RTT. See the code
345 * above.)
346 */
347 if (tp->snd_cwnd <= tp->snd_ssthresh)
348 tp->snd_cwnd++;
349
350 /* to keep cwnd from growing without bound */
351 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
352
353 /* Make sure that we are never so timid as to reduce our cwnd below
354 * 2 MSS.
355 *
356 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
357 */
358 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
359}
360
361/* Extract congestion info for the TCP socket, provided via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
363 struct sk_buff *skb)
364{
365 const struct vegas *ca = tcp_ca(tp);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
367 struct tcpvegas_info *info;
368
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
370 sizeof(*info)));
371
372 info->tcpv_enabled = ca->doing_vegas_now;
373 info->tcpv_rttcnt = ca->cntRTT;
374 info->tcpv_rtt = ca->baseRTT;
375 info->tcpv_minrtt = ca->minRTT;
376 rtattr_failure: ;
377 }
378}
379
380static struct tcp_congestion_ops tcp_vegas = {
381 .init = tcp_vegas_init,
382 .ssthresh = tcp_reno_ssthresh,
383 .cong_avoid = tcp_vegas_cong_avoid,
384 .min_cwnd = tcp_reno_min_cwnd,
385 .rtt_sample = tcp_vegas_rtt_calc,
386 .set_state = tcp_vegas_state,
387 .cwnd_event = tcp_vegas_cwnd_event,
388 .get_info = tcp_vegas_get_info,
389
390 .owner = THIS_MODULE,
391 .name = "vegas",
392};
393
394static int __init tcp_vegas_register(void)
395{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas);
398 return 0;
399}
400
401static void __exit tcp_vegas_unregister(void)
402{
403 tcp_unregister_congestion_control(&tcp_vegas);
404}
405
406module_init(tcp_vegas_register);
407module_exit(tcp_vegas_unregister);
408
409MODULE_AUTHOR("Stephen Hemminger");
410MODULE_LICENSE("GPL");
411MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 0000000000..ef827242c9
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
1/*
2 * TCP Westwood+
3 *
4 * Angelo Dell'Aera: TCP Westwood+ support
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h>
12#include <net/tcp.h>
13
14/* TCP Westwood structure */
15struct westwood {
16 u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
17 u32 bw_est; /* bandwidth estimate */
18 u32 rtt_win_sx; /* here starts a new evaluation... */
19 u32 bk;
20 u32 snd_una; /* used for evaluating the number of acked bytes */
21 u32 cumul_ack;
22 u32 accounted;
23 u32 rtt;
24 u32 rtt_min; /* minimum observed RTT */
25};
26
27
28/* TCP Westwood functions and constants */
29#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
30#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31
32/*
33 * @tcp_westwood_init
34 * This function initializes the fields used in TCP Westwood+;
35 * it is called after the initial SYN, so the sequence numbers
36 * are correct, but for new passive connections we have no
37 * information about RTTmin at this time, so we simply set it to
38 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
39 * over-conservative, since that way we are sure it will be updated
40 * in a consistent way as soon as possible, reasonably within the
41 * first RTT period of the connection lifetime.
42 */
43static void tcp_westwood_init(struct tcp_sock *tp)
44{
45 struct westwood *w = tcp_ca(tp);
46
47 w->bk = 0;
48 w->bw_ns_est = 0;
49 w->bw_est = 0;
50 w->accounted = 0;
51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una;
55}
56
57/*
58 * @westwood_do_filter
59 * Low-pass filter. Implemented using constant coefficients.
60 */
61static inline u32 westwood_do_filter(u32 a, u32 b)
62{
63 return (((7 * a) + b) >> 3);
64}
65
66static inline void westwood_filter(struct westwood *w, u32 delta)
67{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
70}
71
72/*
73 * @westwood_pkts_acked
74 * Called after processing a group of packets,
75 * but all Westwood needs is the last sample of srtt.
76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
78{
79 struct westwood *w = tcp_ca(tp);
80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3;
82}
83
84/*
85 * @westwood_update_window
86 * It updates the RTT evaluation window if it is the right moment to
87 * do so. If so, it calls the filter to evaluate the bandwidth.
88 */
89static void westwood_update_window(struct tcp_sock *tp)
90{
91 struct westwood *w = tcp_ca(tp);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93
94 /*
95 * See if a RTT-window has passed.
96 * Be careful since if RTT is less than
97 * 50ms we don't filter but we continue 'building the sample'.
98 * This minimum limit was chosen because an estimation over such
99 * small time intervals is better avoided...
100 * Obviously on a LAN we reasonably will always have
101 * right_bound = left_bound + WESTWOOD_RTT_MIN
102 */
103 if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
104 westwood_filter(w, delta);
105
106 w->bk = 0;
107 w->rtt_win_sx = tcp_time_stamp;
108 }
109}
110
111/*
112 * @westwood_fast_bw
113 * It is called when we are in the fast path, in particular when
114 * header prediction is successful. In that case the update is
115 * straightforward and doesn't need any particular care.
116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp)
118{
119 struct westwood *w = tcp_ca(tp);
120
121 westwood_update_window(tp);
122
123 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una;
125 w->rtt_min = min(w->rtt, w->rtt_min);
126}
127
128/*
129 * @westwood_acked_count
130 * This function evaluates cumul_ack for evaluating bk in case of
131 * delayed or partial acks.
132 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp)
134{
135 struct westwood *w = tcp_ca(tp);
136
137 w->cumul_ack = tp->snd_una - w->snd_una;
138
139 /* If cumul_ack is 0 this is a dupack since it's not moving
140 * tp->snd_una.
141 */
142 if (!w->cumul_ack) {
143 w->accounted += tp->mss_cache;
144 w->cumul_ack = tp->mss_cache;
145 }
146
147 if (w->cumul_ack > tp->mss_cache) {
148 /* Partial or delayed ack */
149 if (w->accounted >= w->cumul_ack) {
150 w->accounted -= w->cumul_ack;
151 w->cumul_ack = tp->mss_cache;
152 } else {
153 w->cumul_ack -= w->accounted;
154 w->accounted = 0;
155 }
156 }
157
158 w->snd_una = tp->snd_una;
159
160 return w->cumul_ack;
161}
162
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
164{
165 struct westwood *w = tcp_ca(tp);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167}
168
169/*
170 * TCP Westwood
171 * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
172 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
173 * so avoids ever returning 0.
174 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
176{
177 return westwood_bw_rttmin(tp);
178}
179
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
181{
182 struct westwood *w = tcp_ca(tp);
183
184 switch(event) {
185 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp);
187 break;
188
189 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
191 break;
192
193 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp);
195 break;
196
197 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp);
199 w->bk += westwood_acked_count(tp);
200 w->rtt_min = min(w->rtt, w->rtt_min);
201 break;
202
203 default:
204 /* don't care */
205 break;
206 }
207}
208
209
210/* Extract congestion info for the TCP socket, provided via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
212 struct sk_buff *skb)
213{
214 const struct westwood *ca = tcp_ca(tp);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
216 struct rtattr *rta;
217 struct tcpvegas_info *info;
218
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0;
223 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
224 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
225 rtattr_failure: ;
226 }
227}
228
229
230static struct tcp_congestion_ops tcp_westwood = {
231 .init = tcp_westwood_init,
232 .ssthresh = tcp_reno_ssthresh,
233 .cong_avoid = tcp_reno_cong_avoid,
234 .min_cwnd = tcp_westwood_cwnd_min,
235 .cwnd_event = tcp_westwood_event,
236 .get_info = tcp_westwood_info,
237 .pkts_acked = tcp_westwood_pkts_acked,
238
239 .owner = THIS_MODULE,
240 .name = "westwood"
241};
242
243static int __init tcp_westwood_register(void)
244{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood);
247}
248
249static void __exit tcp_westwood_unregister(void)
250{
251 tcp_unregister_congestion_control(&tcp_westwood);
252}
253
254module_init(tcp_westwood_register);
255module_exit(tcp_westwood_unregister);
256
257MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
258MODULE_LICENSE("GPL");
259MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7c24e64b44..dc4d07357e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -628,7 +628,7 @@ back_from_confirm:
628 /* ... which is an evident application bug. --ANK */ 628 /* ... which is an evident application bug. --ANK */
629 release_sock(sk); 629 release_sock(sk);
630 630
631 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); 631 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
632 err = -EINVAL; 632 err = -EINVAL;
633 goto out; 633 goto out;
634 } 634 }
@@ -693,7 +693,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
693 if (unlikely(!up->pending)) { 693 if (unlikely(!up->pending)) {
694 release_sock(sk); 694 release_sock(sk);
695 695
696 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n")); 696 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n"));
697 return -EINVAL; 697 return -EINVAL;
698 } 698 }
699 699
@@ -1102,7 +1102,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1102 skb->ip_summed = CHECKSUM_UNNECESSARY; 1102 skb->ip_summed = CHECKSUM_UNNECESSARY;
1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) 1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1104 return 0; 1104 return 0;
1105 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n")); 1105 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
1106 skb->ip_summed = CHECKSUM_NONE; 1106 skb->ip_summed = CHECKSUM_NONE;
1107 } 1107 }
1108 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 1108 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,14 +1181,13 @@ int udp_rcv(struct sk_buff *skb)
1181 return(0); 1181 return(0);
1182 1182
1183short_packet: 1183short_packet:
1184 NETDEBUG(if (net_ratelimit()) 1184 LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
1185 printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", 1185 NIPQUAD(saddr),
1186 NIPQUAD(saddr), 1186 ntohs(uh->source),
1187 ntohs(uh->source), 1187 ulen,
1188 ulen, 1188 len,
1189 len, 1189 NIPQUAD(daddr),
1190 NIPQUAD(daddr), 1190 ntohs(uh->dest)));
1191 ntohs(uh->dest)));
1192no_header: 1191no_header:
1193 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1192 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1194 kfree_skb(skb); 1193 kfree_skb(skb);
@@ -1199,13 +1198,12 @@ csum_error:
1199 * RFC1122: OK. Discards the bad packet silently (as far as 1198 * RFC1122: OK. Discards the bad packet silently (as far as
1200 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1199 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1201 */ 1200 */
1202 NETDEBUG(if (net_ratelimit()) 1201 LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
1203 printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", 1202 NIPQUAD(saddr),
1204 NIPQUAD(saddr), 1203 ntohs(uh->source),
1205 ntohs(uh->source), 1204 NIPQUAD(daddr),
1206 NIPQUAD(daddr), 1205 ntohs(uh->dest),
1207 ntohs(uh->dest), 1206 ulen));
1208 ulen));
1209drop: 1207drop:
1210 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1208 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1211 kfree_skb(skb); 1209 kfree_skb(skb);
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
deleted file mode 100644
index 6aecd7a435..0000000000
--- a/net/ipv4/utils.c
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Various kernel-resident INET utility functions; mainly
7 * for format conversion and debugging output.
8 *
9 * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
10 *
11 * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area check.
15 * Alan Cox : removed old debugging.
16 * Andi Kleen : add net_ratelimit()
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
26#include <asm/byteorder.h>
27
28/*
29 * Convert an ASCII string to binary IP.
30 */
31
32__u32 in_aton(const char *str)
33{
34 unsigned long l;
35 unsigned int val;
36 int i;
37
38 l = 0;
39 for (i = 0; i < 4; i++)
40 {
41 l <<= 8;
42 if (*str != '\0')
43 {
44 val = 0;
45 while (*str != '\0' && *str != '.')
46 {
47 val *= 10;
48 val += *str - '0';
49 str++;
50 }
51 l |= val;
52 if (*str != '\0')
53 str++;
54 }
55 }
56 return(htonl(l));
57}
58
59EXPORT_SYMBOL(in_aton);
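The removed helper returns its result via htonl(), i.e. already in network byte order. A hypothetical user-space check of that contract, using the libc equivalent rather than the kernel symbol:

/* Not kernel code: illustrates the byte-order contract of in_aton(). */
#include <arpa/inet.h>
#include <assert.h>

int main(void)
{
	/* in_aton("192.168.0.1") equals htonl(0xc0a80001), the same value
	 * inet_addr() returns here: bytes c0.a8.00.01 on the wire. */
	assert(inet_addr("192.168.0.1") == htonl(0xc0a80001u));
	return 0;
}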
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index e1fe360ed2..afbb0d4cc3 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -78,10 +78,9 @@ static int ipip_rcv(struct sk_buff *skb)
78static void ipip_err(struct sk_buff *skb, u32 info) 78static void ipip_err(struct sk_buff *skb, u32 info)
79{ 79{
80 struct xfrm_tunnel *handler = ipip_handler; 80 struct xfrm_tunnel *handler = ipip_handler;
81 u32 arg = info;
82 81
83 if (handler) 82 if (handler)
84 handler->err_handler(skb, &arg); 83 handler->err_handler(skb, info);
85} 84}
86 85
87static int ipip_init_state(struct xfrm_state *x) 86static int ipip_init_state(struct xfrm_state *x)
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index e66ca9381c..ab7a9124f9 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,6 +1,26 @@
1# 1#
2# IPv6 configuration 2# IPv6 configuration
3# 3#
4
5# IPv6 as a module will cause a CRASH if you try to unload it
6config IPV6
7 tristate "The IPv6 protocol"
8 default m
9 select CRYPTO if IPV6_PRIVACY
10 select CRYPTO_MD5 if IPV6_PRIVACY
11 ---help---
12 This is complementary support for IP version 6.
13 You will still be able to do traditional IPv4 networking as well.
14
15 For general information about IPv6, see
16 <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
17 For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
18 For specific information about IPv6 under Linux, read the HOWTO at
19 <http://www.bieringer.de/linux/IPv6/>.
20
21 To compile this protocol support as a module, choose M here: the
22 module will be called ipv6.
23
4config IPV6_PRIVACY 24config IPV6_PRIVACY
5 bool "IPv6: Privacy Extensions (RFC 3041) support" 25 bool "IPv6: Privacy Extensions (RFC 3041) support"
6 depends on IPV6 26 depends on IPV6
@@ -71,7 +91,6 @@ config INET6_TUNNEL
71config IPV6_TUNNEL 91config IPV6_TUNNEL
72 tristate "IPv6: IPv6-in-IPv6 tunnel" 92 tristate "IPv6: IPv6-in-IPv6 tunnel"
73 depends on IPV6 93 depends on IPV6
74 select INET6_TUNNEL
75 ---help--- 94 ---help---
76 Support for IPv6-in-IPv6 tunnels described in RFC 2473. 95 Support for IPv6-in-IPv6 tunnels described in RFC 2473.
77 96
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 14f5c53235..77004b9456 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -57,6 +57,7 @@
57#endif 57#endif
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/notifier.h> 59#include <linux/notifier.h>
60#include <linux/string.h>
60 61
61#include <net/sock.h> 62#include <net/sock.h>
62#include <net/snmp.h> 63#include <net/snmp.h>
@@ -2776,7 +2777,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2776 read_lock_bh(&idev->lock); 2777 read_lock_bh(&idev->lock);
2777 switch (type) { 2778 switch (type) {
2778 case UNICAST_ADDR: 2779 case UNICAST_ADDR:
2779 /* unicast address */ 2780 /* unicast address incl. temp addr */
2780 for (ifa = idev->addr_list; ifa; 2781 for (ifa = idev->addr_list; ifa;
2781 ifa = ifa->if_next, ip_idx++) { 2782 ifa = ifa->if_next, ip_idx++) {
2782 if (ip_idx < s_ip_idx) 2783 if (ip_idx < s_ip_idx)
@@ -2787,19 +2788,6 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2787 NLM_F_MULTI)) <= 0) 2788 NLM_F_MULTI)) <= 0)
2788 goto done; 2789 goto done;
2789 } 2790 }
2790 /* temp addr */
2791#ifdef CONFIG_IPV6_PRIVACY
2792 for (ifa = idev->tempaddr_list; ifa;
2793 ifa = ifa->tmp_next, ip_idx++) {
2794 if (ip_idx < s_ip_idx)
2795 continue;
2796 if ((err = inet6_fill_ifaddr(skb, ifa,
2797 NETLINK_CB(cb->skb).pid,
2798 cb->nlh->nlmsg_seq, RTM_NEWADDR,
2799 NLM_F_MULTI)) <= 0)
2800 goto done;
2801 }
2802#endif
2803 break; 2791 break;
2804 case MULTICAST_ADDR: 2792 case MULTICAST_ADDR:
2805 /* multicast address */ 2793 /* multicast address */
@@ -2922,6 +2910,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
2922 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); 2910 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2923 r = NLMSG_DATA(nlh); 2911 r = NLMSG_DATA(nlh);
2924 r->ifi_family = AF_INET6; 2912 r->ifi_family = AF_INET6;
2913 r->__ifi_pad = 0;
2925 r->ifi_type = dev->type; 2914 r->ifi_type = dev->type;
2926 r->ifi_index = dev->ifindex; 2915 r->ifi_index = dev->ifindex;
2927 r->ifi_flags = dev_get_flags(dev); 2916 r->ifi_flags = dev_get_flags(dev);
@@ -3029,9 +3018,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
3029 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags); 3018 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
3030 pmsg = NLMSG_DATA(nlh); 3019 pmsg = NLMSG_DATA(nlh);
3031 pmsg->prefix_family = AF_INET6; 3020 pmsg->prefix_family = AF_INET6;
3021 pmsg->prefix_pad1 = 0;
3022 pmsg->prefix_pad2 = 0;
3032 pmsg->prefix_ifindex = idev->dev->ifindex; 3023 pmsg->prefix_ifindex = idev->dev->ifindex;
3033 pmsg->prefix_len = pinfo->prefix_len; 3024 pmsg->prefix_len = pinfo->prefix_len;
3034 pmsg->prefix_type = pinfo->type; 3025 pmsg->prefix_type = pinfo->type;
3026 pmsg->prefix_pad3 = 0;
3035 3027
3036 pmsg->prefix_flags = 0; 3028 pmsg->prefix_flags = 0;
3037 if (pinfo->onlink) 3029 if (pinfo->onlink)
@@ -3437,7 +3429,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
3437 * by sysctl and we wouldn't want anyone to change it under our feet 3429 * by sysctl and we wouldn't want anyone to change it under our feet
3438 * (see SIOCSIFNAME). 3430 * (see SIOCSIFNAME).
3439 */ 3431 */
3440 dev_name = net_sysctl_strdup(dev_name); 3432 dev_name = kstrdup(dev_name, GFP_KERNEL);
3441 if (!dev_name) 3433 if (!dev_name)
3442 goto free; 3434 goto free;
3443 3435
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df4..28d9bcab09 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
774 if (if6_proc_init()) 774 if (if6_proc_init())
775 goto proc_if6_fail; 775 goto proc_if6_fail;
776#endif 776#endif
777 ipv6_packet_init();
778 ip6_route_init(); 777 ip6_route_init();
779 ip6_flowlabel_init(); 778 ip6_flowlabel_init();
780 err = addrconf_init(); 779 err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
791 /* Init v6 transport protocols. */ 790 /* Init v6 transport protocols. */
792 udpv6_init(); 791 udpv6_init();
793 tcpv6_init(); 792 tcpv6_init();
793
794 ipv6_packet_init();
794 err = 0; 795 err = 0;
795out: 796out:
796 return err; 797 return err;
@@ -798,7 +799,6 @@ out:
798addrconf_fail: 799addrconf_fail:
799 ip6_flowlabel_cleanup(); 800 ip6_flowlabel_cleanup();
800 ip6_route_cleanup(); 801 ip6_route_cleanup();
801 ipv6_packet_cleanup();
802#ifdef CONFIG_PROC_FS 802#ifdef CONFIG_PROC_FS
803 if6_proc_exit(); 803 if6_proc_exit();
804proc_if6_fail: 804proc_if6_fail:
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499de..b6c73da5ff 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
244 opt_space->opt_nflen = 0; 244 opt_space->opt_nflen = 0;
245 } 245 }
246 opt_space->dst1opt = fopt->dst1opt; 246 opt_space->dst1opt = fopt->dst1opt;
247 opt_space->auth = fopt->auth;
248 opt_space->opt_flen = fopt->opt_flen; 247 opt_space->opt_flen = fopt->opt_flen;
249 return opt_space; 248 return opt_space;
250} 249}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 866f10726c..10fbb50dae 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -198,12 +198,13 @@ resubmit:
198 if (!raw_sk) { 198 if (!raw_sk) {
199 if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { 199 if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
200 IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); 200 IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
201 icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); 201 icmpv6_send(skb, ICMPV6_PARAMPROB,
202 ICMPV6_UNK_NEXTHDR, nhoff,
203 skb->dev);
202 } 204 }
203 } else { 205 } else
204 IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); 206 IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
205 kfree_skb(skb); 207 kfree_skb(skb);
206 }
207 } 208 }
208 rcu_read_unlock(); 209 rcu_read_unlock();
209 return 0; 210 return 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeed..ae652ca14b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
465 to->pkt_type = from->pkt_type; 465 to->pkt_type = from->pkt_type;
466 to->priority = from->priority; 466 to->priority = from->priority;
467 to->protocol = from->protocol; 467 to->protocol = from->protocol;
468 to->security = from->security;
469 dst_release(to->dst); 468 dst_release(to->dst);
470 to->dst = dst_clone(from->dst); 469 to->dst = dst_clone(from->dst);
471 to->dev = from->dev; 470 to->dev = from->dev;
@@ -793,13 +792,8 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
793 if (ipv6_addr_any(&fl->fl6_src)) { 792 if (ipv6_addr_any(&fl->fl6_src)) {
794 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); 793 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
795 794
796 if (err) { 795 if (err)
797#if IP6_DEBUG >= 2
798 printk(KERN_DEBUG "ip6_dst_lookup: "
799 "no available source address\n");
800#endif
801 goto out_err_release; 796 goto out_err_release;
802 }
803 } 797 }
804 798
805 return 0; 799 return 0;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ba3b0c267f..0961372940 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1110,11 +1110,39 @@ ip6ip6_fb_tnl_dev_init(struct net_device *dev)
1110 return 0; 1110 return 0;
1111} 1111}
1112 1112
1113#ifdef CONFIG_INET6_TUNNEL
1113static struct xfrm6_tunnel ip6ip6_handler = { 1114static struct xfrm6_tunnel ip6ip6_handler = {
1114 .handler = ip6ip6_rcv, 1115 .handler = ip6ip6_rcv,
1115 .err_handler = ip6ip6_err, 1116 .err_handler = ip6ip6_err,
1116}; 1117};
1117 1118
1119static inline int ip6ip6_register(void)
1120{
1121 return xfrm6_tunnel_register(&ip6ip6_handler);
1122}
1123
1124static inline int ip6ip6_unregister(void)
1125{
1126 return xfrm6_tunnel_deregister(&ip6ip6_handler);
1127}
1128#else
1129static struct inet6_protocol xfrm6_tunnel_protocol = {
1130 .handler = ip6ip6_rcv,
1131 .err_handler = ip6ip6_err,
1132 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
1133};
1134
1135static inline int ip6ip6_register(void)
1136{
1137 return inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
1138}
1139
1140static inline int ip6ip6_unregister(void)
1141{
1142 return inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
1143}
1144#endif
1145
1118/** 1146/**
1119 * ip6_tunnel_init - register protocol and reserve needed resources 1147 * ip6_tunnel_init - register protocol and reserve needed resources
1120 * 1148 *
@@ -1125,7 +1153,7 @@ static int __init ip6_tunnel_init(void)
1125{ 1153{
1126 int err; 1154 int err;
1127 1155
1128 if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) { 1156 if (ip6ip6_register() < 0) {
1129 printk(KERN_ERR "ip6ip6 init: can't register tunnel\n"); 1157 printk(KERN_ERR "ip6ip6 init: can't register tunnel\n");
1130 return -EAGAIN; 1158 return -EAGAIN;
1131 } 1159 }
@@ -1144,7 +1172,7 @@ static int __init ip6_tunnel_init(void)
1144 } 1172 }
1145 return 0; 1173 return 0;
1146fail: 1174fail:
1147 xfrm6_tunnel_deregister(&ip6ip6_handler); 1175 ip6ip6_unregister();
1148 return err; 1176 return err;
1149} 1177}
1150 1178
@@ -1154,7 +1182,7 @@ fail:
1154 1182
1155static void __exit ip6_tunnel_cleanup(void) 1183static void __exit ip6_tunnel_cleanup(void)
1156{ 1184{
1157 if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0) 1185 if (ip6ip6_unregister() < 0)
1158 printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n"); 1186 printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n");
1159 1187
1160 unregister_netdev(ip6ip6_fb_tnl_dev); 1188 unregister_netdev(ip6ip6_fb_tnl_dev);
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 423feb46cc..135383ef53 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -354,7 +354,7 @@ static struct crypto_tfm **ipcomp6_alloc_tfms(const char *alg_name)
354 int cpu; 354 int cpu;
355 355
356 /* This can be any valid CPU ID so we don't need locking. */ 356 /* This can be any valid CPU ID so we don't need locking. */
357 cpu = smp_processor_id(); 357 cpu = raw_smp_processor_id();
358 358
359 list_for_each_entry(pos, &ipcomp6_tfms_list, list) { 359 list_for_each_entry(pos, &ipcomp6_tfms_list, list) {
360 struct crypto_tfm *tfm; 360 struct crypto_tfm *tfm;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f3ef4c38d3..3bc144a79f 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -504,6 +504,9 @@ done:
504 break; 504 break;
505 case IPV6_IPSEC_POLICY: 505 case IPV6_IPSEC_POLICY:
506 case IPV6_XFRM_POLICY: 506 case IPV6_XFRM_POLICY:
507 retv = -EPERM;
508 if (!capable(CAP_NET_ADMIN))
509 break;
507 retv = xfrm_user_policy(sk, optname, optval, optlen); 510 retv = xfrm_user_policy(sk, optname, optval, optlen);
508 break; 511 break;
509 512
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 562fcd14fd..29fed6e58d 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -281,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
281 } 281 }
282 write_unlock_bh(&ipv6_sk_mc_lock); 282 write_unlock_bh(&ipv6_sk_mc_lock);
283 283
284 return -ENOENT; 284 return -EADDRNOTAVAIL;
285} 285}
286 286
287static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) 287static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex)
@@ -386,12 +386,16 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
386 if (ipv6_addr_equal(&pmc->addr, group)) 386 if (ipv6_addr_equal(&pmc->addr, group))
387 break; 387 break;
388 } 388 }
389 if (!pmc) /* must have a prior join */ 389 if (!pmc) { /* must have a prior join */
390 err = -EINVAL;
390 goto done; 391 goto done;
392 }
391 /* if a source filter was set, must be the same mode as before */ 393 /* if a source filter was set, must be the same mode as before */
392 if (pmc->sflist) { 394 if (pmc->sflist) {
393 if (pmc->sfmode != omode) 395 if (pmc->sfmode != omode) {
396 err = -EINVAL;
394 goto done; 397 goto done;
398 }
395 } else if (pmc->sfmode != omode) { 399 } else if (pmc->sfmode != omode) {
396 /* allow mode switches for empty-set filters */ 400 /* allow mode switches for empty-set filters */
397 ip6_mc_add_src(idev, group, omode, 0, NULL, 0); 401 ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
@@ -402,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
402 psl = pmc->sflist; 406 psl = pmc->sflist;
403 if (!add) { 407 if (!add) {
404 if (!psl) 408 if (!psl)
405 goto done; 409 goto done; /* err = -EADDRNOTAVAIL */
406 rv = !0; 410 rv = !0;
407 for (i=0; i<psl->sl_count; i++) { 411 for (i=0; i<psl->sl_count; i++) {
408 rv = memcmp(&psl->sl_addr[i], source, 412 rv = memcmp(&psl->sl_addr[i], source,
@@ -411,7 +415,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
411 break; 415 break;
412 } 416 }
413 if (rv) /* source not found */ 417 if (rv) /* source not found */
414 goto done; 418 goto done; /* err = -EADDRNOTAVAIL */
415 419
416 /* special case - (INCLUDE, empty) == LEAVE_GROUP */ 420 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
417 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { 421 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
@@ -488,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
488 struct inet6_dev *idev; 492 struct inet6_dev *idev;
489 struct ipv6_pinfo *inet6 = inet6_sk(sk); 493 struct ipv6_pinfo *inet6 = inet6_sk(sk);
490 struct ip6_sf_socklist *newpsl, *psl; 494 struct ip6_sf_socklist *newpsl, *psl;
495 int leavegroup = 0;
491 int i, err; 496 int i, err;
492 497
493 group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; 498 group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
@@ -503,7 +508,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
503 if (!idev) 508 if (!idev)
504 return -ENODEV; 509 return -ENODEV;
505 dev = idev->dev; 510 dev = idev->dev;
506 err = -EADDRNOTAVAIL; 511
512 err = 0;
513 if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
514 leavegroup = 1;
515 goto done;
516 }
507 517
508 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { 518 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
509 if (pmc->ifindex != gsf->gf_interface) 519 if (pmc->ifindex != gsf->gf_interface)
@@ -511,8 +521,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
511 if (ipv6_addr_equal(&pmc->addr, group)) 521 if (ipv6_addr_equal(&pmc->addr, group))
512 break; 522 break;
513 } 523 }
514 if (!pmc) /* must have a prior join */ 524 if (!pmc) { /* must have a prior join */
525 err = -EINVAL;
515 goto done; 526 goto done;
527 }
516 if (gsf->gf_numsrc) { 528 if (gsf->gf_numsrc) {
517 newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, 529 newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk,
518 IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); 530 IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC);
@@ -544,10 +556,13 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
544 (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); 556 (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
545 pmc->sflist = newpsl; 557 pmc->sflist = newpsl;
546 pmc->sfmode = gsf->gf_fmode; 558 pmc->sfmode = gsf->gf_fmode;
559 err = 0;
547done: 560done:
548 read_unlock_bh(&idev->lock); 561 read_unlock_bh(&idev->lock);
549 in6_dev_put(idev); 562 in6_dev_put(idev);
550 dev_put(dev); 563 dev_put(dev);
564 if (leavegroup)
565 err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
551 return err; 566 return err;
552} 567}
553 568
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 750943e2d3..a16df5b27c 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -76,7 +76,9 @@ static DECLARE_MUTEX(ipqnl_sem);
76static void 76static void
77ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) 77ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
78{ 78{
79 local_bh_disable();
79 nf_reinject(entry->skb, entry->info, verdict); 80 nf_reinject(entry->skb, entry->info, verdict);
81 local_bh_enable();
80 kfree(entry); 82 kfree(entry);
81} 83}
82 84
@@ -209,6 +211,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
209 break; 211 break;
210 212
211 case IPQ_COPY_PACKET: 213 case IPQ_COPY_PACKET:
214 if (entry->skb->ip_summed == CHECKSUM_HW &&
215 (*errp = skb_checksum_help(entry->skb,
216 entry->info->outdev == NULL))) {
217 read_unlock_bh(&queue_lock);
218 return NULL;
219 }
212 if (copy_range == 0 || copy_range > entry->skb->len) 220 if (copy_range == 0 || copy_range > entry->skb->len)
213 data_len = entry->skb->len; 221 data_len = entry->skb->len;
214 else 222 else
@@ -379,6 +387,7 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
379 if (!skb_ip_make_writable(&e->skb, v->data_len)) 387 if (!skb_ip_make_writable(&e->skb, v->data_len))
380 return -ENOMEM; 388 return -ENOMEM;
381 memcpy(e->skb->data, v->payload, v->data_len); 389 memcpy(e->skb->data, v->payload, v->data_len);
390 e->skb->ip_summed = CHECKSUM_NONE;
382 e->skb->nfcache |= NFC_ALTERED; 391 e->skb->nfcache |= NFC_ALTERED;
383 392
384 /* 393 /*
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index c44685e391..a692e26a4f 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -373,9 +373,10 @@ ip6t_log_packet(unsigned int hooknum,
373 in ? in->name : "", 373 in ? in->name : "",
374 out ? out->name : ""); 374 out ? out->name : "");
375 if (in && !out) { 375 if (in && !out) {
376 unsigned int len;
376 /* MAC logging for input chain only. */ 377 /* MAC logging for input chain only. */
377 printk("MAC="); 378 printk("MAC=");
378 if (skb->dev && skb->dev->hard_header_len && 379 if (skb->dev && (len = skb->dev->hard_header_len) &&
379 skb->mac.raw != skb->nh.raw) { 380 skb->mac.raw != skb->nh.raw) {
380 unsigned char *p = skb->mac.raw; 381 unsigned char *p = skb->mac.raw;
381 int i; 382 int i;
@@ -384,9 +385,11 @@ ip6t_log_packet(unsigned int hooknum,
384 (p -= ETH_HLEN) < skb->head) 385 (p -= ETH_HLEN) < skb->head)
385 p = NULL; 386 p = NULL;
386 387
387 if (p != NULL) 388 if (p != NULL) {
388 for (i = 0; i < skb->dev->hard_header_len; i++) 389 for (i = 0; i < len; i++)
389 printk("%02x", p[i]); 390 printk("%02x%s", p[i],
391 i == len - 1 ? "" : ":");
392 }
390 printk(" "); 393 printk(" ");
391 394
392 if (skb->dev->type == ARPHRD_SIT) { 395 if (skb->dev->type == ARPHRD_SIT) {
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e2b848ec98..1d4d75b34d 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -328,6 +328,8 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
328 328
329 if (skb->ip_summed != CHECKSUM_UNNECESSARY) { 329 if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
330 if (skb->ip_summed == CHECKSUM_HW) { 330 if (skb->ip_summed == CHECKSUM_HW) {
331 skb_postpull_rcsum(skb, skb->nh.raw,
332 skb->h.raw - skb->nh.raw);
331 skb->ip_summed = CHECKSUM_UNNECESSARY; 333 skb->ip_summed = CHECKSUM_UNNECESSARY;
332 if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, 334 if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
333 &skb->nh.ipv6h->daddr, 335 &skb->nh.ipv6h->daddr,
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b788f55e13..e553e5b80d 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -195,7 +195,6 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int
195 dev_hold(dev); 195 dev_hold(dev);
196 196
197 ipip6_tunnel_link(nt); 197 ipip6_tunnel_link(nt);
198 /* Do not decrement MOD_USE_COUNT here. */
199 return nt; 198 return nt;
200 199
201failed: 200failed:
@@ -794,10 +793,28 @@ static struct net_protocol sit_protocol = {
794 .err_handler = ipip6_err, 793 .err_handler = ipip6_err,
795}; 794};
796 795
796static void __exit sit_destroy_tunnels(void)
797{
798 int prio;
799
800 for (prio = 1; prio < 4; prio++) {
801 int h;
802 for (h = 0; h < HASH_SIZE; h++) {
803 struct ip_tunnel *t;
804 while ((t = tunnels[prio][h]) != NULL)
805 unregister_netdevice(t->dev);
806 }
807 }
808}
809
797void __exit sit_cleanup(void) 810void __exit sit_cleanup(void)
798{ 811{
799 inet_del_protocol(&sit_protocol, IPPROTO_IPV6); 812 inet_del_protocol(&sit_protocol, IPPROTO_IPV6);
800 unregister_netdev(ipip6_fb_tunnel_dev); 813
814 rtnl_lock();
815 sit_destroy_tunnels();
816 unregister_netdevice(ipip6_fb_tunnel_dev);
817 rtnl_unlock();
801} 818}
802 819
803int __init sit_init(void) 820int __init sit_init(void)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2414937f2a..ef29cfd936 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -158,9 +158,14 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
158 tcp_port_rover = rover; 158 tcp_port_rover = rover;
159 spin_unlock(&tcp_portalloc_lock); 159 spin_unlock(&tcp_portalloc_lock);
160 160
161 /* Exhausted local port range during search? */ 161 /* Exhausted local port range during search? It is not
162 * possible for us to be holding one of the bind hash
163 * locks if this test triggers, because if 'remaining'
164 * drops to zero, we broke out of the do/while loop at
165 * the top level, not from the 'break;' statement.
166 */
162 ret = 1; 167 ret = 1;
163 if (remaining <= 0) 168 if (unlikely(remaining <= 0))
164 goto fail; 169 goto fail;
165 170
166 /* OK, here is the one we will use. */ 171 /* OK, here is the one we will use. */
@@ -2018,14 +2023,14 @@ static int tcp_v6_init_sock(struct sock *sk)
2018 */ 2023 */
2019 tp->snd_ssthresh = 0x7fffffff; 2024 tp->snd_ssthresh = 0x7fffffff;
2020 tp->snd_cwnd_clamp = ~0; 2025 tp->snd_cwnd_clamp = ~0;
2021 tp->mss_cache_std = tp->mss_cache = 536; 2026 tp->mss_cache = 536;
2022 2027
2023 tp->reordering = sysctl_tcp_reordering; 2028 tp->reordering = sysctl_tcp_reordering;
2024 2029
2025 sk->sk_state = TCP_CLOSE; 2030 sk->sk_state = TCP_CLOSE;
2026 2031
2027 tp->af_specific = &ipv6_specific; 2032 tp->af_specific = &ipv6_specific;
2028 2033 tp->ca_ops = &tcp_init_congestion_ops;
2029 sk->sk_write_space = sk_stream_write_space; 2034 sk->sk_write_space = sk_stream_write_space;
2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2035 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2031 2036
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index a16237c0e7..980a826f5d 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -1,6 +1,39 @@
1# 1#
2# IPX configuration 2# IPX configuration
3# 3#
4config IPX
5 tristate "The IPX protocol"
6 select LLC
7 ---help---
8 This is support for the Novell networking protocol, IPX, commonly
9 used for local networks of Windows machines. You need it if you
10 want to access Novell NetWare file or print servers using the Linux
11 Novell client ncpfs (available from
12 <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
13 within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>). In order
15 to do the former, you'll also have to say Y to "NCP file system
16 support", below.
17
18 IPX is similar in scope to IP, while SPX, which runs on top of IPX,
19 is similar to TCP. There is also experimental support for SPX in
20 Linux (see "SPX networking", below).
21
22 To turn your Linux box into a fully featured NetWare file server and
23 IPX router, say Y here and fetch either lwared from
24 <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
25 mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
26 information, read the IPX-HOWTO available from
27 <http://www.tldp.org/docs.html#howto>.
28
29 General information about how to connect Linux, Windows machines and
30 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
31
32 The IPX driver would enlarge your kernel by about 16 KB. To compile
33 this driver as a module, choose M here: the module will be called ipx.
34 Unless you want to integrate your Linux box with a local Novell
35 network, say N.
36
4config IPX_INTERN 37config IPX_INTERN
5 bool "IPX: Full internal IPX network" 38 bool "IPX: Full internal IPX network"
6 depends on IPX 39 depends on IPX
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
index 046ad0750e..7029618f57 100644
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self)
445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;); 445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
446 446
447 /* Don't disconnect until all data frames are successfully sent */ 447 /* Don't disconnect until all data frames are successfully sent */
448 if (skb_queue_len(&self->txq) > 0) { 448 if (!skb_queue_empty(&self->txq)) {
449 self->disconnect_pending = TRUE; 449 self->disconnect_pending = TRUE;
450
451 return; 450 return;
452 } 451 }
453 452
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
index 1cd89f5f3b..a505b54576 100644
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
191 * Send out the RR frames faster if our own transmit queue is empty, or 191 * Send out the RR frames faster if our own transmit queue is empty, or
192 * if the peer is busy. The effect is a much faster conversation 192 * if the peer is busy. The effect is a much faster conversation
193 */ 193 */
194 if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) { 194 if (skb_queue_empty(&self->txq) || self->remote_busy) {
195 if (self->fast_RR == TRUE) { 195 if (self->fast_RR == TRUE) {
196 /* 196 /*
197 * Assert that the fast poll timer has not reached the 197 * Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, 263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
264 skb_queue_len(&self->txq)); 264 skb_queue_len(&self->txq));
265 265
266 if (skb_queue_len(&self->txq)) { 266 if (!skb_queue_empty(&self->txq)) {
267 /* Prevent race conditions with irlap_data_request() */ 267 /* Prevent race conditions with irlap_data_request() */
268 self->local_busy = TRUE; 268 self->local_busy = TRUE;
269 269
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1075 /* Window has been adjusted for the max packet 1075 /* Window has been adjusted for the max packet
1076 * size, so much simpler... - Jean II */ 1076 * size, so much simpler... - Jean II */
1077 nextfit = (skb_queue_len(&self->txq) > 0); 1077 nextfit = !skb_queue_empty(&self->txq);
1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1079 /* 1079 /*
1080 * Send data with poll bit cleared only if window > 1 1080 * Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1815 /* Window has been adjusted for the max packet 1815 /* Window has been adjusted for the max packet
1816 * size, so much simpler... - Jean II */ 1816 * size, so much simpler... - Jean II */
1817 nextfit = (skb_queue_len(&self->txq) > 0); 1817 nextfit = !skb_queue_empty(&self->txq);
1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1819 /* 1819 /*
1820 * Send data with final bit cleared only if window > 1 1820 * Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
1937 irlap_data_indication(self, skb, FALSE); 1937 irlap_data_indication(self, skb, FALSE);
1938 1938
1939 /* Any pending data requests? */ 1939 /* Any pending data requests? */
1940 if ((skb_queue_len(&self->txq) > 0) && 1940 if (!skb_queue_empty(&self->txq) &&
1941 (self->window > 0)) 1941 (self->window > 0))
1942 { 1942 {
1943 self->ack_required = TRUE; 1943 self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2038 /* 2038 /*
2039 * Any pending data requests? 2039 * Any pending data requests?
2040 */ 2040 */
2041 if ((skb_queue_len(&self->txq) > 0) && 2041 if (!skb_queue_empty(&self->txq) &&
2042 (self->window > 0) && !self->remote_busy) 2042 (self->window > 0) && !self->remote_busy)
2043 { 2043 {
2044 irlap_data_indication(self, skb, TRUE); 2044 irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2069 */ 2069 */
2070 nr_status = irlap_validate_nr_received(self, info->nr); 2070 nr_status = irlap_validate_nr_received(self, info->nr);
2071 if (nr_status == NR_EXPECTED) { 2071 if (nr_status == NR_EXPECTED) {
2072 if ((skb_queue_len( &self->txq) > 0) && 2072 if (!skb_queue_empty(&self->txq) &&
2073 (self->window > 0)) { 2073 (self->window > 0)) {
2074 self->remote_busy = FALSE; 2074 self->remote_busy = FALSE;
2075 2075
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 040abe714a..6dafbb43b5 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1018 /* 1018 /*
1019 * We can now fill the window with additional data frames 1019 * We can now fill the window with additional data frames
1020 */ 1020 */
1021 while (skb_queue_len( &self->txq) > 0) { 1021 while (!skb_queue_empty(&self->txq)) {
1022 1022
1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__); 1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
1024 if ((skb_queue_len( &self->txq) > 0) && 1024 if (self->window > 0) {
1025 (self->window > 0)) {
1026 skb = skb_dequeue( &self->txq); 1025 skb = skb_dequeue( &self->txq);
1027 IRDA_ASSERT(skb != NULL, return;); 1026 IRDA_ASSERT(skb != NULL, return;);
1028 1027
@@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1031 * bit cleared 1030 * bit cleared
1032 */ 1031 */
1033 if ((self->window > 1) && 1032 if ((self->window > 1) &&
1034 skb_queue_len(&self->txq) > 0) 1033 !skb_queue_empty(&self->txq)) {
1035 {
1036 irlap_send_data_primary(self, skb); 1034 irlap_send_data_primary(self, skb);
1037 } else { 1035 } else {
1038 irlap_send_data_primary_poll(self, skb); 1036 irlap_send_data_primary_poll(self, skb);
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index d091ccf773..6602d901f8 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
1513 /* 1513 /*
1514 * Check if there is still data segments in the transmit queue 1514 * Check if there is still data segments in the transmit queue
1515 */ 1515 */
1516 if (skb_queue_len(&self->tx_queue) > 0) { 1516 if (!skb_queue_empty(&self->tx_queue)) {
1517 if (priority == P_HIGH) { 1517 if (priority == P_HIGH) {
1518 /* 1518 /*
1519 * No need to send the queued data, if we are 1519 * No need to send the queued data, if we are
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 0000000000..f0b5efb31a
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,22 @@
1#
 2# LAPB Data Link Driver
3#
4
5config LAPB
6 tristate "LAPB Data Link Driver (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
10 the lower) part of the X.25 protocol. It offers a reliable
11 connection service to exchange data frames with one other host, and
12 it is used to transport higher level protocols (mostly X.25 Packet
13 Layer, the higher part of X.25, but others are possible as well).
14 Usually, LAPB is used with specialized X.21 network cards, but Linux
15 currently supports LAPB only over Ethernet connections. If you want
16 to use LAPB connections over Ethernet, say Y here and to "LAPB over
17 Ethernet driver" below. Read
18 <file:Documentation/networking/lapb-module.txt> for technical
19 details.
20
21 To compile this driver as a module, choose M here: the
22 module will be called lapb. If unsure, say N.
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index cd130c3b72..d5bdb53a34 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
84 if (llc->dev->flags & IFF_LOOPBACK) 84 if (llc->dev->flags & IFF_LOOPBACK)
85 goto out; 85 goto out;
86 rc = 1; 86 rc = 1;
87 if (!skb_queue_len(&llc->pdu_unack_q)) 87 if (skb_queue_empty(&llc->pdu_unack_q))
88 goto out; 88 goto out;
89 skb = skb_peek(&llc->pdu_unack_q); 89 skb = skb_peek(&llc->pdu_unack_q);
90 pdu = llc_pdu_sn_hdr(skb); 90 pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 70bcd4744d..ff774a06c8 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -315,8 +315,8 @@ err:
315static void netlink_remove(struct sock *sk) 315static void netlink_remove(struct sock *sk)
316{ 316{
317 netlink_table_grab(); 317 netlink_table_grab();
318 nl_table[sk->sk_protocol].hash.entries--; 318 if (sk_del_node_init(sk))
319 sk_del_node_init(sk); 319 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 320 if (nlk_sk(sk)->groups)
321 __sk_del_bind_node(sk); 321 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 322 netlink_table_ungrab();
@@ -429,7 +429,12 @@ retry:
429 err = netlink_insert(sk, pid); 429 err = netlink_insert(sk, pid);
430 if (err == -EADDRINUSE) 430 if (err == -EADDRINUSE)
431 goto retry; 431 goto retry;
432 return 0; 432
433 /* If 2 threads race to autobind, that is fine. */
434 if (err == -EBUSY)
435 err = 0;
436
437 return err;
433} 438}
434 439
435static inline int netlink_capable(struct socket *sock, unsigned int flag) 440static inline int netlink_capable(struct socket *sock, unsigned int flag)
@@ -643,7 +648,8 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
643 sock_put(sk); 648 sock_put(sk);
644} 649}
645 650
646static inline struct sk_buff *netlink_trim(struct sk_buff *skb, int allocation) 651static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
652 unsigned int __nocast allocation)
647{ 653{
648 int delta; 654 int delta;
649 655
@@ -712,7 +718,7 @@ struct netlink_broadcast_data {
712 int failure; 718 int failure;
713 int congested; 719 int congested;
714 int delivered; 720 int delivered;
715 int allocation; 721 unsigned int allocation;
716 struct sk_buff *skb, *skb2; 722 struct sk_buff *skb, *skb2;
717}; 723};
718 724
@@ -853,7 +859,7 @@ static inline void netlink_rcv_wake(struct sock *sk)
853{ 859{
854 struct netlink_sock *nlk = nlk_sk(sk); 860 struct netlink_sock *nlk = nlk_sk(sk);
855 861
856 if (!skb_queue_len(&sk->sk_receive_queue)) 862 if (skb_queue_empty(&sk->sk_receive_queue))
857 clear_bit(0, &nlk->state); 863 clear_bit(0, &nlk->state);
858 if (!test_bit(0, &nlk->state)) 864 if (!test_bit(0, &nlk->state))
859 wake_up_interruptible(&nlk->wait); 865 wake_up_interruptible(&nlk->wait);
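The netlink_autobind hunk above stops treating -EBUSY from a racing insert as an error: if two threads autobind the same socket concurrently, the loser's insert fails with -EBUSY but the socket still ends up bound, so the failure is folded into success. A minimal sketch of that pattern, using illustrative stand-ins rather than the kernel's netlink internals:

    /* Sketch: tolerate -EBUSY when two callers race to bind the same socket. */
    #include <stdio.h>
    #include <errno.h>

    struct nl_sock { int bound_pid; };

    static int table_insert(struct nl_sock *sk, int pid)
    {
        if (sk->bound_pid)
            return -EBUSY;          /* someone else won the race */
        sk->bound_pid = pid;
        return 0;
    }

    static int autobind(struct nl_sock *sk, int pid)
    {
        int err = table_insert(sk, pid);

        if (err == -EBUSY)          /* racing autobind: socket is bound anyway */
            err = 0;
        return err;
    }

    int main(void)
    {
        struct nl_sock sk = { 0 };
        printf("first:  %d\n", autobind(&sk, 1234));
        printf("second: %d\n", autobind(&sk, 5678));  /* loser still gets 0 */
        return 0;
    }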
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 31ed4a9a1d..162a85fed1 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -459,12 +459,7 @@ static struct sock *nr_make_new(struct sock *osk)
459 sk->sk_sndbuf = osk->sk_sndbuf; 459 sk->sk_sndbuf = osk->sk_sndbuf;
460 sk->sk_state = TCP_ESTABLISHED; 460 sk->sk_state = TCP_ESTABLISHED;
461 sk->sk_sleep = osk->sk_sleep; 461 sk->sk_sleep = osk->sk_sleep;
462 462 sock_copy_flags(sk, osk);
463 if (sock_flag(osk, SOCK_ZAPPED))
464 sock_set_flag(sk, SOCK_ZAPPED);
465
466 if (sock_flag(osk, SOCK_DBG))
467 sock_set_flag(sk, SOCK_DBG);
468 463
469 skb_queue_head_init(&nr->ack_queue); 464 skb_queue_head_init(&nr->ack_queue);
470 skb_queue_head_init(&nr->reseq_queue); 465 skb_queue_head_init(&nr->reseq_queue);
@@ -541,7 +536,8 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
541 struct nr_sock *nr = nr_sk(sk); 536 struct nr_sock *nr = nr_sk(sk);
542 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; 537 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
543 struct net_device *dev; 538 struct net_device *dev;
544 ax25_address *user, *source; 539 ax25_uid_assoc *user;
540 ax25_address *source;
545 541
546 lock_sock(sk); 542 lock_sock(sk);
547 if (!sock_flag(sk, SOCK_ZAPPED)) { 543 if (!sock_flag(sk, SOCK_ZAPPED)) {
@@ -580,16 +576,19 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
580 } else { 576 } else {
581 source = &addr->fsa_ax25.sax25_call; 577 source = &addr->fsa_ax25.sax25_call;
582 578
583 if ((user = ax25_findbyuid(current->euid)) == NULL) { 579 user = ax25_findbyuid(current->euid);
580 if (user) {
581 nr->user_addr = user->call;
582 ax25_uid_put(user);
583 } else {
584 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { 584 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
585 release_sock(sk); 585 release_sock(sk);
586 dev_put(dev); 586 dev_put(dev);
587 return -EPERM; 587 return -EPERM;
588 } 588 }
589 user = source; 589 nr->user_addr = *source;
590 } 590 }
591 591
592 nr->user_addr = *user;
593 nr->source_addr = *source; 592 nr->source_addr = *source;
594 } 593 }
595 594
@@ -609,7 +608,8 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
609 struct sock *sk = sock->sk; 608 struct sock *sk = sock->sk;
610 struct nr_sock *nr = nr_sk(sk); 609 struct nr_sock *nr = nr_sk(sk);
611 struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; 610 struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr;
612 ax25_address *user, *source = NULL; 611 ax25_address *source = NULL;
612 ax25_uid_assoc *user;
613 struct net_device *dev; 613 struct net_device *dev;
614 614
615 lock_sock(sk); 615 lock_sock(sk);
@@ -650,16 +650,19 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
650 } 650 }
651 source = (ax25_address *)dev->dev_addr; 651 source = (ax25_address *)dev->dev_addr;
652 652
653 if ((user = ax25_findbyuid(current->euid)) == NULL) { 653 user = ax25_findbyuid(current->euid);
654 if (user) {
655 nr->user_addr = user->call;
656 ax25_uid_put(user);
657 } else {
654 if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { 658 if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
655 dev_put(dev); 659 dev_put(dev);
656 release_sock(sk); 660 release_sock(sk);
657 return -EPERM; 661 return -EPERM;
658 } 662 }
659 user = source; 663 nr->user_addr = *source;
660 } 664 }
661 665
662 nr->user_addr = *user;
663 nr->source_addr = *source; 666 nr->source_addr = *source;
664 nr->device = dev; 667 nr->device = dev;
665 668
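The NET/ROM (and, below, ROSE) hunks switch from ax25_findbyuid() returning a bare address to returning a reference-counted ax25_uid_assoc: the caller copies the callsign it needs and then drops the reference with ax25_uid_put(). A minimal sketch of that lookup/copy/put pattern, with illustrative stand-in names rather than the real AX.25 structures:

    /* Sketch: refcounted lookup; copy what you need, then put the reference. */
    #include <stdio.h>

    struct callsign { char s[8]; };

    struct uid_assoc {
        int refcount;
        struct callsign call;
    };

    static struct uid_assoc table_entry = { 1, { "LINUX-1" } };

    static struct uid_assoc *uid_find(unsigned int uid)
    {
        if (uid != 1000)
            return NULL;            /* no mapping for this uid */
        table_entry.refcount++;     /* lookup takes a reference */
        return &table_entry;
    }

    static void uid_put(struct uid_assoc *u)
    {
        u->refcount--;              /* caller drops it when done */
    }

    int main(void)
    {
        struct callsign source = { "NOCALL" }, user_addr;
        struct uid_assoc *user = uid_find(1000);

        if (user) {
            user_addr = user->call; /* copy out while holding the ref */
            uid_put(user);
        } else {
            user_addr = source;     /* fall back to the device callsign */
        }
        printf("bound as %s\n", user_addr.s);
        return 0;
    }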
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 0000000000..34ff93ff89
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,26 @@
1#
2# Packet configuration
3#
4
5config PACKET
6 tristate "Packet socket"
7 ---help---
8 The Packet protocol is used by applications which communicate
9 directly with network devices without an intermediate network
10 protocol implemented in the kernel, e.g. tcpdump. If you want them
11 to work, choose Y.
12
13 To compile this driver as a module, choose M here: the module will
14 be called af_packet.
15
16 If unsure, say Y.
17
18config PACKET_MMAP
19 bool "Packet socket: mmapped IO"
20 depends on PACKET
21 help
 22 If you say Y here, the Packet protocol driver will use a memory-
 23 mapped IO mechanism (a ring buffer shared with user space), which
 24 results in faster communication.
24
25 If unsure, say N.
26
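For context on what CONFIG_PACKET enables, a minimal user-space sketch of the kind of application the help text describes — receiving raw frames directly from a device, tcpdump-style. This is ordinary AF_PACKET usage and requires CAP_NET_RAW (typically root); error handling is kept to the bare minimum:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>
    #include <linux/if_ether.h>

    int main(void)
    {
        unsigned char frame[2048];
        ssize_t n;
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

        if (fd < 0) {
            perror("socket(AF_PACKET)");
            return 1;
        }
        n = recv(fd, frame, sizeof(frame), 0);  /* one raw frame, any device */
        if (n >= 14)
            printf("got %zd bytes, ethertype 0x%02x%02x\n",
                   n, frame[12], frame[13]);
        close(fd);
        return 0;
    }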
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0269616e75..c9d5980aa4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct
274 dst_release(skb->dst); 274 dst_release(skb->dst);
275 skb->dst = NULL; 275 skb->dst = NULL;
276 276
277 /* drop conntrack reference */
278 nf_reset(skb);
279
277 spkt = (struct sockaddr_pkt*)skb->cb; 280 spkt = (struct sockaddr_pkt*)skb->cb;
278 281
279 skb_push(skb, skb->data-skb->mac.raw); 282 skb_push(skb, skb->data-skb->mac.raw);
@@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
517 dst_release(skb->dst); 520 dst_release(skb->dst);
518 skb->dst = NULL; 521 skb->dst = NULL;
519 522
523 /* drop conntrack reference */
524 nf_reset(skb);
525
520 spin_lock(&sk->sk_receive_queue.lock); 526 spin_lock(&sk->sk_receive_queue.lock);
521 po->stats.tp_packets++; 527 po->stats.tp_packets++;
522 __skb_queue_tail(&sk->sk_receive_queue, skb); 528 __skb_queue_tail(&sk->sk_receive_queue, skb);
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 7eb6a5bf93..5480caf8cc 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -556,12 +556,7 @@ static struct sock *rose_make_new(struct sock *osk)
556 sk->sk_sndbuf = osk->sk_sndbuf; 556 sk->sk_sndbuf = osk->sk_sndbuf;
557 sk->sk_state = TCP_ESTABLISHED; 557 sk->sk_state = TCP_ESTABLISHED;
558 sk->sk_sleep = osk->sk_sleep; 558 sk->sk_sleep = osk->sk_sleep;
559 559 sock_copy_flags(sk, osk);
560 if (sock_flag(osk, SOCK_ZAPPED))
561 sock_set_flag(sk, SOCK_ZAPPED);
562
563 if (sock_flag(osk, SOCK_DBG))
564 sock_set_flag(sk, SOCK_DBG);
565 560
566 init_timer(&rose->timer); 561 init_timer(&rose->timer);
567 init_timer(&rose->idletimer); 562 init_timer(&rose->idletimer);
@@ -631,7 +626,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
631 struct rose_sock *rose = rose_sk(sk); 626 struct rose_sock *rose = rose_sk(sk);
632 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; 627 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
633 struct net_device *dev; 628 struct net_device *dev;
634 ax25_address *user, *source; 629 ax25_address *source;
630 ax25_uid_assoc *user;
635 int n; 631 int n;
636 632
637 if (!sock_flag(sk, SOCK_ZAPPED)) 633 if (!sock_flag(sk, SOCK_ZAPPED))
@@ -656,14 +652,17 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
656 652
657 source = &addr->srose_call; 653 source = &addr->srose_call;
658 654
659 if ((user = ax25_findbyuid(current->euid)) == NULL) { 655 user = ax25_findbyuid(current->euid);
656 if (user) {
657 rose->source_call = user->call;
658 ax25_uid_put(user);
659 } else {
660 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) 660 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE))
661 return -EACCES; 661 return -EACCES;
662 user = source; 662 rose->source_call = *source;
663 } 663 }
664 664
665 rose->source_addr = addr->srose_addr; 665 rose->source_addr = addr->srose_addr;
666 rose->source_call = *user;
667 rose->device = dev; 666 rose->device = dev;
668 rose->source_ndigis = addr->srose_ndigis; 667 rose->source_ndigis = addr->srose_ndigis;
669 668
@@ -690,8 +689,8 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
690 struct rose_sock *rose = rose_sk(sk); 689 struct rose_sock *rose = rose_sk(sk);
691 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; 690 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
692 unsigned char cause, diagnostic; 691 unsigned char cause, diagnostic;
693 ax25_address *user;
694 struct net_device *dev; 692 struct net_device *dev;
693 ax25_uid_assoc *user;
695 int n; 694 int n;
696 695
697 if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { 696 if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
@@ -741,12 +740,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
741 if ((dev = rose_dev_first()) == NULL) 740 if ((dev = rose_dev_first()) == NULL)
742 return -ENETUNREACH; 741 return -ENETUNREACH;
743 742
744 if ((user = ax25_findbyuid(current->euid)) == NULL) 743 user = ax25_findbyuid(current->euid);
744 if (!user)
745 return -EINVAL; 745 return -EINVAL;
746 746
747 memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); 747 memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
748 rose->source_call = *user; 748 rose->source_call = user->call;
749 rose->device = dev; 749 rose->device = dev;
750 ax25_uid_put(user);
750 751
751 rose_insert_socket(sk); /* Finish the bind */ 752 rose_insert_socket(sk); /* Finish the bind */
752 } 753 }
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index ff73ebb912..25da6f699f 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -994,8 +994,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
994 * 1. The frame isn't for us, 994 * 1. The frame isn't for us,
995 * 2. It isn't "owned" by any existing route. 995 * 2. It isn't "owned" by any existing route.
996 */ 996 */
997 if (frametype != ROSE_CALL_REQUEST) /* XXX */ 997 if (frametype != ROSE_CALL_REQUEST) { /* XXX */
998 return 0; 998 res = 0;
999 goto out;
1000 }
999 1001
1000 len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; 1002 len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2;
1001 len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; 1003 len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2;
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a..dada34a77b 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
138 138
139 _debug("### End Work"); 139 _debug("### End Work");
140 140
141 try_to_freeze(PF_FREEZE); 141 try_to_freeze();
142 142
143 /* discard pending signals */ 143 /* discard pending signals */
144 rxrpc_discard_my_signals(); 144 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d92..1aadd026d3 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
107 107
108 _debug("### End Inbound Calls"); 108 _debug("### End Inbound Calls");
109 109
110 try_to_freeze(PF_FREEZE); 110 try_to_freeze();
111 111
112 /* discard pending signals */ 112 /* discard pending signals */
113 rxrpc_discard_my_signals(); 113 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290..3ac81cdd12 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
90 complete_and_exit(&krxtimod_dead, 0); 90 complete_and_exit(&krxtimod_dead, 0);
91 } 91 }
92 92
93 try_to_freeze(PF_FREEZE); 93 try_to_freeze();
94 94
95 /* discard pending signals */ 95 /* discard pending signals */
96 rxrpc_discard_my_signals(); 96 rxrpc_discard_my_signals();
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b22c9beb60..59d3e71f8b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,43 @@
1# 1#
2# Traffic control configuration. 2# Traffic control configuration.
3# 3#
4
5menuconfig NET_SCHED
6 bool "QoS and/or fair queueing"
7 ---help---
8 When the kernel has several packets to send out over a network
9 device, it has to decide which ones to send first, which ones to
10 delay, and which ones to drop. This is the job of the packet
11 scheduler, and several different algorithms for how to do this
12 "fairly" have been proposed.
13
14 If you say N here, you will get the standard packet scheduler, which
15 is a FIFO (first come, first served). If you say Y here, you will be
16 able to choose from among several alternative algorithms which can
17 then be attached to different network devices. This is useful for
18 example if some of your network devices are real time devices that
19 need a certain minimum data flow rate, or if you need to limit the
20 maximum data flow rate for traffic which matches specified criteria.
21 This code is considered to be experimental.
22
23 To administer these schedulers, you'll need the user-level utilities
24 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
25 That package also contains some documentation; for more, check out
26 <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
27
28 This Quality of Service (QoS) support will enable you to use
29 Differentiated Services (diffserv) and Resource Reservation Protocol
30 (RSVP) on your Linux router if you also say Y to "QoS support",
31 "Packet classifier API" and to some classifiers below. Documentation
32 and software is at <http://diffserv.sourceforge.net/>.
33
34 If you say Y here and to "/proc file system" below, you will be able
35 to read status information about packet schedulers from the file
36 /proc/net/psched.
37
38 The available schedulers are listed in the following questions; you
39 can say Y to as many as you like. If unsure, say N now.
40
4choice 41choice
5 prompt "Packet scheduler clock source" 42 prompt "Packet scheduler clock source"
6 depends on NET_SCHED 43 depends on NET_SCHED
@@ -449,6 +486,19 @@ config NET_EMATCH_META
449 To compile this code as a module, choose M here: the 486 To compile this code as a module, choose M here: the
450 module will be called em_meta. 487 module will be called em_meta.
451 488
489config NET_EMATCH_TEXT
490 tristate "Textsearch"
491 depends on NET_EMATCH
492 select TEXTSEARCH
493 select TEXTSEARCH_KMP
494 select TEXTSEARCH_FSM
495 ---help---
496 Say Y here if you want to be ablt to classify packets based on
497 textsearch comparisons.
498
499 To compile this code as a module, choose M here: the
500 module will be called em_text.
501
452config NET_CLS_ACT 502config NET_CLS_ACT
453 bool "Packet ACTION" 503 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS 504 depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eb..e48d0d456b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y := sch_generic.o 5obj-y := sch_generic.o
6 6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o 7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
8obj-$(CONFIG_NET_CLS) += cls_api.o 8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o 9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o 10obj-$(CONFIG_NET_ACT_POLICE) += police.o
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o 40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o 41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o 42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
43obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9594206e60..249c61936e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -439,6 +439,8 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
439 439
440 t = NLMSG_DATA(nlh); 440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC; 441 t->tca_family = AF_UNSPEC;
442 t->tca__pad1 = 0;
443 t->tca__pad2 = 0;
442 444
443 x = (struct rtattr*) skb->tail; 445 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 446 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -580,6 +582,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); 582 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh); 583 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC; 584 t->tca_family = AF_UNSPEC;
585 t->tca__pad1 = 0;
586 t->tca__pad2 = 0;
583 587
584 x = (struct rtattr *) skb->tail; 588 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 589 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -687,7 +691,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
687 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); 691 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
688 t = NLMSG_DATA(nlh); 692 t = NLMSG_DATA(nlh);
689 t->tca_family = AF_UNSPEC; 693 t->tca_family = AF_UNSPEC;
690 694 t->tca__pad1 = 0;
695 t->tca__pad2 = 0;
696
691 x = (struct rtattr*) skb->tail; 697 x = (struct rtattr*) skb->tail;
692 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 698 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
693 699
@@ -842,6 +848,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
842 cb->nlh->nlmsg_type, sizeof(*t)); 848 cb->nlh->nlmsg_type, sizeof(*t));
843 t = NLMSG_DATA(nlh); 849 t = NLMSG_DATA(nlh);
844 t->tca_family = AF_UNSPEC; 850 t->tca_family = AF_UNSPEC;
851 t->tca__pad1 = 0;
852 t->tca__pad2 = 0;
845 853
846 x = (struct rtattr *) skb->tail; 854 x = (struct rtattr *) skb->tail;
847 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 855 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1616bf5c96..3b5714ef4d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -331,6 +331,8 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
332 tcm = NLMSG_DATA(nlh); 332 tcm = NLMSG_DATA(nlh);
333 tcm->tcm_family = AF_UNSPEC; 333 tcm->tcm_family = AF_UNSPEC;
334 tcm->tcm__pad1 = 0;
 335 tcm->tcm__pad2 = 0;
334 tcm->tcm_ifindex = tp->q->dev->ifindex; 336 tcm->tcm_ifindex = tp->q->dev->ifindex;
335 tcm->tcm_parent = tp->classid; 337 tcm->tcm_parent = tp->classid;
336 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); 338 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb91968..006168d693 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
618 pinfo.protocol = s->protocol; 618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid; 619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr; 620 pinfo.tunnelhdr = f->tunnelhdr;
621 pinfo.pad = 0;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); 622 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid) 623 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); 624 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a3..00eae5f9a0 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -27,17 +27,17 @@
27 * lvalue rvalue 27 * lvalue rvalue
28 * +-----------+ +-----------+ 28 * +-----------+ +-----------+
29 * | type: INT | | type: INT | 29 * | type: INT | | type: INT |
30 * def | id: INDEV | | id: VALUE | 30 * def | id: DEV | | id: VALUE |
31 * | data: | | data: 3 | 31 * | data: | | data: 3 |
32 * +-----------+ +-----------+ 32 * +-----------+ +-----------+
33 * | | 33 * | |
34 * ---> meta_ops[INT][INDEV](...) | 34 * ---> meta_ops[INT][DEV](...) |
35 * | | 35 * | |
36 * ----------- | 36 * ----------- |
37 * V V 37 * V V
38 * +-----------+ +-----------+ 38 * +-----------+ +-----------+
39 * | type: INT | | type: INT | 39 * | type: INT | | type: INT |
40 * obj | id: INDEV | | id: VALUE | 40 * obj | id: DEV | | id: VALUE |
41 * | data: 2 |<--data got filled out | data: 3 | 41 * | data: 2 |<--data got filled out | data: 3 |
42 * +-----------+ +-----------+ 42 * +-----------+ +-----------+
43 * | | 43 * | |
@@ -170,26 +170,6 @@ META_COLLECTOR(var_dev)
170 *err = var_dev(skb->dev, dst); 170 *err = var_dev(skb->dev, dst);
171} 171}
172 172
173META_COLLECTOR(int_indev)
174{
175 *err = int_dev(skb->input_dev, dst);
176}
177
178META_COLLECTOR(var_indev)
179{
180 *err = var_dev(skb->input_dev, dst);
181}
182
183META_COLLECTOR(int_realdev)
184{
185 *err = int_dev(skb->real_dev, dst);
186}
187
188META_COLLECTOR(var_realdev)
189{
190 *err = var_dev(skb->real_dev, dst);
191}
192
193/************************************************************************** 173/**************************************************************************
194 * skb attributes 174 * skb attributes
195 **************************************************************************/ 175 **************************************************************************/
@@ -205,11 +185,6 @@ META_COLLECTOR(int_protocol)
205 dst->value = skb->protocol; 185 dst->value = skb->protocol;
206} 186}
207 187
208META_COLLECTOR(int_security)
209{
210 dst->value = skb->security;
211}
212
213META_COLLECTOR(int_pkttype) 188META_COLLECTOR(int_pkttype)
214{ 189{
215 dst->value = skb->pkt_type; 190 dst->value = skb->pkt_type;
@@ -234,12 +209,14 @@ META_COLLECTOR(int_maclen)
234 * Netfilter 209 * Netfilter
235 **************************************************************************/ 210 **************************************************************************/
236 211
237#ifdef CONFIG_NETFILTER
238META_COLLECTOR(int_nfmark) 212META_COLLECTOR(int_nfmark)
239{ 213{
214#ifdef CONFIG_NETFILTER
240 dst->value = skb->nfmark; 215 dst->value = skb->nfmark;
241} 216#else
217 dst->value = 0;
242#endif 218#endif
219}
243 220
244/************************************************************************** 221/**************************************************************************
245 * Traffic Control 222 * Traffic Control
@@ -250,31 +227,21 @@ META_COLLECTOR(int_tcindex)
250 dst->value = skb->tc_index; 227 dst->value = skb->tc_index;
251} 228}
252 229
253#ifdef CONFIG_NET_CLS_ACT
254META_COLLECTOR(int_tcverd)
255{
256 dst->value = skb->tc_verd;
257}
258
259META_COLLECTOR(int_tcclassid)
260{
261 dst->value = skb->tc_classid;
262}
263#endif
264
265/************************************************************************** 230/**************************************************************************
266 * Routing 231 * Routing
267 **************************************************************************/ 232 **************************************************************************/
268 233
269#ifdef CONFIG_NET_CLS_ROUTE
270META_COLLECTOR(int_rtclassid) 234META_COLLECTOR(int_rtclassid)
271{ 235{
272 if (unlikely(skb->dst == NULL)) 236 if (unlikely(skb->dst == NULL))
273 *err = -1; 237 *err = -1;
274 else 238 else
239#ifdef CONFIG_NET_CLS_ROUTE
275 dst->value = skb->dst->tclassid; 240 dst->value = skb->dst->tclassid;
276} 241#else
242 dst->value = 0;
277#endif 243#endif
244}
278 245
279META_COLLECTOR(int_rtiif) 246META_COLLECTOR(int_rtiif)
280{ 247{
@@ -510,8 +477,6 @@ struct meta_ops
510static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { 477static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
511 [TCF_META_TYPE_VAR] = { 478 [TCF_META_TYPE_VAR] = {
512 [META_ID(DEV)] = META_FUNC(var_dev), 479 [META_ID(DEV)] = META_FUNC(var_dev),
513 [META_ID(INDEV)] = META_FUNC(var_indev),
514 [META_ID(REALDEV)] = META_FUNC(var_realdev),
515 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), 480 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
516 }, 481 },
517 [TCF_META_TYPE_INT] = { 482 [TCF_META_TYPE_INT] = {
@@ -520,26 +485,15 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
520 [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1), 485 [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
521 [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2), 486 [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
522 [META_ID(DEV)] = META_FUNC(int_dev), 487 [META_ID(DEV)] = META_FUNC(int_dev),
523 [META_ID(INDEV)] = META_FUNC(int_indev),
524 [META_ID(REALDEV)] = META_FUNC(int_realdev),
525 [META_ID(PRIORITY)] = META_FUNC(int_priority), 488 [META_ID(PRIORITY)] = META_FUNC(int_priority),
526 [META_ID(PROTOCOL)] = META_FUNC(int_protocol), 489 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
527 [META_ID(SECURITY)] = META_FUNC(int_security),
528 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), 490 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
529 [META_ID(PKTLEN)] = META_FUNC(int_pktlen), 491 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
530 [META_ID(DATALEN)] = META_FUNC(int_datalen), 492 [META_ID(DATALEN)] = META_FUNC(int_datalen),
531 [META_ID(MACLEN)] = META_FUNC(int_maclen), 493 [META_ID(MACLEN)] = META_FUNC(int_maclen),
532#ifdef CONFIG_NETFILTER
533 [META_ID(NFMARK)] = META_FUNC(int_nfmark), 494 [META_ID(NFMARK)] = META_FUNC(int_nfmark),
534#endif
535 [META_ID(TCINDEX)] = META_FUNC(int_tcindex), 495 [META_ID(TCINDEX)] = META_FUNC(int_tcindex),
536#ifdef CONFIG_NET_CLS_ACT
537 [META_ID(TCVERDICT)] = META_FUNC(int_tcverd),
538 [META_ID(TCCLASSID)] = META_FUNC(int_tcclassid),
539#endif
540#ifdef CONFIG_NET_CLS_ROUTE
541 [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), 496 [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
542#endif
543 [META_ID(RTIIF)] = META_FUNC(int_rtiif), 497 [META_ID(RTIIF)] = META_FUNC(int_rtiif),
544 [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family), 498 [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
545 [META_ID(SK_STATE)] = META_FUNC(int_sk_state), 499 [META_ID(SK_STATE)] = META_FUNC(int_sk_state),
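The em_meta.c hunks above adopt a single pattern: keep each collector compiled and registered unconditionally, and resolve the optional config symbol inside the collector (returning 0 when the feature is compiled out), so the meta-id dispatch table needs no #ifdefs. A minimal sketch of that pattern; CONFIG_NETFILTER is the usual kernel config symbol, everything else is an illustrative stand-in:

    #include <stdio.h>

    struct meta_value { unsigned long value; };
    struct packet { unsigned long nfmark; };

    static void collect_nfmark(const struct packet *pkt, struct meta_value *dst)
    {
    #ifdef CONFIG_NETFILTER
        dst->value = pkt->nfmark;   /* real mark when netfilter is built in */
    #else
        dst->value = 0;             /* graceful fallback, collector still valid */
    #endif
    }

    int main(void)
    {
        struct packet pkt = { 42 };
        struct meta_value v;

        collect_nfmark(&pkt, &v);
        printf("nfmark meta = %lu\n", v.value);
        return 0;
    }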
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 0000000000..77beabc91f
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,154 @@
1/*
2 * net/sched/em_text.c Textsearch ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/textsearch.h>
20#include <linux/tc_ematch/tc_em_text.h>
21#include <net/pkt_cls.h>
22
23struct text_match
24{
25 u16 from_offset;
26 u16 to_offset;
27 u8 from_layer;
28 u8 to_layer;
29 struct ts_config *config;
30};
31
32#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
33
34static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
35 struct tcf_pkt_info *info)
36{
37 struct text_match *tm = EM_TEXT_PRIV(m);
38 int from, to;
39 struct ts_state state;
40
41 from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
42 from += tm->from_offset;
43
44 to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
45 to += tm->to_offset;
46
47 return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
48}
49
50static int em_text_change(struct tcf_proto *tp, void *data, int len,
51 struct tcf_ematch *m)
52{
53 struct text_match *tm;
54 struct tcf_em_text *conf = data;
55 struct ts_config *ts_conf;
56 int flags = 0;
57
58 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
59 return -EINVAL;
60
61 if (conf->from_layer > conf->to_layer)
62 return -EINVAL;
63
64 if (conf->from_layer == conf->to_layer &&
65 conf->from_offset > conf->to_offset)
66 return -EINVAL;
67
68retry:
69 ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
70 conf->pattern_len, GFP_KERNEL, flags);
71
72 if (flags & TS_AUTOLOAD)
73 rtnl_lock();
74
75 if (IS_ERR(ts_conf)) {
76 if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
77 rtnl_unlock();
78 flags |= TS_AUTOLOAD;
79 goto retry;
80 } else
81 return PTR_ERR(ts_conf);
82 } else if (flags & TS_AUTOLOAD) {
83 textsearch_destroy(ts_conf);
84 return -EAGAIN;
85 }
86
87 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
88 if (tm == NULL) {
89 textsearch_destroy(ts_conf);
90 return -ENOBUFS;
91 }
92
93 tm->from_offset = conf->from_offset;
94 tm->to_offset = conf->to_offset;
95 tm->from_layer = conf->from_layer;
96 tm->to_layer = conf->to_layer;
97 tm->config = ts_conf;
98
99 m->datalen = sizeof(*tm);
100 m->data = (unsigned long) tm;
101
102 return 0;
103}
104
105static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
106{
107 textsearch_destroy(EM_TEXT_PRIV(m)->config);
108}
109
110static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
111{
112 struct text_match *tm = EM_TEXT_PRIV(m);
113 struct tcf_em_text conf;
114
115 strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
116 conf.from_offset = tm->from_offset;
117 conf.to_offset = tm->to_offset;
118 conf.from_layer = tm->from_layer;
119 conf.to_layer = tm->to_layer;
120 conf.pattern_len = textsearch_get_pattern_len(tm->config);
121 conf.pad = 0;
122
123 RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
124 RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
125 return 0;
126
127rtattr_failure:
128 return -1;
129}
130
131static struct tcf_ematch_ops em_text_ops = {
132 .kind = TCF_EM_TEXT,
133 .change = em_text_change,
134 .match = em_text_match,
135 .destroy = em_text_destroy,
136 .dump = em_text_dump,
137 .owner = THIS_MODULE,
138 .link = LIST_HEAD_INIT(em_text_ops.link)
139};
140
141static int __init init_em_text(void)
142{
143 return tcf_em_register(&em_text_ops);
144}
145
146static void __exit exit_em_text(void)
147{
148 tcf_em_unregister(&em_text_ops);
149}
150
151MODULE_LICENSE("GPL");
152
153module_init(init_em_text);
154module_exit(exit_em_text);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 97c1c75d5c..b9a069af4a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{ 399{
400 int err; 400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1]; 401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch; 402 struct Qdisc *sch;
404 struct Qdisc_ops *ops; 403 struct Qdisc_ops *ops;
405 int size;
406 404
407 ops = qdisc_lookup_ops(kind); 405 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD 406#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
437 if (ops == NULL) 435 if (ops == NULL)
438 goto err_out; 436 goto err_out;
439 437
440 /* ensure that the Qdisc and the private data are 32-byte aligned */ 438 sch = qdisc_alloc(dev, ops);
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 439 if (IS_ERR(sch)) {
442 size += ops->priv_size + QDISC_ALIGN_CONST; 440 err = PTR_ERR(sch);
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2; 441 goto err_out2;
448 memset(p, 0, size); 442 }
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455 443
456 if (handle == TC_H_INGRESS) 444 if (handle == TC_H_INGRESS) {
457 sch->flags |= TCQ_F_INGRESS; 445 sch->flags |= TCQ_F_INGRESS;
458 446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
459 sch->ops = ops; 447 } else if (handle == 0) {
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev); 448 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM; 449 err = -ENOMEM;
469 if (handle == 0) 450 if (handle == 0)
470 goto err_out3; 451 goto err_out3;
471 } 452 }
472 453
473 if (handle == TC_H_INGRESS) 454 sch->handle = handle;
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477 455
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
479 qdisc_lock_tree(dev); 474 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list); 475 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev); 476 qdisc_unlock_tree(dev);
482 477
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch; 478 return sch;
489 } 479 }
490err_out3: 480err_out3:
491 dev_put(dev); 481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
492err_out2: 483err_out2:
493 module_put(ops->owner); 484 module_put(ops->owner);
494err_out: 485err_out:
495 *errp = err; 486 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL; 487 return NULL;
499} 488}
500 489
@@ -770,6 +759,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
770 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
771 tcm = NLMSG_DATA(nlh); 760 tcm = NLMSG_DATA(nlh);
772 tcm->tcm_family = AF_UNSPEC; 761 tcm->tcm_family = AF_UNSPEC;
762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
773 tcm->tcm_ifindex = q->dev->ifindex; 764 tcm->tcm_ifindex = q->dev->ifindex;
774 tcm->tcm_parent = clid; 765 tcm->tcm_parent = clid;
775 tcm->tcm_handle = q->handle; 766 tcm->tcm_handle = q->handle;
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 0000000000..81f0b8346d
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
1/*
2 * net/sched/sch_blackhole.c Black hole queue
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * Note: Quantum tunneling is not supported.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h>
20#include <net/pkt_sched.h>
21
22static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
23{
24 qdisc_drop(skb, sch);
25 return NET_XMIT_SUCCESS;
26}
27
28static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
29{
30 return NULL;
31}
32
33static struct Qdisc_ops blackhole_qdisc_ops = {
34 .id = "blackhole",
35 .priv_size = 0,
36 .enqueue = blackhole_enqueue,
37 .dequeue = blackhole_dequeue,
38 .owner = THIS_MODULE,
39};
40
41static int __init blackhole_module_init(void)
42{
43 return register_qdisc(&blackhole_qdisc_ops);
44}
45
46static void __exit blackhole_module_exit(void)
47{
48 unregister_qdisc(&blackhole_qdisc_ops);
49}
50
51module_init(blackhole_module_init)
52module_exit(blackhole_module_exit)
53
54MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d43e3b8cbf..09453f997d 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1528 1528
1529 opt.strategy = cl->ovl_strategy; 1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1; 1530 opt.priority2 = cl->priority2+1;
1531 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ; 1532 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1533 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len; 1534 return skb->len;
@@ -1563,6 +1564,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1563 1564
1564 if (cl->police) { 1565 if (cl->police) {
1565 opt.police = cl->police; 1566 opt.police = cl->police;
1567 opt.__res1 = 0;
1568 opt.__res2 = 0;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1569 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 } 1570 }
1568 return skb->len; 1571 return skb->len;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7683b34dc6..0d066c9653 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -331,11 +331,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
331 int prio; 331 int prio;
332 struct sk_buff_head *list = qdisc_priv(qdisc); 332 struct sk_buff_head *list = qdisc_priv(qdisc);
333 333
334 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) { 334 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
335 struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); 335 if (!skb_queue_empty(list + prio)) {
336 if (skb) {
337 qdisc->q.qlen--; 336 qdisc->q.qlen--;
338 return skb; 337 return __qdisc_dequeue_head(qdisc, list + prio);
339 } 338 }
340 } 339 }
341 340
@@ -395,24 +394,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
395 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
396}; 395};
397 396
398struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) 397struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
399{ 398{
400 void *p; 399 void *p;
401 struct Qdisc *sch; 400 struct Qdisc *sch;
402 int size; 401 unsigned int size;
402 int err = -ENOBUFS;
403 403
404 /* ensure that the Qdisc and the private data are 32-byte aligned */ 404 /* ensure that the Qdisc and the private data are 32-byte aligned */
405 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 405 size = QDISC_ALIGN(sizeof(*sch));
406 size += ops->priv_size + QDISC_ALIGN_CONST; 406 size += ops->priv_size + (QDISC_ALIGNTO - 1);
407 407
408 p = kmalloc(size, GFP_KERNEL); 408 p = kmalloc(size, GFP_KERNEL);
409 if (!p) 409 if (!p)
410 return NULL; 410 goto errout;
411 memset(p, 0, size); 411 memset(p, 0, size);
412 412 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
413 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 413 sch->padded = (char *) sch - (char *) p;
414 & ~QDISC_ALIGN_CONST);
415 sch->padded = (char *)sch - (char *)p;
416 414
417 INIT_LIST_HEAD(&sch->list); 415 INIT_LIST_HEAD(&sch->list);
418 skb_queue_head_init(&sch->q); 416 skb_queue_head_init(&sch->q);
@@ -423,11 +421,25 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
423 dev_hold(dev); 421 dev_hold(dev);
424 sch->stats_lock = &dev->queue_lock; 422 sch->stats_lock = &dev->queue_lock;
425 atomic_set(&sch->refcnt, 1); 423 atomic_set(&sch->refcnt, 1);
424
425 return sch;
426errout:
 427 return ERR_PTR(err);
428}
429
430struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
431{
432 struct Qdisc *sch;
433
434 sch = qdisc_alloc(dev, ops);
435 if (IS_ERR(sch))
436 goto errout;
437
426 if (!ops->init || ops->init(sch, NULL) == 0) 438 if (!ops->init || ops->init(sch, NULL) == 0)
427 return sch; 439 return sch;
428 440
429 dev_put(dev); 441 qdisc_destroy(sch);
430 kfree(p); 442errout:
431 return NULL; 443 return NULL;
432} 444}
433 445
@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
591EXPORT_SYMBOL(noop_qdisc); 603EXPORT_SYMBOL(noop_qdisc);
592EXPORT_SYMBOL(noop_qdisc_ops); 604EXPORT_SYMBOL(noop_qdisc_ops);
593EXPORT_SYMBOL(qdisc_create_dflt); 605EXPORT_SYMBOL(qdisc_create_dflt);
606EXPORT_SYMBOL(qdisc_alloc);
594EXPORT_SYMBOL(qdisc_destroy); 607EXPORT_SYMBOL(qdisc_destroy);
595EXPORT_SYMBOL(qdisc_reset); 608EXPORT_SYMBOL(qdisc_reset);
596EXPORT_SYMBOL(qdisc_restart); 609EXPORT_SYMBOL(qdisc_restart);
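The qdisc_alloc() introduced above keeps the existing alignment trick: over-allocate by QDISC_ALIGNTO - 1 bytes, round the object pointer up to the alignment boundary, and remember the offset in sch->padded so the original allocation can still be freed. A self-contained sketch of that technique, with illustrative constants and struct layout rather than the kernel's:

    #include <stdio.h>
    #include <stdlib.h>

    #define ALIGNTO      32UL
    #define ALIGN_UP(x)  (((x) + ALIGNTO - 1) & ~(ALIGNTO - 1))

    struct qd {
        int padded;                 /* distance back to the real allocation */
        char priv[40];              /* private data follows the object */
    };

    static struct qd *qd_alloc(void)
    {
        size_t size = ALIGN_UP(sizeof(struct qd)) + ALIGNTO - 1;
        void *p = calloc(1, size);  /* zeroed, like the kmalloc+memset above */
        struct qd *q;

        if (!p)
            return NULL;
        q = (struct qd *) ALIGN_UP((unsigned long) p);
        q->padded = (char *) q - (char *) p;
        return q;
    }

    static void qd_free(struct qd *q)
    {
        free((char *) q - q->padded);   /* undo the alignment padding */
    }

    int main(void)
    {
        struct qd *q = qd_alloc();

        if (!q)
            return 1;
        printf("object at %p, padded by %d bytes\n", (void *) q, q->padded);
        qd_free(q);
        return 0;
    }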
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 664d0e4737..7845d045ee 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt)
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); 385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 386
387 q->qcount = -1; 387 q->qcount = -1;
388 if (skb_queue_len(&sch->q) == 0) 388 if (skb_queue_empty(&sch->q))
389 PSCHED_SET_PASTPERFECT(q->qidlestart); 389 PSCHED_SET_PASTPERFECT(q->qidlestart);
390 sch_tree_unlock(sch); 390 sch_tree_unlock(sch);
391 return 0; 391 return 0;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 7ae6aa772d..5b24ae0650 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
71 const struct sctp_endpoint *ep, 71 const struct sctp_endpoint *ep,
72 const struct sock *sk, 72 const struct sock *sk,
73 sctp_scope_t scope, 73 sctp_scope_t scope,
74 int gfp) 74 unsigned int __nocast gfp)
75{ 75{
76 struct sctp_sock *sp; 76 struct sctp_sock *sp;
77 int i; 77 int i;
@@ -203,7 +203,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
203 */ 203 */
204 asoc->addip_serial = asoc->c.initial_tsn; 204 asoc->addip_serial = asoc->c.initial_tsn;
205 205
206 skb_queue_head_init(&asoc->addip_chunks); 206 INIT_LIST_HEAD(&asoc->addip_chunk_list);
207 207
208 /* Make an empty list of remote transport addresses. */ 208 /* Make an empty list of remote transport addresses. */
209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list); 209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
@@ -272,7 +272,8 @@ fail_init:
272/* Allocate and initialize a new association */ 272/* Allocate and initialize a new association */
273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, 273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
274 const struct sock *sk, 274 const struct sock *sk,
275 sctp_scope_t scope, int gfp) 275 sctp_scope_t scope,
276 unsigned int __nocast gfp)
276{ 277{
277 struct sctp_association *asoc; 278 struct sctp_association *asoc;
278 279
@@ -478,7 +479,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
478/* Add a transport address to an association. */ 479/* Add a transport address to an association. */
479struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, 480struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
480 const union sctp_addr *addr, 481 const union sctp_addr *addr,
481 const int gfp, 482 const unsigned int __nocast gfp,
482 const int peer_state) 483 const int peer_state)
483{ 484{
484 struct sctp_transport *peer; 485 struct sctp_transport *peer;
@@ -1229,7 +1230,8 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
1229/* Build the bind address list for the association based on info from the 1230/* Build the bind address list for the association based on info from the
1230 * local endpoint and the remote peer. 1231 * local endpoint and the remote peer.
1231 */ 1232 */
1232int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) 1233int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
1234 unsigned int __nocast gfp)
1233{ 1235{
1234 sctp_scope_t scope; 1236 sctp_scope_t scope;
1235 int flags; 1237 int flags;
@@ -1251,7 +1253,8 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp)
1251 1253
1252/* Build the association's bind address list from the cookie. */ 1254/* Build the association's bind address list from the cookie. */
1253int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, 1255int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
1254 struct sctp_cookie *cookie, int gfp) 1256 struct sctp_cookie *cookie,
1257 unsigned int __nocast gfp)
1255{ 1258{
1256 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); 1259 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
1257 int var_size3 = cookie->raw_addr_list_len; 1260 int var_size3 = cookie->raw_addr_list_len;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f90eadfb60..f71549710f 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,8 @@
53 53
54/* Forward declarations for internal helpers. */ 54/* Forward declarations for internal helpers. */
55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, 55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
56 sctp_scope_t scope, int gfp, int flags); 56 sctp_scope_t scope, unsigned int __nocast gfp,
57 int flags);
57static void sctp_bind_addr_clean(struct sctp_bind_addr *); 58static void sctp_bind_addr_clean(struct sctp_bind_addr *);
58 59
59/* First Level Abstractions. */ 60/* First Level Abstractions. */
@@ -63,7 +64,8 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
63 */ 64 */
64int sctp_bind_addr_copy(struct sctp_bind_addr *dest, 65int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
65 const struct sctp_bind_addr *src, 66 const struct sctp_bind_addr *src,
66 sctp_scope_t scope, int gfp, int flags) 67 sctp_scope_t scope, unsigned int __nocast gfp,
68 int flags)
67{ 69{
68 struct sctp_sockaddr_entry *addr; 70 struct sctp_sockaddr_entry *addr;
69 struct list_head *pos; 71 struct list_head *pos;
@@ -144,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
144 146
145/* Add an address to the bind address list in the SCTP_bind_addr structure. */ 147/* Add an address to the bind address list in the SCTP_bind_addr structure. */
146int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, 148int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
147 int gfp) 149 unsigned int __nocast gfp)
148{ 150{
149 struct sctp_sockaddr_entry *addr; 151 struct sctp_sockaddr_entry *addr;
150 152
@@ -197,7 +199,8 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
197 * The second argument is the return value for the length. 199 * The second argument is the return value for the length.
198 */ 200 */
199union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, 201union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
200 int *addrs_len, int gfp) 202 int *addrs_len,
203 unsigned int __nocast gfp)
201{ 204{
202 union sctp_params addrparms; 205 union sctp_params addrparms;
203 union sctp_params retval; 206 union sctp_params retval;
@@ -249,7 +252,7 @@ end_raw:
249 * address parameters). 252 * address parameters).
250 */ 253 */
251int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, 254int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
252 int addrs_len, __u16 port, int gfp) 255 int addrs_len, __u16 port, unsigned int __nocast gfp)
253{ 256{
254 union sctp_addr_param *rawaddr; 257 union sctp_addr_param *rawaddr;
255 struct sctp_paramhdr *param; 258 struct sctp_paramhdr *param;
@@ -347,7 +350,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
347/* Copy out addresses from the global local address list. */ 350/* Copy out addresses from the global local address list. */
348static int sctp_copy_one_addr(struct sctp_bind_addr *dest, 351static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
349 union sctp_addr *addr, 352 union sctp_addr *addr,
350 sctp_scope_t scope, int gfp, int flags) 353 sctp_scope_t scope, unsigned int __nocast gfp,
354 int flags)
351{ 355{
352 int error = 0; 356 int error = 0;
353 357
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0c2ab78850..61da2937e6 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
62} 62}
63 63
64/* Allocate and initialize datamsg. */ 64/* Allocate and initialize datamsg. */
65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp) 65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
66{ 66{
67 struct sctp_datamsg *msg; 67 struct sctp_datamsg *msg;
68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp); 68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 2ec0320fac..e47ac0d1a6 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -67,7 +67,8 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
67 * Initialize the base fields of the endpoint structure. 67 * Initialize the base fields of the endpoint structure.
68 */ 68 */
69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, 69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
70 struct sock *sk, int gfp) 70 struct sock *sk,
71 unsigned int __nocast gfp)
71{ 72{
72 struct sctp_sock *sp = sctp_sk(sk); 73 struct sctp_sock *sp = sctp_sk(sk);
73 memset(ep, 0, sizeof(struct sctp_endpoint)); 74 memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -102,9 +103,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
102 /* Set up the base timeout information. */ 103 /* Set up the base timeout information. */
103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; 104 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = 105 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
105 SCTP_DEFAULT_TIMEOUT_T1_COOKIE; 106 msecs_to_jiffies(sp->rtoinfo.srto_initial);
106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = 107 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
107 SCTP_DEFAULT_TIMEOUT_T1_INIT; 108 msecs_to_jiffies(sp->rtoinfo.srto_initial);
108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = 109 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
109 msecs_to_jiffies(sp->rtoinfo.srto_initial); 110 msecs_to_jiffies(sp->rtoinfo.srto_initial);
110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; 111 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +118,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] 118 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); 119 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
119 120
120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 121 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
121 SCTP_DEFAULT_TIMEOUT_HEARTBEAT; 122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 123 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
123 SCTP_DEFAULT_TIMEOUT_SACK;
124 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
125 sp->autoclose * HZ;
126 124
127 /* Use SCTP specific send buffer space queues. */ 125 /* Use SCTP specific send buffer space queues. */
128 ep->sndbuf_policy = sctp_sndbuf_policy; 126 ep->sndbuf_policy = sctp_sndbuf_policy;
@@ -140,7 +138,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
140/* Create a sctp_endpoint with all that boring stuff initialized. 138/* Create a sctp_endpoint with all that boring stuff initialized.
141 * Returns NULL if there isn't enough memory. 139 * Returns NULL if there isn't enough memory.
142 */ 140 */
143struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp) 141struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
142 unsigned int __nocast gfp)
144{ 143{
145 struct sctp_endpoint *ep; 144 struct sctp_endpoint *ep;
146 145
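Besides the __nocast change, the endpoint init above stops seeding timers from compile-time defaults: the T1-COOKIE and T1-INIT timers now derive from the socket's srto_initial, the per-endpoint heartbeat slot is zeroed (the interval now comes from the sctp_hb_interval default set in sctp_init() further down), and the SACK delay comes from the new sctp_sack_timeout sysctl. A rough sketch, assuming HZ divides 1000 evenly, of the millisecond-to-jiffies conversion those lines rely on:

/* Simplified msecs_to_jiffies() for the common HZ values (100/250/1000);
 * the real kernel helper also covers HZ values that do not divide 1000.
 * Rounds up so a short timeout never collapses to zero jiffies. */
#define HZ 250   /* assumed for illustration only */

static inline unsigned long msecs_to_jiffies_sketch(unsigned int ms)
{
        return (ms + (1000 / HZ) - 1) / (1000 / HZ);
}
/* e.g. srto_initial = 3000 ms -> 750 jiffies at HZ=250 */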
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 339f7acfdb..742be9171b 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); 115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
116} 116}
117 117
118struct sctp_input_cb {
119 union {
120 struct inet_skb_parm h4;
121#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
122 struct inet6_skb_parm h6;
123#endif
124 } header;
125 struct sctp_chunk *chunk;
126};
127#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
128
118/* 129/*
119 * This is the routine which IP calls when receiving an SCTP packet. 130 * This is the routine which IP calls when receiving an SCTP packet.
120 */ 131 */
@@ -243,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb)
243 ret = -ENOMEM; 254 ret = -ENOMEM;
244 goto discard_release; 255 goto discard_release;
245 } 256 }
257 SCTP_INPUT_CB(skb)->chunk = chunk;
246 258
247 sctp_rcv_set_owner_r(skb,sk); 259 sctp_rcv_set_owner_r(skb,sk);
248 260
@@ -265,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb)
265 sctp_bh_lock_sock(sk); 277 sctp_bh_lock_sock(sk);
266 278
267 if (sock_owned_by_user(sk)) 279 if (sock_owned_by_user(sk))
268 sk_add_backlog(sk, (struct sk_buff *) chunk); 280 sk_add_backlog(sk, skb);
269 else 281 else
270 sctp_backlog_rcv(sk, (struct sk_buff *) chunk); 282 sctp_backlog_rcv(sk, skb);
271 283
272 /* Release the sock and any reference counts we took in the 284 /* Release the sock and any reference counts we took in the
273 * lookup calls. 285 * lookup calls.
@@ -302,14 +314,8 @@ discard_release:
302 */ 314 */
303int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) 315int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
304{ 316{
305 struct sctp_chunk *chunk; 317 struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
306 struct sctp_inq *inqueue; 318 struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
307
308 /* One day chunk will live inside the skb, but for
309 * now this works.
310 */
311 chunk = (struct sctp_chunk *) skb;
312 inqueue = &chunk->rcvr->inqueue;
313 319
314 sctp_inq_push(inqueue, chunk); 320 sctp_inq_push(inqueue, chunk);
315 return 0; 321 return 0;
@@ -345,7 +351,6 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
345 * 351 *
346 */ 352 */
347void sctp_icmp_proto_unreachable(struct sock *sk, 353void sctp_icmp_proto_unreachable(struct sock *sk,
348 struct sctp_endpoint *ep,
349 struct sctp_association *asoc, 354 struct sctp_association *asoc,
350 struct sctp_transport *t) 355 struct sctp_transport *t)
351{ 356{
@@ -361,7 +366,6 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
361/* Common lookup code for icmp/icmpv6 error handler. */ 366/* Common lookup code for icmp/icmpv6 error handler. */
362struct sock *sctp_err_lookup(int family, struct sk_buff *skb, 367struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
363 struct sctphdr *sctphdr, 368 struct sctphdr *sctphdr,
364 struct sctp_endpoint **epp,
365 struct sctp_association **app, 369 struct sctp_association **app,
366 struct sctp_transport **tpp) 370 struct sctp_transport **tpp)
367{ 371{
@@ -369,11 +373,10 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
369 union sctp_addr daddr; 373 union sctp_addr daddr;
370 struct sctp_af *af; 374 struct sctp_af *af;
371 struct sock *sk = NULL; 375 struct sock *sk = NULL;
372 struct sctp_endpoint *ep = NULL;
373 struct sctp_association *asoc = NULL; 376 struct sctp_association *asoc = NULL;
374 struct sctp_transport *transport = NULL; 377 struct sctp_transport *transport = NULL;
375 378
376 *app = NULL; *epp = NULL; *tpp = NULL; 379 *app = NULL; *tpp = NULL;
377 380
378 af = sctp_get_af_specific(family); 381 af = sctp_get_af_specific(family);
379 if (unlikely(!af)) { 382 if (unlikely(!af)) {
@@ -388,26 +391,15 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
388 * packet. 391 * packet.
389 */ 392 */
390 asoc = __sctp_lookup_association(&saddr, &daddr, &transport); 393 asoc = __sctp_lookup_association(&saddr, &daddr, &transport);
391 if (!asoc) { 394 if (!asoc)
392 /* If there is no matching association, see if it matches any 395 return NULL;
393 * endpoint. This may happen for an ICMP error generated in
394 * response to an INIT_ACK.
395 */
396 ep = __sctp_rcv_lookup_endpoint(&daddr);
397 if (!ep) {
398 return NULL;
399 }
400 }
401 396
402 if (asoc) { 397 sk = asoc->base.sk;
403 sk = asoc->base.sk;
404 398
405 if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) { 399 if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) {
406 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); 400 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
407 goto out; 401 goto out;
408 } 402 }
409 } else
410 sk = ep->base.sk;
411 403
412 sctp_bh_lock_sock(sk); 404 sctp_bh_lock_sock(sk);
413 405
@@ -417,7 +409,6 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
417 if (sock_owned_by_user(sk)) 409 if (sock_owned_by_user(sk))
418 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); 410 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
419 411
420 *epp = ep;
421 *app = asoc; 412 *app = asoc;
422 *tpp = transport; 413 *tpp = transport;
423 return sk; 414 return sk;
@@ -426,21 +417,16 @@ out:
426 sock_put(sk); 417 sock_put(sk);
427 if (asoc) 418 if (asoc)
428 sctp_association_put(asoc); 419 sctp_association_put(asoc);
429 if (ep)
430 sctp_endpoint_put(ep);
431 return NULL; 420 return NULL;
432} 421}
433 422
434/* Common cleanup code for icmp/icmpv6 error handler. */ 423/* Common cleanup code for icmp/icmpv6 error handler. */
435void sctp_err_finish(struct sock *sk, struct sctp_endpoint *ep, 424void sctp_err_finish(struct sock *sk, struct sctp_association *asoc)
436 struct sctp_association *asoc)
437{ 425{
438 sctp_bh_unlock_sock(sk); 426 sctp_bh_unlock_sock(sk);
439 sock_put(sk); 427 sock_put(sk);
440 if (asoc) 428 if (asoc)
441 sctp_association_put(asoc); 429 sctp_association_put(asoc);
442 if (ep)
443 sctp_endpoint_put(ep);
444} 430}
445 431
446/* 432/*
@@ -465,7 +451,6 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
465 int type = skb->h.icmph->type; 451 int type = skb->h.icmph->type;
466 int code = skb->h.icmph->code; 452 int code = skb->h.icmph->code;
467 struct sock *sk; 453 struct sock *sk;
468 struct sctp_endpoint *ep;
469 struct sctp_association *asoc; 454 struct sctp_association *asoc;
470 struct sctp_transport *transport; 455 struct sctp_transport *transport;
471 struct inet_sock *inet; 456 struct inet_sock *inet;
@@ -482,7 +467,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
482 savesctp = skb->h.raw; 467 savesctp = skb->h.raw;
483 skb->nh.iph = iph; 468 skb->nh.iph = iph;
484 skb->h.raw = (char *)sh; 469 skb->h.raw = (char *)sh;
485 sk = sctp_err_lookup(AF_INET, skb, sh, &ep, &asoc, &transport); 470 sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport);
486 /* Put back, the original pointers. */ 471 /* Put back, the original pointers. */
487 skb->nh.raw = saveip; 472 skb->nh.raw = saveip;
488 skb->h.raw = savesctp; 473 skb->h.raw = savesctp;
@@ -509,7 +494,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
509 } 494 }
510 else { 495 else {
511 if (ICMP_PROT_UNREACH == code) { 496 if (ICMP_PROT_UNREACH == code) {
512 sctp_icmp_proto_unreachable(sk, ep, asoc, 497 sctp_icmp_proto_unreachable(sk, asoc,
513 transport); 498 transport);
514 goto out_unlock; 499 goto out_unlock;
515 } 500 }
@@ -538,7 +523,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
538 } 523 }
539 524
540out_unlock: 525out_unlock:
541 sctp_err_finish(sk, ep, asoc); 526 sctp_err_finish(sk, asoc);
542} 527}
543 528
544/* 529/*
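Two related cleanups in input.c above: the receive path now records which chunk an skb carries in the skb's private cb[] scratch area (SCTP_INPUT_CB) instead of casting the skb pointer itself to a chunk pointer when queueing it on the backlog, and the ICMP error lookup drops its endpoint fallback, so sctp_err_lookup()/sctp_err_finish() deal only with associations. A small standalone illustration of the cb[] idiom (structure names mirror the patch; layout and sizes are simplified):

#include <stdio.h>
#include <string.h>

/* Toy stand-ins: the real sk_buff reserves a cb[] scratch area (48 bytes in
 * kernels of this vintage) for whichever layer currently owns the buffer. */
struct sk_buff { char cb[48]; };
struct sctp_chunk { int type; };

struct sctp_input_cb { struct sctp_chunk *chunk; };
#define SCTP_INPUT_CB(skb) ((struct sctp_input_cb *)&((skb)->cb[0]))

int main(void)
{
        struct sk_buff skb;
        struct sctp_chunk ch = { .type = 1 };

        memset(&skb, 0, sizeof(skb));
        SCTP_INPUT_CB(&skb)->chunk = &ch;        /* sctp_rcv() side */

        /* sctp_backlog_rcv() side: no more (struct sctp_chunk *)skb cast */
        printf("chunk type %d\n", SCTP_INPUT_CB(&skb)->chunk->type);
        return 0;
}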
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cedf435155..2d33922c04 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -50,7 +50,7 @@
50/* Initialize an SCTP inqueue. */ 50/* Initialize an SCTP inqueue. */
51void sctp_inq_init(struct sctp_inq *queue) 51void sctp_inq_init(struct sctp_inq *queue)
52{ 52{
53 skb_queue_head_init(&queue->in); 53 INIT_LIST_HEAD(&queue->in_chunk_list);
54 queue->in_progress = NULL; 54 queue->in_progress = NULL;
55 55
56 /* Create a task for delivering data. */ 56 /* Create a task for delivering data. */
@@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue)
62/* Release the memory associated with an SCTP inqueue. */ 62/* Release the memory associated with an SCTP inqueue. */
63void sctp_inq_free(struct sctp_inq *queue) 63void sctp_inq_free(struct sctp_inq *queue)
64{ 64{
65 struct sctp_chunk *chunk; 65 struct sctp_chunk *chunk, *tmp;
66 66
67 /* Empty the queue. */ 67 /* Empty the queue. */
68 while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL) 68 list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
69 list_del_init(&chunk->list);
69 sctp_chunk_free(chunk); 70 sctp_chunk_free(chunk);
71 }
70 72
71 /* If there is a packet which is currently being worked on, 73 /* If there is a packet which is currently being worked on,
72 * free it as well. 74 * free it as well.
@@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
92 * Eventually, we should clean up inqueue to not rely 94 * Eventually, we should clean up inqueue to not rely
93 * on the BH related data structures. 95 * on the BH related data structures.
94 */ 96 */
95 skb_queue_tail(&(q->in), (struct sk_buff *) packet); 97 list_add_tail(&packet->list, &q->in_chunk_list);
96 q->immediate.func(q->immediate.data); 98 q->immediate.func(q->immediate.data);
97} 99}
98 100
@@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
131 133
132 /* Do we need to take the next packet out of the queue to process? */ 134 /* Do we need to take the next packet out of the queue to process? */
133 if (!chunk) { 135 if (!chunk) {
136 struct list_head *entry;
137
134 /* Is the queue empty? */ 138 /* Is the queue empty? */
135 if (skb_queue_empty(&queue->in)) 139 if (list_empty(&queue->in_chunk_list))
136 return NULL; 140 return NULL;
137 141
142 entry = queue->in_chunk_list.next;
138 chunk = queue->in_progress = 143 chunk = queue->in_progress =
139 (struct sctp_chunk *) skb_dequeue(&queue->in); 144 list_entry(entry, struct sctp_chunk, list);
145 list_del_init(entry);
140 146
141 /* This is the first chunk in the packet. */ 147 /* This is the first chunk in the packet. */
142 chunk->singleton = 1; 148 chunk->singleton = 1;
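The inqueue above (and sctp_packet and sctp_outq in the hunks that follow) stops keeping chunks on sk_buff_head queues via pointer casts; each chunk now carries its own list_head and sits on a plain list. A compact userspace rendering of that intrusive-list pattern, with simplified helpers standing in for <linux/list.h>, showing why the "_safe" walk is allowed to free the current entry:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}

static void list_del_init(struct list_head *n)
{
        n->prev->next = n->next; n->next->prev = n->prev;
        INIT_LIST_HEAD(n);
}

struct chunk { int id; struct list_head list; };

int main(void)
{
        struct list_head q;
        struct list_head *pos, *tmp;
        int i;

        INIT_LIST_HEAD(&q);
        for (i = 0; i < 3; i++) {
                struct chunk *c = malloc(sizeof(*c));
                c->id = i;
                list_add_tail(&c->list, &q);
        }

        /* "safe" walk: save the next pointer before freeing the entry */
        for (pos = q.next, tmp = pos->next; pos != &q; pos = tmp, tmp = pos->next) {
                struct chunk *c = container_of(pos, struct chunk, list);
                printf("freeing chunk %d\n", c->id);
                list_del_init(&c->list);
                free(c);
        }
        return 0;
}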
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c7e42d125b..e9b2fd480d 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -91,7 +91,6 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
91 struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; 91 struct ipv6hdr *iph = (struct ipv6hdr *)skb->data;
92 struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); 92 struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
93 struct sock *sk; 93 struct sock *sk;
94 struct sctp_endpoint *ep;
95 struct sctp_association *asoc; 94 struct sctp_association *asoc;
96 struct sctp_transport *transport; 95 struct sctp_transport *transport;
97 struct ipv6_pinfo *np; 96 struct ipv6_pinfo *np;
@@ -105,7 +104,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
105 savesctp = skb->h.raw; 104 savesctp = skb->h.raw;
106 skb->nh.ipv6h = iph; 105 skb->nh.ipv6h = iph;
107 skb->h.raw = (char *)sh; 106 skb->h.raw = (char *)sh;
108 sk = sctp_err_lookup(AF_INET6, skb, sh, &ep, &asoc, &transport); 107 sk = sctp_err_lookup(AF_INET6, skb, sh, &asoc, &transport);
109 /* Put back, the original pointers. */ 108 /* Put back, the original pointers. */
110 skb->nh.raw = saveip; 109 skb->nh.raw = saveip;
111 skb->h.raw = savesctp; 110 skb->h.raw = savesctp;
@@ -124,7 +123,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
124 goto out_unlock; 123 goto out_unlock;
125 case ICMPV6_PARAMPROB: 124 case ICMPV6_PARAMPROB:
126 if (ICMPV6_UNK_NEXTHDR == code) { 125 if (ICMPV6_UNK_NEXTHDR == code) {
127 sctp_icmp_proto_unreachable(sk, ep, asoc, transport); 126 sctp_icmp_proto_unreachable(sk, asoc, transport);
128 goto out_unlock; 127 goto out_unlock;
129 } 128 }
130 break; 129 break;
@@ -142,7 +141,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
142 } 141 }
143 142
144out_unlock: 143out_unlock:
145 sctp_err_finish(sk, ep, asoc); 144 sctp_err_finish(sk, asoc);
146out: 145out:
147 if (likely(idev != NULL)) 146 if (likely(idev != NULL))
148 in6_dev_put(idev); 147 in6_dev_put(idev);
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 0781e5d509..8ff588f0d7 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -127,8 +127,12 @@ done:
127/* Initialize the objcount in the proc filesystem. */ 127/* Initialize the objcount in the proc filesystem. */
128void sctp_dbg_objcnt_init(void) 128void sctp_dbg_objcnt_init(void)
129{ 129{
130 create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp, 130 struct proc_dir_entry *ent;
131 ent = create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp,
131 sctp_dbg_objcnt_read, NULL); 132 sctp_dbg_objcnt_read, NULL);
133 if (!ent)
134 printk(KERN_WARNING
135 "sctp_dbg_objcnt: Unable to create /proc entry.\n");
132} 136}
133 137
134/* Cleanup the objcount entry in the proc filesystem. */ 138/* Cleanup the objcount entry in the proc filesystem. */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 84b5b370b0..9313716334 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
108 packet->transport = transport; 108 packet->transport = transport;
109 packet->source_port = sport; 109 packet->source_port = sport;
110 packet->destination_port = dport; 110 packet->destination_port = dport;
111 skb_queue_head_init(&packet->chunks); 111 INIT_LIST_HEAD(&packet->chunk_list);
112 if (asoc) { 112 if (asoc) {
113 struct sctp_sock *sp = sctp_sk(asoc->base.sk); 113 struct sctp_sock *sp = sctp_sk(asoc->base.sk);
114 overhead = sp->pf->af->net_header_len; 114 overhead = sp->pf->af->net_header_len;
@@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
129/* Free a packet. */ 129/* Free a packet. */
130void sctp_packet_free(struct sctp_packet *packet) 130void sctp_packet_free(struct sctp_packet *packet)
131{ 131{
132 struct sctp_chunk *chunk; 132 struct sctp_chunk *chunk, *tmp;
133 133
134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
135 135
136 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) 136 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
137 list_del_init(&chunk->list);
137 sctp_chunk_free(chunk); 138 sctp_chunk_free(chunk);
139 }
138 140
139 if (packet->malloced) 141 if (packet->malloced)
140 kfree(packet); 142 kfree(packet);
@@ -276,7 +278,7 @@ append:
276 packet->has_sack = 1; 278 packet->has_sack = 1;
277 279
278 /* It is OK to send this chunk. */ 280 /* It is OK to send this chunk. */
279 __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk); 281 list_add_tail(&chunk->list, &packet->chunk_list);
280 packet->size += chunk_len; 282 packet->size += chunk_len;
281 chunk->transport = packet->transport; 283 chunk->transport = packet->transport;
282finish: 284finish:
@@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
295 struct sctphdr *sh; 297 struct sctphdr *sh;
296 __u32 crc32; 298 __u32 crc32;
297 struct sk_buff *nskb; 299 struct sk_buff *nskb;
298 struct sctp_chunk *chunk; 300 struct sctp_chunk *chunk, *tmp;
299 struct sock *sk; 301 struct sock *sk;
300 int err = 0; 302 int err = 0;
301 int padding; /* How much padding do we need? */ 303 int padding; /* How much padding do we need? */
@@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet)
305 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 307 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
306 308
307 /* Do NOT generate a chunkless packet. */ 309 /* Do NOT generate a chunkless packet. */
308 chunk = (struct sctp_chunk *)skb_peek(&packet->chunks); 310 if (list_empty(&packet->chunk_list))
309 if (unlikely(!chunk))
310 return err; 311 return err;
311 312
312 /* Set up convenience variables... */ 313 /* Set up convenience variables... */
314 chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
313 sk = chunk->skb->sk; 315 sk = chunk->skb->sk;
314 316
315 /* Allocate the new skb. */ 317 /* Allocate the new skb. */
@@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
370 * [This whole comment explains WORD_ROUND() below.] 372 * [This whole comment explains WORD_ROUND() below.]
371 */ 373 */
372 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); 374 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n");
373 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 375 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
376 list_del_init(&chunk->list);
374 if (sctp_chunk_is_data(chunk)) { 377 if (sctp_chunk_is_data(chunk)) {
375 378
376 if (!chunk->has_tsn) { 379 if (!chunk->has_tsn) {
@@ -511,7 +514,8 @@ err:
511 * will get resent or dropped later. 514 * will get resent or dropped later.
512 */ 515 */
513 516
514 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 517 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
518 list_del_init(&chunk->list);
515 if (!sctp_chunk_is_data(chunk)) 519 if (!sctp_chunk_is_data(chunk))
516 sctp_chunk_free(chunk); 520 sctp_chunk_free(chunk);
517 } 521 }
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 4eb81a1407..efb72faba2 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
75static inline void sctp_outq_head_data(struct sctp_outq *q, 75static inline void sctp_outq_head_data(struct sctp_outq *q,
76 struct sctp_chunk *ch) 76 struct sctp_chunk *ch)
77{ 77{
78 __skb_queue_head(&q->out, (struct sk_buff *)ch); 78 list_add(&ch->list, &q->out_chunk_list);
79 q->out_qlen += ch->skb->len; 79 q->out_qlen += ch->skb->len;
80 return; 80 return;
81} 81}
@@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
83/* Take data from the front of the queue. */ 83/* Take data from the front of the queue. */
84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) 84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
85{ 85{
86 struct sctp_chunk *ch; 86 struct sctp_chunk *ch = NULL;
87 ch = (struct sctp_chunk *)__skb_dequeue(&q->out); 87
88 if (ch) 88 if (!list_empty(&q->out_chunk_list)) {
89 struct list_head *entry = q->out_chunk_list.next;
90
91 ch = list_entry(entry, struct sctp_chunk, list);
92 list_del_init(entry);
89 q->out_qlen -= ch->skb->len; 93 q->out_qlen -= ch->skb->len;
94 }
90 return ch; 95 return ch;
91} 96}
92/* Add data chunk to the end of the queue. */ 97/* Add data chunk to the end of the queue. */
93static inline void sctp_outq_tail_data(struct sctp_outq *q, 98static inline void sctp_outq_tail_data(struct sctp_outq *q,
94 struct sctp_chunk *ch) 99 struct sctp_chunk *ch)
95{ 100{
96 __skb_queue_tail(&q->out, (struct sk_buff *)ch); 101 list_add_tail(&ch->list, &q->out_chunk_list);
97 q->out_qlen += ch->skb->len; 102 q->out_qlen += ch->skb->len;
98 return; 103 return;
99} 104}
@@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary,
197void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) 202void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
198{ 203{
199 q->asoc = asoc; 204 q->asoc = asoc;
200 skb_queue_head_init(&q->out); 205 INIT_LIST_HEAD(&q->out_chunk_list);
201 skb_queue_head_init(&q->control); 206 INIT_LIST_HEAD(&q->control_chunk_list);
202 INIT_LIST_HEAD(&q->retransmit); 207 INIT_LIST_HEAD(&q->retransmit);
203 INIT_LIST_HEAD(&q->sacked); 208 INIT_LIST_HEAD(&q->sacked);
204 INIT_LIST_HEAD(&q->abandoned); 209 INIT_LIST_HEAD(&q->abandoned);
@@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q)
217{ 222{
218 struct sctp_transport *transport; 223 struct sctp_transport *transport;
219 struct list_head *lchunk, *pos, *temp; 224 struct list_head *lchunk, *pos, *temp;
220 struct sctp_chunk *chunk; 225 struct sctp_chunk *chunk, *tmp;
221 226
222 /* Throw away unacknowledged chunks. */ 227 /* Throw away unacknowledged chunks. */
223 list_for_each(pos, &q->asoc->peer.transport_addr_list) { 228 list_for_each(pos, &q->asoc->peer.transport_addr_list) {
@@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q)
269 q->error = 0; 274 q->error = 0;
270 275
271 /* Throw away any leftover control chunks. */ 276 /* Throw away any leftover control chunks. */
272 while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL) 277 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
278 list_del_init(&chunk->list);
273 sctp_chunk_free(chunk); 279 sctp_chunk_free(chunk);
280 }
274} 281}
275 282
276/* Free the outqueue structure and any related pending chunks. */ 283/* Free the outqueue structure and any related pending chunks. */
@@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
333 break; 340 break;
334 }; 341 };
335 } else { 342 } else {
336 __skb_queue_tail(&q->control, (struct sk_buff *) chunk); 343 list_add_tail(&chunk->list, &q->control_chunk_list);
337 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 344 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
338 } 345 }
339 346
@@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
650 __u16 sport = asoc->base.bind_addr.port; 657 __u16 sport = asoc->base.bind_addr.port;
651 __u16 dport = asoc->peer.port; 658 __u16 dport = asoc->peer.port;
652 __u32 vtag = asoc->peer.i.init_tag; 659 __u32 vtag = asoc->peer.i.init_tag;
653 struct sk_buff_head *queue;
654 struct sctp_transport *transport = NULL; 660 struct sctp_transport *transport = NULL;
655 struct sctp_transport *new_transport; 661 struct sctp_transport *new_transport;
656 struct sctp_chunk *chunk; 662 struct sctp_chunk *chunk, *tmp;
657 sctp_xmit_t status; 663 sctp_xmit_t status;
658 int error = 0; 664 int error = 0;
659 int start_timer = 0; 665 int start_timer = 0;
@@ -675,8 +681,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
675 * ... 681 * ...
676 */ 682 */
677 683
678 queue = &q->control; 684 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
679 while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) { 685 list_del_init(&chunk->list);
686
680 /* Pick the right transport to use. */ 687 /* Pick the right transport to use. */
681 new_transport = chunk->transport; 688 new_transport = chunk->transport;
682 689
@@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
814 821
815 /* Finally, transmit new packets. */ 822 /* Finally, transmit new packets. */
816 start_timer = 0; 823 start_timer = 0;
817 queue = &q->out;
818
819 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { 824 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
820 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid 825 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
821 * stream identifier. 826 * stream identifier.
@@ -1149,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
1149 /* See if all chunks are acked. 1154 /* See if all chunks are acked.
1150 * Make sure the empty queue handler will get run later. 1155 * Make sure the empty queue handler will get run later.
1151 */ 1156 */
1152 q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) && 1157 q->empty = (list_empty(&q->out_chunk_list) &&
1153 list_empty(&q->retransmit); 1158 list_empty(&q->control_chunk_list) &&
1159 list_empty(&q->retransmit));
1154 if (!q->empty) 1160 if (!q->empty)
1155 goto finish; 1161 goto finish;
1156 1162
@@ -1679,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1679 if (TSN_lte(tsn, ctsn)) { 1685 if (TSN_lte(tsn, ctsn)) {
1680 list_del_init(lchunk); 1686 list_del_init(lchunk);
1681 if (!chunk->tsn_gap_acked) { 1687 if (!chunk->tsn_gap_acked) {
1682 chunk->transport->flight_size -= 1688 chunk->transport->flight_size -=
1683 sctp_data_size(chunk); 1689 sctp_data_size(chunk);
1684 q->outstanding_bytes -= sctp_data_size(chunk); 1690 q->outstanding_bytes -= sctp_data_size(chunk);
1685 } 1691 }
1686 sctp_chunk_free(chunk); 1692 sctp_chunk_free(chunk);
1687 } else { 1693 } else {
@@ -1729,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1729 nskips, &ftsn_skip_arr[0]); 1735 nskips, &ftsn_skip_arr[0]);
1730 1736
1731 if (ftsn_chunk) { 1737 if (ftsn_chunk) {
1732 __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk); 1738 list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
1733 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 1739 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
1734 } 1740 }
1735} 1741}
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 98d49ec9b7..b74f7772b5 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -57,6 +57,7 @@ static struct snmp_mib sctp_snmp_list[] = {
57 SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), 57 SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS),
58 SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), 58 SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS),
59 SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), 59 SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS),
60 SNMP_MIB_SENTINEL
60}; 61};
61 62
62/* Return the current value of a particular entry in the mib by adding its 63/* Return the current value of a particular entry in the mib by adding its
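The missing SNMP_MIB_SENTINEL above matters because the proc walker stops at the first entry with a NULL name; without a terminator it could read past the end of the SCTP MIB array. A small self-contained illustration of sentinel-terminated iteration (the entry names below are made up, not the real SCTP MIB):

#include <stdio.h>

struct snmp_mib { const char *name; int entry; };

#define SNMP_MIB_ITEM(n, e)  { .name = (n), .entry = (e) }
#define SNMP_MIB_SENTINEL    { .name = NULL, .entry = 0 }

static const struct snmp_mib demo_list[] = {
        SNMP_MIB_ITEM("DemoCounterA", 1),
        SNMP_MIB_ITEM("DemoCounterB", 2),
        SNMP_MIB_SENTINEL
};

int main(void)
{
        const struct snmp_mib *m;

        for (m = demo_list; m->name; m++)     /* walk until the sentinel */
                printf("%s -> %d\n", m->name, m->entry);
        return 0;
}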
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5135e1a25d..ce9245e71f 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
219 219
220/* Copy the local addresses which are valid for 'scope' into 'bp'. */ 220/* Copy the local addresses which are valid for 'scope' into 'bp'. */
221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, 221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
222 int gfp, int copy_flags) 222 unsigned int __nocast gfp, int copy_flags)
223{ 223{
224 struct sctp_sockaddr_entry *addr; 224 struct sctp_sockaddr_entry *addr;
225 int error = 0; 225 int error = 0;
@@ -1050,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void)
1050 sctp_sndbuf_policy = 0; 1050 sctp_sndbuf_policy = 0;
1051 1051
1052 /* HB.interval - 30 seconds */ 1052 /* HB.interval - 30 seconds */
1053 sctp_hb_interval = 30 * HZ; 1053 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
1054
1055 /* delayed SACK timeout */
1056 sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
1054 1057
1055 /* Implementation specific variables. */ 1058 /* Implementation specific variables. */
1056 1059
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5baed9bb7d..00d32b7c82 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
78static int sctp_process_param(struct sctp_association *asoc, 78static int sctp_process_param(struct sctp_association *asoc,
79 union sctp_params param, 79 union sctp_params param,
80 const union sctp_addr *peer_addr, 80 const union sctp_addr *peer_addr,
81 int gfp); 81 unsigned int __nocast gfp);
82 82
83/* What was the inbound interface for this chunk? */ 83/* What was the inbound interface for this chunk? */
84int sctp_chunk_iif(const struct sctp_chunk *chunk) 84int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
174 */ 174 */
175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, 175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
176 const struct sctp_bind_addr *bp, 176 const struct sctp_bind_addr *bp,
177 int gfp, int vparam_len) 177 unsigned int __nocast gfp, int vparam_len)
178{ 178{
179 sctp_inithdr_t init; 179 sctp_inithdr_t init;
180 union sctp_params addrs; 180 union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
261 261
262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, 262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
263 const struct sctp_chunk *chunk, 263 const struct sctp_chunk *chunk,
264 int gfp, int unkparam_len) 264 unsigned int __nocast gfp, int unkparam_len)
265{ 265{
266 sctp_inithdr_t initack; 266 sctp_inithdr_t initack;
267 struct sctp_chunk *retval; 267 struct sctp_chunk *retval;
@@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); 1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb);
1004 } 1004 }
1005 1005
1006 INIT_LIST_HEAD(&retval->list);
1006 retval->skb = skb; 1007 retval->skb = skb;
1007 retval->asoc = (struct sctp_association *)asoc; 1008 retval->asoc = (struct sctp_association *)asoc;
1008 retval->resent = 0; 1009 retval->resent = 0;
@@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk)
1116/* Possibly, free the chunk. */ 1117/* Possibly, free the chunk. */
1117void sctp_chunk_free(struct sctp_chunk *chunk) 1118void sctp_chunk_free(struct sctp_chunk *chunk)
1118{ 1119{
1119 /* Make sure that we are not on any list. */ 1120 BUG_ON(!list_empty(&chunk->list));
1120 skb_unlink((struct sk_buff *) chunk);
1121 list_del_init(&chunk->transmitted_list); 1121 list_del_init(&chunk->transmitted_list);
1122 1122
1123 /* Release our reference on the message tracker. */ 1123 /* Release our reference on the message tracker. */
@@ -1233,7 +1233,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
1233 1233
1234/* Create a CLOSED association to use with an incoming packet. */ 1234/* Create a CLOSED association to use with an incoming packet. */
1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, 1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
1236 struct sctp_chunk *chunk, int gfp) 1236 struct sctp_chunk *chunk,
1237 unsigned int __nocast gfp)
1237{ 1238{
1238 struct sctp_association *asoc; 1239 struct sctp_association *asoc;
1239 struct sk_buff *skb; 1240 struct sk_buff *skb;
@@ -1348,7 +1349,7 @@ nodata:
1348struct sctp_association *sctp_unpack_cookie( 1349struct sctp_association *sctp_unpack_cookie(
1349 const struct sctp_endpoint *ep, 1350 const struct sctp_endpoint *ep,
1350 const struct sctp_association *asoc, 1351 const struct sctp_association *asoc,
1351 struct sctp_chunk *chunk, int gfp, 1352 struct sctp_chunk *chunk, unsigned int __nocast gfp,
1352 int *error, struct sctp_chunk **errp) 1353 int *error, struct sctp_chunk **errp)
1353{ 1354{
1354 struct sctp_association *retval = NULL; 1355 struct sctp_association *retval = NULL;
@@ -1812,7 +1813,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
1812 */ 1813 */
1813int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, 1814int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1814 const union sctp_addr *peer_addr, 1815 const union sctp_addr *peer_addr,
1815 sctp_init_chunk_t *peer_init, int gfp) 1816 sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
1816{ 1817{
1817 union sctp_params param; 1818 union sctp_params param;
1818 struct sctp_transport *transport; 1819 struct sctp_transport *transport;
@@ -1983,7 +1984,7 @@ nomem:
1983static int sctp_process_param(struct sctp_association *asoc, 1984static int sctp_process_param(struct sctp_association *asoc,
1984 union sctp_params param, 1985 union sctp_params param,
1985 const union sctp_addr *peer_addr, 1986 const union sctp_addr *peer_addr,
1986 int gfp) 1987 unsigned int __nocast gfp)
1987{ 1988{
1988 union sctp_addr addr; 1989 union sctp_addr addr;
1989 int i; 1990 int i;
@@ -2739,8 +2740,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
2739 asoc->addip_last_asconf = NULL; 2740 asoc->addip_last_asconf = NULL;
2740 2741
2741 /* Send the next asconf chunk from the addip chunk queue. */ 2742 /* Send the next asconf chunk from the addip chunk queue. */
2742 asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks); 2743 if (!list_empty(&asoc->addip_chunk_list)) {
2743 if (asconf) { 2744 struct list_head *entry = asoc->addip_chunk_list.next;
2745 asconf = list_entry(entry, struct sctp_chunk, list);
2746
2747 list_del_init(entry);
2748
2744 /* Hold the chunk until an ASCONF_ACK is received. */ 2749 /* Hold the chunk until an ASCONF_ACK is received. */
2745 sctp_chunk_hold(asconf); 2750 sctp_chunk_hold(asconf);
2746 if (sctp_primitive_ASCONF(asoc, asconf)) 2751 if (sctp_primitive_ASCONF(asoc, asconf))
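With chunks living on their own list_head, sctp_chunk_free() in the hunk above can no longer silently unlink via skb_unlink(); it now asserts that the caller has already taken the chunk off any chunk list. A short kernel-style sketch (not a literal copy) of the resulting contract:

/* Contract after this patch: whoever dequeues a chunk does
 *     list_del_init(&chunk->list);
 * before handing it to sctp_chunk_free(). Forgetting that now trips the
 * BUG_ON instead of silently corrupting a queue. */
static void chunk_free_contract(struct sctp_chunk *chunk)
{
        BUG_ON(!list_empty(&chunk->list));      /* still queued somewhere? */
        list_del_init(&chunk->transmitted_list);
        /* ... drop the datamsg reference and free the skb as before ... */
}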
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 778639db12..39c970b5b1 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
63 void *event_arg, 63 void *event_arg,
64 sctp_disposition_t status, 64 sctp_disposition_t status,
65 sctp_cmd_seq_t *commands, 65 sctp_cmd_seq_t *commands,
66 int gfp); 66 unsigned int __nocast gfp);
67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, 67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
68 sctp_state_t state, 68 sctp_state_t state,
69 struct sctp_endpoint *ep, 69 struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
71 void *event_arg, 71 void *event_arg,
72 sctp_disposition_t status, 72 sctp_disposition_t status,
73 sctp_cmd_seq_t *commands, 73 sctp_cmd_seq_t *commands,
74 int gfp); 74 unsigned int __nocast gfp);
75 75
76/******************************************************************** 76/********************************************************************
77 * Helper functions 77 * Helper functions
@@ -497,7 +497,8 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, 497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
498 struct sctp_association *asoc, 498 struct sctp_association *asoc,
499 struct sctp_chunk *chunk, 499 struct sctp_chunk *chunk,
500 sctp_init_chunk_t *peer_init, int gfp) 500 sctp_init_chunk_t *peer_init,
501 unsigned int __nocast gfp)
501{ 502{
502 int error; 503 int error;
503 504
@@ -852,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
852 struct sctp_endpoint *ep, 853 struct sctp_endpoint *ep,
853 struct sctp_association *asoc, 854 struct sctp_association *asoc,
854 void *event_arg, 855 void *event_arg,
855 int gfp) 856 unsigned int __nocast gfp)
856{ 857{
857 sctp_cmd_seq_t commands; 858 sctp_cmd_seq_t commands;
858 const sctp_sm_table_entry_t *state_fn; 859 const sctp_sm_table_entry_t *state_fn;
@@ -897,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
897 void *event_arg, 898 void *event_arg,
898 sctp_disposition_t status, 899 sctp_disposition_t status,
899 sctp_cmd_seq_t *commands, 900 sctp_cmd_seq_t *commands,
900 int gfp) 901 unsigned int __nocast gfp)
901{ 902{
902 int error; 903 int error;
903 904
@@ -985,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
985 void *event_arg, 986 void *event_arg,
986 sctp_disposition_t status, 987 sctp_disposition_t status,
987 sctp_cmd_seq_t *commands, 988 sctp_cmd_seq_t *commands,
988 int gfp) 989 unsigned int __nocast gfp)
989{ 990{
990 int error = 0; 991 int error = 0;
991 int force; 992 int force;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 058189684c..86073df418 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
92 sctp_cmd_seq_t *commands); 92 sctp_cmd_seq_t *commands);
93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); 93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
94 94
95static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
96 __u16 error,
97 const struct sctp_association *asoc,
98 struct sctp_transport *transport);
99
100static sctp_disposition_t sctp_sf_violation_chunklen(
101 const struct sctp_endpoint *ep,
102 const struct sctp_association *asoc,
103 const sctp_subtype_t type,
104 void *arg,
105 sctp_cmd_seq_t *commands);
95 106
96/* Small helper function that checks if the chunk length 107/* Small helper function that checks if the chunk length
97 * is of the appropriate length. The 'required_length' argument 108 * is of the appropriate length. The 'required_length' argument
@@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
2328 * 2339 *
2329 * This is common code called by several sctp_sf_*_abort() functions above. 2340 * This is common code called by several sctp_sf_*_abort() functions above.
2330 */ 2341 */
2331sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, 2342static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
2332 __u16 error, 2343 __u16 error,
2333 const struct sctp_association *asoc, 2344 const struct sctp_association *asoc,
2334 struct sctp_transport *transport) 2345 struct sctp_transport *transport)
@@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
3687 * 3698 *
3688 * Generate an ABORT chunk and terminate the association. 3699 * Generate an ABORT chunk and terminate the association.
3689 */ 3700 */
3690sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, 3701static sctp_disposition_t sctp_sf_violation_chunklen(
3702 const struct sctp_endpoint *ep,
3691 const struct sctp_association *asoc, 3703 const struct sctp_association *asoc,
3692 const sctp_subtype_t type, 3704 const sctp_subtype_t type,
3693 void *arg, 3705 void *arg,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aad55dc379..091a66f06a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -406,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc,
406 * transmission. 406 * transmission.
407 */ 407 */
408 if (asoc->addip_last_asconf) { 408 if (asoc->addip_last_asconf) {
409 __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk); 409 list_add_tail(&chunk->list, &asoc->addip_chunk_list);
410 goto out; 410 goto out;
411 } 411 }
412 412
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index e627d2b451..25037daf3f 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -57,7 +57,8 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
57/* Create a new sctp_ssnmap. 57/* Create a new sctp_ssnmap.
58 * Allocate room to store at least 'len' contiguous TSNs. 58 * Allocate room to store at least 'len' contiguous TSNs.
59 */ 59 */
60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp) 60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
61 unsigned int __nocast gfp)
61{ 62{
62 struct sctp_ssnmap *retval; 63 struct sctp_ssnmap *retval;
63 int size; 64 int size;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc3184931..dc4893474f 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -47,6 +47,8 @@
47static ctl_handler sctp_sysctl_jiffies_ms; 47static ctl_handler sctp_sysctl_jiffies_ms;
48static long rto_timer_min = 1; 48static long rto_timer_min = 1;
49static long rto_timer_max = 86400000; /* One day */ 49static long rto_timer_max = 86400000; /* One day */
50static long sack_timer_min = 1;
51static long sack_timer_max = 500;
50 52
51static ctl_table sctp_table[] = { 53static ctl_table sctp_table[] = {
52 { 54 {
@@ -187,6 +189,17 @@ static ctl_table sctp_table[] = {
187 .mode = 0644, 189 .mode = 0644,
188 .proc_handler = &proc_dointvec 190 .proc_handler = &proc_dointvec
189 }, 191 },
192 {
193 .ctl_name = NET_SCTP_SACK_TIMEOUT,
194 .procname = "sack_timeout",
195 .data = &sctp_sack_timeout,
196 .maxlen = sizeof(long),
197 .mode = 0644,
198 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
199 .strategy = &sctp_sysctl_jiffies_ms,
200 .extra1 = &sack_timer_min,
201 .extra2 = &sack_timer_max,
202 },
190 { .ctl_name = 0 } 203 { .ctl_name = 0 }
191}; 204};
192 205
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 0ec0fde6e6..d2f04ebe50 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
57/* Initialize a new transport from provided memory. */ 57/* Initialize a new transport from provided memory. */
58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, 58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
59 const union sctp_addr *addr, 59 const union sctp_addr *addr,
60 int gfp) 60 unsigned int __nocast gfp)
61{ 61{
62 /* Copy in the address. */ 62 /* Copy in the address. */
63 peer->ipaddr = *addr; 63 peer->ipaddr = *addr;
@@ -103,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
103 103
104 /* Set up the heartbeat timer. */ 104 /* Set up the heartbeat timer. */
105 init_timer(&peer->hb_timer); 105 init_timer(&peer->hb_timer);
106 peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
107 peer->hb_timer.function = sctp_generate_heartbeat_event; 106 peer->hb_timer.function = sctp_generate_heartbeat_event;
108 peer->hb_timer.data = (unsigned long)peer; 107 peer->hb_timer.data = (unsigned long)peer;
109 108
@@ -122,7 +121,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
122} 121}
123 122
124/* Allocate and initialize a new transport. */ 123/* Allocate and initialize a new transport. */
125struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp) 124struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
125 unsigned int __nocast gfp)
126{ 126{
127 struct sctp_transport *transport; 127 struct sctp_transport *transport;
128 128
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 17d0ff5347..0abd510110 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
74 74
75/* Create a new sctp_ulpevent. */ 75/* Create a new sctp_ulpevent. */
76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, 76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
77 int gfp) 77 unsigned int __nocast gfp)
78{ 78{
79 struct sctp_ulpevent *event; 79 struct sctp_ulpevent *event;
80 struct sk_buff *skb; 80 struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( 136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
137 const struct sctp_association *asoc, 137 const struct sctp_association *asoc,
138 __u16 flags, __u16 state, __u16 error, __u16 outbound, 138 __u16 flags, __u16 state, __u16 error, __u16 outbound,
139 __u16 inbound, int gfp) 139 __u16 inbound, unsigned int __nocast gfp)
140{ 140{
141 struct sctp_ulpevent *event; 141 struct sctp_ulpevent *event;
142 struct sctp_assoc_change *sac; 142 struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( 237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
238 const struct sctp_association *asoc, 238 const struct sctp_association *asoc,
239 const struct sockaddr_storage *aaddr, 239 const struct sockaddr_storage *aaddr,
240 int flags, int state, int error, int gfp) 240 int flags, int state, int error, unsigned int __nocast gfp)
241{ 241{
242 struct sctp_ulpevent *event; 242 struct sctp_ulpevent *event;
243 struct sctp_paddr_change *spc; 243 struct sctp_paddr_change *spc;
@@ -350,7 +350,7 @@ fail:
350 */ 350 */
351struct sctp_ulpevent *sctp_ulpevent_make_remote_error( 351struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
352 const struct sctp_association *asoc, struct sctp_chunk *chunk, 352 const struct sctp_association *asoc, struct sctp_chunk *chunk,
353 __u16 flags, int gfp) 353 __u16 flags, unsigned int __nocast gfp)
354{ 354{
355 struct sctp_ulpevent *event; 355 struct sctp_ulpevent *event;
356 struct sctp_remote_error *sre; 356 struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
448 */ 448 */
449struct sctp_ulpevent *sctp_ulpevent_make_send_failed( 449struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
450 const struct sctp_association *asoc, struct sctp_chunk *chunk, 450 const struct sctp_association *asoc, struct sctp_chunk *chunk,
451 __u16 flags, __u32 error, int gfp) 451 __u16 flags, __u32 error, unsigned int __nocast gfp)
452{ 452{
453 struct sctp_ulpevent *event; 453 struct sctp_ulpevent *event;
454 struct sctp_send_failed *ssf; 454 struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
557 */ 557 */
558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( 558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
559 const struct sctp_association *asoc, 559 const struct sctp_association *asoc,
560 __u16 flags, int gfp) 560 __u16 flags, unsigned int __nocast gfp)
561{ 561{
562 struct sctp_ulpevent *event; 562 struct sctp_ulpevent *event;
563 struct sctp_shutdown_event *sse; 563 struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
620 * 5.3.1.6 SCTP_ADAPTION_INDICATION 620 * 5.3.1.6 SCTP_ADAPTION_INDICATION
621 */ 621 */
622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication( 622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
623 const struct sctp_association *asoc, int gfp) 623 const struct sctp_association *asoc, unsigned int __nocast gfp)
624{ 624{
625 struct sctp_ulpevent *event; 625 struct sctp_ulpevent *event;
626 struct sctp_adaption_event *sai; 626 struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
657 */ 657 */
658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, 658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
659 struct sctp_chunk *chunk, 659 struct sctp_chunk *chunk,
660 int gfp) 660 unsigned int __nocast gfp)
661{ 661{
662 struct sctp_ulpevent *event = NULL; 662 struct sctp_ulpevent *event = NULL;
663 struct sk_buff *skb; 663 struct sk_buff *skb;
@@ -718,7 +718,8 @@ fail:
718 * various events. 718 * various events.
719 */ 719 */
720struct sctp_ulpevent *sctp_ulpevent_make_pdapi( 720struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
721 const struct sctp_association *asoc, __u32 indication, int gfp) 721 const struct sctp_association *asoc, __u32 indication,
722 unsigned int __nocast gfp)
722{ 723{
723 struct sctp_ulpevent *event; 724 struct sctp_ulpevent *event;
724 struct sctp_pdapi_event *pd; 725 struct sctp_pdapi_event *pd;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d5dd2cf7ac..8bbc279d6c 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
100 100
101/* Process an incoming DATA chunk. */ 101/* Process an incoming DATA chunk. */
102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
103 int gfp) 103 unsigned int __nocast gfp)
104{ 104{
105 struct sk_buff_head temp; 105 struct sk_buff_head temp;
106 sctp_data_chunk_t *hdr; 106 sctp_data_chunk_t *hdr;
@@ -778,7 +778,8 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
778 778
779/* Partial deliver the first message as there is pressure on rwnd. */ 779/* Partial deliver the first message as there is pressure on rwnd. */
780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, 780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
781 struct sctp_chunk *chunk, int gfp) 781 struct sctp_chunk *chunk,
782 unsigned int __nocast gfp)
782{ 783{
783 struct sctp_ulpevent *event; 784 struct sctp_ulpevent *event;
784 struct sctp_association *asoc; 785 struct sctp_association *asoc;
@@ -802,7 +803,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
802 803
803/* Renege some packets to make room for an incoming chunk. */ 804/* Renege some packets to make room for an incoming chunk. */
804void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 805void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
805 int gfp) 806 unsigned int __nocast gfp)
806{ 807{
807 struct sctp_association *asoc; 808 struct sctp_association *asoc;
808 __u16 needed, freed; 809 __u16 needed, freed;
@@ -841,7 +842,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
841/* Notify the application if an association is aborted and in 842/* Notify the application if an association is aborted and in
842 * partial delivery mode. Send up any pending received messages. 843 * partial delivery mode. Send up any pending received messages.
843 */ 844 */
844void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp) 845void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
845{ 846{
846 struct sctp_ulpevent *ev = NULL; 847 struct sctp_ulpevent *ev = NULL;
847 struct sock *sk; 848 struct sock *sk;
diff --git a/net/socket.c b/net/socket.c
index 38729af094..6f2a178819 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -383,9 +383,8 @@ int sock_map_fd(struct socket *sock)
383 goto out; 383 goto out;
384 } 384 }
385 385
386 sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino); 386 this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
387 this.name = name; 387 this.name = name;
388 this.len = strlen(name);
389 this.hash = SOCK_INODE(sock)->i_ino; 388 this.hash = SOCK_INODE(sock)->i_ino;
390 389
391 file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this); 390 file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
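The sock_map_fd() tweak above relies on sprintf() returning the number of characters it wrote, which makes the separate strlen() call redundant. A trivial standalone check of that behaviour:

#include <stdio.h>

int main(void)
{
        char name[32];
        unsigned long ino = 12345;

        /* sprintf() returns the length written, excluding the NUL */
        int len = sprintf(name, "[%lu]", ino);

        printf("%s -> len %d\n", name, len);   /* "[12345]" -> len 7 */
        return 0;
}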
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 24c21f2a33..5a7265aeaf 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -185,9 +185,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
185 sg->page = body->pages[i]; 185 sg->page = body->pages[i];
186 sg->offset = offset; 186 sg->offset = offset;
187 sg->length = thislen; 187 sg->length = thislen;
188 kmap(sg->page); /* XXX kmap_atomic? */
189 crypto_digest_update(tfm, sg, 1); 188 crypto_digest_update(tfm, sg, 1);
190 kunmap(sg->page);
191 len -= thislen; 189 len -= thislen;
192 i++; 190 i++;
193 offset = 0; 191 offset = 0;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 32e8acbc60..62a0734952 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,6 +41,7 @@ EXPORT_SYMBOL(rpc_release_task);
41 41
42/* RPC client functions */ 42/* RPC client functions */
43EXPORT_SYMBOL(rpc_create_client); 43EXPORT_SYMBOL(rpc_create_client);
44EXPORT_SYMBOL(rpc_new_client);
44EXPORT_SYMBOL(rpc_clone_client); 45EXPORT_SYMBOL(rpc_clone_client);
45EXPORT_SYMBOL(rpc_bind_new_program); 46EXPORT_SYMBOL(rpc_bind_new_program);
46EXPORT_SYMBOL(rpc_destroy_client); 47EXPORT_SYMBOL(rpc_destroy_client);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2b99b4028d..d6baf6fdf8 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -8,6 +8,7 @@
8#include <linux/err.h> 8#include <linux/err.h>
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/string.h>
11 12
12#define RPCDBG_FACILITY RPCDBG_AUTH 13#define RPCDBG_FACILITY RPCDBG_AUTH
13 14
@@ -20,14 +21,6 @@
20 */ 21 */
21 22
22 23
23static char *strdup(char *s)
24{
25 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
26 if (rv)
27 strcpy(rv, s);
28 return rv;
29}
30
31struct unix_domain { 24struct unix_domain {
32 struct auth_domain h; 25 struct auth_domain h;
33 int addr_changes; 26 int addr_changes;
@@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name)
55 if (new == NULL) 48 if (new == NULL)
56 return NULL; 49 return NULL;
57 cache_init(&new->h.h); 50 cache_init(&new->h.h);
58 new->h.name = strdup(name); 51 new->h.name = kstrdup(name, GFP_KERNEL);
59 new->h.flavour = RPC_AUTH_UNIX; 52 new->h.flavour = RPC_AUTH_UNIX;
60 new->addr_changes = 0; 53 new->addr_changes = 0;
61 new->h.h.expiry_time = NEVER; 54 new->h.h.expiry_time = NEVER;
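svcauth_unix drops its private strdup() in favour of the shared kstrdup() helper (hence the new <linux/string.h> include). Roughly what that helper does, sketched from memory and simplified:

/* Approximation of kstrdup(): duplicate a NUL-terminated string using the
 * caller's GFP flags instead of hard-coding GFP_KERNEL as the removed local
 * copy did. Returns NULL on allocation failure or NULL input. */
char *kstrdup_sketch(const char *s, unsigned int gfp)
{
        size_t len;
        char *buf;

        if (!s)
                return NULL;

        len = strlen(s) + 1;
        buf = kmalloc(len, gfp);
        if (buf)
                memcpy(buf, s, len);
        return buf;
}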
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc..d0c3120d02 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -586,7 +586,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
586 } 586 }
587 if (skb->stamp.tv_sec == 0) { 587 if (skb->stamp.tv_sec == 0) {
588 skb->stamp.tv_sec = xtime.tv_sec; 588 skb->stamp.tv_sec = xtime.tv_sec;
589 skb->stamp.tv_usec = xtime.tv_nsec * 1000; 589 skb->stamp.tv_usec = xtime.tv_nsec / NSEC_PER_USEC;
590 /* Don't enable netstamp, sunrpc doesn't 590 /* Don't enable netstamp, sunrpc doesn't
591 need that much accuracy */ 591 need that much accuracy */
592 } 592 }
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1185 arg->page_len = (pages-2)*PAGE_SIZE; 1185 arg->page_len = (pages-2)*PAGE_SIZE;
1186 arg->len = (pages-1)*PAGE_SIZE; 1186 arg->len = (pages-1)*PAGE_SIZE;
1187 arg->tail[0].iov_len = 0; 1187 arg->tail[0].iov_len = 0;
1188 1188
1189 try_to_freeze(PF_FREEZE); 1189 try_to_freeze();
1190 if (signalled()) 1190 if (signalled())
1191 return -EINTR; 1191 return -EINTR;
1192 1192
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1227 1227
1228 schedule_timeout(timeout); 1228 schedule_timeout(timeout);
1229 1229
1230 try_to_freeze(PF_FREEZE); 1230 try_to_freeze();
1231 1231
1232 spin_lock_bh(&serv->sv_lock); 1232 spin_lock_bh(&serv->sv_lock);
1233 remove_wait_queue(&rqstp->rq_wait, &wait); 1233 remove_wait_queue(&rqstp->rq_wait, &wait);
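The svc_udp_recvfrom() fix above corrects a unit error: xtime.tv_nsec holds nanoseconds, and microseconds are obtained by dividing by NSEC_PER_USEC (1000), not by multiplying by 1000. (The try_to_freeze() calls also lose their PF_FREEZE argument to match the updated freezer API.) The arithmetic, spelled out:

#include <stdio.h>

#define NSEC_PER_USEC 1000L

int main(void)
{
        long tv_nsec = 250000000L;            /* 0.25 s expressed in ns */
        long usec = tv_nsec / NSEC_PER_USEC;  /* -> 250000 microseconds */

        printf("%ld ns = %ld us\n", tv_nsec, usec);
        return 0;
}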
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 8a4d9c106a..fde16f40a5 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -993,6 +993,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
993 return -EINVAL; 993 return -EINVAL;
994 } else { 994 } else {
995 if (xdr_decode_word(buf, base, &desc->array_len) != 0 || 995 if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
996 desc->array_len > desc->array_maxlen ||
996 (unsigned long) base + 4 + desc->array_len * 997 (unsigned long) base + 4 + desc->array_len *
997 desc->elem_size > buf->len) 998 desc->elem_size > buf->len)
998 return -EINVAL; 999 return -EINVAL;
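
The new comparison in xdr_xcode_array2() rejects a decoded element count that exceeds the caller-supplied array_maxlen before that count is multiplied by elem_size, so an oversized length from the wire can no longer push the later offset arithmetic past (or around) the buffer. The same guard as a self-contained sketch with hypothetical parameter names:

	#include <linux/errno.h>	/* EINVAL */

	/* Hypothetical check mirroring the added test: validate a decoded element
	 * count against the caller's maximum before it feeds size arithmetic.   */
	static int check_decoded_array(unsigned int count, unsigned int max_count,
				       unsigned int elem_size, unsigned int base,
				       unsigned int buf_len)
	{
		if (count > max_count)
			return -EINVAL;
		if ((unsigned long)base + 4 + (unsigned long)count * elem_size > buf_len)
			return -EINVAL;
		return 0;
	}
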
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index eca9240594..3c654e06b0 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
145 if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { 145 if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) {
146 if (task == xprt->snd_task) 146 if (task == xprt->snd_task)
147 return 1; 147 return 1;
148 if (task == NULL)
149 return 0;
150 goto out_sleep; 148 goto out_sleep;
151 } 149 }
152 if (xprt->nocong || __xprt_get_cong(xprt, task)) { 150 if (xprt->nocong || __xprt_get_cong(xprt, task)) {
@@ -970,7 +968,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
970 goto out; 968 goto out;
971 } 969 }
972 970
973 dprintk("RPC: XID %08x read %u bytes\n", 971 dprintk("RPC: XID %08x read %Zd bytes\n",
974 ntohl(xprt->tcp_xid), r); 972 ntohl(xprt->tcp_xid), r);
975 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", 973 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
976 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); 974 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
@@ -1006,7 +1004,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
1006 desc->count -= len; 1004 desc->count -= len;
1007 desc->offset += len; 1005 desc->offset += len;
1008 xprt->tcp_offset += len; 1006 xprt->tcp_offset += len;
1009 dprintk("RPC: discarded %u bytes\n", len); 1007 dprintk("RPC: discarded %Zu bytes\n", len);
1010 tcp_check_recm(xprt); 1008 tcp_check_recm(xprt);
1011} 1009}
1012 1010
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 0000000000..5a69733bcd
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,21 @@
1#
2# Unix Domain Sockets
3#
4
5config UNIX
6 tristate "Unix domain sockets"
7 ---help---
8 If you say Y here, you will include support for Unix domain sockets;
9 sockets are the standard Unix mechanism for establishing and
10 accessing network connections. Many commonly used programs such as
11 the X Window system and syslog use these sockets even if your
12 machine is not connected to any network. Unless you are working on
13 an embedded system or something similar, you therefore definitely
14 want to say Y here.
15
16 To compile this driver as a module, choose M here: the module will be
17 called unix. Note that several important services won't work
18 correctly if you say M here and then neglect to load the module.
19
20 Say Y unless you know what you are doing.
21
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c420eba487..d403e34088 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk)
302 * may receive messages only from that peer. */ 302 * may receive messages only from that peer. */
303static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 303static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
304{ 304{
305 if (skb_queue_len(&sk->sk_receive_queue)) { 305 if (!skb_queue_empty(&sk->sk_receive_queue)) {
306 skb_queue_purge(&sk->sk_receive_queue); 306 skb_queue_purge(&sk->sk_receive_queue);
307 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 307 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
308 308
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo)
1619 for (;;) { 1619 for (;;) {
1620 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 1620 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1621 1621
1622 if (skb_queue_len(&sk->sk_receive_queue) || 1622 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1623 sk->sk_err || 1623 sk->sk_err ||
1624 (sk->sk_shutdown & RCV_SHUTDOWN) || 1624 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1625 signal_pending(current) || 1625 signal_pending(current) ||
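
Both af_unix.c hunks replace skb_queue_len() used as a truth value with !skb_queue_empty(). The condition is logically unchanged, but skb_queue_empty() tests the list head directly and states the intent, rather than reading the length counter merely to compare it with zero. A minimal sketch of the idiom, with a hypothetical wrapper name:

	#include <net/sock.h>		/* struct sock, sk_receive_queue */
	#include <linux/skbuff.h>	/* skb_queue_empty() */

	/* Hypothetical predicate: "is anything waiting on the receive queue?" */
	static inline int rx_queue_has_data(struct sock *sk)
	{
		return !skb_queue_empty(&sk->sk_receive_queue);
	}
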
diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig
new file mode 100644
index 0000000000..1debe1cb05
--- /dev/null
+++ b/net/wanrouter/Kconfig
@@ -0,0 +1,29 @@
1#
2# Configuration for WAN router
3#
4
5config WAN_ROUTER
6 tristate "WAN router"
7 depends on EXPERIMENTAL
8 ---help---
9 Wide Area Networks (WANs), such as X.25, frame relay and leased
10 lines, are used to interconnect Local Area Networks (LANs) over vast
11 distances with data transfer rates significantly higher than those
12 achievable with commonly used asynchronous modem connections.
13 Usually, a quite expensive external device called a `WAN router' is
14 needed to connect to a WAN.
15
16 As an alternative, WAN routing can be built into the Linux kernel.
17 With relatively inexpensive WAN interface cards available on the
18 market, a perfectly usable router can be built for less than half
19 the price of an external router. If you have one of those cards and
20 wish to use your Linux box as a WAN router, say Y here and also to
21 the WAN driver for your card, below. You will then need the
22 wan-tools package which is available from <ftp://ftp.sangoma.com/>.
23 Read <file:Documentation/networking/wan-router.txt> for more
24 information.
25
26 To compile WAN routing support as a module, choose M here: the
27 module will be called wanrouter.
28
29 If unsure, say N.
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index d6844ac226..13b650ad22 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
358 */ 358 */
359 359
360 360
361unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) 361__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
362{ 362{
363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */ 363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */
364 unsigned short ethertype; 364 __be16 ethertype;
365 365
366 switch (skb->data[cnt]) { 366 switch (skb->data[cnt]) {
367 case NLPID_IP: /* IP datagramm */ 367 case NLPID_IP: /* IP datagramm */
@@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
379 skb->data[cnt+3], dev->name); 379 skb->data[cnt+3], dev->name);
380 return 0; 380 return 0;
381 } 381 }
382 ethertype = *((unsigned short*)&skb->data[cnt+4]); 382 ethertype = *((__be16*)&skb->data[cnt+4]);
383 cnt += 6; 383 cnt += 6;
384 break; 384 break;
385 385
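
wanrouter_type_trans() now returns __be16 and keeps the ethertype read out of the frame in that type, documenting that the value stays in network byte order all the way to the caller; sparse can then warn whenever it is mixed with a host-order value without a conversion. A short sketch of the annotation in use, with a hypothetical helper name:

	#include <linux/types.h>	/* __be16 */
	#include <linux/if_ether.h>	/* ETH_P_IP */
	#include <asm/byteorder.h>	/* htons() */

	/* Hypothetical check: compare a big-endian ethertype against a protocol
	 * constant by converting the constant, never the wire value.           */
	static int is_ipv4_frame(__be16 ethertype)
	{
		return ethertype == htons(ETH_P_IP);
	}
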
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 0000000000..e6759c9660
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,36 @@
1#
2# CCITT X.25 Packet Layer
3#
4
5config X25
6 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 X.25 is a set of standardized network protocols, similar in scope to
10 frame relay; the one physical line from your box to the X.25 network
11 entry point can carry several logical point-to-point connections
12 (called "virtual circuits") to other computers connected to the X.25
13 network. Governments, banks, and other organizations tend to use it
14 to connect to each other or to form Wide Area Networks (WANs). Many
15 countries have public X.25 networks. X.25 consists of two
16 protocols: the higher level Packet Layer Protocol (PLP) (say Y here
17 if you want that) and the lower level data link layer protocol LAPB
18 (say Y to "LAPB Data Link Driver" below if you want that).
19
20 You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
21 <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
22 Information about X.25 for Linux is contained in the files
23 <file:Documentation/networking/x25.txt> and
24 <file:Documentation/networking/x25-iface.txt>.
25
26 One connects to an X.25 network either with a dedicated network card
27 using the X.21 protocol (not yet supported by Linux) or one can do
28 X.25 over a standard telephone line using an ordinary modem (say Y
29 to "X.25 async driver" below) or over Ethernet using an ordinary
30 Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
31 Driver" and "LAPB over Ethernet driver" below).
32
33 To compile this driver as a module, choose M here: the module
34 will be called x25. If unsure, say N.
35
36
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 2a24b243b8..04bec047fa 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -29,6 +29,10 @@
29 * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN 29 * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN
30 * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to 30 * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to
31 * x25_proc.c, using seq_file 31 * x25_proc.c, using seq_file
32 * 2005-04-02 Shaun Pereira Selective sub address matching
33 * with call user data
34 * 2005-04-15 Shaun Pereira Fast select with no restriction on
35 * response
32 */ 36 */
33 37
34#include <linux/config.h> 38#include <linux/config.h>
@@ -219,7 +223,8 @@ static void x25_insert_socket(struct sock *sk)
219 * Note: if a listening socket has cud set it must only get calls 223 * Note: if a listening socket has cud set it must only get calls
220 * with matching cud. 224 * with matching cud.
221 */ 225 */
222static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata) 226static struct sock *x25_find_listener(struct x25_address *addr,
227 struct sk_buff *skb)
223{ 228{
224 struct sock *s; 229 struct sock *s;
225 struct sock *next_best; 230 struct sock *next_best;
@@ -230,22 +235,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu
230 235
231 sk_for_each(s, node, &x25_list) 236 sk_for_each(s, node, &x25_list)
232 if ((!strcmp(addr->x25_addr, 237 if ((!strcmp(addr->x25_addr,
233 x25_sk(s)->source_addr.x25_addr) || 238 x25_sk(s)->source_addr.x25_addr) ||
234 !strcmp(addr->x25_addr, 239 !strcmp(addr->x25_addr,
235 null_x25_address.x25_addr)) && 240 null_x25_address.x25_addr)) &&
236 s->sk_state == TCP_LISTEN) { 241 s->sk_state == TCP_LISTEN) {
237
238 /* 242 /*
239 * Found a listening socket, now check the incoming 243 * Found a listening socket, now check the incoming
240 * call user data vs this sockets call user data 244 * call user data vs this sockets call user data
241 */ 245 */
242 if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) { 246 if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) {
243 sock_hold(s); 247 if((memcmp(x25_sk(s)->calluserdata.cuddata,
244 goto found; 248 skb->data,
245 } 249 x25_sk(s)->cudmatchlength)) == 0) {
246 if (x25_sk(s)->calluserdata.cudlength == 0) { 250 sock_hold(s);
251 goto found;
252 }
253 } else
247 next_best = s; 254 next_best = s;
248 }
249 } 255 }
250 if (next_best) { 256 if (next_best) {
251 s = next_best; 257 s = next_best;
@@ -497,6 +503,9 @@ static int x25_create(struct socket *sock, int protocol)
497 x25->t23 = sysctl_x25_clear_request_timeout; 503 x25->t23 = sysctl_x25_clear_request_timeout;
498 x25->t2 = sysctl_x25_ack_holdback_timeout; 504 x25->t2 = sysctl_x25_ack_holdback_timeout;
499 x25->state = X25_STATE_0; 505 x25->state = X25_STATE_0;
506 x25->cudmatchlength = 0;
507 x25->accptapprv = X25_DENY_ACCPT_APPRV; /* normally no cud */
508 /* on call accept */
500 509
501 x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; 510 x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE;
502 x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; 511 x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -545,6 +554,8 @@ static struct sock *x25_make_new(struct sock *osk)
545 x25->t2 = ox25->t2; 554 x25->t2 = ox25->t2;
546 x25->facilities = ox25->facilities; 555 x25->facilities = ox25->facilities;
547 x25->qbitincl = ox25->qbitincl; 556 x25->qbitincl = ox25->qbitincl;
557 x25->cudmatchlength = ox25->cudmatchlength;
558 x25->accptapprv = ox25->accptapprv;
548 559
549 x25_init_timers(sk); 560 x25_init_timers(sk);
550out: 561out:
@@ -822,7 +833,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
822 struct x25_sock *makex25; 833 struct x25_sock *makex25;
823 struct x25_address source_addr, dest_addr; 834 struct x25_address source_addr, dest_addr;
824 struct x25_facilities facilities; 835 struct x25_facilities facilities;
825 struct x25_calluserdata calluserdata;
826 int len, rc; 836 int len, rc;
827 837
828 /* 838 /*
@@ -845,19 +855,10 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
845 skb_pull(skb,len); 855 skb_pull(skb,len);
846 856
847 /* 857 /*
848 * Incoming Call User Data.
849 */
850 if (skb->len >= 0) {
851 memcpy(calluserdata.cuddata, skb->data, skb->len);
852 calluserdata.cudlength = skb->len;
853 }
854
855 skb_push(skb,len);
856
857 /*
858 * Find a listener for the particular address/cud pair. 858 * Find a listener for the particular address/cud pair.
859 */ 859 */
860 sk = x25_find_listener(&source_addr,&calluserdata); 860 sk = x25_find_listener(&source_addr,skb);
861 skb_push(skb,len);
861 862
862 /* 863 /*
863 * We can't accept the Call Request. 864 * We can't accept the Call Request.
@@ -900,11 +901,23 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
900 makex25->neighbour = nb; 901 makex25->neighbour = nb;
901 makex25->facilities = facilities; 902 makex25->facilities = facilities;
902 makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; 903 makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
903 makex25->calluserdata = calluserdata; 904 /* ensure no reverse facil on accept */
904 905 makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
905 x25_write_internal(make, X25_CALL_ACCEPTED); 906 makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
907
 908 /* Normally all calls are accepted immediately */
909 if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) {
910 x25_write_internal(make, X25_CALL_ACCEPTED);
911 makex25->state = X25_STATE_3;
912 }
906 913
907 makex25->state = X25_STATE_3; 914 /*
915 * Incoming Call User Data.
916 */
917 if (skb->len >= 0) {
918 memcpy(makex25->calluserdata.cuddata, skb->data, skb->len);
919 makex25->calluserdata.cudlength = skb->len;
920 }
908 921
909 sk->sk_ack_backlog++; 922 sk->sk_ack_backlog++;
910 923
@@ -1288,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1288 if (facilities.throughput < 0x03 || 1301 if (facilities.throughput < 0x03 ||
1289 facilities.throughput > 0xDD) 1302 facilities.throughput > 0xDD)
1290 break; 1303 break;
1291 if (facilities.reverse && facilities.reverse != 1) 1304 if (facilities.reverse &&
1305 (facilities.reverse | 0x81)!= 0x81)
1292 break; 1306 break;
1293 x25->facilities = facilities; 1307 x25->facilities = facilities;
1294 rc = 0; 1308 rc = 0;
@@ -1325,6 +1339,44 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1325 break; 1339 break;
1326 } 1340 }
1327 1341
1342 case SIOCX25SCUDMATCHLEN: {
1343 struct x25_subaddr sub_addr;
1344 rc = -EINVAL;
1345 if(sk->sk_state != TCP_CLOSE)
1346 break;
1347 rc = -EFAULT;
1348 if (copy_from_user(&sub_addr, argp,
1349 sizeof(sub_addr)))
1350 break;
1351 rc = -EINVAL;
1352 if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
1353 break;
1354 x25->cudmatchlength = sub_addr.cudmatchlength;
1355 rc = 0;
1356 break;
1357 }
1358
1359 case SIOCX25CALLACCPTAPPRV: {
1360 rc = -EINVAL;
1361 if (sk->sk_state != TCP_CLOSE)
1362 break;
1363 x25->accptapprv = X25_ALLOW_ACCPT_APPRV;
1364 rc = 0;
1365 break;
1366 }
1367
1368 case SIOCX25SENDCALLACCPT: {
1369 rc = -EINVAL;
1370 if (sk->sk_state != TCP_ESTABLISHED)
1371 break;
1372 if (x25->accptapprv) /* must call accptapprv above */
1373 break;
1374 x25_write_internal(sk, X25_CALL_ACCEPTED);
1375 x25->state = X25_STATE_3;
1376 rc = 0;
1377 break;
1378 }
1379
1328 default: 1380 default:
1329 rc = dev_ioctl(cmd, argp); 1381 rc = dev_ioctl(cmd, argp);
1330 break; 1382 break;
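
The three new ioctl cases expose the features added elsewhere in this patch: SIOCX25SCUDMATCHLEN sets how many leading bytes of incoming call user data must match the listener's own before x25_find_listener() will pick it, SIOCX25CALLACCPTAPPRV switches the socket to application-approved call acceptance, and SIOCX25SENDCALLACCPT sends the deferred Call Accepted packet once the application has inspected the call. A hedged userspace sketch of the intended sequence, assuming the struct x25_subaddr layout introduced by this patch, with hypothetical descriptor names and no error handling:

	#include <sys/ioctl.h>
	#include <linux/x25.h>	/* SIOCX25* and struct x25_subaddr from this patch */

	/* Hypothetical setup, done while the socket is still closed (before listen());
	 * sockets returned by accept() inherit both settings.                          */
	static void setup_cud_matching(int listen_fd)
	{
		struct x25_subaddr sub = { .cudmatchlength = 4 };	/* match first 4 CUD bytes */

		ioctl(listen_fd, SIOCX25SCUDMATCHLEN, &sub);
		ioctl(listen_fd, SIOCX25CALLACCPTAPPRV, NULL);	/* defer acceptance to the application */
	}

	/* Later, on the accepted socket, once the call user data has been examined. */
	static void approve_call(int conn_fd)
	{
		ioctl(conn_fd, SIOCX25SENDCALLACCPT, NULL);	/* now emit Call Accepted */
	}
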
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21bdb95f9..54278b962f 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -17,6 +17,8 @@
17 * X.25 001 Split from x25_subr.c 17 * X.25 001 Split from x25_subr.c
18 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities 18 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities
19 * negotiation. 19 * negotiation.
20 * apr/14/05 Shaun Pereira - Allow fast select with no restriction
21 * on response.
20 */ 22 */
21 23
22#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb,
43 case X25_FAC_CLASS_A: 45 case X25_FAC_CLASS_A:
44 switch (*p) { 46 switch (*p) {
45 case X25_FAC_REVERSE: 47 case X25_FAC_REVERSE:
46 facilities->reverse = p[1] & 0x01; 48 if((p[1] & 0x81) == 0x81) {
47 *vc_fac_mask |= X25_MASK_REVERSE; 49 facilities->reverse = p[1] & 0x81;
48 break; 50 *vc_fac_mask |= X25_MASK_REVERSE;
51 break;
52 }
53
54 if((p[1] & 0x01) == 0x01) {
55 facilities->reverse = p[1] & 0x01;
56 *vc_fac_mask |= X25_MASK_REVERSE;
57 break;
58 }
59
60 if((p[1] & 0x80) == 0x80) {
61 facilities->reverse = p[1] & 0x80;
62 *vc_fac_mask |= X25_MASK_REVERSE;
63 break;
64 }
65
66 if(p[1] == 0x00) {
67 facilities->reverse
68 = X25_DEFAULT_REVERSE;
69 *vc_fac_mask |= X25_MASK_REVERSE;
70 break;
71 }
72
49 case X25_FAC_THROUGHPUT: 73 case X25_FAC_THROUGHPUT:
50 facilities->throughput = p[1]; 74 facilities->throughput = p[1];
51 *vc_fac_mask |= X25_MASK_THROUGHPUT; 75 *vc_fac_mask |= X25_MASK_THROUGHPUT;
@@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer,
122 146
123 if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) { 147 if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) {
124 *p++ = X25_FAC_REVERSE; 148 *p++ = X25_FAC_REVERSE;
125 *p++ = !!facilities->reverse; 149 *p++ = facilities->reverse;
126 } 150 }
127 151
128 if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) { 152 if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) {
@@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
171 /* 195 /*
172 * They want reverse charging, we won't accept it. 196 * They want reverse charging, we won't accept it.
173 */ 197 */
174 if (theirs.reverse && ours->reverse) { 198 if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) {
175 SOCK_DEBUG(sk, "X.25: rejecting reverse charging request"); 199 SOCK_DEBUG(sk, "X.25: rejecting reverse charging request");
176 return -1; 200 return -1;
177 } 201 }
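
With these changes the reverse-charging facility byte is no longer collapsed to a boolean: bit 0x01 still requests reverse charging, while bit 0x80 is used by this patch to carry the fast-select request (0x81 covers both, and 0x00 falls back to X25_DEFAULT_REVERSE). x25_create_facilities() now echoes the stored bits back verbatim, and the negotiation check only refuses the call when the reverse-charging bit proper is set on both ends. A small decode sketch using the same masks; the macro and helper names are hypothetical:

	/* Masks as this patch interprets the facility parameter byte. */
	#define FAC_REVERSE_CHARGING	0x01
	#define FAC_FAST_SELECT		0x80

	static int wants_reverse_charging(unsigned char fac)
	{
		return (fac & FAC_REVERSE_CHARGING) != 0;
	}

	static int wants_fast_select(unsigned char fac)
	{
		return (fac & FAC_FAST_SELECT) != 0;
	}
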
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 183fea3bba..7fd872ad0c 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -19,6 +19,8 @@
19 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities 19 * mar/20/00 Daniela Squassoni Disabling/enabling of facilities
20 * negotiation. 20 * negotiation.
21 * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups 21 * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups
22 * apr/04/15 Shaun Pereira Fast select with no
23 * restriction on response.
22 */ 24 */
23 25
24#include <linux/kernel.h> 26#include <linux/kernel.h>
@@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype)
127 len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN + 129 len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN +
128 X25_MAX_CUD_LEN; 130 X25_MAX_CUD_LEN;
129 break; 131 break;
130 case X25_CALL_ACCEPTED: 132 case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
131 len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; 133 if(x25->facilities.reverse & 0x80) {
134 len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
135 } else {
136 len += 1 + X25_MAX_FAC_LEN;
137 }
132 break; 138 break;
133 case X25_CLEAR_REQUEST: 139 case X25_CLEAR_REQUEST:
134 case X25_RESET_REQUEST: 140 case X25_RESET_REQUEST:
@@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype)
203 x25->vc_facil_mask); 209 x25->vc_facil_mask);
204 dptr = skb_put(skb, len); 210 dptr = skb_put(skb, len);
205 memcpy(dptr, facilities, len); 211 memcpy(dptr, facilities, len);
206 dptr = skb_put(skb, x25->calluserdata.cudlength); 212
207 memcpy(dptr, x25->calluserdata.cuddata, 213 /* fast select with no restriction on response
208 x25->calluserdata.cudlength); 214 allows call user data. Userland must
215 ensure it is ours and not theirs */
216 if(x25->facilities.reverse & 0x80) {
217 dptr = skb_put(skb,
218 x25->calluserdata.cudlength);
219 memcpy(dptr, x25->calluserdata.cuddata,
220 x25->calluserdata.cudlength);
221 }
209 x25->calluserdata.cudlength = 0; 222 x25->calluserdata.cudlength = 0;
210 break; 223 break;
211 224
@@ -354,21 +367,3 @@ void x25_check_rbuf(struct sock *sk)
354 } 367 }
355} 368}
356 369
357/*
358 * Compare 2 calluserdata structures, used to find correct listening sockets
359 * when call user data is used.
360 */
361int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs)
362{
363 int i;
364 if (ours->cudlength != theirs->cudlength)
365 return 0;
366
367 for (i=0;i<ours->cudlength;i++) {
368 if (ours->cuddata[i] != theirs->cuddata[i]) {
369 return 0;
370 }
371 }
372 return 1;
373}
374
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 58ca6a972c..0c1c04322b 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -1,6 +1,10 @@
1# 1#
2# XFRM configuration 2# XFRM configuration
3# 3#
4config XFRM
5 bool
6 depends on NET
7
4config XFRM_USER 8config XFRM_USER
5 tristate "IPsec user configuration interface" 9 tristate "IPsec user configuration interface"
6 depends on INET && XFRM 10 depends on INET && XFRM
@@ -10,3 +14,14 @@ config XFRM_USER
10 14
11 If unsure, say Y. 15 If unsure, say Y.
12 16
17config NET_KEY
18 tristate "PF_KEY sockets"
19 select XFRM
20 ---help---
21 PF_KEYv2 socket family, compatible to KAME ones.
22 They are required if you are going to use IPsec tools ported
23 from KAME.
24
25 Say Y unless you know what you are doing.
26
27
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ecade4893a..8da3e25b2c 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1350,6 +1350,9 @@ static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
1350 if (nr > XFRM_MAX_DEPTH) 1350 if (nr > XFRM_MAX_DEPTH)
1351 return NULL; 1351 return NULL;
1352 1352
1353 if (p->dir > XFRM_POLICY_OUT)
1354 return NULL;
1355
1353 xp = xfrm_policy_alloc(GFP_KERNEL); 1356 xp = xfrm_policy_alloc(GFP_KERNEL);
1354 if (xp == NULL) { 1357 if (xp == NULL) {
1355 *dir = -ENOBUFS; 1358 *dir = -ENOBUFS;
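
The added test in xfrm_compile_policy() validates the userspace-supplied policy direction before anything else uses it: only XFRM_POLICY_IN (0) and XFRM_POLICY_OUT (1) make sense for a per-socket policy, so larger values are refused up front rather than carried forward as an out-of-range index. The same guard as a standalone sketch; the helper name is hypothetical and the constants are assumed to come from the xfrm headers:

	#include <net/xfrm.h>	/* XFRM_POLICY_IN, XFRM_POLICY_OUT (header location may vary by version) */

	/* Hypothetical validation of a direction value received from userspace. */
	static int xfrm_dir_is_valid(unsigned int dir)
	{
		return dir <= XFRM_POLICY_OUT;	/* accepts only XFRM_POLICY_IN and XFRM_POLICY_OUT */
	}
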