author	Dave Jones <davej@redhat.com>	2005-08-18 01:56:07 -0400
committer	Dave Jones <davej@redhat.com>	2005-08-18 01:56:07 -0400
commit	a8b3e6f10f08f66ae1072efd087b30966a3654f6 (patch)
tree	1d1409855f8ad5beabafe061c6453edd84ba94c8 /net
parent	46acac3b4fd8ef66eec63b51de8d556a17c7d4f7 (diff)
parent	099d44e869f1886b5eb02a5145ca97b5e4142e28 (diff)
Merge /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'net')
-rw-r--r--	net/802/fddi.c	4
-rw-r--r--	net/8021q/Kconfig	19
-rw-r--r--	net/8021q/vlan.c	8
-rw-r--r--	net/Kconfig	472
-rw-r--r--	net/appletalk/aarp.c	9
-rw-r--r--	net/appletalk/ddp.c	2
-rw-r--r--	net/atm/Kconfig	74
-rw-r--r--	net/atm/br2684.c	3
-rw-r--r--	net/atm/svc.c	4
-rw-r--r--	net/bluetooth/cmtp/core.c	6
-rw-r--r--	net/bluetooth/hci_core.c	2
-rw-r--r--	net/bluetooth/hci_event.c	4
-rw-r--r--	net/bluetooth/hidp/core.c	5
-rw-r--r--	net/bluetooth/lib.c	25
-rw-r--r--	net/bluetooth/rfcomm/core.c	4
-rw-r--r--	net/bluetooth/rfcomm/sock.c	7
-rw-r--r--	net/bluetooth/rfcomm/tty.c	2
-rw-r--r--	net/bridge/Kconfig	31
-rw-r--r--	net/bridge/br_forward.c	3
-rw-r--r--	net/bridge/br_input.c	4
-rw-r--r--	net/bridge/br_netfilter.c	40
-rw-r--r--	net/bridge/netfilter/Kconfig	2
-rw-r--r--	net/bridge/netfilter/ebt_log.c	6
-rw-r--r--	net/bridge/netfilter/ebtables.c	21
-rw-r--r--	net/compat.c	9
-rw-r--r--	net/core/Makefile	6
-rw-r--r--	net/core/dev.c	143
-rw-r--r--	net/core/dst.c	15
-rw-r--r--	net/core/filter.c	104
-rw-r--r--	net/core/neighbour.c	342
-rw-r--r--	net/core/net-sysfs.c	17
-rw-r--r--	net/core/netfilter.c	138
-rw-r--r--	net/core/netpoll.c	133
-rw-r--r--	net/core/pktgen.c	31
-rw-r--r--	net/core/request_sock.c	64
-rw-r--r--	net/core/rtnetlink.c	35
-rw-r--r--	net/core/skbuff.c	186
-rw-r--r--	net/core/sock.c	59
-rw-r--r--	net/core/sysctl_net_core.c	61
-rw-r--r--	net/core/utils.c	37
-rw-r--r--	net/core/wireless.c	75
-rw-r--r--	net/decnet/Kconfig	23
-rw-r--r--	net/decnet/af_decnet.c	21
-rw-r--r--	net/decnet/dn_dev.c	9
-rw-r--r--	net/decnet/dn_fib.c	3
-rw-r--r--	net/decnet/dn_neigh.c	3
-rw-r--r--	net/decnet/dn_nsp_out.c	3
-rw-r--r--	net/decnet/dn_route.c	11
-rw-r--r--	net/decnet/dn_rules.c	7
-rw-r--r--	net/decnet/dn_table.c	8
-rw-r--r--	net/econet/Kconfig	36
-rw-r--r--	net/ethernet/eth.c	9
-rw-r--r--	net/ipv4/Kconfig	154
-rw-r--r--	net/ipv4/Makefile	16
-rw-r--r--	net/ipv4/af_inet.c	24
-rw-r--r--	net/ipv4/ah4.c	2
-rw-r--r--	net/ipv4/devinet.c	11
-rw-r--r--	net/ipv4/esp4.c	2
-rw-r--r--	net/ipv4/fib_frontend.c	55
-rw-r--r--	net/ipv4/fib_hash.c	3
-rw-r--r--	net/ipv4/fib_lookup.h	3
-rw-r--r--	net/ipv4/fib_rules.c	7
-rw-r--r--	net/ipv4/fib_semantics.c	19
-rw-r--r--	net/ipv4/fib_trie.c	2612
-rw-r--r--	net/ipv4/icmp.c	15
-rw-r--r--	net/ipv4/igmp.c	96
-rw-r--r--	net/ipv4/ip_fragment.c	8
-rw-r--r--	net/ipv4/ip_gre.c	21
-rw-r--r--	net/ipv4/ip_input.c	11
-rw-r--r--	net/ipv4/ip_output.c	30
-rw-r--r--	net/ipv4/ip_sockglue.c	15
-rw-r--r--	net/ipv4/ipcomp.c	11
-rw-r--r--	net/ipv4/ipconfig.c	4
-rw-r--r--	net/ipv4/ipip.c	56
-rw-r--r--	net/ipv4/ipmr.c	17
-rw-r--r--	net/ipv4/ipvs/Kconfig	4
-rw-r--r--	net/ipv4/ipvs/ip_vs_conn.c	31
-rw-r--r--	net/ipv4/ipvs/ip_vs_ctl.c	17
-rw-r--r--	net/ipv4/ipvs/ip_vs_sync.c	4
-rw-r--r--	net/ipv4/ipvs/ip_vs_xmit.c	1
-rw-r--r--	net/ipv4/multipath_drr.c	2
-rw-r--r--	net/ipv4/multipath_random.c	2
-rw-r--r--	net/ipv4/multipath_rr.c	2
-rw-r--r--	net/ipv4/multipath_wrandom.c	2
-rw-r--r--	net/ipv4/netfilter/arp_tables.c	1
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_amanda.c	15
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_core.c	157
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_ftp.c	21
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_irc.c	15
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_proto_sctp.c	23
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_proto_tcp.c	27
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_proto_udp.c	1
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_standalone.c	32
-rw-r--r--	net/ipv4/netfilter/ip_conntrack_tftp.c	8
-rw-r--r--	net/ipv4/netfilter/ip_nat_amanda.c	4
-rw-r--r--	net/ipv4/netfilter/ip_nat_core.c	32
-rw-r--r--	net/ipv4/netfilter/ip_nat_ftp.c	4
-rw-r--r--	net/ipv4/netfilter/ip_nat_helper.c	13
-rw-r--r--	net/ipv4/netfilter/ip_nat_irc.c	4
-rw-r--r--	net/ipv4/netfilter/ip_nat_proto_icmp.c	7
-rw-r--r--	net/ipv4/netfilter/ip_nat_proto_tcp.c	3
-rw-r--r--	net/ipv4/netfilter/ip_nat_proto_udp.c	3
-rw-r--r--	net/ipv4/netfilter/ip_nat_rule.c	4
-rw-r--r--	net/ipv4/netfilter/ip_nat_standalone.c	9
-rw-r--r--	net/ipv4/netfilter/ip_nat_tftp.c	4
-rw-r--r--	net/ipv4/netfilter/ip_tables.c	1
-rw-r--r--	net/ipv4/netfilter/ipt_CLUSTERIP.c	58
-rw-r--r--	net/ipv4/netfilter/ipt_MASQUERADE.c	10
-rw-r--r--	net/ipv4/netfilter/ipt_REJECT.c	13
-rw-r--r--	net/ipv4/netfilter/ipt_ULOG.c	15
-rw-r--r--	net/ipv4/netfilter/ipt_hashlimit.c	17
-rw-r--r--	net/ipv4/netfilter/ipt_helper.c	4
-rw-r--r--	net/ipv4/netfilter/ipt_recent.c	10
-rw-r--r--	net/ipv4/raw.c	22
-rw-r--r--	net/ipv4/route.c	156
-rw-r--r--	net/ipv4/syncookies.c	49
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c	123
-rw-r--r--	net/ipv4/tcp.c	171
-rw-r--r--	net/ipv4/tcp_bic.c	331
-rw-r--r--	net/ipv4/tcp_cong.c	237
-rw-r--r--	net/ipv4/tcp_diag.c	71
-rw-r--r--	net/ipv4/tcp_highspeed.c	181
-rw-r--r--	net/ipv4/tcp_htcp.c	289
-rw-r--r--	net/ipv4/tcp_hybla.c	187
-rw-r--r--	net/ipv4/tcp_input.c	824
-rw-r--r--	net/ipv4/tcp_ipv4.c	191
-rw-r--r--	net/ipv4/tcp_minisocks.c	72
-rw-r--r--	net/ipv4/tcp_output.c	615
-rw-r--r--	net/ipv4/tcp_scalable.c	68
-rw-r--r--	net/ipv4/tcp_timer.c	23
-rw-r--r--	net/ipv4/tcp_vegas.c	411
-rw-r--r--	net/ipv4/tcp_westwood.c	259
-rw-r--r--	net/ipv4/udp.c	34
-rw-r--r--	net/ipv4/utils.c	59
-rw-r--r--	net/ipv4/xfrm4_output.c	8
-rw-r--r--	net/ipv4/xfrm4_state.c	9
-rw-r--r--	net/ipv4/xfrm4_tunnel.c	5
-rw-r--r--	net/ipv6/Kconfig	23
-rw-r--r--	net/ipv6/addrconf.c	90
-rw-r--r--	net/ipv6/af_inet6.c	4
-rw-r--r--	net/ipv6/ah6.c	2
-rw-r--r--	net/ipv6/anycast.c	4
-rw-r--r--	net/ipv6/datagram.c	6
-rw-r--r--	net/ipv6/esp6.c	2
-rw-r--r--	net/ipv6/icmp.c	14
-rw-r--r--	net/ipv6/ip6_fib.c	19
-rw-r--r--	net/ipv6/ip6_flowlabel.c	1
-rw-r--r--	net/ipv6/ip6_input.c	9
-rw-r--r--	net/ipv6/ip6_output.c	11
-rw-r--r--	net/ipv6/ip6_tunnel.c	39
-rw-r--r--	net/ipv6/ipcomp6.c	9
-rw-r--r--	net/ipv6/ipv6_sockglue.c	8
-rw-r--r--	net/ipv6/mcast.c	95
-rw-r--r--	net/ipv6/ndisc.c	4
-rw-r--r--	net/ipv6/netfilter/ip6_queue.c	2
-rw-r--r--	net/ipv6/netfilter/ip6_tables.c	1
-rw-r--r--	net/ipv6/netfilter/ip6t_LOG.c	57
-rw-r--r--	net/ipv6/netfilter/ip6table_raw.c	6
-rw-r--r--	net/ipv6/raw.c	10
-rw-r--r--	net/ipv6/route.c	79
-rw-r--r--	net/ipv6/sit.c	21
-rw-r--r--	net/ipv6/tcp_ipv6.c	152
-rw-r--r--	net/ipv6/udp.c	4
-rw-r--r--	net/ipv6/xfrm6_tunnel.c	2
-rw-r--r--	net/ipx/Kconfig	33
-rw-r--r--	net/irda/irlap.c	3
-rw-r--r--	net/irda/irlap_event.c	14
-rw-r--r--	net/irda/irlap_frame.c	8
-rw-r--r--	net/irda/irttp.c	2
-rw-r--r--	net/key/af_key.c	385
-rw-r--r--	net/lapb/Kconfig	22
-rw-r--r--	net/llc/llc_c_ev.c	2
-rw-r--r--	net/netlink/af_netlink.c	26
-rw-r--r--	net/packet/Kconfig	26
-rw-r--r--	net/packet/af_packet.c	6
-rw-r--r--	net/rxrpc/krxiod.c	2
-rw-r--r--	net/rxrpc/krxsecd.c	2
-rw-r--r--	net/rxrpc/krxtimod.c	2
-rw-r--r--	net/sched/Kconfig	52
-rw-r--r--	net/sched/Makefile	3
-rw-r--r--	net/sched/act_api.c	23
-rw-r--r--	net/sched/cls_api.c	7
-rw-r--r--	net/sched/cls_basic.c	3
-rw-r--r--	net/sched/cls_rsvp.h	1
-rw-r--r--	net/sched/em_meta.c	349
-rw-r--r--	net/sched/em_text.c	154
-rw-r--r--	net/sched/sch_api.c	75
-rw-r--r--	net/sched/sch_blackhole.c	54
-rw-r--r--	net/sched/sch_cbq.c	3
-rw-r--r--	net/sched/sch_dsmark.c	357
-rw-r--r--	net/sched/sch_fifo.c	152
-rw-r--r--	net/sched/sch_generic.c	122
-rw-r--r--	net/sched/sch_red.c	2
-rw-r--r--	net/sctp/associola.c	164
-rw-r--r--	net/sctp/bind_addr.c	16
-rw-r--r--	net/sctp/chunk.c	2
-rw-r--r--	net/sctp/endpointola.c	20
-rw-r--r--	net/sctp/input.c	122
-rw-r--r--	net/sctp/inqueue.c	18
-rw-r--r--	net/sctp/ipv6.c	43
-rw-r--r--	net/sctp/objcnt.c	6
-rw-r--r--	net/sctp/output.c	22
-rw-r--r--	net/sctp/outqueue.c	61
-rw-r--r--	net/sctp/proc.c	194
-rw-r--r--	net/sctp/protocol.c	14
-rw-r--r--	net/sctp/sm_make_chunk.c	47
-rw-r--r--	net/sctp/sm_sideeffect.c	118
-rw-r--r--	net/sctp/sm_statefuns.c	162
-rw-r--r--	net/sctp/sm_statetable.c	6
-rw-r--r--	net/sctp/socket.c	427
-rw-r--r--	net/sctp/ssnmap.c	3
-rw-r--r--	net/sctp/sysctl.c	13
-rw-r--r--	net/sctp/transport.c	10
-rw-r--r--	net/sctp/ulpevent.c	19
-rw-r--r--	net/sctp/ulpqueue.c	9
-rw-r--r--	net/socket.c	12
-rw-r--r--	net/sunrpc/auth.c	6
-rw-r--r--	net/sunrpc/auth_gss/auth_gss.c	18
-rw-r--r--	net/sunrpc/clnt.c	205
-rw-r--r--	net/sunrpc/pmap_clnt.c	9
-rw-r--r--	net/sunrpc/sched.c	84
-rw-r--r--	net/sunrpc/sunrpc_syms.c	7
-rw-r--r--	net/sunrpc/svc.c	36
-rw-r--r--	net/sunrpc/svcauth_unix.c	11
-rw-r--r--	net/sunrpc/svcsock.c	8
-rw-r--r--	net/sunrpc/xdr.c	299
-rw-r--r--	net/sunrpc/xprt.c	73
-rw-r--r--	net/unix/Kconfig	21
-rw-r--r--	net/unix/af_unix.c	4
-rw-r--r--	net/wanrouter/Kconfig	29
-rw-r--r--	net/wanrouter/wanmain.c	6
-rw-r--r--	net/x25/Kconfig	36
-rw-r--r--	net/x25/af_x25.c	110
-rw-r--r--	net/x25/x25_facilities.c	34
-rw-r--r--	net/x25/x25_subr.c	41
-rw-r--r--	net/xfrm/Kconfig	15
-rw-r--r--	net/xfrm/xfrm_policy.c	9
-rw-r--r--	net/xfrm/xfrm_state.c	118
-rw-r--r--	net/xfrm/xfrm_user.c	300
239 files changed, 11435 insertions, 4628 deletions
diff --git a/net/802/fddi.c b/net/802/fddi.c
index ebcf4830d6f1..5ce24c4bb840 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff *skb)
  * the proper pointer to the start of packet data (skb->data).
  */
 
-unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct fddihdr *fddi = (struct fddihdr *)skb->data;
-	unsigned short type;
+	__be16 type;
 
 	/*
 	 * Set mac.raw field to point to FC byte, set data field to point
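
The fddi_type_trans() hunk above is one instance of a tree-wide switch from plain unsigned short to the endianness-annotated __be16 type for on-wire protocol fields, so that sparse (run with -D__CHECK_ENDIAN__) can flag missing byte-order conversions. A minimal sketch of the idiom, with hypothetical names, not taken from this patch:

    #include <linux/types.h>        /* __be16 */
    #include <asm/byteorder.h>      /* htons(), ntohs() */

    /* Protocol IDs travel big-endian on the wire.  Keeping them in a
     * __be16 means sparse will warn if they are compared with or
     * assigned from host-order values without an explicit htons()
     * or ntohs(). */
    static __be16 wire_proto_from_host(unsigned short host_proto)
    {
            return htons(host_proto);       /* explicit conversion */
    }

Call sites then compare against converted constants, e.g. htons(ETH_P_IP), rather than raw host-order numbers.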
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 000000000000..c4a382e450e2
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,19 @@
+#
+# Configuration for 802.1Q VLAN support
+#
+
+config VLAN_8021Q
+	tristate "802.1Q VLAN Support"
+	---help---
+	  Select this and you will be able to create 802.1Q VLAN interfaces
+	  on your ethernet interfaces. 802.1Q VLAN supports almost
+	  everything a regular ethernet interface does, including
+	  firewalling, bridging, and of course IP traffic. You will need
+	  the 'vconfig' tool from the VLAN project in order to effectively
+	  use VLANs. See the VLAN web page for more information:
+	  <http://www.candelatech.com/~greear/vlan.html>
+
+	  To compile this code as a module, choose M here: the module
+	  will be called 8021q.
+
+	  If unsure, say N.
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1f6d31670bc7..91e412b0ab00 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -578,6 +578,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		if (!vlandev)
 			continue;
 
+		if (netif_carrier_ok(dev)) {
+			if (!netif_carrier_ok(vlandev))
+				netif_carrier_on(vlandev);
+		} else {
+			if (netif_carrier_ok(vlandev))
+				netif_carrier_off(vlandev);
+		}
+
 		if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) {
 			vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK)
 					| flgs;
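
The added block keeps each VLAN interface's link state in step with the real device that carries it. A condensed sketch of the same propagation pattern for a generic parent/child device pair (hypothetical helper; only netif_carrier_ok/on/off are assumed):

    #include <linux/netdevice.h>

    /* Mirror a parent device's carrier onto a stacked virtual device.
     * Checking before toggling avoids generating redundant link events. */
    static void sync_carrier(struct net_device *parent, struct net_device *virt)
    {
            if (netif_carrier_ok(parent)) {
                    if (!netif_carrier_ok(virt))
                            netif_carrier_on(virt);
            } else {
                    if (netif_carrier_ok(virt))
                            netif_carrier_off(virt);
            }
    }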
diff --git a/net/Kconfig b/net/Kconfig
index 9251b28e8d5d..40a31ba86d2c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -2,7 +2,7 @@
 # Network configuration
 #
 
-menu "Networking support"
+menu "Networking"
 
 config NET
 	bool "Networking support"
@@ -10,7 +10,9 @@ config NET
 	  Unless you really know what you are doing, you should say Y here.
 	  The reason is that some programs need kernel networking support even
 	  when running on a stand-alone machine that isn't connected to any
-	  other computer. If you are upgrading from an older kernel, you
+	  other computer.
+
+	  If you are upgrading from an older kernel, you
 	  should consider updating your networking tools too because changes
 	  in the kernel and the tools often go hand in hand. The tools are
 	  contained in the package net-tools, the location and version number
@@ -20,57 +22,14 @@ config NET
 	  recommended to read the NET-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
-menu "Networking options"
-	depends on NET
-
-config PACKET
-	tristate "Packet socket"
-	---help---
-	  The Packet protocol is used by applications which communicate
-	  directly with network devices without an intermediate network
-	  protocol implemented in the kernel, e.g. tcpdump. If you want them
-	  to work, choose Y.
+# Make sure that all config symbols are dependent on NET
+if NET
 
-	  To compile this driver as a module, choose M here: the module will
-	  be called af_packet.
-
-	  If unsure, say Y.
-
-config PACKET_MMAP
-	bool "Packet socket: mmapped IO"
-	depends on PACKET
-	help
-	  If you say Y here, the Packet protocol driver will use an IO
-	  mechanism that results in faster communication.
-
-	  If unsure, say N.
-
-config UNIX
-	tristate "Unix domain sockets"
-	---help---
-	  If you say Y here, you will include support for Unix domain sockets;
-	  sockets are the standard Unix mechanism for establishing and
-	  accessing network connections. Many commonly used programs such as
-	  the X Window system and syslog use these sockets even if your
-	  machine is not connected to any network. Unless you are working on
-	  an embedded system or something similar, you therefore definitely
-	  want to say Y here.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called unix. Note that several important services won't work
-	  correctly if you say M here and then neglect to load the module.
-
-	  Say Y unless you know what you are doing.
-
-config NET_KEY
-	tristate "PF_KEY sockets"
-	select XFRM
-	---help---
-	  PF_KEYv2 socket family, compatible to KAME ones.
-	  They are required if you are going to use IPsec tools ported
-	  from KAME.
+menu "Networking options"
 
-	  Say Y unless you know what you are doing.
+source "net/packet/Kconfig"
+source "net/unix/Kconfig"
+source "net/xfrm/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
@@ -94,30 +53,12 @@ config INET
 
 	  Short answer: say Y.
 
+if INET
 source "net/ipv4/Kconfig"
-
-# IPv6 as module will cause a CRASH if you try to unload it
-config IPV6
-	tristate "The IPv6 protocol"
-	depends on INET
-	default m
-	select CRYPTO if IPV6_PRIVACY
-	select CRYPTO_MD5 if IPV6_PRIVACY
-	---help---
-	  This is complemental support for the IP version 6.
-	  You will still be able to do traditional IPv4 networking as well.
-
-	  For general information about IPv6, see
-	  <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
-	  For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
-	  For specific information about IPv6 under Linux, read the HOWTO at
-	  <http://www.bieringer.de/linux/IPv6/>.
-
-	  To compile this protocol support as a module, choose M here: the
-	  module will be called ipv6.
-
 source "net/ipv6/Kconfig"
 
+endif # if INET
+
 menuconfig NETFILTER
 	bool "Network packet filtering (replaces ipchains)"
 	---help---
@@ -206,269 +147,16 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
-config XFRM
-	bool
-	depends on NET
-
-source "net/xfrm/Kconfig"
-
 source "net/sctp/Kconfig"
-
-config ATM
-	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  ATM is a high-speed networking technology for Local Area Networks
-	  and Wide Area Networks. It uses a fixed packet size and is
-	  connection oriented, allowing for the negotiation of minimum
-	  bandwidth requirements.
-
-	  In order to participate in an ATM network, your Linux box needs an
-	  ATM networking card. If you have that, say Y here and to the driver
-	  of your ATM card below.
-
-	  Note that you need a set of user-space programs to actually make use
-	  of ATM. See the file <file:Documentation/networking/atm.txt> for
-	  further details.
-
-config ATM_CLIP
-	tristate "Classical IP over ATM (EXPERIMENTAL)"
-	depends on ATM && INET
-	help
-	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
-	  ATMARP. If you want to communication with other IP hosts on your ATM
-	  network, you will typically either say Y here or to "LAN Emulation
-	  (LANE)" below.
-
-config ATM_CLIP_NO_ICMP
-	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
-	depends on ATM_CLIP
-	help
-	  Normally, an "ICMP host unreachable" message is sent if a neighbour
-	  cannot be reached because there is no VC to it in the kernel's
-	  ATMARP table. This may cause problems when ATMARP table entries are
-	  briefly removed during revalidation. If you say Y here, packets to
-	  such neighbours are silently discarded instead.
-
-config ATM_LANE
-	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
-	depends on ATM
-	help
-	  LAN Emulation emulates services of existing LANs across an ATM
-	  network. Besides operating as a normal ATM end station client, Linux
-	  LANE client can also act as an proxy client bridging packets between
-	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
-
-config ATM_MPOA
-	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
-	depends on ATM && INET && ATM_LANE!=n
-	help
-	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
-	  bridges and ATM attached hosts establish direct ATM VCs across
-	  subnetwork boundaries. These shortcut connections bypass routers
-	  enhancing overall network performance.
-
-config ATM_BR2684
-	tristate "RFC1483/2684 Bridged protocols"
-	depends on ATM && INET
-	help
-	  ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
-	  This device will act like an ethernet from the kernels point of view,
-	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
-	  This is sometimes used over DSL lines. If in doubt, say N.
-
-config ATM_BR2684_IPFILTER
-	bool "Per-VC IP filter kludge"
-	depends on ATM_BR2684
-	help
-	  This is an experimental mechanism for users who need to terminating a
-	  large number of IP-only vcc's. Do not enable this unless you are sure
-	  you know what you are doing.
-
-config BRIDGE
-	tristate "802.1d Ethernet Bridging"
-	---help---
-	  If you say Y here, then your Linux box will be able to act as an
-	  Ethernet bridge, which means that the different Ethernet segments it
-	  is connected to will appear as one Ethernet to the participants.
-	  Several such bridges can work together to create even larger
-	  networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
-	  As this is a standard, Linux bridges will cooperate properly with
-	  other third party bridge products.
-
-	  In order to use the Ethernet bridge, you'll need the bridge
-	  configuration tools; see <file:Documentation/networking/bridge.txt>
-	  for location. Please read the Bridge mini-HOWTO for more
-	  information.
-
-	  If you enable iptables support along with the bridge support then you
-	  turn your bridge into a bridging IP firewall.
-	  iptables will then see the IP packets being bridged, so you need to
-	  take this into account when setting up your firewall rules.
-	  Enabling arptables support when bridging will let arptables see
-	  bridged ARP traffic in the arptables FORWARD chain.
-
-	  To compile this code as a module, choose M here: the module
-	  will be called bridge.
-
-	  If unsure, say N.
-
-config VLAN_8021Q
-	tristate "802.1Q VLAN Support"
-	---help---
-	  Select this and you will be able to create 802.1Q VLAN interfaces
-	  on your ethernet interfaces. 802.1Q VLAN supports almost
-	  everything a regular ethernet interface does, including
-	  firewalling, bridging, and of course IP traffic. You will need
-	  the 'vconfig' tool from the VLAN project in order to effectively
-	  use VLANs. See the VLAN web page for more information:
-	  <http://www.candelatech.com/~greear/vlan.html>
-
-	  To compile this code as a module, choose M here: the module
-	  will be called 8021q.
-
-	  If unsure, say N.
-
-config DECNET
-	tristate "DECnet Support"
-	---help---
-	  The DECnet networking protocol was used in many products made by
-	  Digital (now Compaq). It provides reliable stream and sequenced
-	  packet communications over which run a variety of services similar
-	  to those which run over TCP/IP.
-
-	  To find some tools to use with the kernel layer support, please
-	  look at Patrick Caulfield's web site:
-	  <http://linux-decnet.sourceforge.net/>.
-
-	  More detailed documentation is available in
-	  <file:Documentation/networking/decnet.txt>.
-
-	  Be sure to say Y to "/proc file system support" and "Sysctl support"
-	  below when using DECnet, since you will need sysctl support to aid
-	  in configuration at run time.
-
-	  The DECnet code is also available as a module ( = code which can be
-	  inserted in and removed from the running kernel whenever you want).
-	  The module is called decnet.
-
+source "net/atm/Kconfig"
+source "net/bridge/Kconfig"
+source "net/8021q/Kconfig"
 source "net/decnet/Kconfig"
-
 source "net/llc/Kconfig"
-
-config IPX
-	tristate "The IPX protocol"
-	select LLC
-	---help---
-	  This is support for the Novell networking protocol, IPX, commonly
-	  used for local networks of Windows machines. You need it if you
-	  want to access Novell NetWare file or print servers using the Linux
-	  Novell client ncpfs (available from
-	  <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
-	  within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>). In order
-	  to do the former, you'll also have to say Y to "NCP file system
-	  support", below.
-
-	  IPX is similar in scope to IP, while SPX, which runs on top of IPX,
-	  is similar to TCP. There is also experimental support for SPX in
-	  Linux (see "SPX networking", below).
-
-	  To turn your Linux box into a fully featured NetWare file server and
-	  IPX router, say Y here and fetch either lwared from
-	  <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
-	  mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
-	  information, read the IPX-HOWTO available from
-	  <http://www.tldp.org/docs.html#howto>.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  The IPX driver would enlarge your kernel by about 16 KB. To compile
-	  this driver as a module, choose M here: the module will be called ipx.
-	  Unless you want to integrate your Linux box with a local Novell
-	  network, say N.
-
 source "net/ipx/Kconfig"
-
-config ATALK
-	tristate "Appletalk protocol support"
-	select LLC
-	---help---
-	  AppleTalk is the protocol that Apple computers can use to communicate
-	  on a network. If your Linux box is connected to such a network and you
-	  wish to connect to it, say Y. You will need to use the netatalk package
-	  so that your Linux box can act as a print and file server for Macs as
-	  well as access AppleTalk printers. Check out
-	  <http://www.zettabyte.net/netatalk/> on the WWW for details.
-	  EtherTalk is the name used for AppleTalk over Ethernet and the
-	  cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
-	  network using serial links. EtherTalk and LocalTalk are fully
-	  supported by Linux.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. The
-	  NET-3-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, contains valuable
-	  information as well.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called appletalk. You almost certainly want to compile it as a
-	  module so you can restart your AppleTalk stack without rebooting
-	  your machine. I hear that the GNU boycott of Apple is over, so
-	  even politically correct people are allowed to say Y here.
-
 source "drivers/net/appletalk/Kconfig"
-
-config X25
-	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  X.25 is a set of standardized network protocols, similar in scope to
-	  frame relay; the one physical line from your box to the X.25 network
-	  entry point can carry several logical point-to-point connections
-	  (called "virtual circuits") to other computers connected to the X.25
-	  network. Governments, banks, and other organizations tend to use it
-	  to connect to each other or to form Wide Area Networks (WANs). Many
-	  countries have public X.25 networks. X.25 consists of two
-	  protocols: the higher level Packet Layer Protocol (PLP) (say Y here
-	  if you want that) and the lower level data link layer protocol LAPB
-	  (say Y to "LAPB Data Link Driver" below if you want that).
-
-	  You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
-	  <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
-	  Information about X.25 for Linux is contained in the files
-	  <file:Documentation/networking/x25.txt> and
-	  <file:Documentation/networking/x25-iface.txt>.
-
-	  One connects to an X.25 network either with a dedicated network card
-	  using the X.21 protocol (not yet supported by Linux) or one can do
-	  X.25 over a standard telephone line using an ordinary modem (say Y
-	  to "X.25 async driver" below) or over Ethernet using an ordinary
-	  Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
-	  Driver" and "LAPB over Ethernet driver" below).
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called x25. If unsure, say N.
-
-config LAPB
-	tristate "LAPB Data Link Driver (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
-	  the lower) part of the X.25 protocol. It offers a reliable
-	  connection service to exchange data frames with one other host, and
-	  it is used to transport higher level protocols (mostly X.25 Packet
-	  Layer, the higher part of X.25, but others are possible as well).
-	  Usually, LAPB is used with specialized X.21 network cards, but Linux
-	  currently supports LAPB only over Ethernet connections. If you want
-	  to use LAPB connections over Ethernet, say Y here and to "LAPB over
-	  Ethernet driver" below. Read
-	  <file:Documentation/networking/lapb-module.txt> for technical
-	  details.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called lapb. If unsure, say N.
+source "net/x25/Kconfig"
+source "net/lapb/Kconfig"
 
 config NET_DIVERT
 	bool "Frame Diverter (EXPERIMENTAL)"
@@ -496,107 +184,10 @@ config NET_DIVERT
 
 	  If unsure, say N.
 
-config ECONET
-	tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && INET
-	---help---
-	  Econet is a fairly old and slow networking protocol mainly used by
-	  Acorn computers to access file and print servers. It uses native
-	  Econet network cards. AUN is an implementation of the higher level
-	  parts of Econet that runs over ordinary Ethernet connections, on
-	  top of the UDP packet protocol, which in turn runs on top of the
-	  Internet protocol IP.
-
-	  If you say Y here, you can choose with the next two options whether
-	  to send Econet/AUN traffic over a UDP Ethernet connection or over
-	  a native Econet network card.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called econet.
-
-config ECONET_AUNUDP
-	bool "AUN over UDP"
-	depends on ECONET
-	help
-	  Say Y here if you want to send Econet/AUN traffic over a UDP
-	  connection (UDP is a packet based protocol that runs on top of the
-	  Internet protocol IP) using an ordinary Ethernet network card.
-
-config ECONET_NATIVE
-	bool "Native Econet"
-	depends on ECONET
-	help
-	  Say Y here if you have a native Econet network card installed in
-	  your computer.
-
-config WAN_ROUTER
-	tristate "WAN router"
-	depends on EXPERIMENTAL
-	---help---
-	  Wide Area Networks (WANs), such as X.25, frame relay and leased
-	  lines, are used to interconnect Local Area Networks (LANs) over vast
-	  distances with data transfer rates significantly higher than those
-	  achievable with commonly used asynchronous modem connections.
-	  Usually, a quite expensive external device called a `WAN router' is
-	  needed to connect to a WAN.
-
-	  As an alternative, WAN routing can be built into the Linux kernel.
-	  With relatively inexpensive WAN interface cards available on the
-	  market, a perfectly usable router can be built for less than half
-	  the price of an external router. If you have one of those cards and
-	  wish to use your Linux box as a WAN router, say Y here and also to
-	  the WAN driver for your card, below. You will then need the
-	  wan-tools package which is available from <ftp://ftp.sangoma.com/>.
-	  Read <file:Documentation/networking/wan-router.txt> for more
-	  information.
-
-	  To compile WAN routing support as a module, choose M here: the
-	  module will be called wanrouter.
-
-	  If unsure, say N.
-
-menu "QoS and/or fair queueing"
-
-config NET_SCHED
-	bool "QoS and/or fair queueing"
-	---help---
-	  When the kernel has several packets to send out over a network
-	  device, it has to decide which ones to send first, which ones to
-	  delay, and which ones to drop. This is the job of the packet
-	  scheduler, and several different algorithms for how to do this
-	  "fairly" have been proposed.
-
-	  If you say N here, you will get the standard packet scheduler, which
-	  is a FIFO (first come, first served). If you say Y here, you will be
-	  able to choose from among several alternative algorithms which can
-	  then be attached to different network devices. This is useful for
-	  example if some of your network devices are real time devices that
-	  need a certain minimum data flow rate, or if you need to limit the
-	  maximum data flow rate for traffic which matches specified criteria.
-	  This code is considered to be experimental.
-
-	  To administer these schedulers, you'll need the user-level utilities
-	  from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
-	  That package also contains some documentation; for more, check out
-	  <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
-
-	  This Quality of Service (QoS) support will enable you to use
-	  Differentiated Services (diffserv) and Resource Reservation Protocol
-	  (RSVP) on your Linux router if you also say Y to "QoS support",
-	  "Packet classifier API" and to some classifiers below. Documentation
-	  and software is at <http://diffserv.sourceforge.net/>.
-
-	  If you say Y here and to "/proc file system" below, you will be able
-	  to read status information about packet schedulers from the file
-	  /proc/net/psched.
-
-	  The available schedulers are listed in the following questions; you
-	  can say Y to as many as you like. If unsure, say N now.
-
+source "net/econet/Kconfig"
+source "net/wanrouter/Kconfig"
 source "net/sched/Kconfig"
 
-endmenu
-
 menu "Network testing"
 
 config NET_PKTGEN
@@ -618,29 +209,10 @@ endmenu
 
 endmenu
 
-config NETPOLL
-	def_bool NETCONSOLE
-
-config NETPOLL_RX
-	bool "Netpoll support for trapping incoming packets"
-	default n
-	depends on NETPOLL
-
-config NETPOLL_TRAP
-	bool "Netpoll traffic trapping"
-	default n
-	depends on NETPOLL
-
-config NET_POLL_CONTROLLER
-	def_bool NETPOLL
-
 source "net/ax25/Kconfig"
-
 source "net/irda/Kconfig"
-
 source "net/bluetooth/Kconfig"
 
-source "drivers/net/Kconfig"
-
-endmenu
+endif # if NET
+endmenu # Networking
 
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 54640c01b50c..c34614ea5fce 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -35,6 +35,7 @@
 #include <net/datalink.h>
 #include <net/psnap.h>
 #include <linux/atalk.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -462,8 +463,7 @@ void aarp_probe_network(struct atalk_iface *atif)
 		aarp_send_probe(atif->dev, &atif->address);
 
 		/* Defer 1/10th */
-		current->state = TASK_INTERRUPTIBLE;
-		schedule_timeout(HZ / 10);
+		msleep(100);
 
 		if (atif->status & ATIF_PROBE_FAIL)
 			break;
@@ -510,9 +510,8 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
 	aarp_send_probe(atif->dev, sa);
 
 	/* Defer 1/10th */
-	current->state = TASK_INTERRUPTIBLE;
 	write_unlock_bh(&aarp_lock);
-	schedule_timeout(HZ / 10);
+	msleep(100);
 	write_lock_bh(&aarp_lock);
 
 	if (entry->status & ATIF_PROBE_FAIL)
@@ -565,7 +564,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
 			 * numbers we just happen to need. Now put the
 			 * length in the lower two.
 			 */
-			*((__u16 *)skb->data) = htons(skb->len);
+			*((__be16 *)skb->data) = htons(skb->len);
 			ft = 1;
 		}
 		/*
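
Both aarp hunks above replace the open-coded two-step sleep with msleep(). The old form was fragile: setting current->state and then calling schedule_timeout() can lose a wakeup delivered between the two statements, and the delay is expressed in HZ-dependent jiffies. A side-by-side sketch of the two idioms (standalone illustration, not code from this patch):

    #include <linux/delay.h>        /* msleep() */
    #include <linux/sched.h>        /* schedule_timeout(), current, HZ */

    static void defer_tenth_old(void)
    {
            /* Racy two-step form: a wakeup between the assignment and
             * schedule_timeout() is lost, and HZ/10 depends on CONFIG_HZ. */
            current->state = TASK_INTERRUPTIBLE;
            schedule_timeout(HZ / 10);
    }

    static void defer_tenth_new(void)
    {
            msleep(100);            /* uninterruptible sleep, in milliseconds */
    }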
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 876dbac71060..192b529f86a4 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -401,7 +401,7 @@ out_err:
 }
 
 /* Find a match for a specific network:node pair */
-static struct atalk_iface *atalk_find_interface(int net, int node)
+static struct atalk_iface *atalk_find_interface(__be16 net, int node)
 {
 	struct atalk_iface *iface;
 
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 000000000000..21ff276b2d80
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,74 @@
+#
+# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)
+#
+
+config ATM
+	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  ATM is a high-speed networking technology for Local Area Networks
+	  and Wide Area Networks. It uses a fixed packet size and is
+	  connection oriented, allowing for the negotiation of minimum
+	  bandwidth requirements.
+
+	  In order to participate in an ATM network, your Linux box needs an
+	  ATM networking card. If you have that, say Y here and to the driver
+	  of your ATM card below.
+
+	  Note that you need a set of user-space programs to actually make use
+	  of ATM. See the file <file:Documentation/networking/atm.txt> for
+	  further details.
+
+config ATM_CLIP
+	tristate "Classical IP over ATM (EXPERIMENTAL)"
+	depends on ATM && INET
+	help
+	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
+	  ATMARP. If you want to communication with other IP hosts on your ATM
+	  network, you will typically either say Y here or to "LAN Emulation
+	  (LANE)" below.
+
+config ATM_CLIP_NO_ICMP
+	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
+	depends on ATM_CLIP
+	help
+	  Normally, an "ICMP host unreachable" message is sent if a neighbour
+	  cannot be reached because there is no VC to it in the kernel's
+	  ATMARP table. This may cause problems when ATMARP table entries are
+	  briefly removed during revalidation. If you say Y here, packets to
+	  such neighbours are silently discarded instead.
+
+config ATM_LANE
+	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
+	depends on ATM
+	help
+	  LAN Emulation emulates services of existing LANs across an ATM
+	  network. Besides operating as a normal ATM end station client, Linux
+	  LANE client can also act as an proxy client bridging packets between
+	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
+
+config ATM_MPOA
+	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
+	depends on ATM && INET && ATM_LANE!=n
+	help
+	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
+	  bridges and ATM attached hosts establish direct ATM VCs across
+	  subnetwork boundaries. These shortcut connections bypass routers
+	  enhancing overall network performance.
+
+config ATM_BR2684
+	tristate "RFC1483/2684 Bridged protocols"
+	depends on ATM && INET
+	help
+	  ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483)
+	  This device will act like an ethernet from the kernels point of view,
+	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
+	  This is sometimes used over DSL lines. If in doubt, say N.
+
+config ATM_BR2684_IPFILTER
+	bool "Per-VC IP filter kludge"
+	depends on ATM_BR2684
+	help
+	  This is an experimental mechanism for users who need to terminate a
+	  large number of IP-only vcc's. Do not enable this unless you are sure
+	  you know what you are doing.
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index e6954cf1459d..289956c4dd3e 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -289,8 +289,7 @@ xmit will add the additional header part in that case */
  * This is similar to eth_type_trans, which cannot be used because of
  * our dev->hard_header_len
  */
-static inline unsigned short br_type_trans(struct sk_buff *skb,
-					   struct net_device *dev)
+static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 02f5374a51f2..08e46052a3e4 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -118,10 +118,6 @@ static int svc_bind(struct socket *sock,struct sockaddr *sockaddr,
 		goto out;
 	}
 	vcc = ATM_SD(sock);
-	if (test_bit(ATM_VF_SESSION, &vcc->flags)) {
-		error = -EINVAL;
-		goto out;
-	}
 	addr = (struct sockaddr_atmsvc *) sockaddr;
 	if (addr->sas_family != AF_ATMSVC) {
 		error = -EAFNOSUPPORT;
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 2e341de3e763..901eff7ebe74 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int cmtp_process_transmit(struct cmtp_session *session)
+static void cmtp_process_transmit(struct cmtp_session *session)
 {
 	struct sk_buff *skb, *nskb;
 	unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session)
 
 	if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
 		BT_ERR("Can't allocate memory for new frame");
-		return -ENOMEM;
+		return;
 	}
 
 	while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session)
 	cmtp_send_frame(session, nskb->data, nskb->len);
 
 	kfree_skb(nskb);
-
-	return skb_queue_len(&session->transmit);
 }
 
 static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index fb5524365bc2..ffa26c10bfe8 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -299,7 +299,6 @@ struct hci_dev *hci_dev_get(int index)
 	read_unlock(&hci_dev_list_lock);
 	return hdev;
 }
-EXPORT_SYMBOL(hci_dev_get);
 
 /* ---- Inquiry support ---- */
 static void inquiry_cache_flush(struct hci_dev *hdev)
@@ -1042,7 +1041,6 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p
 
 	return 0;
 }
-EXPORT_SYMBOL(hci_send_cmd);
 
 /* Get data from the previously sent command */
 void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c4b592b4ef10..46367bd129c3 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1035,9 +1035,11 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
 	ev->type = type;
 	memcpy(ev->data, data, dlen);
 
+	bt_cb(skb)->incoming = 1;
+	do_gettimeofday(&skb->stamp);
+
 	skb->pkt_type = HCI_EVENT_PKT;
 	skb->dev = (void *) hdev;
 	hci_send_to_sock(hdev, skb);
 	kfree_skb(skb);
 }
-EXPORT_SYMBOL(hci_si_event);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index affbc55462e8..de8af5f42394 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int hidp_process_transmit(struct hidp_session *session)
+static void hidp_process_transmit(struct hidp_session *session)
 {
 	struct sk_buff *skb;
 
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session)
 		hidp_set_timer(session);
 		kfree_skb(skb);
 	}
-
-	return skb_queue_len(&session->ctrl_transmit) +
-		skb_queue_len(&session->intr_transmit);
 }
 
 static int hidp_session(void *arg)
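
cmtp_process_transmit() and hidp_process_transmit() stop reporting a queue length because their callers no longer use it: the session threads test the queues themselves. A hypothetical skeleton of that thread structure (the struct and names are illustrative only, not from this patch):

    #include <linux/sched.h>
    #include <linux/wait.h>
    #include <linux/skbuff.h>

    struct example_session {                /* hypothetical */
            wait_queue_head_t wait;
            int terminate;
            struct sk_buff_head transmit;
    };

    static void example_process_transmit(struct example_session *s)
    {
            struct sk_buff *skb;

            while ((skb = skb_dequeue(&s->transmit)))
                    kfree_skb(skb);         /* real code would send it */
    }

    /* The loop decides when to sleep by looking at its own state; it has
     * no use for a returned queue length, which could be stale by the
     * time it is inspected anyway. */
    static int example_session_thread(void *arg)
    {
            struct example_session *s = arg;
            DECLARE_WAITQUEUE(wait, current);

            add_wait_queue(&s->wait, &wait);
            while (!s->terminate) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    example_process_transmit(s);
                    schedule();
            }
            set_current_state(TASK_RUNNING);
            remove_wait_queue(&s->wait, &wait);
            return 0;
    }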
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 9efb0a093612..ee6a66979913 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -34,31 +34,6 @@
 
 #include <net/bluetooth/bluetooth.h>
 
-void bt_dump(char *pref, __u8 *buf, int count)
-{
-	char *ptr;
-	char line[100];
-	unsigned int i;
-
-	printk(KERN_INFO "%s: dump, len %d\n", pref, count);
-
-	ptr = line;
-	*ptr = 0;
-	for (i = 0; i < count; i++) {
-		ptr += sprintf(ptr, " %2.2X", buf[i]);
-
-		if (i && !((i + 1) % 20)) {
-			printk(KERN_INFO "%s:%s\n", pref, line);
-			ptr = line;
-			*ptr = 0;
-		}
-	}
-
-	if (line[0])
-		printk(KERN_INFO "%s:%s\n", pref, line);
-}
-EXPORT_SYMBOL(bt_dump);
-
 void baswap(bdaddr_t *dst, bdaddr_t *src)
 {
 	unsigned char *d = (unsigned char *) dst;
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index e9e6fda66f1a..27bf5047cd33 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -389,8 +389,6 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
 		rfcomm_dlc_unlock(d);
 
 		skb_queue_purge(&d->tx_queue);
-		rfcomm_session_put(s);
-
 		rfcomm_dlc_unlink(d);
 	}
 
@@ -600,8 +598,6 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
 		goto failed;
 	}
 
-	rfcomm_session_hold(s);
-
 	s->initiator = 1;
 
 	bacpy(&addr.l2_bdaddr, dst);
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index f3f6355a2786..63a123c5c41b 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
-		    signal_pending(current) || !timeo)
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    sk->sk_err ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current) ||
+		    !timeo)
 			break;
 
 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6d689200bcf3..6304590fd36a 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
 
 	BT_DBG("tty %p dev %p", tty, dev);
 
-	if (skb_queue_len(&dlc->tx_queue))
+	if (!skb_queue_empty(&dlc->tx_queue))
 		return dlc->mtu;
 
 	return 0;
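
The rfcomm conversions above (and several similar ones in this merge) swap skb_queue_len(q) for !skb_queue_empty(q) wherever only emptiness matters. skb_queue_empty() is a plain head-pointer comparison, which states the intent directly instead of implying that the exact length is significant. A trivial sketch (hypothetical wrapper):

    #include <linux/skbuff.h>

    /* Emptiness is all these call sites care about; comparing the head
     * pointer is clearer than reading the qlen counter as a boolean. */
    static inline int queue_has_data(const struct sk_buff_head *q)
    {
            return !skb_queue_empty(q);
    }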
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 000000000000..db23d59746cf
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,31 @@
1#
2# 802.1d Ethernet Bridging
3#
4
5config BRIDGE
6 tristate "802.1d Ethernet Bridging"
7 ---help---
8 If you say Y here, then your Linux box will be able to act as an
9 Ethernet bridge, which means that the different Ethernet segments it
10 is connected to will appear as one Ethernet to the participants.
11 Several such bridges can work together to create even larger
12 networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
13 As this is a standard, Linux bridges will cooperate properly with
14 other third party bridge products.
15
16 In order to use the Ethernet bridge, you'll need the bridge
17 configuration tools; see <file:Documentation/networking/bridge.txt>
18 for location. Please read the Bridge mini-HOWTO for more
19 information.
20
21 If you enable iptables support along with the bridge support then you
22 turn your bridge into a bridging IP firewall.
23 iptables will then see the IP packets being bridged, so you need to
24 take this into account when setting up your firewall rules.
25 Enabling arptables support when bridging will let arptables see
26 bridged ARP traffic in the arptables FORWARD chain.
27
28 To compile this code as a module, choose M here: the module
29 will be called bridge.
30
31 If unsure, say N.
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ef9f2095f96e..069253f830c1 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -57,9 +57,6 @@ int br_forward_finish(struct sk_buff *skb)
 static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 {
 	skb->dev = to->dev;
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug = 0;
-#endif
 	NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
 			br_forward_finish);
 }
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 8f5f2e730992..9a45e6279c57 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -23,11 +23,7 @@ const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
 
 static int br_pass_frame_up_finish(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug = 0;
-#endif
 	netif_receive_skb(skb);
-
 	return 0;
 }
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index be03d3ad2648..2d52fee63a8c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -102,10 +102,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -182,10 +178,6 @@ static void __br_dnat_complain(void)
  * --Bart, 20021007 (updated) */
 static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->pkt_type == PACKET_OTHERHOST) {
 		skb->pkt_type = PACKET_HOST;
 		skb->nf_bridge->mask |= BRNF_PKT_TYPE;
@@ -207,10 +199,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
 	struct iphdr *iph = skb->nh.iph;
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -382,9 +370,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
 		goto inhdr_error;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING);
-#endif
 	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
 		return NF_DROP;
 	setup_pre_routing(skb);
@@ -468,9 +453,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 		skb->ip_summed = CHECKSUM_NONE;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING);
-#endif
 	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
 		return NF_DROP;
 	setup_pre_routing(skb);
@@ -517,10 +499,6 @@ static int br_nf_forward_finish(struct sk_buff *skb)
 	struct net_device *in;
 	struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) {
 		in = nf_bridge->physindev;
 		if (nf_bridge->mask & BRNF_PKT_TYPE) {
@@ -566,9 +544,6 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
 		(*pskb)->nh.raw += VLAN_HLEN;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
 	nf_bridge = skb->nf_bridge;
 	if (skb->pkt_type == PACKET_OTHERHOST) {
 		skb->pkt_type = PACKET_HOST;
@@ -605,10 +580,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
 		(*pskb)->nh.raw += VLAN_HLEN;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->nh.arph->ar_pln != 4) {
 		if (IS_VLAN_ARP) {
 			skb_push(*pskb, VLAN_HLEN);
@@ -627,9 +598,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
 /* PF_BRIDGE/LOCAL_OUT ***********************************************/
 static int br_nf_local_out_finish(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT);
-#endif
 	if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
 		skb_push(skb, VLAN_HLEN);
 		skb->nh.raw -= VLAN_HLEN;
@@ -731,10 +699,6 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
 			       realoutdev, br_nf_local_out_finish,
 			       NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
 	} else {
-#ifdef CONFIG_NETFILTER_DEBUG
-		skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT);
-#endif
-
 		NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
 			       realoutdev, br_nf_local_out_finish,
 			       NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
@@ -779,8 +743,6 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
 		printk(KERN_CRIT "br_netfilter: skb->dst == NULL.");
 		goto print_error;
 	}
-
-	skb->nf_debug ^= (1 << NF_IP_POST_ROUTING);
 #endif
 
 	/* We assume any code from br_dev_queue_push_xmit onwards doesn't care
@@ -882,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
 	 * doesn't use the bridge parent of the indev by using
 	 * the BRNF_DONT_TAKE_PARENT mask. */
 	if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
-		nf_bridge->mask &= BRNF_DONT_TAKE_PARENT;
+		nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
 		nf_bridge->physindev = (struct net_device *)in;
 	}
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 68ccef507b49..c70b3be23026 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -138,7 +138,7 @@ config BRIDGE_EBT_VLAN
 #
 config BRIDGE_EBT_ARPREPLY
 	tristate "ebt: arp reply target support"
-	depends on BRIDGE_NF_EBTABLES
+	depends on BRIDGE_NF_EBTABLES && INET
 	help
 	  This option adds the arp reply target, which allows
 	  automatically sending arp replies to arp requests.
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e4ae34b88925..662975be3d1d 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 {
 	struct ebt_log_info *info = (struct ebt_log_info *)data;
 	char level_string[4] = "< >";
-	union {struct iphdr iph; struct tcpudphdr ports;
-	       struct arphdr arph; struct arppayload arpp;} u;

 	level_string[1] = '0' + info->loglevel;
 	spin_lock_bh(&ebt_log_lock);
@@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	}
 	printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
 	       NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
-	printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos,
+	printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
 	       ih->protocol);
 	if (ih->protocol == IPPROTO_TCP ||
 	    ih->protocol == IPPROTO_UDP) {
@@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	    ah->ar_pln == sizeof(uint32_t)) {
 		struct arppayload _arpp, *ap;

-		ap = skb_header_pointer(skb, sizeof(u.arph),
+		ap = skb_header_pointer(skb, sizeof(_arph),
 					sizeof(_arpp), &_arpp);
 		if (ap == NULL) {
 			printk(" INCOMPLETE ARP payload");
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 18ebc664769b..c4540144f0f4 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -859,8 +859,7 @@ static int translate_table(struct ebt_replace *repl,
 		if (repl->valid_hooks & (1 << i))
 			if (check_chainloops(newinfo->hook_entry[i],
 			   cl_s, udc_cnt, i, newinfo->entries)) {
-				if (cl_s)
-					vfree(cl_s);
+				vfree(cl_s);
 				return -EINVAL;
 			}

@@ -883,8 +882,7 @@ static int translate_table(struct ebt_replace *repl,
 		EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
 		   ebt_cleanup_entry, &i);
 	}
-	if (cl_s)
-		vfree(cl_s);
+	vfree(cl_s);
 	return ret;
 }

@@ -1030,8 +1028,7 @@ static int do_replace(void __user *user, unsigned int len)
 	}
 	vfree(table);

-	if (counterstmp)
-		vfree(counterstmp);
+	vfree(counterstmp);
 	return ret;

 free_unlock:
@@ -1040,8 +1037,7 @@ free_iterate:
 	EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
 	   ebt_cleanup_entry, NULL);
 free_counterstmp:
-	if (counterstmp)
-		vfree(counterstmp);
+	vfree(counterstmp);
 	/* can be initialized in translate_table() */
 	if (newinfo->chainstack) {
 		for (i = 0; i < num_possible_cpus(); i++)
@@ -1049,11 +1045,9 @@ free_counterstmp:
 		vfree(newinfo->chainstack);
 	}
 free_entries:
-	if (newinfo->entries)
-		vfree(newinfo->entries);
+	vfree(newinfo->entries);
 free_newinfo:
-	if (newinfo)
-		vfree(newinfo);
+	vfree(newinfo);
 	return ret;
 }

@@ -1213,8 +1207,7 @@ void ebt_unregister_table(struct ebt_table *table)
 	down(&ebt_mutex);
 	LIST_DELETE(&ebt_tables, table);
 	up(&ebt_mutex);
-	if (table->private->entries)
-		vfree(table->private->entries);
+	vfree(table->private->entries);
 	if (table->private->chainstack) {
 		for (i = 0; i < num_possible_cpus(); i++)
 			vfree(table->private->chainstack[i]);
diff --git a/net/compat.c b/net/compat.c
index be5d936dc423..d99ab9695893 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -91,20 +91,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
 	} else
 		kern_msg->msg_name = NULL;

-	if(kern_msg->msg_iovlen > UIO_FASTIOV) {
-		kern_iov = kmalloc(kern_msg->msg_iovlen * sizeof(struct iovec),
-				   GFP_KERNEL);
-		if(!kern_iov)
-			return -ENOMEM;
-	}
-
 	tot_len = iov_from_user_compat_to_kern(kern_iov,
 					  (struct compat_iovec __user *)kern_msg->msg_iov,
 					  kern_msg->msg_iovlen);
 	if(tot_len >= 0)
 		kern_msg->msg_iov = kern_iov;
-	else if(kern_msg->msg_iovlen > UIO_FASTIOV)
-		kfree(kern_iov);

 	return tot_len;
 }
diff --git a/net/core/Makefile b/net/core/Makefile
index 81f03243fe2f..f5f5e58943e8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -2,13 +2,15 @@
 # Makefile for the Linux networking core.
 #

-obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o
+obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
+	 gen_stats.o gen_estimator.o

 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o

-obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \
+obj-y += dev.o ethtool.o dev_mcast.o dst.o \
 	 neighbour.o rtnetlink.o utils.o link_watch.o filter.o

+obj-$(CONFIG_XFRM) += flow.o
 obj-$(CONFIG_SYSFS) += net-sysfs.o
 obj-$(CONFIG_NETFILTER) += netfilter.o
 obj-$(CONFIG_NET_DIVERT) += dv.o
diff --git a/net/core/dev.c b/net/core/dev.c
index f15a3ffff635..faf59b02c4bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>

-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate. It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  * The list of packet types we will receive (as opposed to discard)
  * and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;	/* Taps */

-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain;
  * Device drivers call our routines to queue packets here. We empty the
  * queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };

 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -918,8 +901,7 @@ int dev_close(struct net_device *dev)
 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
 	while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
 		/* No hurry. */
-		current->state = TASK_INTERRUPTIBLE;
-		schedule_timeout(1);
+		msleep(1);
 	}

 	/*
@@ -1144,7 +1126,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 extern void skb_release_data(struct sk_buff *);

 /* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, int gfp_mask)
+int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	unsigned int size;
 	u8 *data;
@@ -1363,71 +1345,13 @@ out:
 			Receiver routines
   =======================================================================*/

-int netdev_max_backlog = 300;
+int netdev_max_backlog = 1000;
+int netdev_budget = 300;
 int weight_p = 64;            /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;

 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };


-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1448,7 +1372,6 @@ static void sample_queue(unsigned long dummy)

 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
 	struct softnet_data *queue;
 	unsigned long flags;

@@ -1464,38 +1387,22 @@ int netif_rx(struct sk_buff *skb)
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	this_cpu = smp_processor_id();
 	queue = &__get_cpu_var(softnet_data);

 	__get_cpu_var(netdev_rx_stat).total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
 			local_irq_restore(flags);
-			return queue->cng_level;
+			return NET_RX_SUCCESS;
 		}

-		if (queue->throttle)
-			queue->throttle = 0;
-
 		netif_rx_schedule(&queue->backlog_dev);
 		goto enqueue;
 	}

-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
-	}
-
-drop:
 	__get_cpu_var(netdev_rx_stat).dropped++;
 	local_irq_restore(flags);

@@ -1744,6 +1651,7 @@ static int process_backlog(struct net_device *backlog_dev, int *budget)
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;

+	backlog_dev->weight = weight_p;
 	for (;;) {
 		struct sk_buff *skb;
 		struct net_device *dev;
@@ -1779,8 +1687,6 @@ job_done:
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);

-	if (queue->throttle)
-		queue->throttle = 0;
 	local_irq_enable();
 	return 0;
 }
@@ -1789,9 +1695,9 @@ static void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
-	int budget = netdev_max_backlog;
-
+	int budget = netdev_budget;
+	void *have;

 	local_irq_disable();

 	while (!list_empty(&queue->poll_list)) {
@@ -1804,10 +1710,10 @@ static void net_rx_action(struct softirq_action *h)

 		dev = list_entry(queue->poll_list.next,
 				 struct net_device, poll_list);
-		netpoll_poll_lock(dev);
+		have = netpoll_poll_lock(dev);

 		if (dev->quota <= 0 || dev->poll(dev, &budget)) {
-			netpoll_poll_unlock(dev);
+			netpoll_poll_unlock(have);
 			local_irq_disable();
 			list_del(&dev->poll_list);
 			list_add_tail(&dev->poll_list, &queue->poll_list);
@@ -1816,7 +1722,7 @@ static void net_rx_action(struct softirq_action *h)
 			else
 				dev->quota = dev->weight;
 		} else {
-			netpoll_poll_unlock(dev);
+			netpoll_poll_unlock(have);
 			dev_put(dev);
 			local_irq_disable();
 		}
@@ -2054,15 +1960,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 	struct netif_rx_stats *s = v;

 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		   );
+		   s->total, s->dropped, s->time_squeeze, 0,
+		   0, 0, 0, 0, /* was fastroute */
+		   s->cpu_collision );
 	return 0;
 }

@@ -2189,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;

-	dev->flags |= IFF_PROMISC;
 	if ((dev->promiscuity += inc) == 0)
 		dev->flags &= ~IFF_PROMISC;
-	if (dev->flags ^ old_flags) {
+	else
+		dev->flags |= IFF_PROMISC;
+	if (dev->flags != old_flags) {
 		dev_mc_upload(dev);
 		printk(KERN_INFO "device %s %s promiscuous mode\n",
 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
@@ -3304,9 +3205,6 @@ static int __init net_dev_init(void)

 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3315,11 +3213,6 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}

-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
-
 	dev_boot_phase = 0;

 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
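
Note on the dev.c hunks above: the RAND_LIE/OFFLINE_SAMPLE congestion machinery is removed outright. netif_rx() no longer tracks throttle state or congestion levels and simply compares the per-CPU backlog length against netdev_max_backlog (raised to 1000), while net_rx_action() is bounded by the new netdev_budget instead of reusing the backlog limit. A minimal user-space model of the resulting admission policy follows; it is a sketch only, with the queue and constant as stand-ins, not kernel code.

#include <stdio.h>

#define NETDEV_MAX_BACKLOG 1000	/* new default from the hunk above */

static int queue_len;		/* stands in for input_pkt_queue.qlen */

/* Returns 0 on enqueue, -1 on drop, mirroring the simplified
 * netif_rx(): no throttle state, no congestion levels, just a
 * hard queue-length cutoff. */
static int model_netif_rx(void)
{
	if (queue_len <= NETDEV_MAX_BACKLOG) {
		queue_len++;	/* __skb_queue_tail() in the real code */
		return 0;
	}
	return -1;		/* counted in netdev_rx_stat.dropped */
}

int main(void)
{
	int i, dropped = 0;

	for (i = 0; i < 1500; i++)
		if (model_netif_rx() < 0)
			dropped++;
	printf("enqueued %d, dropped %d\n", 1500 - dropped, dropped);
	return 0;
}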
diff --git a/net/core/dst.c b/net/core/dst.c
index fc434ade5270..334790da9f16 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,6 +45,7 @@ static struct timer_list dst_gc_timer =
 static void dst_run_gc(unsigned long dummy)
 {
 	int delayed = 0;
+	int work_performed;
 	struct dst_entry * dst, **dstp;

 	if (!spin_trylock(&dst_lock)) {
@@ -52,9 +53,9 @@ static void dst_run_gc(unsigned long dummy)
 		return;
 	}

-
 	del_timer(&dst_gc_timer);
 	dstp = &dst_garbage_list;
+	work_performed = 0;
 	while ((dst = *dstp) != NULL) {
 		if (atomic_read(&dst->__refcnt)) {
 			dstp = &dst->next;
@@ -62,6 +63,7 @@ static void dst_run_gc(unsigned long dummy)
 			continue;
 		}
 		*dstp = dst->next;
+		work_performed = 1;

 		dst = dst_destroy(dst);
 		if (dst) {
@@ -86,9 +88,14 @@ static void dst_run_gc(unsigned long dummy)
 		dst_gc_timer_inc = DST_GC_MAX;
 		goto out;
 	}
-	if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
-		dst_gc_timer_expires = DST_GC_MAX;
-	dst_gc_timer_inc += DST_GC_INC;
+	if (!work_performed) {
+		if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
+			dst_gc_timer_expires = DST_GC_MAX;
+		dst_gc_timer_inc += DST_GC_INC;
+	} else {
+		dst_gc_timer_inc = DST_GC_INC;
+		dst_gc_timer_expires = DST_GC_MIN;
+	}
 	dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
 #if RT_CACHE_DEBUG >= 2
 	printk("dst_total: %d/%d %ld\n",
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ace2..cd91a24f9720 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
 #include <linux/filter.h>

 /* No hurry in this branch */
-static u8 *load_pointer(struct sk_buff *skb, int k)
+static void *__load_pointer(struct sk_buff *skb, int k)
 {
 	u8 *ptr = NULL;

@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
 	return NULL;
 }

+static inline void *load_pointer(struct sk_buff *skb, int k,
+				 unsigned int size, void *buffer)
+{
+	if (k >= 0)
+		return skb_header_pointer(skb, k, size, buffer);
+	else {
+		if (k >= SKF_AD_OFF)
+			return NULL;
+		return __load_pointer(skb, k);
+	}
+}
+
 /**
  *	sk_run_filter	-	run a filter on a socket
  *	@skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)

 int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 {
-	unsigned char *data = skb->data;
-	/* len is UNSIGNED. Byte wide insns relies only on implicit
-	   type casts to prevent reading arbitrary memory locations.
-	 */
-	unsigned int len = skb->len-skb->data_len;
 	struct sock_filter *fentry;	/* We walk down these */
+	void *ptr;
 	u32 A = 0;			/* Accumulator */
 	u32 X = 0;			/* Index Register */
 	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
+	u32 tmp;
 	int k;
 	int pc;

@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 		case BPF_LD|BPF_W|BPF_ABS:
 			k = fentry->k;
 load_w:
-			if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) {
-				A = ntohl(*(u32*)&data[k]);
+			ptr = load_pointer(skb, k, 4, &tmp);
+			if (ptr != NULL) {
+				A = ntohl(*(u32 *)ptr);
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = ntohl(*(u32*)ptr);
-					continue;
-				}
-			} else {
-				u32 _tmp, *p;
-				p = skb_header_pointer(skb, k, 4, &_tmp);
-				if (p != NULL) {
-					A = ntohl(*p);
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_H|BPF_ABS:
 			k = fentry->k;
 load_h:
-			if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) {
-				A = ntohs(*(u16*)&data[k]);
+			ptr = load_pointer(skb, k, 2, &tmp);
+			if (ptr != NULL) {
+				A = ntohs(*(u16 *)ptr);
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = ntohs(*(u16*)ptr);
-					continue;
-				}
-			} else {
-				u16 _tmp, *p;
-				p = skb_header_pointer(skb, k, 2, &_tmp);
-				if (p != NULL) {
-					A = ntohs(*p);
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_B|BPF_ABS:
 			k = fentry->k;
 load_b:
-			if (k >= 0 && (unsigned int)k < len) {
-				A = data[k];
+			ptr = load_pointer(skb, k, 1, &tmp);
+			if (ptr != NULL) {
+				A = *(u8 *)ptr;
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = *ptr;
-					continue;
-				}
-			} else {
-				u8 _tmp, *p;
-				p = skb_header_pointer(skb, k, 1, &_tmp);
-				if (p != NULL) {
-					A = *p;
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_W|BPF_LEN:
-			A = len;
+			A = skb->len;
 			continue;
 		case BPF_LDX|BPF_W|BPF_LEN:
-			X = len;
+			X = skb->len;
 			continue;
 		case BPF_LD|BPF_W|BPF_IND:
 			k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
 			k = X + fentry->k;
 			goto load_b;
 		case BPF_LDX|BPF_B|BPF_MSH:
-			if (fentry->k >= len)
-				return 0;
-			X = (data[fentry->k] & 0xf) << 2;
-			continue;
+			ptr = load_pointer(skb, fentry->k, 1, &tmp);
+			if (ptr != NULL) {
+				X = (*(u8 *)ptr & 0xf) << 2;
+				continue;
+			}
+			return 0;
 		case BPF_LD|BPF_IMM:
 			A = fentry->k;
 			continue;
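
Note on the filter.c hunks: the three near-identical per-width load paths collapse into one load_pointer() that routes positive offsets through skb_header_pointer(), which either returns a direct pointer into the linear data or copies the requested bytes into a caller-supplied buffer, and routes negative offsets to the ancillary-data lookup. A user-space model of that pointer-or-copy pattern follows; it is a standalone sketch with invented types, not the kernel helper itself.

#include <stdio.h>
#include <string.h>

/* Toy stand-in for an skb: some bytes are "linear" (directly
 * addressable), the rest must be copied out, as with paged skbs. */
struct fake_skb {
	unsigned char data[64];
	unsigned int linear_len;	/* bytes directly addressable */
};

static void *header_pointer(struct fake_skb *skb, unsigned int k,
			    unsigned int size, void *buffer)
{
	if (k + size <= skb->linear_len)
		return skb->data + k;		/* direct pointer */
	if (k + size <= sizeof(skb->data)) {
		memcpy(buffer, skb->data + k, size);
		return buffer;			/* copied fallback */
	}
	return NULL;				/* out of bounds */
}

int main(void)
{
	struct fake_skb skb = { .linear_len = 16 };
	unsigned int tmp;
	void *p;

	skb.data[20] = 0x42;
	p = header_pointer(&skb, 20, 1, &tmp);	/* forces the copy path */
	printf("byte at 20: 0x%02x\n", p ? *(unsigned char *)p : 0);
	return 0;
}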
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 43bdc521e20d..1beb782ac41b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -32,6 +32,7 @@
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/random.h>
+#include <linux/string.h>

 #define NEIGH_DEBUG 1

@@ -1276,9 +1277,14 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 		INIT_RCU_HEAD(&p->rcu_head);
 		p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
-		if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) {
-			kfree(p);
-			return NULL;
+		if (dev) {
+			if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
+				kfree(p);
+				return NULL;
+			}
+
+			dev_hold(dev);
+			p->dev = dev;
 		}
 		p->sysctl_table = NULL;
 		write_lock_bh(&tbl->lock);
@@ -1309,6 +1315,8 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
 			*p = parms->next;
 			parms->dead = 1;
 			write_unlock_bh(&tbl->lock);
+			if (parms->dev)
+				dev_put(parms->dev);
 			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 			return;
 		}
@@ -1546,21 +1554,330 @@ out:
 	return err;
 }

+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+	struct rtattr *nest = NULL;
+
+	nest = RTA_NEST(skb, NDTA_PARMS);
+
+	if (parms->dev)
+		RTA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
+
+	RTA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
+	RTA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+	RTA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
+	RTA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
+	RTA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
+	RTA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
+	RTA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
+	RTA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
+		      parms->base_reachable_time);
+	RTA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
+	RTA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
+	RTA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
+	RTA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
+	RTA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
+	RTA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
+
+	return RTA_NEST_END(skb, nest);
+
+rtattr_failure:
+	return RTA_NEST_CANCEL(skb, nest);
+}
+
+static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
+			      struct netlink_callback *cb)
+{
+	struct nlmsghdr *nlh;
+	struct ndtmsg *ndtmsg;
+
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+			       NLM_F_MULTI);
+
+	ndtmsg = NLMSG_DATA(nlh);
+
+	read_lock_bh(&tbl->lock);
+	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1 = 0;
+	ndtmsg->ndtm_pad2 = 0;
+
+	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+	RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
+	RTA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
+	RTA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
+	RTA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
+
+	{
+		unsigned long now = jiffies;
+		unsigned int flush_delta = now - tbl->last_flush;
+		unsigned int rand_delta = now - tbl->last_rand;
+
+		struct ndt_config ndc = {
+			.ndtc_key_len = tbl->key_len,
+			.ndtc_entry_size = tbl->entry_size,
+			.ndtc_entries = atomic_read(&tbl->entries),
+			.ndtc_last_flush = jiffies_to_msecs(flush_delta),
+			.ndtc_last_rand = jiffies_to_msecs(rand_delta),
+			.ndtc_hash_rnd = tbl->hash_rnd,
+			.ndtc_hash_mask = tbl->hash_mask,
+			.ndtc_hash_chain_gc = tbl->hash_chain_gc,
+			.ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+		};
+
+		RTA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+	}
+
+	{
+		int cpu;
+		struct ndt_stats ndst;
+
+		memset(&ndst, 0, sizeof(ndst));
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct neigh_statistics *st;
+
+			if (!cpu_possible(cpu))
+				continue;
+
+			st = per_cpu_ptr(tbl->stats, cpu);
+			ndst.ndts_allocs += st->allocs;
+			ndst.ndts_destroys += st->destroys;
+			ndst.ndts_hash_grows += st->hash_grows;
+			ndst.ndts_res_failed += st->res_failed;
+			ndst.ndts_lookups += st->lookups;
+			ndst.ndts_hits += st->hits;
+			ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
+			ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
+			ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
+			ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+		}
+
+		RTA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+	}
+
+	BUG_ON(tbl->parms.dev);
+	if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+		goto rtattr_failure;
+
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_CANCEL(skb, nlh);
+
+nlmsg_failure:
+	return -1;
+}
+
+static int neightbl_fill_param_info(struct neigh_table *tbl,
+				    struct neigh_parms *parms,
+				    struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	struct ndtmsg *ndtmsg;
+	struct nlmsghdr *nlh;
+
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+			       NLM_F_MULTI);
+
+	ndtmsg = NLMSG_DATA(nlh);
+
+	read_lock_bh(&tbl->lock);
+	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1 = 0;
+	ndtmsg->ndtm_pad2 = 0;
+	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+
+	if (neightbl_fill_parms(skb, parms) < 0)
+		goto rtattr_failure;
+
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_CANCEL(skb, nlh);
+
+nlmsg_failure:
+	return -1;
+}
+
+static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl,
+						      int ifindex)
+{
+	struct neigh_parms *p;
+
+	for (p = &tbl->parms; p; p = p->next)
+		if ((p->dev && p->dev->ifindex == ifindex) ||
+		    (!p->dev && !ifindex))
+			return p;
+
+	return NULL;
+}
+
+int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct neigh_table *tbl;
+	struct ndtmsg *ndtmsg = NLMSG_DATA(nlh);
+	struct rtattr **tb = arg;
+	int err = -EINVAL;
+
+	if (!tb[NDTA_NAME - 1] || !RTA_PAYLOAD(tb[NDTA_NAME - 1]))
+		return -EINVAL;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+		if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+			continue;
+
+		if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id))
+			break;
+	}
+
+	if (tbl == NULL) {
+		err = -ENOENT;
+		goto errout;
+	}
+
+	/*
+	 * We acquire tbl->lock to be nice to the periodic timers and
+	 * make sure they always see a consistent set of values.
+	 */
+	write_lock_bh(&tbl->lock);
+
+	if (tb[NDTA_THRESH1 - 1])
+		tbl->gc_thresh1 = RTA_GET_U32(tb[NDTA_THRESH1 - 1]);
+
+	if (tb[NDTA_THRESH2 - 1])
+		tbl->gc_thresh2 = RTA_GET_U32(tb[NDTA_THRESH2 - 1]);
+
+	if (tb[NDTA_THRESH3 - 1])
+		tbl->gc_thresh3 = RTA_GET_U32(tb[NDTA_THRESH3 - 1]);
+
+	if (tb[NDTA_GC_INTERVAL - 1])
+		tbl->gc_interval = RTA_GET_MSECS(tb[NDTA_GC_INTERVAL - 1]);
+
+	if (tb[NDTA_PARMS - 1]) {
+		struct rtattr *tbp[NDTPA_MAX];
+		struct neigh_parms *p;
+		u32 ifindex = 0;
+
+		if (rtattr_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS - 1]) < 0)
+			goto rtattr_failure;
+
+		if (tbp[NDTPA_IFINDEX - 1])
+			ifindex = RTA_GET_U32(tbp[NDTPA_IFINDEX - 1]);
+
+		p = lookup_neigh_params(tbl, ifindex);
+		if (p == NULL) {
+			err = -ENOENT;
+			goto rtattr_failure;
+		}
+
+		if (tbp[NDTPA_QUEUE_LEN - 1])
+			p->queue_len = RTA_GET_U32(tbp[NDTPA_QUEUE_LEN - 1]);
+
+		if (tbp[NDTPA_PROXY_QLEN - 1])
+			p->proxy_qlen = RTA_GET_U32(tbp[NDTPA_PROXY_QLEN - 1]);
+
+		if (tbp[NDTPA_APP_PROBES - 1])
+			p->app_probes = RTA_GET_U32(tbp[NDTPA_APP_PROBES - 1]);
+
+		if (tbp[NDTPA_UCAST_PROBES - 1])
+			p->ucast_probes =
+				RTA_GET_U32(tbp[NDTPA_UCAST_PROBES - 1]);
+
+		if (tbp[NDTPA_MCAST_PROBES - 1])
+			p->mcast_probes =
+				RTA_GET_U32(tbp[NDTPA_MCAST_PROBES - 1]);
+
+		if (tbp[NDTPA_BASE_REACHABLE_TIME - 1])
+			p->base_reachable_time =
+				RTA_GET_MSECS(tbp[NDTPA_BASE_REACHABLE_TIME - 1]);
+
+		if (tbp[NDTPA_GC_STALETIME - 1])
+			p->gc_staletime =
+				RTA_GET_MSECS(tbp[NDTPA_GC_STALETIME - 1]);
+
+		if (tbp[NDTPA_DELAY_PROBE_TIME - 1])
+			p->delay_probe_time =
+				RTA_GET_MSECS(tbp[NDTPA_DELAY_PROBE_TIME - 1]);
+
+		if (tbp[NDTPA_RETRANS_TIME - 1])
+			p->retrans_time =
+				RTA_GET_MSECS(tbp[NDTPA_RETRANS_TIME - 1]);
+
+		if (tbp[NDTPA_ANYCAST_DELAY - 1])
+			p->anycast_delay =
+				RTA_GET_MSECS(tbp[NDTPA_ANYCAST_DELAY - 1]);
+
+		if (tbp[NDTPA_PROXY_DELAY - 1])
+			p->proxy_delay =
+				RTA_GET_MSECS(tbp[NDTPA_PROXY_DELAY - 1]);
+
+		if (tbp[NDTPA_LOCKTIME - 1])
+			p->locktime = RTA_GET_MSECS(tbp[NDTPA_LOCKTIME - 1]);
+	}
+
+	err = 0;
+
+rtattr_failure:
+	write_unlock_bh(&tbl->lock);
+errout:
+	read_unlock(&neigh_tbl_lock);
+	return err;
+}
+
+int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx, family;
+	int s_idx = cb->args[0];
+	struct neigh_table *tbl;
+
+	family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables, idx = 0; tbl; tbl = tbl->next) {
+		struct neigh_parms *p;
+
+		if (idx < s_idx || (family && tbl->family != family))
+			continue;
+
+		if (neightbl_fill_info(tbl, skb, cb) <= 0)
+			break;
+
+		for (++idx, p = tbl->parms.next; p; p = p->next, idx++) {
+			if (idx < s_idx)
+				continue;
+
+			if (neightbl_fill_param_info(tbl, p, skb, cb) <= 0)
+				goto out;
+		}
+
+	}
+out:
+	read_unlock(&neigh_tbl_lock);
+	cb->args[0] = idx;
+
+	return skb->len;
+}

 static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
-			   u32 pid, u32 seq, int event)
+			   u32 pid, u32 seq, int event, unsigned int flags)
 {
 	unsigned long now = jiffies;
 	unsigned char *b = skb->tail;
 	struct nda_cacheinfo ci;
 	int locked = 0;
 	u32 probes;
-	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event,
-					 sizeof(struct ndmsg));
+	struct nlmsghdr *nlh = NLMSG_NEW(skb, pid, seq, event,
+					 sizeof(struct ndmsg), flags);
 	struct ndmsg *ndm = NLMSG_DATA(nlh);

-	nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0;
 	ndm->ndm_family = n->ops->family;
+	ndm->ndm_pad1 = 0;
+	ndm->ndm_pad2 = 0;
 	ndm->ndm_flags = n->flags;
 	ndm->ndm_type = n->type;
 	ndm->ndm_ifindex = n->dev->ifindex;
@@ -1609,7 +1926,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 				continue;
 			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
 					    cb->nlh->nlmsg_seq,
-					    RTM_NEWNEIGH) <= 0) {
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) <= 0) {
 				read_unlock_bh(&tbl->lock);
 				rc = -1;
 				goto out;
@@ -2018,7 +2336,7 @@ void neigh_app_ns(struct neighbour *n)
 	if (!skb)
 		return;

-	if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) {
+	if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
@@ -2037,7 +2355,7 @@ static void neigh_app_notify(struct neighbour *n)
 	if (!skb)
 		return;

-	if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) {
+	if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
@@ -2281,7 +2599,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 		t->neigh_vars[17].extra1 = dev;
 	}

-	dev_name = net_sysctl_strdup(dev_name_source);
+	dev_name = kstrdup(dev_name_source, GFP_KERNEL);
 	if (!dev_name) {
 		err = -ENOBUFS;
 		goto free;
@@ -2352,6 +2670,8 @@ EXPORT_SYMBOL(neigh_update);
 EXPORT_SYMBOL(neigh_update_hhs);
 EXPORT_SYMBOL(pneigh_enqueue);
 EXPORT_SYMBOL(pneigh_lookup);
+EXPORT_SYMBOL(neightbl_dump_info);
+EXPORT_SYMBOL(neightbl_set);

 #ifdef CONFIG_ARPD
 EXPORT_SYMBOL(neigh_app_ns);
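
Note on the neighbour.c additions: neightbl_dump_info() follows the standard netlink dump convention, where cb->args[0] records how many objects have already been emitted so a dump that fills one skb resumes where it left off on the next callback. A minimal user-space model of that resumable iteration follows; the table of integers and the batch size are hypothetical stand-ins for the neighbour tables and the skb filling up.

#include <stdio.h>

#define TABLE_SIZE 10
#define BATCH 4			/* stands in for the skb running out of room */

/* Emit up to BATCH entries, skipping everything before *resume_idx,
 * and record where to pick up next time -- the cb->args[0] pattern. */
static int dump_batch(int *resume_idx)
{
	int idx, emitted = 0;

	for (idx = 0; idx < TABLE_SIZE; idx++) {
		if (idx < *resume_idx)
			continue;	/* already sent in a prior batch */
		if (emitted == BATCH)
			break;		/* "skb full": stop, remember idx */
		printf("entry %d\n", idx);
		emitted++;
	}
	*resume_idx = idx;
	return emitted;
}

int main(void)
{
	int resume = 0;

	while (dump_batch(&resume) > 0)
		printf("-- batch boundary, resume at %d --\n", resume);
	return 0;
}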
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 910eb4c05a47..e2137f3e489d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -185,6 +185,22 @@ static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, siz
 static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
 			 store_tx_queue_len);

+NETDEVICE_SHOW(weight, fmt_dec);
+
+static int change_weight(struct net_device *net, unsigned long new_weight)
+{
+	net->weight = new_weight;
+	return 0;
+}
+
+static ssize_t store_weight(struct class_device *dev, const char *buf, size_t len)
+{
+	return netdev_store(dev, buf, len, change_weight);
+}
+
+static CLASS_DEVICE_ATTR(weight, S_IRUGO | S_IWUSR, show_weight,
+			 store_weight);
+

 static struct class_device_attribute *net_class_attributes[] = {
 	&class_device_attr_ifindex,
@@ -194,6 +210,7 @@ static struct class_device_attribute *net_class_attributes[] = {
 	&class_device_attr_features,
 	&class_device_attr_mtu,
 	&class_device_attr_flags,
+	&class_device_attr_weight,
 	&class_device_attr_type,
 	&class_device_attr_address,
 	&class_device_attr_broadcast,
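
Note on the net-sysfs.c hunk: once the weight attribute is registered, each device's NAPI weight becomes readable and writable at runtime through the net class directory, e.g. /sys/class/net/eth0/weight on a hypothetical eth0. This complements the dev.c change above, where process_backlog() re-reads weight_p on every run so the backlog pseudo-device's weight also tracks its tunable immediately.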
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
index 22a8f127c4aa..076c156d5eda 100644
--- a/net/core/netfilter.c
+++ b/net/core/netfilter.c
@@ -141,136 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
 	up(&nf_sockopt_mutex);
 }

-#ifdef CONFIG_NETFILTER_DEBUG
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4.h>
-
-static void debug_print_hooks_ip(unsigned int nf_debug)
-{
-	if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
-		printk("PRE_ROUTING ");
-		nf_debug ^= (1 << NF_IP_PRE_ROUTING);
-	}
-	if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
-		printk("LOCAL_IN ");
-		nf_debug ^= (1 << NF_IP_LOCAL_IN);
-	}
-	if (nf_debug & (1 << NF_IP_FORWARD)) {
-		printk("FORWARD ");
-		nf_debug ^= (1 << NF_IP_FORWARD);
-	}
-	if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
-		printk("LOCAL_OUT ");
-		nf_debug ^= (1 << NF_IP_LOCAL_OUT);
-	}
-	if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
-		printk("POST_ROUTING ");
-		nf_debug ^= (1 << NF_IP_POST_ROUTING);
-	}
-	if (nf_debug)
-		printk("Crap bits: 0x%04X", nf_debug);
-	printk("\n");
-}
-
-static void nf_dump_skb(int pf, struct sk_buff *skb)
-{
-	printk("skb: pf=%i %s dev=%s len=%u\n",
-	       pf,
-	       skb->sk ? "(owned)" : "(unowned)",
-	       skb->dev ? skb->dev->name : "(no dev)",
-	       skb->len);
-	switch (pf) {
-	case PF_INET: {
-		const struct iphdr *ip = skb->nh.iph;
-		__u32 *opt = (__u32 *) (ip + 1);
-		int opti;
-		__u16 src_port = 0, dst_port = 0;
-
-		if (ip->protocol == IPPROTO_TCP
-		    || ip->protocol == IPPROTO_UDP) {
-			struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
-			src_port = ntohs(tcp->source);
-			dst_port = ntohs(tcp->dest);
-		}
-
-		printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
-		       " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
-		       ip->protocol, NIPQUAD(ip->saddr),
-		       src_port, NIPQUAD(ip->daddr),
-		       dst_port,
-		       ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
-		       ntohs(ip->frag_off), ip->ttl);
-
-		for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
-			printk(" O=0x%8.8X", *opt++);
-		printk("\n");
-	}
-	}
-}
-
-void nf_debug_ip_local_deliver(struct sk_buff *skb)
-{
-	/* If it's a loopback packet, it must have come through
-	 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
-	 * NF_IP_LOCAL_IN. Otherwise, must have gone through
-	 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */
-	if (!skb->dev) {
-		printk("ip_local_deliver: skb->dev is NULL.\n");
-	} else {
-		if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
-				      | (1<<NF_IP_LOCAL_IN))) {
-			printk("ip_local_deliver: bad skb: ");
-			debug_print_hooks_ip(skb->nf_debug);
-			nf_dump_skb(PF_INET, skb);
-		}
-	}
-}
-
-void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
-{
-	if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-				 | (1 << NF_IP_POST_ROUTING))) {
-		printk("ip_dev_loopback_xmit: bad owned skb = %p: ",
-		       newskb);
-		debug_print_hooks_ip(newskb->nf_debug);
-		nf_dump_skb(PF_INET, newskb);
-	}
-}
-
-void nf_debug_ip_finish_output2(struct sk_buff *skb)
-{
-	/* If it's owned, it must have gone through the
-	 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
-	 * Otherwise, must have gone through
-	 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
-	 */
-	if (skb->sk) {
-		if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-				      | (1 << NF_IP_POST_ROUTING))) {
-			printk("ip_finish_output: bad owned skb = %p: ", skb);
-			debug_print_hooks_ip(skb->nf_debug);
-			nf_dump_skb(PF_INET, skb);
-		}
-	} else {
-		if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
-				      | (1 << NF_IP_FORWARD)
-				      | (1 << NF_IP_POST_ROUTING))) {
-			/* Fragments, entunnelled packets, TCP RSTs
-			   generated by ipt_REJECT will have no
-			   owners, but still may be local */
-			if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-					      | (1 << NF_IP_POST_ROUTING))){
-				printk("ip_finish_output:"
-				       " bad unowned skb = %p: ",skb);
-				debug_print_hooks_ip(skb->nf_debug);
-				nf_dump_skb(PF_INET, skb);
-			}
-		}
-	}
-}
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
 /* Call get/setsockopt() */
 static int nf_sockopt(struct sock *sk, int pf, int val,
 		      char __user *opt, int *len, int get)
@@ -488,14 +358,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();

-#ifdef CONFIG_NETFILTER_DEBUG
-	if (unlikely((*pskb)->nf_debug & (1 << hook))) {
-		printk("nf_hook: hook %i already set.\n", hook);
-		nf_dump_skb(pf, *pskb);
-	}
-	(*pskb)->nf_debug |= (1 << hook);
-#endif
-
 	elem = &nf_hooks[pf][hook];
next_hook:
 	verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a119696d5521..a1a9a7abff50 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -33,6 +33,7 @@
 #define MAX_UDP_CHUNK 1460
 #define MAX_SKBS 32
 #define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+#define MAX_RETRIES 20000

 static DEFINE_SPINLOCK(skb_list_lock);
 static int nr_skbs;
@@ -130,19 +131,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
  */
 static void poll_napi(struct netpoll *np)
 {
+	struct netpoll_info *npinfo = np->dev->npinfo;
 	int budget = 16;

 	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
-	    np->poll_owner != smp_processor_id() &&
-	    spin_trylock(&np->poll_lock)) {
-		np->rx_flags |= NETPOLL_RX_DROP;
+	    npinfo->poll_owner != smp_processor_id() &&
+	    spin_trylock(&npinfo->poll_lock)) {
+		npinfo->rx_flags |= NETPOLL_RX_DROP;
 		atomic_inc(&trapped);

 		np->dev->poll(np->dev, &budget);

 		atomic_dec(&trapped);
-		np->rx_flags &= ~NETPOLL_RX_DROP;
-		spin_unlock(&np->poll_lock);
+		npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+		spin_unlock(&npinfo->poll_lock);
 	}
 }

@@ -245,16 +247,18 @@ repeat:
 static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
 	int status;
+	struct netpoll_info *npinfo;

-repeat:
-	if(!np || !np->dev || !netif_running(np->dev)) {
+	if (!np || !np->dev || !netif_running(np->dev)) {
 		__kfree_skb(skb);
 		return;
 	}

+	npinfo = np->dev->npinfo;
+
 	/* avoid recursion */
-	if(np->poll_owner == smp_processor_id() ||
+	if (npinfo->poll_owner == smp_processor_id() ||
 	    np->dev->xmit_lock_owner == smp_processor_id()) {
 		if (np->drop)
 			np->drop(skb);
 		else
@@ -262,30 +266,37 @@ repeat:
 		return;
 	}

-	spin_lock(&np->dev->xmit_lock);
-	np->dev->xmit_lock_owner = smp_processor_id();
+	do {
+		npinfo->tries--;
+		spin_lock(&np->dev->xmit_lock);
+		np->dev->xmit_lock_owner = smp_processor_id();

-	/*
-	 * network drivers do not expect to be called if the queue is
-	 * stopped.
-	 */
-	if (netif_queue_stopped(np->dev)) {
+		/*
+		 * network drivers do not expect to be called if the queue is
+		 * stopped.
+		 */
+		if (netif_queue_stopped(np->dev)) {
+			np->dev->xmit_lock_owner = -1;
+			spin_unlock(&np->dev->xmit_lock);
+			netpoll_poll(np);
+			udelay(50);
+			continue;
+		}
+
+		status = np->dev->hard_start_xmit(skb, np->dev);
 		np->dev->xmit_lock_owner = -1;
 		spin_unlock(&np->dev->xmit_lock);

-		netpoll_poll(np);
-		goto repeat;
-	}
-
-	status = np->dev->hard_start_xmit(skb, np->dev);
-	np->dev->xmit_lock_owner = -1;
-	spin_unlock(&np->dev->xmit_lock);
+		/* success */
+		if(!status) {
+			npinfo->tries = MAX_RETRIES; /* reset */
+			return;
+		}

-	/* transmit busy */
-	if(status) {
+		/* transmit busy */
 		netpoll_poll(np);
-		goto repeat;
-	}
+		udelay(50);
+	} while (npinfo->tries > 0);
 }

 void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -341,14 +352,18 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)

 static void arp_reply(struct sk_buff *skb)
 {
+	struct netpoll_info *npinfo = skb->dev->npinfo;
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
 	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
 	u32 sip, tip;
 	struct sk_buff *send_skb;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = NULL;

-	if (!np) return;
+	if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
+		np = npinfo->rx_np;
+	if (!np)
+		return;

 	/* No arp on this interface */
 	if (skb->dev->flags & IFF_NOARP)
@@ -429,9 +444,9 @@ int __netpoll_rx(struct sk_buff *skb)
 	int proto, len, ulen;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = skb->dev->npinfo->rx_np;

-	if (!np->rx_hook)
+	if (!np)
 		goto out;
 	if (skb->dev->type != ARPHRD_ETHER)
 		goto out;
@@ -611,9 +626,8 @@ int netpoll_setup(struct netpoll *np)
 {
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
-
-	np->poll_lock = SPIN_LOCK_UNLOCKED;
-	np->poll_owner = -1;
+	struct netpoll_info *npinfo;
+	unsigned long flags;

 	if (np->dev_name)
 		ndev = dev_get_by_name(np->dev_name);
@@ -624,7 +638,19 @@ int netpoll_setup(struct netpoll *np)
 	}

 	np->dev = ndev;
-	ndev->np = np;
+	if (!ndev->npinfo) {
+		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+		if (!npinfo)
+			goto release;
+
+		npinfo->rx_flags = 0;
+		npinfo->rx_np = NULL;
+		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
+		npinfo->poll_owner = -1;
+		npinfo->tries = MAX_RETRIES;
+		npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
+	} else
+		npinfo = ndev->npinfo;

 	if (!ndev->poll_controller) {
 		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
@@ -692,13 +718,27 @@ int netpoll_setup(struct netpoll *np)
 		       np->name, HIPQUAD(np->local_ip));
 	}

-	if(np->rx_hook)
-		np->rx_flags = NETPOLL_RX_ENABLED;
+	if (np->rx_hook) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+		npinfo->rx_np = np;
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+
+	/* fill up the skb queue */
+	refill_skbs();
+
+	/* last thing to do is link it to the net device structure */
+	ndev->npinfo = npinfo;
+
+	/* avoid racing with NAPI reading npinfo */
+	synchronize_rcu();

 	return 0;

 release:
-	ndev->np = NULL;
+	if (!ndev->npinfo)
+		kfree(npinfo);
 	np->dev = NULL;
 	dev_put(ndev);
 	return -1;
@@ -706,9 +746,20 @@ int netpoll_setup(struct netpoll *np)

 void netpoll_cleanup(struct netpoll *np)
 {
-	if (np->dev)
-		np->dev->np = NULL;
-	dev_put(np->dev);
+	struct netpoll_info *npinfo;
+	unsigned long flags;
+
+	if (np->dev) {
+		npinfo = np->dev->npinfo;
+		if (npinfo && npinfo->rx_np == np) {
+			spin_lock_irqsave(&npinfo->rx_lock, flags);
+			npinfo->rx_np = NULL;
+			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+			spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+		}
+		dev_put(np->dev);
+	}
+
 	np->dev = NULL;
 }

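
Note on the netpoll.c hunks: netpoll_send_skb() used to "goto repeat" indefinitely when the driver queue stayed busy; the new code decrements a shared npinfo->tries budget, sleeping 50us between attempts, gives up after MAX_RETRIES, and resets the budget only on a successful transmit. A standalone model of that bounded retry loop follows; try_xmit() is a hypothetical stand-in for hard_start_xmit().

#include <stdio.h>

#define MAX_RETRIES 20000	/* same bound the hunk above introduces */

static int tries = MAX_RETRIES;

/* Pretend driver: reports busy for the first busy_for calls,
 * then succeeds (returns 0, like hard_start_xmit on success). */
static int try_xmit(int busy_for)
{
	static int calls;
	return ++calls > busy_for ? 1 : 0 ? 0 : (calls > busy_for ? 0 : 1);
}

int main(void)
{
	int status = 1;

	while (tries > 0) {
		tries--;
		status = try_xmit(3);
		if (!status) {
			tries = MAX_RETRIES;	/* reset on success */
			break;
		}
		/* the kernel does netpoll_poll() + udelay(50) here */
	}
	printf("%s\n", status ? "gave up" : "sent");
	return 0;
}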
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06bc79f3..8eb083b6041a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
151#include <asm/timex.h> 151#include <asm/timex.h>
152 152
153 153
154#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" 154#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
155 155
156/* #define PG_DEBUG(a) a */ 156/* #define PG_DEBUG(a) a */
157#define PG_DEBUG(a) 157#define PG_DEBUG(a)
@@ -363,7 +363,7 @@ struct pktgen_thread {
363 * All Rights Reserved. 363 * All Rights Reserved.
364 * 364 *
365 */ 365 */
366inline static s64 divremdi3(s64 x, s64 y, int type) 366static inline s64 divremdi3(s64 x, s64 y, int type)
367{ 367{
368 u64 a = (x < 0) ? -x : x; 368 u64 a = (x < 0) ? -x : x;
369 u64 b = (y < 0) ? -y : y; 369 u64 b = (y < 0) ? -y : y;
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1921 struct iphdr *iph; 1921 struct iphdr *iph;
1922 struct pktgen_hdr *pgh = NULL; 1922 struct pktgen_hdr *pgh = NULL;
1923 1923
1924 /* Update any of the values, used when we're incrementing various
1925 * fields.
1926 */
1927 mod_cur_headers(pkt_dev);
1928
1924 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 1929 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
1925 if (!skb) { 1930 if (!skb) {
1926 sprintf(pkt_dev->result, "No memory"); 1931 sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1934 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); 1939 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
1935 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 1940 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
1936 1941
1937 /* Update any of the values, used when we're incrementing various
1938 * fields.
1939 */
1940 mod_cur_headers(pkt_dev);
1941
1942 memcpy(eth, pkt_dev->hh, 12); 1942 memcpy(eth, pkt_dev->hh, 12);
1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP); 1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP);
1944 1944
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2192 int datalen; 2192 int datalen;
2193 struct ipv6hdr *iph; 2193 struct ipv6hdr *iph;
2194 struct pktgen_hdr *pgh = NULL; 2194 struct pktgen_hdr *pgh = NULL;
2195 2195
2196 /* Update any of the values, used when we're incrementing various
2197 * fields.
2198 */
2199 mod_cur_headers(pkt_dev);
2200
2196 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 2201 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
2197 if (!skb) { 2202 if (!skb) {
2198 sprintf(pkt_dev->result, "No memory"); 2203 sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2206 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); 2211 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
2207 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 2212 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
2208 2213
2209
2210 /* Update any of the values, used when we're incrementing various
2211 * fields.
2212 */
2213 mod_cur_headers(pkt_dev);
2214
2215
2216 memcpy(eth, pkt_dev->hh, 12); 2214 memcpy(eth, pkt_dev->hh, 12);
2217 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6); 2215 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
2218 2216
2219
2220 datalen = pkt_dev->cur_pkt_size-14- 2217 datalen = pkt_dev->cur_pkt_size-14-
2221 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ 2218 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
2222 2219
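Both fill_packet hunks move mod_cur_headers() ahead of alloc_skb(); mod_cur_headers() is what advances cur_pkt_size (and the other cur_* fields), so running it first apparently lets the allocation be sized by the current packet rather than by the previous iteration's value. A toy userspace restatement of the ordering (hypothetical names, plain C):

	#include <stdio.h>
	#include <stdlib.h>

	static size_t cur_pkt_size = 64;	/* stands in for pkt_dev->cur_pkt_size */

	static void mod_cur_headers(void)
	{
		cur_pkt_size = 64 + rand() % 1400;	/* pick this packet's size */
	}

	int main(void)
	{
		mod_cur_headers();				/* update first ...  */
		char *buf = malloc(cur_pkt_size + 64 + 16);	/* ... then allocate */

		printf("allocated %zu bytes\n", cur_pkt_size + 64 + 16);
		free(buf);
		return 0;
	}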
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 000000000000..bb55675f0685
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,64 @@
1/*
2 * NET Generic infrastructure for Network protocols.
3 *
4 * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
5 *
6 * From code originally in include/net/tcp.h
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/module.h>
15#include <linux/random.h>
16#include <linux/slab.h>
17#include <linux/string.h>
18
19#include <net/request_sock.h>
20
21/*
22 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
23 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
24 * It would be better to replace it with a global counter for all sockets
25 * but then some measure against one socket starving all other sockets
26 * would be needed.
27 *
 28 * It was 128 by default. Experiments with real servers show that
 29 * this is absolutely not enough even at 100 conn/sec. 256 cures most
 30 * of the problems. This value is adjusted to 128 for very small machines
 31 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 32 * Increasing it further requires enlarging the hash table.
33 */
34int sysctl_max_syn_backlog = 256;
35EXPORT_SYMBOL(sysctl_max_syn_backlog);
36
37int reqsk_queue_alloc(struct request_sock_queue *queue,
38 const int nr_table_entries)
39{
40 const int lopt_size = sizeof(struct listen_sock) +
41 nr_table_entries * sizeof(struct request_sock *);
42 struct listen_sock *lopt = kmalloc(lopt_size, GFP_KERNEL);
43
44 if (lopt == NULL)
45 return -ENOMEM;
46
47 memset(lopt, 0, lopt_size);
48
49 for (lopt->max_qlen_log = 6;
50 (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
51 lopt->max_qlen_log++);
52
53 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
54 rwlock_init(&queue->syn_wait_lock);
 55	queue->rskq_accept_head = NULL;
56
57 write_lock_bh(&queue->syn_wait_lock);
58 queue->listen_opt = lopt;
59 write_unlock_bh(&queue->syn_wait_lock);
60
61 return 0;
62}
63
64EXPORT_SYMBOL(reqsk_queue_alloc);
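The loop in reqsk_queue_alloc() leaves max_qlen_log at the smallest power-of-two exponent that covers sysctl_max_syn_backlog, with a floor of 6 (64 slots). A standalone check of that arithmetic:

	#include <stdio.h>

	/* Recomputes listen_sock.max_qlen_log the way reqsk_queue_alloc() does:
	 * the smallest log with 1 << log >= backlog, but never below 6. */
	static int max_qlen_log(int backlog)
	{
		int log;

		for (log = 6; (1 << log) < backlog; log++)
			;
		return log;
	}

	int main(void)
	{
		int samples[] = { 16, 128, 256, 1000, 1024 };
		int i;

		for (i = 0; i < 5; i++)
			printf("backlog %4d -> max_qlen_log %2d (%d slots)\n",
			       samples[i], max_qlen_log(samples[i]),
			       1 << max_qlen_log(samples[i]));
		return 0;
	}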
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 00caf4b318b2..4b1bb30e6381 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -100,6 +100,7 @@ static const int rtm_min[RTM_NR_FAMILIES] =
100 [RTM_FAM(RTM_NEWPREFIX)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), 100 [RTM_FAM(RTM_NEWPREFIX)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
101 [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), 101 [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
102 [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), 102 [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
103 [RTM_FAM(RTM_NEWNEIGHTBL)] = NLMSG_LENGTH(sizeof(struct ndtmsg)),
103}; 104};
104 105
105static const int rta_max[RTM_NR_FAMILIES] = 106static const int rta_max[RTM_NR_FAMILIES] =
@@ -113,6 +114,7 @@ static const int rta_max[RTM_NR_FAMILIES] =
113 [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, 114 [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
114 [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, 115 [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
115 [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, 116 [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
117 [RTM_FAM(RTM_NEWNEIGHTBL)] = NDTA_MAX,
116}; 118};
117 119
118void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) 120void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
@@ -124,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
124 rta->rta_type = attrtype; 126 rta->rta_type = attrtype;
125 rta->rta_len = size; 127 rta->rta_len = size;
126 memcpy(RTA_DATA(rta), data, attrlen); 128 memcpy(RTA_DATA(rta), data, attrlen);
129 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
127} 130}
128 131
129size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) 132size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
@@ -176,16 +179,17 @@ rtattr_failure:
176 179
177 180
178static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, 181static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
179 int type, u32 pid, u32 seq, u32 change) 182 int type, u32 pid, u32 seq, u32 change,
183 unsigned int flags)
180{ 184{
181 struct ifinfomsg *r; 185 struct ifinfomsg *r;
182 struct nlmsghdr *nlh; 186 struct nlmsghdr *nlh;
183 unsigned char *b = skb->tail; 187 unsigned char *b = skb->tail;
184 188
185 nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); 189 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
186 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
187 r = NLMSG_DATA(nlh); 190 r = NLMSG_DATA(nlh);
188 r->ifi_family = AF_UNSPEC; 191 r->ifi_family = AF_UNSPEC;
192 r->__ifi_pad = 0;
189 r->ifi_type = dev->type; 193 r->ifi_type = dev->type;
190 r->ifi_index = dev->ifindex; 194 r->ifi_index = dev->ifindex;
191 r->ifi_flags = dev_get_flags(dev); 195 r->ifi_flags = dev_get_flags(dev);
@@ -273,7 +277,10 @@ static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *c
273 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { 277 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
274 if (idx < s_idx) 278 if (idx < s_idx)
275 continue; 279 continue;
276 if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) 280 if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK,
281 NETLINK_CB(cb->skb).pid,
282 cb->nlh->nlmsg_seq, 0,
283 NLM_F_MULTI) <= 0)
277 break; 284 break;
278 } 285 }
279 read_unlock(&dev_base_lock); 286 read_unlock(&dev_base_lock);
@@ -447,7 +454,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
447 if (!skb) 454 if (!skb)
448 return; 455 return;
449 456
450 if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) { 457 if (rtnetlink_fill_ifinfo(skb, dev, type, current->pid, 0, change, 0) < 0) {
451 kfree_skb(skb); 458 kfree_skb(skb);
452 return; 459 return;
453 } 460 }
@@ -649,14 +656,16 @@ static void rtnetlink_rcv(struct sock *sk, int len)
649 656
650static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] = 657static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
651{ 658{
652 [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo }, 659 [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
653 [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink }, 660 [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink },
654 [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, 661 [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
655 [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, 662 [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
656 [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add }, 663 [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add },
657 [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete }, 664 [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete },
658 [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info }, 665 [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info },
659 [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, 666 [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
667 [RTM_GETNEIGHTBL - RTM_BASE] = { .dumpit = neightbl_dump_info },
668 [RTM_SETNEIGHTBL - RTM_BASE] = { .doit = neightbl_set },
660}; 669};
661 670
662static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) 671static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
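The new memset() in __rta_fill() zeroes the tail padding that RTA_ALIGN() adds, so rtnetlink no longer leaks whatever bytes happened to follow the attribute in the skb. The padding arithmetic, restated as a standalone program (the local rtattr stub mirrors the real 4-byte header):

	#include <stdio.h>

	#define RTA_ALIGNTO	4
	#define RTA_ALIGN(len)	(((len) + RTA_ALIGNTO - 1) & ~(RTA_ALIGNTO - 1))

	struct rtattr { unsigned short rta_len, rta_type; };	/* 4-byte header */

	int main(void)
	{
		int attrlen;

		for (attrlen = 0; attrlen <= 8; attrlen++) {
			int size = sizeof(struct rtattr) + attrlen;	/* RTA_LENGTH(attrlen) */

			printf("attrlen %d: rta_len %2d, padded to %2d, %d pad byte(s) zeroed\n",
			       attrlen, size, RTA_ALIGN(size), RTA_ALIGN(size) - size);
		}
		return 0;
	}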
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f65b3de590a9..7eab867ede59 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
129 * Buffers may only be allocated from interrupts using a @gfp_mask of 129 * Buffers may only be allocated from interrupts using a @gfp_mask of
130 * %GFP_ATOMIC. 130 * %GFP_ATOMIC.
131 */ 131 */
132struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) 132struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
133{ 133{
134 struct sk_buff *skb; 134 struct sk_buff *skb;
135 u8 *data; 135 u8 *data;
@@ -182,7 +182,8 @@ nodata:
182 * %GFP_ATOMIC. 182 * %GFP_ATOMIC.
183 */ 183 */
184struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, 184struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
185 unsigned int size, int gfp_mask) 185 unsigned int size,
186 unsigned int __nocast gfp_mask)
186{ 187{
187 struct sk_buff *skb; 188 struct sk_buff *skb;
188 u8 *data; 189 u8 *data;
@@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb)
322 * %GFP_ATOMIC. 323 * %GFP_ATOMIC.
323 */ 324 */
324 325
325struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) 326struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
326{ 327{
327 struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 328 struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
328 329
@@ -357,7 +358,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
357 C(ip_summed); 358 C(ip_summed);
358 C(priority); 359 C(priority);
359 C(protocol); 360 C(protocol);
360 C(security);
361 n->destructor = NULL; 361 n->destructor = NULL;
362#ifdef CONFIG_NETFILTER 362#ifdef CONFIG_NETFILTER
363 C(nfmark); 363 C(nfmark);
@@ -365,9 +365,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
365 C(nfct); 365 C(nfct);
366 nf_conntrack_get(skb->nfct); 366 nf_conntrack_get(skb->nfct);
367 C(nfctinfo); 367 C(nfctinfo);
368#ifdef CONFIG_NETFILTER_DEBUG
369 C(nf_debug);
370#endif
371#ifdef CONFIG_BRIDGE_NETFILTER 368#ifdef CONFIG_BRIDGE_NETFILTER
372 C(nf_bridge); 369 C(nf_bridge);
373 nf_bridge_get(skb->nf_bridge); 370 nf_bridge_get(skb->nf_bridge);
@@ -380,8 +377,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
380 C(tc_index); 377 C(tc_index);
381#ifdef CONFIG_NET_CLS_ACT 378#ifdef CONFIG_NET_CLS_ACT
382 n->tc_verd = SET_TC_VERD(skb->tc_verd,0); 379 n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
383 n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd); 380 n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
384 n->tc_verd = CLR_TC_MUNGED(skb->tc_verd); 381 n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
385 C(input_dev); 382 C(input_dev);
386 C(tc_classid); 383 C(tc_classid);
387#endif 384#endif
@@ -425,16 +422,12 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
425 new->pkt_type = old->pkt_type; 422 new->pkt_type = old->pkt_type;
426 new->stamp = old->stamp; 423 new->stamp = old->stamp;
427 new->destructor = NULL; 424 new->destructor = NULL;
428 new->security = old->security;
429#ifdef CONFIG_NETFILTER 425#ifdef CONFIG_NETFILTER
430 new->nfmark = old->nfmark; 426 new->nfmark = old->nfmark;
431 new->nfcache = old->nfcache; 427 new->nfcache = old->nfcache;
432 new->nfct = old->nfct; 428 new->nfct = old->nfct;
433 nf_conntrack_get(old->nfct); 429 nf_conntrack_get(old->nfct);
434 new->nfctinfo = old->nfctinfo; 430 new->nfctinfo = old->nfctinfo;
435#ifdef CONFIG_NETFILTER_DEBUG
436 new->nf_debug = old->nf_debug;
437#endif
438#ifdef CONFIG_BRIDGE_NETFILTER 431#ifdef CONFIG_BRIDGE_NETFILTER
439 new->nf_bridge = old->nf_bridge; 432 new->nf_bridge = old->nf_bridge;
440 nf_bridge_get(old->nf_bridge); 433 nf_bridge_get(old->nf_bridge);
@@ -468,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
468 * header is going to be modified. Use pskb_copy() instead. 461 * header is going to be modified. Use pskb_copy() instead.
469 */ 462 */
470 463
471struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) 464struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
472{ 465{
473 int headerlen = skb->data - skb->head; 466 int headerlen = skb->data - skb->head;
474 /* 467 /*
@@ -507,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
507 * The returned buffer has a reference count of 1. 500 * The returned buffer has a reference count of 1.
508 */ 501 */
509 502
510struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) 503struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
511{ 504{
512 /* 505 /*
513 * Allocate the copy buffer 506 * Allocate the copy buffer
@@ -565,7 +558,8 @@ out:
565 * reloaded after call to this function. 558 * reloaded after call to this function.
566 */ 559 */
567 560
568int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) 561int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
562 unsigned int __nocast gfp_mask)
569{ 563{
570 int i; 564 int i;
571 u8 *data; 565 u8 *data;
@@ -655,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
655 * only by netfilter in the cases when checksum is recalculated? --ANK 649 * only by netfilter in the cases when checksum is recalculated? --ANK
656 */ 650 */
657struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 651struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
658 int newheadroom, int newtailroom, int gfp_mask) 652 int newheadroom, int newtailroom,
653 unsigned int __nocast gfp_mask)
659{ 654{
660 /* 655 /*
661 * Allocate the copy buffer 656 * Allocate the copy buffer
@@ -1506,6 +1501,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
1506 skb_split_no_header(skb, skb1, len, pos); 1501 skb_split_no_header(skb, skb1, len, pos);
1507} 1502}
1508 1503
1504/**
1505 * skb_prepare_seq_read - Prepare a sequential read of skb data
1506 * @skb: the buffer to read
1507 * @from: lower offset of data to be read
1508 * @to: upper offset of data to be read
1509 * @st: state variable
1510 *
1511 * Initializes the specified state variable. Must be called before
1512 * invoking skb_seq_read() for the first time.
1513 */
1514void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
1515 unsigned int to, struct skb_seq_state *st)
1516{
1517 st->lower_offset = from;
1518 st->upper_offset = to;
1519 st->root_skb = st->cur_skb = skb;
1520 st->frag_idx = st->stepped_offset = 0;
1521 st->frag_data = NULL;
1522}
1523
1524/**
1525 * skb_seq_read - Sequentially read skb data
1526 * @consumed: number of bytes consumed by the caller so far
1527 * @data: destination pointer for data to be returned
1528 * @st: state variable
1529 *
1530 * Reads a block of skb data at &consumed relative to the
1531 * lower offset specified to skb_prepare_seq_read(). Assigns
1532 * the head of the data block to &data and returns the length
1533 * of the block or 0 if the end of the skb data or the upper
1534 * offset has been reached.
1535 *
1536 * The caller is not required to consume all of the data
1537 * returned, i.e. &consumed is typically set to the number
1538 * of bytes already consumed and the next call to
1539 * skb_seq_read() will return the remaining part of the block.
1540 *
 1542 * Note: The size of each block of data returned can be arbitrary;
 1543 * this limitation is the cost of zerocopy sequential
 1544 * reads of potentially non-linear data.
1544 *
1545 * Note: Fragment lists within fragments are not implemented
1546 * at the moment, state->root_skb could be replaced with
1547 * a stack for this purpose.
1548 */
1549unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
1550 struct skb_seq_state *st)
1551{
1552 unsigned int block_limit, abs_offset = consumed + st->lower_offset;
1553 skb_frag_t *frag;
1554
1555 if (unlikely(abs_offset >= st->upper_offset))
1556 return 0;
1557
1558next_skb:
1559 block_limit = skb_headlen(st->cur_skb);
1560
1561 if (abs_offset < block_limit) {
1562 *data = st->cur_skb->data + abs_offset;
1563 return block_limit - abs_offset;
1564 }
1565
1566 if (st->frag_idx == 0 && !st->frag_data)
1567 st->stepped_offset += skb_headlen(st->cur_skb);
1568
1569 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
1570 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
1571 block_limit = frag->size + st->stepped_offset;
1572
1573 if (abs_offset < block_limit) {
1574 if (!st->frag_data)
1575 st->frag_data = kmap_skb_frag(frag);
1576
1577 *data = (u8 *) st->frag_data + frag->page_offset +
1578 (abs_offset - st->stepped_offset);
1579
1580 return block_limit - abs_offset;
1581 }
1582
1583 if (st->frag_data) {
1584 kunmap_skb_frag(st->frag_data);
1585 st->frag_data = NULL;
1586 }
1587
1588 st->frag_idx++;
1589 st->stepped_offset += frag->size;
1590 }
1591
1592 if (st->cur_skb->next) {
1593 st->cur_skb = st->cur_skb->next;
1594 st->frag_idx = 0;
1595 goto next_skb;
1596 } else if (st->root_skb == st->cur_skb &&
1597 skb_shinfo(st->root_skb)->frag_list) {
1598 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
1599 goto next_skb;
1600 }
1601
1602 return 0;
1603}
1604
1605/**
1606 * skb_abort_seq_read - Abort a sequential read of skb data
1607 * @st: state variable
1608 *
 1609 * Must be called if the read is abandoned before skb_seq_read()
 1610 * has returned 0, so that any mapped fragment is released.
1611 */
1612void skb_abort_seq_read(struct skb_seq_state *st)
1613{
1614 if (st->frag_data)
1615 kunmap_skb_frag(st->frag_data);
1616}
1617
1618#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
1619
1620static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
1621 struct ts_config *conf,
1622 struct ts_state *state)
1623{
1624 return skb_seq_read(offset, text, TS_SKB_CB(state));
1625}
1626
1627static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
1628{
1629 skb_abort_seq_read(TS_SKB_CB(state));
1630}
1631
1632/**
1633 * skb_find_text - Find a text pattern in skb data
1634 * @skb: the buffer to look in
1635 * @from: search offset
1636 * @to: search limit
1637 * @config: textsearch configuration
1638 * @state: uninitialized textsearch state variable
1639 *
1640 * Finds a pattern in the skb data according to the specified
1641 * textsearch configuration. Use textsearch_next() to retrieve
1642 * subsequent occurrences of the pattern. Returns the offset
1643 * to the first occurrence or UINT_MAX if no match was found.
1644 */
1645unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
1646 unsigned int to, struct ts_config *config,
1647 struct ts_state *state)
1648{
1649 config->get_next_block = skb_ts_get_next_block;
1650 config->finish = skb_ts_finish;
1651
1652 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
1653
1654 return textsearch_find(config, state);
1655}
1656
1509void __init skb_init(void) 1657void __init skb_init(void)
1510{ 1658{
1511 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 1659 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1544,3 +1692,7 @@ EXPORT_SYMBOL(skb_queue_tail);
1544EXPORT_SYMBOL(skb_unlink); 1692EXPORT_SYMBOL(skb_unlink);
1545EXPORT_SYMBOL(skb_append); 1693EXPORT_SYMBOL(skb_append);
1546EXPORT_SYMBOL(skb_split); 1694EXPORT_SYMBOL(skb_split);
1695EXPORT_SYMBOL(skb_prepare_seq_read);
1696EXPORT_SYMBOL(skb_seq_read);
1697EXPORT_SYMBOL(skb_abort_seq_read);
1698EXPORT_SYMBOL(skb_find_text);
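For readers of the new API, a hedged sketch of a consumer (a hypothetical function, not part of the patch) that walks bytes [from, to) of a possibly non-linear skb without linearizing it; skb_find_text() is essentially this loop packaged behind the textsearch infrastructure:

	static void walk_skb_range(struct sk_buff *skb, unsigned int from,
				   unsigned int to)
	{
		struct skb_seq_state st;
		const u8 *data;
		unsigned int consumed = 0, len;

		skb_prepare_seq_read(skb, from, to, &st);
		while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
			/* data points at len contiguous bytes of the range */
			consumed += len;
		}
		/* the loop ran until skb_seq_read() returned 0, so no
		 * skb_abort_seq_read() is required; call it when bailing out early */
	}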
diff --git a/net/core/sock.c b/net/core/sock.c
index 96e00b08698f..12f6d9a2a522 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,6 +118,7 @@
118#include <linux/netdevice.h> 118#include <linux/netdevice.h>
119#include <net/protocol.h> 119#include <net/protocol.h>
120#include <linux/skbuff.h> 120#include <linux/skbuff.h>
121#include <net/request_sock.h>
121#include <net/sock.h> 122#include <net/sock.h>
122#include <net/xfrm.h> 123#include <net/xfrm.h>
123#include <linux/ipsec.h> 124#include <linux/ipsec.h>
@@ -205,13 +206,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
205 */ 206 */
206 207
207#ifdef SO_DONTLINGER /* Compatibility item... */ 208#ifdef SO_DONTLINGER /* Compatibility item... */
208 switch (optname) { 209 if (optname == SO_DONTLINGER) {
209 case SO_DONTLINGER: 210 lock_sock(sk);
210 sock_reset_flag(sk, SOCK_LINGER); 211 sock_reset_flag(sk, SOCK_LINGER);
211 return 0; 212 release_sock(sk);
213 return 0;
212 } 214 }
213#endif 215#endif
214 216
215 if(optlen<sizeof(int)) 217 if(optlen<sizeof(int))
216 return(-EINVAL); 218 return(-EINVAL);
217 219
@@ -621,7 +623,8 @@ lenout:
621 * @prot: struct proto associated with this new sock instance 623 * @prot: struct proto associated with this new sock instance
622 * @zero_it: if we should zero the newly allocated sock 624 * @zero_it: if we should zero the newly allocated sock
623 */ 625 */
624struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it) 626struct sock *sk_alloc(int family, unsigned int __nocast priority,
627 struct proto *prot, int zero_it)
625{ 628{
626 struct sock *sk = NULL; 629 struct sock *sk = NULL;
627 kmem_cache_t *slab = prot->slab; 630 kmem_cache_t *slab = prot->slab;
@@ -749,7 +752,8 @@ unsigned long sock_i_ino(struct sock *sk)
749/* 752/*
750 * Allocate a skb from the socket's send buffer. 753 * Allocate a skb from the socket's send buffer.
751 */ 754 */
752struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) 755struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
756 unsigned int __nocast priority)
753{ 757{
754 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 758 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
755 struct sk_buff * skb = alloc_skb(size, priority); 759 struct sk_buff * skb = alloc_skb(size, priority);
@@ -764,7 +768,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
764/* 768/*
765 * Allocate a skb from the socket's receive buffer. 769 * Allocate a skb from the socket's receive buffer.
766 */ 770 */
767struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) 771struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
772 unsigned int __nocast priority)
768{ 773{
769 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { 774 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
770 struct sk_buff *skb = alloc_skb(size, priority); 775 struct sk_buff *skb = alloc_skb(size, priority);
@@ -779,7 +784,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
779/* 784/*
780 * Allocate a memory block from the socket's option memory buffer. 785 * Allocate a memory block from the socket's option memory buffer.
781 */ 786 */
782void *sock_kmalloc(struct sock *sk, int size, int priority) 787void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
783{ 788{
784 if ((unsigned)size <= sysctl_optmem_max && 789 if ((unsigned)size <= sysctl_optmem_max &&
785 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 790 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
@@ -1363,6 +1368,7 @@ static LIST_HEAD(proto_list);
1363 1368
1364int proto_register(struct proto *prot, int alloc_slab) 1369int proto_register(struct proto *prot, int alloc_slab)
1365{ 1370{
1371 char *request_sock_slab_name;
1366 int rc = -ENOBUFS; 1372 int rc = -ENOBUFS;
1367 1373
1368 if (alloc_slab) { 1374 if (alloc_slab) {
@@ -1374,6 +1380,25 @@ int proto_register(struct proto *prot, int alloc_slab)
1374 prot->name); 1380 prot->name);
1375 goto out; 1381 goto out;
1376 } 1382 }
1383
1384 if (prot->rsk_prot != NULL) {
1385 static const char mask[] = "request_sock_%s";
1386
1387 request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1388 if (request_sock_slab_name == NULL)
1389 goto out_free_sock_slab;
1390
1391 sprintf(request_sock_slab_name, mask, prot->name);
1392 prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1393 prot->rsk_prot->obj_size, 0,
1394 SLAB_HWCACHE_ALIGN, NULL, NULL);
1395
1396 if (prot->rsk_prot->slab == NULL) {
1397 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1398 prot->name);
1399 goto out_free_request_sock_slab_name;
1400 }
1401 }
1377 } 1402 }
1378 1403
1379 write_lock(&proto_list_lock); 1404 write_lock(&proto_list_lock);
@@ -1382,6 +1407,12 @@ int proto_register(struct proto *prot, int alloc_slab)
1382 rc = 0; 1407 rc = 0;
1383out: 1408out:
1384 return rc; 1409 return rc;
1410out_free_request_sock_slab_name:
1411 kfree(request_sock_slab_name);
1412out_free_sock_slab:
1413 kmem_cache_destroy(prot->slab);
1414 prot->slab = NULL;
1415 goto out;
1385} 1416}
1386 1417
1387EXPORT_SYMBOL(proto_register); 1418EXPORT_SYMBOL(proto_register);
@@ -1395,6 +1426,14 @@ void proto_unregister(struct proto *prot)
1395 prot->slab = NULL; 1426 prot->slab = NULL;
1396 } 1427 }
1397 1428
1429 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1430 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1431
1432 kmem_cache_destroy(prot->rsk_prot->slab);
1433 kfree(name);
1434 prot->rsk_prot->slab = NULL;
1435 }
1436
1398 list_del(&prot->node); 1437 list_del(&prot->node);
1399 write_unlock(&proto_list_lock); 1438 write_unlock(&proto_list_lock);
1400} 1439}
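A sketch of the caller's side of the new proto_register() behaviour (the protocol here is hypothetical; the field names follow the patch): any proto that sets rsk_prot now gets a request_sock_<name> slab created and torn down for it automatically.

	static struct request_sock_ops my_rsk_ops = {
		.obj_size = sizeof(struct request_sock),
	};

	static struct proto my_proto = {
		.name     = "MYPROTO",
		.obj_size = sizeof(struct sock),
		.rsk_prot = &my_rsk_ops,
	};

	/* proto_register(&my_proto, 1) creates both the "MYPROTO" sock slab and
	 * a "request_sock_MYPROTO" cache; proto_unregister() frees them again. */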
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8be646cb191..8f817ad9f546 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,12 +13,8 @@
13#ifdef CONFIG_SYSCTL 13#ifdef CONFIG_SYSCTL
14 14
15extern int netdev_max_backlog; 15extern int netdev_max_backlog;
16extern int netdev_budget;
16extern int weight_p; 17extern int weight_p;
17extern int no_cong_thresh;
18extern int no_cong;
19extern int lo_cong;
20extern int mod_cong;
21extern int netdev_fastroute;
22extern int net_msg_cost; 18extern int net_msg_cost;
23extern int net_msg_burst; 19extern int net_msg_burst;
24 20
@@ -35,19 +31,6 @@ extern int sysctl_somaxconn;
35extern char sysctl_divert_version[]; 31extern char sysctl_divert_version[];
36#endif /* CONFIG_NET_DIVERT */ 32#endif /* CONFIG_NET_DIVERT */
37 33
38/*
39 * This strdup() is used for creating copies of network
40 * device names to be handed over to sysctl.
41 */
42
43char *net_sysctl_strdup(const char *s)
44{
45 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
46 if (rv)
47 strcpy(rv, s);
48 return rv;
49}
50
51ctl_table core_table[] = { 34ctl_table core_table[] = {
52#ifdef CONFIG_NET 35#ifdef CONFIG_NET
53 { 36 {
@@ -99,38 +82,6 @@ ctl_table core_table[] = {
99 .proc_handler = &proc_dointvec 82 .proc_handler = &proc_dointvec
100 }, 83 },
101 { 84 {
102 .ctl_name = NET_CORE_NO_CONG_THRESH,
103 .procname = "no_cong_thresh",
104 .data = &no_cong_thresh,
105 .maxlen = sizeof(int),
106 .mode = 0644,
107 .proc_handler = &proc_dointvec
108 },
109 {
110 .ctl_name = NET_CORE_NO_CONG,
111 .procname = "no_cong",
112 .data = &no_cong,
113 .maxlen = sizeof(int),
114 .mode = 0644,
115 .proc_handler = &proc_dointvec
116 },
117 {
118 .ctl_name = NET_CORE_LO_CONG,
119 .procname = "lo_cong",
120 .data = &lo_cong,
121 .maxlen = sizeof(int),
122 .mode = 0644,
123 .proc_handler = &proc_dointvec
124 },
125 {
126 .ctl_name = NET_CORE_MOD_CONG,
127 .procname = "mod_cong",
128 .data = &mod_cong,
129 .maxlen = sizeof(int),
130 .mode = 0644,
131 .proc_handler = &proc_dointvec
132 },
133 {
134 .ctl_name = NET_CORE_MSG_COST, 85 .ctl_name = NET_CORE_MSG_COST,
135 .procname = "message_cost", 86 .procname = "message_cost",
136 .data = &net_msg_cost, 87 .data = &net_msg_cost,
@@ -174,9 +125,15 @@ ctl_table core_table[] = {
174 .mode = 0644, 125 .mode = 0644,
175 .proc_handler = &proc_dointvec 126 .proc_handler = &proc_dointvec
176 }, 127 },
128 {
129 .ctl_name = NET_CORE_BUDGET,
130 .procname = "netdev_budget",
131 .data = &netdev_budget,
132 .maxlen = sizeof(int),
133 .mode = 0644,
134 .proc_handler = &proc_dointvec
135 },
177 { .ctl_name = 0 } 136 { .ctl_name = 0 }
178}; 137};
179 138
180EXPORT_SYMBOL(net_sysctl_strdup);
181
182#endif 139#endif
diff --git a/net/core/utils.c b/net/core/utils.c
index e11a8654f363..88eb8b68e26b 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -23,10 +23,10 @@
23#include <linux/percpu.h> 23#include <linux/percpu.h>
24#include <linux/init.h> 24#include <linux/init.h>
25 25
26#include <asm/byteorder.h>
26#include <asm/system.h> 27#include <asm/system.h>
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
29
30/* 30/*
31 This is a maximally equidistributed combined Tausworthe generator 31 This is a maximally equidistributed combined Tausworthe generator
32 based on code from GNU Scientific Library 1.5 (30 Jun 2004) 32 based on code from GNU Scientific Library 1.5 (30 Jun 2004)
@@ -153,3 +153,38 @@ int net_ratelimit(void)
153EXPORT_SYMBOL(net_random); 153EXPORT_SYMBOL(net_random);
154EXPORT_SYMBOL(net_ratelimit); 154EXPORT_SYMBOL(net_ratelimit);
155EXPORT_SYMBOL(net_srandom); 155EXPORT_SYMBOL(net_srandom);
156
157/*
158 * Convert an ASCII string to binary IP.
159 * This is outside of net/ipv4/ because various code that uses IP addresses
160 * is otherwise not dependent on the TCP/IP stack.
161 */
162
163__u32 in_aton(const char *str)
164{
165 unsigned long l;
166 unsigned int val;
167 int i;
168
169 l = 0;
170 for (i = 0; i < 4; i++)
171 {
172 l <<= 8;
173 if (*str != '\0')
174 {
175 val = 0;
176 while (*str != '\0' && *str != '.')
177 {
178 val *= 10;
179 val += *str - '0';
180 str++;
181 }
182 l |= val;
183 if (*str != '\0')
184 str++;
185 }
186 }
187 return(htonl(l));
188}
189
190EXPORT_SYMBOL(in_aton);
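Note that in_aton() performs no validation and lets missing trailing components default to zero, so e.g. "10.1" becomes 10.1.0.0. A userspace copy of the same loop (htonl()/ntohl() from <arpa/inet.h> standing in for the kernel's) demonstrating both points:

	#include <arpa/inet.h>
	#include <stdio.h>

	static unsigned int aton(const char *str)
	{
		unsigned long l = 0;
		int i;

		for (i = 0; i < 4; i++) {
			l <<= 8;
			if (*str != '\0') {
				unsigned int val = 0;

				while (*str != '\0' && *str != '.')
					val = val * 10 + *str++ - '0';
				l |= val;
				if (*str != '\0')
					str++;
			}
		}
		return htonl(l);
	}

	int main(void)
	{
		printf("%08x\n", ntohl(aton("192.168.0.1")));	/* c0a80001 */
		printf("%08x\n", ntohl(aton("10.1")));		/* 0a010000 */
		return 0;
	}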
diff --git a/net/core/wireless.c b/net/core/wireless.c
index 750cc5daeb03..3ff5639c0b78 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -2,7 +2,7 @@
2 * This file implements the Wireless Extensions APIs. 2 * This file implements the Wireless Extensions APIs.
3 * 3 *
4 * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> 4 * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
5 * Copyright (c) 1997-2004 Jean Tourrilhes, All Rights Reserved. 5 * Copyright (c) 1997-2005 Jean Tourrilhes, All Rights Reserved.
6 * 6 *
7 * (As all part of the Linux kernel, this file is GPL) 7 * (As all part of the Linux kernel, this file is GPL)
8 */ 8 */
@@ -187,6 +187,12 @@ static const struct iw_ioctl_description standard_ioctl[] = {
187 .header_type = IW_HEADER_TYPE_ADDR, 187 .header_type = IW_HEADER_TYPE_ADDR,
188 .flags = IW_DESCR_FLAG_DUMP, 188 .flags = IW_DESCR_FLAG_DUMP,
189 }, 189 },
190 [SIOCSIWMLME - SIOCIWFIRST] = {
191 .header_type = IW_HEADER_TYPE_POINT,
192 .token_size = 1,
193 .min_tokens = sizeof(struct iw_mlme),
194 .max_tokens = sizeof(struct iw_mlme),
195 },
190 [SIOCGIWAPLIST - SIOCIWFIRST] = { 196 [SIOCGIWAPLIST - SIOCIWFIRST] = {
191 .header_type = IW_HEADER_TYPE_POINT, 197 .header_type = IW_HEADER_TYPE_POINT,
192 .token_size = sizeof(struct sockaddr) + 198 .token_size = sizeof(struct sockaddr) +
@@ -195,7 +201,10 @@ static const struct iw_ioctl_description standard_ioctl[] = {
195 .flags = IW_DESCR_FLAG_NOMAX, 201 .flags = IW_DESCR_FLAG_NOMAX,
196 }, 202 },
197 [SIOCSIWSCAN - SIOCIWFIRST] = { 203 [SIOCSIWSCAN - SIOCIWFIRST] = {
198 .header_type = IW_HEADER_TYPE_PARAM, 204 .header_type = IW_HEADER_TYPE_POINT,
205 .token_size = 1,
206 .min_tokens = 0,
207 .max_tokens = sizeof(struct iw_scan_req),
199 }, 208 },
200 [SIOCGIWSCAN - SIOCIWFIRST] = { 209 [SIOCGIWSCAN - SIOCIWFIRST] = {
201 .header_type = IW_HEADER_TYPE_POINT, 210 .header_type = IW_HEADER_TYPE_POINT,
@@ -273,6 +282,42 @@ static const struct iw_ioctl_description standard_ioctl[] = {
273 [SIOCGIWPOWER - SIOCIWFIRST] = { 282 [SIOCGIWPOWER - SIOCIWFIRST] = {
274 .header_type = IW_HEADER_TYPE_PARAM, 283 .header_type = IW_HEADER_TYPE_PARAM,
275 }, 284 },
285 [SIOCSIWGENIE - SIOCIWFIRST] = {
286 .header_type = IW_HEADER_TYPE_POINT,
287 .token_size = 1,
288 .max_tokens = IW_GENERIC_IE_MAX,
289 },
290 [SIOCGIWGENIE - SIOCIWFIRST] = {
291 .header_type = IW_HEADER_TYPE_POINT,
292 .token_size = 1,
293 .max_tokens = IW_GENERIC_IE_MAX,
294 },
295 [SIOCSIWAUTH - SIOCIWFIRST] = {
296 .header_type = IW_HEADER_TYPE_PARAM,
297 },
298 [SIOCGIWAUTH - SIOCIWFIRST] = {
299 .header_type = IW_HEADER_TYPE_PARAM,
300 },
301 [SIOCSIWENCODEEXT - SIOCIWFIRST] = {
302 .header_type = IW_HEADER_TYPE_POINT,
303 .token_size = 1,
304 .min_tokens = sizeof(struct iw_encode_ext),
305 .max_tokens = sizeof(struct iw_encode_ext) +
306 IW_ENCODING_TOKEN_MAX,
307 },
308 [SIOCGIWENCODEEXT - SIOCIWFIRST] = {
309 .header_type = IW_HEADER_TYPE_POINT,
310 .token_size = 1,
311 .min_tokens = sizeof(struct iw_encode_ext),
312 .max_tokens = sizeof(struct iw_encode_ext) +
313 IW_ENCODING_TOKEN_MAX,
314 },
315 [SIOCSIWPMKSA - SIOCIWFIRST] = {
316 .header_type = IW_HEADER_TYPE_POINT,
317 .token_size = 1,
318 .min_tokens = sizeof(struct iw_pmksa),
319 .max_tokens = sizeof(struct iw_pmksa),
320 },
276}; 321};
277static const int standard_ioctl_num = (sizeof(standard_ioctl) / 322static const int standard_ioctl_num = (sizeof(standard_ioctl) /
278 sizeof(struct iw_ioctl_description)); 323 sizeof(struct iw_ioctl_description));
@@ -299,6 +344,31 @@ static const struct iw_ioctl_description standard_event[] = {
299 [IWEVEXPIRED - IWEVFIRST] = { 344 [IWEVEXPIRED - IWEVFIRST] = {
300 .header_type = IW_HEADER_TYPE_ADDR, 345 .header_type = IW_HEADER_TYPE_ADDR,
301 }, 346 },
347 [IWEVGENIE - IWEVFIRST] = {
348 .header_type = IW_HEADER_TYPE_POINT,
349 .token_size = 1,
350 .max_tokens = IW_GENERIC_IE_MAX,
351 },
352 [IWEVMICHAELMICFAILURE - IWEVFIRST] = {
353 .header_type = IW_HEADER_TYPE_POINT,
354 .token_size = 1,
355 .max_tokens = sizeof(struct iw_michaelmicfailure),
356 },
357 [IWEVASSOCREQIE - IWEVFIRST] = {
358 .header_type = IW_HEADER_TYPE_POINT,
359 .token_size = 1,
360 .max_tokens = IW_GENERIC_IE_MAX,
361 },
362 [IWEVASSOCRESPIE - IWEVFIRST] = {
363 .header_type = IW_HEADER_TYPE_POINT,
364 .token_size = 1,
365 .max_tokens = IW_GENERIC_IE_MAX,
366 },
367 [IWEVPMKIDCAND - IWEVFIRST] = {
368 .header_type = IW_HEADER_TYPE_POINT,
369 .token_size = 1,
370 .max_tokens = sizeof(struct iw_pmkid_cand),
371 },
302}; 372};
303static const int standard_event_num = (sizeof(standard_event) / 373static const int standard_event_num = (sizeof(standard_event) /
304 sizeof(struct iw_ioctl_description)); 374 sizeof(struct iw_ioctl_description));
@@ -1032,6 +1102,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
1032 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); 1102 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
1033 r = NLMSG_DATA(nlh); 1103 r = NLMSG_DATA(nlh);
1034 r->ifi_family = AF_UNSPEC; 1104 r->ifi_family = AF_UNSPEC;
1105 r->__ifi_pad = 0;
1035 r->ifi_type = dev->type; 1106 r->ifi_type = dev->type;
1036 r->ifi_index = dev->ifindex; 1107 r->ifi_index = dev->ifindex;
1037 r->ifi_flags = dev->flags; 1108 r->ifi_flags = dev->flags;
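All of these new descriptor entries feed the same generic length checks in the ioctl dispatcher. Roughly, and as a hypothetical helper condensed from what the real wireless_process_ioctl() path does, a POINT request's length is compared against the descriptor's token bounds:

	static int check_point_args(const struct iw_ioctl_description *descr,
				    __u16 length)
	{
		if (length < descr->min_tokens)
			return -EINVAL;	/* e.g. a truncated struct iw_mlme */
		if (!(descr->flags & IW_DESCR_FLAG_NOMAX) &&
		    length > descr->max_tokens)
			return -EINVAL;	/* larger than the handler accepts */
		return 0;
	}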
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index 2101da542ba8..92f2ec46fd22 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -1,6 +1,29 @@
1# 1#
2# DECnet configuration 2# DECnet configuration
3# 3#
4config DECNET
5 tristate "DECnet Support"
6 ---help---
7 The DECnet networking protocol was used in many products made by
8 Digital (now Compaq). It provides reliable stream and sequenced
9 packet communications over which run a variety of services similar
10 to those which run over TCP/IP.
11
12 To find some tools to use with the kernel layer support, please
13 look at Patrick Caulfield's web site:
14 <http://linux-decnet.sourceforge.net/>.
15
16 More detailed documentation is available in
17 <file:Documentation/networking/decnet.txt>.
18
19 Be sure to say Y to "/proc file system support" and "Sysctl support"
20 below when using DECnet, since you will need sysctl support to aid
21 in configuration at run time.
22
23 The DECnet code is also available as a module ( = code which can be
24 inserted in and removed from the running kernel whenever you want).
25 The module is called decnet.
26
4config DECNET_ROUTER 27config DECNET_ROUTER
5 bool "DECnet: router support (EXPERIMENTAL)" 28 bool "DECnet: router support (EXPERIMENTAL)"
6 depends on DECNET && EXPERIMENTAL 29 depends on DECNET && EXPERIMENTAL
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 29bb3cd21965..acdd18e6adb2 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk)
536 * we are double checking that we are not sending too 536 * we are double checking that we are not sending too
537 * many of these keepalive frames. 537 * many of these keepalive frames.
538 */ 538 */
539 if (skb_queue_len(&scp->other_xmit_queue) == 0) 539 if (skb_queue_empty(&scp->other_xmit_queue))
540 dn_nsp_send_link(sk, DN_NOCHANGE, 0); 540 dn_nsp_send_link(sk, DN_NOCHANGE, 0);
541} 541}
542 542
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table
1191 struct dn_scp *scp = DN_SK(sk); 1191 struct dn_scp *scp = DN_SK(sk);
1192 int mask = datagram_poll(file, sock, wait); 1192 int mask = datagram_poll(file, sock, wait);
1193 1193
1194 if (skb_queue_len(&scp->other_receive_queue)) 1194 if (!skb_queue_empty(&scp->other_receive_queue))
1195 mask |= POLLRDBAND; 1195 mask |= POLLRDBAND;
1196 1196
1197 return mask; 1197 return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1214 1214
1215 case SIOCATMARK: 1215 case SIOCATMARK:
1216 lock_sock(sk); 1216 lock_sock(sk);
1217 val = (skb_queue_len(&scp->other_receive_queue) != 0); 1217 val = !skb_queue_empty(&scp->other_receive_queue);
1218 if (scp->state != DN_RUN) 1218 if (scp->state != DN_RUN)
1219 val = -ENOTCONN; 1219 val = -ENOTCONN;
1220 release_sock(sk); 1220 release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int
1630 int len = 0; 1630 int len = 0;
1631 1631
1632 if (flags & MSG_OOB) 1632 if (flags & MSG_OOB)
1633 return skb_queue_len(q) ? 1 : 0; 1633 return !skb_queue_empty(q) ? 1 : 0;
1634 1634
1635 while(skb != (struct sk_buff *)q) { 1635 while(skb != (struct sk_buff *)q) {
1636 struct dn_skb_cb *cb = DN_SKB_CB(skb); 1636 struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
1707 if (sk->sk_err) 1707 if (sk->sk_err)
1708 goto out; 1708 goto out;
1709 1709
1710 if (skb_queue_len(&scp->other_receive_queue)) { 1710 if (!skb_queue_empty(&scp->other_receive_queue)) {
1711 if (!(flags & MSG_OOB)) { 1711 if (!(flags & MSG_OOB)) {
1712 msg->msg_flags |= MSG_OOB; 1712 msg->msg_flags |= MSG_OOB;
1713 if (!scp->other_report) { 1713 if (!scp->other_report) {
@@ -1876,15 +1876,6 @@ static inline unsigned int dn_current_mss(struct sock *sk, int flags)
1876 return mss_now; 1876 return mss_now;
1877} 1877}
1878 1878
1879static int dn_error(struct sock *sk, int flags, int err)
1880{
1881 if (err == -EPIPE)
1882 err = sock_error(sk) ? : -EPIPE;
1883 if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
1884 send_sig(SIGPIPE, current, 0);
1885 return err;
1886}
1887
1888static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, 1879static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1889 struct msghdr *msg, size_t size) 1880 struct msghdr *msg, size_t size)
1890{ 1881{
@@ -2045,7 +2036,7 @@ out:
2045 return sent ? sent : err; 2036 return sent ? sent : err;
2046 2037
2047out_err: 2038out_err:
2048 err = dn_error(sk, flags, err); 2039 err = sk_stream_error(sk, flags, err);
2049 release_sock(sk); 2040 release_sock(sk);
2050 return err; 2041 return err;
2051} 2042}
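The af_decnet.c hunks replace every skb_queue_len() comparison with skb_queue_empty(). Beyond reading better, the emptiness test is a single pointer comparison against the list head (simplified from include/linux/skbuff.h of this era):

	static inline int skb_queue_empty(const struct sk_buff_head *list)
	{
		return list->next == (struct sk_buff *)list;
	}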
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index ee7bf46eb78a..00233ecbc9cb 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -716,13 +716,13 @@ static int dn_dev_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *a
716} 716}
717 717
718static int dn_dev_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, 718static int dn_dev_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
719 u32 pid, u32 seq, int event) 719 u32 pid, u32 seq, int event, unsigned int flags)
720{ 720{
721 struct ifaddrmsg *ifm; 721 struct ifaddrmsg *ifm;
722 struct nlmsghdr *nlh; 722 struct nlmsghdr *nlh;
723 unsigned char *b = skb->tail; 723 unsigned char *b = skb->tail;
724 724
725 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 725 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
726 ifm = NLMSG_DATA(nlh); 726 ifm = NLMSG_DATA(nlh);
727 727
728 ifm->ifa_family = AF_DECnet; 728 ifm->ifa_family = AF_DECnet;
@@ -755,7 +755,7 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa)
755 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS); 755 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS);
756 return; 756 return;
757 } 757 }
758 if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { 758 if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
759 kfree_skb(skb); 759 kfree_skb(skb);
760 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL); 760 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL);
761 return; 761 return;
@@ -790,7 +790,8 @@ static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
790 if (dn_dev_fill_ifaddr(skb, ifa, 790 if (dn_dev_fill_ifaddr(skb, ifa,
791 NETLINK_CB(cb->skb).pid, 791 NETLINK_CB(cb->skb).pid,
792 cb->nlh->nlmsg_seq, 792 cb->nlh->nlmsg_seq,
793 RTM_NEWADDR) <= 0) 793 RTM_NEWADDR,
794 NLM_F_MULTI) <= 0)
794 goto done; 795 goto done;
795 } 796 }
796 } 797 }
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720e4..99bc061759c3 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
551 if (t < s_t) 551 if (t < s_t)
552 continue; 552 continue;
553 if (t > s_t) 553 if (t > s_t)
554 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); 554 memset(&cb->args[1], 0,
555 sizeof(cb->args) - sizeof(cb->args[0]));
555 tb = dn_fib_get_table(t, 0); 556 tb = dn_fib_get_table(t, 0);
556 if (tb == NULL) 557 if (tb == NULL)
557 continue; 558 continue;
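The dn_fib_dump() fix matters on 64-bit: cb->args is an array of long, so subtracting sizeof(int) from sizeof(cb->args) cleared four bytes too many starting at &cb->args[1], running past the end of the array. Subtracting sizeof(cb->args[0]) makes the length exact whatever the element type. A toy demonstration:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		long args[5];

		printf("array %zu bytes; old length %zu overruns from &args[1], "
		       "new length %zu fits exactly\n",
		       sizeof(args),
		       sizeof(args) - sizeof(int),
		       sizeof(args) - sizeof(args[0]));
		memset(&args[1], 0, sizeof(args) - sizeof(args[0]));
		return 0;
	}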
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f6dfe96f45b7..8d0cc3cf3e49 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -101,7 +101,6 @@ struct neigh_table dn_neigh_table = {
101 .id = "dn_neigh_cache", 101 .id = "dn_neigh_cache",
102 .parms ={ 102 .parms ={
103 .tbl = &dn_neigh_table, 103 .tbl = &dn_neigh_table,
104 .entries = 0,
105 .base_reachable_time = 30 * HZ, 104 .base_reachable_time = 30 * HZ,
106 .retrans_time = 1 * HZ, 105 .retrans_time = 1 * HZ,
107 .gc_staletime = 60 * HZ, 106 .gc_staletime = 60 * HZ,
@@ -149,12 +148,12 @@ static int dn_neigh_construct(struct neighbour *neigh)
149 148
150 __neigh_parms_put(neigh->parms); 149 __neigh_parms_put(neigh->parms);
151 neigh->parms = neigh_parms_clone(parms); 150 neigh->parms = neigh_parms_clone(parms);
152 rcu_read_unlock();
153 151
154 if (dn_db->use_long) 152 if (dn_db->use_long)
155 neigh->ops = &dn_long_ops; 153 neigh->ops = &dn_long_ops;
156 else 154 else
157 neigh->ops = &dn_short_ops; 155 neigh->ops = &dn_short_ops;
156 rcu_read_unlock();
158 157
159 if (dn->flags & DN_NDFLAG_P3) 158 if (dn->flags & DN_NDFLAG_P3)
160 neigh->ops = &dn_phase3_ops; 159 neigh->ops = &dn_phase3_ops;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 42abbf3f524f..8cce1fdbda90 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
342 342
343 dn_nsp_output(sk); 343 dn_nsp_output(sk);
344 344
345 if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue)) 345 if (!skb_queue_empty(&scp->data_xmit_queue) ||
346 !skb_queue_empty(&scp->other_xmit_queue))
346 scp->persist = dn_nsp_persist(sk); 347 scp->persist = dn_nsp_persist(sk);
347 348
348 return 0; 349 return 0;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 1e7b5c3ea215..2399fa8a3f86 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1465,7 +1465,8 @@ int dn_route_input(struct sk_buff *skb)
1465 return dn_route_input_slow(skb); 1465 return dn_route_input_slow(skb);
1466} 1466}
1467 1467
1468static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) 1468static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
1469 int event, int nowait, unsigned int flags)
1469{ 1470{
1470 struct dn_route *rt = (struct dn_route *)skb->dst; 1471 struct dn_route *rt = (struct dn_route *)skb->dst;
1471 struct rtmsg *r; 1472 struct rtmsg *r;
@@ -1473,9 +1474,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int
1473 unsigned char *b = skb->tail; 1474 unsigned char *b = skb->tail;
1474 struct rta_cacheinfo ci; 1475 struct rta_cacheinfo ci;
1475 1476
1476 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); 1477 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
1477 r = NLMSG_DATA(nlh); 1478 r = NLMSG_DATA(nlh);
1478 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1479 r->rtm_family = AF_DECnet; 1479 r->rtm_family = AF_DECnet;
1480 r->rtm_dst_len = 16; 1480 r->rtm_dst_len = 16;
1481 r->rtm_src_len = 0; 1481 r->rtm_src_len = 0;
@@ -1596,7 +1596,7 @@ int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
1596 1596
1597 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; 1597 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1598 1598
1599 err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); 1599 err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
1600 1600
1601 if (err == 0) 1601 if (err == 0)
1602 goto out_free; 1602 goto out_free;
@@ -1644,7 +1644,8 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
1644 continue; 1644 continue;
1645 skb->dst = dst_clone(&rt->u.dst); 1645 skb->dst = dst_clone(&rt->u.dst);
1646 if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 1646 if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
1647 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { 1647 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1648 1, NLM_F_MULTI) <= 0) {
1648 dst_release(xchg(&skb->dst, NULL)); 1649 dst_release(xchg(&skb->dst, NULL));
1649 rcu_read_unlock_bh(); 1650 rcu_read_unlock_bh();
1650 goto done; 1651 goto done;
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 597587d170d8..1060de70bc0c 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -342,14 +342,15 @@ static struct notifier_block dn_fib_rules_notifier = {
342 .notifier_call = dn_fib_rules_event, 342 .notifier_call = dn_fib_rules_event,
343}; 343};
344 344
345static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r, struct netlink_callback *cb) 345static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r,
346 struct netlink_callback *cb, unsigned int flags)
346{ 347{
347 struct rtmsg *rtm; 348 struct rtmsg *rtm;
348 struct nlmsghdr *nlh; 349 struct nlmsghdr *nlh;
349 unsigned char *b = skb->tail; 350 unsigned char *b = skb->tail;
350 351
351 352
352 nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); 353 nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
353 rtm = NLMSG_DATA(nlh); 354 rtm = NLMSG_DATA(nlh);
354 rtm->rtm_family = AF_DECnet; 355 rtm->rtm_family = AF_DECnet;
355 rtm->rtm_dst_len = r->r_dst_len; 356 rtm->rtm_dst_len = r->r_dst_len;
@@ -394,7 +395,7 @@ int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
394 for(r = dn_fib_rules, idx = 0; r; r = r->r_next, idx++) { 395 for(r = dn_fib_rules, idx = 0; r; r = r->r_next, idx++) {
395 if (idx < s_idx) 396 if (idx < s_idx)
396 continue; 397 continue;
397 if (dn_fib_fill_rule(skb, r, cb) < 0) 398 if (dn_fib_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
398 break; 399 break;
399 } 400 }
400 read_unlock(&dn_fib_rules_lock); 401 read_unlock(&dn_fib_rules_lock);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index dad5603912be..28ba5777a25a 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -270,13 +270,13 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern
270 270
271static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 271static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
272 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, 272 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len,
273 struct dn_fib_info *fi) 273 struct dn_fib_info *fi, unsigned int flags)
274{ 274{
275 struct rtmsg *rtm; 275 struct rtmsg *rtm;
276 struct nlmsghdr *nlh; 276 struct nlmsghdr *nlh;
277 unsigned char *b = skb->tail; 277 unsigned char *b = skb->tail;
278 278
279 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); 279 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
280 rtm = NLMSG_DATA(nlh); 280 rtm = NLMSG_DATA(nlh);
281 rtm->rtm_family = AF_DECnet; 281 rtm->rtm_family = AF_DECnet;
282 rtm->rtm_dst_len = dst_len; 282 rtm->rtm_dst_len = dst_len;
@@ -345,7 +345,7 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
345 345
346 if (dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id, 346 if (dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id,
347 f->fn_type, f->fn_scope, &f->fn_key, z, 347 f->fn_type, f->fn_scope, &f->fn_key, z,
348 DN_FIB_INFO(f)) < 0) { 348 DN_FIB_INFO(f), 0) < 0) {
349 kfree_skb(skb); 349 kfree_skb(skb);
350 return; 350 return;
351 } 351 }
@@ -377,7 +377,7 @@ static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
377 tb->n, 377 tb->n,
378 (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type, 378 (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
379 f->fn_scope, &f->fn_key, dz->dz_order, 379 f->fn_scope, &f->fn_key, dz->dz_order,
380 f->fn_info) < 0) { 380 f->fn_info, NLM_F_MULTI) < 0) {
381 cb->args[3] = i; 381 cb->args[3] = i;
382 return -1; 382 return -1;
383 } 383 }
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
new file mode 100644
index 000000000000..39a2d2975e0e
--- /dev/null
+++ b/net/econet/Kconfig
@@ -0,0 +1,36 @@
1#
2# Acorn Econet/AUN protocols
3#
4
5config ECONET
6 tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
7 depends on EXPERIMENTAL && INET
8 ---help---
9 Econet is a fairly old and slow networking protocol mainly used by
10 Acorn computers to access file and print servers. It uses native
11 Econet network cards. AUN is an implementation of the higher level
12 parts of Econet that runs over ordinary Ethernet connections, on
13 top of the UDP packet protocol, which in turn runs on top of the
14 Internet protocol IP.
15
16 If you say Y here, you can choose with the next two options whether
17 to send Econet/AUN traffic over a UDP Ethernet connection or over
18 a native Econet network card.
19
20 To compile this driver as a module, choose M here: the module
21 will be called econet.
22
23config ECONET_AUNUDP
24 bool "AUN over UDP"
25 depends on ECONET
26 help
27 Say Y here if you want to send Econet/AUN traffic over a UDP
28 connection (UDP is a packet based protocol that runs on top of the
29 Internet protocol IP) using an ordinary Ethernet network card.
30
31config ECONET_NATIVE
32 bool "Native Econet"
33 depends on ECONET
34 help
35 Say Y here if you have a native Econet network card installed in
36 your computer.
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 6617ea47d365..f6dbfb99b14d 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -92,10 +92,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
92 * Set the source hardware address. 92 * Set the source hardware address.
93 */ 93 */
94 94
95 if(saddr) 95 if(!saddr)
96 memcpy(eth->h_source,saddr,dev->addr_len); 96 saddr = dev->dev_addr;
97 else 97 memcpy(eth->h_source,saddr,dev->addr_len);
98 memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
99 98
100 /* 99 /*
101 * Anyway, the loopback-device should never use this function... 100 * Anyway, the loopback-device should never use this function...
@@ -156,7 +155,7 @@ int eth_rebuild_header(struct sk_buff *skb)
156 * This is normal practice and works for any 'now in use' protocol. 155 * This is normal practice and works for any 'now in use' protocol.
157 */ 156 */
158 157
159unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev) 158__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
160{ 159{
161 struct ethhdr *eth; 160 struct ethhdr *eth;
162 unsigned char *rawp; 161 unsigned char *rawp;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d3e8b1bd1f2..0b3d9f1d8069 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config IP_MULTICAST 4config IP_MULTICAST
5 bool "IP: multicasting" 5 bool "IP: multicasting"
6 depends on INET
7 help 6 help
8 This is code for addressing several networked computers at once, 7 This is code for addressing several networked computers at once,
9 enlarging your kernel by about 2 KB. You need multicasting if you 8 enlarging your kernel by about 2 KB. You need multicasting if you
@@ -17,7 +16,6 @@ config IP_MULTICAST
17 16
18config IP_ADVANCED_ROUTER 17config IP_ADVANCED_ROUTER
19 bool "IP: advanced router" 18 bool "IP: advanced router"
20 depends on INET
21 ---help--- 19 ---help---
22 If you intend to run your Linux box mostly as a router, i.e. as a 20 If you intend to run your Linux box mostly as a router, i.e. as a
23 computer that forwards and redistributes network packets, say Y; you 21 computer that forwards and redistributes network packets, say Y; you
@@ -53,6 +51,40 @@ config IP_ADVANCED_ROUTER
53 51
54 If unsure, say N here. 52 If unsure, say N here.
55 53
54choice
55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
56 depends on IP_ADVANCED_ROUTER
57 default ASK_IP_FIB_HASH
58
59config ASK_IP_FIB_HASH
60 bool "FIB_HASH"
61 ---help---
62 Current FIB is very proven and good enough for most users.
63
64config IP_FIB_TRIE
65 bool "FIB_TRIE"
66 ---help---
 67 Use the new experimental LC-trie as the FIB lookup algorithm.
68 This improves lookup performance if you have a large
69 number of routes.
70
71 LC-trie is a longest matching prefix lookup algorithm which
72 performs better than FIB_HASH for large routing tables.
73 But, it consumes more memory and is more complex.
74
75 LC-trie is described in:
76
 77 Stefan Nilsson and Gunnar Karlsson. IP-address lookup using LC-tries.
 78 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999.
 79 Stefan Nilsson and Matti Tikkanen. An experimental study of compression
 80 methods for dynamic tries. Algorithmica, 33(1):19-33, 2002.
81 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
82
83endchoice
84
85config IP_FIB_HASH
86 def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
87
56config IP_MULTIPLE_TABLES 88config IP_MULTIPLE_TABLES
57 bool "IP: policy routing" 89 bool "IP: policy routing"
58 depends on IP_ADVANCED_ROUTER 90 depends on IP_ADVANCED_ROUTER
@@ -92,7 +124,7 @@ config IP_ROUTE_MULTIPATH
92 124
93config IP_ROUTE_MULTIPATH_CACHED 125config IP_ROUTE_MULTIPATH_CACHED
94 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)" 126 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
95 depends on: IP_ROUTE_MULTIPATH 127 depends on IP_ROUTE_MULTIPATH
96 help 128 help
97 Normally, equal cost multipath routing is not supported by the 129 Normally, equal cost multipath routing is not supported by the
98 routing cache. If you say Y here, alternative routes are cached 130 routing cache. If you say Y here, alternative routes are cached
@@ -145,7 +177,6 @@ config IP_ROUTE_VERBOSE
145 177
146config IP_PNP 178config IP_PNP
147 bool "IP: kernel level autoconfiguration" 179 bool "IP: kernel level autoconfiguration"
148 depends on INET
149 help 180 help
150 This enables automatic configuration of IP addresses of devices and 181 This enables automatic configuration of IP addresses of devices and
151 of the routing table during kernel boot, based on either information 182 of the routing table during kernel boot, based on either information
@@ -204,8 +235,6 @@ config IP_PNP_RARP
204# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 235# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
205config NET_IPIP 236config NET_IPIP
206 tristate "IP: tunneling" 237 tristate "IP: tunneling"
207 depends on INET
208 select INET_TUNNEL
209 ---help--- 238 ---help---
210 Tunneling means encapsulating data of one protocol type within 239 Tunneling means encapsulating data of one protocol type within
211 another protocol and sending it over a channel that understands the 240 another protocol and sending it over a channel that understands the
@@ -222,8 +251,6 @@ config NET_IPIP
222 251
223config NET_IPGRE 252config NET_IPGRE
224 tristate "IP: GRE tunnels over IP" 253 tristate "IP: GRE tunnels over IP"
225 depends on INET
226 select XFRM
227 help 254 help
228 Tunneling means encapsulating data of one protocol type within 255 Tunneling means encapsulating data of one protocol type within
229 another protocol and sending it over a channel that understands the 256 another protocol and sending it over a channel that understands the
@@ -281,7 +308,7 @@ config IP_PIMSM_V2
281 308
282config ARPD 309config ARPD
283 bool "IP: ARP daemon support (EXPERIMENTAL)" 310 bool "IP: ARP daemon support (EXPERIMENTAL)"
284 depends on INET && EXPERIMENTAL 311 depends on EXPERIMENTAL
285 ---help--- 312 ---help---
286 Normally, the kernel maintains an internal cache which maps IP 313 Normally, the kernel maintains an internal cache which maps IP
287 addresses to hardware addresses on the local network, so that 314 addresses to hardware addresses on the local network, so that
@@ -306,7 +333,6 @@ config ARPD
306 333
307config SYN_COOKIES 334config SYN_COOKIES
308 bool "IP: TCP syncookie support (disabled per default)" 335 bool "IP: TCP syncookie support (disabled per default)"
309 depends on INET
310 ---help--- 336 ---help---
311 Normal TCP/IP networking is open to an attack known as "SYN 337 Normal TCP/IP networking is open to an attack known as "SYN
312 flooding". This denial-of-service attack prevents legitimate remote 338 flooding". This denial-of-service attack prevents legitimate remote
@@ -343,7 +369,6 @@ config SYN_COOKIES
343 369
344config INET_AH 370config INET_AH
345 tristate "IP: AH transformation" 371 tristate "IP: AH transformation"
346 depends on INET
347 select XFRM 372 select XFRM
348 select CRYPTO 373 select CRYPTO
349 select CRYPTO_HMAC 374 select CRYPTO_HMAC
@@ -356,7 +381,6 @@ config INET_AH
356 381
357config INET_ESP 382config INET_ESP
358 tristate "IP: ESP transformation" 383 tristate "IP: ESP transformation"
359 depends on INET
360 select XFRM 384 select XFRM
361 select CRYPTO 385 select CRYPTO
362 select CRYPTO_HMAC 386 select CRYPTO_HMAC
@@ -370,7 +394,6 @@ config INET_ESP
370 394
371config INET_IPCOMP 395config INET_IPCOMP
372 tristate "IP: IPComp transformation" 396 tristate "IP: IPComp transformation"
373 depends on INET
374 select XFRM 397 select XFRM
375 select INET_TUNNEL 398 select INET_TUNNEL
376 select CRYPTO 399 select CRYPTO
@@ -383,7 +406,6 @@ config INET_IPCOMP
383 406
384config INET_TUNNEL 407config INET_TUNNEL
385 tristate "IP: tunnel transformation" 408 tristate "IP: tunnel transformation"
386 depends on INET
387 select XFRM 409 select XFRM
388 ---help--- 410 ---help---
389 Support for generic IP tunnel transformation, which is required by 411 Support for generic IP tunnel transformation, which is required by
@@ -393,7 +415,6 @@ config INET_TUNNEL
393 415
394config IP_TCPDIAG 416config IP_TCPDIAG
395 tristate "IP: TCP socket monitoring interface" 417 tristate "IP: TCP socket monitoring interface"
396 depends on INET
397 default y 418 default y
398 ---help--- 419 ---help---
399 Support for TCP socket monitoring interface used by native Linux 420 Support for TCP socket monitoring interface used by native Linux
@@ -407,5 +428,108 @@ config IP_TCPDIAG
407config IP_TCPDIAG_IPV6 428config IP_TCPDIAG_IPV6
408 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) 429 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
409 430
431config TCP_CONG_ADVANCED
432 bool "TCP: advanced congestion control"
433 ---help---
434 Support for selection of various TCP congestion control
435 modules.
436
437 Nearly all users can safely say no here, and a safe default
438 selection will be made (BIC-TCP with new Reno as a fallback).
439
440 If unsure, say N.
441
442# TCP Reno is builtin (required as fallback)
443menu "TCP congestion control"
444 depends on TCP_CONG_ADVANCED
445
446config TCP_CONG_BIC
447 tristate "Binary Increase Congestion (BIC) control"
448 default y
449 ---help---
450 BIC-TCP is a sender-side only change that ensures a linear RTT
451 fairness under large windows while offering both scalability and
452 bounded TCP-friendliness. The protocol combines two schemes
453 called additive increase and binary search increase. When the
454 congestion window is large, additive increase with a large
455 increment ensures linear RTT fairness as well as good
456 scalability. Under small congestion windows, binary search
457 increase provides TCP friendliness.
458 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
459
460config TCP_CONG_WESTWOOD
461 tristate "TCP Westwood+"
462 default m
463 ---help---
464 TCP Westwood+ is a sender-side only modification of the TCP Reno
465 protocol stack that optimizes the performance of TCP congestion
466 control. It is based on end-to-end bandwidth estimation to set
467 congestion window and slow start threshold after a congestion
468 episode. Using this estimation, TCP Westwood+ adaptively sets a
469 slow start threshold and a congestion window which takes into
470 account the bandwidth used at the time congestion is experienced.
471	  TCP Westwood+ significantly increases fairness with respect to TCP Reno in
472 wired networks and throughput over wireless links.
473
474config TCP_CONG_HTCP
475 tristate "H-TCP"
476 default m
477 ---help---
478	  H-TCP is a sender-side only modification of the TCP Reno
479 protocol stack that optimizes the performance of TCP
480 congestion control for high speed network links. It uses a
481	  mode switch to change the alpha and beta parameters of TCP Reno
482	  based on network conditions, in a way that stays fair to
483 other Reno and H-TCP flows.
484
485config TCP_CONG_HSTCP
486 tristate "High Speed TCP"
487 depends on EXPERIMENTAL
488 default n
489 ---help---
490 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
491 A modification to TCP's congestion control mechanism for use
492 with large congestion windows. A table indicates how much to
493 increase the congestion window by when an ACK is received.
494 For more detail see http://www.icir.org/floyd/hstcp.html
495
496config TCP_CONG_HYBLA
497 tristate "TCP-Hybla congestion control algorithm"
498 depends on EXPERIMENTAL
499 default n
500 ---help---
501 TCP-Hybla is a sender-side only change that eliminates penalization of
502	  long-RTT, large-bandwidth connections, such as when satellite legs are
503	  involved, especially when sharing a common bottleneck with normal
504 terrestrial connections.
505
506config TCP_CONG_VEGAS
507 tristate "TCP Vegas"
508 depends on EXPERIMENTAL
509 default n
510 ---help---
511 TCP Vegas is a sender-side only change to TCP that anticipates
512 the onset of congestion by estimating the bandwidth. TCP Vegas
513 adjusts the sending rate by modifying the congestion
514 window. TCP Vegas should provide less packet loss, but it is
515 not as aggressive as TCP Reno.
516
517config TCP_CONG_SCALABLE
518 tristate "Scalable TCP"
519 depends on EXPERIMENTAL
520 default n
521 ---help---
522 Scalable TCP is a sender-side only change to TCP which uses a
523 MIMD congestion control algorithm which has some nice scaling
524	  properties, though it is known to have fairness issues.
525 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
526
527endmenu
528
529config TCP_CONG_BIC
530 tristate
531 depends on !TCP_CONG_ADVANCED
532 default y
533
410source "net/ipv4/ipvs/Kconfig" 534source "net/ipv4/ipvs/Kconfig"
411 535
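[Editor's note: the congestion control menu above selects pluggable
modules built on the tcp_cong.o infrastructure this merge introduces.
Below is a hedged sketch of the shape of such a module; the demo_*
names are hypothetical, and the hook prototypes follow the 2.6.13-era
tcp_cong.c (they take a struct tcp_sock, unlike later kernels), so
treat this as an approximation rather than a definitive implementation.]

#include <linux/module.h>
#include <net/tcp.h>

/* Reno-style slow start threshold: halve cwnd, with a floor of 2 */
static u32 demo_ssthresh(struct tcp_sock *tp)
{
	return max(tp->snd_cwnd >> 1U, 2U);
}

/* Reno-style window growth: exponential in slow start,
 * roughly one segment per RTT in congestion avoidance
 */
static void demo_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
			    u32 in_flight, int good_ack)
{
	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tp->snd_cwnd++;
	else if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd++;
		tp->snd_cwnd_cnt = 0;
	} else
		tp->snd_cwnd_cnt++;
}

static struct tcp_congestion_ops demo_cong = {
	.name		= "demo",
	.owner		= THIS_MODULE,
	.ssthresh	= demo_ssthresh,
	.cong_avoid	= demo_cong_avoid,
};

static int __init demo_register(void)
{
	return tcp_register_congestion_control(&demo_cong);
}

static void __exit demo_unregister(void)
{
	tcp_unregister_congestion_control(&demo_cong);
}

module_init(demo_register);
module_exit(demo_unregister);
MODULE_LICENSE("GPL");

[Once loaded, such a module is chosen at runtime through the
net.ipv4.tcp_congestion_control sysctl added alongside this
infrastructure.]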
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b379627ebb6..55dc6cca1e7b 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -2,13 +2,16 @@
2# Makefile for the Linux TCP/IP (INET) layer. 2# Makefile for the Linux TCP/IP (INET) layer.
3# 3#
4 4
5obj-y := utils.o route.o inetpeer.o protocol.o \ 5obj-y := route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
9 tcp_minisocks.o tcp_cong.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 10 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o 11 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 12
13obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
14obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
12obj-$(CONFIG_PROC_FS) += proc.o 15obj-$(CONFIG_PROC_FS) += proc.o
13obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 16obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
14obj-$(CONFIG_IP_MROUTE) += ipmr.o 17obj-$(CONFIG_IP_MROUTE) += ipmr.o
@@ -28,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
28obj-$(CONFIG_IP_VS) += ipvs/ 31obj-$(CONFIG_IP_VS) += ipvs/
29obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 32obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
30obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 33obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
34obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
37obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
38obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
39obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
40obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
31 41
32obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 42obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
33 xfrm4_output.o 43 xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b3cb49ce5fad..163ae4068b5f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
1009static int ipv4_proc_init(void); 1009static int ipv4_proc_init(void);
1010extern void ipfrag_init(void); 1010extern void ipfrag_init(void);
1011 1011
1012/*
1013 * IP protocol layer initialiser
1014 */
1015
1016static struct packet_type ip_packet_type = {
1017 .type = __constant_htons(ETH_P_IP),
1018 .func = ip_rcv,
1019};
1020
1012static int __init inet_init(void) 1021static int __init inet_init(void)
1013{ 1022{
1014 struct sk_buff *dummy_skb; 1023 struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
1102 1111
1103 ipfrag_init(); 1112 ipfrag_init();
1104 1113
1114 dev_add_pack(&ip_packet_type);
1115
1105 rc = 0; 1116 rc = 0;
1106out: 1117out:
1107 return rc; 1118 return rc;
@@ -1119,6 +1130,10 @@ module_init(inet_init);
1119#ifdef CONFIG_PROC_FS 1130#ifdef CONFIG_PROC_FS
1120extern int fib_proc_init(void); 1131extern int fib_proc_init(void);
1121extern void fib_proc_exit(void); 1132extern void fib_proc_exit(void);
1133#ifdef CONFIG_IP_FIB_TRIE
1134extern int fib_stat_proc_init(void);
1135extern void fib_stat_proc_exit(void);
1136#endif
1122extern int ip_misc_proc_init(void); 1137extern int ip_misc_proc_init(void);
1123extern int raw_proc_init(void); 1138extern int raw_proc_init(void);
1124extern void raw_proc_exit(void); 1139extern void raw_proc_exit(void);
@@ -1139,11 +1154,19 @@ static int __init ipv4_proc_init(void)
1139 goto out_udp; 1154 goto out_udp;
1140 if (fib_proc_init()) 1155 if (fib_proc_init())
1141 goto out_fib; 1156 goto out_fib;
1157#ifdef CONFIG_IP_FIB_TRIE
1158 if (fib_stat_proc_init())
1159 goto out_fib_stat;
1160#endif
1142 if (ip_misc_proc_init()) 1161 if (ip_misc_proc_init())
1143 goto out_misc; 1162 goto out_misc;
1144out: 1163out:
1145 return rc; 1164 return rc;
1146out_misc: 1165out_misc:
1166#ifdef CONFIG_IP_FIB_TRIE
1167 fib_stat_proc_exit();
1168out_fib_stat:
1169#endif
1147 fib_proc_exit(); 1170 fib_proc_exit();
1148out_fib: 1171out_fib:
1149 udp4_proc_exit(); 1172 udp4_proc_exit();
@@ -1181,6 +1204,7 @@ EXPORT_SYMBOL(inet_stream_connect);
1181EXPORT_SYMBOL(inet_stream_ops); 1204EXPORT_SYMBOL(inet_stream_ops);
1182EXPORT_SYMBOL(inet_unregister_protosw); 1205EXPORT_SYMBOL(inet_unregister_protosw);
1183EXPORT_SYMBOL(net_statistics); 1206EXPORT_SYMBOL(net_statistics);
1207EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
1184 1208
1185#ifdef INET_REFCNT_DEBUG 1209#ifdef INET_REFCNT_DEBUG
1186EXPORT_SYMBOL(inet_sock_nr); 1210EXPORT_SYMBOL(inet_sock_nr);
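[Editor's note: dev_add_pack(), which the hunk above now calls from
inet_init() to register ip_packet_type, is the generic way to attach a
layer-3 receive handler. A hedged sketch of the pattern follows; the
demo_* names are hypothetical, and the handler prototype gained the
orig_dev argument during this same merge window, so check the exact
signature in your tree.]

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>

/* Called for every received frame of the registered type */
static int demo_rcv(struct sk_buff *skb, struct net_device *dev,
		    struct packet_type *pt, struct net_device *orig_dev)
{
	/* a real handler would parse skb->data here */
	kfree_skb(skb);
	return 0;
}

static struct packet_type demo_packet_type = {
	.type = __constant_htons(ETH_P_ALL),	/* tap all protocols */
	.func = demo_rcv,
};

/* dev_add_pack(&demo_packet_type) at init time,
 * dev_remove_pack(&demo_packet_type) on teardown
 */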
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 0e98f2235b6e..514c85b2631a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -200,7 +200,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
200 xfrm_state_put(x); 200 xfrm_state_put(x);
201} 201}
202 202
203static int ah_init_state(struct xfrm_state *x, void *args) 203static int ah_init_state(struct xfrm_state *x)
204{ 204{
205 struct ah_data *ahp = NULL; 205 struct ah_data *ahp = NULL;
206 struct xfrm_algo_desc *aalg_desc; 206 struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 478a30179a52..d8a10e3dd77d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1030,14 +1030,13 @@ static struct notifier_block ip_netdev_notifier = {
1030}; 1030};
1031 1031
1032static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, 1032static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1033 u32 pid, u32 seq, int event) 1033 u32 pid, u32 seq, int event, unsigned int flags)
1034{ 1034{
1035 struct ifaddrmsg *ifm; 1035 struct ifaddrmsg *ifm;
1036 struct nlmsghdr *nlh; 1036 struct nlmsghdr *nlh;
1037 unsigned char *b = skb->tail; 1037 unsigned char *b = skb->tail;
1038 1038
1039 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 1039 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
1040 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
1041 ifm = NLMSG_DATA(nlh); 1040 ifm = NLMSG_DATA(nlh);
1042 ifm->ifa_family = AF_INET; 1041 ifm->ifa_family = AF_INET;
1043 ifm->ifa_prefixlen = ifa->ifa_prefixlen; 1042 ifm->ifa_prefixlen = ifa->ifa_prefixlen;
@@ -1090,7 +1089,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1090 continue; 1089 continue;
1091 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, 1090 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
1092 cb->nlh->nlmsg_seq, 1091 cb->nlh->nlmsg_seq,
1093 RTM_NEWADDR) <= 0) { 1092 RTM_NEWADDR, NLM_F_MULTI) <= 0) {
1094 rcu_read_unlock(); 1093 rcu_read_unlock();
1095 goto done; 1094 goto done;
1096 } 1095 }
@@ -1113,7 +1112,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
1113 1112
1114 if (!skb) 1113 if (!skb)
1115 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); 1114 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
1116 else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { 1115 else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
1117 kfree_skb(skb); 1116 kfree_skb(skb);
1118 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); 1117 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
1119 } else { 1118 } else {
@@ -1472,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
1472 * by sysctl and we wouldn't want anyone to change it under our feet 1471 * by sysctl and we wouldn't want anyone to change it under our feet
1473 * (see SIOCSIFNAME). 1472 * (see SIOCSIFNAME).
1474 */ 1473 */
1475 dev_name = net_sysctl_strdup(dev_name); 1474 dev_name = kstrdup(dev_name, GFP_KERNEL);
1476 if (!dev_name) 1475 if (!dev_name)
1477 goto free; 1476 goto free;
1478 1477
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index eae84cc39d3f..ba57446d5d1f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -362,7 +362,7 @@ static void esp_destroy(struct xfrm_state *x)
362 kfree(esp); 362 kfree(esp);
363} 363}
364 364
365static int esp_init_state(struct xfrm_state *x, void *args) 365static int esp_init_state(struct xfrm_state *x)
366{ 366{
367 struct esp_data *esp = NULL; 367 struct esp_data *esp = NULL;
368 368
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 563e7d612706..cd8e45ab9580 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -516,6 +516,60 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
516#undef BRD1_OK 516#undef BRD1_OK
517} 517}
518 518
519static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
520{
521
522 struct fib_result res;
523 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
524 .fwmark = frn->fl_fwmark,
525 .tos = frn->fl_tos,
526 .scope = frn->fl_scope } } };
527 if (tb) {
528 local_bh_disable();
529
530 frn->tb_id = tb->tb_id;
531 frn->err = tb->tb_lookup(tb, &fl, &res);
532
533 if (!frn->err) {
534 frn->prefixlen = res.prefixlen;
535 frn->nh_sel = res.nh_sel;
536 frn->type = res.type;
537 frn->scope = res.scope;
538 }
539 local_bh_enable();
540 }
541}
542
543static void nl_fib_input(struct sock *sk, int len)
544{
545 struct sk_buff *skb = NULL;
546 struct nlmsghdr *nlh = NULL;
547 struct fib_result_nl *frn;
548 int err;
549 u32 pid;
550 struct fib_table *tb;
551
552 skb = skb_recv_datagram(sk, 0, 0, &err);
553 nlh = (struct nlmsghdr *)skb->data;
554
555 frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
556 tb = fib_get_table(frn->tb_id_in);
557
558 nl_fib_lookup(frn, tb);
559
560	pid = nlh->nlmsg_pid;	/* pid of the sending process */
561 NETLINK_CB(skb).groups = 0; /* not in mcast group */
562 NETLINK_CB(skb).pid = 0; /* from kernel */
563 NETLINK_CB(skb).dst_pid = pid;
564 NETLINK_CB(skb).dst_groups = 0; /* unicast */
565 netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
566}
567
568static void nl_fib_lookup_init(void)
569{
570 netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
571}
572
519static void fib_disable_ip(struct net_device *dev, int force) 573static void fib_disable_ip(struct net_device *dev, int force)
520{ 574{
521 if (fib_sync_down(0, dev, force)) 575 if (fib_sync_down(0, dev, force))
@@ -604,6 +658,7 @@ void __init ip_fib_init(void)
604 658
605 register_netdevice_notifier(&fib_netdev_notifier); 659 register_netdevice_notifier(&fib_netdev_notifier);
606 register_inetaddr_notifier(&fib_inetaddr_notifier); 660 register_inetaddr_notifier(&fib_inetaddr_notifier);
661 nl_fib_lookup_init();
607} 662}
608 663
609EXPORT_SYMBOL(inet_addr_type); 664EXPORT_SYMBOL(inet_addr_type);
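[Editor's note: the nl_fib_input()/nl_fib_lookup() pair added above
lets userspace query a FIB table directly over the new
NETLINK_FIB_LOOKUP socket. A hedged userspace sketch follows; the
fib_result_nl layout is hand-copied on the assumption that it matches
the kernel's <net/ip_fib.h>, which is not exported to userspace, so
verify the fields against your headers before relying on it.]

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/netlink.h>

#ifndef NETLINK_FIB_LOOKUP
#define NETLINK_FIB_LOOKUP 10		/* value in 2.6.13 netlink.h */
#endif

/* Assumed layout -- hand mirror of the kernel's struct fib_result_nl */
struct fib_result_nl {
	unsigned int	fl_addr;	/* input: destination address */
	unsigned int	fl_fwmark;
	unsigned char	fl_tos;
	unsigned char	fl_scope;
	unsigned char	tb_id_in;	/* input: table to search */
	unsigned char	tb_id;		/* output fields below */
	unsigned char	prefixlen;
	unsigned char	nh_sel;
	unsigned char	type;
	unsigned char	scope;
	int		err;
};

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct fib_result_nl frn;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_FIB_LOOKUP);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_pid = getpid();	/* kernel unicasts the reply here */
	req.frn.tb_id_in = 254;		/* RT_TABLE_MAIN */
	req.frn.fl_addr = inet_addr("10.0.0.1");

	send(fd, &req, sizeof(req), 0);
	recv(fd, &req, sizeof(req), 0);
	printf("err=%d type=%u prefixlen=%u\n",
	       req.frn.err, req.frn.type, req.frn.prefixlen);
	return 0;
}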
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 6506dcc01b46..b10d6bb5ef3d 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -703,7 +703,8 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
703 &f->fn_key, 703 &f->fn_key,
704 fz->fz_order, 704 fz->fz_order,
705 fa->fa_tos, 705 fa->fa_tos,
706 fa->fa_info) < 0) { 706 fa->fa_info,
707 NLM_F_MULTI) < 0) {
707 cb->args[3] = i; 708 cb->args[3] = i;
708 return -1; 709 return -1;
709 } 710 }
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index ac4485f75e97..b729d97cfa93 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -30,7 +30,8 @@ extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
30 struct kern_rta *rta, struct fib_info *fi); 30 struct kern_rta *rta, struct fib_info *fi);
31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
32 u8 tb_id, u8 type, u8 scope, void *dst, 32 u8 tb_id, u8 type, u8 scope, void *dst,
33 int dst_len, u8 tos, struct fib_info *fi); 33 int dst_len, u8 tos, struct fib_info *fi,
34 unsigned int);
34extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, 35extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
35 int z, int tb_id, 36 int z, int tb_id,
36 struct nlmsghdr *n, struct netlink_skb_parms *req); 37 struct nlmsghdr *n, struct netlink_skb_parms *req);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 39d0aadb9a2a..0b298bbc1518 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -367,13 +367,14 @@ static struct notifier_block fib_rules_notifier = {
367 367
368static __inline__ int inet_fill_rule(struct sk_buff *skb, 368static __inline__ int inet_fill_rule(struct sk_buff *skb,
369 struct fib_rule *r, 369 struct fib_rule *r,
370 struct netlink_callback *cb) 370 struct netlink_callback *cb,
371 unsigned int flags)
371{ 372{
372 struct rtmsg *rtm; 373 struct rtmsg *rtm;
373 struct nlmsghdr *nlh; 374 struct nlmsghdr *nlh;
374 unsigned char *b = skb->tail; 375 unsigned char *b = skb->tail;
375 376
376 nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); 377 nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
377 rtm = NLMSG_DATA(nlh); 378 rtm = NLMSG_DATA(nlh);
378 rtm->rtm_family = AF_INET; 379 rtm->rtm_family = AF_INET;
379 rtm->rtm_dst_len = r->r_dst_len; 380 rtm->rtm_dst_len = r->r_dst_len;
@@ -422,7 +423,7 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
422 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { 423 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
423 if (idx < s_idx) 424 if (idx < s_idx)
424 continue; 425 continue;
425 if (inet_fill_rule(skb, r, cb) < 0) 426 if (inet_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
426 break; 427 break;
427 } 428 }
428 read_unlock(&fib_rules_lock); 429 read_unlock(&fib_rules_lock);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 029362d66135..e278cb9d0075 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -276,7 +276,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
276 struct nlmsghdr *n, struct netlink_skb_parms *req) 276 struct nlmsghdr *n, struct netlink_skb_parms *req)
277{ 277{
278 struct sk_buff *skb; 278 struct sk_buff *skb;
279 u32 pid = req ? req->pid : 0; 279 u32 pid = req ? req->pid : n->nlmsg_pid;
280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); 280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
281 281
282 skb = alloc_skb(size, GFP_KERNEL); 282 skb = alloc_skb(size, GFP_KERNEL);
@@ -286,7 +286,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, 286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
287 fa->fa_type, fa->fa_scope, &key, z, 287 fa->fa_type, fa->fa_scope, &key, z,
288 fa->fa_tos, 288 fa->fa_tos,
289 fa->fa_info) < 0) { 289 fa->fa_info, 0) < 0) {
290 kfree_skb(skb); 290 kfree_skb(skb);
291 return; 291 return;
292 } 292 }
@@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash, 593 struct hlist_head *new_laddrhash,
594 unsigned int new_size) 594 unsigned int new_size)
595{ 595{
596 struct hlist_head *old_info_hash, *old_laddrhash;
596 unsigned int old_size = fib_hash_size; 597 unsigned int old_size = fib_hash_size;
597 unsigned int i; 598 unsigned int i, bytes;
598 599
599 write_lock(&fib_info_lock); 600 write_lock(&fib_info_lock);
601 old_info_hash = fib_info_hash;
602 old_laddrhash = fib_info_laddrhash;
600 fib_hash_size = new_size; 603 fib_hash_size = new_size;
601 604
602 for (i = 0; i < old_size; i++) { 605 for (i = 0; i < old_size; i++) {
@@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
636 fib_info_laddrhash = new_laddrhash; 639 fib_info_laddrhash = new_laddrhash;
637 640
638 write_unlock(&fib_info_lock); 641 write_unlock(&fib_info_lock);
642
643 bytes = old_size * sizeof(struct hlist_head *);
644 fib_hash_free(old_info_hash, bytes);
645 fib_hash_free(old_laddrhash, bytes);
639} 646}
640 647
641struct fib_info * 648struct fib_info *
@@ -932,13 +939,13 @@ u32 __fib_res_prefsrc(struct fib_result *res)
932int 939int
933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 940fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, 941 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
935 struct fib_info *fi) 942 struct fib_info *fi, unsigned int flags)
936{ 943{
937 struct rtmsg *rtm; 944 struct rtmsg *rtm;
938 struct nlmsghdr *nlh; 945 struct nlmsghdr *nlh;
939 unsigned char *b = skb->tail; 946 unsigned char *b = skb->tail;
940 947
941 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); 948 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
942 rtm = NLMSG_DATA(nlh); 949 rtm = NLMSG_DATA(nlh);
943 rtm->rtm_family = AF_INET; 950 rtm->rtm_family = AF_INET;
944 rtm->rtm_dst_len = dst_len; 951 rtm->rtm_dst_len = dst_len;
@@ -1035,7 +1042,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1035 } 1042 }
1036 1043
1037 nl->nlmsg_flags = NLM_F_REQUEST; 1044 nl->nlmsg_flags = NLM_F_REQUEST;
1038 nl->nlmsg_pid = 0; 1045 nl->nlmsg_pid = current->pid;
1039 nl->nlmsg_seq = 0; 1046 nl->nlmsg_seq = 0;
1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm)); 1047 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1041 if (cmd == SIOCDELRT) { 1048 if (cmd == SIOCDELRT) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 000000000000..a701405fab0b
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2612 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.325"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
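/* Editor's illustration, not part of the original patch:
 * MASK_PFX(0xc0a80a01, 16) == 0xc0a80000, i.e. it keeps the top
 * 16 bits (192.168/16). TKEY_GET_MASK(7, 3) == 0x01c00000, a mask
 * covering bits 7..9 counted from the most significant bit.
 */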
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \
98 ((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \
100 ((_node)->_parent & NODE_TYPE_MASK)
101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104
105struct node {
106 t_key key;
107 unsigned long _parent;
108};
109
110struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114};
115
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
122struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
140};
141#endif
142
143struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
150};
151
152struct trie {
153 struct node *trie;
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156#endif
157 int size;
158 unsigned int revision;
159};
160
161static int trie_debug = 0;
162
163static int tnode_full(struct tnode *tn, struct node *n);
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166static int tnode_child_length(struct tnode *tn);
167static struct node *resize(struct trie *t, struct tnode *tn);
168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
170static void tnode_free(struct tnode *tn);
171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173extern int fib_detect_death(struct fib_info *fi, int order,
174 struct fib_info **last_resort, int *last_idx, int *dflt);
175
176extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177 struct nlmsghdr *n, struct netlink_skb_parms *req);
178
179static kmem_cache_t *fn_alias_kmem;
180static struct trie *trie_local = NULL, *trie_main = NULL;
181
182static void trie_bug(char *err)
183{
184 printk("Trie Bug: %s\n", err);
185 BUG();
186}
187
188static inline struct node *tnode_get_child(struct tnode *tn, int i)
189{
190 if (i >= 1<<tn->bits)
191 trie_bug("tnode_get_child");
192
193 return tn->child[i];
194}
195
196static inline int tnode_child_length(struct tnode *tn)
197{
198 return 1<<tn->bits;
199}
200
201/*
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
206
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
211
212 tp->pos = 7
213 tp->bits = 3
214 n->pos = 15
215 n->bits=4
216 KEYLENGTH=32
217*/
218
219static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220{
221 if (offset < KEYLENGTH)
222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223 else
224 return 0;
225}
226
227static inline int tkey_equals(t_key a, t_key b)
228{
229 return a == b;
230}
231
232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233{
234 if (bits == 0 || offset >= KEYLENGTH)
235 return 1;
236 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
238}
239
240static inline int tkey_mismatch(t_key a, int offset, t_key b)
241{
242 t_key diff = a ^ b;
243 int i = offset;
244
245 if (!diff)
246 return 0;
247 while ((diff << i) >> (KEYLENGTH-1) == 0)
248 i++;
249 return i;
250}
251
252/* Candidate for fib_semantics */
253
254static void fn_free_alias(struct fib_alias *fa)
255{
256 fib_release_info(fa->fa_info);
257 kmem_cache_free(fn_alias_kmem, fa);
258}
259
260/*
261 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not
263 all of the bits in that key are significant.
264
265 Consider a node 'n' and its parent 'tp'.
266
267 If n is a leaf, every bit in its key is significant. Its presence is
268  necessitated by path compression, since during a tree traversal (when
269 searching for a leaf - unless we are doing an insertion) we will completely
270 ignore all skipped bits we encounter. Thus we need to verify, at the end of
271 a potentially successful search, that we have indeed been walking the
272 correct key path.
273
274  Note that we can never "miss" a key that is present in the tree by
275 following the wrong path. Path compression ensures that segments of the key
276 that are the same for all keys with a given prefix are skipped, but the
277 skipped part *is* identical for each node in the subtrie below the skipped
278 bit! trie_insert() in this implementation takes care of that - note the
279 call to tkey_sub_equals() in trie_insert().
280
281  If n is an internal node - a 'tnode' here - the various parts of its key
282 have many different meanings.
283
284 Example:
285 _________________________________________________________________
286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287 -----------------------------------------------------------------
288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
289
290 _________________________________________________________________
291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292 -----------------------------------------------------------------
293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
294
295 tp->pos = 7
296 tp->bits = 3
297 n->pos = 15
298 n->bits=4
299
300 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302 not use them for anything.
303
304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305 index into the parent's child array. That is, they will be used to find
306 'n' among tp's children.
307
308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309 for the node n.
310
311 All the bits we have seen so far are significant to the node n. The rest
312 of the bits are really not needed or indeed known in n->key.
313
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child.
316
317
318 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
319 at this point.
320
321*/
322
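/* Editor's illustration, not part of the original patch: for the
 * example above, both child-array indices fall straight out of
 * tkey_extract_bits():
 *
 *	tkey_extract_bits(n->key, tp->pos, tp->bits) -> bits 7..9 ("N"),
 *		the index of n in tp's child array
 *	tkey_extract_bits(key, n->pos, n->bits) -> bits 15..18 ("C"),
 *		the index of the next node in n's child array
 */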
323static void check_tnode(struct tnode *tn)
324{
325 if (tn && tn->pos+tn->bits > 32) {
326 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
327 }
328}
329
330static int halve_threshold = 25;
331static int inflate_threshold = 50;
332
333static struct leaf *leaf_new(void)
334{
335 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
336 if (l) {
337 NODE_INIT_PARENT(l, T_LEAF);
338 INIT_HLIST_HEAD(&l->list);
339 }
340 return l;
341}
342
343static struct leaf_info *leaf_info_new(int plen)
344{
345 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
346 if (li) {
347 li->plen = plen;
348 INIT_LIST_HEAD(&li->falh);
349 }
350 return li;
351}
352
353static inline void free_leaf(struct leaf *l)
354{
355 kfree(l);
356}
357
358static inline void free_leaf_info(struct leaf_info *li)
359{
360 kfree(li);
361}
362
363static struct tnode *tnode_alloc(unsigned int size)
364{
365 if (size <= PAGE_SIZE) {
366 return kmalloc(size, GFP_KERNEL);
367 } else {
368 return (struct tnode *)
369 __get_free_pages(GFP_KERNEL, get_order(size));
370 }
371}
372
373static void __tnode_free(struct tnode *tn)
374{
375 unsigned int size = sizeof(struct tnode) +
376 (1<<tn->bits) * sizeof(struct node *);
377
378 if (size <= PAGE_SIZE)
379 kfree(tn);
380 else
381 free_pages((unsigned long)tn, get_order(size));
382}
383
384static struct tnode* tnode_new(t_key key, int pos, int bits)
385{
386 int nchildren = 1<<bits;
387 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
388 struct tnode *tn = tnode_alloc(sz);
389
390 if (tn) {
391 memset(tn, 0, sz);
392 NODE_INIT_PARENT(tn, T_TNODE);
393 tn->pos = pos;
394 tn->bits = bits;
395 tn->key = key;
396 tn->full_children = 0;
397 tn->empty_children = 1<<bits;
398 }
399
400 if (trie_debug > 0)
401 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
402 (unsigned int) (sizeof(struct node) * 1<<bits));
403 return tn;
404}
405
406static void tnode_free(struct tnode *tn)
407{
408 if (!tn) {
409 trie_bug("tnode_free\n");
410 }
411 if (IS_LEAF(tn)) {
412 free_leaf((struct leaf *)tn);
413 if (trie_debug > 0 )
414 printk("FL %p \n", tn);
415 }
416 else if (IS_TNODE(tn)) {
417 __tnode_free(tn);
418 if (trie_debug > 0 )
419 printk("FT %p \n", tn);
420 }
421 else {
422 trie_bug("tnode_free\n");
423 }
424}
425
426/*
427 * Check whether a tnode 'n' is "full", i.e. it is an internal node
428 * and no bits are skipped. See discussion in dyntree paper p. 6
429 */
430
431static inline int tnode_full(struct tnode *tn, struct node *n)
432{
433 if (n == NULL || IS_LEAF(n))
434 return 0;
435
436 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
437}
438
439static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
440{
441 tnode_put_child_reorg(tn, i, n, -1);
442}
443
444 /*
445 * Add a child at position i overwriting the old value.
446 * Update the value of full_children and empty_children.
447 */
448
449static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
450{
451 struct node *chi;
452 int isfull;
453
454 if (i >= 1<<tn->bits) {
455 printk("bits=%d, i=%d\n", tn->bits, i);
456 trie_bug("tnode_put_child_reorg bits");
457 }
458 write_lock_bh(&fib_lock);
459 chi = tn->child[i];
460
461 /* update emptyChildren */
462 if (n == NULL && chi != NULL)
463 tn->empty_children++;
464 else if (n != NULL && chi == NULL)
465 tn->empty_children--;
466
467 /* update fullChildren */
468 if (wasfull == -1)
469 wasfull = tnode_full(tn, chi);
470
471 isfull = tnode_full(tn, n);
472 if (wasfull && !isfull)
473 tn->full_children--;
474
475 else if (!wasfull && isfull)
476 tn->full_children++;
477 if (n)
478 NODE_SET_PARENT(n, tn);
479
480 tn->child[i] = n;
481 write_unlock_bh(&fib_lock);
482}
483
484static struct node *resize(struct trie *t, struct tnode *tn)
485{
486 int i;
487 int err = 0;
488
489 if (!tn)
490 return NULL;
491
492 if (trie_debug)
493 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
494 tn, inflate_threshold, halve_threshold);
495
496 /* No children */
497 if (tn->empty_children == tnode_child_length(tn)) {
498 tnode_free(tn);
499 return NULL;
500 }
501 /* One child */
502 if (tn->empty_children == tnode_child_length(tn) - 1)
503 for (i = 0; i < tnode_child_length(tn); i++) {
504
505 write_lock_bh(&fib_lock);
506 if (tn->child[i] != NULL) {
507
508 /* compress one level */
509 struct node *n = tn->child[i];
510 if (n)
511 NODE_INIT_PARENT(n, NODE_TYPE(n));
512
513 write_unlock_bh(&fib_lock);
514 tnode_free(tn);
515 return n;
516 }
517 write_unlock_bh(&fib_lock);
518 }
519 /*
520	 * Double as long as the resulting node has a fraction of
521	 * nonempty children that is above the threshold.
522 */
523
524 /*
525 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
526 * the Helsinki University of Technology and Matti Tikkanen of Nokia
527 * Telecommunications, page 6:
528 * "A node is doubled if the ratio of non-empty children to all
529 * children in the *doubled* node is at least 'high'."
530 *
531 * 'high' in this instance is the variable 'inflate_threshold'. It
532 * is expressed as a percentage, so we multiply it with
533 * tnode_child_length() and instead of multiplying by 2 (since the
534 * child array will be doubled by inflate()) and multiplying
535 * the left-hand side by 100 (to handle the percentage thing) we
536 * multiply the left-hand side by 50.
537 *
538 * The left-hand side may look a bit weird: tnode_child_length(tn)
539 * - tn->empty_children is of course the number of non-null children
540 * in the current node. tn->full_children is the number of "full"
541 * children, that is non-null tnodes with a skip value of 0.
542 * All of those will be doubled in the resulting inflated tnode, so
543 * we just count them one extra time here.
544 *
545 * A clearer way to write this would be:
546 *
547 * to_be_doubled = tn->full_children;
548 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
549 * tn->full_children;
550 *
551 * new_child_length = tnode_child_length(tn) * 2;
552 *
553 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
554 * new_child_length;
555 * if (new_fill_factor >= inflate_threshold)
556 *
557	 * ...and so on, though it would mess up the while () loop.
558 *
559 * anyway,
560 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
561 * inflate_threshold
562 *
563 * avoid a division:
564 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
565 * inflate_threshold * new_child_length
566 *
567 * expand not_to_be_doubled and to_be_doubled, and shorten:
568 * 100 * (tnode_child_length(tn) - tn->empty_children +
569 * tn->full_children ) >= inflate_threshold * new_child_length
570 *
571 * expand new_child_length:
572 * 100 * (tnode_child_length(tn) - tn->empty_children +
573 * tn->full_children ) >=
574 * inflate_threshold * tnode_child_length(tn) * 2
575 *
576 * shorten again:
577 * 50 * (tn->full_children + tnode_child_length(tn) -
578 * tn->empty_children ) >= inflate_threshold *
579 * tnode_child_length(tn)
580 *
581 */
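	/* Editor's illustration, not part of the original patch: with 16
	 * slots of which 4 are empty and 6 are full, the doubled node
	 * would be 100*(6 + 2*6)/32 = 56% non-empty, so at the default
	 * inflate_threshold of 50 this node is inflated; equivalently,
	 * 50 * (6 + 16 - 4) = 900 >= 50 * 16 = 800.
	 */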
582
583 check_tnode(tn);
584
585 err = 0;
586 while ((tn->full_children > 0 &&
587 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
588 inflate_threshold * tnode_child_length(tn))) {
589
590 tn = inflate(t, tn, &err);
591
592 if (err) {
593#ifdef CONFIG_IP_FIB_TRIE_STATS
594 t->stats.resize_node_skipped++;
595#endif
596 break;
597 }
598 }
599
600 check_tnode(tn);
601
602 /*
603 * Halve as long as the number of empty children in this
604	 * node is above the threshold.
605 */
606
607 err = 0;
608 while (tn->bits > 1 &&
609 100 * (tnode_child_length(tn) - tn->empty_children) <
610 halve_threshold * tnode_child_length(tn)) {
611
612 tn = halve(t, tn, &err);
613
614 if (err) {
615#ifdef CONFIG_IP_FIB_TRIE_STATS
616 t->stats.resize_node_skipped++;
617#endif
618 break;
619 }
620 }
621
622
623 /* Only one child remains */
624
625 if (tn->empty_children == tnode_child_length(tn) - 1)
626 for (i = 0; i < tnode_child_length(tn); i++) {
627
628 write_lock_bh(&fib_lock);
629 if (tn->child[i] != NULL) {
630 /* compress one level */
631 struct node *n = tn->child[i];
632
633 if (n)
634 NODE_INIT_PARENT(n, NODE_TYPE(n));
635
636 write_unlock_bh(&fib_lock);
637 tnode_free(tn);
638 return n;
639 }
640 write_unlock_bh(&fib_lock);
641 }
642
643 return (struct node *) tn;
644}
645
646static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
647{
648 struct tnode *inode;
649 struct tnode *oldtnode = tn;
650 int olen = tnode_child_length(tn);
651 int i;
652
653 if (trie_debug)
654 printk("In inflate\n");
655
656 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
657
658 if (!tn) {
659 *err = -ENOMEM;
660 return oldtnode;
661 }
662
663 /*
664 * Preallocate and store tnodes before the actual work so we
665 * don't get into an inconsistent state if memory allocation
666	 * fails. On failure we return the old node and the inflate
667	 * operation is abandoned.
668 */
669
670 for(i = 0; i < olen; i++) {
671 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
672
673 if (inode &&
674 IS_TNODE(inode) &&
675 inode->pos == oldtnode->pos + oldtnode->bits &&
676 inode->bits > 1) {
677 struct tnode *left, *right;
678
679 t_key m = TKEY_GET_MASK(inode->pos, 1);
680
681 left = tnode_new(inode->key&(~m), inode->pos + 1,
682 inode->bits - 1);
683
684 if (!left) {
685 *err = -ENOMEM;
686 break;
687 }
688
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
692 if (!right) {
693 *err = -ENOMEM;
694 break;
695 }
696
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
699 }
700 }
701
702 if (*err) {
703 int size = tnode_child_length(tn);
704 int j;
705
706 for(j = 0; j < size; j++)
707 if (tn->child[j])
708 tnode_free((struct tnode *)tn->child[j]);
709
710 tnode_free(tn);
711
712 *err = -ENOMEM;
713 return oldtnode;
714 }
715
716 for(i = 0; i < olen; i++) {
717 struct node *node = tnode_get_child(oldtnode, i);
718
719 /* An empty child */
720 if (node == NULL)
721 continue;
722
723 /* A leaf or an internal node with skipped bits */
724
725 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
726 tn->pos + tn->bits - 1) {
727 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
728 1) == 0)
729 put_child(t, tn, 2*i, node);
730 else
731 put_child(t, tn, 2*i+1, node);
732 continue;
733 }
734
735 /* An internal node with two children */
736 inode = (struct tnode *) node;
737
738 if (inode->bits == 1) {
739 put_child(t, tn, 2*i, inode->child[0]);
740 put_child(t, tn, 2*i+1, inode->child[1]);
741
742 tnode_free(inode);
743 }
744
745 /* An internal node with more than two children */
746 else {
747 struct tnode *left, *right;
748 int size, j;
749
750 /* We will replace this node 'inode' with two new
751 * ones, 'left' and 'right', each with half of the
752 * original children. The two new nodes will have
753 * a position one bit further down the key and this
754 * means that the "significant" part of their keys
755 * (see the discussion near the top of this file)
756 * will differ by one bit, which will be "0" in
757 * left's key and "1" in right's key. Since we are
758 * moving the key position by one step, the bit that
759 * we are moving away from - the bit at position
760 * (inode->pos) - is the one that will differ between
761 * left and right. So... we synthesize that bit in the
762 * two new keys.
763 * The mask 'm' below will be a single "one" bit at
764 * the position (inode->pos)
765 */
766
767 /* Use the old key, but set the new significant
768 * bit to zero.
769 */
770
771 left = (struct tnode *) tnode_get_child(tn, 2*i);
772 put_child(t, tn, 2*i, NULL);
773
774 if (!left)
775 BUG();
776
777 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
778 put_child(t, tn, 2*i+1, NULL);
779
780 if (!right)
781 BUG();
782
783 size = tnode_child_length(left);
784 for(j = 0; j < size; j++) {
785 put_child(t, left, j, inode->child[j]);
786 put_child(t, right, j, inode->child[j + size]);
787 }
788 put_child(t, tn, 2*i, resize(t, left));
789 put_child(t, tn, 2*i+1, resize(t, right));
790
791 tnode_free(inode);
792 }
793 }
794 tnode_free(oldtnode);
795 return tn;
796}
797
798static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
799{
800 struct tnode *oldtnode = tn;
801 struct node *left, *right;
802 int i;
803 int olen = tnode_child_length(tn);
804
805 if (trie_debug) printk("In halve\n");
806
807 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808
809 if (!tn) {
810 *err = -ENOMEM;
811 return oldtnode;
812 }
813
814 /*
815 * Preallocate and store tnodes before the actual work so we
816 * don't get into an inconsistent state if memory allocation
817	 * fails. On failure we return the old node and the halve
818	 * operation is abandoned.
819 */
820
821 for(i = 0; i < olen; i += 2) {
822 left = tnode_get_child(oldtnode, i);
823 right = tnode_get_child(oldtnode, i+1);
824
825 /* Two nonempty children */
826 if (left && right) {
827 struct tnode *newBinNode =
828 tnode_new(left->key, tn->pos + tn->bits, 1);
829
830 if (!newBinNode) {
831 *err = -ENOMEM;
832 break;
833 }
834 put_child(t, tn, i/2, (struct node *)newBinNode);
835 }
836 }
837
838 if (*err) {
839 int size = tnode_child_length(tn);
840 int j;
841
842 for(j = 0; j < size; j++)
843 if (tn->child[j])
844 tnode_free((struct tnode *)tn->child[j]);
845
846 tnode_free(tn);
847
848 *err = -ENOMEM;
849 return oldtnode;
850 }
851
852 for(i = 0; i < olen; i += 2) {
853 left = tnode_get_child(oldtnode, i);
854 right = tnode_get_child(oldtnode, i+1);
855
856 /* At least one of the children is empty */
857 if (left == NULL) {
858 if (right == NULL) /* Both are empty */
859 continue;
860 put_child(t, tn, i/2, right);
861 } else if (right == NULL)
862 put_child(t, tn, i/2, left);
863
864 /* Two nonempty children */
865 else {
866 struct tnode *newBinNode =
867 (struct tnode *) tnode_get_child(tn, i/2);
868 put_child(t, tn, i/2, NULL);
869
870 if (!newBinNode)
871 BUG();
872
873 put_child(t, newBinNode, 0, left);
874 put_child(t, newBinNode, 1, right);
875 put_child(t, tn, i/2, resize(t, newBinNode));
876 }
877 }
878 tnode_free(oldtnode);
879 return tn;
880}
881
882static void *trie_init(struct trie *t)
883{
884 if (t) {
885 t->size = 0;
886 t->trie = NULL;
887 t->revision = 0;
888#ifdef CONFIG_IP_FIB_TRIE_STATS
889 memset(&t->stats, 0, sizeof(struct trie_use_stats));
890#endif
891 }
892 return t;
893}
894
895static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
896{
897 struct hlist_node *node;
898 struct leaf_info *li;
899
900 hlist_for_each_entry(li, node, head, hlist) {
901 if (li->plen == plen)
902 return li;
903 }
904 return NULL;
905}
906
907static inline struct list_head * get_fa_head(struct leaf *l, int plen)
908{
909 struct list_head *fa_head = NULL;
910 struct leaf_info *li = find_leaf_info(&l->list, plen);
911
912 if (li)
913 fa_head = &li->falh;
914
915 return fa_head;
916}
917
918static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
919{
920 struct leaf_info *li = NULL, *last = NULL;
921 struct hlist_node *node, *tmp;
922
923 write_lock_bh(&fib_lock);
924
925 if (hlist_empty(head))
926 hlist_add_head(&new->hlist, head);
927 else {
928 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
929
930 if (new->plen > li->plen)
931 break;
932
933 last = li;
934 }
935 if (last)
936 hlist_add_after(&last->hlist, &new->hlist);
937 else
938 hlist_add_before(&new->hlist, &li->hlist);
939 }
940 write_unlock_bh(&fib_lock);
941}
942
943static struct leaf *
944fib_find_node(struct trie *t, u32 key)
945{
946 int pos;
947 struct tnode *tn;
948 struct node *n;
949
950 pos = 0;
951 n = t->trie;
952
953 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
954 tn = (struct tnode *) n;
955
956 check_tnode(tn);
957
958 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
959 pos=tn->pos + tn->bits;
960 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
961 }
962 else
963 break;
964 }
965 /* Case we have found a leaf. Compare prefixes */
966
967 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
968 struct leaf *l = (struct leaf *) n;
969 return l;
970 }
971 return NULL;
972}
973
974static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
975{
976 int i = 0;
977 int wasfull;
978 t_key cindex, key;
979 struct tnode *tp = NULL;
980
981 if (!tn)
982 BUG();
983
984 key = tn->key;
985 i = 0;
986
987 while (tn != NULL && NODE_PARENT(tn) != NULL) {
988
989 if (i > 10) {
990 printk("Rebalance tn=%p \n", tn);
991 if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
992
993 printk("Rebalance tp=%p \n", tp);
994 if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
995 }
996
997 if (i > 12) BUG();
998 i++;
999
1000 tp = NODE_PARENT(tn);
1001 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1002 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1003 tn = (struct tnode *) resize (t, (struct tnode *)tn);
1004 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1005
1006 if (!NODE_PARENT(tn))
1007 break;
1008
1009 tn = NODE_PARENT(tn);
1010 }
1011 /* Handle last (top) tnode */
1012 if (IS_TNODE(tn))
1013 tn = (struct tnode*) resize(t, (struct tnode *)tn);
1014
1015 return (struct node*) tn;
1016}
1017
1018static struct list_head *
1019fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1020{
1021 int pos, newpos;
1022 struct tnode *tp = NULL, *tn = NULL;
1023 struct node *n;
1024 struct leaf *l;
1025 int missbit;
1026 struct list_head *fa_head = NULL;
1027 struct leaf_info *li;
1028 t_key cindex;
1029
1030 pos = 0;
1031 n = t->trie;
1032
1033 /* If we point to NULL, stop. Either the tree is empty and we should
1034	 * just put a new leaf in it, or we have reached an empty child slot,
1035 * and we should just put our new leaf in that.
1036 * If we point to a T_TNODE, check if it matches our key. Note that
1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 * not be the parent's 'pos'+'bits'!
1039 *
1040 * If it does match the current key, get pos/bits from it, extract
1041 * the index from our key, push the T_TNODE and walk the tree.
1042 *
1043 * If it doesn't, we have to replace it with a new T_TNODE.
1044 *
1045 * If we point to a T_LEAF, it might or might not have the same key
1046	 * as we do. If it does, just update the T_LEAF's value and
1047	 * return it.
1048 * If it doesn't, we need to replace it with a T_TNODE.
1049 */
1050
1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1052 tn = (struct tnode *) n;
1053
1054 check_tnode(tn);
1055
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057 tp = tn;
1058 pos=tn->pos + tn->bits;
1059 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1060
1061 if (n && NODE_PARENT(n) != tn) {
1062 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1063 BUG();
1064 }
1065 }
1066 else
1067 break;
1068 }
1069
1070 /*
1071 * n ----> NULL, LEAF or TNODE
1072 *
1073 * tp is n's (parent) ----> NULL or TNODE
1074 */
1075
1076 if (tp && IS_LEAF(tp))
1077 BUG();
1078
1079
1080 /* Case 1: n is a leaf. Compare prefixes */
1081
1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1083 struct leaf *l = ( struct leaf *) n;
1084
1085 li = leaf_info_new(plen);
1086
1087 if (!li) {
1088 *err = -ENOMEM;
1089 goto err;
1090 }
1091
1092 fa_head = &li->falh;
1093 insert_leaf_info(&l->list, li);
1094 goto done;
1095 }
1096 t->size++;
1097 l = leaf_new();
1098
1099 if (!l) {
1100 *err = -ENOMEM;
1101 goto err;
1102 }
1103
1104 l->key = key;
1105 li = leaf_info_new(plen);
1106
1107 if (!li) {
1108 tnode_free((struct tnode *) l);
1109 *err = -ENOMEM;
1110 goto err;
1111 }
1112
1113 fa_head = &li->falh;
1114 insert_leaf_info(&l->list, li);
1115
1116 /* Case 2: n is NULL, and will just insert a new leaf */
1117 if (t->trie && n == NULL) {
1118
1119 NODE_SET_PARENT(l, tp);
1120
1121 if (!tp)
1122 BUG();
1123
1124 else {
1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1126 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1127 }
1128 }
1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130 else {
1131 /*
1132 * Add a new tnode here
1133 * first tnode need some special handling
1134 */
1135
1136 if (tp)
1137 pos=tp->pos+tp->bits;
1138 else
1139 pos=0;
1140 if (n) {
1141 newpos = tkey_mismatch(key, pos, n->key);
1142 tn = tnode_new(n->key, newpos, 1);
1143 }
1144 else {
1145 newpos = 0;
1146 tn = tnode_new(key, newpos, 1); /* First tnode */
1147 }
1148
1149 if (!tn) {
1150 free_leaf_info(li);
1151 tnode_free((struct tnode *) l);
1152 *err = -ENOMEM;
1153 goto err;
1154 }
1155
1156 NODE_SET_PARENT(tn, tp);
1157
1158 missbit=tkey_extract_bits(key, newpos, 1);
1159 put_child(t, tn, missbit, (struct node *)l);
1160 put_child(t, tn, 1-missbit, n);
1161
1162 if (tp) {
1163 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1164 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1165 }
1166 else {
1167 t->trie = (struct node*) tn; /* First tnode */
1168 tp = tn;
1169 }
1170 }
1171 if (tp && tp->pos+tp->bits > 32) {
1172 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1173 tp, tp->pos, tp->bits, key, plen);
1174 }
1175 /* Rebalance the trie */
1176 t->trie = trie_rebalance(t, tp);
1177done:
1178 t->revision++;
1179err:;
1180 return fa_head;
1181}
1182
1183static int
1184fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1185 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1186{
1187 struct trie *t = (struct trie *) tb->tb_data;
1188 struct fib_alias *fa, *new_fa;
1189 struct list_head *fa_head = NULL;
1190 struct fib_info *fi;
1191 int plen = r->rtm_dst_len;
1192 int type = r->rtm_type;
1193 u8 tos = r->rtm_tos;
1194 u32 key, mask;
1195 int err;
1196 struct leaf *l;
1197
1198 if (plen > 32)
1199 return -EINVAL;
1200
1201 key = 0;
1202 if (rta->rta_dst)
1203 memcpy(&key, rta->rta_dst, 4);
1204
1205 key = ntohl(key);
1206
1207 if (trie_debug)
1208 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1209
1210 mask = ntohl( inet_make_mask(plen) );
1211
1212 if (key & ~mask)
1213 return -EINVAL;
1214
1215 key = key & mask;
1216
1217 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1218 goto err;
1219
1220 l = fib_find_node(t, key);
1221 fa = NULL;
1222
1223 if (l) {
1224 fa_head = get_fa_head(l, plen);
1225 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1226 }
1227
1228 /* Now fa, if non-NULL, points to the first fib alias
1229 * with the same keys [prefix,tos,priority], if such key already
1230 * exists or to the node before which we will insert new one.
1231 *
1232 * If fa is NULL, we will need to allocate a new one and
1233 * insert to the head of f.
1234 *
1235 * If f is NULL, no fib node matched the destination key
1236 * and we need to allocate a new one of those as well.
1237 */
1238
1239 if (fa &&
1240 fa->fa_info->fib_priority == fi->fib_priority) {
1241 struct fib_alias *fa_orig;
1242
1243 err = -EEXIST;
1244 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1245 goto out;
1246
1247 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1248 struct fib_info *fi_drop;
1249 u8 state;
1250
1251 write_lock_bh(&fib_lock);
1252
1253 fi_drop = fa->fa_info;
1254 fa->fa_info = fi;
1255 fa->fa_type = type;
1256 fa->fa_scope = r->rtm_scope;
1257 state = fa->fa_state;
1258 fa->fa_state &= ~FA_S_ACCESSED;
1259
1260 write_unlock_bh(&fib_lock);
1261
1262 fib_release_info(fi_drop);
1263 if (state & FA_S_ACCESSED)
1264 rt_cache_flush(-1);
1265
1266 goto succeeded;
1267 }
1268 /* Error if we find a perfect match which
1269 * uses the same scope, type, and nexthop
1270 * information.
1271 */
1272 fa_orig = fa;
1273 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1274 if (fa->fa_tos != tos)
1275 break;
1276 if (fa->fa_info->fib_priority != fi->fib_priority)
1277 break;
1278 if (fa->fa_type == type &&
1279 fa->fa_scope == r->rtm_scope &&
1280 fa->fa_info == fi) {
1281 goto out;
1282 }
1283 }
1284 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1285 fa = fa_orig;
1286 }
1287 err = -ENOENT;
1288 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1289 goto out;
1290
1291 err = -ENOBUFS;
1292 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1293 if (new_fa == NULL)
1294 goto out;
1295
1296 new_fa->fa_info = fi;
1297 new_fa->fa_tos = tos;
1298 new_fa->fa_type = type;
1299 new_fa->fa_scope = r->rtm_scope;
1300 new_fa->fa_state = 0;
1301#if 0
1302 new_fa->dst = NULL;
1303#endif
1304 /*
1305 * Insert new entry to the list.
1306 */
1307
1308 if (!fa_head) {
1309 err = 0;
1310 fa_head = fib_insert_node(t, &err, key, plen);
1311 if (err)
1312 goto out_free_new_fa;
1313 }
1314
1315 write_lock_bh(&fib_lock);
1316
1317 list_add_tail(&new_fa->fa_list,
1318 (fa ? &fa->fa_list : fa_head));
1319
1320 write_unlock_bh(&fib_lock);
1321
1322 rt_cache_flush(-1);
1323 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1324succeeded:
1325 return 0;
1326
1327out_free_new_fa:
1328 kmem_cache_free(fn_alias_kmem, new_fa);
1329out:
1330 fib_release_info(fi);
1331err:;
1332 return err;
1333}
1334
1335static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1336 struct fib_result *res, int *err)
1337{
1338 int i;
1339 t_key mask;
1340 struct leaf_info *li;
1341 struct hlist_head *hhead = &l->list;
1342 struct hlist_node *node;
1343
1344 hlist_for_each_entry(li, node, hhead, hlist) {
1345
1346 i = li->plen;
1347 mask = ntohl(inet_make_mask(i));
1348 if (l->key != (key & mask))
1349 continue;
1350
1351 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1352 *plen = i;
1353#ifdef CONFIG_IP_FIB_TRIE_STATS
1354 t->stats.semantic_match_passed++;
1355#endif
1356 return 1;
1357 }
1358#ifdef CONFIG_IP_FIB_TRIE_STATS
1359 t->stats.semantic_match_miss++;
1360#endif
1361 }
1362 return 0;
1363}
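/*
 * Example (hypothetical): a leaf with key 10.1.0.0 may carry leaf_info
 * entries for /16 and /24. For a lookup of 10.1.2.3 only the /16 mask
 * satisfies l->key == (key & mask), so only that prefix's alias list is
 * handed to fib_semantic_match() for the TOS/scope/oif checks.
 */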
1364
1365static int
1366fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1367{
1368 struct trie *t = (struct trie *) tb->tb_data;
1369 int plen, ret = 0;
1370 struct node *n;
1371 struct tnode *pn;
1372 int pos, bits;
1373 t_key key=ntohl(flp->fl4_dst);
1374 int chopped_off;
1375 t_key cindex = 0;
1376 int current_prefix_length = KEYLENGTH;
1377 n = t->trie;
1378
1379 read_lock(&fib_lock);
1380 if (!n)
1381 goto failed;
1382
1383#ifdef CONFIG_IP_FIB_TRIE_STATS
1384 t->stats.gets++;
1385#endif
1386
1387 /* Just a leaf? */
1388 if (IS_LEAF(n)) {
1389 if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1390 goto found;
1391 goto failed;
1392 }
1393 pn = (struct tnode *) n;
1394 chopped_off = 0;
1395
1396 while (pn) {
1397
1398 pos = pn->pos;
1399 bits = pn->bits;
1400
1401 if (!chopped_off)
1402 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1403
1404 n = tnode_get_child(pn, cindex);
1405
1406 if (n == NULL) {
1407#ifdef CONFIG_IP_FIB_TRIE_STATS
1408 t->stats.null_node_hit++;
1409#endif
1410 goto backtrace;
1411 }
1412
1413 if (IS_TNODE(n)) {
1414#define HL_OPTIMIZE
1415#ifdef HL_OPTIMIZE
1416 struct tnode *cn = (struct tnode *)n;
1417 t_key node_prefix, key_prefix, pref_mismatch;
1418 int mp;
1419
1420 /*
1421 * It's a tnode, and we can do some extra checks here if we
1422 * like, to avoid descending into a dead-end branch.
1423 * This tnode is in the parent's child array at index
1424 * key[p_pos..p_pos+p_bits] but potentially with some bits
1425 * chopped off, so in reality the index may be just a
1426 * subprefix, padded with zero at the end.
1427 * We can also take a look at any skipped bits in this
1428 * tnode - everything up to p_pos is supposed to be ok,
1429 * and the non-chopped bits of the index (see previous
1430 * paragraph) are also guaranteed ok, but the rest is
1431 * considered unknown.
1432 *
1433 * The skipped bits are key[pos+bits..cn->pos].
1434 */
1435
1436 /* If current_prefix_length < pos+bits, we are already doing
1437 * actual prefix matching, which means everything from
1438 * pos+(bits-chopped_off) onward must be zero along some
1439 * branch of this subtree - otherwise there is *no* valid
1440 * prefix present. Here we can only check the skipped
1441 * bits. Remember, since we have already indexed into the
1442 * parent's child array, we know that the bits we chopped off
1443 * *are* zero.
1444 */
1445
1446 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1447
1448 if (current_prefix_length < pos+bits) {
1449 if (tkey_extract_bits(cn->key, current_prefix_length,
1450 cn->pos - current_prefix_length) != 0 ||
1451 !(cn->child[0]))
1452 goto backtrace;
1453 }
1454
1455 /*
1456 * If chopped_off=0, the index is fully validated and we
1457 * only need to look at the skipped bits for this, the new,
1458 * tnode. What we actually want to do is to find out if
1459 * these skipped bits match our key perfectly, or if we will
1460 * have to count on finding a matching prefix further down,
1461 * because if we do, we would like to have some way of
1462 * verifying the existence of such a prefix at this point.
1463 */
1464
1465 /* The only thing we can do at this point is to verify that
1466 * any such matching prefix can indeed be a prefix to our
1467 * key, and if the bits in the node we are inspecting that
1468 * do not match our key are not ZERO, this cannot be true.
1469 * Thus, find out where there is a mismatch (before cn->pos)
1470 * and verify that all the mismatching bits are zero in the
1471 * new tnode's key.
1472 */
1473
1474 /* Note: We aren't very concerned about the piece of the key
1475 * that precedes pn->pos+pn->bits, since it has already been
1476 * checked. The bits after cn->pos aren't checked, since they are
1477 * by definition "unknown" at this point. Thus, what we want to
1478 * see is if we are about to enter the "prefix matching" state,
1479 * and in that case verify that the skipped bits that will prevail
1480 * throughout this subtree are zero, as they have to be if we are
1481 * to find a matching prefix.
1482 */
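			/*
			 * Concrete illustration (hypothetical values): with
			 * cn->pos = 8, node_prefix and key_prefix below keep
			 * only the top 8 bits of each key. If they differ,
			 * pref_mismatch is non-zero and the shift loop moves
			 * the first differing bit up to bit KEYLENGTH-1,
			 * leaving mp = the index of that first mismatch.
			 */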
1483
1484 node_prefix = MASK_PFX(cn->key, cn->pos);
1485 key_prefix = MASK_PFX(key, cn->pos);
1486 pref_mismatch = key_prefix^node_prefix;
1487 mp = 0;
1488
1489 /* In short: If skipped bits in this node do not match the search
1490 * key, enter the "prefix matching" state directly.
1491 */
1492 if (pref_mismatch) {
1493 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1494 mp++;
1495 pref_mismatch = pref_mismatch <<1;
1496 }
1497 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1498
1499 if (key_prefix != 0)
1500 goto backtrace;
1501
1502 if (current_prefix_length >= cn->pos)
1503 current_prefix_length=mp;
1504 }
1505#endif
1506 pn = (struct tnode *)n; /* Descend */
1507 chopped_off = 0;
1508 continue;
1509 }
1510 if (IS_LEAF(n)) {
1511 if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1512 goto found;
1513 }
1514backtrace:
1515 chopped_off++;
1516
1517 /* Chopping off zero bits does not change the child key (cindex), so skip them */
1518 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1519 chopped_off++;
1520 }
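		/*
		 * e.g. cindex = 0b100: the two low zero bits are skipped
		 * above, so chopped_off ends up pointing at bit 2, the next
		 * bit that the clearing below can actually change.
		 */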
1521
1522 /* Decrease current_... with bits chopped off */
1523 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1524 current_prefix_length = pn->pos + pn->bits - chopped_off;
1525
1526 /*
1527 * Either we do the actual chop-off here, or, if we have
1528 * chopped off all bits in this tnode, we walk up to our parent.
1529 */
1530
1531 if (chopped_off <= pn->bits)
1532 cindex &= ~(1 << (chopped_off-1));
1533 else {
1534 if (NODE_PARENT(pn) == NULL)
1535 goto failed;
1536
1537 /* Get Child's index */
1538 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1539 pn = NODE_PARENT(pn);
1540 chopped_off = 0;
1541
1542#ifdef CONFIG_IP_FIB_TRIE_STATS
1543 t->stats.backtrack++;
1544#endif
1545 goto backtrace;
1546 }
1547 }
1548failed:
1549 ret = 1;
1550found:
1551 read_unlock(&fib_lock);
1552 return ret;
1553}
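/*
 * Backtracking example for the lookup above (hypothetical trie): if a
 * tnode with pos=0, bits=2 was entered via cindex=3 (binary 11) and its
 * subtree fails, chopped_off=1 clears bit 0 (cindex 11 -> 10) and child
 * 10 is tried with a correspondingly shorter current_prefix_length
 * before the walk finally moves up to the parent.
 */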
1554
1555static int trie_leaf_remove(struct trie *t, t_key key)
1556{
1557 t_key cindex;
1558 struct tnode *tp = NULL;
1559 struct node *n = t->trie;
1560 struct leaf *l;
1561
1562 if (trie_debug)
1563 printk("entering trie_leaf_remove(%p)\n", n);
1564
1565 /* Note that in the case of skipped bits, those bits are *not* checked!
1566 * When we finish this, we will have NULL or a T_LEAF, and the
1567 * T_LEAF may or may not match our key.
1568 */
1569
1570 while (n != NULL && IS_TNODE(n)) {
1571 struct tnode *tn = (struct tnode *) n;
1572 check_tnode(tn);
1573 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1574
1575 if (n && NODE_PARENT(n) != tn) {
1576 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1577 BUG();
1578 }
1579 }
1580 l = (struct leaf *) n;
1581
1582 if (!n || !tkey_equals(l->key, key))
1583 return 0;
1584
1585 /*
1586 * Key found.
1587 * Remove the leaf and rebalance the tree
1588 */
1589
1590 t->revision++;
1591 t->size--;
1592
1593 tp = NODE_PARENT(n);
1594 tnode_free((struct tnode *) n);
1595
1596 if (tp) {
1597 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1598 put_child(t, (struct tnode *)tp, cindex, NULL);
1599 t->trie = trie_rebalance(t, tp);
1600 }
1601 else
1602 t->trie = NULL;
1603
1604 return 1;
1605}
1606
1607static int
1608fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1609 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1610{
1611 struct trie *t = (struct trie *) tb->tb_data;
1612 u32 key, mask;
1613 int plen = r->rtm_dst_len;
1614 u8 tos = r->rtm_tos;
1615 struct fib_alias *fa, *fa_to_delete;
1616 struct list_head *fa_head;
1617 struct leaf *l;
1618
1619 if (plen > 32)
1620 return -EINVAL;
1621
1622 key = 0;
1623 if (rta->rta_dst)
1624 memcpy(&key, rta->rta_dst, 4);
1625
1626 key = ntohl(key);
1627 mask = ntohl( inet_make_mask(plen) );
1628
1629 if (key & ~mask)
1630 return -EINVAL;
1631
1632 key = key & mask;
1633 l = fib_find_node(t, key);
1634
1635 if (!l)
1636 return -ESRCH;
1637
1638 fa_head = get_fa_head(l, plen);
1639 fa = fib_find_alias(fa_head, tos, 0);
1640
1641 if (!fa)
1642 return -ESRCH;
1643
1644 if (trie_debug)
1645 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1646
1647 fa_to_delete = NULL;
1648 fa_head = fa->fa_list.prev;
1649 list_for_each_entry(fa, fa_head, fa_list) {
1650 struct fib_info *fi = fa->fa_info;
1651
1652 if (fa->fa_tos != tos)
1653 break;
1654
1655 if ((!r->rtm_type ||
1656 fa->fa_type == r->rtm_type) &&
1657 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1658 fa->fa_scope == r->rtm_scope) &&
1659 (!r->rtm_protocol ||
1660 fi->fib_protocol == r->rtm_protocol) &&
1661 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1662 fa_to_delete = fa;
1663 break;
1664 }
1665 }
1666
1667 if (fa_to_delete) {
1668 int kill_li = 0;
1669 struct leaf_info *li;
1670
1671 fa = fa_to_delete;
1672 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1673
1674 l = fib_find_node(t, key);
1675 li = find_leaf_info(&l->list, plen);
1676
1677 write_lock_bh(&fib_lock);
1678
1679 list_del(&fa->fa_list);
1680
1681 if (list_empty(fa_head)) {
1682 hlist_del(&li->hlist);
1683 kill_li = 1;
1684 }
1685 write_unlock_bh(&fib_lock);
1686
1687 if (kill_li)
1688 free_leaf_info(li);
1689
1690 if (hlist_empty(&l->list))
1691 trie_leaf_remove(t, key);
1692
1693 if (fa->fa_state & FA_S_ACCESSED)
1694 rt_cache_flush(-1);
1695
1696 fn_free_alias(fa);
1697 return 0;
1698 }
1699 return -ESRCH;
1700}
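/*
 * Deletion order above: the alias is unlinked under fib_lock first; if
 * that empties the per-prefix list, the leaf_info for that prefix
 * length goes too; and only when the leaf carries no leaf_info at all
 * is the leaf itself removed and the trie rebalanced via
 * trie_leaf_remove().
 */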
1701
1702static int trie_flush_list(struct trie *t, struct list_head *head)
1703{
1704 struct fib_alias *fa, *fa_node;
1705 int found = 0;
1706
1707 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1708 struct fib_info *fi = fa->fa_info;
1709
1710 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1711
1712 write_lock_bh(&fib_lock);
1713 list_del(&fa->fa_list);
1714 write_unlock_bh(&fib_lock);
1715
1716 fn_free_alias(fa);
1717 found++;
1718 }
1719 }
1720 return found;
1721}
1722
1723static int trie_flush_leaf(struct trie *t, struct leaf *l)
1724{
1725 int found = 0;
1726 struct hlist_head *lih = &l->list;
1727 struct hlist_node *node, *tmp;
1728 struct leaf_info *li = NULL;
1729
1730 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1731
1732 found += trie_flush_list(t, &li->falh);
1733
1734 if (list_empty(&li->falh)) {
1735
1736 write_lock_bh(&fib_lock);
1737 hlist_del(&li->hlist);
1738 write_unlock_bh(&fib_lock);
1739
1740 free_leaf_info(li);
1741 }
1742 }
1743 return found;
1744}
1745
1746static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1747{
1748 struct node *c = (struct node *) thisleaf;
1749 struct tnode *p;
1750 int idx;
1751
1752 if (c == NULL) {
1753 if (t->trie == NULL)
1754 return NULL;
1755
1756 if (IS_LEAF(t->trie)) /* trie with just a leaf */
1757 return (struct leaf *) t->trie;
1758
1759 p = (struct tnode*) t->trie; /* Start */
1760 }
1761 else
1762 p = (struct tnode *) NODE_PARENT(c);
1763
1764 while (p) {
1765 int pos, last;
1766
1767 /* Find the next child of the parent */
1768 if (c)
1769 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1770 else
1771 pos = 0;
1772
1773 last = 1 << p->bits;
1774 for(idx = pos; idx < last ; idx++) {
1775 if (p->child[idx]) {
1776
1777 /* Descend if tnode */
1778
1779 while (IS_TNODE(p->child[idx])) {
1780 p = (struct tnode*) p->child[idx];
1781 idx = 0;
1782
1783 /* Find the first (leftmost) non-NULL child */
1784 if (p && IS_TNODE(p))
1785 while (idx < (1 << p->bits) && p->child[idx] == NULL) idx++;
1786
1787 /* Done with this tnode? */
1788 if (idx >= (1 << p->bits) || p->child[idx] == NULL )
1789 goto up;
1790 }
1791 return (struct leaf*) p->child[idx];
1792 }
1793 }
1794up:
1795 /* No more children, go up one step */
1796 c = (struct node*) p;
1797 p = (struct tnode *) NODE_PARENT(p);
1798 }
1799 return NULL; /* Ready. Root of trie */
1800}
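/*
 * Usage sketch (as in fn_trie_flush below): nextleaf(t, NULL) returns
 * the leftmost leaf, and each later call resumes from the previous
 * leaf's slot in its parent, so the walk visits every leaf exactly once
 * without keeping any iterator state in the trie itself.
 */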
1801
1802static int fn_trie_flush(struct fib_table *tb)
1803{
1804 struct trie *t = (struct trie *) tb->tb_data;
1805 struct leaf *ll = NULL, *l = NULL;
1806 int found = 0, h;
1807
1808 t->revision++;
1809
1810 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1811 found += trie_flush_leaf(t, l);
1812
1813 if (ll && hlist_empty(&ll->list))
1814 trie_leaf_remove(t, ll->key);
1815 ll = l;
1816 }
1817
1818 if (ll && hlist_empty(&ll->list))
1819 trie_leaf_remove(t, ll->key);
1820
1821 if (trie_debug)
1822 printk("trie_flush found=%d\n", found);
1823 return found;
1824}
1825
1826static int trie_last_dflt=-1;
1827
1828static void
1829fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1830{
1831 struct trie *t = (struct trie *) tb->tb_data;
1832 int order, last_idx;
1833 struct fib_info *fi = NULL;
1834 struct fib_info *last_resort;
1835 struct fib_alias *fa = NULL;
1836 struct list_head *fa_head;
1837 struct leaf *l;
1838
1839 last_idx = -1;
1840 last_resort = NULL;
1841 order = -1;
1842
1843 read_lock(&fib_lock);
1844
1845 l = fib_find_node(t, 0);
1846 if (!l)
1847 goto out;
1848
1849 fa_head = get_fa_head(l, 0);
1850 if (!fa_head)
1851 goto out;
1852
1853 if (list_empty(fa_head))
1854 goto out;
1855
1856 list_for_each_entry(fa, fa_head, fa_list) {
1857 struct fib_info *next_fi = fa->fa_info;
1858
1859 if (fa->fa_scope != res->scope ||
1860 fa->fa_type != RTN_UNICAST)
1861 continue;
1862
1863 if (next_fi->fib_priority > res->fi->fib_priority)
1864 break;
1865 if (!next_fi->fib_nh[0].nh_gw ||
1866 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1867 continue;
1868 fa->fa_state |= FA_S_ACCESSED;
1869
1870 if (fi == NULL) {
1871 if (next_fi != res->fi)
1872 break;
1873 } else if (!fib_detect_death(fi, order, &last_resort,
1874 &last_idx, &trie_last_dflt)) {
1875 if (res->fi)
1876 fib_info_put(res->fi);
1877 res->fi = fi;
1878 atomic_inc(&fi->fib_clntref);
1879 trie_last_dflt = order;
1880 goto out;
1881 }
1882 fi = next_fi;
1883 order++;
1884 }
1885 if (order <= 0 || fi == NULL) {
1886 trie_last_dflt = -1;
1887 goto out;
1888 }
1889
1890 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1891 if (res->fi)
1892 fib_info_put(res->fi);
1893 res->fi = fi;
1894 atomic_inc(&fi->fib_clntref);
1895 trie_last_dflt = order;
1896 goto out;
1897 }
1898 if (last_idx >= 0) {
1899 if (res->fi)
1900 fib_info_put(res->fi);
1901 res->fi = last_resort;
1902 if (last_resort)
1903 atomic_inc(&last_resort->fib_clntref);
1904 }
1905 trie_last_dflt = last_idx;
1906 out:;
1907 read_unlock(&fib_lock);
1908}
1909
1910static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1911 struct sk_buff *skb, struct netlink_callback *cb)
1912{
1913 int i, s_i;
1914 struct fib_alias *fa;
1915
1916 u32 xkey=htonl(key);
1917
1918 s_i=cb->args[3];
1919 i = 0;
1920
1921 list_for_each_entry(fa, fah, fa_list) {
1922 if (i < s_i) {
1923 i++;
1924 continue;
1925 }
1926 if (fa->fa_info == NULL) {
1927 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1928 i++;
1929 continue;
1930 }
1931 if (fa->fa_info->fib_nh == NULL) {
1932 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1933 i++;
1934 continue;
1935 }
1936
1937 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1938 cb->nlh->nlmsg_seq,
1939 RTM_NEWROUTE,
1940 tb->tb_id,
1941 fa->fa_type,
1942 fa->fa_scope,
1943 &xkey,
1944 plen,
1945 fa->fa_tos,
1946 fa->fa_info, 0) < 0) {
1947 cb->args[3] = i;
1948 return -1;
1949 }
1950 i++;
1951 }
1952 cb->args[3]=i;
1953 return skb->len;
1954}
1955
1956static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1957 struct netlink_callback *cb)
1958{
1959 int h, s_h;
1960 struct list_head *fa_head;
1961 struct leaf *l = NULL;
1962 s_h=cb->args[2];
1963
1964 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1965
1966 if (h < s_h)
1967 continue;
1968 if (h > s_h)
1969 memset(&cb->args[3], 0,
1970 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1971
1972 fa_head = get_fa_head(l, plen);
1973
1974 if (!fa_head)
1975 continue;
1976
1977 if (list_empty(fa_head))
1978 continue;
1979
1980 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1981 cb->args[2]=h;
1982 return -1;
1983 }
1984 }
1985 cb->args[2]=h;
1986 return skb->len;
1987}
1988
1989static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1990{
1991 int m, s_m;
1992 struct trie *t = (struct trie *) tb->tb_data;
1993
1994 s_m = cb->args[1];
1995
1996 read_lock(&fib_lock);
1997 for (m=0; m<=32; m++) {
1998
1999 if (m < s_m)
2000 continue;
2001 if (m > s_m)
2002 memset(&cb->args[2], 0,
2003 sizeof(cb->args) - 2*sizeof(cb->args[0]));
2004
2005 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2006 cb->args[1] = m;
2007 goto out;
2008 }
2009 }
2010 read_unlock(&fib_lock);
2011 cb->args[1] = m;
2012 return skb->len;
2013 out:
2014 read_unlock(&fib_lock);
2015 return -1;
2016}
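/*
 * Dump order: m runs from 0 to 32 and each pass dumps prefix length
 * 32-m, so the longest prefixes are emitted first; cb->args[1..3]
 * checkpoint the mask, leaf and alias positions so an interrupted
 * netlink dump can resume where it stopped.
 */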
2017
2018/* Fix more generic FIB names for init later */
2019
2020#ifdef CONFIG_IP_MULTIPLE_TABLES
2021struct fib_table * fib_hash_init(int id)
2022#else
2023struct fib_table * __init fib_hash_init(int id)
2024#endif
2025{
2026 struct fib_table *tb;
2027 struct trie *t;
2028
2029 if (fn_alias_kmem == NULL)
2030 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2031 sizeof(struct fib_alias),
2032 0, SLAB_HWCACHE_ALIGN,
2033 NULL, NULL);
2034
2035 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2036 GFP_KERNEL);
2037 if (tb == NULL)
2038 return NULL;
2039
2040 tb->tb_id = id;
2041 tb->tb_lookup = fn_trie_lookup;
2042 tb->tb_insert = fn_trie_insert;
2043 tb->tb_delete = fn_trie_delete;
2044 tb->tb_flush = fn_trie_flush;
2045 tb->tb_select_default = fn_trie_select_default;
2046 tb->tb_dump = fn_trie_dump;
2047 memset(tb->tb_data, 0, sizeof(struct trie));
2048
2049 t = (struct trie *) tb->tb_data;
2050
2051 trie_init(t);
2052
2053 if (id == RT_TABLE_LOCAL)
2054 trie_local = t;
2055 else if (id == RT_TABLE_MAIN)
2056 trie_main = t;
2057
2058 if (id == RT_TABLE_LOCAL)
2059 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2060
2061 return tb;
2062}
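/*
 * Note: the name fib_hash_init is kept (see the comment above) so that
 * callers can create tables without caring which FIB backend was
 * compiled in; here it actually builds a trie-backed fib_table.
 */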
2063
2064/* Trie dump functions */
2065
2066static void putspace_seq(struct seq_file *seq, int n)
2067{
2068 while (n--) seq_printf(seq, " ");
2069}
2070
2071static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2072{
2073 while (bits--)
2074 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2075}
2076
2077static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2078 int pend, int cindex, int bits)
2079{
2080 putspace_seq(seq, indent);
2081 if (IS_LEAF(n))
2082 seq_printf(seq, "|");
2083 else
2084 seq_printf(seq, "+");
2085 if (bits) {
2086 seq_printf(seq, "%d/", cindex);
2087 printbin_seq(seq, cindex, bits);
2088 seq_printf(seq, ": ");
2089 }
2090 else
2091 seq_printf(seq, "<root>: ");
2092 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2093
2094 if (IS_LEAF(n))
2095 seq_printf(seq, "key=%d.%d.%d.%d\n",
2096 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2097 else {
2098 int plen = ((struct tnode *)n)->pos;
2099 t_key prf=MASK_PFX(n->key, plen);
2100 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2101 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2102 }
2103 if (IS_LEAF(n)) {
2104 struct leaf *l=(struct leaf *)n;
2105 struct fib_alias *fa;
2106 int i;
2107 for (i=32; i>=0; i--)
2108 if (find_leaf_info(&l->list, i)) {
2109
2110 struct list_head *fa_head = get_fa_head(l, i);
2111
2112 if (!fa_head)
2113 continue;
2114
2115 if (list_empty(fa_head))
2116 continue;
2117
2118 putspace_seq(seq, indent+2);
2119 seq_printf(seq, "{/%d...dumping}\n", i);
2120
2121
2122 list_for_each_entry(fa, fa_head, fa_list) {
2123 putspace_seq(seq, indent+2);
2124 if (fa->fa_info == NULL) {
2125 seq_printf(seq, "Error fa_info=NULL\n");
2126 continue;
2127 }
2128 if (fa->fa_info->fib_nh == NULL) {
2129 seq_printf(seq, "Error _fib_nh=NULL\n");
2130 continue;
2131 }
2132
2133 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2134 fa->fa_type,
2135 fa->fa_scope,
2136 fa->fa_tos);
2137 }
2138 }
2139 }
2140 else if (IS_TNODE(n)) {
2141 struct tnode *tn = (struct tnode *)n;
2142 putspace_seq(seq, indent); seq_printf(seq, "| ");
2143 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2144 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2145 seq_printf(seq, "}\n");
2146 putspace_seq(seq, indent); seq_printf(seq, "| ");
2147 seq_printf(seq, "{pos=%d", tn->pos);
2148 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2149 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2150 putspace_seq(seq, indent); seq_printf(seq, "| ");
2151 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2152 }
2153}
2154
2155static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2156{
2157 struct node *n = t->trie;
2158 int cindex=0;
2159 int indent=1;
2160 int pend=0;
2161 int depth = 0;
2162
2163 read_lock(&fib_lock);
2164
2165 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2166 if (n) {
2167 printnode_seq(seq, indent, n, pend, cindex, 0);
2168 if (IS_TNODE(n)) {
2169 struct tnode *tn = (struct tnode *)n;
2170 pend = tn->pos+tn->bits;
2171 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2172 indent += 3;
2173 depth++;
2174
2175 while (tn && cindex < (1 << tn->bits)) {
2176 if (tn->child[cindex]) {
2177
2178 /* Got a child */
2179
2180 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2181 if (IS_LEAF(tn->child[cindex])) {
2182 cindex++;
2183
2184 }
2185 else {
2186 /*
2187 * New tnode. Descend one level
2188 */
2189
2190 depth++;
2191 n = tn->child[cindex];
2192 tn = (struct tnode *)n;
2193 pend = tn->pos+tn->bits;
2194 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2195 indent+=3;
2196 cindex=0;
2197 }
2198 }
2199 else
2200 cindex++;
2201
2202 /*
2203 * Test if we are done
2204 */
2205
2206 while (cindex >= (1 << tn->bits)) {
2207
2208 /*
2209 * Move upwards and test for root
2210 * pop off all traversed nodes
2211 */
2212
2213 if (NODE_PARENT(tn) == NULL) {
2214 tn = NULL;
2215 n = NULL;
2216 break;
2217 }
2218 else {
2219 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2220 tn = NODE_PARENT(tn);
2221 cindex++;
2222 n = (struct node *)tn;
2223 pend = tn->pos+tn->bits;
2224 indent-=3;
2225 depth--;
2226 }
2227 }
2228 }
2229 }
2230 else n = NULL;
2231 }
2232 else seq_printf(seq, "------ trie is empty\n");
2233
2234 read_unlock(&fib_lock);
2235}
2236
2237static struct trie_stat *trie_stat_new(void)
2238{
2239 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2240 int i;
2241
2242 if (s) {
2243 s->totdepth = 0;
2244 s->maxdepth = 0;
2245 s->tnodes = 0;
2246 s->leaves = 0;
2247 s->nullpointers = 0;
2248
2249 for(i=0; i< MAX_CHILDS; i++)
2250 s->nodesizes[i] = 0;
2251 }
2252 return s;
2253}
2254
2255static struct trie_stat *trie_collect_stats(struct trie *t)
2256{
2257 struct node *n = t->trie;
2258 struct trie_stat *s = trie_stat_new();
2259 int cindex = 0;
2260 int indent = 1;
2261 int pend = 0;
2262 int depth = 0;
2263
2264 read_lock(&fib_lock);
2265
2266 if (s) {
2267 if (n) {
2268 if (IS_TNODE(n)) {
2269 struct tnode *tn = (struct tnode *)n;
2270 pend = tn->pos+tn->bits;
2271 indent += 3;
2272 s->nodesizes[tn->bits]++;
2273 depth++;
2274
2275 while (tn && cindex < (1 << tn->bits)) {
2276 if (tn->child[cindex]) {
2277 /* Got a child */
2278
2279 if (IS_LEAF(tn->child[cindex])) {
2280 cindex++;
2281
2282 /* stats */
2283 if (depth > s->maxdepth)
2284 s->maxdepth = depth;
2285 s->totdepth += depth;
2286 s->leaves++;
2287 }
2288
2289 else {
2290 /*
2291 * New tnode. Descend one level
2292 */
2293
2294 s->tnodes++;
2295 s->nodesizes[tn->bits]++;
2296 depth++;
2297
2298 n = tn->child[cindex];
2299 tn = (struct tnode *)n;
2300 pend = tn->pos+tn->bits;
2301
2302 indent += 3;
2303 cindex = 0;
2304 }
2305 }
2306 else {
2307 cindex++;
2308 s->nullpointers++;
2309 }
2310
2311 /*
2312 * Test if we are done
2313 */
2314
2315 while (cindex >= (1 << tn->bits)) {
2316
2317 /*
2318 * Move upwards and test for root
2319 * pop off all traversed nodes
2320 */
2321
2322
2323 if (NODE_PARENT(tn) == NULL) {
2324 tn = NULL;
2325 n = NULL;
2326 break;
2327 }
2328 else {
2329 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2330 tn = NODE_PARENT(tn);
2331 cindex++;
2332 n = (struct node *)tn;
2333 pend = tn->pos+tn->bits;
2334 indent -= 3;
2335 depth--;
2336 }
2337 }
2338 }
2339 }
2340 else n = NULL;
2341 }
2342 }
2343
2344 read_unlock(&fib_lock);
2345 return s;
2346}
2347
2348#ifdef CONFIG_PROC_FS
2349
2350static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2351{
2352 return NULL;
2353}
2354
2355static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2356{
2357 return NULL;
2358}
2359
2360static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2361{
2362 void *v = NULL;
2363
2364 if (ip_fib_main_table)
2365 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2366 return v;
2367}
2368
2369static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2370{
2371 ++*pos;
2372 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2373}
2374
2375static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2376{
2377
2378}
2379
2380/*
2381 * This outputs /proc/net/fib_triestats
2382 *
2383 * It always works in backward compatibility mode.
2384 * The format of the file is not supposed to be changed.
2385 */
2386
2387static void collect_and_show(struct trie *t, struct seq_file *seq)
2388{
2389 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2390 int i, max, pointers;
2391 struct trie_stat *stat;
2392 int avdepth;
2393
2394 stat = trie_collect_stats(t);
2395
2396 bytes=0;
2397 seq_printf(seq, "trie=%p\n", t);
2398
2399 if (stat) {
2400 if (stat->leaves)
2401 avdepth=stat->totdepth*100 / stat->leaves;
2402 else
2403 avdepth=0;
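		/*
		 * avdepth carries two implied decimal places: e.g. a
		 * totdepth of 250 over 100 leaves gives avdepth=250,
		 * printed below as "2.50".
		 */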
2404 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2405 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2406
2407 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2408 bytes += sizeof(struct leaf) * stat->leaves;
2409 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2410 bytes += sizeof(struct tnode) * stat->tnodes;
2411
2412 max = MAX_CHILDS-1;
2413
2414 while (max >= 0 && stat->nodesizes[max] == 0)
2415 max--;
2416 pointers = 0;
2417
2418 for (i = 1; i <= max; i++)
2419 if (stat->nodesizes[i] != 0) {
2420 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2421 pointers += (1<<i) * stat->nodesizes[i];
2422 }
2423 seq_printf(seq, "\n");
2424 seq_printf(seq, "Pointers: %d\n", pointers);
2425 bytes += sizeof(struct node *) * pointers;
2426 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2427 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2428
2429 kfree(stat);
2430 }
2431
2432#ifdef CONFIG_IP_FIB_TRIE_STATS
2433 seq_printf(seq, "Counters:\n---------\n");
2434 seq_printf(seq,"gets = %d\n", t->stats.gets);
2435 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2436 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2437 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2438 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2439 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2440#ifdef CLEAR_STATS
2441 memset(&(t->stats), 0, sizeof(t->stats));
2442#endif
2443#endif /* CONFIG_IP_FIB_TRIE_STATS */
2444}
2445
2446static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2447{
2448 char bf[128];
2449
2450 if (v == SEQ_START_TOKEN) {
2451 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2452 sizeof(struct leaf), sizeof(struct tnode));
2453 if (trie_local)
2454 collect_and_show(trie_local, seq);
2455
2456 if (trie_main)
2457 collect_and_show(trie_main, seq);
2458 }
2459 else {
2460 snprintf(bf, sizeof(bf),
2461 "*\t%08X\t%08X", 200, 400);
2462
2463 seq_printf(seq, "%-127s\n", bf);
2464 }
2465 return 0;
2466}
2467
2468static struct seq_operations fib_triestat_seq_ops = {
2469 .start = fib_triestat_seq_start,
2470 .next = fib_triestat_seq_next,
2471 .stop = fib_triestat_seq_stop,
2472 .show = fib_triestat_seq_show,
2473};
2474
2475static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq;
2478 int rc = -ENOMEM;
2479
2480 rc = seq_open(file, &fib_triestat_seq_ops);
2481 if (rc)
2482 goto out_kfree;
2483
2484 seq = file->private_data;
2485out:
2486 return rc;
2487out_kfree:
2488 goto out;
2489}
2490
2491static struct file_operations fib_triestat_seq_fops = {
2492 .owner = THIS_MODULE,
2493 .open = fib_triestat_seq_open,
2494 .read = seq_read,
2495 .llseek = seq_lseek,
2496 .release = seq_release_private,
2497};
2498
2499int __init fib_stat_proc_init(void)
2500{
2501 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2502 return -ENOMEM;
2503 return 0;
2504}
2505
2506void __init fib_stat_proc_exit(void)
2507{
2508 proc_net_remove("fib_triestat");
2509}
2510
2511static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2512{
2513 return NULL;
2514}
2515
2516static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2517{
2518 return NULL;
2519}
2520
2521static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2522{
2523 void *v = NULL;
2524
2525 if (ip_fib_main_table)
2526 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2527 return v;
2528}
2529
2530static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2531{
2532 ++*pos;
2533 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2534}
2535
2536static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2537{
2538
2539}
2540
2541/*
2542 * This outputs /proc/net/fib_trie.
2543 *
2544 * It always works in backward compatibility mode.
2545 * The format of the file is not supposed to be changed.
2546 */
2547
2548static int fib_trie_seq_show(struct seq_file *seq, void *v)
2549{
2550 char bf[128];
2551
2552 if (v == SEQ_START_TOKEN) {
2553 if (trie_local)
2554 trie_dump_seq(seq, trie_local);
2555
2556 if (trie_main)
2557 trie_dump_seq(seq, trie_main);
2558 }
2559
2560 else {
2561 snprintf(bf, sizeof(bf),
2562 "*\t%08X\t%08X", 200, 400);
2563 seq_printf(seq, "%-127s\n", bf);
2564 }
2565
2566 return 0;
2567}
2568
2569static struct seq_operations fib_trie_seq_ops = {
2570 .start = fib_trie_seq_start,
2571 .next = fib_trie_seq_next,
2572 .stop = fib_trie_seq_stop,
2573 .show = fib_trie_seq_show,
2574};
2575
2576static int fib_trie_seq_open(struct inode *inode, struct file *file)
2577{
2578 struct seq_file *seq;
2579 int rc = -ENOMEM;
2580
2581 rc = seq_open(file, &fib_trie_seq_ops);
2582 if (rc)
2583 goto out_kfree;
2584
2585 seq = file->private_data;
2586out:
2587 return rc;
2588out_kfree:
2589 goto out;
2590}
2591
2592static struct file_operations fib_trie_seq_fops = {
2593 .owner = THIS_MODULE,
2594 .open = fib_trie_seq_open,
2595 .read = seq_read,
2596 .llseek = seq_lseek,
2597 .release= seq_release_private,
2598};
2599
2600int __init fib_proc_init(void)
2601{
2602 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2603 return -ENOMEM;
2604 return 0;
2605}
2606
2607void __init fib_proc_exit(void)
2608{
2609 proc_net_remove("fib_trie");
2610}
2611
2612#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 85bf0d3e294b..3d78464f64ea 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -207,6 +207,7 @@ int sysctl_icmp_ignore_bogus_error_responses;
 
 int sysctl_icmp_ratelimit = 1 * HZ;
 int sysctl_icmp_ratemask = 0x1818;
+int sysctl_icmp_errors_use_inbound_ifaddr;
 
 /*
  * ICMP control array. This specifies what to do with each ICMP.
@@ -511,8 +512,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
 	 */
 
 	saddr = iph->daddr;
-	if (!(rt->rt_flags & RTCF_LOCAL))
-		saddr = 0;
+	if (!(rt->rt_flags & RTCF_LOCAL)) {
+		if (sysctl_icmp_errors_use_inbound_ifaddr)
+			saddr = inet_select_addr(skb_in->dev, 0, RT_SCOPE_LINK);
+		else
+			saddr = 0;
+	}
 
 	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
 					   IPTOS_PREC_INTERNETCONTROL) :
@@ -931,8 +936,7 @@ int icmp_rcv(struct sk_buff *skb)
 	case CHECKSUM_HW:
 		if (!(u16)csum_fold(skb->csum))
 			break;
-		NETDEBUG(if (net_ratelimit())
-			printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+		LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
 	case CHECKSUM_NONE:
 		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
 			goto error;
@@ -965,7 +969,8 @@ int icmp_rcv(struct sk_buff *skb)
 		 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
 		 * discarded if to broadcast/multicast.
 		 */
-		if (icmph->type == ICMP_ECHO &&
+		if ((icmph->type == ICMP_ECHO ||
+		     icmph->type == ICMP_TIMESTAMP) &&
 		    sysctl_icmp_echo_ignore_broadcasts) {
 			goto error;
 		}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a90..5088f90835ae 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 {
 	int err;
 	u32 addr = imr->imr_multiaddr.s_addr;
-	struct ip_mc_socklist *iml, *i;
+	struct ip_mc_socklist *iml=NULL, *i;
 	struct in_device *in_dev;
 	struct inet_sock *inet = inet_sk(sk);
+	int ifindex;
 	int count = 0;
 
 	if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 		goto done;
 	}
 
-	iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
-
 	err = -EADDRINUSE;
+	ifindex = imr->imr_ifindex;
 	for (i = inet->mc_list; i; i = i->next) {
-		if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
-			/* New style additions are reference counted */
-			if (imr->imr_address.s_addr == 0) {
-				i->count++;
-				err = 0;
-			}
+		if (i->multi.imr_multiaddr.s_addr == addr &&
+		    i->multi.imr_ifindex == ifindex)
 			goto done;
-		}
 		count++;
 	}
 	err = -ENOBUFS;
-	if (iml == NULL || count >= sysctl_igmp_max_memberships)
+	if (count >= sysctl_igmp_max_memberships)
+		goto done;
+	iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
+	if (iml == NULL)
 		goto done;
+
 	memcpy(&iml->multi, imr, sizeof(*imr));
 	iml->next = inet->mc_list;
-	iml->count = 1;
 	iml->sflist = NULL;
 	iml->sfmode = MCAST_EXCLUDE;
 	inet->mc_list = iml;
 	ip_mc_inc_group(in_dev, addr);
-	iml = NULL;
 	err = 0;
-
 done:
 	rtnl_shunlock();
-	if (iml)
-		sock_kfree_s(sk, iml, sizeof(*iml));
 	return err;
 }
 
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_mc_socklist *iml, **imlp;
+	struct in_device *in_dev;
+	u32 group = imr->imr_multiaddr.s_addr;
+	u32 ifindex;
 
 	rtnl_lock();
+	in_dev = ip_mc_find_dev(imr);
+	if (!in_dev) {
+		rtnl_unlock();
+		return -ENODEV;
+	}
+	ifindex = imr->imr_ifindex;
 	for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
-		if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
-		    iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
-		    (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
-			struct in_device *in_dev;
-
-			in_dev = inetdev_by_index(iml->multi.imr_ifindex);
-			if (in_dev)
-				(void) ip_mc_leave_src(sk, iml, in_dev);
-			if (--iml->count) {
-				rtnl_unlock();
-				if (in_dev)
-					in_dev_put(in_dev);
-				return 0;
-			}
+		if (iml->multi.imr_multiaddr.s_addr == group &&
+		    iml->multi.imr_ifindex == ifindex) {
+			(void) ip_mc_leave_src(sk, iml, in_dev);
 
 			*imlp = iml->next;
 
-			if (in_dev) {
-				ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
-				in_dev_put(in_dev);
-			}
+			ip_mc_dec_group(in_dev, group);
 			rtnl_unlock();
 			sock_kfree_s(sk, iml, sizeof(*iml));
 			return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	struct in_device *in_dev = NULL;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_sf_socklist *psl;
+	int leavegroup = 0;
 	int i, j, rv;
 
 	if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	err = -EADDRNOTAVAIL;
 
 	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
-		if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+		if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
+		    && pmc->multi.imr_ifindex == imr.imr_ifindex)
 			break;
 	}
-	if (!pmc)		/* must have a prior join */
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
 		goto done;
+	}
 	/* if a source filter was set, must be the same mode as before */
 	if (pmc->sflist) {
-		if (pmc->sfmode != omode)
+		if (pmc->sfmode != omode) {
+			err = -EINVAL;
 			goto done;
+		}
 	} else if (pmc->sfmode != omode) {
 		/* allow mode switches for empty-set filters */
 		ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	psl = pmc->sflist;
 	if (!add) {
 		if (!psl)
-			goto done;
+			goto done;	/* err = -EADDRNOTAVAIL */
 		rv = !0;
 		for (i=0; i<psl->sl_count; i++) {
 			rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 				break;
 		}
 		if (rv)	/* source not found */
-			goto done;
+			goto done;	/* err = -EADDRNOTAVAIL */
+
+		/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+		if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+			leavegroup = 1;
+			goto done;
+		}
 
 		/* update the interface filter */
 		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1842 &mreqs->imr_sourceaddr, 1); 1843 &mreqs->imr_sourceaddr, 1);
1843done: 1844done:
1844 rtnl_shunlock(); 1845 rtnl_shunlock();
1846 if (leavegroup)
1847 return ip_mc_leave_group(sk, &imr);
1845 return err; 1848 return err;
1846} 1849}
1847 1850
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 1851int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{ 1852{
1850 int err; 1853 int err = 0;
1851 struct ip_mreqn imr; 1854 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr; 1855 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc; 1856 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev; 1857 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk); 1858 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl; 1859 struct ip_sf_socklist *newpsl, *psl;
1860 int leavegroup = 0;
1857 1861
1858 if (!MULTICAST(addr)) 1862 if (!MULTICAST(addr))
1859 return -EINVAL; 1863 return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 		err = -ENODEV;
 		goto done;
 	}
-	err = -EADDRNOTAVAIL;
+
+	/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+	if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+		leavegroup = 1;
+		goto done;
+	}
 
 	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
 		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
 		    pmc->multi.imr_ifindex == imr.imr_ifindex)
 			break;
 	}
-	if (!pmc)		/* must have a prior join */
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
 		goto done;
+	}
 	if (msf->imsf_numsrc) {
 		newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
 				IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 			0, NULL, 0);
 	pmc->sflist = newpsl;
 	pmc->sfmode = msf->imsf_fmode;
+	err = 0;
 done:
 	rtnl_shunlock();
+	if (leavegroup)
+		err = ip_mc_leave_group(sk, &imr);
 	return err;
 }
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7f68e27eb4ea..eb377ae15305 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 	return ip_frag_intern(hash, qp);
 
 out_nomem:
-	NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+	LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n"));
 	return NULL;
 }
 
@@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
 	return head;
 
 out_nomem:
-	NETDEBUG(if (net_ratelimit())
-		 printk(KERN_ERR
-			"IP: queue_glue: no memory for gluing queue %p\n",
-			qp));
+	LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing "
+			      "queue %p\n", qp));
 	goto out_fail;
 out_oversize:
 	if (net_ratelimit())
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 884835522224..f0d5740d7e22 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -290,7 +290,6 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
 
 	dev_hold(dev);
 	ipgre_tunnel_link(nt);
-	/* Do not decrement MOD_USE_COUNT here. */
 	return nt;
 
 failed:
@@ -1277,12 +1276,28 @@ err1:
 	goto out;
 }
 
-static void ipgre_fini(void)
+static void __exit ipgre_destroy_tunnels(void)
+{
+	int prio;
+
+	for (prio = 0; prio < 4; prio++) {
+		int h;
+		for (h = 0; h < HASH_SIZE; h++) {
+			struct ip_tunnel *t;
+			while ((t = tunnels[prio][h]) != NULL)
+				unregister_netdevice(t->dev);
+		}
+	}
+}
+
+static void __exit ipgre_fini(void)
 {
 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
 
-	unregister_netdev(ipgre_fb_tunnel_dev);
+	rtnl_lock();
+	ipgre_destroy_tunnels();
+	rtnl_unlock();
 }
 
 module_init(ipgre_init);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4e47a2658c7c..c703528e0bcd 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
 				raw_rcv(last, skb2);
 			}
 			last = sk;
+			nf_reset(skb);
 		}
 	}
 
@@ -200,10 +201,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
 {
 	int ihl = skb->nh.iph->ihl*4;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	nf_debug_ip_local_deliver(skb);
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
 	__skb_pull(skb, ihl);
 
 	/* Free reference early: we don't need it any more, and it may
@@ -286,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
 	struct iphdr *iph = skb->nh.iph;
+	int err;
 
 	/*
 	 *	Initialise the virtual path cache for the packet. It describes
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb->dst == NULL) {
-		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+			if (err == -EHOSTUNREACH)
+				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 			goto drop;
+		}
 	}
 
 #ifdef CONFIG_NET_CLS_ROUTE
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 760dc8238d65..80d13103b2b0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,11 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 	newskb->pkt_type = PACKET_LOOPBACK;
 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
 	BUG_TRAP(newskb->dst);
-
-#ifdef CONFIG_NETFILTER_DEBUG
-	nf_debug_ip_loopback_xmit(newskb);
-#endif
-	nf_reset(newskb);
 	netif_rx(newskb);
 	return 0;
 }
@@ -192,12 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		skb = skb2;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	nf_debug_ip_finish_output2(skb);
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
-	nf_reset(skb);
-
 	if (hh) {
 		int hh_alen;
 
@@ -391,7 +380,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->pkt_type = from->pkt_type;
 	to->priority = from->priority;
 	to->protocol = from->protocol;
-	to->security = from->security;
 	dst_release(to->dst);
 	to->dst = dst_clone(from->dst);
 	to->dev = from->dev;
@@ -415,9 +403,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->nf_bridge = from->nf_bridge;
 	nf_bridge_get(to->nf_bridge);
 #endif
-#ifdef CONFIG_NETFILTER_DEBUG
-	to->nf_debug = from->nf_debug;
-#endif
 #endif
422} 407}
423 408
@@ -1334,23 +1319,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 	ip_rt_put(rt);
 }
 
-/*
- *	IP protocol layer initialiser
- */
-
-static struct packet_type ip_packet_type = {
-	.type = __constant_htons(ETH_P_IP),
-	.func = ip_rcv,
-};
-
-/*
- *	IP registers the packet type and then calls the subprotocol initialisers
- */
-
 void __init ip_init(void)
 {
-	dev_add_pack(&ip_packet_type);
-
 	ip_rt_init();
 	inet_initpeers();
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 47012b93cad2..ff4bd067b397 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -360,14 +360,14 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
 	err = copied;
 
 	/* Reset and regenerate socket error */
-	spin_lock_irq(&sk->sk_error_queue.lock);
+	spin_lock_bh(&sk->sk_error_queue.lock);
 	sk->sk_err = 0;
 	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
 		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 		sk->sk_error_report(sk);
 	} else
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 
 out_free_skb:
 	kfree_skb(skb);
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
677 mreq.imr_address.s_addr = mreqs.imr_interface; 677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0; 678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq); 679 err = ip_mc_join_group(sk, &mreq);
680 if (err) 680 if (err && err != -EADDRINUSE)
681 break; 681 break;
682 omode = MCAST_INCLUDE; 682 omode = MCAST_INCLUDE;
683 add = 1; 683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 684 } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE; 685 omode = MCAST_INCLUDE;
686 add = 0; 686 add = 0;
687 } 687 }
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
754 mreq.imr_address.s_addr = 0; 754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface; 755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq); 756 err = ip_mc_join_group(sk, &mreq);
757 if (err) 757 if (err && err != -EADDRINUSE)
758 break; 758 break;
759 greqs.gsr_interface = mreq.imr_ifindex; 759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE; 760 omode = MCAST_INCLUDE;
@@ -848,6 +848,9 @@ mc_msf_out:
848 848
849 case IP_IPSEC_POLICY: 849 case IP_IPSEC_POLICY:
850 case IP_XFRM_POLICY: 850 case IP_XFRM_POLICY:
851 err = -EPERM;
852 if (!capable(CAP_NET_ADMIN))
853 break;
851 err = xfrm_user_policy(sk, optname, optval, optlen); 854 err = xfrm_user_policy(sk, optname, optval, optlen);
852 break; 855 break;
853 856
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 1a23c5263b99..2065944fd9e5 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -236,15 +236,10 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
 	t->props.mode = 1;
 	t->props.saddr.a4 = x->props.saddr.a4;
 	t->props.flags = x->props.flags;
 
-	t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
-	if (t->type == NULL)
-		goto error;
-
-	if (t->type->init_state(t, NULL))
+	if (xfrm_init_state(t))
 		goto error;
 
-	t->km.state = XFRM_STATE_VALID;
 	atomic_set(&t->tunnel_users, 1);
 out:
 	return t;
@@ -422,7 +417,7 @@ static void ipcomp_destroy(struct xfrm_state *x)
 	kfree(ipcd);
 }
 
-static int ipcomp_init_state(struct xfrm_state *x, void *args)
+static int ipcomp_init_state(struct xfrm_state *x)
 {
 	int err;
 	struct ipcomp_data *ipcd;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f2509034ce72..d2bf8e1930a3 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void)
 	ic_rarp_cleanup();
 #endif
 
-	if (!ic_got_reply)
+	if (!ic_got_reply) {
+		ic_myaddr = INADDR_NONE;
 		return -1;
+	}
 
 	printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
 	       ((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 68a78731f722..c05c1df0bb04 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -255,7 +255,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
 
 	dev_hold(dev);
 	ipip_tunnel_link(nt);
-	/* Do not decrement MOD_USE_COUNT here. */
 	return nt;
 
 failed:
@@ -273,7 +272,7 @@ static void ipip_tunnel_uninit(struct net_device *dev)
 	dev_put(dev);
 }
 
-static void ipip_err(struct sk_buff *skb, void *__unused)
+static void ipip_err(struct sk_buff *skb, u32 info)
 {
 #ifndef I_WISH_WORLD_WERE_PERFECT
 
@@ -852,11 +851,39 @@ static int __init ipip_fb_tunnel_init(struct net_device *dev)
 	return 0;
 }
 
+#ifdef CONFIG_INET_TUNNEL
 static struct xfrm_tunnel ipip_handler = {
 	.handler = ipip_rcv,
 	.err_handler = ipip_err,
 };
 
+static inline int ipip_register(void)
+{
+	return xfrm4_tunnel_register(&ipip_handler);
+}
+
+static inline int ipip_unregister(void)
+{
+	return xfrm4_tunnel_deregister(&ipip_handler);
+}
+#else
+static struct net_protocol ipip_protocol = {
+	.handler = ipip_rcv,
+	.err_handler = ipip_err,
+	.no_policy = 1,
+};
+
+static inline int ipip_register(void)
+{
+	return inet_add_protocol(&ipip_protocol, IPPROTO_IPIP);
+}
+
+static inline int ipip_unregister(void)
+{
+	return inet_del_protocol(&ipip_protocol, IPPROTO_IPIP);
+}
+#endif
+
 static char banner[] __initdata =
 	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
 
@@ -866,7 +893,7 @@ static int __init ipip_init(void)
 
 	printk(banner);
 
-	if (xfrm4_tunnel_register(&ipip_handler) < 0) {
+	if (ipip_register() < 0) {
 		printk(KERN_INFO "ipip init: can't register tunnel\n");
 		return -EAGAIN;
 	}
@@ -888,16 +915,33 @@ static int __init ipip_init(void)
  err2:
 	free_netdev(ipip_fb_tunnel_dev);
  err1:
-	xfrm4_tunnel_deregister(&ipip_handler);
+	ipip_unregister();
 	goto out;
 }
 
+static void __exit ipip_destroy_tunnels(void)
+{
+	int prio;
+
+	for (prio = 1; prio < 4; prio++) {
+		int h;
+		for (h = 0; h < HASH_SIZE; h++) {
+			struct ip_tunnel *t;
+			while ((t = tunnels[prio][h]) != NULL)
+				unregister_netdevice(t->dev);
+		}
+	}
+}
+
 static void __exit ipip_fini(void)
 {
-	if (xfrm4_tunnel_deregister(&ipip_handler) < 0)
+	if (ipip_unregister() < 0)
 		printk(KERN_INFO "ipip close: can't deregister tunnel\n");
 
-	unregister_netdev(ipip_fb_tunnel_dev);
+	rtnl_lock();
+	ipip_destroy_tunnels();
+	unregister_netdevice(ipip_fb_tunnel_dev);
+	rtnl_unlock();
 }
 
 module_init(ipip_init);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e21c049ec62a..dc806b578427 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
 static void ipmr_destroy_unres(struct mfc_cache *c)
 {
 	struct sk_buff *skb;
+	struct nlmsgerr *e;
 
 	atomic_dec(&cache_resolve_queue_len);
 
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
 			nlh->nlmsg_type = NLMSG_ERROR;
 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
 			skb_trim(skb, nlh->nlmsg_len);
-			((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+			e = NLMSG_DATA(nlh);
+			e->error = -ETIMEDOUT;
+			memset(&e->msg, 0, sizeof(e->msg));
 			netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
 		} else
 			kfree_skb(skb);
@@ -359,7 +362,7 @@ out:
 
 /* Fill oifs list. It is called under write locked mrt_lock. */
 
-static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
+static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
 {
 	int vifi;
 
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
 {
 	struct sk_buff *skb;
+	struct nlmsgerr *e;
 
 	/*
	 *	Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
 				nlh->nlmsg_type = NLMSG_ERROR;
 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
 				skb_trim(skb, nlh->nlmsg_len);
-				((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
+				e = NLMSG_DATA(nlh);
+				e->error = -EMSGSIZE;
+				memset(&e->msg, 0, sizeof(e->msg));
 			}
 			err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
 		} else
@@ -721,7 +727,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
 	if (c != NULL) {
 		write_lock_bh(&mrt_lock);
 		c->mfc_parent = mfc->mfcc_parent;
-		ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+		ipmr_update_thresholds(c, mfc->mfcc_ttls);
 		if (!mrtsock)
 			c->mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
@@ -738,7 +744,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
 	c->mfc_origin=mfc->mfcc_origin.s_addr;
 	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
 	c->mfc_parent=mfc->mfcc_parent;
-	ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+	ipmr_update_thresholds(c, mfc->mfcc_ttls);
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
@@ -1350,6 +1356,7 @@ int ip_mr_input(struct sk_buff *skb)
 		 */
 		read_lock(&mrt_lock);
 		if (mroute_socket) {
+			nf_reset(skb);
 			raw_rcv(mroute_socket, skb);
 			read_unlock(&mrt_lock);
 			return 0;
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64bb..c9820bfc493a 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
 # IP Virtual Server configuration
 #
 menu "IP: Virtual Server Configuration"
-	depends on INET && NETFILTER
+	depends on NETFILTER
 
 config IP_VS
 	tristate "IP virtual server support (EXPERIMENTAL)"
-	depends on INET && NETFILTER
+	depends on NETFILTER
 	---help---
 	  IP Virtual Server support will let you build a high-performance
 	  virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5499fe..d0145a8b1551 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 {
 	if (del_timer(&cp->timer))
 		mod_timer(&cp->timer, jiffies);
-	__ip_vs_conn_put(cp);
 }
 
 
@@ -759,12 +758,11 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
 	return 1;
 }
 
-
+/* Called from keventd and must protect itself from softirqs */
 void ip_vs_random_dropentry(void)
 {
 	int idx;
 	struct ip_vs_conn *cp;
-	struct ip_vs_conn *ct;
 
 	/*
 	 * Randomly scan 1/32 of the whole table every second
@@ -775,7 +773,7 @@ void ip_vs_random_dropentry(void)
 		/*
 		 * Lock is actually needed in this loop.
 		 */
-		ct_write_lock(hash);
+		ct_write_lock_bh(hash);
 
 		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
 			if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -801,23 +799,14 @@ void ip_vs_random_dropentry(void)
 				continue;
 			}
 
-			/*
-			 * Drop the entry, and drop its ct if not referenced
-			 */
-			atomic_inc(&cp->refcnt);
-			ct_write_unlock(hash);
-
-			if ((ct = cp->control))
-				atomic_inc(&ct->refcnt);
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (ct) {
+			if (cp->control) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(ct);
+				ip_vs_conn_expire_now(cp->control);
 			}
-			ct_write_lock(hash);
 		}
-		ct_write_unlock(hash);
+		ct_write_unlock_bh(hash);
 	}
 }
 
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
 {
 	int idx;
 	struct ip_vs_conn *cp;
-	struct ip_vs_conn *ct;
 
  flush_again:
 	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
 		ct_write_lock_bh(idx);
 
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-			atomic_inc(&cp->refcnt);
-			ct_write_unlock(idx);
 
-			if ((ct = cp->control))
-				atomic_inc(&ct->refcnt);
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (ct) {
+			if (cp->control) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(ct);
+				ip_vs_conn_expire_now(cp->control);
 			}
-			ct_write_lock(idx);
 		}
 		ct_write_unlock_bh(idx);
 	}
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d9701036e..7d99ede2ef79 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
 #endif
 
 /*
- *	update_defense_level is called from keventd and from sysctl.
+ *	update_defense_level is called from keventd and from sysctl,
+ *	so it needs to protect itself from softirqs
 */
 static void update_defense_level(void)
 {
@@ -110,6 +111,8 @@ static void update_defense_level(void)
 
 	nomem = (availmem < sysctl_ip_vs_amemthresh);
 
+	local_bh_disable();
+
 	/* drop_entry */
 	spin_lock(&__ip_vs_dropentry_lock);
 	switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
 	if (to_change >= 0)
 		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
 	write_unlock(&__ip_vs_securetcp_lock);
+
+	local_bh_enable();
 }
 
 
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
 			/* Restore the correct value */
 			*valp = val;
 		} else {
-			local_bh_disable();
 			update_defense_level();
-			local_bh_enable();
 		}
 	}
 	return rc;
@@ -2059,7 +2062,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
 	dst->addr = src->addr;
 	dst->port = src->port;
 	dst->fwmark = src->fwmark;
-	strcpy(dst->sched_name, src->scheduler->name);
+	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
 	dst->flags = src->flags;
 	dst->timeout = src->timeout / HZ;
 	dst->netmask = src->netmask;
@@ -2080,6 +2083,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
 		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
 			if (count >= get->num_services)
 				goto out;
+			memset(&entry, 0, sizeof(entry));
 			ip_vs_copy_service(&entry, svc);
 			if (copy_to_user(&uptr->entrytable[count],
 					 &entry, sizeof(entry))) {
@@ -2094,6 +2098,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
 			if (count >= get->num_services)
 				goto out;
+			memset(&entry, 0, sizeof(entry));
 			ip_vs_copy_service(&entry, svc);
 			if (copy_to_user(&uptr->entrytable[count],
 					 &entry, sizeof(entry))) {
@@ -2304,12 +2309,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		memset(&d, 0, sizeof(d));
 		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
 			d[0].state = IP_VS_STATE_MASTER;
-			strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
+			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
 			d[0].syncid = ip_vs_master_syncid;
 		}
 		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
 			d[1].state = IP_VS_STATE_BACKUP;
-			strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
+			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
 			d[1].syncid = ip_vs_backup_syncid;
 		}
 		if (copy_to_user(user, &d, sizeof(d)) != 0)
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a32..574d1f509b46 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 
 	ip_vs_sync_state |= state;
 	if (state == IP_VS_STATE_MASTER) {
-		strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
+		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
 		ip_vs_master_syncid = syncid;
 	} else {
-		strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
+		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
 		ip_vs_backup_syncid = syncid;
 	}
 
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index de21da00057f..a8512a3fd08a 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,6 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 
 #define IP_VS_XMIT(skb, rt)				\
 do {							\
-	nf_reset_debug(skb);				\
 	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
 	(skb)->ip_summed = CHECKSUM_NONE;		\
 	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index cf2e6bcf7973..c9cf8726051d 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -247,3 +248,4 @@ static void __exit drr_exit(void)
 
 module_init(drr_init);
 module_exit(drr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
index 805a16e47de5..5249dbe7c559 100644
--- a/net/ipv4/multipath_random.c
+++ b/net/ipv4/multipath_random.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -126,3 +127,4 @@ static void __exit random_exit(void)
 
 module_init(random_init);
 module_exit(random_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
index 061b6b253982..b6cd2870478f 100644
--- a/net/ipv4/multipath_rr.c
+++ b/net/ipv4/multipath_rr.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -93,3 +94,4 @@ static void __exit rr_exit(void)
 
 module_init(rr_init);
 module_exit(rr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
index c3d2ca1a6781..bd7d75b6abe0 100644
--- a/net/ipv4/multipath_wrandom.c
+++ b/net/ipv4/multipath_wrandom.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -342,3 +343,4 @@ static void __exit wrandom_exit(void)
 
 module_init(wrandom_init);
 module_exit(wrandom_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index df79f5ed6a0a..fa1634256680 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex);
 
 #define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
 #define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
 struct arpt_table_info {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 3dbddd062605..01e1b58322a9 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -26,7 +26,6 @@
 #include <net/checksum.h>
 #include <net/udp.h>
 
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
 
@@ -42,7 +41,7 @@ static char *conns[] = { "DATA ", "MESG ", "INDEX " };
 
 /* This is slow, but it's simple. --RR */
 static char amanda_buffer[65536];
-static DECLARE_LOCK(amanda_buffer_lock);
+static DEFINE_SPINLOCK(amanda_buffer_lock);
 
 unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
 				   enum ip_conntrack_info ctinfo,
@@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb,
 		return NF_ACCEPT;
 	}
 
-	LOCK_BH(&amanda_buffer_lock);
+	spin_lock_bh(&amanda_buffer_lock);
 	skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
 	data = amanda_buffer;
 	data_limit = amanda_buffer + (*pskb)->len - dataoff;
@@ -102,14 +101,13 @@ static int help(struct sk_buff **pskb,
 		if (port == 0 || len > 5)
 			break;
 
-		exp = ip_conntrack_expect_alloc();
+		exp = ip_conntrack_expect_alloc(ct);
 		if (exp == NULL) {
 			ret = NF_DROP;
 			goto out;
 		}
 
 		exp->expectfn = NULL;
-		exp->master = ct;
 
 		exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
 		exp->tuple.src.u.tcp.port = 0;
@@ -127,14 +125,13 @@ static int help(struct sk_buff **pskb,
 			ret = ip_nat_amanda_hook(pskb, ctinfo,
 						 tmp - amanda_buffer,
 						 len, exp);
-		else if (ip_conntrack_expect_related(exp) != 0) {
-			ip_conntrack_expect_free(exp);
+		else if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
-		}
+		ip_conntrack_expect_put(exp);
 	}
 
 out:
-	UNLOCK_BH(&amanda_buffer_lock);
+	spin_unlock_bh(&amanda_buffer_lock);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 09e824622977..a7f0c821a9b2 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -38,10 +38,10 @@
 #include <linux/percpu.h>
 #include <linux/moduleparam.h>
 
-/* This rwlock protects the main hash table, protocol/helper/expected
+/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers*/
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -57,7 +57,7 @@
 #define DEBUGP(format, args...)
 #endif
 
-DECLARE_RWLOCK(ip_conntrack_lock);
+DEFINE_RWLOCK(ip_conntrack_lock);
 
 /* ip_conntrack_standalone needs this */
 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
@@ -137,19 +137,12 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
 
 
 /* ip_conntrack_expect helper functions */
-static void destroy_expect(struct ip_conntrack_expect *exp)
-{
-	ip_conntrack_put(exp->master);
-	IP_NF_ASSERT(!timer_pending(&exp->timeout));
-	kmem_cache_free(ip_conntrack_expect_cachep, exp);
-	CONNTRACK_STAT_INC(expect_delete);
-}
-
 static void unlink_expect(struct ip_conntrack_expect *exp)
 {
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
+	IP_NF_ASSERT(!timer_pending(&exp->timeout));
 	list_del(&exp->list);
-	/* Logically in destroy_expect, but we hold the lock here. */
+	CONNTRACK_STAT_INC(expect_delete);
 	exp->master->expecting--;
 }
 
@@ -157,10 +150,10 @@ static void expectation_timed_out(unsigned long ul_expect)
 {
 	struct ip_conntrack_expect *exp = (void *)ul_expect;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	unlink_expect(exp);
-	WRITE_UNLOCK(&ip_conntrack_lock);
-	destroy_expect(exp);
+	write_unlock_bh(&ip_conntrack_lock);
+	ip_conntrack_expect_put(exp);
 }
 
 /* If an expectation for this connection is found, it gets delete from
@@ -198,7 +191,7 @@ static void remove_expectations(struct ip_conntrack *ct)
 	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
 		if (i->master == ct && del_timer(&i->timeout)) {
 			unlink_expect(i);
-			destroy_expect(i);
+			ip_conntrack_expect_put(i);
 		}
 	}
 }
@@ -209,7 +202,7 @@ clean_from_lists(struct ip_conntrack *ct)
 	unsigned int ho, hr;
 
 	DEBUGP("clean_from_lists(%p)\n", ct);
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
 
 	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -240,7 +233,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	if (ip_conntrack_destroyed)
 		ip_conntrack_destroyed(ct);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* Expectations will have been removed in clean_from_lists,
 	 * except TFTP can create an expectation on the first packet,
 	 * before connection is in the list, so we need to clean here,
@@ -254,7 +247,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	}
 
 	CONNTRACK_STAT_INC(delete);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	if (ct->master)
 		ip_conntrack_put(ct->master);
@@ -268,12 +261,12 @@ static void death_by_timeout(unsigned long ul_conntrack)
 {
 	struct ip_conntrack *ct = (void *)ul_conntrack;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* Inside lock so preempt is disabled on module removal path.
 	 * Otherwise we can get spurious warnings. */
 	CONNTRACK_STAT_INC(delete_list);
 	clean_from_lists(ct);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 	ip_conntrack_put(ct);
 }
 
@@ -282,7 +275,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
 		    const struct ip_conntrack_tuple *tuple,
 		    const struct ip_conntrack *ignored_conntrack)
 {
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	ASSERT_READ_LOCK(&ip_conntrack_lock);
 	return tuplehash_to_ctrack(i) != ignored_conntrack
 	       && ip_ct_tuple_equal(tuple, &i->tuple);
 }
@@ -294,7 +287,7 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 	struct ip_conntrack_tuple_hash *h;
 	unsigned int hash = hash_conntrack(tuple);
 
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	ASSERT_READ_LOCK(&ip_conntrack_lock);
 	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
 		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
 			CONNTRACK_STAT_INC(found);
@@ -313,11 +306,11 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
 {
 	struct ip_conntrack_tuple_hash *h;
 
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	h = __ip_conntrack_find(tuple, ignored_conntrack);
 	if (h)
 		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 
 	return h;
 }
@@ -352,7 +345,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 	IP_NF_ASSERT(!is_confirmed(ct));
 	DEBUGP("Confirming conntrack %p\n", ct);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 
 	/* See if there's one in the list already, including reverse:
 	   NAT could have grabbed it without realizing, since we're
@@ -380,12 +373,12 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		atomic_inc(&ct->ct_general.use);
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
 		CONNTRACK_STAT_INC(insert);
-		WRITE_UNLOCK(&ip_conntrack_lock);
+		write_unlock_bh(&ip_conntrack_lock);
 		return NF_ACCEPT;
 	}
 
 	CONNTRACK_STAT_INC(insert_failed);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	return NF_DROP;
 }
@@ -398,9 +391,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
 {
 	struct ip_conntrack_tuple_hash *h;
 
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	h = __ip_conntrack_find(tuple, ignored_conntrack);
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 
 	return h != NULL;
 }
@@ -419,13 +412,13 @@ static int early_drop(struct list_head *chain)
 	struct ip_conntrack *ct = NULL;
 	int dropped = 0;
 
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
 	if (h) {
 		ct = tuplehash_to_ctrack(h);
 		atomic_inc(&ct->ct_general.use);
 	}
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 
 	if (!ct)
 		return dropped;
@@ -508,7 +501,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	exp = find_expectation(tuple);
 
 	if (exp) {
@@ -517,9 +510,14 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		/* Welcome, Mr. Bond. We've been expecting you... */
 		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
 		conntrack->master = exp->master;
-#if CONFIG_IP_NF_CONNTRACK_MARK
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
 		conntrack->mark = exp->master->mark;
 #endif
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+		/* this is ugly, but there is no other place where to put it */
+		conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
 	} else {
@@ -532,12 +530,12 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
 
 	atomic_inc(&ip_conntrack_count);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	if (exp) {
 		if (exp->expectfn)
 			exp->expectfn(conntrack, exp);
-		destroy_expect(exp);
+		ip_conntrack_expect_put(exp);
 	}
 
 	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
@@ -723,20 +721,20 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 {
 	struct ip_conntrack_expect *i;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* choose the the oldest expectation to evict */
 	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
 		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
 			unlink_expect(i);
-			WRITE_UNLOCK(&ip_conntrack_lock);
-			destroy_expect(i);
+			write_unlock_bh(&ip_conntrack_lock);
+			ip_conntrack_expect_put(i);
 			return;
 		}
 	}
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 }
 
-struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
 {
 	struct ip_conntrack_expect *new;
 
@@ -745,30 +743,31 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
 		DEBUGP("expect_related: OOM allocating expect\n");
 		return NULL;
 	}
-	new->master = NULL;
+	new->master = me;
+	atomic_inc(&new->master->ct_general.use);
+	atomic_set(&new->use, 1);
 	return new;
 }
 
-void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
 {
-	kmem_cache_free(ip_conntrack_expect_cachep, expect);
+	if (atomic_dec_and_test(&exp->use)) {
+		ip_conntrack_put(exp->master);
+		kmem_cache_free(ip_conntrack_expect_cachep, exp);
+	}
 }
 
 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 {
-	atomic_inc(&exp->master->ct_general.use);
+	atomic_inc(&exp->use);
 	exp->master->expecting++;
 	list_add(&exp->list, &ip_conntrack_expect_list);
 
-	if (exp->master->helper->timeout) {
-		init_timer(&exp->timeout);
-		exp->timeout.data = (unsigned long)exp;
-		exp->timeout.function = expectation_timed_out;
-		exp->timeout.expires
-			= jiffies + exp->master->helper->timeout * HZ;
-		add_timer(&exp->timeout);
-	} else
-		exp->timeout.function = NULL;
+	init_timer(&exp->timeout);
+	exp->timeout.data = (unsigned long)exp;
+	exp->timeout.function = expectation_timed_out;
+	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
+	add_timer(&exp->timeout);
 
 	CONNTRACK_STAT_INC(expect_create);
 }
@@ -782,7 +781,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
 		if (i->master == master) {
 			if (del_timer(&i->timeout)) {
 				unlink_expect(i);
-				destroy_expect(i);
+				ip_conntrack_expect_put(i);
 			}
 			break;
 		}
@@ -808,14 +807,12 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
 	DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
 		if (expect_matches(i, expect)) {
 			/* Refresh timer: if it's dying, ignore.. */
 			if (refresh_timer(i)) {
 				ret = 0;
-				/* We don't need the one they've given us. */
-				ip_conntrack_expect_free(expect);
 				goto out;
 			}
 		} else if (expect_clash(i, expect)) {
@@ -832,7 +829,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 	ip_conntrack_expect_insert(expect);
 	ret = 0;
 out:
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 	return ret;
 }
 
@@ -841,7 +838,7 @@ out:
 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 			      const struct ip_conntrack_tuple *newreply)
 {
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* Should be unconfirmed, so not in hash table yet */
 	IP_NF_ASSERT(!is_confirmed(conntrack));
 
@@ -851,15 +848,15 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 	if (!conntrack->master && conntrack->expecting == 0)
 		conntrack->helper = ip_ct_find_helper(newreply);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 }
 
 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 {
 	BUG_ON(me->timeout == 0);
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	list_prepend(&helpers, me);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	return 0;
 }
@@ -878,14 +875,14 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 	struct ip_conntrack_expect *exp, *tmp;
 
 	/* Need write lock here, to delete helper. */
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	LIST_DELETE(&helpers, me);
 
 	/* Get rid of expectations */
 	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
 		if (exp->master->helper == me && del_timer(&exp->timeout)) {
 			unlink_expect(exp);
-			destroy_expect(exp);
+			ip_conntrack_expect_put(exp);
 		}
 	}
 	/* Get rid of expecteds, set helpers to NULL. */
@@ -893,7 +890,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 	for (i = 0; i < ip_conntrack_htable_size; i++)
 		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
 			    struct ip_conntrack_tuple_hash *, me);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	/* Someone could be still looking at the helper in a bh. */
 	synchronize_net();
@@ -925,14 +922,14 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
 		ct->timeout.expires = extra_jiffies;
 		ct_add_counters(ct, ctinfo, skb);
 	} else {
-		WRITE_LOCK(&ip_conntrack_lock);
+		write_lock_bh(&ip_conntrack_lock);
 		/* Need del_timer for race avoidance (may already be dying). */
 		if (del_timer(&ct->timeout)) {
 			ct->timeout.expires = jiffies + extra_jiffies;
 			add_timer(&ct->timeout);
 		}
 		ct_add_counters(ct, ctinfo, skb);
-		WRITE_UNLOCK(&ip_conntrack_lock);
+		write_unlock_bh(&ip_conntrack_lock);
 	}
 }
 
@@ -940,10 +937,6 @@
 struct sk_buff *
 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	unsigned int olddebug = skb->nf_debug;
-#endif
-
 	skb_orphan(skb);
 
 	local_bh_disable();
@@ -953,12 +946,7 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 	if (skb) {
 		ip_send_check(skb->nh.iph);
 		skb->nfcache |= NFC_ALTERED;
-#ifdef CONFIG_NETFILTER_DEBUG
-		/* Packet path as if nothing had happened. */
-		skb->nf_debug = olddebug;
-#endif
 	}
-
 	return skb;
 }
 
@@ -997,7 +985,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
 {
 	struct ip_conntrack_tuple_hash *h = NULL;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
 		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
 				struct ip_conntrack_tuple_hash *, iter, data);
@@ -1009,7 +997,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
 				struct ip_conntrack_tuple_hash *, iter, data);
 	if (h)
 		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	return h;
 }
@@ -1124,6 +1112,9 @@ void ip_conntrack_cleanup(void)
 		schedule();
 		goto i_see_dead_people;
 	}
+	/* wait until all references to ip_conntrack_untracked are dropped */
+	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
+		schedule();
 
 	kmem_cache_destroy(ip_conntrack_cachep);
 	kmem_cache_destroy(ip_conntrack_expect_cachep);
@@ -1201,14 +1192,14 @@ int __init ip_conntrack_init(void)
 	}
 
 	/* Don't NEED lock here, but good form anyway. */
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	for (i = 0; i < MAX_IP_CT_PROTO; i++)
 		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
 	/* Sew in builtin protocols. */
 	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
 	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
 	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	for (i = 0; i < ip_conntrack_htable_size; i++)
 		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index dd86503aa788..7a3b773be3f9 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -16,7 +16,6 @@
 #include <net/checksum.h>
 #include <net/tcp.h>
 
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
 #include <linux/moduleparam.h>
@@ -28,7 +27,7 @@ MODULE_DESCRIPTION("ftp connection tracking helper");
 /* This is slow, but it's simple. --RR */
 static char ftp_buffer[65536];
 
-static DECLARE_LOCK(ip_ftp_lock);
+static DEFINE_SPINLOCK(ip_ftp_lock);
 
 #define MAX_PORTS 8
 static int ports[MAX_PORTS];
@@ -319,7 +318,7 @@ static int help(struct sk_buff **pskb,
 	}
 	datalen = (*pskb)->len - dataoff;
 
-	LOCK_BH(&ip_ftp_lock);
+	spin_lock_bh(&ip_ftp_lock);
 	fb_ptr = skb_header_pointer(*pskb, dataoff,
 				    (*pskb)->len - dataoff, ftp_buffer);
 	BUG_ON(fb_ptr == NULL);
@@ -377,7 +376,7 @@ static int help(struct sk_buff **pskb,
 		 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
 
 	/* Allocate expectation which will be inserted */
-	exp = ip_conntrack_expect_alloc();
+	exp = ip_conntrack_expect_alloc(ct);
 	if (exp == NULL) {
 		ret = NF_DROP;
 		goto out;
@@ -404,8 +403,7 @@ static int help(struct sk_buff **pskb,
 	   networks, or the packet filter itself). */
 	if (!loose) {
 		ret = NF_ACCEPT;
-		ip_conntrack_expect_free(exp);
-		goto out_update_nl;
+		goto out_put_expect;
 	}
 	exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
 				  | (array[2] << 8) | array[3]);
@@ -420,7 +418,6 @@ static int help(struct sk_buff **pskb,
 			  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 
 	exp->expectfn = NULL;
-	exp->master = ct;
 
 	/* Now, NAT might want to mangle the packet, and register the
 	 * (possibly changed) expectation itself. */
@@ -429,20 +426,22 @@ static int help(struct sk_buff **pskb,
 				      matchoff, matchlen, exp, &seq);
 	else {
 		/* Can't expect this? Best to drop packet now. */
-		if (ip_conntrack_expect_related(exp) != 0) {
-			ip_conntrack_expect_free(exp);
+		if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
-		} else
+		else
 			ret = NF_ACCEPT;
 	}
 
+out_put_expect:
+	ip_conntrack_expect_put(exp);
+
out_update_nl:
 	/* Now if this ends in \n, update ftp info. Seq may have been
 	 * adjusted by NAT code. */
 	if (ends_in_nl)
 		update_nl_seq(seq, ct_ftp_info,dir);
  out:
-	UNLOCK_BH(&ip_ftp_lock);
+	spin_unlock_bh(&ip_ftp_lock);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 33cc7348b6ee..4a28f297d502 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -29,7 +29,6 @@
29#include <net/checksum.h> 29#include <net/checksum.h>
30#include <net/tcp.h> 30#include <net/tcp.h>
31 31
32#include <linux/netfilter_ipv4/lockhelp.h>
33#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 32#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
34#include <linux/netfilter_ipv4/ip_conntrack_irc.h> 33#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
35#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
@@ -41,7 +40,7 @@ static int max_dcc_channels = 8;
41static unsigned int dcc_timeout = 300; 40static unsigned int dcc_timeout = 300;
42/* This is slow, but it's simple. --RR */ 41/* This is slow, but it's simple. --RR */
43static char irc_buffer[65536]; 42static char irc_buffer[65536];
44static DECLARE_LOCK(irc_buffer_lock); 43static DEFINE_SPINLOCK(irc_buffer_lock);
45 44
46unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, 45unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
47 enum ip_conntrack_info ctinfo, 46 enum ip_conntrack_info ctinfo,
@@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb,
141 if (dataoff >= (*pskb)->len) 140 if (dataoff >= (*pskb)->len)
142 return NF_ACCEPT; 141 return NF_ACCEPT;
143 142
144 LOCK_BH(&irc_buffer_lock); 143 spin_lock_bh(&irc_buffer_lock);
145 ib_ptr = skb_header_pointer(*pskb, dataoff, 144 ib_ptr = skb_header_pointer(*pskb, dataoff,
146 (*pskb)->len - dataoff, irc_buffer); 145 (*pskb)->len - dataoff, irc_buffer);
147 BUG_ON(ib_ptr == NULL); 146 BUG_ON(ib_ptr == NULL);
@@ -198,7 +197,7 @@ static int help(struct sk_buff **pskb,
198 continue; 197 continue;
199 } 198 }
200 199
201 exp = ip_conntrack_expect_alloc(); 200 exp = ip_conntrack_expect_alloc(ct);
202 if (exp == NULL) { 201 if (exp == NULL) {
203 ret = NF_DROP; 202 ret = NF_DROP;
204 goto out; 203 goto out;
@@ -222,22 +221,20 @@ static int help(struct sk_buff **pskb,
222 { { 0, { 0 } }, 221 { { 0, { 0 } },
223 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); 222 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
224 exp->expectfn = NULL; 223 exp->expectfn = NULL;
225 exp->master = ct;
226 if (ip_nat_irc_hook) 224 if (ip_nat_irc_hook)
227 ret = ip_nat_irc_hook(pskb, ctinfo, 225 ret = ip_nat_irc_hook(pskb, ctinfo,
228 addr_beg_p - ib_ptr, 226 addr_beg_p - ib_ptr,
229 addr_end_p - addr_beg_p, 227 addr_end_p - addr_beg_p,
230 exp); 228 exp);
231 else if (ip_conntrack_expect_related(exp) != 0) { 229 else if (ip_conntrack_expect_related(exp) != 0)
232 ip_conntrack_expect_free(exp);
233 ret = NF_DROP; 230 ret = NF_DROP;
234 } 231 ip_conntrack_expect_put(exp);
235 goto out; 232 goto out;
236 } /* for .. NUM_DCCPROTO */ 233 } /* for .. NUM_DCCPROTO */
237 } /* while data < ... */ 234 } /* while data < ... */
238 235
239 out: 236 out:
240 UNLOCK_BH(&irc_buffer_lock); 237 spin_unlock_bh(&irc_buffer_lock);
241 return ret; 238 return ret;
242} 239}
243 240
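The IRC helper also shows the mechanical half of the merge: with lockhelp.h deleted, DECLARE_LOCK()/LOCK_BH()/UNLOCK_BH() map one-for-one onto the stock spinlock API. A sketch of the converted idiom, with hypothetical names standing in for the per-helper scratch buffer:

    #include <linux/spinlock.h>

    static char scratch[65536];                 /* shared buffer, touched in softirq */
    static DEFINE_SPINLOCK(scratch_lock);       /* was: static DECLARE_LOCK(...) */

    static void fill_scratch(void)
    {
            spin_lock_bh(&scratch_lock);        /* was: LOCK_BH(&scratch_lock) */
            /* ... linearize packet data into scratch ... */
            spin_unlock_bh(&scratch_lock);      /* was: UNLOCK_BH(&scratch_lock) */
    }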
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index ff8c34a860ff..31d75390bf12 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/netfilter_ipv4/ip_conntrack.h> 27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29#include <linux/netfilter_ipv4/lockhelp.h>
30 29
31#if 0 30#if 0
32#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) 31#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
@@ -35,7 +34,7 @@
35#endif 34#endif
36 35
37/* Protects conntrack->proto.sctp */ 36/* Protects conntrack->proto.sctp */
38static DECLARE_RWLOCK(sctp_lock); 37static DEFINE_RWLOCK(sctp_lock);
39 38
40/* FIXME: Examine ipfilter's timeouts and conntrack transitions more 39/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
41 closely. They're more complex. --RR 40 closely. They're more complex. --RR
@@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s,
199 DEBUGP(__FUNCTION__); 198 DEBUGP(__FUNCTION__);
200 DEBUGP("\n"); 199 DEBUGP("\n");
201 200
202 READ_LOCK(&sctp_lock); 201 read_lock_bh(&sctp_lock);
203 state = conntrack->proto.sctp.state; 202 state = conntrack->proto.sctp.state;
204 READ_UNLOCK(&sctp_lock); 203 read_unlock_bh(&sctp_lock);
205 204
206 return seq_printf(s, "%s ", sctp_conntrack_names[state]); 205 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
207} 206}
@@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
343 342
344 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; 343 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
345 for_each_sctp_chunk (skb, sch, _sch, offset, count) { 344 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
346 WRITE_LOCK(&sctp_lock); 345 write_lock_bh(&sctp_lock);
347 346
348 /* Special cases of Verification tag check (Sec 8.5.1) */ 347 /* Special cases of Verification tag check (Sec 8.5.1) */
349 if (sch->type == SCTP_CID_INIT) { 348 if (sch->type == SCTP_CID_INIT) {
350 /* Sec 8.5.1 (A) */ 349 /* Sec 8.5.1 (A) */
351 if (sh->vtag != 0) { 350 if (sh->vtag != 0) {
352 WRITE_UNLOCK(&sctp_lock); 351 write_unlock_bh(&sctp_lock);
353 return -1; 352 return -1;
354 } 353 }
355 } else if (sch->type == SCTP_CID_ABORT) { 354 } else if (sch->type == SCTP_CID_ABORT) {
@@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
357 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) 356 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
358 && !(sh->vtag == conntrack->proto.sctp.vtag 357 && !(sh->vtag == conntrack->proto.sctp.vtag
359 [1 - CTINFO2DIR(ctinfo)])) { 358 [1 - CTINFO2DIR(ctinfo)])) {
360 WRITE_UNLOCK(&sctp_lock); 359 write_unlock_bh(&sctp_lock);
361 return -1; 360 return -1;
362 } 361 }
363 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { 362 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
@@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
366 && !(sh->vtag == conntrack->proto.sctp.vtag 365 && !(sh->vtag == conntrack->proto.sctp.vtag
367 [1 - CTINFO2DIR(ctinfo)] 366 [1 - CTINFO2DIR(ctinfo)]
368 && (sch->flags & 1))) { 367 && (sch->flags & 1))) {
369 WRITE_UNLOCK(&sctp_lock); 368 write_unlock_bh(&sctp_lock);
370 return -1; 369 return -1;
371 } 370 }
372 } else if (sch->type == SCTP_CID_COOKIE_ECHO) { 371 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
373 /* Sec 8.5.1 (D) */ 372 /* Sec 8.5.1 (D) */
374 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { 373 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
375 WRITE_UNLOCK(&sctp_lock); 374 write_unlock_bh(&sctp_lock);
376 return -1; 375 return -1;
377 } 376 }
378 } 377 }
@@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
384 if (newconntrack == SCTP_CONNTRACK_MAX) { 383 if (newconntrack == SCTP_CONNTRACK_MAX) {
385 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", 384 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
386 CTINFO2DIR(ctinfo), sch->type, oldsctpstate); 385 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
387 WRITE_UNLOCK(&sctp_lock); 386 write_unlock_bh(&sctp_lock);
388 return -1; 387 return -1;
389 } 388 }
390 389
@@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
396 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), 395 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
397 sizeof(_inithdr), &_inithdr); 396 sizeof(_inithdr), &_inithdr);
398 if (ih == NULL) { 397 if (ih == NULL) {
399 WRITE_UNLOCK(&sctp_lock); 398 write_unlock_bh(&sctp_lock);
400 return -1; 399 return -1;
401 } 400 }
402 DEBUGP("Setting vtag %x for dir %d\n", 401 DEBUGP("Setting vtag %x for dir %d\n",
@@ -405,7 +404,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
405 } 404 }
406 405
407 conntrack->proto.sctp.state = newconntrack; 406 conntrack->proto.sctp.state = newconntrack;
408 WRITE_UNLOCK(&sctp_lock); 407 write_unlock_bh(&sctp_lock);
409 } 408 }
410 409
411 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); 410 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
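Note that the rwlock conversion in ip_conntrack_proto_sctp.c (and in the TCP tracker below) is not a pure rename: the old READ_LOCK()/WRITE_LOCK() wrappers carried no _bh suffix, while every replacement is the bottom-half-disabling variant. A sketch of the new idiom, assuming a state word like conntrack->proto.sctp.state as the protected object:

    #include <linux/spinlock.h>

    static DEFINE_RWLOCK(state_lock);           /* was: static DECLARE_RWLOCK(...) */
    static int protected_state;                 /* stand-in for proto.sctp.state */

    static int read_state(void)
    {
            int state;

            read_lock_bh(&state_lock);          /* _bh: also keeps softirqs out */
            state = protected_state;
            read_unlock_bh(&state_lock);
            return state;
    }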
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 721ddbf522b4..809dfed766d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -36,7 +36,6 @@
36#include <linux/netfilter_ipv4.h> 36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_conntrack.h> 37#include <linux/netfilter_ipv4/ip_conntrack.h>
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/lockhelp.h>
40 39
41#if 0 40#if 0
42#define DEBUGP printk 41#define DEBUGP printk
@@ -46,7 +45,7 @@
46#endif 45#endif
47 46
48/* Protects conntrack->proto.tcp */ 47/* Protects conntrack->proto.tcp */
49static DECLARE_RWLOCK(tcp_lock); 48static DEFINE_RWLOCK(tcp_lock);
50 49
51/* "Be conservative in what you do, 50/* "Be conservative in what you do,
52 be liberal in what you accept from others." 51 be liberal in what you accept from others."
@@ -330,9 +329,9 @@ static int tcp_print_conntrack(struct seq_file *s,
330{ 329{
331 enum tcp_conntrack state; 330 enum tcp_conntrack state;
332 331
333 READ_LOCK(&tcp_lock); 332 read_lock_bh(&tcp_lock);
334 state = conntrack->proto.tcp.state; 333 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock); 334 read_unlock_bh(&tcp_lock);
336 335
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]); 336 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338} 337}
@@ -738,14 +737,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
738 737
739 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); 738 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
740 739
741 WRITE_LOCK(&tcp_lock); 740 write_lock_bh(&tcp_lock);
742 /* 741 /*
743 * We have to worry for the ack in the reply packet only... 742 * We have to worry for the ack in the reply packet only...
744 */ 743 */
745 if (after(end, conntrack->proto.tcp.seen[dir].td_end)) 744 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
746 conntrack->proto.tcp.seen[dir].td_end = end; 745 conntrack->proto.tcp.seen[dir].td_end = end;
747 conntrack->proto.tcp.last_end = end; 746 conntrack->proto.tcp.last_end = end;
748 WRITE_UNLOCK(&tcp_lock); 747 write_unlock_bh(&tcp_lock);
749 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " 748 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
750 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 749 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
751 sender->td_end, sender->td_maxend, sender->td_maxwin, 750 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -857,7 +856,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
857 sizeof(_tcph), &_tcph); 856 sizeof(_tcph), &_tcph);
858 BUG_ON(th == NULL); 857 BUG_ON(th == NULL);
859 858
860 WRITE_LOCK(&tcp_lock); 859 write_lock_bh(&tcp_lock);
861 old_state = conntrack->proto.tcp.state; 860 old_state = conntrack->proto.tcp.state;
862 dir = CTINFO2DIR(ctinfo); 861 dir = CTINFO2DIR(ctinfo);
863 index = get_conntrack_index(th); 862 index = get_conntrack_index(th);
@@ -879,7 +878,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
879 * that the client cannot but retransmit its SYN and 878 * that the client cannot but retransmit its SYN and
880 * thus initiate a clean new session. 879 * thus initiate a clean new session.
881 */ 880 */
882 WRITE_UNLOCK(&tcp_lock); 881 write_unlock_bh(&tcp_lock);
883 if (LOG_INVALID(IPPROTO_TCP)) 882 if (LOG_INVALID(IPPROTO_TCP))
884 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 883 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
885 "ip_ct_tcp: killing out of sync session "); 884 "ip_ct_tcp: killing out of sync session ");
@@ -894,7 +893,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
894 conntrack->proto.tcp.last_end = 893 conntrack->proto.tcp.last_end =
895 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); 894 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
896 895
897 WRITE_UNLOCK(&tcp_lock); 896 write_unlock_bh(&tcp_lock);
898 if (LOG_INVALID(IPPROTO_TCP)) 897 if (LOG_INVALID(IPPROTO_TCP))
899 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 898 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
900 "ip_ct_tcp: invalid packet ignored "); 899 "ip_ct_tcp: invalid packet ignored ");
@@ -904,7 +903,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
904 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", 903 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
905 dir, get_conntrack_index(th), 904 dir, get_conntrack_index(th),
906 old_state); 905 old_state);
907 WRITE_UNLOCK(&tcp_lock); 906 write_unlock_bh(&tcp_lock);
908 if (LOG_INVALID(IPPROTO_TCP)) 907 if (LOG_INVALID(IPPROTO_TCP))
909 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 908 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
910 "ip_ct_tcp: invalid state "); 909 "ip_ct_tcp: invalid state ");
@@ -918,13 +917,13 @@ static int tcp_packet(struct ip_conntrack *conntrack,
918 conntrack->proto.tcp.seen[dir].td_end)) { 917 conntrack->proto.tcp.seen[dir].td_end)) {
919 /* Attempt to reopen a closed connection. 918 /* Attempt to reopen a closed connection.
920 * Delete this connection and look up again. */ 919 * Delete this connection and look up again. */
921 WRITE_UNLOCK(&tcp_lock); 920 write_unlock_bh(&tcp_lock);
922 if (del_timer(&conntrack->timeout)) 921 if (del_timer(&conntrack->timeout))
923 conntrack->timeout.function((unsigned long) 922 conntrack->timeout.function((unsigned long)
924 conntrack); 923 conntrack);
925 return -NF_REPEAT; 924 return -NF_REPEAT;
926 } else { 925 } else {
927 WRITE_UNLOCK(&tcp_lock); 926 write_unlock_bh(&tcp_lock);
928 if (LOG_INVALID(IPPROTO_TCP)) 927 if (LOG_INVALID(IPPROTO_TCP))
929 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 928 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
930 "ip_ct_tcp: invalid SYN"); 929 "ip_ct_tcp: invalid SYN");
@@ -949,7 +948,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
949 948
950 if (!tcp_in_window(&conntrack->proto.tcp, dir, index, 949 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
951 skb, iph, th)) { 950 skb, iph, th)) {
952 WRITE_UNLOCK(&tcp_lock); 951 write_unlock_bh(&tcp_lock);
953 return -NF_ACCEPT; 952 return -NF_ACCEPT;
954 } 953 }
955 in_window: 954 in_window:
@@ -972,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
972 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans 971 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
973 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans 972 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
974 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; 973 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
975 WRITE_UNLOCK(&tcp_lock); 974 write_unlock_bh(&tcp_lock);
976 975
977 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { 976 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
978 /* If only reply is a RST, we can consider ourselves not to 977 /* If only reply is a RST, we can consider ourselves not to
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 5bc28a224623..8c1eaba098d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,6 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
120 * and moreover root might send raw packets. 120 * and moreover root might send raw packets.
121 * FIXME: Source route IP option packets --RR */ 121 * FIXME: Source route IP option packets --RR */
122 if (hooknum == NF_IP_PRE_ROUTING 122 if (hooknum == NF_IP_PRE_ROUTING
123 && skb->ip_summed != CHECKSUM_UNNECESSARY
123 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, 124 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
124 skb->ip_summed == CHECKSUM_HW ? skb->csum 125 skb->ip_summed == CHECKSUM_HW ? skb->csum
125 : skb_checksum(skb, iph->ihl*4, udplen, 0))) { 126 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
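The udp_error() hunk adds an early CHECKSUM_UNNECESSARY test so conntrack stops recomputing a checksum the driver already verified. Restated as a standalone predicate for clarity (the helper name is invented; the skb fields are the 2.6-era ones used above):

    /* Nonzero if the UDP checksum must be treated as bad. */
    static int udp_csum_bad(struct sk_buff *skb, struct iphdr *iph,
                            unsigned int udplen, unsigned int hooknum)
    {
            if (hooknum != NF_IP_PRE_ROUTING)
                    return 0;                   /* only verified pre-routing */
            if (skb->ip_summed == CHECKSUM_UNNECESSARY)
                    return 0;                   /* hardware already verified it */
            return csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
                                     skb->ip_summed == CHECKSUM_HW
                                             ? skb->csum
                                             : skb_checksum(skb, iph->ihl * 4,
                                                            udplen, 0)) != 0;
    }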
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 46ca45f74d85..61798c46e91d 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -28,8 +28,8 @@
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
119 119
120static void *ct_seq_start(struct seq_file *seq, loff_t *pos) 120static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
121{ 121{
122 READ_LOCK(&ip_conntrack_lock); 122 read_lock_bh(&ip_conntrack_lock);
123 return ct_get_idx(seq, *pos); 123 return ct_get_idx(seq, *pos);
124} 124}
125 125
@@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
131 131
132static void ct_seq_stop(struct seq_file *s, void *v) 132static void ct_seq_stop(struct seq_file *s, void *v)
133{ 133{
134 READ_UNLOCK(&ip_conntrack_lock); 134 read_unlock_bh(&ip_conntrack_lock);
135} 135}
136 136
137static int ct_seq_show(struct seq_file *s, void *v) 137static int ct_seq_show(struct seq_file *s, void *v)
@@ -140,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); 140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
141 struct ip_conntrack_protocol *proto; 141 struct ip_conntrack_protocol *proto;
142 142
143 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 143 ASSERT_READ_LOCK(&ip_conntrack_lock);
144 IP_NF_ASSERT(conntrack); 144 IP_NF_ASSERT(conntrack);
145 145
146 /* we only want to print DIR_ORIGINAL */ 146 /* we only want to print DIR_ORIGINAL */
@@ -239,7 +239,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
239 239
240 /* strange seq_file api calls stop even if we fail, 240 /* strange seq_file api calls stop even if we fail,
241 * thus we need to grab lock since stop unlocks */ 241 * thus we need to grab lock since stop unlocks */
242 READ_LOCK(&ip_conntrack_lock); 242 read_lock_bh(&ip_conntrack_lock);
243 243
244 if (list_empty(e)) 244 if (list_empty(e))
245 return NULL; 245 return NULL;
@@ -256,6 +256,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
256{ 256{
257 struct list_head *e = v; 257 struct list_head *e = v;
258 258
259 ++*pos;
259 e = e->next; 260 e = e->next;
260 261
261 if (e == &ip_conntrack_expect_list) 262 if (e == &ip_conntrack_expect_list)
@@ -266,7 +267,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
266 267
267static void exp_seq_stop(struct seq_file *s, void *v) 268static void exp_seq_stop(struct seq_file *s, void *v)
268{ 269{
269 READ_UNLOCK(&ip_conntrack_lock); 270 read_unlock_bh(&ip_conntrack_lock);
270} 271}
271 272
272static int exp_seq_show(struct seq_file *s, void *v) 273static int exp_seq_show(struct seq_file *s, void *v)
@@ -431,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
431 const struct net_device *out, 432 const struct net_device *out,
432 int (*okfn)(struct sk_buff *)) 433 int (*okfn)(struct sk_buff *))
433{ 434{
435#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
436 /* Previously seen (loopback)? Ignore. Do this before
437 fragment check. */
438 if ((*pskb)->nfct)
439 return NF_ACCEPT;
440#endif
441
434 /* Gather fragments. */ 442 /* Gather fragments. */
435 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 443 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
436 *pskb = ip_ct_gather_frags(*pskb, 444 *pskb = ip_ct_gather_frags(*pskb,
@@ -920,22 +928,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
920{ 928{
921 int ret = 0; 929 int ret = 0;
922 930
923 WRITE_LOCK(&ip_conntrack_lock); 931 write_lock_bh(&ip_conntrack_lock);
924 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { 932 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
925 ret = -EBUSY; 933 ret = -EBUSY;
926 goto out; 934 goto out;
927 } 935 }
928 ip_ct_protos[proto->proto] = proto; 936 ip_ct_protos[proto->proto] = proto;
929 out: 937 out:
930 WRITE_UNLOCK(&ip_conntrack_lock); 938 write_unlock_bh(&ip_conntrack_lock);
931 return ret; 939 return ret;
932} 940}
933 941
934void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) 942void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
935{ 943{
936 WRITE_LOCK(&ip_conntrack_lock); 944 write_lock_bh(&ip_conntrack_lock);
937 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; 945 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
938 WRITE_UNLOCK(&ip_conntrack_lock); 946 write_unlock_bh(&ip_conntrack_lock);
939 947
940 /* Somebody could be still looking at the proto in bh. */ 948 /* Somebody could be still looking at the proto in bh. */
941 synchronize_net(); 949 synchronize_net();
@@ -977,7 +985,7 @@ EXPORT_SYMBOL(ip_ct_refresh_acct);
977EXPORT_SYMBOL(ip_ct_protos); 985EXPORT_SYMBOL(ip_ct_protos);
978EXPORT_SYMBOL(ip_ct_find_proto); 986EXPORT_SYMBOL(ip_ct_find_proto);
979EXPORT_SYMBOL(ip_conntrack_expect_alloc); 987EXPORT_SYMBOL(ip_conntrack_expect_alloc);
980EXPORT_SYMBOL(ip_conntrack_expect_free); 988EXPORT_SYMBOL(ip_conntrack_expect_put);
981EXPORT_SYMBOL(ip_conntrack_expect_related); 989EXPORT_SYMBOL(ip_conntrack_expect_related);
982EXPORT_SYMBOL(ip_conntrack_unexpect_related); 990EXPORT_SYMBOL(ip_conntrack_unexpect_related);
983EXPORT_SYMBOL(ip_conntrack_tuple_taken); 991EXPORT_SYMBOL(ip_conntrack_tuple_taken);
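Two standalone.c hunks deserve a note. The ++*pos line in exp_seq_next() fixes a classic seq_file contract violation: ->next() must advance *pos in step with the iterator, otherwise the seq_file core can emit the same record twice when its buffer fills mid-walk. A sketch of the corrected shape, with a hypothetical list head:

    static void *example_seq_next(struct seq_file *s, void *v, loff_t *pos)
    {
            struct list_head *e = v;

            ++*pos;                             /* keep position in step with v */
            e = e->next;
            if (e == &example_list)             /* hypothetical list head */
                    return NULL;
            return e;
    }

The new #if !defined(CONFIG_IP_NF_NAT) block in ip_conntrack_defrag() likewise has a behavioural reason: without NAT compiled in, a looped-back packet still carries its conntrack reference, and skipping it avoids gathering its fragments a second time.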
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index 992fac3e36ee..f8ff170f390a 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -65,7 +65,7 @@ static int tftp_help(struct sk_buff **pskb,
65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
67 67
68 exp = ip_conntrack_expect_alloc(); 68 exp = ip_conntrack_expect_alloc(ct);
69 if (exp == NULL) 69 if (exp == NULL)
70 return NF_DROP; 70 return NF_DROP;
71 71
@@ -75,17 +75,15 @@ static int tftp_help(struct sk_buff **pskb,
75 exp->mask.dst.u.udp.port = 0xffff; 75 exp->mask.dst.u.udp.port = 0xffff;
76 exp->mask.dst.protonum = 0xff; 76 exp->mask.dst.protonum = 0xff;
77 exp->expectfn = NULL; 77 exp->expectfn = NULL;
78 exp->master = ct;
79 78
80 DEBUGP("expect: "); 79 DEBUGP("expect: ");
81 DUMP_TUPLE(&exp->tuple); 80 DUMP_TUPLE(&exp->tuple);
82 DUMP_TUPLE(&exp->mask); 81 DUMP_TUPLE(&exp->mask);
83 if (ip_nat_tftp_hook) 82 if (ip_nat_tftp_hook)
84 ret = ip_nat_tftp_hook(pskb, ctinfo, exp); 83 ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
85 else if (ip_conntrack_expect_related(exp) != 0) { 84 else if (ip_conntrack_expect_related(exp) != 0)
86 ip_conntrack_expect_free(exp);
87 ret = NF_DROP; 85 ret = NF_DROP;
88 } 86 ip_conntrack_expect_put(exp);
89 break; 87 break;
90 case TFTP_OPCODE_DATA: 88 case TFTP_OPCODE_DATA:
91 case TFTP_OPCODE_ACK: 89 case TFTP_OPCODE_ACK:
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
index da1f412583ed..706c8074f422 100644
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -56,10 +56,8 @@ static unsigned int help(struct sk_buff **pskb,
56 break; 56 break;
57 } 57 }
58 58
59 if (port == 0) { 59 if (port == 0)
60 ip_conntrack_expect_free(exp);
61 return NF_DROP; 60 return NF_DROP;
62 }
63 61
64 sprintf(buffer, "%u", port); 62 sprintf(buffer, "%u", port);
65 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, 63 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 9fc6f93af0dd..739b6dde1c82 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,8 +22,8 @@
22#include <linux/udp.h> 22#include <linux/udp.h>
23#include <linux/jhash.h> 23#include <linux/jhash.h>
24 24
25#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 25#define ASSERT_READ_LOCK(x)
26#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 26#define ASSERT_WRITE_LOCK(x)
27 27
28#include <linux/netfilter_ipv4/ip_conntrack.h> 28#include <linux/netfilter_ipv4/ip_conntrack.h>
29#include <linux/netfilter_ipv4/ip_conntrack_core.h> 29#include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -41,7 +41,7 @@
41#define DEBUGP(format, args...) 41#define DEBUGP(format, args...)
42#endif 42#endif
43 43
44DECLARE_RWLOCK(ip_nat_lock); 44DEFINE_RWLOCK(ip_nat_lock);
45 45
46/* Calculated at init based on memory size */ 46/* Calculated at init based on memory size */
47static unsigned int ip_nat_htable_size; 47static unsigned int ip_nat_htable_size;
@@ -65,9 +65,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
65 if (!(conn->status & IPS_NAT_DONE_MASK)) 65 if (!(conn->status & IPS_NAT_DONE_MASK))
66 return; 66 return;
67 67
68 WRITE_LOCK(&ip_nat_lock); 68 write_lock_bh(&ip_nat_lock);
69 list_del(&conn->nat.info.bysource); 69 list_del(&conn->nat.info.bysource);
70 WRITE_UNLOCK(&ip_nat_lock); 70 write_unlock_bh(&ip_nat_lock);
71} 71}
72 72
73/* We do checksum mangling, so if they were wrong before they're still 73/* We do checksum mangling, so if they were wrong before they're still
@@ -142,7 +142,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
142 unsigned int h = hash_by_src(tuple); 142 unsigned int h = hash_by_src(tuple);
143 struct ip_conntrack *ct; 143 struct ip_conntrack *ct;
144 144
145 READ_LOCK(&ip_nat_lock); 145 read_lock_bh(&ip_nat_lock);
146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) { 146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147 if (same_src(ct, tuple)) { 147 if (same_src(ct, tuple)) {
148 /* Copy source part from reply tuple. */ 148 /* Copy source part from reply tuple. */
@@ -151,12 +151,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
151 result->dst = tuple->dst; 151 result->dst = tuple->dst;
152 152
153 if (in_range(result, range)) { 153 if (in_range(result, range)) {
154 READ_UNLOCK(&ip_nat_lock); 154 read_unlock_bh(&ip_nat_lock);
155 return 1; 155 return 1;
156 } 156 }
157 } 157 }
158 } 158 }
159 READ_UNLOCK(&ip_nat_lock); 159 read_unlock_bh(&ip_nat_lock);
160 return 0; 160 return 0;
161} 161}
162 162
@@ -297,9 +297,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
297 unsigned int srchash 297 unsigned int srchash
298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] 298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299 .tuple); 299 .tuple);
300 WRITE_LOCK(&ip_nat_lock); 300 write_lock_bh(&ip_nat_lock);
301 list_add(&info->bysource, &bysource[srchash]); 301 list_add(&info->bysource, &bysource[srchash]);
302 WRITE_UNLOCK(&ip_nat_lock); 302 write_unlock_bh(&ip_nat_lock);
303 } 303 }
304 304
305 /* It's done. */ 305 /* It's done. */
@@ -474,23 +474,23 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
474{ 474{
475 int ret = 0; 475 int ret = 0;
476 476
477 WRITE_LOCK(&ip_nat_lock); 477 write_lock_bh(&ip_nat_lock);
478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { 478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
479 ret = -EBUSY; 479 ret = -EBUSY;
480 goto out; 480 goto out;
481 } 481 }
482 ip_nat_protos[proto->protonum] = proto; 482 ip_nat_protos[proto->protonum] = proto;
483 out: 483 out:
484 WRITE_UNLOCK(&ip_nat_lock); 484 write_unlock_bh(&ip_nat_lock);
485 return ret; 485 return ret;
486} 486}
487 487
488/* No one stores the protocol anywhere; simply delete it. */ 488/* No one stores the protocol anywhere; simply delete it. */
489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) 489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
490{ 490{
491 WRITE_LOCK(&ip_nat_lock); 491 write_lock_bh(&ip_nat_lock);
492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; 492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
493 WRITE_UNLOCK(&ip_nat_lock); 493 write_unlock_bh(&ip_nat_lock);
494 494
495 /* Someone could be still looking at the proto in a bh. */ 495 /* Someone could be still looking at the proto in a bh. */
496 synchronize_net(); 496 synchronize_net();
@@ -509,13 +509,13 @@ int __init ip_nat_init(void)
509 return -ENOMEM; 509 return -ENOMEM;
510 510
511 /* Sew in builtin protocols. */ 511 /* Sew in builtin protocols. */
512 WRITE_LOCK(&ip_nat_lock); 512 write_lock_bh(&ip_nat_lock);
513 for (i = 0; i < MAX_IP_NAT_PROTO; i++) 513 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
514 ip_nat_protos[i] = &ip_nat_unknown_protocol; 514 ip_nat_protos[i] = &ip_nat_unknown_protocol;
515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; 515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; 516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; 517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
518 WRITE_UNLOCK(&ip_nat_lock); 518 write_unlock_bh(&ip_nat_lock);
519 519
520 for (i = 0; i < ip_nat_htable_size; i++) { 520 for (i = 0; i < ip_nat_htable_size; i++) {
521 INIT_LIST_HEAD(&bysource[i]); 521 INIT_LIST_HEAD(&bysource[i]);
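With lockhelp.h gone there is no MUST_BE_READ_LOCKED()/MUST_BE_WRITE_LOCKED() to expand to, which is presumably why ip_nat_core.c and the other files above stub their assertion hooks to empty macros instead of deleting them: shared headers such as listhelp.h still expand ASSERT_READ_LOCK()/ASSERT_WRITE_LOCK() at their call sites, so the no-op definitions keep everything compiling:

    /* Debug-only lock assertions, now no-ops: the checking macros
     * left the tree together with lockhelp.h. */
    #define ASSERT_READ_LOCK(x)
    #define ASSERT_WRITE_LOCK(x)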
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index c6000e794ad6..d83757a70d9f 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -143,10 +143,8 @@ static unsigned int ip_nat_ftp(struct sk_buff **pskb,
143 break; 143 break;
144 } 144 }
145 145
146 if (port == 0) { 146 if (port == 0)
147 ip_conntrack_expect_free(exp);
148 return NF_DROP; 147 return NF_DROP;
149 }
150 148
151 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, 149 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
152 seq)) { 150 seq)) {
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 1637b96d8c01..158f34f32c04 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -28,8 +28,8 @@
28#include <net/tcp.h> 28#include <net/tcp.h>
29#include <net/udp.h> 29#include <net/udp.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -47,7 +47,7 @@
47#define DUMP_OFFSET(x) 47#define DUMP_OFFSET(x)
48#endif 48#endif
49 49
50static DECLARE_LOCK(ip_nat_seqofs_lock); 50static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
51 51
52/* Setup TCP sequence correction given this change at this sequence */ 52/* Setup TCP sequence correction given this change at this sequence */
53static inline void 53static inline void
@@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq,
70 DEBUGP("ip_nat_resize_packet: Seq_offset before: "); 70 DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
71 DUMP_OFFSET(this_way); 71 DUMP_OFFSET(this_way);
72 72
73 LOCK_BH(&ip_nat_seqofs_lock); 73 spin_lock_bh(&ip_nat_seqofs_lock);
74 74
75 /* SYN adjust. If it's uninitialized, or this is after last 75 /* SYN adjust. If it's uninitialized, or this is after last
76 * correction, record it: we don't handle more than one 76 * correction, record it: we don't handle more than one
@@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq,
82 this_way->offset_before = this_way->offset_after; 82 this_way->offset_before = this_way->offset_after;
83 this_way->offset_after += sizediff; 83 this_way->offset_after += sizediff;
84 } 84 }
85 UNLOCK_BH(&ip_nat_seqofs_lock); 85 spin_unlock_bh(&ip_nat_seqofs_lock);
86 86
87 DEBUGP("ip_nat_resize_packet: Seq_offset after: "); 87 DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
88 DUMP_OFFSET(this_way); 88 DUMP_OFFSET(this_way);
@@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
142 /* Transfer socket to new skb. */ 142 /* Transfer socket to new skb. */
143 if ((*pskb)->sk) 143 if ((*pskb)->sk)
144 skb_set_owner_w(nskb, (*pskb)->sk); 144 skb_set_owner_w(nskb, (*pskb)->sk);
145#ifdef CONFIG_NETFILTER_DEBUG
146 nskb->nf_debug = (*pskb)->nf_debug;
147#endif
148 kfree_skb(*pskb); 145 kfree_skb(*pskb);
149 *pskb = nskb; 146 *pskb = nskb;
150 return 1; 147 return 1;
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
index 9c1ca3381d56..de31942babe3 100644
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -65,10 +65,8 @@ static unsigned int help(struct sk_buff **pskb,
65 break; 65 break;
66 } 66 }
67 67
68 if (port == 0) { 68 if (port == 0)
69 ip_conntrack_expect_free(exp);
70 return NF_DROP; 69 return NF_DROP;
71 }
72 70
73 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 71 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
74 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 72 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index a558cf0eee8a..6596c9ee1655 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -35,16 +35,17 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
35 const struct ip_conntrack *conntrack) 35 const struct ip_conntrack *conntrack)
36{ 36{
37 static u_int16_t id; 37 static u_int16_t id;
38 unsigned int range_size 38 unsigned int range_size;
39 = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
40 unsigned int i; 39 unsigned int i;
41 40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 /* If no range specified... */ 42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) 43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF; 44 range_size = 0xFFFF;
45 45
46 for (i = 0; i < range_size; i++, id++) { 46 for (i = 0; i < range_size; i++, id++) {
47 tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); 47 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
48 (id % range_size));
48 if (!ip_nat_used_tuple(tuple, conntrack)) 49 if (!ip_nat_used_tuple(tuple, conntrack))
49 return 1; 50 return 1;
50 } 51 }
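The icmp_unique_tuple() change is an endianness fix, not a cleanup: tuple id fields live in network byte order, so subtracting the raw values computed a wrong range size on little-endian hosts. The corrected arithmetic, reduced to a worked sketch (function name hypothetical):

    /* min_be/max_be are network-order ICMP ids, e.g. htons(1)..htons(100). */
    static u_int16_t pick_icmp_id(u_int16_t min_be, u_int16_t max_be,
                                  u_int16_t counter)
    {
            unsigned int range_size = ntohs(max_be) - ntohs(min_be) + 1;

            /* range_size is 100 for htons(1)..htons(100) on any host; the
             * old network-order subtraction only got that right on
             * big-endian machines. */
            return htons(ntohs(min_be) + (counter % range_size));
    }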
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a91cfceff272..a98e36d2b3c6 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -40,7 +40,8 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
40 enum ip_nat_manip_type maniptype, 40 enum ip_nat_manip_type maniptype,
41 const struct ip_conntrack *conntrack) 41 const struct ip_conntrack *conntrack)
42{ 42{
43 static u_int16_t port, *portptr; 43 static u_int16_t port;
44 u_int16_t *portptr;
44 unsigned int range_size, min, i; 45 unsigned int range_size, min, i;
45 46
46 if (maniptype == IP_NAT_MANIP_SRC) 47 if (maniptype == IP_NAT_MANIP_SRC)
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index c669e3b5f5d0..9f66e5625664 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -41,7 +41,8 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple,
41 enum ip_nat_manip_type maniptype, 41 enum ip_nat_manip_type maniptype,
42 const struct ip_conntrack *conntrack) 42 const struct ip_conntrack *conntrack)
43{ 43{
44 static u_int16_t port, *portptr; 44 static u_int16_t port;
45 u_int16_t *portptr;
45 unsigned int range_size, min, i; 46 unsigned int range_size, min, i;
46 47
47 if (maniptype == IP_NAT_MANIP_SRC) 48 if (maniptype == IP_NAT_MANIP_SRC)
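The identical one-liner in tcp_unique_tuple() and udp_unique_tuple() fixes a declaration bug: "static u_int16_t port, *portptr;" made the scratch pointer static along with the round-robin counter, so concurrent callers could overwrite each other's pointer. Only the counter is meant to persist across calls; the split declaration is the whole fix:

    static u_int16_t port;      /* round-robin state, deliberately shared */
    u_int16_t *portptr;         /* per-call scratch: must live on the stack */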
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 581f097f5a24..60d70fa41a15 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,8 +19,8 @@
19#include <net/route.h> 19#include <net/route.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21 21
22#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 22#define ASSERT_READ_LOCK(x)
23#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 23#define ASSERT_WRITE_LOCK(x)
24 24
25#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
26#include <linux/netfilter_ipv4/ip_nat.h> 26#include <linux/netfilter_ipv4/ip_nat.h>
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 79f56f662b33..91d5ea1dbbc9 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -31,8 +31,8 @@
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33 33
34#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 34#define ASSERT_READ_LOCK(x)
35#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 35#define ASSERT_WRITE_LOCK(x)
36 36
37#include <linux/netfilter_ipv4/ip_nat.h> 37#include <linux/netfilter_ipv4/ip_nat.h>
38#include <linux/netfilter_ipv4/ip_nat_rule.h> 38#include <linux/netfilter_ipv4/ip_nat_rule.h>
@@ -102,6 +102,10 @@ ip_nat_fn(unsigned int hooknum,
102 return NF_ACCEPT; 102 return NF_ACCEPT;
103 } 103 }
104 104
105 /* Don't try to NAT if this packet is not conntracked */
106 if (ct == &ip_conntrack_untracked)
107 return NF_ACCEPT;
108
105 switch (ctinfo) { 109 switch (ctinfo) {
106 case IP_CT_RELATED: 110 case IP_CT_RELATED:
107 case IP_CT_RELATED+IP_CT_IS_REPLY: 111 case IP_CT_RELATED+IP_CT_IS_REPLY:
@@ -373,7 +377,6 @@ static int init_or_cleanup(int init)
373 cleanup_rule_init: 377 cleanup_rule_init:
374 ip_nat_rule_cleanup(); 378 ip_nat_rule_cleanup();
375 cleanup_nothing: 379 cleanup_nothing:
376 MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
377 return ret; 380 return ret;
378} 381}
379 382
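The ip_nat_fn() hunk is the NAT side of NOTRACK support: every untracked packet shares the single ip_conntrack_untracked dummy entry, so a pointer comparison against that sentinel is the entire test:

    /* Sketch: traffic marked NOTRACK carries the shared dummy conntrack
     * and must never be NATed. */
    if (ct == &ip_conntrack_untracked)
            return NF_ACCEPT;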
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
index 0343e0d64674..2215317c76b7 100644
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -45,10 +45,8 @@ static unsigned int help(struct sk_buff **pskb,
45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; 45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_REPLY; 46 exp->dir = IP_CT_DIR_REPLY;
47 exp->expectfn = ip_nat_follow_master; 47 exp->expectfn = ip_nat_follow_master;
48 if (ip_conntrack_expect_related(exp) != 0) { 48 if (ip_conntrack_expect_related(exp) != 0)
49 ip_conntrack_expect_free(exp);
50 return NF_DROP; 49 return NF_DROP;
51 }
52 return NF_ACCEPT; 50 return NF_ACCEPT;
53} 51}
54 52
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a54f92b8496..c88dfcd38c56 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex);
67/* Must have mutex */ 67/* Must have mutex */
68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
70#include <linux/netfilter_ipv4/lockhelp.h>
71#include <linux/netfilter_ipv4/listhelp.h> 70#include <linux/netfilter_ipv4/listhelp.h>
72 71
73#if 0 72#if 0
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0f12e3a3dc73..6706d3a1bc4f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,9 +29,8 @@
29#include <linux/netfilter_ipv4/ip_tables.h> 29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/lockhelp.h>
33 32
34#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.7"
35 34
36#define DEBUG_CLUSTERIP 35#define DEBUG_CLUSTERIP
37 36
@@ -41,6 +40,8 @@
41#define DEBUGP 40#define DEBUGP
42#endif 41#endif
43 42
43#define ASSERT_READ_LOCK(x)
44
44MODULE_LICENSE("GPL"); 45MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); 46MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
46MODULE_DESCRIPTION("iptables target for CLUSTERIP"); 47MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs);
67 68
68/* clusterip_lock protects the clusterip_configs list _AND_ the configurable 69/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
69 * data within all structures (num_local_nodes, local_nodes[]) */ 70 * data within all structures (num_local_nodes, local_nodes[]) */
70static DECLARE_RWLOCK(clusterip_lock); 71static DEFINE_RWLOCK(clusterip_lock);
71 72
72#ifdef CONFIG_PROC_FS 73#ifdef CONFIG_PROC_FS
73static struct file_operations clusterip_proc_fops; 74static struct file_operations clusterip_proc_fops;
@@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) {
82static inline void 83static inline void
83clusterip_config_put(struct clusterip_config *c) { 84clusterip_config_put(struct clusterip_config *c) {
84 if (atomic_dec_and_test(&c->refcount)) { 85 if (atomic_dec_and_test(&c->refcount)) {
85 WRITE_LOCK(&clusterip_lock); 86 write_lock_bh(&clusterip_lock);
86 list_del(&c->list); 87 list_del(&c->list);
87 WRITE_UNLOCK(&clusterip_lock); 88 write_unlock_bh(&clusterip_lock);
88 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); 89 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
89 dev_put(c->dev); 90 dev_put(c->dev);
90 kfree(c); 91 kfree(c);
@@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip)
97{ 98{
98 struct list_head *pos; 99 struct list_head *pos;
99 100
100 MUST_BE_READ_LOCKED(&clusterip_lock); 101 ASSERT_READ_LOCK(&clusterip_lock);
101 list_for_each(pos, &clusterip_configs) { 102 list_for_each(pos, &clusterip_configs) {
102 struct clusterip_config *c = list_entry(pos, 103 struct clusterip_config *c = list_entry(pos,
103 struct clusterip_config, list); 104 struct clusterip_config, list);
@@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip)
114{ 115{
115 struct clusterip_config *c; 116 struct clusterip_config *c;
116 117
117 READ_LOCK(&clusterip_lock); 118 read_lock_bh(&clusterip_lock);
118 c = __clusterip_config_find(clusterip); 119 c = __clusterip_config_find(clusterip);
119 if (!c) { 120 if (!c) {
120 READ_UNLOCK(&clusterip_lock); 121 read_unlock_bh(&clusterip_lock);
121 return NULL; 122 return NULL;
122 } 123 }
123 atomic_inc(&c->refcount); 124 atomic_inc(&c->refcount);
124 READ_UNLOCK(&clusterip_lock); 125 read_unlock_bh(&clusterip_lock);
125 126
126 return c; 127 return c;
127} 128}
@@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
160 c->pde->data = c; 161 c->pde->data = c;
161#endif 162#endif
162 163
163 WRITE_LOCK(&clusterip_lock); 164 write_lock_bh(&clusterip_lock);
164 list_add(&c->list, &clusterip_configs); 165 list_add(&c->list, &clusterip_configs);
165 WRITE_UNLOCK(&clusterip_lock); 166 write_unlock_bh(&clusterip_lock);
166 167
167 return c; 168 return c;
168} 169}
@@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
172{ 173{
173 int i; 174 int i;
174 175
175 WRITE_LOCK(&clusterip_lock); 176 write_lock_bh(&clusterip_lock);
176 177
177 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES 178 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
178 || nodenum > CLUSTERIP_MAX_NODES) { 179 || nodenum > CLUSTERIP_MAX_NODES) {
179 WRITE_UNLOCK(&clusterip_lock); 180 write_unlock_bh(&clusterip_lock);
180 return 1; 181 return 1;
181 } 182 }
182 183
183 /* check if we already have this number in our array */ 184 /* check if we already have this number in our array */
184 for (i = 0; i < c->num_local_nodes; i++) { 185 for (i = 0; i < c->num_local_nodes; i++) {
185 if (c->local_nodes[i] == nodenum) { 186 if (c->local_nodes[i] == nodenum) {
186 WRITE_UNLOCK(&clusterip_lock); 187 write_unlock_bh(&clusterip_lock);
187 return 1; 188 return 1;
188 } 189 }
189 } 190 }
190 191
191 c->local_nodes[c->num_local_nodes++] = nodenum; 192 c->local_nodes[c->num_local_nodes++] = nodenum;
192 193
193 WRITE_UNLOCK(&clusterip_lock); 194 write_unlock_bh(&clusterip_lock);
194 return 0; 195 return 0;
195} 196}
196 197
@@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
199{ 200{
200 int i; 201 int i;
201 202
202 WRITE_LOCK(&clusterip_lock); 203 write_lock_bh(&clusterip_lock);
203 204
204 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { 205 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
205 WRITE_UNLOCK(&clusterip_lock); 206 write_unlock_bh(&clusterip_lock);
206 return 1; 207 return 1;
207 } 208 }
208 209
@@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
211 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); 212 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
212 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); 213 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
213 c->num_local_nodes--; 214 c->num_local_nodes--;
214 WRITE_UNLOCK(&clusterip_lock); 215 write_unlock_bh(&clusterip_lock);
215 return 0; 216 return 0;
216 } 217 }
217 } 218 }
218 219
219 WRITE_UNLOCK(&clusterip_lock); 220 write_unlock_bh(&clusterip_lock);
220 return 1; 221 return 1;
221} 222}
222 223
@@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
286{ 287{
287 int i; 288 int i;
288 289
289 READ_LOCK(&clusterip_lock); 290 read_lock_bh(&clusterip_lock);
290 291
291 if (config->num_local_nodes == 0) { 292 if (config->num_local_nodes == 0) {
292 READ_UNLOCK(&clusterip_lock); 293 read_unlock_bh(&clusterip_lock);
293 return 0; 294 return 0;
294 } 295 }
295 296
296 for (i = 0; i < config->num_local_nodes; i++) { 297 for (i = 0; i < config->num_local_nodes; i++) {
297 if (config->local_nodes[i] == hash) { 298 if (config->local_nodes[i] == hash) {
298 READ_UNLOCK(&clusterip_lock); 299 read_unlock_bh(&clusterip_lock);
299 return 1; 300 return 1;
300 } 301 }
301 } 302 }
302 303
303 READ_UNLOCK(&clusterip_lock); 304 read_unlock_bh(&clusterip_lock);
304 305
305 return 0; 306 return 0;
306} 307}
@@ -338,7 +339,7 @@ target(struct sk_buff **pskb,
338 * error messages (RELATED) and information requests (see below) */ 339 * error messages (RELATED) and information requests (see below) */
339 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 340 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
340 && (ctinfo == IP_CT_RELATED 341 && (ctinfo == IP_CT_RELATED
341 || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) 342 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
342 return IPT_CONTINUE; 343 return IPT_CONTINUE;
343 344
344 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -523,8 +524,9 @@ arp_mangle(unsigned int hook,
523 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
524 return NF_ACCEPT; 525 return NF_ACCEPT;
525 526
526 /* we only want to mangle arp replies */ 527 /* we only want to mangle arp requests and replies */
527 if (arp->ar_op != htons(ARPOP_REPLY)) 528 if (arp->ar_op != htons(ARPOP_REPLY)
529 && arp->ar_op != htons(ARPOP_REQUEST))
528 return NF_ACCEPT; 530 return NF_ACCEPT;
529 531
530 payload = (void *)(arp+1); 532 payload = (void *)(arp+1);
@@ -578,7 +580,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
578 struct clusterip_config *c = pde->data; 580 struct clusterip_config *c = pde->data;
579 unsigned int *nodeidx; 581 unsigned int *nodeidx;
580 582
581 READ_LOCK(&clusterip_lock); 583 read_lock_bh(&clusterip_lock);
582 if (*pos >= c->num_local_nodes) 584 if (*pos >= c->num_local_nodes)
583 return NULL; 585 return NULL;
584 586
@@ -608,7 +610,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v)
608{ 610{
609 kfree(v); 611 kfree(v);
610 612
611 READ_UNLOCK(&clusterip_lock); 613 read_unlock_bh(&clusterip_lock);
612} 614}
613 615
614static int clusterip_seq_show(struct seq_file *s, void *v) 616static int clusterip_seq_show(struct seq_file *s, void *v)
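Besides the lock conversion, two CLUSTERIP hunks are genuine fixes. The ctinfo test used to add IP_CT_IS_REPLY to itself; ctinfo encodes the reply direction as the base state plus IP_CT_IS_REPLY, so the corrected pair of comparisons looks like this (helper name hypothetical):

    static int icmp_is_related(enum ip_conntrack_info ctinfo)
    {
            return ctinfo == IP_CT_RELATED
                || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY;   /* reply dir */
    }

And arp_mangle() now rewrites ARP requests as well as replies, presumably because a request sent from a cluster node would otherwise leak that node's real hardware address in its sender field.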
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 57e9f6cf1c36..91e74502c3d3 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
33#endif 33#endif
34 34
35/* Lock protects masq region inside conntrack */ 35/* Lock protects masq region inside conntrack */
36static DECLARE_RWLOCK(masq_lock); 36static DEFINE_RWLOCK(masq_lock);
37 37
38/* FIXME: Multiple targets. --RR */ 38/* FIXME: Multiple targets. --RR */
39static int 39static int
@@ -103,9 +103,9 @@ masquerade_target(struct sk_buff **pskb,
103 return NF_DROP; 103 return NF_DROP;
104 } 104 }
105 105
106 WRITE_LOCK(&masq_lock); 106 write_lock_bh(&masq_lock);
107 ct->nat.masq_index = out->ifindex; 107 ct->nat.masq_index = out->ifindex;
108 WRITE_UNLOCK(&masq_lock); 108 write_unlock_bh(&masq_lock);
109 109
110 /* Transfer from original range. */ 110 /* Transfer from original range. */
111 newrange = ((struct ip_nat_range) 111 newrange = ((struct ip_nat_range)
@@ -122,9 +122,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex)
122{ 122{
123 int ret; 123 int ret;
124 124
125 READ_LOCK(&masq_lock); 125 read_lock_bh(&masq_lock);
126 ret = (i->nat.masq_index == (int)(long)ifindex); 126 ret = (i->nat.masq_index == (int)(long)ifindex);
127 READ_UNLOCK(&masq_lock); 127 read_unlock_bh(&masq_lock);
128 128
129 return ret; 129 return ret;
130} 130}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 266d64979286..915696446020 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
104static void send_reset(struct sk_buff *oldskb, int hook) 104static void send_reset(struct sk_buff *oldskb, int hook)
105{ 105{
106 struct sk_buff *nskb; 106 struct sk_buff *nskb;
107 struct iphdr *iph = oldskb->nh.iph;
107 struct tcphdr _otcph, *oth, *tcph; 108 struct tcphdr _otcph, *oth, *tcph;
108 struct rtable *rt; 109 struct rtable *rt;
109 u_int16_t tmp_port; 110 u_int16_t tmp_port;
110 u_int32_t tmp_addr; 111 u_int32_t tmp_addr;
112 unsigned int tcplen;
111 int needs_ack; 113 int needs_ack;
112 int hh_len; 114 int hh_len;
113 115
@@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
124 if (oth->rst) 126 if (oth->rst)
125 return; 127 return;
126 128
127 /* FIXME: Check checksum --RR */ 129 /* Check checksum */
130 tcplen = oldskb->len - iph->ihl * 4;
131 if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) ||
132 (hook == NF_IP_LOCAL_IN &&
133 oldskb->ip_summed != CHECKSUM_UNNECESSARY)) &&
134 csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
135 oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum :
136 skb_checksum(oldskb, iph->ihl * 4, tcplen, 0)))
137 return;
138
128 if ((rt = route_reverse(oldskb, oth, hook)) == NULL) 139 if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
129 return; 140 return;
130 141
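This resolves the long-standing "FIXME: Check checksum" in send_reset(): a forged RST must not be generated from a packet whose TCP checksum never validated. The guard's shape, pulled out into a hedged helper (name invented; the logic mirrors the hunk above):

    /* Nonzero when oldskb's TCP checksum can be trusted for this hook. */
    static int rst_checksum_ok(struct sk_buff *skb, struct iphdr *iph,
                               unsigned int tcplen, int hook)
    {
            if (hook != NF_IP_LOCAL_IN && skb->ip_summed == CHECKSUM_HW)
                    return 1;   /* summed by hardware on the receive path */
            if (hook == NF_IP_LOCAL_IN && skb->ip_summed == CHECKSUM_UNNECESSARY)
                    return 1;   /* the stack already verified it */
            return csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
                                     skb->ip_summed == CHECKSUM_HW
                                             ? skb->csum
                                             : skb_checksum(skb, iph->ihl * 4,
                                                            tcplen, 0)) == 0;
    }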
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6f2cefbe16cd..52a0076302a7 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -56,7 +56,6 @@
56#include <linux/netfilter.h> 56#include <linux/netfilter.h>
57#include <linux/netfilter_ipv4/ip_tables.h> 57#include <linux/netfilter_ipv4/ip_tables.h>
58#include <linux/netfilter_ipv4/ipt_ULOG.h> 58#include <linux/netfilter_ipv4/ipt_ULOG.h>
59#include <linux/netfilter_ipv4/lockhelp.h>
60#include <net/sock.h> 59#include <net/sock.h>
61#include <linux/bitops.h> 60#include <linux/bitops.h>
62 61
@@ -99,8 +98,8 @@ typedef struct {
99 98
100static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ 99static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
101 100
102static struct sock *nflognl; /* our socket */ 101static struct sock *nflognl; /* our socket */
103static DECLARE_LOCK(ulog_lock); /* spinlock */ 102static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
104 103
105/* send one ulog_buff_t to userspace */ 104/* send one ulog_buff_t to userspace */
106static void ulog_send(unsigned int nlgroupnum) 105static void ulog_send(unsigned int nlgroupnum)
@@ -135,9 +134,9 @@ static void ulog_timer(unsigned long data)
135 134
136 /* lock to protect against somebody modifying our structure 135 /* lock to protect against somebody modifying our structure
137 * from ipt_ulog_target at the same time */ 136 * from ipt_ulog_target at the same time */
138 LOCK_BH(&ulog_lock); 137 spin_lock_bh(&ulog_lock);
139 ulog_send(data); 138 ulog_send(data);
140 UNLOCK_BH(&ulog_lock); 139 spin_unlock_bh(&ulog_lock);
141} 140}
142 141
143static struct sk_buff *ulog_alloc_skb(unsigned int size) 142static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -193,7 +192,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
193 192
194 ub = &ulog_buffers[groupnum]; 193 ub = &ulog_buffers[groupnum];
195 194
196 LOCK_BH(&ulog_lock); 195 spin_lock_bh(&ulog_lock);
197 196
198 if (!ub->skb) { 197 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size))) 198 if (!(ub->skb = ulog_alloc_skb(size)))
@@ -278,7 +277,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
278 ulog_send(groupnum); 277 ulog_send(groupnum);
279 } 278 }
280 279
281 UNLOCK_BH(&ulog_lock); 280 spin_unlock_bh(&ulog_lock);
282 281
283 return; 282 return;
284 283
@@ -288,7 +287,7 @@ nlmsg_failure:
288alloc_failure: 287alloc_failure:
289 PRINTR("ipt_ULOG: Error building netlink message\n"); 288 PRINTR("ipt_ULOG: Error building netlink message\n");
290 289
291 UNLOCK_BH(&ulog_lock); 290 spin_unlock_bh(&ulog_lock);
292} 291}
293 292
294static unsigned int ipt_ulog_target(struct sk_buff **pskb, 293static unsigned int ipt_ulog_target(struct sk_buff **pskb,
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index f1937190cd77..564b49bfebcf 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/netfilter_ipv4/ip_tables.h> 38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ipt_hashlimit.h> 39#include <linux/netfilter_ipv4/ipt_hashlimit.h>
40#include <linux/netfilter_ipv4/lockhelp.h>
41 40
42/* FIXME: this is just for IP_NF_ASSERT */ 41/* FIXME: this is just for IP_NF_ASSERT */
43#include <linux/netfilter_ipv4/ip_conntrack.h> 42#include <linux/netfilter_ipv4/ip_conntrack.h>
@@ -92,7 +91,7 @@ struct ipt_hashlimit_htable {
92 struct hlist_head hash[0]; /* hashtable itself */ 91 struct hlist_head hash[0]; /* hashtable itself */
93}; 92};
94 93
95static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ 94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
96static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ 95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
97static HLIST_HEAD(hashlimit_htables); 96static HLIST_HEAD(hashlimit_htables);
98static kmem_cache_t *hashlimit_cachep; 97static kmem_cache_t *hashlimit_cachep;
@@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
233 hinfo->timer.function = htable_gc; 232 hinfo->timer.function = htable_gc;
234 add_timer(&hinfo->timer); 233 add_timer(&hinfo->timer);
235 234
236 LOCK_BH(&hashlimit_lock); 235 spin_lock_bh(&hashlimit_lock);
237 hlist_add_head(&hinfo->node, &hashlimit_htables); 236 hlist_add_head(&hinfo->node, &hashlimit_htables);
238 UNLOCK_BH(&hashlimit_lock); 237 spin_unlock_bh(&hashlimit_lock);
239 238
240 return 0; 239 return 0;
241} 240}
@@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
301 struct ipt_hashlimit_htable *hinfo; 300 struct ipt_hashlimit_htable *hinfo;
302 struct hlist_node *pos; 301 struct hlist_node *pos;
303 302
304 LOCK_BH(&hashlimit_lock); 303 spin_lock_bh(&hashlimit_lock);
305 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { 304 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
306 if (!strcmp(name, hinfo->pde->name)) { 305 if (!strcmp(name, hinfo->pde->name)) {
307 atomic_inc(&hinfo->use); 306 atomic_inc(&hinfo->use);
308 UNLOCK_BH(&hashlimit_lock); 307 spin_unlock_bh(&hashlimit_lock);
309 return hinfo; 308 return hinfo;
310 } 309 }
311 } 310 }
312 UNLOCK_BH(&hashlimit_lock); 311 spin_unlock_bh(&hashlimit_lock);
313 312
314 return NULL; 313 return NULL;
315} 314}
@@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
317static void htable_put(struct ipt_hashlimit_htable *hinfo) 316static void htable_put(struct ipt_hashlimit_htable *hinfo)
318{ 317{
319 if (atomic_dec_and_test(&hinfo->use)) { 318 if (atomic_dec_and_test(&hinfo->use)) {
320 LOCK_BH(&hashlimit_lock); 319 spin_lock_bh(&hashlimit_lock);
321 hlist_del(&hinfo->node); 320 hlist_del(&hinfo->node);
322 UNLOCK_BH(&hashlimit_lock); 321 spin_unlock_bh(&hashlimit_lock);
323 htable_destroy(hinfo); 322 htable_destroy(hinfo);
324 } 323 }
325} 324}
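
Note: beyond the same DECLARE_LOCK to DEFINE_SPINLOCK conversion, the hashlimit code keeps its lookup/put idiom: bump a refcount while holding the list lock, and only unlink once the last reference drops. A hedged kernel-style sketch of that idiom, with hypothetical demo_table/demo_tables names (the hlist_for_each_entry signature matches the 2.6-era one used above; later kernels dropped the pos argument):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <asm/atomic.h>

struct demo_table {
	struct hlist_node node;
	atomic_t use;
	char name[16];
};

static DEFINE_SPINLOCK(demo_lock);
static HLIST_HEAD(demo_tables);

static struct demo_table *demo_find_get(const char *name)
{
	struct demo_table *t;
	struct hlist_node *pos;

	spin_lock_bh(&demo_lock);
	hlist_for_each_entry(t, pos, &demo_tables, node) {
		if (!strcmp(name, t->name)) {
			atomic_inc(&t->use);	/* pin before dropping the lock */
			spin_unlock_bh(&demo_lock);
			return t;
		}
	}
	spin_unlock_bh(&demo_lock);
	return NULL;
}

static void demo_put(struct demo_table *t)
{
	if (atomic_dec_and_test(&t->use)) {
		spin_lock_bh(&demo_lock);
		hlist_del(&t->node);	/* last user unlinks and frees */
		spin_unlock_bh(&demo_lock);
		kfree(t);
	}
}
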
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 33fdf364d3d3..3e7dd014de43 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
53 return ret; 53 return ret;
54 } 54 }
55 55
56 READ_LOCK(&ip_conntrack_lock); 56 read_lock_bh(&ip_conntrack_lock);
57 if (!ct->master->helper) { 57 if (!ct->master->helper) {
58 DEBUGP("ipt_helper: master ct %p has no helper\n", 58 DEBUGP("ipt_helper: master ct %p has no helper\n",
59 exp->expectant); 59 exp->expectant);
@@ -69,7 +69,7 @@ match(const struct sk_buff *skb,
69 ret ^= !strncmp(ct->master->helper->name, info->name, 69 ret ^= !strncmp(ct->master->helper->name, info->name,
70 strlen(ct->master->helper->name)); 70 strlen(ct->master->helper->name));
71out_unlock: 71out_unlock:
72 READ_UNLOCK(&ip_conntrack_lock); 72 read_unlock_bh(&ip_conntrack_lock);
73 return ret; 73 return ret;
74} 74}
75 75
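
Note: READ_LOCK/READ_UNLOCK were lockhelp wrappers around a plain rwlock; the replacement also disables bottom halves, since ip_conntrack_lock is taken from softirq context. Schematic sketch with hypothetical names (DEFINE_RWLOCK shown here; kernels of this vintage may instead initialise with RW_LOCK_UNLOCKED):

#include <linux/spinlock.h>

static DEFINE_RWLOCK(demo_conntrack_lock);

static int demo_match(void)
{
	int ret = 0;

	read_lock_bh(&demo_conntrack_lock);
	/* ... inspect shared conntrack state; many readers may hold
	 * this concurrently, a writer excludes them all ... */
	read_unlock_bh(&demo_conntrack_lock);
	return ret;
}
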
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 25ab9fabdcba..2d44b07688af 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -223,7 +223,7 @@ static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned
223 curr_table->table[count].last_seen = 0; 223 curr_table->table[count].last_seen = 0;
224 curr_table->table[count].addr = 0; 224 curr_table->table[count].addr = 0;
225 curr_table->table[count].ttl = 0; 225 curr_table->table[count].ttl = 0;
226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
227 curr_table->table[count].oldest_pkt = 0; 227 curr_table->table[count].oldest_pkt = 0;
228 curr_table->table[count].time_pos = 0; 228 curr_table->table[count].time_pos = 0;
229 curr_table->time_info[count].position = count; 229 curr_table->time_info[count].position = count;
@@ -502,7 +502,7 @@ match(const struct sk_buff *skb,
502 location = time_info[curr_table->time_pos].position; 502 location = time_info[curr_table->time_pos].position;
503 hash_table[r_list[location].hash_entry] = -1; 503 hash_table[r_list[location].hash_entry] = -1;
504 hash_table[hash_result] = location; 504 hash_table[hash_result] = location;
505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
506 r_list[location].time_pos = curr_table->time_pos; 506 r_list[location].time_pos = curr_table->time_pos;
507 r_list[location].addr = addr; 507 r_list[location].addr = addr;
508 r_list[location].ttl = ttl; 508 r_list[location].ttl = ttl;
@@ -631,7 +631,7 @@ match(const struct sk_buff *skb,
631 r_list[location].last_seen = 0; 631 r_list[location].last_seen = 0;
632 r_list[location].addr = 0; 632 r_list[location].addr = 0;
633 r_list[location].ttl = 0; 633 r_list[location].ttl = 0;
634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
635 r_list[location].oldest_pkt = 0; 635 r_list[location].oldest_pkt = 0;
636 ans = !info->invert; 636 ans = !info->invert;
637 } 637 }
@@ -734,10 +734,10 @@ checkentry(const char *tablename,
734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); 734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
735#ifdef DEBUG 735#ifdef DEBUG
736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", 736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
737 sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); 737 sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
738#endif 738#endif
739 739
740 hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); 740 hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
741#ifdef DEBUG 741#ifdef DEBUG
742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); 742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
743#endif 743#endif
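
Note: the ipt_recent fix matters because last_pkts is an array of unsigned long but was allocated and cleared as if its elements were u_int32_t; on 64-bit targets that touches only half the array. A small, runnable userspace demonstration of the same class of bug (plain C, illustrative values):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define N 4

int main(void)
{
	unsigned long pkts[N];

	memset(pkts, 0xff, sizeof(pkts));

	/* Wrong: the element type is unsigned long, not uint32_t.
	 * On LP64 this clears only the first half of the array. */
	memset(pkts, 0, N * sizeof(uint32_t));

	for (int i = 0; i < N; i++)
		printf("pkts[%d] = %#lx\n", i, pkts[i]);

	/* Right: size the memset by the real element type. */
	memset(pkts, 0, N * sizeof(unsigned long));
	return 0;
}
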
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5b1ec586bae6..d1835b1bc8c4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -259,7 +259,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
259 return 0; 259 return 0;
260} 260}
261 261
262static int raw_send_hdrinc(struct sock *sk, void *from, int length, 262static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
263 struct rtable *rt, 263 struct rtable *rt,
264 unsigned int flags) 264 unsigned int flags)
265{ 265{
@@ -298,7 +298,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, int length,
298 goto error_fault; 298 goto error_fault;
299 299
300 /* We don't modify invalid header */ 300 /* We don't modify invalid header */
301 if (length >= sizeof(*iph) && iph->ihl * 4 <= length) { 301 if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
302 if (!iph->saddr) 302 if (!iph->saddr)
303 iph->saddr = rt->rt_src; 303 iph->saddr = rt->rt_src;
304 iph->check = 0; 304 iph->check = 0;
@@ -332,7 +332,7 @@ static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
332 u8 __user *type = NULL; 332 u8 __user *type = NULL;
333 u8 __user *code = NULL; 333 u8 __user *code = NULL;
334 int probed = 0; 334 int probed = 0;
335 int i; 335 unsigned int i;
336 336
337 if (!msg->msg_iov) 337 if (!msg->msg_iov)
338 return; 338 return;
@@ -384,7 +384,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
384 int err; 384 int err;
385 385
386 err = -EMSGSIZE; 386 err = -EMSGSIZE;
387 if (len < 0 || len > 0xFFFF) 387 if (len > 0xFFFF)
388 goto out; 388 goto out;
389 389
390 /* 390 /*
@@ -514,7 +514,10 @@ done:
514 kfree(ipc.opt); 514 kfree(ipc.opt);
515 ip_rt_put(rt); 515 ip_rt_put(rt);
516 516
517out: return err < 0 ? err : len; 517out:
518 if (err < 0)
519 return err;
520 return len;
518 521
519do_confirm: 522do_confirm:
520 dst_confirm(&rt->u.dst); 523 dst_confirm(&rt->u.dst);
@@ -610,7 +613,10 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
610 copied = skb->len; 613 copied = skb->len;
611done: 614done:
612 skb_free_datagram(sk, skb); 615 skb_free_datagram(sk, skb);
613out: return err ? err : copied; 616out:
617 if (err)
618 return err;
619 return copied;
614} 620}
615 621
616static int raw_init(struct sock *sk) 622static int raw_init(struct sock *sk)
@@ -691,11 +697,11 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
691 struct sk_buff *skb; 697 struct sk_buff *skb;
692 int amount = 0; 698 int amount = 0;
693 699
694 spin_lock_irq(&sk->sk_receive_queue.lock); 700 spin_lock_bh(&sk->sk_receive_queue.lock);
695 skb = skb_peek(&sk->sk_receive_queue); 701 skb = skb_peek(&sk->sk_receive_queue);
696 if (skb != NULL) 702 if (skb != NULL)
697 amount = skb->len; 703 amount = skb->len;
698 spin_unlock_irq(&sk->sk_receive_queue.lock); 704 spin_unlock_bh(&sk->sk_receive_queue.lock);
699 return put_user(amount, (int __user *)arg); 705 return put_user(amount, (int __user *)arg);
700 } 706 }
701 707
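
Note: the raw.c hunks carry two independent fixes. The SIOCINQ path switches from spin_lock_irq to spin_lock_bh (the receive-queue lock is only taken from process and softirq context), and raw_sendmsg drops the `len < 0` test because len is now size_t: a "negative" value arriving through the unsigned type already fails `len > 0xFFFF`. A runnable illustration of the latter:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t len = (size_t)-1;	/* what a negative int becomes */

	/* No separate "< 0" test is needed once the type is
	 * unsigned: the wrapped value trips the upper bound. */
	if (len > 0xFFFF)
		printf("rejected: len=%zu exceeds 65535\n", len);
	return 0;
}
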
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a682d28e247b..d675ff80b04d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
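
Note: this hunk shrinks struct rt_hash_bucket to a bare chain pointer and stripes a fixed, power-of-two table of spinlocks across all buckets, sized by NR_CPUS as the comment above shows. A runnable userspace sketch of the same lock-striping idea using pthreads; RT_DEMO_LOCK_SZ and the bucket layout are illustrative, not the kernel's:

#include <pthread.h>

#define RT_DEMO_LOCK_SZ 256		/* must be a power of two */

static pthread_spinlock_t demo_locks[RT_DEMO_LOCK_SZ];

/* Map a bucket index onto one of the shared locks. */
static pthread_spinlock_t *demo_lock_addr(unsigned int slot)
{
	return &demo_locks[slot & (RT_DEMO_LOCK_SZ - 1)];
}

static void demo_lock_init(void)
{
	for (int i = 0; i < RT_DEMO_LOCK_SZ; i++)
		pthread_spin_init(&demo_locks[i], PTHREAD_PROCESS_PRIVATE);
}

/* Usage: guard bucket i of a much larger hash table. */
static void demo_touch_bucket(unsigned int i)
{
	pthread_spin_lock(demo_lock_addr(i));
	/* ... walk or modify hash chain i ... */
	pthread_spin_unlock(demo_lock_addr(i));
}

The trade-off: adjacent buckets may contend on the same lock, but the lock table stays cache-resident and the per-bucket memory cost drops sharply for large route caches.
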
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
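
Note: rt_check_expire used to walk ip_rt_gc_interval << rt_hash_log buckets per run regardless of the timeout; the rewrite computes a per-invocation goal = (interval << log) / timeout in 64-bit arithmetic (do_div, since kernel code cannot rely on native 64-bit division on every arch) and clamps it to the table size. A quick userspace check of the arithmetic, with made-up tunables:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t gc_interval = 60;	/* illustrative values only */
	unsigned int hash_log = 17, gc_timeout = 300;
	unsigned int hash_mask = (1u << 17) - 1;

	uint64_t mult = gc_interval << hash_log;	/* 64-bit on purpose */
	unsigned int goal = (unsigned int)(mult / gc_timeout);
	if (goal > hash_mask)
		goal = hash_mask + 1;

	printf("scan %u of %u buckets per timer tick\n", goal, hash_mask + 1);
	return 0;
}
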
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648 "%u.%u.%u.%u, on dev %s\n", 1686 "%u.%u.%u.%u, on dev %s\n",
1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650 if (dev->hard_header_len) { 1688 if (dev->hard_header_len && skb->mac.raw) {
1651 int i; 1689 int i;
1652 unsigned char *p = skb->mac.raw; 1690 unsigned char *p = skb->mac.raw;
1653 printk(KERN_WARNING "ll header: "); 1691 printk(KERN_WARNING "ll header: ");
@@ -1767,7 +1805,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
1767 struct in_device *in_dev, 1805 struct in_device *in_dev,
1768 u32 daddr, u32 saddr, u32 tos) 1806 u32 daddr, u32 saddr, u32 tos)
1769{ 1807{
1770 struct rtable* rth; 1808 struct rtable* rth = NULL;
1771 int err; 1809 int err;
1772 unsigned hash; 1810 unsigned hash;
1773 1811
@@ -1794,7 +1832,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
1794 u32 daddr, u32 saddr, u32 tos) 1832 u32 daddr, u32 saddr, u32 tos)
1795{ 1833{
1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 1834#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1797 struct rtable* rth; 1835 struct rtable* rth = NULL;
1798 unsigned char hop, hopcount, lasthop; 1836 unsigned char hop, hopcount, lasthop;
1799 int err = -EINVAL; 1837 int err = -EINVAL;
1800 unsigned int hash; 1838 unsigned int hash;
@@ -1909,7 +1947,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1909 */ 1947 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev)) 1949 if (!IN_DEV_FORWARD(in_dev))
1912 goto e_inval; 1950 goto e_hostunreach;
1913 goto no_route; 1951 goto no_route;
1914 } 1952 }
1915 free_res = 1; 1953 free_res = 1;
@@ -1933,7 +1971,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1933 } 1971 }
1934 1972
1935 if (!IN_DEV_FORWARD(in_dev)) 1973 if (!IN_DEV_FORWARD(in_dev))
1936 goto e_inval; 1974 goto e_hostunreach;
1937 if (res.type != RTN_UNICAST) 1975 if (res.type != RTN_UNICAST)
1938 goto martian_destination; 1976 goto martian_destination;
1939 1977
@@ -2025,6 +2063,11 @@ martian_destination:
2025 "%u.%u.%u.%u, dev %s\n", 2063 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif 2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2028e_inval: 2071e_inval:
2029 err = -EINVAL; 2072 err = -EINVAL;
2030 goto done; 2073 goto done;
@@ -2239,7 +2282,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
2239 struct net_device *dev_out, 2282 struct net_device *dev_out,
2240 unsigned flags) 2283 unsigned flags)
2241{ 2284{
2242 struct rtable *rth; 2285 struct rtable *rth = NULL;
2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); 2286 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2244 unsigned hash; 2287 unsigned hash;
2245 if (err == 0) { 2288 if (err == 0) {
@@ -2267,7 +2310,7 @@ static inline int ip_mkroute_output(struct rtable** rp,
2267 unsigned char hop; 2310 unsigned char hop;
2268 unsigned hash; 2311 unsigned hash;
2269 int err = -EINVAL; 2312 int err = -EINVAL;
2270 struct rtable *rth; 2313 struct rtable *rth = NULL;
2271 2314
2272 if (res->fi && res->fi->fib_nhs > 1) { 2315 if (res->fi && res->fi->fib_nhs > 1) {
2273 unsigned char hopcount = res->fi->fib_nhs; 2316 unsigned char hopcount = res->fi->fib_nhs;
@@ -2581,7 +2624,7 @@ int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2581} 2624}
2582 2625
2583static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 2626static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2584 int nowait) 2627 int nowait, unsigned int flags)
2585{ 2628{
2586 struct rtable *rt = (struct rtable*)skb->dst; 2629 struct rtable *rt = (struct rtable*)skb->dst;
2587 struct rtmsg *r; 2630 struct rtmsg *r;
@@ -2591,9 +2634,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2591#ifdef CONFIG_IP_MROUTE 2634#ifdef CONFIG_IP_MROUTE
2592 struct rtattr *eptr; 2635 struct rtattr *eptr;
2593#endif 2636#endif
2594 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); 2637 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2595 r = NLMSG_DATA(nlh); 2638 r = NLMSG_DATA(nlh);
2596 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2597 r->rtm_family = AF_INET; 2639 r->rtm_family = AF_INET;
2598 r->rtm_dst_len = 32; 2640 r->rtm_dst_len = 32;
2599 r->rtm_src_len = 0; 2641 r->rtm_src_len = 0;
@@ -2744,7 +2786,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2744 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; 2786 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2745 2787
2746 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2788 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2747 RTM_NEWROUTE, 0); 2789 RTM_NEWROUTE, 0, 0);
2748 if (!err) 2790 if (!err)
2749 goto out_free; 2791 goto out_free;
2750 if (err < 0) { 2792 if (err < 0) {
@@ -2781,8 +2823,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2781 continue; 2823 continue;
2782 skb->dst = dst_clone(&rt->u.dst); 2824 skb->dst = dst_clone(&rt->u.dst);
2783 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 2825 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2784 cb->nlh->nlmsg_seq, 2826 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2785 RTM_NEWROUTE, 1) <= 0) { 2827 1, NLM_F_MULTI) <= 0) {
2786 dst_release(xchg(&skb->dst, NULL)); 2828 dst_release(xchg(&skb->dst, NULL));
2787 rcu_read_unlock_bh(); 2829 rcu_read_unlock_bh();
2788 goto done; 2830 goto done;
@@ -3069,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3069 3111
3070int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3071{ 3113{
3072 int i, order, goal, rc = 0; 3114 int rc = 0;
3073 3115
3074 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3075 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3076 3118
3077#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3078 for (order = 0; 3122 for (order = 0;
3079 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3080 /* NOTHING */; 3124 /* NOTHING */;
@@ -3082,6 +3126,7 @@ int __init ip_rt_init(void)
3082 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3083 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3084 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3085#endif 3130#endif
3086 3131
3087 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3092,36 +3137,19 @@ int __init ip_rt_init(void)
3092 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3093 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3094 3139
3095 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3096 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3097 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3098 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3099 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3100 3145 (27 - PAGE_SHIFT) :
3101 do { 3146 (29 - PAGE_SHIFT),
3102 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3103 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3104 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3105 rt_hash_mask--; 3150 0);
3106 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3107 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3108 } while (rt_hash_table == NULL && --order > 0);
3109
3110 if (!rt_hash_table)
3111 panic("Failed to allocate IP route cache hash table\n");
3112
3113 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3114 rt_hash_mask,
3115 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3116
3117 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3118 /* NOTHING */;
3119
3120 rt_hash_mask--;
3121 for (i = 0; i <= rt_hash_mask; i++) {
3122 spin_lock_init(&rt_hash_table[i].lock);
3123 rt_hash_table[i].chain = NULL;
3124 }
3125 3153
3126 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3127 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
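
Note: ip_rt_init no longer sizes and allocates the bucket array by hand; alloc_large_system_hash returns a power-of-two table and fills in rt_hash_log and rt_hash_mask for it. The invariant the rest of the file relies on is simply size = 1 << log and mask = size - 1. A tiny runnable demonstration with an illustrative entry count:

#include <stdio.h>

int main(void)
{
	unsigned int log = 0, mask, want = 150000;	/* illustrative */

	while ((1u << log) < want)	/* round up to a power of two */
		log++;
	mask = (1u << log) - 1;

	/* Any hash value h then indexes the table as h & mask. */
	printf("log=%u size=%u mask=%#x\n", log, 1u << log, mask);
	return 0;
}
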
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e923d2f021aa..72d014442185 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,10 +169,10 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170} 170}
171 171
172extern struct or_calltable or_ipv4; 172extern struct request_sock_ops tcp_request_sock_ops;
173 173
174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
175 struct open_request *req, 175 struct request_sock *req,
176 struct dst_entry *dst) 176 struct dst_entry *dst)
177{ 177{
178 struct tcp_sock *tp = tcp_sk(sk); 178 struct tcp_sock *tp = tcp_sk(sk);
@@ -182,7 +182,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
182 if (child) 182 if (child)
183 tcp_acceptq_queue(sk, req, child); 183 tcp_acceptq_queue(sk, req, child);
184 else 184 else
185 tcp_openreq_free(req); 185 reqsk_free(req);
186 186
187 return child; 187 return child;
188} 188}
@@ -190,10 +190,12 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
191 struct ip_options *opt) 191 struct ip_options *opt)
192{ 192{
193 struct inet_request_sock *ireq;
194 struct tcp_request_sock *treq;
193 struct tcp_sock *tp = tcp_sk(sk); 195 struct tcp_sock *tp = tcp_sk(sk);
194 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; 196 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
195 struct sock *ret = sk; 197 struct sock *ret = sk;
196 struct open_request *req; 198 struct request_sock *req;
197 int mss; 199 int mss;
198 struct rtable *rt; 200 struct rtable *rt;
199 __u8 rcv_wscale; 201 __u8 rcv_wscale;
@@ -209,19 +211,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
209 211
210 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); 212 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
211 213
212 req = tcp_openreq_alloc();
213 ret = NULL; 214 ret = NULL;
215 req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
214 if (!req) 216 if (!req)
215 goto out; 217 goto out;
216 218
217 req->rcv_isn = htonl(skb->h.th->seq) - 1; 219 ireq = inet_rsk(req);
218 req->snt_isn = cookie; 220 treq = tcp_rsk(req);
221 treq->rcv_isn = htonl(skb->h.th->seq) - 1;
222 treq->snt_isn = cookie;
219 req->mss = mss; 223 req->mss = mss;
220 req->rmt_port = skb->h.th->source; 224 ireq->rmt_port = skb->h.th->source;
221 req->af.v4_req.loc_addr = skb->nh.iph->daddr; 225 ireq->loc_addr = skb->nh.iph->daddr;
222 req->af.v4_req.rmt_addr = skb->nh.iph->saddr; 226 ireq->rmt_addr = skb->nh.iph->saddr;
223 req->class = &or_ipv4; /* for savety */ 227 ireq->opt = NULL;
224 req->af.v4_req.opt = NULL;
225 228
226 /* We throwed the options of the initial SYN away, so we hope 229 /* We throwed the options of the initial SYN away, so we hope
227 * the ACK carries the same options again (see RFC1122 4.2.3.8) 230 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -229,17 +232,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
229 if (opt && opt->optlen) { 232 if (opt && opt->optlen) {
230 int opt_size = sizeof(struct ip_options) + opt->optlen; 233 int opt_size = sizeof(struct ip_options) + opt->optlen;
231 234
232 req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC); 235 ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
233 if (req->af.v4_req.opt) { 236 if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
234 if (ip_options_echo(req->af.v4_req.opt, skb)) { 237 kfree(ireq->opt);
235 kfree(req->af.v4_req.opt); 238 ireq->opt = NULL;
236 req->af.v4_req.opt = NULL;
237 }
238 } 239 }
239 } 240 }
240 241
241 req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; 242 ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
242 req->wscale_ok = req->sack_ok = 0; 243 ireq->wscale_ok = ireq->sack_ok = 0;
243 req->expires = 0UL; 244 req->expires = 0UL;
244 req->retrans = 0; 245 req->retrans = 0;
245 246
@@ -253,15 +254,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
253 struct flowi fl = { .nl_u = { .ip4_u = 254 struct flowi fl = { .nl_u = { .ip4_u =
254 { .daddr = ((opt && opt->srr) ? 255 { .daddr = ((opt && opt->srr) ?
255 opt->faddr : 256 opt->faddr :
256 req->af.v4_req.rmt_addr), 257 ireq->rmt_addr),
257 .saddr = req->af.v4_req.loc_addr, 258 .saddr = ireq->loc_addr,
258 .tos = RT_CONN_FLAGS(sk) } }, 259 .tos = RT_CONN_FLAGS(sk) } },
259 .proto = IPPROTO_TCP, 260 .proto = IPPROTO_TCP,
260 .uli_u = { .ports = 261 .uli_u = { .ports =
261 { .sport = skb->h.th->dest, 262 { .sport = skb->h.th->dest,
262 .dport = skb->h.th->source } } }; 263 .dport = skb->h.th->source } } };
263 if (ip_route_output_key(&rt, &fl)) { 264 if (ip_route_output_key(&rt, &fl)) {
264 tcp_openreq_free(req); 265 reqsk_free(req);
265 goto out; 266 goto out;
266 } 267 }
267 } 268 }
@@ -272,7 +273,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
272 &req->rcv_wnd, &req->window_clamp, 273 &req->rcv_wnd, &req->window_clamp,
273 0, &rcv_wscale); 274 0, &rcv_wscale);
274 /* BTW win scale with syncookies is 0 by definition */ 275 /* BTW win scale with syncookies is 0 by definition */
275 req->rcv_wscale = rcv_wscale; 276 ireq->rcv_wscale = rcv_wscale;
276 277
277 ret = get_cookie_sock(sk, skb, req, &rt->u.dst); 278 ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
278out: return ret; 279out: return ret;
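
Note: the syncookies hunks track the open_request to request_sock conversion: protocol-specific state now lives in structs that embed the generic struct request_sock at offset zero, and inet_rsk()/tcp_rsk() are in effect downcasts. A runnable userspace sketch of that layering; all demo_* types and accessors are hypothetical stand-ins, not the kernel definitions:

#include <stdio.h>
#include <stdlib.h>

struct demo_request_sock {		/* generic part (mss, retrans, ...) */
	unsigned int mss;
};

struct demo_inet_request_sock {		/* AF_INET part */
	struct demo_request_sock req;	/* must stay first */
	unsigned int rmt_port;
};

struct demo_tcp_request_sock {		/* TCP part */
	struct demo_inet_request_sock ireq;	/* must stay first */
	unsigned int rcv_isn, snt_isn;
};

/* The kernel's inet_rsk()/tcp_rsk() reduce to casts like these,
 * valid because each specialization embeds its parent at offset 0. */
static struct demo_inet_request_sock *demo_inet_rsk(struct demo_request_sock *r)
{
	return (struct demo_inet_request_sock *)r;
}

static struct demo_tcp_request_sock *demo_tcp_rsk(struct demo_request_sock *r)
{
	return (struct demo_tcp_request_sock *)r;
}

int main(void)
{
	struct demo_tcp_request_sock *t = calloc(1, sizeof(*t));
	struct demo_request_sock *req = &t->ireq.req;

	demo_inet_rsk(req)->rmt_port = 4242;
	demo_tcp_rsk(req)->snt_isn = 1;
	printf("port=%u isn=%u\n", demo_inet_rsk(req)->rmt_port,
	       demo_tcp_rsk(req)->snt_isn);
	free(t);
	return 0;
}
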
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3aafb298c1c1..e32894532416 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -23,6 +23,7 @@ extern int sysctl_ip_nonlocal_bind;
23extern int sysctl_icmp_echo_ignore_all; 23extern int sysctl_icmp_echo_ignore_all;
24extern int sysctl_icmp_echo_ignore_broadcasts; 24extern int sysctl_icmp_echo_ignore_broadcasts;
25extern int sysctl_icmp_ignore_bogus_error_responses; 25extern int sysctl_icmp_ignore_bogus_error_responses;
26extern int sysctl_icmp_errors_use_inbound_ifaddr;
26 27
27/* From ip_fragment.c */ 28/* From ip_fragment.c */
28extern int sysctl_ipfrag_low_thresh; 29extern int sysctl_ipfrag_low_thresh;
@@ -117,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
117 return 1; 118 return 1;
118} 119}
119 120
121static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
122 void __user *buffer, size_t *lenp, loff_t *ppos)
123{
124 char val[TCP_CA_NAME_MAX];
125 ctl_table tbl = {
126 .data = val,
127 .maxlen = TCP_CA_NAME_MAX,
128 };
129 int ret;
130
131 tcp_get_default_congestion_control(val);
132
133 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
134 if (write && ret == 0)
135 ret = tcp_set_default_congestion_control(val);
136 return ret;
137}
138
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
140 void __user *oldval, size_t __user *oldlenp,
141 void __user *newval, size_t newlen,
142 void **context)
143{
144 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = {
146 .data = val,
147 .maxlen = TCP_CA_NAME_MAX,
148 };
149 int ret;
150
151 tcp_get_default_congestion_control(val);
152 ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
153 context);
154 if (ret == 0 && newval && newlen)
155 ret = tcp_set_default_congestion_control(val);
156 return ret;
157}
158
159
120ctl_table ipv4_table[] = { 160ctl_table ipv4_table[] = {
121 { 161 {
122 .ctl_name = NET_IPV4_TCP_TIMESTAMPS, 162 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
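
Note: the two handlers added above expose the default congestion control as a string sysctl: proc_tcp_congestion_control services /proc/sys reads and writes via proc_dostring, sysctl_tcp_congestion_control the binary sysctl(2) path, and both funnel writes through tcp_set_default_congestion_control. From userspace the proc file then behaves like any other string sysctl; a minimal runnable example (error handling kept deliberately thin):

#include <stdio.h>

#define CC_PATH "/proc/sys/net/ipv4/tcp_congestion_control"

int main(void)
{
	char name[16] = "";
	FILE *f = fopen(CC_PATH, "r");

	if (f && fgets(name, sizeof(name), f))
		printf("default congestion control: %s", name);
	if (f)
		fclose(f);

	/* Writing a registered algorithm name switches the default,
	 * e.g.: echo bic > /proc/sys/net/ipv4/tcp_congestion_control */
	return 0;
}
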
@@ -396,6 +436,14 @@ ctl_table ipv4_table[] = {
396 .proc_handler = &proc_dointvec 436 .proc_handler = &proc_dointvec
397 }, 437 },
398 { 438 {
439 .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
440 .procname = "icmp_errors_use_inbound_ifaddr",
441 .data = &sysctl_icmp_errors_use_inbound_ifaddr,
442 .maxlen = sizeof(int),
443 .mode = 0644,
444 .proc_handler = &proc_dointvec
445 },
446 {
399 .ctl_name = NET_IPV4_ROUTE, 447 .ctl_name = NET_IPV4_ROUTE,
400 .procname = "route", 448 .procname = "route",
401 .maxlen = 0, 449 .maxlen = 0,
@@ -603,70 +651,6 @@ ctl_table ipv4_table[] = {
603 .proc_handler = &proc_dointvec, 651 .proc_handler = &proc_dointvec,
604 }, 652 },
605 { 653 {
606 .ctl_name = NET_TCP_WESTWOOD,
607 .procname = "tcp_westwood",
608 .data = &sysctl_tcp_westwood,
609 .maxlen = sizeof(int),
610 .mode = 0644,
611 .proc_handler = &proc_dointvec,
612 },
613 {
614 .ctl_name = NET_TCP_VEGAS,
615 .procname = "tcp_vegas_cong_avoid",
616 .data = &sysctl_tcp_vegas_cong_avoid,
617 .maxlen = sizeof(int),
618 .mode = 0644,
619 .proc_handler = &proc_dointvec,
620 },
621 {
622 .ctl_name = NET_TCP_VEGAS_ALPHA,
623 .procname = "tcp_vegas_alpha",
624 .data = &sysctl_tcp_vegas_alpha,
625 .maxlen = sizeof(int),
626 .mode = 0644,
627 .proc_handler = &proc_dointvec,
628 },
629 {
630 .ctl_name = NET_TCP_VEGAS_BETA,
631 .procname = "tcp_vegas_beta",
632 .data = &sysctl_tcp_vegas_beta,
633 .maxlen = sizeof(int),
634 .mode = 0644,
635 .proc_handler = &proc_dointvec,
636 },
637 {
638 .ctl_name = NET_TCP_VEGAS_GAMMA,
639 .procname = "tcp_vegas_gamma",
640 .data = &sysctl_tcp_vegas_gamma,
641 .maxlen = sizeof(int),
642 .mode = 0644,
643 .proc_handler = &proc_dointvec,
644 },
645 {
646 .ctl_name = NET_TCP_BIC,
647 .procname = "tcp_bic",
648 .data = &sysctl_tcp_bic,
649 .maxlen = sizeof(int),
650 .mode = 0644,
651 .proc_handler = &proc_dointvec,
652 },
653 {
654 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
655 .procname = "tcp_bic_fast_convergence",
656 .data = &sysctl_tcp_bic_fast_convergence,
657 .maxlen = sizeof(int),
658 .mode = 0644,
659 .proc_handler = &proc_dointvec,
660 },
661 {
662 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
663 .procname = "tcp_bic_low_window",
664 .data = &sysctl_tcp_bic_low_window,
665 .maxlen = sizeof(int),
666 .mode = 0644,
667 .proc_handler = &proc_dointvec,
668 },
669 {
670 .ctl_name = NET_TCP_MODERATE_RCVBUF, 654 .ctl_name = NET_TCP_MODERATE_RCVBUF,
671 .procname = "tcp_moderate_rcvbuf", 655 .procname = "tcp_moderate_rcvbuf",
672 .data = &sysctl_tcp_moderate_rcvbuf, 656 .data = &sysctl_tcp_moderate_rcvbuf,
@@ -683,13 +667,14 @@ ctl_table ipv4_table[] = {
683 .proc_handler = &proc_dointvec, 667 .proc_handler = &proc_dointvec,
684 }, 668 },
685 { 669 {
686 .ctl_name = NET_TCP_BIC_BETA, 670 .ctl_name = NET_TCP_CONG_CONTROL,
687 .procname = "tcp_bic_beta", 671 .procname = "tcp_congestion_control",
688 .data = &sysctl_tcp_bic_beta,
689 .maxlen = sizeof(int),
690 .mode = 0644, 672 .mode = 0644,
691 .proc_handler = &proc_dointvec, 673 .maxlen = TCP_CA_NAME_MAX,
674 .proc_handler = &proc_tcp_congestion_control,
675 .strategy = &sysctl_tcp_congestion_control,
692 }, 676 },
677
693 { .ctl_name = 0 } 678 { .ctl_name = 0 }
694}; 679};
695 680
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a037bafcba3c..ddb6ce4ecff2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -271,7 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); 272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273 273
274kmem_cache_t *tcp_openreq_cachep;
275kmem_cache_t *tcp_bucket_cachep; 274kmem_cache_t *tcp_bucket_cachep;
276kmem_cache_t *tcp_timewait_cachep; 275kmem_cache_t *tcp_timewait_cachep;
277 276
@@ -317,7 +316,7 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure);
317static __inline__ unsigned int tcp_listen_poll(struct sock *sk, 316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318 poll_table *wait) 317 poll_table *wait)
319{ 318{
320 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0; 319 return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
321} 320}
322 321
323/* 322/*
@@ -463,28 +462,15 @@ int tcp_listen_start(struct sock *sk)
463{ 462{
464 struct inet_sock *inet = inet_sk(sk); 463 struct inet_sock *inet = inet_sk(sk);
465 struct tcp_sock *tp = tcp_sk(sk); 464 struct tcp_sock *tp = tcp_sk(sk);
466 struct tcp_listen_opt *lopt; 465 int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467 if (rc != 0)
468 return rc;
467 469
468 sk->sk_max_ack_backlog = 0; 470 sk->sk_max_ack_backlog = 0;
469 sk->sk_ack_backlog = 0; 471 sk->sk_ack_backlog = 0;
470 tp->accept_queue = tp->accept_queue_tail = NULL;
471 rwlock_init(&tp->syn_wait_lock);
472 tcp_delack_init(tp); 472 tcp_delack_init(tp);
473 473
474 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475 if (!lopt)
476 return -ENOMEM;
477
478 memset(lopt, 0, sizeof(struct tcp_listen_opt));
479 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481 break;
482 get_random_bytes(&lopt->hash_rnd, 4);
483
484 write_lock_bh(&tp->syn_wait_lock);
485 tp->listen_opt = lopt;
486 write_unlock_bh(&tp->syn_wait_lock);
487
488 /* There is race window here: we announce ourselves listening, 474 /* There is race window here: we announce ourselves listening,
489 * but this transition is still not validated by get_port(). 475 * but this transition is still not validated by get_port().
490 * It is OK, because this socket enters to hash table only 476 * It is OK, because this socket enters to hash table only
@@ -501,10 +487,7 @@ int tcp_listen_start(struct sock *sk)
501 } 487 }
502 488
503 sk->sk_state = TCP_CLOSE; 489 sk->sk_state = TCP_CLOSE;
504 write_lock_bh(&tp->syn_wait_lock); 490 reqsk_queue_destroy(&tp->accept_queue);
505 tp->listen_opt = NULL;
506 write_unlock_bh(&tp->syn_wait_lock);
507 kfree(lopt);
508 return -EADDRINUSE; 491 return -EADDRINUSE;
509} 492}
510 493
@@ -516,25 +499,23 @@ int tcp_listen_start(struct sock *sk)
516static void tcp_listen_stop (struct sock *sk) 499static void tcp_listen_stop (struct sock *sk)
517{ 500{
518 struct tcp_sock *tp = tcp_sk(sk); 501 struct tcp_sock *tp = tcp_sk(sk);
519 struct tcp_listen_opt *lopt = tp->listen_opt; 502 struct listen_sock *lopt;
520 struct open_request *acc_req = tp->accept_queue; 503 struct request_sock *acc_req;
521 struct open_request *req; 504 struct request_sock *req;
522 int i; 505 int i;
523 506
524 tcp_delete_keepalive_timer(sk); 507 tcp_delete_keepalive_timer(sk);
525 508
526 /* make all the listen_opt local to us */ 509 /* make all the listen_opt local to us */
527 write_lock_bh(&tp->syn_wait_lock); 510 lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
528 tp->listen_opt = NULL; 511 acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
529 write_unlock_bh(&tp->syn_wait_lock);
530 tp->accept_queue = tp->accept_queue_tail = NULL;
531 512
532 if (lopt->qlen) { 513 if (lopt->qlen) {
533 for (i = 0; i < TCP_SYNQ_HSIZE; i++) { 514 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534 while ((req = lopt->syn_table[i]) != NULL) { 515 while ((req = lopt->syn_table[i]) != NULL) {
535 lopt->syn_table[i] = req->dl_next; 516 lopt->syn_table[i] = req->dl_next;
536 lopt->qlen--; 517 lopt->qlen--;
537 tcp_openreq_free(req); 518 reqsk_free(req);
538 519
539 /* Following specs, it would be better either to send FIN 520 /* Following specs, it would be better either to send FIN
540 * (and enter FIN-WAIT-1, it is normal close) 521 * (and enter FIN-WAIT-1, it is normal close)
@@ -574,7 +555,7 @@ static void tcp_listen_stop (struct sock *sk)
574 sock_put(child); 555 sock_put(child);
575 556
576 sk_acceptq_removed(sk); 557 sk_acceptq_removed(sk);
577 tcp_openreq_fastfree(req); 558 __reqsk_free(req);
578 } 559 }
579 BUG_TRAP(!sk->sk_ack_backlog); 560 BUG_TRAP(!sk->sk_ack_backlog);
580} 561}
@@ -634,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
634 size_t psize, int flags) 615 size_t psize, int flags)
635{ 616{
636 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
637 int mss_now; 618 int mss_now, size_goal;
638 int err; 619 int err;
639 ssize_t copied; 620 ssize_t copied;
640 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -647,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
647 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
648 629
649 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
650 copied = 0; 632 copied = 0;
651 633
652 err = -EPIPE; 634 err = -EPIPE;
@@ -660,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
660 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
661 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
662 644
663 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
664new_segment: 646new_segment:
665 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
666 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -671,7 +653,7 @@ new_segment:
671 goto wait_for_memory; 653 goto wait_for_memory;
672 654
673 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
674 copy = mss_now; 656 copy = size_goal;
675 } 657 }
676 658
677 if (copy > size) 659 if (copy > size)
@@ -712,7 +694,7 @@ new_segment:
712 if (!(psize -= copy)) 694 if (!(psize -= copy))
713 goto out; 695 goto out;
714 696
715 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
716 continue; 698 continue;
717 699
718 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -732,6 +714,7 @@ wait_for_memory:
732 goto do_error; 714 goto do_error;
733 715
734 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
735 } 718 }
736 719
737out: 720out:
@@ -773,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
773 756
774static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
775{ 758{
776 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
777 760
778 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
779 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
780 766
781 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
782 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
783 tmp = pgbreak; 769 tmp = pgbreak;
770 }
784 } 771 }
772
785 return tmp; 773 return tmp;
786} 774}
787 775
@@ -792,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
793 struct sk_buff *skb; 781 struct sk_buff *skb;
794 int iovlen, flags; 782 int iovlen, flags;
795 int mss_now; 783 int mss_now, size_goal;
796 int err, copied; 784 int err, copied;
797 long timeo; 785 long timeo;
798 786
@@ -811,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
811 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
812 800
813 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
814 803
815 /* Ok commence sending. */ 804 /* Ok commence sending. */
816 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -833,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
833 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
834 823
835 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
836 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
837 826
838new_segment: 827new_segment:
839 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -856,7 +845,7 @@ new_segment:
856 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
857 846
858 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
859 copy = mss_now; 848 copy = size_goal;
860 } 849 }
861 850
862 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -891,11 +880,6 @@ new_segment:
891 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
892 goto new_segment; 881 goto new_segment;
893 } else if (page) { 882 } else if (page) {
894 /* If page is cached, align
895 * offset to L1 cache boundary
896 */
897 off = (off + L1_CACHE_BYTES - 1) &
898 ~(L1_CACHE_BYTES - 1);
899 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
900 put_page(page); 884 put_page(page);
901 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -956,7 +940,7 @@ new_segment:
956 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
957 goto out; 941 goto out;
958 942
959 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
960 continue; 944 continue;
961 945
962 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -976,6 +960,7 @@ wait_for_memory:
976 goto do_error; 960 goto do_error;
977 961
978 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
979 } 964 }
980 } 965 }
981 966
@@ -1120,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
1120 struct sk_buff *skb; 1105 struct sk_buff *skb;
1121 struct tcp_sock *tp = tcp_sk(sk); 1106 struct tcp_sock *tp = tcp_sk(sk);
1122 1107
1123 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); 1108 NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1124 1109
1125 /* RX process wants to run with disabled BHs, though it is not 1110 /* RX process wants to run with disabled BHs, though it is not
1126 * necessary */ 1111 * necessary */
@@ -1345,7 +1330,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1345 1330
1346 cleanup_rbuf(sk, copied); 1331 cleanup_rbuf(sk, copied);
1347 1332
1348 if (tp->ucopy.task == user_recv) { 1333 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1349 /* Install new reader */ 1334 /* Install new reader */
1350 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { 1335 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351 user_recv = current; 1336 user_recv = current;
@@ -1384,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1384 * is not empty. It is more elegant, but eats cycles, 1369 * is not empty. It is more elegant, but eats cycles,
1385 * unfortunately. 1370 * unfortunately.
1386 */ 1371 */
1387 if (skb_queue_len(&tp->ucopy.prequeue)) 1372 if (!skb_queue_empty(&tp->ucopy.prequeue))
1388 goto do_prequeue; 1373 goto do_prequeue;
1389 1374
1390 /* __ Set realtime policy in scheduler __ */ 1375 /* __ Set realtime policy in scheduler __ */
@@ -1409,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1409 } 1394 }
1410 1395
1411 if (tp->rcv_nxt == tp->copied_seq && 1396 if (tp->rcv_nxt == tp->copied_seq &&
1412 skb_queue_len(&tp->ucopy.prequeue)) { 1397 !skb_queue_empty(&tp->ucopy.prequeue)) {
1413do_prequeue: 1398do_prequeue:
1414 tcp_prequeue_process(sk); 1399 tcp_prequeue_process(sk);
1415 1400
@@ -1491,7 +1476,7 @@ skip_copy:
1491 } while (len > 0); 1476 } while (len > 0);
1492 1477
1493 if (user_recv) { 1478 if (user_recv) {
1494 if (skb_queue_len(&tp->ucopy.prequeue)) { 1479 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1495 int chunk; 1480 int chunk;
1496 1481
1497 tp->ucopy.len = copied > 0 ? len : 0; 1482 tp->ucopy.len = copied > 0 ? len : 0;
@@ -1868,11 +1853,11 @@ static int wait_for_connect(struct sock *sk, long timeo)
1868 prepare_to_wait_exclusive(sk->sk_sleep, &wait, 1853 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869 TASK_INTERRUPTIBLE); 1854 TASK_INTERRUPTIBLE);
1870 release_sock(sk); 1855 release_sock(sk);
1871 if (!tp->accept_queue) 1856 if (reqsk_queue_empty(&tp->accept_queue))
1872 timeo = schedule_timeout(timeo); 1857 timeo = schedule_timeout(timeo);
1873 lock_sock(sk); 1858 lock_sock(sk);
1874 err = 0; 1859 err = 0;
1875 if (tp->accept_queue) 1860 if (!reqsk_queue_empty(&tp->accept_queue))
1876 break; 1861 break;
1877 err = -EINVAL; 1862 err = -EINVAL;
1878 if (sk->sk_state != TCP_LISTEN) 1863 if (sk->sk_state != TCP_LISTEN)
@@ -1895,7 +1880,6 @@ static int wait_for_connect(struct sock *sk, long timeo)
1895struct sock *tcp_accept(struct sock *sk, int flags, int *err) 1880struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896{ 1881{
1897 struct tcp_sock *tp = tcp_sk(sk); 1882 struct tcp_sock *tp = tcp_sk(sk);
1898 struct open_request *req;
1899 struct sock *newsk; 1883 struct sock *newsk;
1900 int error; 1884 int error;
1901 1885
@@ -1906,37 +1890,31 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1906 */ 1890 */
1907 error = -EINVAL; 1891 error = -EINVAL;
1908 if (sk->sk_state != TCP_LISTEN) 1892 if (sk->sk_state != TCP_LISTEN)
1909 goto out; 1893 goto out_err;
1910 1894
1911 /* Find already established connection */ 1895 /* Find already established connection */
1912 if (!tp->accept_queue) { 1896 if (reqsk_queue_empty(&tp->accept_queue)) {
1913 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 1897 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914 1898
1915 /* If this is a non blocking socket don't sleep */ 1899 /* If this is a non blocking socket don't sleep */
1916 error = -EAGAIN; 1900 error = -EAGAIN;
1917 if (!timeo) 1901 if (!timeo)
1918 goto out; 1902 goto out_err;
1919 1903
1920 error = wait_for_connect(sk, timeo); 1904 error = wait_for_connect(sk, timeo);
1921 if (error) 1905 if (error)
1922 goto out; 1906 goto out_err;
1923 } 1907 }
1924 1908
1925 req = tp->accept_queue; 1909 newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1926 if ((tp->accept_queue = req->dl_next) == NULL)
1927 tp->accept_queue_tail = NULL;
1928
1929 newsk = req->sk;
1930 sk_acceptq_removed(sk);
1931 tcp_openreq_fastfree(req);
1932 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); 1910 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933 release_sock(sk);
1934 return newsk;
1935
1936out: 1911out:
1937 release_sock(sk); 1912 release_sock(sk);
1913 return newsk;
1914out_err:
1915 newsk = NULL;
1938 *err = error; 1916 *err = error;
1939 return NULL; 1917 goto out;
1940} 1918}
1941 1919
1942/* 1920/*
@@ -1953,6 +1931,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1953 return tp->af_specific->setsockopt(sk, level, optname, 1931 return tp->af_specific->setsockopt(sk, level, optname,
1954 optval, optlen); 1932 optval, optlen);
1955 1933
1934 /* This is a string value all the others are int's */
1935 if (optname == TCP_CONGESTION) {
1936 char name[TCP_CA_NAME_MAX];
1937
1938 if (optlen < 1)
1939 return -EINVAL;
1940
1941 val = strncpy_from_user(name, optval,
1942 min(TCP_CA_NAME_MAX-1, optlen));
1943 if (val < 0)
1944 return -EFAULT;
1945 name[val] = 0;
1946
1947 lock_sock(sk);
1948 err = tcp_set_congestion_control(tp, name);
1949 release_sock(sk);
1950 return err;
1951 }
1952
1956 if (optlen < sizeof(int)) 1953 if (optlen < sizeof(int))
1957 return -EINVAL; 1954 return -EINVAL;
1958 1955
@@ -2135,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2135 2132
2136 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2137 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2138 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2139 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2140 2137
2141 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2185,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2185 2182
2186 switch (optname) { 2183 switch (optname) {
2187 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2188 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2189 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2190 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2191 break; 2188 break;
@@ -2237,6 +2234,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2237 case TCP_QUICKACK: 2234 case TCP_QUICKACK:
2238 val = !tp->ack.pingpong; 2235 val = !tp->ack.pingpong;
2239 break; 2236 break;
2237
2238 case TCP_CONGESTION:
2239 if (get_user(len, optlen))
2240 return -EFAULT;
2241 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 if (put_user(len, optlen))
2243 return -EFAULT;
2244 if (copy_to_user(optval, tp->ca_ops->name, len))
2245 return -EFAULT;
2246 return 0;
2240 default: 2247 default:
2241 return -ENOPROTOOPT; 2248 return -ENOPROTOOPT;
2242 }; 2249 };
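
Note: together with the TCP_CONGESTION setsockopt hunk earlier in this file, this getsockopt counterpart lets individual sockets select an algorithm by name. A hedged userspace example; it assumes a kernel carrying this patch and a libc exposing TCP_CONGESTION (the option value introduced here is 13, hence the fallback define):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13	/* value used by this patch series */
#endif

int main(void)
{
	char name[16];
	socklen_t len = sizeof(name);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno", 4) < 0)
		perror("setsockopt TCP_CONGESTION");
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("socket now uses: %.*s\n", (int)len, name);
	close(fd);
	return 0;
}
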
@@ -2250,7 +2257,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2250 2257
2251 2258
2252extern void __skb_cb_too_small_for_tcp(int, int); 2259extern void __skb_cb_too_small_for_tcp(int, int);
2253extern void tcpdiag_init(void); 2260extern struct tcp_congestion_ops tcp_reno;
2254 2261
2255static __initdata unsigned long thash_entries; 2262static __initdata unsigned long thash_entries;
2256static int __init set_thash_entries(char *str) 2263static int __init set_thash_entries(char *str)
@@ -2271,13 +2278,6 @@ void __init tcp_init(void)
2271 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), 2278 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272 sizeof(skb->cb)); 2279 sizeof(skb->cb));
2273 2280
2274 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275 sizeof(struct open_request),
2276 0, SLAB_HWCACHE_ALIGN,
2277 NULL, NULL);
2278 if (!tcp_openreq_cachep)
2279 panic("tcp_init: Cannot alloc open_request cache.");
2280
2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", 2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282 sizeof(struct tcp_bind_bucket), 2282 sizeof(struct tcp_bind_bucket),
2283 0, SLAB_HWCACHE_ALIGN, 2283 0, SLAB_HWCACHE_ALIGN,
@@ -2338,7 +2338,7 @@ void __init tcp_init(void)
2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); 2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339 order++) 2339 order++)
2340 ; 2340 ;
2341 if (order > 4) { 2341 if (order >= 4) {
2342 sysctl_local_port_range[0] = 32768; 2342 sysctl_local_port_range[0] = 32768;
2343 sysctl_local_port_range[1] = 61000; 2343 sysctl_local_port_range[1] = 61000;
2344 sysctl_tcp_max_tw_buckets = 180000; 2344 sysctl_tcp_max_tw_buckets = 180000;
@@ -2366,6 +2366,8 @@ void __init tcp_init(void)
2366 printk(KERN_INFO "TCP: Hash tables configured " 2366 printk(KERN_INFO "TCP: Hash tables configured "
2367 "(established %d bind %d)\n", 2367 "(established %d bind %d)\n",
2368 tcp_ehash_size << 1, tcp_bhash_size); 2368 tcp_ehash_size << 1, tcp_bhash_size);
2369
2370 tcp_register_congestion_control(&tcp_reno);
2369} 2371}
2370 2372
2371EXPORT_SYMBOL(tcp_accept); 2373EXPORT_SYMBOL(tcp_accept);
@@ -2374,7 +2376,6 @@ EXPORT_SYMBOL(tcp_destroy_sock);
2374EXPORT_SYMBOL(tcp_disconnect); 2376EXPORT_SYMBOL(tcp_disconnect);
2375EXPORT_SYMBOL(tcp_getsockopt); 2377EXPORT_SYMBOL(tcp_getsockopt);
2376EXPORT_SYMBOL(tcp_ioctl); 2378EXPORT_SYMBOL(tcp_ioctl);
2377EXPORT_SYMBOL(tcp_openreq_cachep);
2378EXPORT_SYMBOL(tcp_poll); 2379EXPORT_SYMBOL(tcp_poll);
2379EXPORT_SYMBOL(tcp_read_sock); 2380EXPORT_SYMBOL(tcp_read_sock);
2380EXPORT_SYMBOL(tcp_recvmsg); 2381EXPORT_SYMBOL(tcp_recvmsg);
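
The TCP_CONGESTION case above makes the per-socket algorithm name readable from
user space. A minimal sketch of querying it (the TCP_CONGESTION value of 13 and
the 16-byte TCP_CA_NAME_MAX buffer are assumptions taken from this era's
headers; libc headers predating this patch will not define the option):

/* Query a TCP socket's congestion control algorithm via TCP_CONGESTION.
 * Illustrative sketch only.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13	/* assumed value from this patch series */
#endif

int main(void)
{
	char name[16];			/* TCP_CA_NAME_MAX in the kernel */
	socklen_t len = sizeof(name);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("congestion control: %.*s\n", (int)len, name);
	else
		perror("getsockopt(TCP_CONGESTION)");
	close(fd);
	return 0;
}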
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000000..ec38d45d6649
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
1/*
2 * Binary Increase Congestion control for TCP
3 *
4 * This is from the implementation of BICTCP in
5 * Lisong Xu, Khaled Harfoush, and Injong Rhee.
6 * "Binary Increase Congestion Control for Fast, Long Distance
7 * Networks" in IEEE INFOCOM 2004
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
10 *
11 * Unless BIC is enabled and the congestion window is large,
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor for beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/B
27 */
28
29static int fast_convergence = 1;
30static int max_increment = 32;
31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100;
36static int smooth_part = 20;
37
38module_param(fast_convergence, int, 0644);
39MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
40module_param(max_increment, int, 0644);
41MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
42module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative decrease");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "seconds the delay must stay low before entering low utilization mode");
50module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644);
53MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
54
55
56/* BIC TCP Parameters */
57struct bictcp {
58 u32 cnt; /* increase cwnd by 1 after cnt ACKs */
59 u32 last_max_cwnd; /* last maximum snd_cwnd */
60 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
71};
72
73static inline void bictcp_reset(struct bictcp *ca)
74{
75 ca->cnt = 0;
76 ca->last_max_cwnd = 0;
77 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0;
79 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87}
88
89static void bictcp_init(struct tcp_sock *tp)
90{
91 bictcp_reset(tcp_ca(tp));
92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh;
94}
95
96/*
97 * Compute congestion window to use.
98 */
99static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
100{
101 if (ca->last_cwnd == cwnd &&
102 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
103 return;
104
105 ca->last_cwnd = cwnd;
106 ca->last_time = tcp_time_stamp;
107
108 if (ca->epoch_start == 0) /* record the beginning of an epoch */
109 ca->epoch_start = tcp_time_stamp;
110
111 /* start off normal */
112 if (cwnd <= low_window) {
113 ca->cnt = cwnd;
114 return;
115 }
116
117 /* binary increase */
118 if (cwnd < ca->last_max_cwnd) {
119 __u32 dist = (ca->last_max_cwnd - cwnd)
120 / BICTCP_B;
121
122 if (dist > max_increment)
123 /* linear increase */
124 ca->cnt = cwnd / max_increment;
125 else if (dist <= 1U)
126 /* binary search increase */
127 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
128 else
129 /* binary search increase */
130 ca->cnt = cwnd / dist;
131 } else {
132 /* slow start and linear increase */
133 if (cwnd < ca->last_max_cwnd + BICTCP_B)
134 /* slow start */
135 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
136 else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
137 /* slow start */
138 ca->cnt = (cwnd * (BICTCP_B-1))
139 / (cwnd - ca->last_max_cwnd);
140 else
141 /* linear increase */
142 ca->cnt = cwnd / max_increment;
143 }
144
145 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 ||
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20;
150 }
151
152 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
153 if (ca->cnt == 0) /* cannot be zero */
154 ca->cnt = 1;
155}
156
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
160{
161 struct bictcp *ca = tcp_ca(tp);
162 u32 dist, delay;
163
164 /* No time stamp */
165 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
166 /* Discard delay samples right after fast recovery */
167 tcp_time_stamp < ca->epoch_start + HZ ||
168 /* this delay samples may not be accurate */
169 flag == 0) {
170 ca->last_delay = 0;
171 goto notlow;
172 }
173
174 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
175 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
176 if (delay == 0) /* no previous delay sample */
177 goto notlow;
178
179 /* first time call or link delay decreases */
180 if (ca->delay_min == 0 || ca->delay_min > delay) {
181 ca->delay_min = ca->delay_max = delay;
182 goto notlow;
183 }
184
185 if (ca->delay_max < delay)
186 ca->delay_max = delay;
187
188 /* utilization is low, if avg delay < dist*threshold
189 for checking_period time */
190 dist = ca->delay_max - ca->delay_min;
191 if (dist <= ca->delay_min>>6 ||
192 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
193 goto notlow;
194
195 if (ca->low_utilization_start == 0) {
196 ca->low_utilization = 0;
197 ca->low_utilization_start = tcp_time_stamp;
198 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
199 > low_utilization_period*HZ) {
200 ca->low_utilization = 1;
201 }
202
203 return;
204
205 notlow:
206 ca->low_utilization = 0;
207 ca->low_utilization_start = 0;
208
209}
210
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked)
213{
214 struct bictcp *ca = tcp_ca(tp);
215
216 bictcp_low_utilization(tp, data_acked);
217
218 if (in_flight < tp->snd_cwnd)
219 return;
220
221 if (tp->snd_cwnd <= tp->snd_ssthresh) {
222 /* In "safe" area, increase. */
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 } else {
226 bictcp_update(ca, tp->snd_cwnd);
227
228 /* In dangerous area, increase slowly.
229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
230 */
231 if (tp->snd_cwnd_cnt >= ca->cnt) {
232 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
233 tp->snd_cwnd++;
234 tp->snd_cwnd_cnt = 0;
235 } else
236 tp->snd_cwnd_cnt++;
237 }
238
239}
240
241/*
242 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly
244 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
246{
247 struct bictcp *ca = tcp_ca(tp);
248
249 ca->epoch_start = 0; /* end of epoch */
250
251 /* in case of wrong delay_max*/
252 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
253 ca->delay_max = ca->delay_min
254 + ((ca->delay_max - ca->delay_min)* 90) / 100;
255
256 /* Wmax and fast convergence */
257 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
258 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
259 / (2 * BICTCP_BETA_SCALE);
260 else
261 ca->last_max_cwnd = tp->snd_cwnd;
262
263 ca->loss_cwnd = tp->snd_cwnd;
264
265
266 if (tp->snd_cwnd <= low_window)
267 return max(tp->snd_cwnd >> 1U, 2U);
268 else
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270}
271
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
273{
274 struct bictcp *ca = tcp_ca(tp);
275
276 return max(tp->snd_cwnd, ca->last_max_cwnd);
277}
278
279static u32 bictcp_min_cwnd(struct tcp_sock *tp)
280{
281 return tp->snd_ssthresh;
282}
283
284static void bictcp_state(struct tcp_sock *tp, u8 new_state)
285{
286 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp));
288}
289
290/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16
292 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
294{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
296 struct bictcp *ca = tcp_ca(tp);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt;
299 }
300}
301
302
303static struct tcp_congestion_ops bictcp = {
304 .init = bictcp_init,
305 .ssthresh = bictcp_recalc_ssthresh,
306 .cong_avoid = bictcp_cong_avoid,
307 .set_state = bictcp_state,
308 .undo_cwnd = bictcp_undo_cwnd,
309 .min_cwnd = bictcp_min_cwnd,
310 .pkts_acked = bictcp_acked,
311 .owner = THIS_MODULE,
312 .name = "bic",
313};
314
315static int __init bictcp_register(void)
316{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp);
319}
320
321static void __exit bictcp_unregister(void)
322{
323 tcp_unregister_congestion_control(&bictcp);
324}
325
326module_init(bictcp_register);
327module_exit(bictcp_unregister);
328
329MODULE_AUTHOR("Stephen Hemminger");
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000000..4970d10a7785
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,237 @@
1/*
2 * Pluggable TCP congestion control support and NewReno
3 * congestion control.
4 * Based on ideas from I/O scheduler support and Web100.
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/mm.h>
12#include <linux/types.h>
13#include <linux/list.h>
14#include <net/tcp.h>
15
16static DEFINE_SPINLOCK(tcp_cong_list_lock);
17static LIST_HEAD(tcp_cong_list);
18
19/* Simple linear search, don't expect many entries! */
20static struct tcp_congestion_ops *tcp_ca_find(const char *name)
21{
22 struct tcp_congestion_ops *e;
23
24 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
25 if (strcmp(e->name, name) == 0)
26 return e;
27 }
28
29 return NULL;
30}
31
32/*
33 * Attach a new congestion control algorithm to the list
34 * of available options.
35 */
36int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
37{
38 int ret = 0;
39
40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name);
44 return -EINVAL;
45 }
46
47 spin_lock(&tcp_cong_list_lock);
48 if (tcp_ca_find(ca->name)) {
49 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
50 ret = -EEXIST;
51 } else {
52 list_add_rcu(&ca->list, &tcp_cong_list);
53 printk(KERN_INFO "TCP %s registered\n", ca->name);
54 }
55 spin_unlock(&tcp_cong_list_lock);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
60
61/*
62 * Remove congestion control algorithm, called from
63 * the module's remove function. Module ref counts are used
64 * to ensure that this can't be done till all sockets using
65 * that method are closed.
66 */
67void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
68{
69 spin_lock(&tcp_cong_list_lock);
70 list_del_rcu(&ca->list);
71 spin_unlock(&tcp_cong_list_lock);
72}
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74
75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp)
77{
78 struct tcp_congestion_ops *ca;
79
80 if (tp->ca_ops != &tcp_init_congestion_ops)
81 return;
82
83 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca;
87 break;
88 }
89
90 }
91 rcu_read_unlock();
92
93 if (tp->ca_ops->init)
94 tp->ca_ops->init(tp);
95}
96
97/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp)
99{
100 if (tp->ca_ops->release)
101 tp->ca_ops->release(tp);
102 module_put(tp->ca_ops->owner);
103}
104
105/* Used by sysctl to change default congestion control */
106int tcp_set_default_congestion_control(const char *name)
107{
108 struct tcp_congestion_ops *ca;
109 int ret = -ENOENT;
110
111 spin_lock(&tcp_cong_list_lock);
112 ca = tcp_ca_find(name);
113#ifdef CONFIG_KMOD
114 if (!ca) {
115 spin_unlock(&tcp_cong_list_lock);
116
117 request_module("tcp_%s", name);
118 spin_lock(&tcp_cong_list_lock);
119 ca = tcp_ca_find(name);
120 }
121#endif
122
123 if (ca) {
124 list_move(&ca->list, &tcp_cong_list);
125 ret = 0;
126 }
127 spin_unlock(&tcp_cong_list_lock);
128
129 return ret;
130}
131
132/* Get current default congestion control */
133void tcp_get_default_congestion_control(char *name)
134{
135 struct tcp_congestion_ops *ca;
136 /* We will always have reno... */
137 BUG_ON(list_empty(&tcp_cong_list));
138
139 rcu_read_lock();
140 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
141 strncpy(name, ca->name, TCP_CA_NAME_MAX);
142 rcu_read_unlock();
143}
144
145/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
147{
148 struct tcp_congestion_ops *ca;
149 int err = 0;
150
151 rcu_read_lock();
152 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops)
154 goto out;
155
156 if (!ca)
157 err = -ENOENT;
158
159 else if (!try_module_get(ca->owner))
160 err = -EBUSY;
161
162 else {
163 tcp_cleanup_congestion_control(tp);
164 tp->ca_ops = ca;
165 if (tp->ca_ops->init)
166 tp->ca_ops->init(tp);
167 }
168 out:
169 rcu_read_unlock();
170 return err;
171}
172
173/*
174 * TCP Reno congestion control
175 * This is a special case that is also used as the fallback.
176 */
177/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328.
179 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
181 int flag)
182{
183 if (in_flight < tp->snd_cwnd)
184 return;
185
186 if (tp->snd_cwnd <= tp->snd_ssthresh) {
187 /* In "safe" area, increase. */
188 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
189 tp->snd_cwnd++;
190 } else {
191 /* In dangerous area, increase slowly.
192 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
193 */
194 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
195 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
196 tp->snd_cwnd++;
197 tp->snd_cwnd_cnt = 0;
198 } else
199 tp->snd_cwnd_cnt++;
200 }
201}
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203
204/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp)
206{
207 return max(tp->snd_cwnd >> 1U, 2U);
208}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210
211/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
213{
214 return tp->snd_ssthresh/2;
215}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
217
218struct tcp_congestion_ops tcp_reno = {
219 .name = "reno",
220 .owner = THIS_MODULE,
221 .ssthresh = tcp_reno_ssthresh,
222 .cong_avoid = tcp_reno_cong_avoid,
223 .min_cwnd = tcp_reno_min_cwnd,
224};
225
226/* Initial congestion control used (until SYN)
227 * really reno under another name so we can tell difference
228 * during tcp_set_default_congestion_control
229 */
230struct tcp_congestion_ops tcp_init_congestion_ops = {
231 .name = "",
232 .owner = THIS_MODULE,
233 .ssthresh = tcp_reno_ssthresh,
234 .cong_avoid = tcp_reno_cong_avoid,
235 .min_cwnd = tcp_reno_min_cwnd,
236};
237EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
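
Everything an out-of-tree algorithm needs is an ops table plus the
register/unregister pair above. A minimal sketch of a hypothetical module
against this API (the "example" name and the reuse of the exported Reno
helpers are illustrative choices, not part of this patch, and the sketch
only builds against a tree of this vintage):

/* Minimal congestion control module sketch for the API above. */
#include <linux/config.h>
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_example = {
	.ssthresh	= tcp_reno_ssthresh,	/* halve cwnd on loss */
	.cong_avoid	= tcp_reno_cong_avoid,	/* classic AIMD growth */
	.min_cwnd	= tcp_reno_min_cwnd,	/* floor used by tcp_cwnd_down() */
	.owner		= THIS_MODULE,
	.name		= "example",
};

static int __init tcp_example_register(void)
{
	return tcp_register_congestion_control(&tcp_example);
}

static void __exit tcp_example_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_example);
}

module_init(tcp_example_register);
module_exit(tcp_example_unregister);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example TCP congestion control");

Once loaded, the name becomes selectable per socket through TCP_CONGESTION,
and tcp_set_default_congestion_control() can make it the system default.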
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 8faa8948f75c..f66945cb158f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,15 +42,8 @@ struct tcpdiag_entry
 
 static struct sock *tcpnl;
 
-
 #define TCPDIAG_PUT(skb, attrtype, attrlen) \
-({ int rtalen = RTA_LENGTH(attrlen);	\
-   struct rtattr *rta;			\
-   if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
-   rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
-   rta->rta_type = attrtype;		\
-   rta->rta_len = rtalen;		\
-   RTA_DATA(rta); })
+	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
 
 static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 			int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr *nlh;
 	struct tcp_info *info = NULL;
 	struct tcpdiag_meminfo *minfo = NULL;
-	struct tcpvegas_info *vinfo = NULL;
 	unsigned char *b = skb->tail;
 
 	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 		if (ext & (1<<(TCPDIAG_INFO-1)))
 			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
 
-		if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
-		    && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
-			vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
+		if (ext & (1<<(TCPDIAG_CONG-1))) {
+			size_t len = strlen(tp->ca_ops->name);
+			strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
+			       tp->ca_ops->name);
+		}
 	}
 	r->tcpdiag_family = sk->sk_family;
 	r->tcpdiag_state = sk->sk_state;
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 	if (info)
 		tcp_get_info(sk, info);
 
-	if (vinfo) {
-		if (tcp_is_vegas(tp)) {
-			vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
-			vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
-			vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
-			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
-		} else {
-			vinfo->tcpv_enabled = 0;
-			vinfo->tcpv_rttcnt = 0;
-			vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
-			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
-		}
-	}
+	if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
+		tp->ca_ops->get_info(tp, ext, skb);
 
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
 
+rtattr_failure:
 nlmsg_failure:
 	skb_trim(skb, b - skb->data);
 	return -1;
@@ -455,9 +439,10 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
 }
 
 static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
-			    struct open_request *req,
+			    struct request_sock *req,
 			    u32 pid, u32 seq)
 {
+	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct inet_sock *inet = inet_sk(sk);
 	unsigned char *b = skb->tail;
 	struct tcpdiagmsg *r;
@@ -482,9 +467,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
 		tmo = 0;
 
 	r->id.tcpdiag_sport = inet->sport;
-	r->id.tcpdiag_dport = req->rmt_port;
-	r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
-	r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
+	r->id.tcpdiag_dport = ireq->rmt_port;
+	r->id.tcpdiag_src[0] = ireq->loc_addr;
+	r->id.tcpdiag_dst[0] = ireq->rmt_addr;
 	r->tcpdiag_expires = jiffies_to_msecs(tmo),
 	r->tcpdiag_rqueue = 0;
 	r->tcpdiag_wqueue = 0;
@@ -493,9 +478,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
 #ifdef CONFIG_IP_TCPDIAG_IPV6
 	if (r->tcpdiag_family == AF_INET6) {
 		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-			       &req->af.v6_req.loc_addr);
+			       &tcp6_rsk(req)->loc_addr);
 		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-			       &req->af.v6_req.rmt_addr);
+			       &tcp6_rsk(req)->rmt_addr);
 	}
 #endif
 	nlh->nlmsg_len = skb->tail - b;
@@ -513,7 +498,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	struct tcpdiag_entry entry;
 	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt;
+	struct listen_sock *lopt;
 	struct rtattr *bc = NULL;
 	struct inet_sock *inet = inet_sk(sk);
 	int j, s_j;
@@ -528,9 +513,9 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 	entry.family = sk->sk_family;
 
-	read_lock_bh(&tp->syn_wait_lock);
+	read_lock_bh(&tp->accept_queue.syn_wait_lock);
 
-	lopt = tp->listen_opt;
+	lopt = tp->accept_queue.listen_opt;
 	if (!lopt || !lopt->qlen)
 		goto out;
 
@@ -541,13 +526,15 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	}
 
 	for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
-		struct open_request *req, *head = lopt->syn_table[j];
+		struct request_sock *req, *head = lopt->syn_table[j];
 
 		reqnum = 0;
 		for (req = head; req; reqnum++, req = req->dl_next) {
+			struct inet_request_sock *ireq = inet_rsk(req);
+
 			if (reqnum < s_reqnum)
 				continue;
-			if (r->id.tcpdiag_dport != req->rmt_port &&
+			if (r->id.tcpdiag_dport != ireq->rmt_port &&
 			    r->id.tcpdiag_dport)
 				continue;
 
@@ -555,16 +542,16 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 			entry.saddr =
 #ifdef CONFIG_IP_TCPDIAG_IPV6
 				(entry.family == AF_INET6) ?
-				req->af.v6_req.loc_addr.s6_addr32 :
+				tcp6_rsk(req)->loc_addr.s6_addr32 :
 #endif
-				&req->af.v4_req.loc_addr;
+				&ireq->loc_addr;
 			entry.daddr =
 #ifdef CONFIG_IP_TCPDIAG_IPV6
 				(entry.family == AF_INET6) ?
-				req->af.v6_req.rmt_addr.s6_addr32 :
+				tcp6_rsk(req)->rmt_addr.s6_addr32 :
 #endif
-				&req->af.v4_req.rmt_addr;
-			entry.dport = ntohs(req->rmt_port);
+				&ireq->rmt_addr;
+			entry.dport = ntohs(ireq->rmt_port);
 
 			if (!tcpdiag_bc_run(RTA_DATA(bc),
 					    RTA_PAYLOAD(bc), &entry))
@@ -585,7 +572,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	}
 
 out:
-	read_unlock_bh(&tp->syn_wait_lock);
+	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
 
 	return err;
 }
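
TCPDIAG_CONG is emitted as a NUL-terminated string attribute after the
tcpdiagmsg header. A hedged sketch of how a dump consumer could pick it out
of the attribute run (the TCPDIAG_CONG value of 4 is an assumption from this
era's <linux/tcp_diag.h> enum; netlink request/receive boilerplate omitted):

/* Pull the congestion control name out of a tcpdiag reply.  Sketch only:
 * the caller hands in the first attribute past the struct tcpdiagmsg,
 * i.e. (struct rtattr *)(r + 1), and the remaining payload length.
 */
#include <stddef.h>
#include <linux/rtnetlink.h>	/* RTA_OK / RTA_NEXT / RTA_DATA */

#define TCPDIAG_CONG 4		/* assumed value; verify against your headers */

static const char *tcpdiag_cong_name(struct rtattr *rta, int len)
{
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		if (rta->rta_type == TCPDIAG_CONG)
			return (const char *)RTA_DATA(rta); /* NUL-terminated */
	}
	return NULL;
}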
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000000..36c51f8136bf
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
1/*
2 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
3 *
4 * See http://www.icir.org/floyd/hstcp.html
5 *
6 * John Heffner <jheffner@psc.edu>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <net/tcp.h>
12
13
14/* From AIMD tables from RFC 3649 appendix B,
15 * with fixed-point MD scaled <<8.
16 */
17static const struct hstcp_aimd_val {
18 unsigned int cwnd;
19 unsigned int md;
20} hstcp_aimd_vals[] = {
21 { 38, 128, /* 0.50 */ },
22 { 118, 112, /* 0.44 */ },
23 { 221, 104, /* 0.41 */ },
24 { 347, 98, /* 0.38 */ },
25 { 495, 93, /* 0.37 */ },
26 { 663, 89, /* 0.35 */ },
27 { 851, 86, /* 0.34 */ },
28 { 1058, 83, /* 0.33 */ },
29 { 1284, 81, /* 0.32 */ },
30 { 1529, 78, /* 0.31 */ },
31 { 1793, 76, /* 0.30 */ },
32 { 2076, 74, /* 0.29 */ },
33 { 2378, 72, /* 0.28 */ },
34 { 2699, 71, /* 0.28 */ },
35 { 3039, 69, /* 0.27 */ },
36 { 3399, 68, /* 0.27 */ },
37 { 3778, 66, /* 0.26 */ },
38 { 4177, 65, /* 0.26 */ },
39 { 4596, 64, /* 0.25 */ },
40 { 5036, 62, /* 0.25 */ },
41 { 5497, 61, /* 0.24 */ },
42 { 5979, 60, /* 0.24 */ },
43 { 6483, 59, /* 0.23 */ },
44 { 7009, 58, /* 0.23 */ },
45 { 7558, 57, /* 0.22 */ },
46 { 8130, 56, /* 0.22 */ },
47 { 8726, 55, /* 0.22 */ },
48 { 9346, 54, /* 0.21 */ },
49 { 9991, 53, /* 0.21 */ },
50 { 10661, 52, /* 0.21 */ },
51 { 11358, 52, /* 0.20 */ },
52 { 12082, 51, /* 0.20 */ },
53 { 12834, 50, /* 0.20 */ },
54 { 13614, 49, /* 0.19 */ },
55 { 14424, 48, /* 0.19 */ },
56 { 15265, 48, /* 0.19 */ },
57 { 16137, 47, /* 0.19 */ },
58 { 17042, 46, /* 0.18 */ },
59 { 17981, 45, /* 0.18 */ },
60 { 18955, 45, /* 0.18 */ },
61 { 19965, 44, /* 0.17 */ },
62 { 21013, 43, /* 0.17 */ },
63 { 22101, 43, /* 0.17 */ },
64 { 23230, 42, /* 0.17 */ },
65 { 24402, 41, /* 0.16 */ },
66 { 25618, 41, /* 0.16 */ },
67 { 26881, 40, /* 0.16 */ },
68 { 28193, 39, /* 0.16 */ },
69 { 29557, 39, /* 0.15 */ },
70 { 30975, 38, /* 0.15 */ },
71 { 32450, 38, /* 0.15 */ },
72 { 33986, 37, /* 0.15 */ },
73 { 35586, 36, /* 0.14 */ },
74 { 37253, 36, /* 0.14 */ },
75 { 38992, 35, /* 0.14 */ },
76 { 40808, 35, /* 0.14 */ },
77 { 42707, 34, /* 0.13 */ },
78 { 44694, 33, /* 0.13 */ },
79 { 46776, 33, /* 0.13 */ },
80 { 48961, 32, /* 0.13 */ },
81 { 51258, 32, /* 0.13 */ },
82 { 53677, 31, /* 0.12 */ },
83 { 56230, 30, /* 0.12 */ },
84 { 58932, 30, /* 0.12 */ },
85 { 61799, 29, /* 0.12 */ },
86 { 64851, 28, /* 0.11 */ },
87 { 68113, 28, /* 0.11 */ },
88 { 71617, 27, /* 0.11 */ },
89 { 75401, 26, /* 0.10 */ },
90 { 79517, 26, /* 0.10 */ },
91 { 84035, 25, /* 0.10 */ },
92 { 89053, 24, /* 0.10 */ },
93};
94
95#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
96
97struct hstcp {
98 u32 ai;
99};
100
101static void hstcp_init(struct tcp_sock *tp)
102{
103 struct hstcp *ca = tcp_ca(tp);
104
105 ca->ai = 0;
106
107 /* Ensure the MD arithmetic works. This is somewhat pedantic,
108 * since I don't think we will see a cwnd this large. :) */
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110}
111
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
113 u32 in_flight, int good)
114{
115 struct hstcp *ca = tcp_ca(tp);
116
117 if (in_flight < tp->snd_cwnd)
118 return;
119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) {
121 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
122 tp->snd_cwnd++;
123 } else {
124 /* Update AIMD parameters */
125 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
126 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
127 ca->ai < HSTCP_AIMD_MAX - 1)
128 ca->ai++;
129 } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
130 while (ca->ai &&
131 tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
132 ca->ai--;
133 }
134
135 /* Do additive increase */
136 if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
137 tp->snd_cwnd_cnt += ca->ai;
138 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
139 tp->snd_cwnd++;
140 tp->snd_cwnd_cnt -= tp->snd_cwnd;
141 }
142 }
143 }
144}
145
146static u32 hstcp_ssthresh(struct tcp_sock *tp)
147{
148 struct hstcp *ca = tcp_ca(tp);
149
150 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
152}
153
154
155static struct tcp_congestion_ops tcp_highspeed = {
156 .init = hstcp_init,
157 .ssthresh = hstcp_ssthresh,
158 .cong_avoid = hstcp_cong_avoid,
159 .min_cwnd = tcp_reno_min_cwnd,
160
161 .owner = THIS_MODULE,
162 .name = "highspeed"
163};
164
165static int __init hstcp_register(void)
166{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed);
169}
170
171static void __exit hstcp_unregister(void)
172{
173 tcp_unregister_congestion_control(&tcp_highspeed);
174}
175
176module_init(hstcp_register);
177module_exit(hstcp_unregister);
178
179MODULE_AUTHOR("John Heffner");
180MODULE_LICENSE("GPL");
181MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000000..40168275acf9
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
1/*
2 * H-TCP congestion control. The algorithm is detailed in:
3 * R.N.Shorten, D.J.Leith:
4 * "H-TCP: TCP for high-speed and long-distance networks"
5 * Proc. PFLDnet, Argonne, 2004.
6 * http://www.hamilton.ie/net/htcp3.pdf
7 */
8
9#include <linux/config.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <net/tcp.h>
13
14#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
15#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
16#define BETA_MAX 102 /* 0.8 with shift << 7 */
17
18static int use_rtt_scaling = 1;
19module_param(use_rtt_scaling, int, 0644);
20MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
21
22static int use_bandwidth_switch = 1;
23module_param(use_bandwidth_switch, int, 0644);
24MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
25
26struct htcp {
27 u16 alpha; /* Fixed point arith, << 7 */
28 u8 beta; /* Fixed point arith, << 7 */
29 u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */
30 u8 ccount; /* Number of RTTs since last congestion event */
31 u8 undo_ccount;
32 u16 packetcount;
33 u32 minRTT;
34 u32 maxRTT;
35 u32 snd_cwnd_cnt2;
36
37 u32 undo_maxRTT;
38 u32 undo_old_maxB;
39
40 /* Bandwidth estimation */
41 u32 minB;
42 u32 maxB;
43 u32 old_maxB;
44 u32 Bi;
45 u32 lasttime;
46};
47
48static inline void htcp_reset(struct htcp *ca)
49{
50 ca->undo_ccount = ca->ccount;
51 ca->undo_maxRTT = ca->maxRTT;
52 ca->undo_old_maxB = ca->old_maxB;
53
54 ca->ccount = 0;
55 ca->snd_cwnd_cnt2 = 0;
56}
57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp)
59{
60 struct htcp *ca = tcp_ca(tp);
61 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65}
66
67static inline void measure_rtt(struct tcp_sock *tp)
68{
69 struct htcp *ca = tcp_ca(tp);
70 u32 srtt = tp->srtt>>3;
71
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */
73 if (ca->minRTT > srtt || !ca->minRTT)
74 ca->minRTT = srtt;
75
76 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
81 ca->maxRTT = srtt;
82 }
83}
84
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
86{
87 struct htcp *ca = tcp_ca(tp);
88 u32 now = tcp_time_stamp;
89
90 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0;
93 ca->lasttime = now;
94 return;
95 }
96
97 ca->packetcount += pkts_acked;
98
99 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
100 && now - ca->lasttime >= ca->minRTT
101 && ca->minRTT > 0) {
102 __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
103 if (ca->ccount <= 3) {
104 /* just after backoff */
105 ca->minB = ca->maxB = ca->Bi = cur_Bi;
106 } else {
107 ca->Bi = (3*ca->Bi + cur_Bi)/4;
108 if (ca->Bi > ca->maxB)
109 ca->maxB = ca->Bi;
110 if (ca->minB > ca->maxB)
111 ca->minB = ca->maxB;
112 }
113 ca->packetcount = 0;
114 ca->lasttime = now;
115 }
116}
117
118static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
119{
120 if (use_bandwidth_switch) {
121 u32 maxB = ca->maxB;
122 u32 old_maxB = ca->old_maxB;
123 ca->old_maxB = ca->maxB;
124
125 if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
126 ca->beta = BETA_MIN;
127 ca->modeswitch = 0;
128 return;
129 }
130 }
131
132 if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
133 ca->beta = (minRTT<<7)/maxRTT;
134 if (ca->beta < BETA_MIN)
135 ca->beta = BETA_MIN;
136 else if (ca->beta > BETA_MAX)
137 ca->beta = BETA_MAX;
138 } else {
139 ca->beta = BETA_MIN;
140 ca->modeswitch = 1;
141 }
142}
143
144static inline void htcp_alpha_update(struct htcp *ca)
145{
146 u32 minRTT = ca->minRTT;
147 u32 factor = 1;
148 u32 diff = ca->ccount * minRTT; /* time since last backoff */
149
150 if (diff > HZ) {
151 diff -= HZ;
152 factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
153 }
154
155 if (use_rtt_scaling && minRTT) {
156 u32 scale = (HZ<<3)/(10*minRTT);
157 scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
158 factor = (factor<<3)/scale;
159 if (!factor)
160 factor = 1;
161 }
162
163 ca->alpha = 2*factor*((1<<7)-ca->beta);
164 if (!ca->alpha)
165 ca->alpha = ALPHA_BASE;
166}
167
168/* After we have the rtt data to calculate beta, we'd still prefer to wait one
169 * rtt before we adjust our beta to ensure we are working from consistent
170 * data.
171 *
172 * This function should be called when we hit a congestion event since only at
173 * that point do we really have a real sense of maxRTT (the queues en route
174 * were getting just too full now).
175 */
176static void htcp_param_update(struct tcp_sock *tp)
177{
178 struct htcp *ca = tcp_ca(tp);
179 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT;
181
182 htcp_beta_update(ca, minRTT, maxRTT);
183 htcp_alpha_update(ca);
184
185 /* add slowly fading memory for maxRTT to accommodate routing changes etc */
186 if (minRTT > 0 && maxRTT > minRTT)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188}
189
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
191{
192 struct htcp *ca = tcp_ca(tp);
193 htcp_param_update(tp);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195}
196
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked)
199{
200 struct htcp *ca = tcp_ca(tp);
201
202 if (in_flight < tp->snd_cwnd)
203 return;
204
205 if (tp->snd_cwnd <= tp->snd_ssthresh) {
206 /* In "safe" area, increase. */
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++;
209 } else {
210 measure_rtt(tp);
211
212 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
214 ca->ccount++;
215 ca->snd_cwnd_cnt2 = 0;
216 htcp_alpha_update(ca);
217 }
218
219 /* In dangerous area, increase slowly.
220 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
221 */
222 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 tp->snd_cwnd_cnt = 0;
226 ca->ccount++;
227 }
228 }
229}
230
231/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp)
233{
234 return tp->snd_ssthresh;
235}
236
237
238static void htcp_init(struct tcp_sock *tp)
239{
240 struct htcp *ca = tcp_ca(tp);
241
242 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN;
245}
246
247static void htcp_state(struct tcp_sock *tp, u8 new_state)
248{
249 switch (new_state) {
250 case TCP_CA_CWR:
251 case TCP_CA_Recovery:
252 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp));
254 break;
255 }
256}
257
258static struct tcp_congestion_ops htcp = {
259 .init = htcp_init,
260 .ssthresh = htcp_recalc_ssthresh,
261 .min_cwnd = htcp_min_cwnd,
262 .cong_avoid = htcp_cong_avoid,
263 .set_state = htcp_state,
264 .undo_cwnd = htcp_cwnd_undo,
265 .pkts_acked = measure_achieved_throughput,
266 .owner = THIS_MODULE,
267 .name = "htcp",
268};
269
270static int __init htcp_register(void)
271{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL;
276 return tcp_register_congestion_control(&htcp);
277}
278
279static void __exit htcp_unregister(void)
280{
281 tcp_unregister_congestion_control(&htcp);
282}
283
284module_init(htcp_register);
285module_exit(htcp_unregister);
286
287MODULE_AUTHOR("Baruch Even");
288MODULE_LICENSE("GPL");
289MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000000..13a66342c304
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
1/*
2 * TCP HYBLA
3 *
4 * TCP-HYBLA Congestion control algorithm, based on:
5 * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
6 * for Heterogeneous Networks",
7 * International Journal of Satellite Communications,
8 * September 2004
9 * Daniele Lacamera
10 * root at danielinux.net
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <net/tcp.h>
16
17/* Tcp Hybla structure. */
18struct hybla {
19 u8 hybla_en;
20 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
21 u32 rho; /* Rho parameter, integer part */
22 u32 rho2; /* Rho * Rho, integer part */
23 u32 rho_3ls; /* Rho parameter, <<3 */
24 u32 rho2_7ls; /* Rho^2, <<7 */
25 u32 minrtt; /* Minimum smoothed round trip time value seen */
26};
27
28/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
29 expressed in ms */
30static int rtt0 = 25;
31module_param(rtt0, int, 0644);
32MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33
34
35/* This is called to refresh values for hybla parameters */
36static inline void hybla_recalc_param (struct tcp_sock *tp)
37{
38 struct hybla *ca = tcp_ca(tp);
39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7;
44}
45
46static void hybla_init(struct tcp_sock *tp)
47{
48 struct hybla *ca = tcp_ca(tp);
49
50 ca->rho = 0;
51 ca->rho2 = 0;
52 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1;
56 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535;
58
59 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp);
61
62 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho;
65}
66
67static void hybla_state(struct tcp_sock *tp, u8 ca_state)
68{
69 struct hybla *ca = tcp_ca(tp);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open);
72}
73
74static inline u32 hybla_fraction(u32 odds)
75{
76 static const u32 fractions[] = {
77 128, 139, 152, 165, 181, 197, 215, 234,
78 };
79
80 return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
81}
82
83/* TCP Hybla main routine.
84 * This is the algorithm behavior:
85 * o Recalc Hybla parameters if min_rtt has changed
86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1
88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
90 u32 in_flight, int flag)
91{
92 struct hybla *ca = tcp_ca(tp);
93 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0;
95
96 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp);
99 ca->minrtt = tp->srtt;
100 }
101
102 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
104
105 if (in_flight < tp->snd_cwnd)
106 return;
107
108 if (ca->rho == 0)
109 hybla_recalc_param(tp);
110
111 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112
113 if (tp->snd_cwnd < tp->snd_ssthresh) {
114 /*
115 * slow start
116 * INC = 2^RHO - 1
117 * This is done by splitting the rho parameter
118 * into 2 parts: an integer part and a fraction part.
119 * Increment<<7 is estimated by doing:
120 * [2^(int+fract)]<<7
121 * that is equal to:
122 * (2^int) * [(2^fract) <<7]
123 * 2^int is computed directly as 1<<int,
124 * while we use hybla_fraction() to
125 * calculate 2^fract in a <<7 value.
126 */
127 is_slowstart = 1;
128 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
129 - 128;
130 } else {
131 /*
132 * congestion avoidance
133 * INC = RHO^2 / W
134 * as long as increment is estimated as (rho<<7)/window
135 * it already is <<7 and we can easily count its fractions.
136 */
137 increment = ca->rho2_7ls / tp->snd_cwnd;
138 if (increment < 128)
139 tp->snd_cwnd_cnt++;
140 }
141
142 odd = increment % 128;
143 tp->snd_cwnd += increment >> 7;
144 ca->snd_cwnd_cents += odd;
145
146 /* when the accumulated fraction reaches >=128, increase cwnd by 1. */
147 while(ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0;
151 }
152
153 /* clamp down slowstart cwnd to ssthresh value. */
154 if (is_slowstart)
155 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
156
157 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
158}
159
160static struct tcp_congestion_ops tcp_hybla = {
161 .init = hybla_init,
162 .ssthresh = tcp_reno_ssthresh,
163 .min_cwnd = tcp_reno_min_cwnd,
164 .cong_avoid = hybla_cong_avoid,
165 .set_state = hybla_state,
166
167 .owner = THIS_MODULE,
168 .name = "hybla"
169};
170
171static int __init hybla_register(void)
172{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla);
175}
176
177static void __exit hybla_unregister(void)
178{
179 tcp_unregister_congestion_control(&tcp_hybla);
180}
181
182module_init(hybla_register);
183module_exit(hybla_unregister);
184
185MODULE_AUTHOR("Daniele Lacamera");
186MODULE_LICENSE("GPL");
187MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..53a8a5399f1e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
  *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
  *					engine. Lots of bugs are found.
  *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
- *		Angelo Dell'Aera:	TCP Westwood+ support
  */
 
 #include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
 int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
-int sysctl_tcp_westwood;
-int sysctl_tcp_vegas_cong_avoid;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
 
-/* Default values of the Vegas variables, in fixed-point representation
- * with V_PARAM_SHIFT bits to the right of the binary point.
- */
-#define V_PARAM_SHIFT 1
-int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_bic = 1;
-int sysctl_tcp_bic_fast_convergence = 1;
-int sysctl_tcp_bic_low_window = 14;
-int sysctl_tcp_bic_beta = 819;	/* = 819/1024 (BICTCP_BETA_SCALE) */
-
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static void init_bictcp(struct tcp_sock *tp)
-{
-	tp->bictcp.cnt = 0;
-
-	tp->bictcp.last_max_cwnd = 0;
-	tp->bictcp.last_cwnd = 0;
-	tp->bictcp.last_stamp = 0;
-}
-
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 	tcp_grow_window(sk, tp, skb);
 }
 
-/* When starting a new connection, pin down the current choice of
- * congestion algorithm.
- */
-void tcp_ca_init(struct tcp_sock *tp)
-{
-	if (sysctl_tcp_westwood)
-		tp->adv_cong = TCP_WESTWOOD;
-	else if (sysctl_tcp_bic)
-		tp->adv_cong = TCP_BIC;
-	else if (sysctl_tcp_vegas_cong_avoid) {
-		tp->adv_cong = TCP_VEGAS;
-		tp->vegas.baseRTT = 0x7fffffff;
-		tcp_vegas_enable(tp);
-	}
-}
-
-/* Do RTT sampling needed for Vegas.
- * Basically we:
- *   o min-filter RTT samples from within an RTT to get the current
- *     propagation delay + queuing delay (we are min-filtering to try to
- *     avoid the effects of delayed ACKs)
- *   o min-filter RTT samples from a much longer window (forever for now)
- *     to find the propagation delay (baseRTT)
- */
-static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
-{
-	__u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
-
-	/* Filter to find propagation delay: */
-	if (vrtt < tp->vegas.baseRTT)
-		tp->vegas.baseRTT = vrtt;
-
-	/* Find the min RTT during the last RTT to find
-	 * the current prop. delay + queuing delay:
-	 */
-	tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
-	tp->vegas.cntRTT++;
-}
-
 /* Called to compute a smoothed rtt estimate. The data fed to this
  * routine either comes from timestamps, or from segments that were
  * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
+static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
 {
 	long m = mrtt; /* RTT */
 
-	if (tcp_vegas_enabled(tp))
-		vegas_rtt_calc(tp, mrtt);
-
 	/* The following amusing code comes from Jacobson's
 	 * article in SIGCOMM '88. Note that rtt and mdev
 	 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
 		tp->rtt_seq = tp->snd_nxt;
 	}
 
-	tcp_westwood_update_rtt(tp, tp->srtt >> 3);
+	if (tp->ca_ops->rtt_sample)
+		tp->ca_ops->rtt_sample(tp, *usrtt);
 }
 
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -805,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
 	if (!cwnd) {
-		if (tp->mss_cache_std > 1460)
+		if (tp->mss_cache > 1460)
 			cwnd = 2;
 		else
-			cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
 	}
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
@@ -979,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	if (sk->sk_route_caps & NETIF_F_TSO) {
 		sk->sk_route_caps &= ~NETIF_F_TSO;
 		sock_set_flag(sk, SOCK_NO_LARGESEND);
-		tp->mss_cache = tp->mss_cache_std;
+		tp->mss_cache = tp->mss_cache;
 	}
 
 	if (!tp->sacked_out)
@@ -1142,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 		    (IsFack(tp) ||
 		     !before(lost_retrans,
 			     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
			     tp->mss_cache_std))) {
+			     tp->mss_cache))) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
 
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
 	    tp->snd_una == tp->high_seq ||
 	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		if (!tcp_westwood_ssthresh(tp))
-			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+		tcp_ca_event(tp, CA_EVENT_FRTO);
 	}
 
 	/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
 	tcp_set_ca_state(tp, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
-
-	init_bictcp(tp);
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
 	if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
 	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+		tcp_ca_event(tp, CA_EVENT_LOSS);
 	}
 	tp->snd_cwnd = 1;
 	tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 }
 
 /* Decrease cwnd each second ack. */
-
 static void tcp_cwnd_down(struct tcp_sock *tp)
 {
 	int decr = tp->snd_cwnd_cnt + 1;
-	__u32 limit;
-
-	/*
-	 * TCP Westwood
-	 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
-	 * in packets we use mss_cache). If sysctl_tcp_westwood is off
-	 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
-	 * still used as usual. It prevents other strange cases in which
-	 * BWE*RTTmin could assume value 0. It should not happen but...
-	 */
-
-	if (!(limit = tcp_westwood_bw_rttmin(tp)))
-		limit = tp->snd_ssthresh/2;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > limit)
+	if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
 static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
 {
 	if (tp->prior_ssthresh) {
-		if (tcp_is_bic(tp))
-			tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
+		if (tp->ca_ops->undo_cwnd)
+			tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
 		else
 			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
 
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
 
 static inline void tcp_complete_cwr(struct tcp_sock *tp)
 {
-	if (tcp_westwood_cwnd(tp))
-		tp->snd_ssthresh = tp->snd_cwnd;
-	else
-		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
 }
 
 static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		if (tp->ca_state < TCP_CA_CWR) {
 			if (!(flag&FLAG_ECE))
 				tp->prior_ssthresh = tcp_current_ssthresh(tp);
-			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+			tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
 			TCP_ECN_queue_cwr(tp);
 		}
 
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 /* Read draft-ietf-tcplw-high-performance before mucking
 * with this code. (Superceeds RFC1323)
 */
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
+static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
 {
 	__u32 seq_rtt;
 
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
 	 * in window is lost... Voila.				--ANK (010210)
 	 */
 	seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_rtt_estimator(tp, seq_rtt, usrtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
+static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
 {
 	/* We don't have a timestamp. Can only use
 	 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_rtt_estimator(tp, seq_rtt, usrtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
 static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
-				      int flag, s32 seq_rtt)
+				      int flag, s32 seq_rtt, u32 *usrtt)
 {
 	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(tp, flag);
+		tcp_ack_saw_tstamp(tp, usrtt, flag);
 	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(tp, seq_rtt, flag);
-}
-
-/*
- * Compute congestion window to use.
- *
- * This is from the implementation of BICTCP in
- * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
- *  "Binary Increase Congestion Control for Fast, Long Distance
- *  Networks" in InfoComm 2004
- * Available from:
- *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
- *
- * Unless BIC is enabled and congestion window is large
- * this behaves the same as the original Reno.
- */
-static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
-{
-	/* orignal Reno behaviour */
-	if (!tcp_is_bic(tp))
-		return tp->snd_cwnd;
-
-	if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
-	    (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
-		return tp->bictcp.cnt;
-
-	tp->bictcp.last_cwnd = tp->snd_cwnd;
-	tp->bictcp.last_stamp = tcp_time_stamp;
-
-	/* start off normal */
-	if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
-		tp->bictcp.cnt = tp->snd_cwnd;
-
-	/* binary increase */
-	else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
-		__u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
-			/ BICTCP_B;
-
-		if (dist > BICTCP_MAX_INCREMENT)
-			/* linear increase */
-			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
-		else if (dist <= 1U)
-			/* binary search increase */
-			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
-				/ BICTCP_B;
-		else
-			/* binary search increase */
-			tp->bictcp.cnt = tp->snd_cwnd / dist;
-	} else {
-		/* slow start amd linear increase */
-		if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
-			/* slow start */
-			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
-				/ BICTCP_B;
-		else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
-				+ BICTCP_MAX_INCREMENT*(BICTCP_B-1))
-			/* slow start */
-			tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
-				/ (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
-		else
-			/* linear increase */
-			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
-	}
-	return tp->bictcp.cnt;
+		tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
 }
 
-/* This is Jacobson's slow start and congestion avoidance.
- * SIGCOMM '88, p. 328.
- */
-static inline void reno_cong_avoid(struct tcp_sock *tp)
+static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+				  u32 in_flight, int good)
 {
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-		/* In dangerous area, increase slowly.
-		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-		 */
-		if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
+	tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
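All of the above, bictcp_cwnd(), reno_cong_avoid(), vegas_cong_avoid() and the old tcp_cong_avoid() that chose between them, is evicted in one stroke; the new tcp_cong_avoid() is a one-line dispatch through ca_ops->cong_avoid plus the cwnd timestamp. Note that the caller-side prior_in_flight >= tp->snd_cwnd guard also disappears from tcp_ack() further down, so a module must apply that test itself. Recast as a module op, the deleted Reno logic would come back roughly as (a sketch, assuming it moves into a shared congestion-control file not shown here):

	void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
				 u32 in_flight, int good)
	{
		if (in_flight < tp->snd_cwnd)
			return;		/* window not yet full */

		if (tp->snd_cwnd <= tp->snd_ssthresh) {
			/* Slow start: grow by one segment per ACK. */
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
		} else {
			/* Congestion avoidance: cwnd += 1/cwnd per ACK,
			 * counted out in snd_cwnd_cnt. */
			if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
				if (tp->snd_cwnd < tp->snd_cwnd_clamp)
					tp->snd_cwnd++;
				tp->snd_cwnd_cnt = 0;
			} else
				tp->snd_cwnd_cnt++;
		}
	}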
@@ -2348,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2348 } 1957 }
2349} 1958}
2350 1959
2351/* There is one downside to this scheme. Although we keep the
2352 * ACK clock ticking, adjusting packet counters and advancing
2353 * congestion window, we do not liberate socket send buffer
2354 * space.
2355 *
2356 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
2357 * then making a write space wakeup callback is a possible
2358 * future enhancement. WARNING: it is not trivial to make.
2359 */
2360static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2361 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
2362{ 1962{
@@ -2415,13 +2015,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2015
2416 2016
2417/* Remove acknowledged frames from the retransmission queue. */ 2017/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2018static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2019{
2420 struct tcp_sock *tp = tcp_sk(sk); 2020 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2021 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2022 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2023 int acked = 0;
2424 __s32 seq_rtt = -1; 2024 __s32 seq_rtt = -1;
2025 struct timeval usnow;
2026 u32 pkts_acked = 0;
2027
2028 if (seq_usrtt)
2029 do_gettimeofday(&usnow);
2425 2030
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2031 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2032 skb != sk->sk_send_head) {
@@ -2433,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2433 * the other end. 2038 * the other end.
2434 */ 2039 */
2435 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2436 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2437 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2438 now, &seq_rtt); 2044 now, &seq_rtt);
2439 break; 2045 break;
@@ -2448,6 +2054,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2054 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2055 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2056 acked |= FLAG_DATA_ACKED;
2057 ++pkts_acked;
2451 } else { 2058 } else {
2452 acked |= FLAG_SYN_ACKED; 2059 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2060 tp->retrans_stamp = 0;
@@ -2461,6 +2068,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2068 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2069 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2070 seq_rtt = now - scb->when;
2071 if (seq_usrtt)
2072 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2073 + (usnow.tv_usec - skb->stamp.tv_usec);
2074
2464 if (sacked & TCPCB_SACKED_ACKED) 2075 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2076 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2077 if (sacked & TCPCB_LOST)
@@ -2479,8 +2090,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2090 }
2480 2091
2481 if (acked&FLAG_ACKED) { 2092 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2093 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2094 tcp_ack_packets_out(sk, tp);
2095
2096 if (tp->ca_ops->pkts_acked)
2097 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2098 }
2485 2099
2486#if FASTRETRANS_DEBUG > 0 2100#if FASTRETRANS_DEBUG > 0
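tcp_clean_rtx_queue() now tallies fully acknowledged data packets and reports the total once per cumulative ACK through the optional pkts_acked hook. A hypothetical consumer, again assuming a tcp_ca() private-state accessor:

	/* e.g. the raw input to a Westwood-style bandwidth estimate */
	static void example_pkts_acked(struct tcp_sock *tp, u32 num_acked)
	{
		struct example_ca *ca = tcp_ca(tp);	/* hypothetical */

		ca->acked_this_rtt += num_acked;
	}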
@@ -2624,257 +2238,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2238 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2239}
2626 2240
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get no information about RTTmin at this time so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
2636 * since in this way we're sure it will be updated in a consistent
2637 * way as soon as possible. It will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coeffients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * A RTT-window has passed. Be careful since if RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was choosen since an estimation on small
2730 * time intervals is better to avoid...
2731 * Obvioulsy on a LAN we reasonably will always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in fast path. In particular it is called when
2771 * header prediction is successfull. In such case infact update is
2772 * straight forward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something is going wrong..even if there could
2858 * be no problems! Infact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2241/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2242static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2243{
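The whole Westwood+ bandwidth-estimation block above leaves tcp_input.c. Its two entry points, tcp_westwood_fast_bw() and tcp_westwood_slow_bw(), were called from the fast and slow ACK paths of tcp_ack(); those call sites become the generic CA_EVENT_FAST_ACK and CA_EVENT_SLOW_ACK notifications in the hunks below. A Westwood module can recover the old behaviour from the event hook, roughly (a sketch, assuming the deleted code moves into a Westwood module):

	static void westwood_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
	{
		switch (event) {
		case CA_EVENT_FAST_ACK:
			westwood_fast_bw(tp);	/* was __tcp_westwood_fast_bw() */
			break;
		case CA_EVENT_SLOW_ACK:
			westwood_slow_bw(tp);	/* was __tcp_westwood_slow_bw() */
			break;
		default:
			break;
		}
	}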
@@ -2884,6 +2247,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2247 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2248 u32 prior_in_flight;
2886 s32 seq_rtt; 2249 s32 seq_rtt;
2250 s32 seq_usrtt = 0;
2887 int prior_packets; 2251 int prior_packets;
2888 2252
2889 /* If the ack is newer than sent or older than previous acks 2253 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2266,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2266 */
2903 tcp_update_wl(tp, ack, ack_seq); 2267 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2268 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2269 flag |= FLAG_WIN_UPDATE;
2907 2270
2271 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2272
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2273 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2274 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2275 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2285,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2285 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2286 flag |= FLAG_ECE;
2922 2287
2923 tcp_westwood_slow_bw(sk,skb); 2288 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2289 }
2925 2290
2926 /* We passed data and got it acked, remove any soft error 2291 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2300,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2300 prior_in_flight = tcp_packets_in_flight(tp);
2936 2301
2937 /* See if we can take anything off of the retransmit queue. */ 2302 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2303 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2304 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2305
2940 if (tp->frto_counter) 2306 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2307 tcp_process_frto(sk, prior_snd_una);
2942 2308
2943 if (tcp_ack_is_dubious(tp, flag)) { 2309 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advanve CWND, if state allows this. */ 2310 /* Advanve CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2311 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2312 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2313 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2314 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2315 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2316 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2317 }
2955 2318
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2319 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -3439,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
3439 int this_sack; 2802 int this_sack;
3440 2803
3441 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 2804 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
3442 if (skb_queue_len(&tp->out_of_order_queue) == 0) { 2805 if (skb_queue_empty(&tp->out_of_order_queue)) {
3443 tp->rx_opt.num_sacks = 0; 2806 tp->rx_opt.num_sacks = 0;
3444 tp->rx_opt.eff_sacks = tp->rx_opt.dsack; 2807 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
3445 return; 2808 return;
@@ -3572,13 +2935,13 @@ queue_and_out:
3572 if(th->fin) 2935 if(th->fin)
3573 tcp_fin(skb, sk, th); 2936 tcp_fin(skb, sk, th);
3574 2937
3575 if (skb_queue_len(&tp->out_of_order_queue)) { 2938 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3576 tcp_ofo_queue(sk); 2939 tcp_ofo_queue(sk);
3577 2940
3578 /* RFC2581. 4.2. SHOULD send immediate ACK, when 2941 /* RFC2581. 4.2. SHOULD send immediate ACK, when
3579 * gap in queue is filled. 2942 * gap in queue is filled.
3580 */ 2943 */
3581 if (!skb_queue_len(&tp->out_of_order_queue)) 2944 if (skb_queue_empty(&tp->out_of_order_queue))
3582 tp->ack.pingpong = 0; 2945 tp->ack.pingpong = 0;
3583 } 2946 }
3584 2947
@@ -3886,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
3886 * This must not ever occur. */ 3249 * This must not ever occur. */
3887 3250
3888 /* First, purge the out_of_order queue. */ 3251 /* First, purge the out_of_order queue. */
3889 if (skb_queue_len(&tp->out_of_order_queue)) { 3252 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3890 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 3253 NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
3891 skb_queue_len(&tp->out_of_order_queue));
3892 __skb_queue_purge(&tp->out_of_order_queue); 3254 __skb_queue_purge(&tp->out_of_order_queue);
3893 3255
3894 /* Reset SACK state. A conforming SACK implementation will 3256 /* Reset SACK state. A conforming SACK implementation will
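Three call sites stop deriving emptiness from skb_queue_len(); the dedicated predicate says what is meant and needs no counter. For reference, the skbuff.h helper of the era reads:

	/* An empty queue's list head points back at itself. */
	static inline int skb_queue_empty(const struct sk_buff_head *list)
	{
		return list->next == (struct sk_buff *)list;
	}

The OFOPRUNED statistic changes meaning along the way: NET_ADD_STATS_BH counted pruned skbs, while NET_INC_STATS_BH counts pruning events.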
@@ -3937,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3937 tp->snd_cwnd_stamp = tcp_time_stamp; 3299 tp->snd_cwnd_stamp = tcp_time_stamp;
3938} 3300}
3939 3301
3302static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3303{
3304 /* If the user specified a specific send buffer setting, do
3305 * not modify it.
3306 */
3307 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3308 return 0;
3309
3310 /* If we are under global TCP memory pressure, do not expand. */
3311 if (tcp_memory_pressure)
3312 return 0;
3313
3314 /* If we are under soft global TCP memory pressure, do not expand. */
3315 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3316 return 0;
3317
3318 /* If we filled the congestion window, do not expand. */
3319 if (tp->packets_out >= tp->snd_cwnd)
3320 return 0;
3321
3322 return 1;
3323}
3940 3324
3941/* When incoming ACK allowed to free some skb from write_queue, 3325/* When incoming ACK allowed to free some skb from write_queue,
3942 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3326 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3948,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
3948{ 3332{
3949 struct tcp_sock *tp = tcp_sk(sk); 3333 struct tcp_sock *tp = tcp_sk(sk);
3950 3334
3951 if (tp->packets_out < tp->snd_cwnd && 3335 if (tcp_should_expand_sndbuf(sk, tp)) {
3952 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3336 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3953 !tcp_memory_pressure &&
3954 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3955 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3956 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3337 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3957 demanded = max_t(unsigned int, tp->snd_cwnd, 3338 demanded = max_t(unsigned int, tp->snd_cwnd,
3958 tp->reordering + 1); 3339 tp->reordering + 1);
@@ -3975,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
3975 } 3356 }
3976} 3357}
3977 3358
3978static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3359static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3979{ 3360{
3980 struct tcp_sock *tp = tcp_sk(sk); 3361 tcp_push_pending_frames(sk, tp);
3981
3982 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3983 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3984 tcp_write_xmit(sk, tp->nonagle))
3985 tcp_check_probe_timer(sk, tp);
3986}
3987
3988static __inline__ void tcp_data_snd_check(struct sock *sk)
3989{
3990 struct sk_buff *skb = sk->sk_send_head;
3991
3992 if (skb != NULL)
3993 __tcp_data_snd_check(sk, skb);
3994 tcp_check_space(sk); 3362 tcp_check_space(sk);
3995} 3363}
3996 3364
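The open-coded send test and the __tcp_data_snd_check()/tcp_data_snd_check() split collapse: the check now just pushes pending frames and then looks for freed buffer space. The probe-timer logic deleted here presumably survives inside the push helper, along these lines (an assumption: the real body lives in the TCP output path, outside this diff):

	static inline void tcp_push_pending_frames(struct sock *sk,
						   struct tcp_sock *tp)
	{
		if (sk->sk_send_head) {
			/* If nothing could be sent, arm the probe timer. */
			if (tcp_write_xmit(sk, tp->nonagle))
				tcp_check_probe_timer(sk, tp);
		}
	}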
@@ -4284,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4284 */ 3652 */
4285 tcp_ack(sk, skb, 0); 3653 tcp_ack(sk, skb, 0);
4286 __kfree_skb(skb); 3654 __kfree_skb(skb);
4287 tcp_data_snd_check(sk); 3655 tcp_data_snd_check(sk, tp);
4288 return 0; 3656 return 0;
4289 } else { /* Header too small */ 3657 } else { /* Header too small */
4290 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3658 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -4350,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4350 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3718 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4351 /* Well, only one small jumplet in fast path... */ 3719 /* Well, only one small jumplet in fast path... */
4352 tcp_ack(sk, skb, FLAG_DATA); 3720 tcp_ack(sk, skb, FLAG_DATA);
4353 tcp_data_snd_check(sk); 3721 tcp_data_snd_check(sk, tp);
4354 if (!tcp_ack_scheduled(tp)) 3722 if (!tcp_ack_scheduled(tp))
4355 goto no_ack; 3723 goto no_ack;
4356 } 3724 }
@@ -4428,7 +3796,7 @@ step5:
4428 /* step 7: process the segment text */ 3796 /* step 7: process the segment text */
4429 tcp_data_queue(sk, skb); 3797 tcp_data_queue(sk, skb);
4430 3798
4431 tcp_data_snd_check(sk); 3799 tcp_data_snd_check(sk, tp);
4432 tcp_ack_snd_check(sk); 3800 tcp_ack_snd_check(sk);
4433 return 0; 3801 return 0;
4434 3802
@@ -4552,6 +3920,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3920
4553 tcp_init_metrics(sk); 3921 tcp_init_metrics(sk);
4554 3922
3923 tcp_init_congestion_control(tp);
3924
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3925 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3926 * packet.
4557 */ 3927 */
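Binding a congestion-control algorithm moves from connection-request time (see the init_westwood()/init_bictcp() removals below) to handshake completion, once the socket is known to be viable. The init step is defined elsewhere; a plausible shape, assuming registered algorithms sit on a module-refcounted list:

	void tcp_init_congestion_control(struct tcp_sock *tp)
	{
		struct tcp_congestion_ops *ca;

		if (tp->ca_ops != &tcp_init_congestion_ops)
			return;		/* already bound explicitly */

		rcu_read_lock();
		list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
			if (try_module_get(ca->owner)) {
				tp->ca_ops = ca;	/* first usable one */
				break;
			}
		}
		rcu_read_unlock();

		if (tp->ca_ops->init)
			tp->ca_ops->init(tp);
	}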
@@ -4708,9 +4078,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4078 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4079 return 1;
4710 4080
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4081 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4082 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4083 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4099,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4099 goto discard;
4733 4100
4734 case TCP_SYN_SENT: 4101 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4102 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4103 if (queued >= 0)
4740 return queued; 4104 return queued;
@@ -4742,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4742 /* Do step6 onward by hand. */ 4106 /* Do step6 onward by hand. */
4743 tcp_urg(sk, skb, th); 4107 tcp_urg(sk, skb, th);
4744 __kfree_skb(skb); 4108 __kfree_skb(skb);
4745 tcp_data_snd_check(sk); 4109 tcp_data_snd_check(sk, tp);
4746 return 0; 4110 return 0;
4747 } 4111 }
4748 4112
@@ -4816,7 +4180,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4180 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4181 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4182 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4183 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4184
4821 if (tp->rx_opt.tstamp_ok) 4185 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4186 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4192,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4192
4829 tcp_init_metrics(sk); 4193 tcp_init_metrics(sk);
4830 4194
4195 tcp_init_congestion_control(tp);
4196
4831 /* Prevent spurious tcp_cwnd_restart() on 4197 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4198 * first data packet.
4833 */ 4199 */
@@ -4931,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4931 4297
4932 /* tcp_data could move socket to TIME-WAIT */ 4298 /* tcp_data could move socket to TIME-WAIT */
4933 if (sk->sk_state != TCP_CLOSE) { 4299 if (sk->sk_state != TCP_CLOSE) {
4934 tcp_data_snd_check(sk); 4300 tcp_data_snd_check(sk, tp);
4935 tcp_ack_snd_check(sk); 4301 tcp_ack_snd_check(sk);
4936 } 4302 }
4937 4303
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dad98e4a5043..5d91213d34c0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -36,7 +36,7 @@
36 * ACK bit. 36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery. 37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the 38 * Fixed many serious bugs in the
39 * open_request handling and moved 39 * request_sock handling and moved
40 * most of it into the af independent code. 40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes. 41 * Added tail drop and some other bugfixes.
42 * Added new listen sematics. 42 * Added new listen sematics.
@@ -869,21 +869,23 @@ static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
869 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); 869 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
870} 870}
871 871
872static struct open_request *tcp_v4_search_req(struct tcp_sock *tp, 872static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
873 struct open_request ***prevp, 873 struct request_sock ***prevp,
874 __u16 rport, 874 __u16 rport,
875 __u32 raddr, __u32 laddr) 875 __u32 raddr, __u32 laddr)
876{ 876{
877 struct tcp_listen_opt *lopt = tp->listen_opt; 877 struct listen_sock *lopt = tp->accept_queue.listen_opt;
878 struct open_request *req, **prev; 878 struct request_sock *req, **prev;
879 879
880 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; 880 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
881 (req = *prev) != NULL; 881 (req = *prev) != NULL;
882 prev = &req->dl_next) { 882 prev = &req->dl_next) {
883 if (req->rmt_port == rport && 883 const struct inet_request_sock *ireq = inet_rsk(req);
884 req->af.v4_req.rmt_addr == raddr && 884
885 req->af.v4_req.loc_addr == laddr && 885 if (ireq->rmt_port == rport &&
886 TCP_INET_FAMILY(req->class->family)) { 886 ireq->rmt_addr == raddr &&
887 ireq->loc_addr == laddr &&
888 TCP_INET_FAMILY(req->rsk_ops->family)) {
887 BUG_TRAP(!req->sk); 889 BUG_TRAP(!req->sk);
888 *prevp = prev; 890 *prevp = prev;
889 break; 891 break;
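On the IPv4 side, the monolithic open_request with its af.v4_req union becomes a layered request_sock, reached through typed accessors instead of union members. The layering behind inet_rsk()/tcp_rsk(), sketched with abbreviated field lists:

	struct inet_request_sock {
		struct request_sock req;	/* generic: dl_next, expires, sk, ... */
		u32 loc_addr;
		u32 rmt_addr;
		u16 rmt_port;
		/* ... option flags, struct ip_options *opt, ... */
	};

	struct tcp_request_sock {
		struct inet_request_sock req;
		u32 rcv_isn;
		u32 snt_isn;
	};

	static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
	{
		return (struct inet_request_sock *)sk;
	}

	static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *sk)
	{
		return (struct tcp_request_sock *)sk;
	}

The casts are safe because each layer embeds the previous one as its first member.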
@@ -893,21 +895,13 @@ static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
893 return req; 895 return req;
894} 896}
895 897
896static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) 898static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
897{ 899{
898 struct tcp_sock *tp = tcp_sk(sk); 900 struct tcp_sock *tp = tcp_sk(sk);
899 struct tcp_listen_opt *lopt = tp->listen_opt; 901 struct listen_sock *lopt = tp->accept_queue.listen_opt;
900 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd); 902 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
901
902 req->expires = jiffies + TCP_TIMEOUT_INIT;
903 req->retrans = 0;
904 req->sk = NULL;
905 req->dl_next = lopt->syn_table[h];
906
907 write_lock(&tp->syn_wait_lock);
908 lopt->syn_table[h] = req;
909 write_unlock(&tp->syn_wait_lock);
910 903
904 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
911 tcp_synq_added(sk); 905 tcp_synq_added(sk);
912} 906}
913 907
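The hand-rolled syn-table insertion gives way to a library call. Mirroring the deleted lines one for one, the helper presumably reads:

	static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
						u32 hash, struct request_sock *req,
						unsigned long timeout)
	{
		struct listen_sock *lopt = queue->listen_opt;

		req->expires = jiffies + timeout;
		req->retrans = 0;
		req->sk = NULL;
		req->dl_next = lopt->syn_table[hash];

		write_lock(&queue->syn_wait_lock);
		lopt->syn_table[hash] = req;
		write_unlock(&queue->syn_wait_lock);
	}

Note that the lock moves as well: syn_wait_lock now lives in the accept queue rather than in tcp_sock.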
@@ -1050,7 +1044,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1050 } 1044 }
1051 1045
1052 switch (sk->sk_state) { 1046 switch (sk->sk_state) {
1053 struct open_request *req, **prev; 1047 struct request_sock *req, **prev;
1054 case TCP_LISTEN: 1048 case TCP_LISTEN:
1055 if (sock_owned_by_user(sk)) 1049 if (sock_owned_by_user(sk))
1056 goto out; 1050 goto out;
@@ -1065,7 +1059,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1065 */ 1059 */
1066 BUG_TRAP(!req->sk); 1060 BUG_TRAP(!req->sk);
1067 1061
1068 if (seq != req->snt_isn) { 1062 if (seq != tcp_rsk(req)->snt_isn) {
1069 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); 1063 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1070 goto out; 1064 goto out;
1071 } 1065 }
@@ -1254,28 +1248,29 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1254 tcp_tw_put(tw); 1248 tcp_tw_put(tw);
1255} 1249}
1256 1250
1257static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) 1251static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1258{ 1252{
1259 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd, 1253 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1260 req->ts_recent); 1254 req->ts_recent);
1261} 1255}
1262 1256
1263static struct dst_entry* tcp_v4_route_req(struct sock *sk, 1257static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1264 struct open_request *req) 1258 struct request_sock *req)
1265{ 1259{
1266 struct rtable *rt; 1260 struct rtable *rt;
1267 struct ip_options *opt = req->af.v4_req.opt; 1261 const struct inet_request_sock *ireq = inet_rsk(req);
1262 struct ip_options *opt = inet_rsk(req)->opt;
1268 struct flowi fl = { .oif = sk->sk_bound_dev_if, 1263 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1269 .nl_u = { .ip4_u = 1264 .nl_u = { .ip4_u =
1270 { .daddr = ((opt && opt->srr) ? 1265 { .daddr = ((opt && opt->srr) ?
1271 opt->faddr : 1266 opt->faddr :
1272 req->af.v4_req.rmt_addr), 1267 ireq->rmt_addr),
1273 .saddr = req->af.v4_req.loc_addr, 1268 .saddr = ireq->loc_addr,
1274 .tos = RT_CONN_FLAGS(sk) } }, 1269 .tos = RT_CONN_FLAGS(sk) } },
1275 .proto = IPPROTO_TCP, 1270 .proto = IPPROTO_TCP,
1276 .uli_u = { .ports = 1271 .uli_u = { .ports =
1277 { .sport = inet_sk(sk)->sport, 1272 { .sport = inet_sk(sk)->sport,
1278 .dport = req->rmt_port } } }; 1273 .dport = ireq->rmt_port } } };
1279 1274
1280 if (ip_route_output_flow(&rt, &fl, sk, 0)) { 1275 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1281 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 1276 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
@@ -1291,12 +1286,13 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1291 1286
1292/* 1287/*
1293 * Send a SYN-ACK after having received an ACK. 1288 * Send a SYN-ACK after having received an ACK.
1294 * This still operates on a open_request only, not on a big 1289 * This still operates on a request_sock only, not on a big
1295 * socket. 1290 * socket.
1296 */ 1291 */
1297static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, 1292static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1298 struct dst_entry *dst) 1293 struct dst_entry *dst)
1299{ 1294{
1295 const struct inet_request_sock *ireq = inet_rsk(req);
1300 int err = -1; 1296 int err = -1;
1301 struct sk_buff * skb; 1297 struct sk_buff * skb;
1302 1298
@@ -1310,14 +1306,14 @@ static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1310 struct tcphdr *th = skb->h.th; 1306 struct tcphdr *th = skb->h.th;
1311 1307
1312 th->check = tcp_v4_check(th, skb->len, 1308 th->check = tcp_v4_check(th, skb->len,
1313 req->af.v4_req.loc_addr, 1309 ireq->loc_addr,
1314 req->af.v4_req.rmt_addr, 1310 ireq->rmt_addr,
1315 csum_partial((char *)th, skb->len, 1311 csum_partial((char *)th, skb->len,
1316 skb->csum)); 1312 skb->csum));
1317 1313
1318 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, 1314 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1319 req->af.v4_req.rmt_addr, 1315 ireq->rmt_addr,
1320 req->af.v4_req.opt); 1316 ireq->opt);
1321 if (err == NET_XMIT_CN) 1317 if (err == NET_XMIT_CN)
1322 err = 0; 1318 err = 0;
1323 } 1319 }
@@ -1328,12 +1324,12 @@ out:
1328} 1324}
1329 1325
1330/* 1326/*
1331 * IPv4 open_request destructor. 1327 * IPv4 request_sock destructor.
1332 */ 1328 */
1333static void tcp_v4_or_free(struct open_request *req) 1329static void tcp_v4_reqsk_destructor(struct request_sock *req)
1334{ 1330{
1335 if (req->af.v4_req.opt) 1331 if (inet_rsk(req)->opt)
1336 kfree(req->af.v4_req.opt); 1332 kfree(inet_rsk(req)->opt);
1337} 1333}
1338 1334
1339static inline void syn_flood_warning(struct sk_buff *skb) 1335static inline void syn_flood_warning(struct sk_buff *skb)
@@ -1349,7 +1345,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
1349} 1345}
1350 1346
1351/* 1347/*
1352 * Save and compile IPv4 options into the open_request if needed. 1348 * Save and compile IPv4 options into the request_sock if needed.
1353 */ 1349 */
1354static inline struct ip_options *tcp_v4_save_options(struct sock *sk, 1350static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355 struct sk_buff *skb) 1351 struct sk_buff *skb)
@@ -1370,33 +1366,20 @@ static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1370 return dopt; 1366 return dopt;
1371} 1367}
1372 1368
1373/* 1369struct request_sock_ops tcp_request_sock_ops = {
1374 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1375 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1376 * It would be better to replace it with a global counter for all sockets
1377 * but then some measure against one socket starving all other sockets
1378 * would be needed.
1379 *
1380 * It was 128 by default. Experiments with real servers show, that
1381 * it is absolutely not enough even at 100conn/sec. 256 cures most
1382 * of problems. This value is adjusted to 128 for very small machines
1383 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1384 * Further increasing requires to change hash table size.
1385 */
1386int sysctl_max_syn_backlog = 256;
1387
1388struct or_calltable or_ipv4 = {
1389 .family = PF_INET, 1370 .family = PF_INET,
1371 .obj_size = sizeof(struct tcp_request_sock),
1390 .rtx_syn_ack = tcp_v4_send_synack, 1372 .rtx_syn_ack = tcp_v4_send_synack,
1391 .send_ack = tcp_v4_or_send_ack, 1373 .send_ack = tcp_v4_reqsk_send_ack,
1392 .destructor = tcp_v4_or_free, 1374 .destructor = tcp_v4_reqsk_destructor,
1393 .send_reset = tcp_v4_send_reset, 1375 .send_reset = tcp_v4_send_reset,
1394}; 1376};
1395 1377
1396int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1378int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1397{ 1379{
1380 struct inet_request_sock *ireq;
1398 struct tcp_options_received tmp_opt; 1381 struct tcp_options_received tmp_opt;
1399 struct open_request *req; 1382 struct request_sock *req;
1400 __u32 saddr = skb->nh.iph->saddr; 1383 __u32 saddr = skb->nh.iph->saddr;
1401 __u32 daddr = skb->nh.iph->daddr; 1384 __u32 daddr = skb->nh.iph->daddr;
1402 __u32 isn = TCP_SKB_CB(skb)->when; 1385 __u32 isn = TCP_SKB_CB(skb)->when;
@@ -1433,7 +1416,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1433 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) 1416 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1434 goto drop; 1417 goto drop;
1435 1418
1436 req = tcp_openreq_alloc(); 1419 req = reqsk_alloc(&tcp_request_sock_ops);
1437 if (!req) 1420 if (!req)
1438 goto drop; 1421 goto drop;
1439 1422
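or_calltable becomes request_sock_ops and gains an obj_size field, so protocol-independent code can size the allocation (here sizeof(struct tcp_request_sock)); the sysctl_max_syn_backlog definition and its comment block leave this file, presumably with the rest of the generic request-socket code (its EXPORT_SYMBOL is dropped further down). The allocator behind reqsk_alloc() is plausibly:

	/* Sketch: the ops' slab/obj_size let generic code allocate per protocol. */
	static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
	{
		struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);

		if (req != NULL)
			req->rsk_ops = ops;
		return req;
	}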
@@ -1461,10 +1444,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1461 1444
1462 tcp_openreq_init(req, &tmp_opt, skb); 1445 tcp_openreq_init(req, &tmp_opt, skb);
1463 1446
1464 req->af.v4_req.loc_addr = daddr; 1447 ireq = inet_rsk(req);
1465 req->af.v4_req.rmt_addr = saddr; 1448 ireq->loc_addr = daddr;
1466 req->af.v4_req.opt = tcp_v4_save_options(sk, skb); 1449 ireq->rmt_addr = saddr;
1467 req->class = &or_ipv4; 1450 ireq->opt = tcp_v4_save_options(sk, skb);
1468 if (!want_cookie) 1451 if (!want_cookie)
1469 TCP_ECN_create_request(req, skb->h.th); 1452 TCP_ECN_create_request(req, skb->h.th);
1470 1453
@@ -1511,32 +1494,31 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1511 * to destinations, already remembered 1494 * to destinations, already remembered
1512 * to the moment of synflood. 1495 * to the moment of synflood.
1513 */ 1496 */
1514 NETDEBUG(if (net_ratelimit()) \ 1497 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1515 printk(KERN_DEBUG "TCP: drop open " 1498 "request from %u.%u."
1516 "request from %u.%u." 1499 "%u.%u/%u\n",
1517 "%u.%u/%u\n", \ 1500 NIPQUAD(saddr),
1518 NIPQUAD(saddr), 1501 ntohs(skb->h.th->source)));
1519 ntohs(skb->h.th->source)));
1520 dst_release(dst); 1502 dst_release(dst);
1521 goto drop_and_free; 1503 goto drop_and_free;
1522 } 1504 }
1523 1505
1524 isn = tcp_v4_init_sequence(sk, skb); 1506 isn = tcp_v4_init_sequence(sk, skb);
1525 } 1507 }
1526 req->snt_isn = isn; 1508 tcp_rsk(req)->snt_isn = isn;
1527 1509
1528 if (tcp_v4_send_synack(sk, req, dst)) 1510 if (tcp_v4_send_synack(sk, req, dst))
1529 goto drop_and_free; 1511 goto drop_and_free;
1530 1512
1531 if (want_cookie) { 1513 if (want_cookie) {
1532 tcp_openreq_free(req); 1514 reqsk_free(req);
1533 } else { 1515 } else {
1534 tcp_v4_synq_add(sk, req); 1516 tcp_v4_synq_add(sk, req);
1535 } 1517 }
1536 return 0; 1518 return 0;
1537 1519
1538drop_and_free: 1520drop_and_free:
1539 tcp_openreq_free(req); 1521 reqsk_free(req);
1540drop: 1522drop:
1541 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 1523 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1542 return 0; 1524 return 0;
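The NETDEBUG(if (net_ratelimit()) ...) idiom used at both converted call sites folds into a single macro, whose definition is presumably along the lines of:

	#define LIMIT_NETDEBUG(x) do { if (net_ratelimit()) { x; } } while (0)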
@@ -1548,9 +1530,10 @@ drop:
1548 * now create the new socket. 1530 * now create the new socket.
1549 */ 1531 */
1550struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, 1532struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1551 struct open_request *req, 1533 struct request_sock *req,
1552 struct dst_entry *dst) 1534 struct dst_entry *dst)
1553{ 1535{
1536 struct inet_request_sock *ireq;
1554 struct inet_sock *newinet; 1537 struct inet_sock *newinet;
1555 struct tcp_sock *newtp; 1538 struct tcp_sock *newtp;
1556 struct sock *newsk; 1539 struct sock *newsk;
@@ -1570,11 +1553,12 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1570 1553
1571 newtp = tcp_sk(newsk); 1554 newtp = tcp_sk(newsk);
1572 newinet = inet_sk(newsk); 1555 newinet = inet_sk(newsk);
1573 newinet->daddr = req->af.v4_req.rmt_addr; 1556 ireq = inet_rsk(req);
1574 newinet->rcv_saddr = req->af.v4_req.loc_addr; 1557 newinet->daddr = ireq->rmt_addr;
1575 newinet->saddr = req->af.v4_req.loc_addr; 1558 newinet->rcv_saddr = ireq->loc_addr;
1576 newinet->opt = req->af.v4_req.opt; 1559 newinet->saddr = ireq->loc_addr;
1577 req->af.v4_req.opt = NULL; 1560 newinet->opt = ireq->opt;
1561 ireq->opt = NULL;
1578 newinet->mc_index = tcp_v4_iif(skb); 1562 newinet->mc_index = tcp_v4_iif(skb);
1579 newinet->mc_ttl = skb->nh.iph->ttl; 1563 newinet->mc_ttl = skb->nh.iph->ttl;
1580 newtp->ext_header_len = 0; 1564 newtp->ext_header_len = 0;
@@ -1605,9 +1589,9 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1605 struct iphdr *iph = skb->nh.iph; 1589 struct iphdr *iph = skb->nh.iph;
1606 struct tcp_sock *tp = tcp_sk(sk); 1590 struct tcp_sock *tp = tcp_sk(sk);
1607 struct sock *nsk; 1591 struct sock *nsk;
1608 struct open_request **prev; 1592 struct request_sock **prev;
1609 /* Find possible connection requests. */ 1593 /* Find possible connection requests. */
1610 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source, 1594 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1611 iph->saddr, iph->daddr); 1595 iph->saddr, iph->daddr);
1612 if (req) 1596 if (req)
1613 return tcp_check_req(sk, skb, req, prev); 1597 return tcp_check_req(sk, skb, req, prev);
@@ -1642,8 +1626,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
1642 skb->nh.iph->daddr, skb->csum)) 1626 skb->nh.iph->daddr, skb->csum))
1643 return 0; 1627 return 0;
1644 1628
1645 NETDEBUG(if (net_ratelimit()) 1629 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1646 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1647 skb->ip_summed = CHECKSUM_NONE; 1630 skb->ip_summed = CHECKSUM_NONE;
1648 } 1631 }
1649 if (skb->len <= 76) { 1632 if (skb->len <= 76) {
@@ -2060,9 +2043,10 @@ static int tcp_v4_init_sock(struct sock *sk)
2060 */ 2043 */
2061 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2044 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2062 tp->snd_cwnd_clamp = ~0; 2045 tp->snd_cwnd_clamp = ~0;
2063 tp->mss_cache_std = tp->mss_cache = 536; 2046 tp->mss_cache = 536;
2064 2047
2065 tp->reordering = sysctl_tcp_reordering; 2048 tp->reordering = sysctl_tcp_reordering;
2049 tp->ca_ops = &tcp_init_congestion_ops;
2066 2050
2067 sk->sk_state = TCP_CLOSE; 2051 sk->sk_state = TCP_CLOSE;
2068 2052
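mss_cache_std and mss_cache collapse into a single mss_cache, and every new socket starts on tcp_init_congestion_ops: a built-in placeholder that acts like plain Reno until tcp_init_congestion_control() binds a real algorithm at handshake time. The placeholder plausibly looks like:

	struct tcp_congestion_ops tcp_init_congestion_ops = {
		.name		= "",
		.owner		= THIS_MODULE,
		.ssthresh	= tcp_reno_ssthresh,
		.cong_avoid	= tcp_reno_cong_avoid,
	};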
@@ -2085,6 +2069,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2085 2069
2086 tcp_clear_xmit_timers(sk); 2070 tcp_clear_xmit_timers(sk);
2087 2071
2072 tcp_cleanup_congestion_control(tp);
2073
2088 /* Cleanup up the write buffer. */ 2074 /* Cleanup up the write buffer. */
2089 sk_stream_writequeue_purge(sk); 2075 sk_stream_writequeue_purge(sk);
2090 2076
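Socket destruction gains the matching teardown. Since algorithms are now modules with an owner field, the cleanup presumably releases private state and drops the module reference taken when the algorithm was bound:

	void tcp_cleanup_congestion_control(struct tcp_sock *tp)
	{
		if (tp->ca_ops->release)
			tp->ca_ops->release(tp);
		module_put(tp->ca_ops->owner);
	}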
@@ -2144,13 +2130,13 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2144 ++st->num; 2130 ++st->num;
2145 2131
2146 if (st->state == TCP_SEQ_STATE_OPENREQ) { 2132 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2147 struct open_request *req = cur; 2133 struct request_sock *req = cur;
2148 2134
2149 tp = tcp_sk(st->syn_wait_sk); 2135 tp = tcp_sk(st->syn_wait_sk);
2150 req = req->dl_next; 2136 req = req->dl_next;
2151 while (1) { 2137 while (1) {
2152 while (req) { 2138 while (req) {
2153 if (req->class->family == st->family) { 2139 if (req->rsk_ops->family == st->family) {
2154 cur = req; 2140 cur = req;
2155 goto out; 2141 goto out;
2156 } 2142 }
@@ -2159,17 +2145,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2159 if (++st->sbucket >= TCP_SYNQ_HSIZE) 2145 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2160 break; 2146 break;
2161get_req: 2147get_req:
2162 req = tp->listen_opt->syn_table[st->sbucket]; 2148 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2163 } 2149 }
2164 sk = sk_next(st->syn_wait_sk); 2150 sk = sk_next(st->syn_wait_sk);
2165 st->state = TCP_SEQ_STATE_LISTENING; 2151 st->state = TCP_SEQ_STATE_LISTENING;
2166 read_unlock_bh(&tp->syn_wait_lock); 2152 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2167 } else { 2153 } else {
2168 tp = tcp_sk(sk); 2154 tp = tcp_sk(sk);
2169 read_lock_bh(&tp->syn_wait_lock); 2155 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2170 if (tp->listen_opt && tp->listen_opt->qlen) 2156 if (reqsk_queue_len(&tp->accept_queue))
2171 goto start_req; 2157 goto start_req;
2172 read_unlock_bh(&tp->syn_wait_lock); 2158 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2173 sk = sk_next(sk); 2159 sk = sk_next(sk);
2174 } 2160 }
2175get_sk: 2161get_sk:
@@ -2179,8 +2165,8 @@ get_sk:
2179 goto out; 2165 goto out;
2180 } 2166 }
2181 tp = tcp_sk(sk); 2167 tp = tcp_sk(sk);
2182 read_lock_bh(&tp->syn_wait_lock); 2168 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2183 if (tp->listen_opt && tp->listen_opt->qlen) { 2169 if (reqsk_queue_len(&tp->accept_queue)) {
2184start_req: 2170start_req:
2185 st->uid = sock_i_uid(sk); 2171 st->uid = sock_i_uid(sk);
2186 st->syn_wait_sk = sk; 2172 st->syn_wait_sk = sk;
@@ -2188,7 +2174,7 @@ start_req:
2188 st->sbucket = 0; 2174 st->sbucket = 0;
2189 goto get_req; 2175 goto get_req;
2190 } 2176 }
2191 read_unlock_bh(&tp->syn_wait_lock); 2177 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2192 } 2178 }
2193 if (++st->bucket < TCP_LHTABLE_SIZE) { 2179 if (++st->bucket < TCP_LHTABLE_SIZE) {
2194 sk = sk_head(&tcp_listening_hash[st->bucket]); 2180 sk = sk_head(&tcp_listening_hash[st->bucket]);
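The /proc iterator now asks the accept queue for its SYN backlog instead of poking tp->listen_opt directly, which also makes the NULL check implicit. The accessor replacing the old tp->listen_opt && tp->listen_opt->qlen test is presumably:

	static inline int reqsk_queue_len(const struct request_sock_queue *queue)
	{
		return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0;
	}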
@@ -2375,7 +2361,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2375 case TCP_SEQ_STATE_OPENREQ: 2361 case TCP_SEQ_STATE_OPENREQ:
2376 if (v) { 2362 if (v) {
2377 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); 2363 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2378 read_unlock_bh(&tp->syn_wait_lock); 2364 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2379 } 2365 }
2380 case TCP_SEQ_STATE_LISTENING: 2366 case TCP_SEQ_STATE_LISTENING:
2381 if (v != SEQ_START_TOKEN) 2367 if (v != SEQ_START_TOKEN)
@@ -2451,18 +2437,19 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2451 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 2437 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2452} 2438}
2453 2439
2454static void get_openreq4(struct sock *sk, struct open_request *req, 2440static void get_openreq4(struct sock *sk, struct request_sock *req,
2455 char *tmpbuf, int i, int uid) 2441 char *tmpbuf, int i, int uid)
2456{ 2442{
2443 const struct inet_request_sock *ireq = inet_rsk(req);
2457 int ttd = req->expires - jiffies; 2444 int ttd = req->expires - jiffies;
2458 2445
2459 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" 2446 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2460 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", 2447 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2461 i, 2448 i,
2462 req->af.v4_req.loc_addr, 2449 ireq->loc_addr,
2463 ntohs(inet_sk(sk)->sport), 2450 ntohs(inet_sk(sk)->sport),
2464 req->af.v4_req.rmt_addr, 2451 ireq->rmt_addr,
2465 ntohs(req->rmt_port), 2452 ntohs(ireq->rmt_port),
2466 TCP_SYN_RECV, 2453 TCP_SYN_RECV,
2467 0, 0, /* could print option size, but that is af dependent. */ 2454 0, 0, /* could print option size, but that is af dependent. */
2468 1, /* timers active (only the expire timer) */ 2455 1, /* timers active (only the expire timer) */
@@ -2618,6 +2605,7 @@ struct proto tcp_prot = {
2618 .sysctl_rmem = sysctl_tcp_rmem, 2605 .sysctl_rmem = sysctl_tcp_rmem,
2619 .max_header = MAX_TCP_HEADER, 2606 .max_header = MAX_TCP_HEADER,
2620 .obj_size = sizeof(struct tcp_sock), 2607 .obj_size = sizeof(struct tcp_sock),
2608 .rsk_prot = &tcp_request_sock_ops,
2621}; 2609};
2622 2610
2623 2611
@@ -2660,7 +2648,6 @@ EXPORT_SYMBOL(tcp_proc_register);
2660EXPORT_SYMBOL(tcp_proc_unregister); 2648EXPORT_SYMBOL(tcp_proc_unregister);
2661#endif 2649#endif
2662EXPORT_SYMBOL(sysctl_local_port_range); 2650EXPORT_SYMBOL(sysctl_local_port_range);
2663EXPORT_SYMBOL(sysctl_max_syn_backlog);
2664EXPORT_SYMBOL(sysctl_tcp_low_latency); 2651EXPORT_SYMBOL(sysctl_tcp_low_latency);
2665EXPORT_SYMBOL(sysctl_tcp_tw_reuse); 2652EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2666 2653
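The listening_get_next() hunks above replace every open-coded "tp->listen_opt && tp->listen_opt->qlen" test with reqsk_queue_len(&tp->accept_queue), and syn_wait_lock moves into the same structure. A plausible sketch of that helper, assuming listen_opt simply migrated into the new request_sock_queue (field layout is illustrative, not the exact 2.6 definition):

    struct request_sock_queue {
            rwlock_t                 syn_wait_lock;
            struct listen_sock      *listen_opt;    /* SYN backlog, or NULL */
            /* accept-queue head/tail pointers live here as well */
    };

    /* NULL-safe replacement for "tp->listen_opt && tp->listen_opt->qlen" */
    static inline int reqsk_queue_len(const struct request_sock_queue *queue)
    {
            return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0;
    }

Folding the NULL check into the helper is what lets the /proc iterator above shrink to a single call per bucket.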
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index eea1a17a9ac2..f42a284164b7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -684,7 +684,7 @@ out:
684 * Actually, we could save lots of memory writes here. tp of listening 684 * Actually, we could save lots of memory writes here. tp of listening
685 * socket contains all necessary default parameters. 685 * socket contains all necessary default parameters.
686 */ 686 */
687struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) 687struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
688{ 688{
689 /* allocate the newsk from the same slab of the master sock, 689 /* allocate the newsk from the same slab of the master sock,
690 * if not, at sk_free time we'll try to free it from the wrong 690 * if not, at sk_free time we'll try to free it from the wrong
@@ -692,6 +692,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); 692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
693 693
694 if(newsk != NULL) { 694 if(newsk != NULL) {
695 struct inet_request_sock *ireq = inet_rsk(req);
696 struct tcp_request_sock *treq = tcp_rsk(req);
695 struct tcp_sock *newtp; 697 struct tcp_sock *newtp;
696 struct sk_filter *filter; 698 struct sk_filter *filter;
697 699
@@ -703,7 +705,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
703 tcp_sk(newsk)->bind_hash = NULL; 705 tcp_sk(newsk)->bind_hash = NULL;
704 706
705 /* Clone the TCP header template */ 707 /* Clone the TCP header template */
706 inet_sk(newsk)->dport = req->rmt_port; 708 inet_sk(newsk)->dport = ireq->rmt_port;
707 709
708 sock_lock_init(newsk); 710 sock_lock_init(newsk);
709 bh_lock_sock(newsk); 711 bh_lock_sock(newsk);
@@ -739,14 +741,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
739 /* Now setup tcp_sock */ 741 /* Now setup tcp_sock */
740 newtp = tcp_sk(newsk); 742 newtp = tcp_sk(newsk);
741 newtp->pred_flags = 0; 743 newtp->pred_flags = 0;
742 newtp->rcv_nxt = req->rcv_isn + 1; 744 newtp->rcv_nxt = treq->rcv_isn + 1;
743 newtp->snd_nxt = req->snt_isn + 1; 745 newtp->snd_nxt = treq->snt_isn + 1;
744 newtp->snd_una = req->snt_isn + 1; 746 newtp->snd_una = treq->snt_isn + 1;
745 newtp->snd_sml = req->snt_isn + 1; 747 newtp->snd_sml = treq->snt_isn + 1;
746 748
747 tcp_prequeue_init(newtp); 749 tcp_prequeue_init(newtp);
748 750
749 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn); 751 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
750 752
751 newtp->retransmits = 0; 753 newtp->retransmits = 0;
752 newtp->backoff = 0; 754 newtp->backoff = 0;
@@ -772,13 +774,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
772 newtp->frto_counter = 0; 774 newtp->frto_counter = 0;
773 newtp->frto_highmark = 0; 775 newtp->frto_highmark = 0;
774 776
777 newtp->ca_ops = &tcp_reno;
778
775 tcp_set_ca_state(newtp, TCP_CA_Open); 779 tcp_set_ca_state(newtp, TCP_CA_Open);
776 tcp_init_xmit_timers(newsk); 780 tcp_init_xmit_timers(newsk);
777 skb_queue_head_init(&newtp->out_of_order_queue); 781 skb_queue_head_init(&newtp->out_of_order_queue);
778 newtp->rcv_wup = req->rcv_isn + 1; 782 newtp->rcv_wup = treq->rcv_isn + 1;
779 newtp->write_seq = req->snt_isn + 1; 783 newtp->write_seq = treq->snt_isn + 1;
780 newtp->pushed_seq = newtp->write_seq; 784 newtp->pushed_seq = newtp->write_seq;
781 newtp->copied_seq = req->rcv_isn + 1; 785 newtp->copied_seq = treq->rcv_isn + 1;
782 786
783 newtp->rx_opt.saw_tstamp = 0; 787 newtp->rx_opt.saw_tstamp = 0;
784 788
@@ -788,10 +792,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
788 newtp->probes_out = 0; 792 newtp->probes_out = 0;
789 newtp->rx_opt.num_sacks = 0; 793 newtp->rx_opt.num_sacks = 0;
790 newtp->urg_data = 0; 794 newtp->urg_data = 0;
791 newtp->listen_opt = NULL; 795 /* Deinitialize accept_queue to trap illegal accesses. */
792 newtp->accept_queue = newtp->accept_queue_tail = NULL; 796 memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
793 /* Deinitialize syn_wait_lock to trap illegal accesses. */
794 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
795 797
796 /* Back to base struct sock members. */ 798 /* Back to base struct sock members. */
797 newsk->sk_err = 0; 799 newsk->sk_err = 0;
@@ -808,18 +810,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
808 newsk->sk_socket = NULL; 810 newsk->sk_socket = NULL;
809 newsk->sk_sleep = NULL; 811 newsk->sk_sleep = NULL;
810 812
811 newtp->rx_opt.tstamp_ok = req->tstamp_ok; 813 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
812 if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) { 814 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
813 if (sysctl_tcp_fack) 815 if (sysctl_tcp_fack)
814 newtp->rx_opt.sack_ok |= 2; 816 newtp->rx_opt.sack_ok |= 2;
815 } 817 }
816 newtp->window_clamp = req->window_clamp; 818 newtp->window_clamp = req->window_clamp;
817 newtp->rcv_ssthresh = req->rcv_wnd; 819 newtp->rcv_ssthresh = req->rcv_wnd;
818 newtp->rcv_wnd = req->rcv_wnd; 820 newtp->rcv_wnd = req->rcv_wnd;
819 newtp->rx_opt.wscale_ok = req->wscale_ok; 821 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
820 if (newtp->rx_opt.wscale_ok) { 822 if (newtp->rx_opt.wscale_ok) {
821 newtp->rx_opt.snd_wscale = req->snd_wscale; 823 newtp->rx_opt.snd_wscale = ireq->snd_wscale;
822 newtp->rx_opt.rcv_wscale = req->rcv_wscale; 824 newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
823 } else { 825 } else {
824 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; 826 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
825 newtp->window_clamp = min(newtp->window_clamp, 65535U); 827 newtp->window_clamp = min(newtp->window_clamp, 65535U);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
842 if (newtp->ecn_flags&TCP_ECN_OK) 844 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND); 845 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844 846
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); 847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 } 848 }
849 return newsk; 849 return newsk;
@@ -851,12 +851,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
851 851
852/* 852/*
853 * Process an incoming packet for SYN_RECV sockets represented 853 * Process an incoming packet for SYN_RECV sockets represented
854 * as an open_request. 854 * as a request_sock.
855 */ 855 */
856 856
857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, 857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
858 struct open_request *req, 858 struct request_sock *req,
859 struct open_request **prev) 859 struct request_sock **prev)
860{ 860{
861 struct tcphdr *th = skb->h.th; 861 struct tcphdr *th = skb->h.th;
862 struct tcp_sock *tp = tcp_sk(sk); 862 struct tcp_sock *tp = tcp_sk(sk);
@@ -881,7 +881,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
881 } 881 }
882 882
883 /* Check for pure retransmitted SYN. */ 883 /* Check for pure retransmitted SYN. */
884 if (TCP_SKB_CB(skb)->seq == req->rcv_isn && 884 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
885 flg == TCP_FLAG_SYN && 885 flg == TCP_FLAG_SYN &&
886 !paws_reject) { 886 !paws_reject) {
887 /* 887 /*
@@ -901,7 +901,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
901 * Enforce "SYN-ACK" according to figure 8, figure 6 901 * Enforce "SYN-ACK" according to figure 8, figure 6
902 * of RFC793, fixed by RFC1122. 902 * of RFC793, fixed by RFC1122.
903 */ 903 */
904 req->class->rtx_syn_ack(sk, req, NULL); 904 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
905 return NULL; 905 return NULL;
906 } 906 }
907 907
@@ -959,7 +959,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
959 * Invalid ACK: reset will be sent by listening socket 959 * Invalid ACK: reset will be sent by listening socket
960 */ 960 */
961 if ((flg & TCP_FLAG_ACK) && 961 if ((flg & TCP_FLAG_ACK) &&
962 (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)) 962 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
963 return sk; 963 return sk;
964 964
965 /* Also, it would be not so bad idea to check rcv_tsecr, which 965 /* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -970,10 +970,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
970 /* RFC793: "first check sequence number". */ 970 /* RFC793: "first check sequence number". */
971 971
972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
973 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { 973 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
974 /* Out of window: send ACK and drop. */ 974 /* Out of window: send ACK and drop. */
975 if (!(flg & TCP_FLAG_RST)) 975 if (!(flg & TCP_FLAG_RST))
976 req->class->send_ack(skb, req); 976 req->rsk_ops->send_ack(skb, req);
977 if (paws_reject) 977 if (paws_reject)
978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); 978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
979 return NULL; 979 return NULL;
@@ -981,12 +981,12 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
981 981
982 /* In sequence, PAWS is OK. */ 982 /* In sequence, PAWS is OK. */
983 983
984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) 984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
985 req->ts_recent = tmp_opt.rcv_tsval; 985 req->ts_recent = tmp_opt.rcv_tsval;
986 986
987 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { 987 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
988 /* Truncate SYN, it is out of window starting 988 /* Truncate SYN, it is out of window starting
989 at req->rcv_isn+1. */ 989 at tcp_rsk(req)->rcv_isn + 1. */
990 flg &= ~TCP_FLAG_SYN; 990 flg &= ~TCP_FLAG_SYN;
991 } 991 }
992 992
@@ -1003,8 +1003,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1003 return NULL; 1003 return NULL;
1004 1004
1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ 1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { 1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
1007 req->acked = 1; 1007 inet_rsk(req)->acked = 1;
1008 return NULL; 1008 return NULL;
1009 } 1009 }
1010 1010
@@ -1026,14 +1026,14 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1026 1026
1027 listen_overflow: 1027 listen_overflow:
1028 if (!sysctl_tcp_abort_on_overflow) { 1028 if (!sysctl_tcp_abort_on_overflow) {
1029 req->acked = 1; 1029 inet_rsk(req)->acked = 1;
1030 return NULL; 1030 return NULL;
1031 } 1031 }
1032 1032
1033 embryonic_reset: 1033 embryonic_reset:
1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); 1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
1035 if (!(flg & TCP_FLAG_RST)) 1035 if (!(flg & TCP_FLAG_RST))
1036 req->class->send_reset(skb); 1036 req->rsk_ops->send_reset(skb);
1037 1037
1038 tcp_synq_drop(sk, req, prev); 1038 tcp_synq_drop(sk, req, prev);
1039 return NULL; 1039 return NULL;
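Throughout the tcp_minisocks.c changes above, one request is viewed through three types: req->rsk_ops and req->expires on the generic struct request_sock, inet_rsk(req)->acked on the INET layer, and tcp_rsk(req)->rcv_isn / snt_isn on the TCP layer. A minimal sketch of the layering that makes those casts safe (abridged fields; an illustration, not the exact headers):

    struct request_sock {                   /* protocol-independent core */
            struct request_sock             *dl_next;
            u8                              retrans;
            u32                             rcv_wnd;
            const struct request_sock_ops   *rsk_ops;       /* was req->class */
            unsigned long                   expires;
    };

    struct inet_request_sock {              /* IPv4 addressing + options */
            struct request_sock     req;    /* must stay the first member */
            u32                     loc_addr, rmt_addr;
            u16                     rmt_port;
            u16                     tstamp_ok : 1, sack_ok : 1,
                                    wscale_ok : 1, ecn_ok : 1, acked : 1;
    };

    struct tcp_request_sock {               /* TCP sequence state */
            struct inet_request_sock ireq;  /* must stay the first member */
            u32                      rcv_isn, snt_isn;
    };

    static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
    {
            return (struct inet_request_sock *)sk;
    }

    static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *sk)
    {
            return (struct tcp_request_sock *)sk;
    }

Because each layer embeds the previous one as its first member, a pointer to the base can be cast to the containing type, which is why req->af.v4_req.rmt_port becomes ireq->rmt_port with no extra indirection.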
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa24e7ae1f40..566045e58437 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
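The default drop from 8 to 3 changes how long the sender is willing to defer in tcp_tso_should_defer() (introduced further down): with divisor d, sending resumes once at least 1/d of the window is open, so a single TSO frame may consume up to that share. A worked comparison with hypothetical numbers:

    /* Hypothetical: snd_cwnd = 20 segments, mss = 1460, ample snd_wnd. */
    u32 chunk = 20 * 1460;                  /* 29200 bytes of cwnd space */

    u32 thresh_old = chunk / 8;             /* 3650: send after ~2 segments */
    u32 thresh_new = chunk / 3;             /* 9733: wait for ~6 segments   */

Lowering the divisor therefore trades a little extra delay for noticeably larger, cheaper TSO bursts.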
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -141,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
141 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
142} 141}
143 142
144static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 146
148 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150} 149}
151 150
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
@@ -361,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
361 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
362 356
363 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
365 359
366 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -409,42 +403,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
409 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
410} 404}
411 405
412static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{
427 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429
430 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
444{ 407{
445 struct tcp_sock *tp = tcp_sk(sk); 408 if (skb->len <= mss_now ||
446
447 if (skb->len <= tp->mss_cache_std ||
448 !(sk->sk_route_caps & NETIF_F_TSO)) { 409 !(sk->sk_route_caps & NETIF_F_TSO)) {
449 /* Avoid the costly divide in the normal 410 /* Avoid the costly divide in the normal
450 * non-TSO case. 411 * non-TSO case.
@@ -454,10 +415,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
454 } else { 415 } else {
455 unsigned int factor; 416 unsigned int factor;
456 417
457 factor = skb->len + (tp->mss_cache_std - 1); 418 factor = skb->len + (mss_now - 1);
458 factor /= tp->mss_cache_std; 419 factor /= mss_now;
459 skb_shinfo(skb)->tso_segs = factor; 420 skb_shinfo(skb)->tso_segs = factor;
460 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 421 skb_shinfo(skb)->tso_size = mss_now;
461 } 422 }
462} 423}
463 424
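The reworked tcp_set_skb_tso_segs() above takes mss_now from its caller instead of reading tp->mss_cache_std, but the per-skb segment count is still a round-up division. For instance:

    /* skb->len = 4000 bytes, mss_now = 1460 */
    unsigned int factor = (4000 + (1460 - 1)) / 1460;       /* 3 */
    /* tso_segs = 3, tso_size = 1460: the card emits two full
     * 1460-byte segments plus a 1080-byte tail. */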
@@ -466,7 +427,7 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
466 * packet to the list. This won't be called frequently, I hope. 427 * packet to the list. This won't be called frequently, I hope.
467 * Remember, these are still headerless SKBs at this point. 428 * Remember, these are still headerless SKBs at this point.
468 */ 429 */
469static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) 430static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
470{ 431{
471 struct tcp_sock *tp = tcp_sk(sk); 432 struct tcp_sock *tp = tcp_sk(sk);
472 struct sk_buff *buff; 433 struct sk_buff *buff;
@@ -521,6 +482,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 482 * skbs, which it never sent before. --ANK
522 */ 483 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 484 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
485 buff->stamp = skb->stamp;
524 486
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 487 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 488 tp->lost_out -= tcp_skb_pcount(skb);
@@ -528,8 +490,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
528 } 490 }
529 491
530 /* Fix up tso_factor for both original and new SKB. */ 492 /* Fix up tso_factor for both original and new SKB. */
531 tcp_set_skb_tso_segs(sk, skb); 493 tcp_set_skb_tso_segs(sk, skb, mss_now);
532 tcp_set_skb_tso_segs(sk, buff); 494 tcp_set_skb_tso_segs(sk, buff, mss_now);
533 495
534 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 496 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
535 tp->lost_out += tcp_skb_pcount(skb); 497 tp->lost_out += tcp_skb_pcount(skb);
@@ -542,6 +504,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
542 } 504 }
543 505
544 /* Link BUFF into the send queue. */ 506 /* Link BUFF into the send queue. */
507 skb_header_release(buff);
545 __skb_append(skb, buff); 508 __skb_append(skb, buff);
546 509
547 return 0; 510 return 0;
@@ -604,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
604 * factor and mss. 567 * factor and mss.
605 */ 568 */
606 if (tcp_skb_pcount(skb) > 1) 569 if (tcp_skb_pcount(skb) > 1)
607 tcp_set_skb_tso_segs(sk, skb); 570 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
608 571
609 return 0; 572 return 0;
610} 573}
@@ -662,7 +625,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
662 625
663 /* And store cached results */ 626 /* And store cached results */
664 tp->pmtu_cookie = pmtu; 627 tp->pmtu_cookie = pmtu;
665 tp->mss_cache = tp->mss_cache_std = mss_now; 628 tp->mss_cache = mss_now;
666 629
667 return mss_now; 630 return mss_now;
668} 631}
@@ -674,57 +637,319 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
674 * cannot be large. However, taking into account rare use of URG, this 637 * cannot be large. However, taking into account rare use of URG, this
675 * is not a big flaw. 638 * is not a big flaw.
676 */ 639 */
677 640unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
678unsigned int tcp_current_mss(struct sock *sk, int large)
679{ 641{
680 struct tcp_sock *tp = tcp_sk(sk); 642 struct tcp_sock *tp = tcp_sk(sk);
681 struct dst_entry *dst = __sk_dst_get(sk); 643 struct dst_entry *dst = __sk_dst_get(sk);
682 unsigned int do_large, mss_now; 644 u32 mss_now;
645 u16 xmit_size_goal;
646 int doing_tso = 0;
647
648 mss_now = tp->mss_cache;
649
650 if (large_allowed &&
651 (sk->sk_route_caps & NETIF_F_TSO) &&
652 !tp->urg_mode)
653 doing_tso = 1;
683 654
684 mss_now = tp->mss_cache_std;
685 if (dst) { 655 if (dst) {
686 u32 mtu = dst_mtu(dst); 656 u32 mtu = dst_mtu(dst);
687 if (mtu != tp->pmtu_cookie) 657 if (mtu != tp->pmtu_cookie)
688 mss_now = tcp_sync_mss(sk, mtu); 658 mss_now = tcp_sync_mss(sk, mtu);
689 } 659 }
690 660
691 do_large = (large && 661 if (tp->rx_opt.eff_sacks)
692 (sk->sk_route_caps & NETIF_F_TSO) && 662 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
693 !tp->urg_mode); 663 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
694 664
695 if (do_large) { 665 xmit_size_goal = mss_now;
696 unsigned int large_mss, factor, limit;
697 666
698 large_mss = 65535 - tp->af_specific->net_header_len - 667 if (doing_tso) {
668 xmit_size_goal = 65535 -
669 tp->af_specific->net_header_len -
699 tp->ext_header_len - tp->tcp_header_len; 670 tp->ext_header_len - tp->tcp_header_len;
700 671
701 if (tp->max_window && large_mss > (tp->max_window>>1)) 672 if (tp->max_window &&
702 large_mss = max((tp->max_window>>1), 673 (xmit_size_goal > (tp->max_window >> 1)))
703 68U - tp->tcp_header_len); 674 xmit_size_goal = max((tp->max_window >> 1),
675 68U - tp->tcp_header_len);
704 676
705 factor = large_mss / mss_now; 677 xmit_size_goal -= (xmit_size_goal % mss_now);
678 }
679 tp->xmit_size_goal = xmit_size_goal;
706 680
707 /* Always keep large mss multiple of real mss, but 681 return mss_now;
708 * do not exceed 1/tso_win_divisor of the congestion window 682}
709 * so we can keep the ACK clock ticking and minimize 683
710 * bursting. 684/* Congestion window validation. (RFC2861) */
711 */
712 limit = tp->snd_cwnd;
713 if (sysctl_tcp_tso_win_divisor)
714 limit /= sysctl_tcp_tso_win_divisor;
715 limit = max(1U, limit);
716 if (factor > limit)
717 factor = limit;
718 685
719 tp->mss_cache = mss_now * factor; 686static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
687{
688 __u32 packets_out = tp->packets_out;
689
690 if (packets_out >= tp->snd_cwnd) {
691 /* Network is fed fully. */
692 tp->snd_cwnd_used = 0;
693 tp->snd_cwnd_stamp = tcp_time_stamp;
694 } else {
695 /* Network starves. */
696 if (tp->packets_out > tp->snd_cwnd_used)
697 tp->snd_cwnd_used = tp->packets_out;
720 698
721 mss_now = tp->mss_cache; 699 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
700 tcp_cwnd_application_limited(sk);
722 } 701 }
702}
723 703
724 if (tp->rx_opt.eff_sacks) 704static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
725 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 705{
726 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 706 u32 window, cwnd_len;
727 return mss_now; 707
708 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
709 cwnd_len = mss_now * cwnd;
710 return min(window, cwnd_len);
711}
712
713/* Can at least one segment of SKB be sent right now, according to the
714 * congestion window rules? If so, return how many segments are allowed.
715 */
716static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
717{
718 u32 in_flight, cwnd;
719
720 /* Don't be strict about the congestion window for the final FIN. */
721 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
722 return 1;
723
724 in_flight = tcp_packets_in_flight(tp);
725 cwnd = tp->snd_cwnd;
726 if (in_flight < cwnd)
727 return (cwnd - in_flight);
728
729 return 0;
730}
731
732/* This must be invoked the first time we consider transmitting
733 * SKB onto the wire.
734 */
735static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
736{
737 int tso_segs = tcp_skb_pcount(skb);
738
739 if (!tso_segs ||
740 (tso_segs > 1 &&
741 skb_shinfo(skb)->tso_size != mss_now)) {
742 tcp_set_skb_tso_segs(sk, skb, mss_now);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violating Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames which sit in the middle of the
780 * write_queue (they have no chance to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb, cur_mss);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 if (skb->len != skb->data_len)
865 return tcp_fragment(sk, skb, len, mss_now);
866
867 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
868 if (unlikely(buff == NULL))
869 return -ENOMEM;
870
871 buff->truesize = nlen;
872 skb->truesize -= nlen;
873
874 /* Correct the sequence numbers. */
875 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
876 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
877 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
878
879 /* PSH and FIN should only be set in the second packet. */
880 flags = TCP_SKB_CB(skb)->flags;
881 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
882 TCP_SKB_CB(buff)->flags = flags;
883
884 /* This packet was never sent out yet, so no SACK bits. */
885 TCP_SKB_CB(buff)->sacked = 0;
886
887 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
888 skb_split(skb, buff, len);
889
890 /* Fix up tso_factor for both original and new SKB. */
891 tcp_set_skb_tso_segs(sk, skb, mss_now);
892 tcp_set_skb_tso_segs(sk, buff, mss_now);
893
894 /* Link BUFF into the send queue. */
895 skb_header_release(buff);
896 __skb_append(skb, buff);
897
898 return 0;
899}
900
901/* Try to defer sending, if possible, in order to minimize the amount
902 * of TSO splitting we do. View it as a kind of TSO Nagle test.
903 *
904 * This algorithm is from John Heffner.
905 */
906static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
907{
908 u32 send_win, cong_win, limit, in_flight;
909
910 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
911 return 0;
912
913 if (tp->ca_state != TCP_CA_Open)
914 return 0;
915
916 in_flight = tcp_packets_in_flight(tp);
917
918 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
919 (tp->snd_cwnd <= in_flight));
920
921 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
922
923 /* From in_flight test above, we know that cwnd > in_flight. */
924 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
925
926 limit = min(send_win, cong_win);
927
928 /* If sk_send_head can be sent fully now, just do it. */
929 if (skb->len <= limit)
930 return 0;
931
932 if (sysctl_tcp_tso_win_divisor) {
933 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
934
935 /* If at least some fraction of a window is available,
936 * just use it.
937 */
938 chunk /= sysctl_tcp_tso_win_divisor;
939 if (limit >= chunk)
940 return 0;
941 } else {
942 /* Different approach, try not to defer past a single
943 * ACK. Receiver should ACK every other full sized
944 * frame, so if we have space for more than 3 frames
945 * then send now.
946 */
947 if (limit > tcp_max_burst(tp) * tp->mss_cache)
948 return 0;
949 }
950
951 /* Ok, it looks like it is advisable to defer. */
952 return 1;
728} 953}
729 954
730/* This routine writes packets to the network. It advances the 955/* This routine writes packets to the network. It advances the
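tcp_tso_should_defer() above is the heart of the new TSO path: a multi-segment skb that cannot leave whole is held back until a worthwhile share of the window opens. Tracing it with hypothetical numbers (mss = 1460, snd_cwnd = 20, in_flight = 14, snd_wnd = 64240, a 16-segment skb at the send head, snd_nxt assumed in_flight segments past snd_una):

    u32 send_win = 64240 - 14 * 1460;       /* 43800: receiver-window space   */
    u32 cong_win = (20 - 14) * 1460;        /*  8760: congestion-window space */
    u32 limit    = min(send_win, cong_win); /*  8760 */

    /* skb->len (23360) > limit, so the frame cannot go out whole.
     * With sysctl_tcp_tso_win_divisor = 3: */
    u32 chunk = min(64240U, 20 * 1460U) / 3;        /* 29200 / 3 = 9733 */

    /* limit (8760) < chunk (9733): defer.  A couple more ACKs grow
     * cong_win past chunk and the frame goes out in one burst. */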
@@ -734,57 +959,142 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
734 * Returns 1, if no segments are in flight and we have queued segments, but 959 * Returns 1, if no segments are in flight and we have queued segments, but
735 * cannot send anything now because of SWS or another problem. 960 * cannot send anything now because of SWS or another problem.
736 */ 961 */
737int tcp_write_xmit(struct sock *sk, int nonagle) 962static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
738{ 963{
739 struct tcp_sock *tp = tcp_sk(sk); 964 struct tcp_sock *tp = tcp_sk(sk);
740 unsigned int mss_now; 965 struct sk_buff *skb;
966 unsigned int tso_segs, sent_pkts;
967 int cwnd_quota;
741 968
742 /* If we are closed, the bytes will have to remain here. 969 /* If we are closed, the bytes will have to remain here.
743 * In time closedown will finish, we empty the write queue and all 970 * In time closedown will finish, we empty the write queue and all
744 * will be happy. 971 * will be happy.
745 */ 972 */
746 if (sk->sk_state != TCP_CLOSE) { 973 if (unlikely(sk->sk_state == TCP_CLOSE))
747 struct sk_buff *skb; 974 return 0;
748 int sent_pkts = 0;
749 975
750 /* Account for SACKS, we may need to fragment due to this. 976 sent_pkts = 0;
751 * It is just like the real MSS changing on us midstream. 977 while ((skb = sk->sk_send_head)) {
752 * We also handle things correctly when the user adds some 978 unsigned int limit;
753 * IP options mid-stream. Silly to do, but cover it.
754 */
755 mss_now = tcp_current_mss(sk, 1);
756
757 while ((skb = sk->sk_send_head) &&
758 tcp_snd_test(sk, skb, mss_now,
759 tcp_skb_is_last(sk, skb) ? nonagle :
760 TCP_NAGLE_PUSH)) {
761 if (skb->len > mss_now) {
762 if (tcp_fragment(sk, skb, mss_now))
763 break;
764 }
765 979
766 TCP_SKB_CB(skb)->when = tcp_time_stamp; 980 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
767 tcp_tso_set_push(skb); 981 BUG_ON(!tso_segs);
768 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) 982
983 cwnd_quota = tcp_cwnd_test(tp, skb);
984 if (!cwnd_quota)
985 break;
986
987 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
988 break;
989
990 if (tso_segs == 1) {
991 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
992 (tcp_skb_is_last(sk, skb) ?
993 nonagle : TCP_NAGLE_PUSH))))
769 break; 994 break;
995 } else {
996 if (tcp_tso_should_defer(sk, tp, skb))
997 break;
998 }
770 999
771 /* Advance the send_head. This one is sent out. 1000 limit = mss_now;
772 * This call will increment packets_out. 1001 if (tso_segs > 1) {
773 */ 1002 limit = tcp_window_allows(tp, skb,
774 update_send_head(sk, tp, skb); 1003 mss_now, cwnd_quota);
1004
1005 if (skb->len < limit) {
1006 unsigned int trim = skb->len % mss_now;
775 1007
776 tcp_minshall_update(tp, mss_now, skb); 1008 if (trim)
777 sent_pkts = 1; 1009 limit = skb->len - trim;
1010 }
778 } 1011 }
779 1012
780 if (sent_pkts) { 1013 if (skb->len > limit &&
781 tcp_cwnd_validate(sk, tp); 1014 unlikely(tso_fragment(sk, skb, limit, mss_now)))
782 return 0; 1015 break;
1016
1017 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1018
1019 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
1020 break;
1021
1022 /* Advance the send_head. This one is sent out.
1023 * This call will increment packets_out.
1024 */
1025 update_send_head(sk, tp, skb);
1026
1027 tcp_minshall_update(tp, mss_now, skb);
1028 sent_pkts++;
1029 }
1030
1031 if (likely(sent_pkts)) {
1032 tcp_cwnd_validate(sk, tp);
1033 return 0;
1034 }
1035 return !tp->packets_out && sk->sk_send_head;
1036}
1037
1038/* Push out any pending frames which were held back due to
1039 * TCP_CORK or attempt at coalescing tiny packets.
1040 * The socket must be locked by the caller.
1041 */
1042void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1043 unsigned int cur_mss, int nonagle)
1044{
1045 struct sk_buff *skb = sk->sk_send_head;
1046
1047 if (skb) {
1048 if (tcp_write_xmit(sk, cur_mss, nonagle))
1049 tcp_check_probe_timer(sk, tp);
1050 }
1051}
1052
1053/* Send _single_ skb sitting at the send head. This function requires
1054 * true push pending frames to setup probe timer etc.
1055 */
1056void tcp_push_one(struct sock *sk, unsigned int mss_now)
1057{
1058 struct tcp_sock *tp = tcp_sk(sk);
1059 struct sk_buff *skb = sk->sk_send_head;
1060 unsigned int tso_segs, cwnd_quota;
1061
1062 BUG_ON(!skb || skb->len < mss_now);
1063
1064 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1065 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1066
1067 if (likely(cwnd_quota)) {
1068 unsigned int limit;
1069
1070 BUG_ON(!tso_segs);
1071
1072 limit = mss_now;
1073 if (tso_segs > 1) {
1074 limit = tcp_window_allows(tp, skb,
1075 mss_now, cwnd_quota);
1076
1077 if (skb->len < limit) {
1078 unsigned int trim = skb->len % mss_now;
1079
1080 if (trim)
1081 limit = skb->len - trim;
1082 }
783 } 1083 }
784 1084
785 return !tp->packets_out && sk->sk_send_head; 1085 if (skb->len > limit &&
1086 unlikely(tso_fragment(sk, skb, limit, mss_now)))
1087 return;
1088
1089 /* Send it out now. */
1090 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1091
1092 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1093 update_send_head(sk, tp, skb);
1094 tcp_cwnd_validate(sk, tp);
1095 return;
1096 }
786 } 1097 }
787 return 0;
788} 1098}
789 1099
790/* This function returns the amount that we can raise the 1100/* This function returns the amount that we can raise the
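Both tcp_write_xmit() and tcp_push_one() above trim a multi-segment send to an MSS boundary before calling tso_fragment(), so a sub-MSS tail is never emitted mid-burst. With hypothetical sizes:

    /* skb->len = 5000, mss_now = 1460, window allows 8000 bytes */
    unsigned int limit = 8000;              /* from tcp_window_allows() */
    if (5000 < limit) {
            unsigned int trim = 5000 % 1460;        /* 620 */
            if (trim)
                    limit = 5000 - trim;            /* 4380: 3 full segments */
    }
    /* skb->len (5000) > limit (4380): tso_fragment() splits off the
     * 620-byte tail, which stays queued to coalesce with later writes. */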
@@ -1044,7 +1354,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1044 if (sk->sk_route_caps & NETIF_F_TSO) { 1354 if (sk->sk_route_caps & NETIF_F_TSO) {
1045 sk->sk_route_caps &= ~NETIF_F_TSO; 1355 sk->sk_route_caps &= ~NETIF_F_TSO;
1046 sock_set_flag(sk, SOCK_NO_LARGESEND); 1356 sock_set_flag(sk, SOCK_NO_LARGESEND);
1047 tp->mss_cache = tp->mss_cache_std;
1048 } 1357 }
1049 1358
1050 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1359 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1062,15 +1371,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1062 1371
1063 if (skb->len > cur_mss) { 1372 if (skb->len > cur_mss) {
1064 int old_factor = tcp_skb_pcount(skb); 1373 int old_factor = tcp_skb_pcount(skb);
1065 int new_factor; 1374 int diff;
1066 1375
1067 if (tcp_fragment(sk, skb, cur_mss)) 1376 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1068 return -ENOMEM; /* We'll try again later. */ 1377 return -ENOMEM; /* We'll try again later. */
1069 1378
1070 /* New SKB created, account for it. */ 1379 /* New SKB created, account for it. */
1071 new_factor = tcp_skb_pcount(skb); 1380 diff = old_factor - tcp_skb_pcount(skb) -
1072 tp->packets_out -= old_factor - new_factor; 1381 tcp_skb_pcount(skb->next);
1073 tp->packets_out += tcp_skb_pcount(skb->next); 1382 tp->packets_out -= diff;
1383
1384 if (diff > 0) {
1385 tp->fackets_out -= diff;
1386 if ((int)tp->fackets_out < 0)
1387 tp->fackets_out = 0;
1388 }
1074 } 1389 }
1075 1390
1076 /* Collapse two adjacent packets if worthwhile and we can. */ 1391 /* Collapse two adjacent packets if worthwhile and we can. */
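The retransmit hunk above folds the old two-step packets_out fixup into one signed diff and, new in this series, repairs fackets_out too. A trace with hypothetical segment counts:

    /* A 5000-byte skb counted as 5 segments at an old MSS of 1000.
     * The path MTU rose, cur_mss is now 1460, and tcp_fragment()
     * splits at 1460 bytes: */
    int old_factor = 5;
    int diff = old_factor
             - 1        /* tcp_skb_pcount(skb):       1460 bytes */
             - 3;       /* tcp_skb_pcount(skb->next): 3540 bytes */
    /* diff == 1: packets_out and fackets_out each drop by one,
     * and fackets_out is clamped so it never goes negative. */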
@@ -1106,7 +1421,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1106 * is still in somebody's hands, else make a clone. 1421 * is still in somebody's hands, else make a clone.
1107 */ 1422 */
1108 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1423 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1109 tcp_tso_set_push(skb);
1110 1424
1111 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1425 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1112 pskb_copy(skb, GFP_ATOMIC): 1426 pskb_copy(skb, GFP_ATOMIC):
@@ -1290,7 +1604,7 @@ void tcp_send_fin(struct sock *sk)
1290 * was unread data in the receive queue. This behavior is recommended 1604 * was unread data in the receive queue. This behavior is recommended
1291 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM 1605 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1292 */ 1606 */
1293void tcp_send_active_reset(struct sock *sk, int priority) 1607void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
1294{ 1608{
1295 struct tcp_sock *tp = tcp_sk(sk); 1609 struct tcp_sock *tp = tcp_sk(sk);
1296 struct sk_buff *skb; 1610 struct sk_buff *skb;
@@ -1356,8 +1670,9 @@ int tcp_send_synack(struct sock *sk)
1356 * Prepare a SYN-ACK. 1670 * Prepare a SYN-ACK.
1357 */ 1671 */
1358struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, 1672struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1359 struct open_request *req) 1673 struct request_sock *req)
1360{ 1674{
1675 struct inet_request_sock *ireq = inet_rsk(req);
1361 struct tcp_sock *tp = tcp_sk(sk); 1676 struct tcp_sock *tp = tcp_sk(sk);
1362 struct tcphdr *th; 1677 struct tcphdr *th;
1363 int tcp_header_size; 1678 int tcp_header_size;
@@ -1373,47 +1688,47 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1373 skb->dst = dst_clone(dst); 1688 skb->dst = dst_clone(dst);
1374 1689
1375 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + 1690 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1376 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + 1691 (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1377 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + 1692 (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1378 /* SACK_PERM is in the place of NOP NOP of TS */ 1693 /* SACK_PERM is in the place of NOP NOP of TS */
1379 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); 1694 ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1380 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); 1695 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1381 1696
1382 memset(th, 0, sizeof(struct tcphdr)); 1697 memset(th, 0, sizeof(struct tcphdr));
1383 th->syn = 1; 1698 th->syn = 1;
1384 th->ack = 1; 1699 th->ack = 1;
1385 if (dst->dev->features&NETIF_F_TSO) 1700 if (dst->dev->features&NETIF_F_TSO)
1386 req->ecn_ok = 0; 1701 ireq->ecn_ok = 0;
1387 TCP_ECN_make_synack(req, th); 1702 TCP_ECN_make_synack(req, th);
1388 th->source = inet_sk(sk)->sport; 1703 th->source = inet_sk(sk)->sport;
1389 th->dest = req->rmt_port; 1704 th->dest = ireq->rmt_port;
1390 TCP_SKB_CB(skb)->seq = req->snt_isn; 1705 TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
1391 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; 1706 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1392 TCP_SKB_CB(skb)->sacked = 0; 1707 TCP_SKB_CB(skb)->sacked = 0;
1393 skb_shinfo(skb)->tso_segs = 1; 1708 skb_shinfo(skb)->tso_segs = 1;
1394 skb_shinfo(skb)->tso_size = 0; 1709 skb_shinfo(skb)->tso_size = 0;
1395 th->seq = htonl(TCP_SKB_CB(skb)->seq); 1710 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1396 th->ack_seq = htonl(req->rcv_isn + 1); 1711 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
1397 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 1712 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1398 __u8 rcv_wscale; 1713 __u8 rcv_wscale;
1399 /* Set this up on the first call only */ 1714 /* Set this up on the first call only */
1400 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 1715 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1401 /* tcp_full_space because it is guaranteed to be the first packet */ 1716 /* tcp_full_space because it is guaranteed to be the first packet */
1402 tcp_select_initial_window(tcp_full_space(sk), 1717 tcp_select_initial_window(tcp_full_space(sk),
1403 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 1718 dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1404 &req->rcv_wnd, 1719 &req->rcv_wnd,
1405 &req->window_clamp, 1720 &req->window_clamp,
1406 req->wscale_ok, 1721 ireq->wscale_ok,
1407 &rcv_wscale); 1722 &rcv_wscale);
1408 req->rcv_wscale = rcv_wscale; 1723 ireq->rcv_wscale = rcv_wscale;
1409 } 1724 }
1410 1725
1411 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 1726 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1412 th->window = htons(req->rcv_wnd); 1727 th->window = htons(req->rcv_wnd);
1413 1728
1414 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1729 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1415 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok, 1730 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
1416 req->sack_ok, req->wscale_ok, req->rcv_wscale, 1731 ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
1417 TCP_SKB_CB(skb)->when, 1732 TCP_SKB_CB(skb)->when,
1418 req->ts_recent); 1733 req->ts_recent);
1419 1734
@@ -1448,7 +1763,6 @@ static inline void tcp_connect_init(struct sock *sk)
1448 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1763 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1449 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1764 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1450 tcp_initialize_rcv_mss(sk); 1765 tcp_initialize_rcv_mss(sk);
1451 tcp_ca_init(tp);
1452 1766
1453 tcp_select_initial_window(tcp_full_space(sk), 1767 tcp_select_initial_window(tcp_full_space(sk),
1454 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1768 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1502,7 +1816,6 @@ int tcp_connect(struct sock *sk)
1502 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1816 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1503 tp->snd_nxt = tp->write_seq; 1817 tp->snd_nxt = tp->write_seq;
1504 tp->pushed_seq = tp->write_seq; 1818 tp->pushed_seq = tp->write_seq;
1505 tcp_ca_init(tp);
1506 1819
1507 /* Send it off. */ 1820 /* Send it off. */
1508 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1821 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1669,21 +1982,19 @@ int tcp_write_wakeup(struct sock *sk)
1669 skb->len > mss) { 1982 skb->len > mss) {
1670 seg_size = min(seg_size, mss); 1983 seg_size = min(seg_size, mss);
1671 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1984 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1672 if (tcp_fragment(sk, skb, seg_size)) 1985 if (tcp_fragment(sk, skb, seg_size, mss))
1673 return -1; 1986 return -1;
1674 /* SWS override triggered forced fragmentation. 1987 /* SWS override triggered forced fragmentation.
1675 * Disable TSO, the connection is too sick. */ 1988 * Disable TSO, the connection is too sick. */
1676 if (sk->sk_route_caps & NETIF_F_TSO) { 1989 if (sk->sk_route_caps & NETIF_F_TSO) {
1677 sock_set_flag(sk, SOCK_NO_LARGESEND); 1990 sock_set_flag(sk, SOCK_NO_LARGESEND);
1678 sk->sk_route_caps &= ~NETIF_F_TSO; 1991 sk->sk_route_caps &= ~NETIF_F_TSO;
1679 tp->mss_cache = tp->mss_cache_std;
1680 } 1992 }
1681 } else if (!tcp_skb_pcount(skb)) 1993 } else if (!tcp_skb_pcount(skb))
1682 tcp_set_skb_tso_segs(sk, skb); 1994 tcp_set_skb_tso_segs(sk, skb, mss);
1683 1995
1684 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1996 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1685 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1997 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1686 tcp_tso_set_push(skb);
1687 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 1998 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1688 if (!err) { 1999 if (!err) {
1689 update_send_head(sk, tp, skb); 2000 update_send_head(sk, tp, skb);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000000..70e108e15c71
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
1/* Tom Kelly's Scalable TCP
2 *
3 * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
4 *
5 * John Heffner <jheffner@psc.edu>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <net/tcp.h>
11
12/* These factors are derived from the recommended values in the paper:
13 * .01 and 7/8. We use 50 instead of 100 to account for
14 * delayed ack.
15 */
16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3
18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
20 u32 in_flight, int flag)
21{
22 if (in_flight < tp->snd_cwnd)
23 return;
24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) {
26 tp->snd_cwnd++;
27 } else {
28 tp->snd_cwnd_cnt++;
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 tp->snd_cwnd++;
31 tp->snd_cwnd_cnt = 0;
32 }
33 }
34 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
35 tp->snd_cwnd_stamp = tcp_time_stamp;
36}
37
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
39{
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41}
42
43
44static struct tcp_congestion_ops tcp_scalable = {
45 .ssthresh = tcp_scalable_ssthresh,
46 .cong_avoid = tcp_scalable_cong_avoid,
47 .min_cwnd = tcp_reno_min_cwnd,
48
49 .owner = THIS_MODULE,
50 .name = "scalable",
51};
52
53static int __init tcp_scalable_register(void)
54{
55 return tcp_register_congestion_control(&tcp_scalable);
56}
57
58static void __exit tcp_scalable_unregister(void)
59{
60 tcp_unregister_congestion_control(&tcp_scalable);
61}
62
63module_init(tcp_scalable_register);
64module_exit(tcp_scalable_unregister);
65
66MODULE_AUTHOR("John Heffner");
67MODULE_LICENSE("GPL");
68MODULE_DESCRIPTION("Scalable TCP");
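The two hooks above are the whole algorithm: a fixed-rate additive increase and a gentle 1/8 multiplicative decrease. A condensed trace with hypothetical values (plain C, not the kernel's min()/max() macros):

    u32 snd_cwnd = 200, snd_cwnd_cnt = 50;

    /* One more ACK above ssthresh: */
    if (++snd_cwnd_cnt > (snd_cwnd < 50 ? snd_cwnd : 50)) { /* 51 > 50 */
            snd_cwnd++;                                     /* 201 */
            snd_cwnd_cnt = 0;
    }

    /* On loss, back off by 1/8 instead of Reno's 1/2: */
    u32 ssthresh = snd_cwnd - (snd_cwnd >> 3);              /* 201 - 25 = 176 */

Past a window of 50, cwnd grows by one segment per ~50 ACKs regardless of its size, which is what makes the increase rate scale to very large windows.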
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 799ebe061e2c..0084227438c2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
231 } 231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER; 232 tp->ack.pending &= ~TCP_ACK_TIMER;
233 233
234 if (skb_queue_len(&tp->ucopy.prequeue)) { 234 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb; 235 struct sk_buff *skb;
236 236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 237 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
238 skb_queue_len(&tp->ucopy.prequeue));
239 238
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb); 240 sk->sk_backlog_rcv(sk, skb);
@@ -464,11 +463,11 @@ out_unlock:
464static void tcp_synack_timer(struct sock *sk) 463static void tcp_synack_timer(struct sock *sk)
465{ 464{
466 struct tcp_sock *tp = tcp_sk(sk); 465 struct tcp_sock *tp = tcp_sk(sk);
467 struct tcp_listen_opt *lopt = tp->listen_opt; 466 struct listen_sock *lopt = tp->accept_queue.listen_opt;
468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; 467 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
469 int thresh = max_retries; 468 int thresh = max_retries;
470 unsigned long now = jiffies; 469 unsigned long now = jiffies;
471 struct open_request **reqp, *req; 470 struct request_sock **reqp, *req;
472 int i, budget; 471 int i, budget;
473 472
474 if (lopt == NULL || lopt->qlen == 0) 473 if (lopt == NULL || lopt->qlen == 0)
@@ -513,8 +512,8 @@ static void tcp_synack_timer(struct sock *sk)
513 while ((req = *reqp) != NULL) { 512 while ((req = *reqp) != NULL) {
514 if (time_after_eq(now, req->expires)) { 513 if (time_after_eq(now, req->expires)) {
515 if ((req->retrans < thresh || 514 if ((req->retrans < thresh ||
516 (req->acked && req->retrans < max_retries)) 515 (inet_rsk(req)->acked && req->retrans < max_retries))
517 && !req->class->rtx_syn_ack(sk, req, NULL)) { 516 && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
518 unsigned long timeo; 517 unsigned long timeo;
519 518
520 if (req->retrans++ == 0) 519 if (req->retrans++ == 0)
@@ -527,13 +526,9 @@ static void tcp_synack_timer(struct sock *sk)
527 } 526 }
528 527
529 /* Drop this request */ 528 /* Drop this request */
530 write_lock(&tp->syn_wait_lock); 529 tcp_synq_unlink(tp, req, reqp);
531 *reqp = req->dl_next; 530 reqsk_queue_removed(&tp->accept_queue, req);
532 write_unlock(&tp->syn_wait_lock); 531 reqsk_free(req);
533 lopt->qlen--;
534 if (req->retrans == 0)
535 lopt->qlen_young--;
536 tcp_openreq_free(req);
537 continue; 532 continue;
538 } 533 }
539 reqp = &req->dl_next; 534 reqp = &req->dl_next;
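The drop path above collapses the hand-rolled unlink, qlen/qlen_young bookkeeping and tcp_openreq_free() into three generic helpers. A sketch of what reqsk_queue_removed() presumably absorbs, inferred from the fields the deleted lines touched:

    static inline void reqsk_queue_removed(struct request_sock_queue *queue,
                                           const struct request_sock *req)
    {
            struct listen_sock *lopt = queue->listen_opt;

            if (req->retrans == 0)
                    --lopt->qlen_young;     /* never yet retransmitted */
            --lopt->qlen;
    }

reqsk_free() would then invoke req->rsk_ops->destructor and return the object to the slab installed via the new .rsk_prot member, replacing the TCP-specific tcp_openreq_free().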
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000000..9bd443db5193
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 */
33
34#include <linux/config.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h>
39
40#include <net/tcp.h>
41
42/* Default values of the Vegas variables, in fixed-point representation
43 * with V_PARAM_SHIFT bits to the right of the binary point.
44 */
45#define V_PARAM_SHIFT 1
46static int alpha = 1<<V_PARAM_SHIFT;
47static int beta = 3<<V_PARAM_SHIFT;
48static int gamma = 1<<V_PARAM_SHIFT;
49
50module_param(alpha, int, 0644);
51MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
52module_param(beta, int, 0644);
53MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
54module_param(gamma, int, 0644);
55MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
56
57
58/* Vegas variables */
59struct vegas {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now; /* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67};
68
69/* There are several situations when we must "re-start" Vegas:
70 *
71 * o when a connection is established
72 * o after an RTO
73 * o after fast recovery
74 * o when we send a packet and there is no outstanding
75 * unacknowledged data (restarting an idle connection)
76 *
77 * In these circumstances we cannot do a Vegas calculation at the
78 * end of the first RTT, because any calculation we do is using
79 * stale info -- both the saved cwnd and congestion feedback are
80 * stale.
81 *
82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs.
84 */
85static inline void vegas_enable(struct tcp_sock *tp)
86{
87 struct vegas *vegas = tcp_ca(tp);
88
89 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1;
91
92 /* Set the beginning of the next send window. */
93 vegas->beg_snd_nxt = tp->snd_nxt;
94
95 vegas->cntRTT = 0;
96 vegas->minRTT = 0x7fffffff;
97}
98
99/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp)
101{
102 struct vegas *vegas = tcp_ca(tp);
103
104 vegas->doing_vegas_now = 0;
105}
106
107static void tcp_vegas_init(struct tcp_sock *tp)
108{
109 struct vegas *vegas = tcp_ca(tp);
110
111 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp);
113}
114
115/* Do RTT sampling needed for Vegas.
116 * Basically we:
117 * o min-filter RTT samples from within an RTT to get the current
118 * propagation delay + queuing delay (we are min-filtering to try to
119 * avoid the effects of delayed ACKs)
120 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT)
122 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
124{
125 struct vegas *vegas = tcp_ca(tp);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127
128 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT)
130 vegas->baseRTT = vrtt;
131
132 /* Find the min RTT during the last RTT to find
133 * the current prop. delay + queuing delay:
134 */
135 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++;
137}
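/* Worked illustration of the min-filtering above (not part of the
 * original source): if the samples seen during one RTT are roughly
 * 120, 250 and 130 usec, the 250 usec reading inflated by a delayed
 * ACK is discarded and minRTT ends the window near 120 usec, while
 * baseRTT only ever ratchets downward over the connection lifetime.
 */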
138
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
140{
141
142 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp);
144 else
145 vegas_disable(tp);
146}
147
148/*
149 * If the connection is idle and we are restarting,
150 * then we don't want to do any Vegas calculations
151 * until we get fresh RTT samples. So when we
152 * restart, we reset our Vegas state to a clean
153 * slate. After we get acks for this flight of
154 * packets, _then_ we can make Vegas calculations
155 * again.
156 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
158{
159 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp);
162}
163
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag)
166{
167 struct vegas *vegas = tcp_ca(tp);
168
169 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
171
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 *
174 * These are so named because they represent the approximate values
175 * of snd_una and snd_nxt at the beginning of the current RTT. More
176 * precisely, they represent the amount of data sent during the RTT.
177 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
178 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
179 * bytes of data have been ACKed during the course of the RTT, giving
180 * an "actual" rate of:
181 *
182 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
183 *
184 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
185 * because delayed ACKs can cover more than one segment, so they
186 * don't line up nicely with the boundaries of RTTs.
187 *
188 * Another unfortunate fact of life is that delayed ACKs delay the
189 * advance of the left edge of our send window, so that the number
190 * of bytes we send in an RTT is often less than our cwnd will allow.
191 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
192 */
193
194 if (after(ack, vegas->beg_snd_nxt)) {
195 /* Do the Vegas once-per-RTT cwnd adjustment. */
196 u32 old_wnd, old_snd_cwnd;
197
198
199 /* Here old_wnd is essentially the window of data that was
200 * sent during the previous RTT, and has all
201 * been acknowledged in the course of the RTT that ended
202 * with the ACK we just received. Likewise, old_snd_cwnd
203 * is the cwnd during the previous RTT.
204 */
205 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
206 tp->mss_cache;
207 old_snd_cwnd = vegas->beg_snd_cwnd;
208
209 /* Save the extent of the current window so we can use this
210 * at the end of the next RTT.
211 */
212 vegas->beg_snd_una = vegas->beg_snd_nxt;
213 vegas->beg_snd_nxt = tp->snd_nxt;
214 vegas->beg_snd_cwnd = tp->snd_cwnd;
215
216 /* Take into account the current RTT sample too, to
217 * decrease the impact of delayed acks. This double counts
218 * this sample since we count it for the next window as well,
219 * but that's not too awful, since we're taking the min,
220 * rather than averaging.
221 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000);
223
224 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got
226 * at least one RTT sample that wasn't from a delayed ACK.
227 * If we only had 2 samples total,
228 * then that means we're getting only 1 ACK per RTT, which
229 * means they're almost certainly delayed ACKs.
230 * If we have 3 samples, we should be OK.
231 */
232
233 if (vegas->cntRTT <= 2) {
234 /* We don't have enough RTT samples to do the Vegas
235 * calculation, so we'll behave like Reno.
236 */
237 if (tp->snd_cwnd > tp->snd_ssthresh)
238 tp->snd_cwnd++;
239 } else {
240 u32 rtt, target_cwnd, diff;
241
242 /* We have enough RTT samples, so, using the Vegas
243 * algorithm, we determine if we should increase or
244 * decrease cwnd, and by how much.
245 */
246
247 /* Pluck out the RTT we are using for the Vegas
248 * calculations. This is the min RTT seen during the
249 * last RTT. Taking the min filters out the effects
250 * of delayed ACKs, at the cost of noticing congestion
251 * a bit later.
252 */
253 rtt = vegas->minRTT;
254
255 /* Calculate the cwnd we should have, if we weren't
256 * going too fast.
257 *
258 * This is:
259 * (actual rate in segments) * baseRTT
260 * We keep it as a fixed point number with
261 * V_PARAM_SHIFT bits to the right of the binary point.
262 */
263 target_cwnd = ((old_wnd * vegas->baseRTT)
264 << V_PARAM_SHIFT) / rtt;
265
266 /* Calculate the difference between the window we had,
267 * and the window we would like to have. This quantity
268 * is the "Diff" from the Arizona Vegas papers.
269 *
270 * Again, this is a fixed point number with
271 * V_PARAM_SHIFT bits to the right of the binary
272 * point.
273 */
274 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
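/* Hypothetical worked example (not part of the original source):
 * suppose old_wnd = 10 segments, baseRTT = 100 ms and the min RTT
 * seen during the last RTT is rtt = 125 ms. Then, in fixed point,
 *   target_cwnd = (10 * 100 << 1) / 125 = 16   (8.0 segments)
 *   diff        = (10 << 1) - 16        = 4    (2.0 segments)
 * i.e. about two extra segments are queued in the network, which with
 * the default alpha = 1 and beta = 3 packets leaves cwnd unchanged in
 * congestion avoidance.
 */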
275
276 if (tp->snd_cwnd < tp->snd_ssthresh) {
277 /* Slow start. */
278 if (diff > gamma) {
279 /* Going too fast. Time to slow down
280 * and switch to congestion avoidance.
281 */
282 tp->snd_ssthresh = 2;
283
284 /* Set cwnd to match the actual rate
285 * exactly:
286 * cwnd = (actual rate) * baseRTT
287 * Then we add 1 because the integer
288 * truncation robs us of full link
289 * utilization.
290 */
291 tp->snd_cwnd = min(tp->snd_cwnd,
292 (target_cwnd >>
293 V_PARAM_SHIFT)+1);
294
295 }
296 } else {
297 /* Congestion avoidance. */
298 u32 next_snd_cwnd;
299
300 /* Figure out where we would like cwnd
301 * to be.
302 */
303 if (diff > beta) {
304 /* The old window was too fast, so
305 * we slow down.
306 */
307 next_snd_cwnd = old_snd_cwnd - 1;
308 } else if (diff < alpha) {
309 /* We don't have enough extra packets
310 * in the network, so speed up.
311 */
312 next_snd_cwnd = old_snd_cwnd + 1;
313 } else {
314 /* Sending just as fast as we
315 * should be.
316 */
317 next_snd_cwnd = old_snd_cwnd;
318 }
319
320 /* Adjust cwnd upward or downward, toward the
321 * desired value.
322 */
323 if (next_snd_cwnd > tp->snd_cwnd)
324 tp->snd_cwnd++;
325 else if (next_snd_cwnd < tp->snd_cwnd)
326 tp->snd_cwnd--;
327 }
328 }
329
330 /* Wipe the slate clean for the next RTT. */
331 vegas->cntRTT = 0;
332 vegas->minRTT = 0x7fffffff;
333 }
334
335 /* The following code is executed for every ack we receive,
336 * except for conditions checked in should_advance_cwnd()
337 * before the call to tcp_cong_avoid(). Mainly this means that
338 * we only execute this code if the ack actually acked some
339 * data.
340 */
341
342 /* If we are in slow start, increase our cwnd in response to this ACK.
343 * (If we are not in slow start then we are in congestion avoidance,
344 * and adjust our congestion window only once per RTT. See the code
345 * above.)
346 */
347 if (tp->snd_cwnd <= tp->snd_ssthresh)
348 tp->snd_cwnd++;
349
350 /* to keep cwnd from growing without bound */
351 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
352
353 /* Make sure that we are never so timid as to reduce our cwnd below
354 * 2 MSS.
355 *
356 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
357 */
358 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
359}
360
361/* Extract Vegas info for the TCP socket diagnostics exported via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
363 struct sk_buff *skb)
364{
365 const struct vegas *ca = tcp_ca(tp);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
367 struct tcpvegas_info *info;
368
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
370 sizeof(*info)));
371
372 info->tcpv_enabled = ca->doing_vegas_now;
373 info->tcpv_rttcnt = ca->cntRTT;
374 info->tcpv_rtt = ca->baseRTT;
375 info->tcpv_minrtt = ca->minRTT;
376 rtattr_failure: ;
377 }
378}
379
380static struct tcp_congestion_ops tcp_vegas = {
381 .init = tcp_vegas_init,
382 .ssthresh = tcp_reno_ssthresh,
383 .cong_avoid = tcp_vegas_cong_avoid,
384 .min_cwnd = tcp_reno_min_cwnd,
385 .rtt_sample = tcp_vegas_rtt_calc,
386 .set_state = tcp_vegas_state,
387 .cwnd_event = tcp_vegas_cwnd_event,
388 .get_info = tcp_vegas_get_info,
389
390 .owner = THIS_MODULE,
391 .name = "vegas",
392};
393
394static int __init tcp_vegas_register(void)
395{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas);
398 return 0;
399}
400
401static void __exit tcp_vegas_unregister(void)
402{
403 tcp_unregister_congestion_control(&tcp_vegas);
404}
405
406module_init(tcp_vegas_register);
407module_exit(tcp_vegas_unregister);
408
409MODULE_AUTHOR("Stephen Hemminger");
410MODULE_LICENSE("GPL");
411MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 000000000000..ef827242c940
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
1/*
2 * TCP Westwood+
3 *
4 * Angelo Dell'Aera: TCP Westwood+ support
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h>
12#include <net/tcp.h>
13
14/* TCP Westwood structure */
15struct westwood {
16 u32 bw_ns_est; /* first bandwidth estimation... not too smoothed 8) */
17 u32 bw_est; /* bandwidth estimate */
18 u32 rtt_win_sx; /* here starts a new evaluation... */
19 u32 bk;
20 u32 snd_una; /* used for evaluating the number of acked bytes */
21 u32 cumul_ack;
22 u32 accounted;
23 u32 rtt;
24 u32 rtt_min; /* minimum observed RTT */
25};
26
27
28/* TCP Westwood functions and constants */
29#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
30#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31
32/*
33 * @tcp_westwood_create
34 * This function initializes fields used in TCP Westwood+.
35 * It is called after the initial SYN, so the sequence numbers
36 * are correct, but for new passive connections we have no
37 * information about RTTmin at this time, so we simply set it to
38 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
39 * conservative, since that way we can be sure it will be updated in
40 * a consistent way as soon as possible. That will reasonably happen
41 * within the first RTT period of the connection lifetime.
42 */
43static void tcp_westwood_init(struct tcp_sock *tp)
44{
45 struct westwood *w = tcp_ca(tp);
46
47 w->bk = 0;
48 w->bw_ns_est = 0;
49 w->bw_est = 0;
50 w->accounted = 0;
51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una;
55}
56
57/*
58 * @westwood_do_filter
59 * Low-pass filter. Implemented using constant coefficients.
60 */
61static inline u32 westwood_do_filter(u32 a, u32 b)
62{
63 return (((7 * a) + b) >> 3);
64}
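/* Editor's note (not part of the original source): westwood_do_filter()
 * is an exponentially weighted moving average with gain 1/8, i.e.
 * new = old + (sample - old) / 8. Starting from 0 and feeding a
 * constant sample of 800 yields 100, 187, 263, ... converging on 800
 * with a time constant of roughly eight samples.
 */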
65
66static inline void westwood_filter(struct westwood *w, u32 delta)
67{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
70}
71
72/*
73 * @westwood_pkts_acked
74 * Called after processing a group of packets,
75 * but all Westwood needs is the last sample of srtt.
76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
78{
79 struct westwood *w = tcp_ca(tp);
80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3;
82}
83
84/*
85 * @westwood_update_window
86 * It updates the RTT evaluation window if it is the right moment to
87 * do so; if it is, it calls the filter to evaluate the bandwidth.
88 */
89static void westwood_update_window(struct tcp_sock *tp)
90{
91 struct westwood *w = tcp_ca(tp);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93
94 /*
95 * See if an RTT window has passed.
96 * Be careful: if the RTT is less than 50ms,
97 * we don't filter but keep 'building the sample',
98 * since a bandwidth estimate over such a small
99 * time interval is better avoided.
100 * Obviously, on a LAN we will reasonably always have
101 * right_bound = left_bound + WESTWOOD_RTT_MIN.
102 */
103 if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
104 westwood_filter(w, delta);
105
106 w->bk = 0;
107 w->rtt_win_sx = tcp_time_stamp;
108 }
109}
110
111/*
112 * @westwood_fast_bw
113 * Called when we are in the fast path, in particular when
114 * header prediction is successful. In that case the update is
115 * straightforward and doesn't need any particular care.
116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp)
118{
119 struct westwood *w = tcp_ca(tp);
120
121 westwood_update_window(tp);
122
123 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una;
125 w->rtt_min = min(w->rtt, w->rtt_min);
126}
127
128/*
129 * @westwood_acked_count
130 * This function evaluates cumul_ack, which is used to update bk,
131 * in the case of delayed or partial ACKs.
132 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp)
134{
135 struct westwood *w = tcp_ca(tp);
136
137 w->cumul_ack = tp->snd_una - w->snd_una;
138
139 /* If cumul_ack is 0 this is a dupack since it's not moving
140 * tp->snd_una.
141 */
142 if (!w->cumul_ack) {
143 w->accounted += tp->mss_cache;
144 w->cumul_ack = tp->mss_cache;
145 }
146
147 if (w->cumul_ack > tp->mss_cache) {
148 /* Partial or delayed ack */
149 if (w->accounted >= w->cumul_ack) {
150 w->accounted -= w->cumul_ack;
151 w->cumul_ack = tp->mss_cache;
152 } else {
153 w->cumul_ack -= w->accounted;
154 w->accounted = 0;
155 }
156 }
157
158 w->snd_una = tp->snd_una;
159
160 return w->cumul_ack;
161}
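/* Hypothetical worked example (not part of the original source): with
 * mss_cache = 1000 bytes, a duplicate ACK leaves snd_una unmoved, so
 * one MSS is credited to both accounted and cumul_ack. If a later
 * cumulative ACK advances snd_una by 3000 bytes, the 1000 bytes
 * already accounted for are subtracted and only 2000 are reported,
 * keeping bk from double-counting data acknowledged by dupacks.
 */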
162
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
164{
165 struct westwood *w = tcp_ca(tp);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167}
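/* Hypothetical worked example (not part of the original source): with
 * bw_est = 250 bytes/jiffy, rtt_min = 20 jiffies and mss_cache = 1000,
 * the limit is (250 * 20) / 1000 = 5 segments; the max_t() clamp keeps
 * the result from ever dropping below two segments.
 */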
168
169/*
170 * TCP Westwood
171 * Here the limit is computed as bandwidth estimate * RTTmin (we use
172 * mss_cache to obtain it in packets). The result is clamped to be
173 * >= 2, so this never returns 0.
174 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
176{
177 return westwood_bw_rttmin(tp);
178}
179
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
181{
182 struct westwood *w = tcp_ca(tp);
183
184 switch(event) {
185 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp);
187 break;
188
189 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
191 break;
192
193 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp);
195 break;
196
197 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp);
199 w->bk += westwood_acked_count(tp);
200 w->rtt_min = min(w->rtt, w->rtt_min);
201 break;
202
203 default:
204 /* don't care */
205 break;
206 }
207}
208
209
210/* Extract Westwood info for the TCP socket diagnostics exported via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
212 struct sk_buff *skb)
213{
214 const struct westwood *ca = tcp_ca(tp);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
216 struct rtattr *rta;
217 struct tcpvegas_info *info;
218
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0;
223 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
224 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
225 rtattr_failure: ;
226 }
227}
228
229
230static struct tcp_congestion_ops tcp_westwood = {
231 .init = tcp_westwood_init,
232 .ssthresh = tcp_reno_ssthresh,
233 .cong_avoid = tcp_reno_cong_avoid,
234 .min_cwnd = tcp_westwood_cwnd_min,
235 .cwnd_event = tcp_westwood_event,
236 .get_info = tcp_westwood_info,
237 .pkts_acked = tcp_westwood_pkts_acked,
238
239 .owner = THIS_MODULE,
240 .name = "westwood"
241};
242
243static int __init tcp_westwood_register(void)
244{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood);
247}
248
249static void __exit tcp_westwood_unregister(void)
250{
251 tcp_unregister_congestion_control(&tcp_westwood);
252}
253
254module_init(tcp_westwood_register);
255module_exit(tcp_westwood_unregister);
256
257MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
258MODULE_LICENSE("GPL");
259MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7c24e64b443f..dc4d07357e3a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -628,7 +628,7 @@ back_from_confirm:
 	/* ... which is an evident application bug. --ANK */
 	release_sock(sk);
 
-	NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
+	LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
 	err = -EINVAL;
 	goto out;
 }
@@ -693,7 +693,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
 	if (unlikely(!up->pending)) {
 		release_sock(sk);
 
-		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
+		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n"));
 		return -EINVAL;
 	}
 
@@ -1102,7 +1102,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
 			return 0;
-		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
 		skb->ip_summed = CHECKSUM_NONE;
 	}
 	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,14 +1181,13 @@ int udp_rcv(struct sk_buff *skb)
 	return(0);
 
 short_packet:
-	NETDEBUG(if (net_ratelimit())
-		 printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
-			NIPQUAD(saddr),
-			ntohs(uh->source),
-			ulen,
-			len,
-			NIPQUAD(daddr),
-			ntohs(uh->dest)));
+	LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+		       NIPQUAD(saddr),
+		       ntohs(uh->source),
+		       ulen,
+		       len,
+		       NIPQUAD(daddr),
+		       ntohs(uh->dest)));
 no_header:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 	kfree_skb(skb);
@@ -1199,13 +1198,12 @@ csum_error:
 	 * RFC1122: OK.  Discards the bad packet silently (as far as
 	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
 	 */
-	NETDEBUG(if (net_ratelimit())
-		 printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
-			NIPQUAD(saddr),
-			ntohs(uh->source),
-			NIPQUAD(daddr),
-			ntohs(uh->dest),
-			ulen));
+	LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+		       NIPQUAD(saddr),
+		       ntohs(uh->source),
+		       NIPQUAD(daddr),
+		       ntohs(uh->dest),
+		       ulen));
 drop:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 	kfree_skb(skb);
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
deleted file mode 100644
index 6aecd7a43534..000000000000
--- a/net/ipv4/utils.c
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Various kernel-resident INET utility functions; mainly
7 * for format conversion and debugging output.
8 *
9 * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
10 *
11 * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area check.
15 * Alan Cox : removed old debugging.
16 * Andi Kleen : add net_ratelimit()
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
26#include <asm/byteorder.h>
27
28/*
29 * Convert an ASCII string to binary IP.
30 */
31
32__u32 in_aton(const char *str)
33{
34 unsigned long l;
35 unsigned int val;
36 int i;
37
38 l = 0;
39 for (i = 0; i < 4; i++)
40 {
41 l <<= 8;
42 if (*str != '\0')
43 {
44 val = 0;
45 while (*str != '\0' && *str != '.')
46 {
47 val *= 10;
48 val += *str - '0';
49 str++;
50 }
51 l |= val;
52 if (*str != '\0')
53 str++;
54 }
55 }
56 return(htonl(l));
57}
58
59EXPORT_SYMBOL(in_aton);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index af2392ae5769..66620a95942a 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -33,6 +33,7 @@ static void xfrm4_encap(struct sk_buff *skb)
 	struct dst_entry *dst = skb->dst;
 	struct xfrm_state *x = dst->xfrm;
 	struct iphdr *iph, *top_iph;
+	int flags;
 
 	iph = skb->nh.iph;
 	skb->h.ipiph = iph;
@@ -51,10 +52,13 @@ static void xfrm4_encap(struct sk_buff *skb)
 
 	/* DS disclosed */
 	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-	if (x->props.flags & XFRM_STATE_NOECN)
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
 		IP_ECN_clear(top_iph);
 
-	top_iph->frag_off = iph->frag_off & htons(IP_DF);
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (iph->frag_off & htons(IP_DF));
 	if (!top_iph->frag_off)
 		__ip_select_ident(top_iph, dst, 0);
 
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 223a2e83853f..050611d7a967 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -7,12 +7,20 @@
 *
 */
 
+#include <net/ip.h>
 #include <net/xfrm.h>
 #include <linux/pfkeyv2.h>
 #include <linux/ipsec.h>
 
 static struct xfrm_state_afinfo xfrm4_state_afinfo;
 
+static int xfrm4_init_flags(struct xfrm_state *x)
+{
+	if (ipv4_config.no_pmtu_disc)
+		x->props.flags |= XFRM_STATE_NOPMTUDISC;
+	return 0;
+}
+
 static void
 __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 		     struct xfrm_tmpl *tmpl,
@@ -109,6 +117,7 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
 	.lock			= RW_LOCK_UNLOCKED,
+	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,
 	.find_acq		= __xfrm4_find_acq,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 413191f585f6..afbb0d4cc305 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -78,13 +78,12 @@ static int ipip_rcv(struct sk_buff *skb)
 static void ipip_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler = ipip_handler;
-	u32 arg = info;
 
 	if (handler)
-		handler->err_handler(skb, &arg);
+		handler->err_handler(skb, info);
 }
 
-static int ipip_init_state(struct xfrm_state *x, void *args)
+static int ipip_init_state(struct xfrm_state *x)
 {
 	if (!x->props.mode)
 		return -EINVAL;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index e66ca9381cfd..ab7a9124f985 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,6 +1,26 @@
 #
 # IPv6 configuration
 #
+
+#   IPv6 as module will cause a CRASH if you try to unload it
+config IPV6
+	tristate "The IPv6 protocol"
+	default m
+	select CRYPTO if IPV6_PRIVACY
+	select CRYPTO_MD5 if IPV6_PRIVACY
+	---help---
+	  This is complemental support for the IP version 6.
+	  You will still be able to do traditional IPv4 networking as well.
+
+	  For general information about IPv6, see
+	  <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
+	  For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
+	  For specific information about IPv6 under Linux, read the HOWTO at
+	  <http://www.bieringer.de/linux/IPv6/>.
+
+	  To compile this protocol support as a module, choose M here: the
+	  module will be called ipv6.
+
 config IPV6_PRIVACY
 	bool "IPv6: Privacy Extensions (RFC 3041) support"
 	depends on IPV6
@@ -71,7 +91,6 @@ config INET6_TUNNEL
 config IPV6_TUNNEL
 	tristate "IPv6: IPv6-in-IPv6 tunnel"
 	depends on IPV6
-	select INET6_TUNNEL
 	---help---
 	  Support for IPv6-in-IPv6 tunnels described in RFC 2473.
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7744a2592693..77004b9456c0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -57,6 +57,7 @@
 #endif
 #include <linux/delay.h>
 #include <linux/notifier.h>
+#include <linux/string.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -131,7 +132,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
 
 static int addrconf_ifdown(struct net_device *dev, int how);
 
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags);
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
 static void addrconf_dad_timer(unsigned long data);
 static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
 static void addrconf_rs_timer(unsigned long data);
@@ -372,6 +373,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 	ndev->regen_timer.data = (unsigned long) ndev;
 	if ((dev->flags&IFF_LOOPBACK) ||
 	    dev->type == ARPHRD_TUNNEL ||
+	    dev->type == ARPHRD_NONE ||
 	    dev->type == ARPHRD_SIT) {
 		printk(KERN_INFO
 		       "Disabled Privacy Extensions on device %p(%s)\n",
@@ -491,7 +493,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
 
 static struct inet6_ifaddr *
 ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
-	      int scope, unsigned flags)
+	      int scope, u32 flags)
 {
 	struct inet6_ifaddr *ifa = NULL;
 	struct rt6_info *rt;
@@ -694,7 +696,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
 	if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
 		if (onlink == 0) {
-			ip6_del_rt(rt, NULL, NULL);
+			ip6_del_rt(rt, NULL, NULL, NULL);
 			rt = NULL;
 		} else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
 			rt->rt6i_expires = expires;
@@ -1319,7 +1321,7 @@ static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
 
 static void
 addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
-		      unsigned long expires, unsigned flags)
+		      unsigned long expires, u32 flags)
 {
 	struct in6_rtmsg rtmsg;
 
@@ -1339,7 +1341,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 	if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
 		rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
 
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 }
 
 /* Create "default" multicast route to the interface */
@@ -1356,7 +1358,7 @@ static void addrconf_add_mroute(struct net_device *dev)
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 	rtmsg.rtmsg_flags = RTF_UP;
 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 }
 
 static void sit_route_add(struct net_device *dev)
@@ -1373,7 +1375,7 @@ static void sit_route_add(struct net_device *dev)
 	rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP;
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 }
 
 static void addrconf_add_lroute(struct net_device *dev)
@@ -1466,7 +1468,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 	if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
 		if (rt->rt6i_flags&RTF_EXPIRES) {
 			if (valid_lft == 0) {
-				ip6_del_rt(rt, NULL, NULL);
+				ip6_del_rt(rt, NULL, NULL, NULL);
 				rt = NULL;
 			} else {
 				rt->rt6i_expires = rt_expires;
@@ -2228,7 +2230,7 @@ out:
 /*
  *	Duplicate Address Detection
  */
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags)
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
 {
 	struct inet6_dev *idev = ifp->idev;
 	struct net_device *dev = idev->dev;
@@ -2621,15 +2623,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 }
 
 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
-			     u32 pid, u32 seq, int event)
+			     u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
 	struct ifa_cacheinfo ci;
 	unsigned char *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 	ifm->ifa_family = AF_INET6;
 	ifm->ifa_prefixlen = ifa->prefix_len;
@@ -2671,15 +2672,14 @@ rtattr_failure:
 }
 
 static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
-				u32 pid, u32 seq, int event)
+				u32 pid, u32 seq, int event, u16 flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
 	struct ifa_cacheinfo ci;
 	unsigned char *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 	ifm->ifa_family = AF_INET6;
 	ifm->ifa_prefixlen = 128;
@@ -2708,15 +2708,14 @@
 }
 
 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
-				u32 pid, u32 seq, int event)
+				u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
 	struct ifa_cacheinfo ci;
 	unsigned char *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 	ifm->ifa_family = AF_INET6;
 	ifm->ifa_prefixlen = 128;
@@ -2778,28 +2777,17 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 		read_lock_bh(&idev->lock);
 		switch (type) {
 		case UNICAST_ADDR:
-			/* unicast address */
+			/* unicast address incl. temp addr */
 			for (ifa = idev->addr_list; ifa;
 			     ifa = ifa->if_next, ip_idx++) {
 				if (ip_idx < s_ip_idx)
 					continue;
 				if ((err = inet6_fill_ifaddr(skb, ifa,
 				    NETLINK_CB(cb->skb).pid,
-				    cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0)
+				    cb->nlh->nlmsg_seq, RTM_NEWADDR,
+				    NLM_F_MULTI)) <= 0)
 					goto done;
 			}
-			/* temp addr */
-#ifdef CONFIG_IPV6_PRIVACY
-			for (ifa = idev->tempaddr_list; ifa;
-			     ifa = ifa->tmp_next, ip_idx++) {
-				if (ip_idx < s_ip_idx)
-					continue;
-				if ((err = inet6_fill_ifaddr(skb, ifa,
-				    NETLINK_CB(cb->skb).pid,
-				    cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0)
-					goto done;
-			}
-#endif
 			break;
 		case MULTICAST_ADDR:
 			/* multicast address */
@@ -2809,7 +2797,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 					continue;
 				if ((err = inet6_fill_ifmcaddr(skb, ifmca,
 				    NETLINK_CB(cb->skb).pid,
-				    cb->nlh->nlmsg_seq, RTM_GETMULTICAST)) <= 0)
+				    cb->nlh->nlmsg_seq, RTM_GETMULTICAST,
+				    NLM_F_MULTI)) <= 0)
 					goto done;
 			}
 			break;
@@ -2821,7 +2810,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 					continue;
 				if ((err = inet6_fill_ifacaddr(skb, ifaca,
 				    NETLINK_CB(cb->skb).pid,
-				    cb->nlh->nlmsg_seq, RTM_GETANYCAST)) <= 0)
+				    cb->nlh->nlmsg_seq, RTM_GETANYCAST,
+				    NLM_F_MULTI)) <= 0)
 					goto done;
 			}
 			break;
@@ -2871,7 +2861,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+	if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL);
 		return;
@@ -2906,7 +2896,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 }
 
 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
-			     u32 pid, u32 seq, int event)
+			     u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct net_device *dev = idev->dev;
 	__s32 *array = NULL;
@@ -2917,10 +2907,10 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
 	__u32 mtu = dev->mtu;
 	struct ifla_cacheinfo ci;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
 	r = NLMSG_DATA(nlh);
 	r->ifi_family = AF_INET6;
+	r->__ifi_pad = 0;
 	r->ifi_type = dev->type;
 	r->ifi_index = dev->ifindex;
 	r->ifi_flags = dev_get_flags(dev);
@@ -2985,7 +2975,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 		if ((idev = in6_dev_get(dev)) == NULL)
 			continue;
 		err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid,
-				cb->nlh->nlmsg_seq, RTM_NEWLINK);
+				cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI);
 		in6_dev_put(idev);
 		if (err <= 0)
 			break;
@@ -3007,7 +2997,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_ifinfo(skb, idev, 0, 0, event) < 0) {
+	if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL);
 		return;
@@ -3017,23 +3007,23 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 }
 
 static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
-			struct prefix_info *pinfo, u32 pid, u32 seq, int event)
+			struct prefix_info *pinfo, u32 pid, u32 seq,
+			int event, unsigned int flags)
 {
 	struct prefixmsg *pmsg;
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb->tail;
 	struct prefix_cacheinfo ci;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*pmsg));
-
-	if (pid)
-		nlh->nlmsg_flags |= NLM_F_MULTI;
-
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
 	pmsg = NLMSG_DATA(nlh);
 	pmsg->prefix_family = AF_INET6;
+	pmsg->prefix_pad1 = 0;
+	pmsg->prefix_pad2 = 0;
 	pmsg->prefix_ifindex = idev->dev->ifindex;
 	pmsg->prefix_len = pinfo->prefix_len;
 	pmsg->prefix_type = pinfo->type;
+	pmsg->prefix_pad3 = 0;
 
 	pmsg->prefix_flags = 0;
 	if (pinfo->onlink)
@@ -3067,7 +3057,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event) < 0) {
+	if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL);
 		return;
@@ -3096,7 +3086,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 	switch (event) {
 	case RTM_NEWADDR:
 		dst_hold(&ifp->rt->u.dst);
-		if (ip6_ins_rt(ifp->rt, NULL, NULL))
+		if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL))
 			dst_release(&ifp->rt->u.dst);
 		if (ifp->idev->cnf.forwarding)
 			addrconf_join_anycast(ifp);
@@ -3106,7 +3096,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		addrconf_leave_anycast(ifp);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		dst_hold(&ifp->rt->u.dst);
-		if (ip6_del_rt(ifp->rt, NULL, NULL))
+		if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
 			dst_free(&ifp->rt->u.dst);
 		else
 			dst_release(&ifp->rt->u.dst);
@@ -3439,7 +3429,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
 	 * by sysctl and we wouldn't want anyone to change it under our feet
 	 * (see SIOCSIFNAME).
 	 */
-	dev_name = net_sysctl_strdup(dev_name);
+	dev_name = kstrdup(dev_name, GFP_KERNEL);
 	if (!dev_name)
 		goto free;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df49a..28d9bcab0970 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
 	if (if6_proc_init())
 		goto proc_if6_fail;
 #endif
-	ipv6_packet_init();
 	ip6_route_init();
 	ip6_flowlabel_init();
 	err = addrconf_init();
@@ -791,6 +790,8 @@
 	/* Init v6 transport protocols. */
 	udpv6_init();
 	tcpv6_init();
+
+	ipv6_packet_init();
 	err = 0;
 out:
 	return err;
@@ -798,7 +799,6 @@ out:
 addrconf_fail:
 	ip6_flowlabel_cleanup();
 	ip6_route_cleanup();
-	ipv6_packet_cleanup();
 #ifdef CONFIG_PROC_FS
 	if6_proc_exit();
 proc_if6_fail:
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index e3ecf626cbf7..986fdfdccbcd 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -339,7 +339,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	xfrm_state_put(x);
 }
 
-static int ah6_init_state(struct xfrm_state *x, void *args)
+static int ah6_init_state(struct xfrm_state *x)
 {
 	struct ah_data *ahp = NULL;
 	struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 5d22ca3cca2e..6b7294047238 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -337,7 +337,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 
 	dst_hold(&rt->u.dst);
-	if (ip6_ins_rt(rt, NULL, NULL))
+	if (ip6_ins_rt(rt, NULL, NULL, NULL))
 		dst_release(&rt->u.dst);
 
 	addrconf_join_solict(dev, &aca->aca_addr);
@@ -380,7 +380,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
 	dst_hold(&aca->aca_rt->u.dst);
-	if (ip6_del_rt(aca->aca_rt, NULL, NULL))
+	if (ip6_del_rt(aca->aca_rt, NULL, NULL, NULL))
 		dst_free(&aca->aca_rt->u.dst);
 	else
 		dst_release(&aca->aca_rt->u.dst);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 65b9375df57d..5229365cd8b4 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -353,14 +353,14 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
 	err = copied;
 
 	/* Reset and regenerate socket error */
-	spin_lock_irq(&sk->sk_error_queue.lock);
+	spin_lock_bh(&sk->sk_error_queue.lock);
 	sk->sk_err = 0;
 	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
 		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 		sk->sk_error_report(sk);
 	} else {
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 	}
 
 out_free_skb:
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index be7095d6babe..324db62515a2 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -296,7 +296,7 @@ static void esp6_destroy(struct xfrm_state *x)
 	kfree(esp);
 }
 
-static int esp6_init_state(struct xfrm_state *x, void *args)
+static int esp6_init_state(struct xfrm_state *x)
 {
 	struct esp_data *esp = NULL;
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 8e0f569b883e..ff3ec9822e36 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -277,8 +277,8 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 {
 	struct inet6_dev *idev = NULL;
 	struct ipv6hdr *hdr = skb->nh.ipv6h;
-	struct sock *sk = icmpv6_socket->sk;
-	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sock *sk;
+	struct ipv6_pinfo *np;
 	struct in6_addr *saddr = NULL;
 	struct dst_entry *dst;
 	struct icmp6hdr tmp_hdr;
@@ -358,6 +358,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 	if (icmpv6_xmit_lock())
 		return;
 
+	sk = icmpv6_socket->sk;
+	np = inet6_sk(sk);
+
 	if (!icmpv6_xrlim_allow(sk, type, &fl))
 		goto out;
 
@@ -423,9 +426,9 @@ out:
 
 static void icmpv6_echo_reply(struct sk_buff *skb)
 {
-	struct sock *sk = icmpv6_socket->sk;
+	struct sock *sk;
 	struct inet6_dev *idev;
-	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_pinfo *np;
 	struct in6_addr *saddr = NULL;
 	struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw;
 	struct icmp6hdr tmp_hdr;
@@ -454,6 +457,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	if (icmpv6_xmit_lock())
 		return;
 
+	sk = icmpv6_socket->sk;
+	np = inet6_sk(sk);
+
 	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
 		fl.oif = np->mcast_oif;
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 405740b75abb..1b354aa97934 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -394,7 +394,7 @@ insert_above:
  */
 
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
-			    struct nlmsghdr *nlh)
+			    struct nlmsghdr *nlh, struct netlink_skb_parms *req)
 {
 	struct rt6_info *iter = NULL;
 	struct rt6_info **ins;
@@ -449,7 +449,7 @@ out:
 	*ins = rt;
 	rt->rt6i_node = fn;
 	atomic_inc(&rt->rt6i_ref);
-	inet6_rt_notify(RTM_NEWROUTE, rt, nlh);
+	inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
 	rt6_stats.fib_rt_entries++;
 
 	if ((fn->fn_flags & RTN_RTINFO) == 0) {
@@ -479,7 +479,8 @@ void fib6_force_start_gc(void)
  *	with source addr info in sub-trees
  */
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+	     struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_node *fn;
 	int err = -ENOMEM;
@@ -552,7 +553,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh,
 	}
 #endif
 
-	err = fib6_add_rt2node(fn, rt, nlh);
+	err = fib6_add_rt2node(fn, rt, nlh, req);
 
 	if (err == 0) {
 		fib6_start_gc(rt);
@@ -859,7 +860,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
 }
 
 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
-			   struct nlmsghdr *nlh, void *_rtattr)
+			   struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_walker_t *w;
 	struct rt6_info *rt = *rtp;
@@ -915,11 +916,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 		if (atomic_read(&rt->rt6i_ref) != 1) BUG();
 	}
 
-	inet6_rt_notify(RTM_DELROUTE, rt, nlh);
+	inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
 	rt6_release(rt);
 }
 
-int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_node *fn = rt->rt6i_node;
 	struct rt6_info **rtp;
@@ -944,7 +945,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
 
 	for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
 		if (*rtp == rt) {
-			fib6_del_route(fn, rtp, nlh, _rtattr);
+			fib6_del_route(fn, rtp, nlh, _rtattr, req);
 			return 0;
 		}
 	}
@@ -1073,7 +1074,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
 	res = c->func(rt, c->arg);
 	if (res < 0) {
 		w->leaf = rt;
-		res = fib6_del(rt, NULL, NULL);
+		res = fib6_del(rt, NULL, NULL, NULL);
 		if (res) {
 #if RT6_DEBUG >= 2
 			printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499debb..b6c73da5ff35 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
 		opt_space->opt_nflen = 0;
 	}
 	opt_space->dst1opt = fopt->dst1opt;
-	opt_space->auth = fopt->auth;
 	opt_space->opt_flen = fopt->opt_flen;
 	return opt_space;
 }
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 866f10726c58..10fbb50daea4 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -198,12 +198,13 @@ resubmit:
 		if (!raw_sk) {
 			if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 				IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
-				icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff);
+				icmpv6_send(skb, ICMPV6_PARAMPROB,
+					    ICMPV6_UNK_NEXTHDR, nhoff,
+					    skb->dev);
 			}
-		} else {
+		} else
 			IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
 		kfree_skb(skb);
-		}
 	}
 	rcu_read_unlock();
 	return 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b78a53586804..ae652ca14bc9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->pkt_type = from->pkt_type;
 	to->priority = from->priority;
 	to->protocol = from->protocol;
-	to->security = from->security;
 	dst_release(to->dst);
 	to->dst = dst_clone(from->dst);
 	to->dev = from->dev;
@@ -484,9 +483,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->nf_bridge = from->nf_bridge;
 	nf_bridge_get(to->nf_bridge);
 #endif
-#ifdef CONFIG_NETFILTER_DEBUG
-	to->nf_debug = from->nf_debug;
-#endif
 #endif
 }
 
@@ -796,13 +792,8 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 	if (ipv6_addr_any(&fl->fl6_src)) {
 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 
-		if (err) {
-#if IP6_DEBUG >= 2
-			printk(KERN_DEBUG "ip6_dst_lookup: "
-			       "no available source address\n");
-#endif
+		if (err)
 			goto out_err_release;
-		}
 	}
 
 	return 0;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 3b1c9fa184ae..09613729404c 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -882,6 +882,7 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
 	t->parms.hop_limit = p->hop_limit;
 	t->parms.encap_limit = p->encap_limit;
 	t->parms.flowinfo = p->flowinfo;
+	t->parms.link = p->link;
 	ip6ip6_tnl_link_config(t);
 	return 0;
 }
@@ -1109,11 +1110,39 @@ ip6ip6_fb_tnl_dev_init(struct net_device *dev)
 	return 0;
 }
 
+#ifdef CONFIG_INET6_TUNNEL
 static struct xfrm6_tunnel ip6ip6_handler = {
 	.handler = ip6ip6_rcv,
 	.err_handler = ip6ip6_err,
 };
 
+static inline int ip6ip6_register(void)
+{
+	return xfrm6_tunnel_register(&ip6ip6_handler);
+}
+
+static inline int ip6ip6_unregister(void)
+{
+	return xfrm6_tunnel_deregister(&ip6ip6_handler);
+}
+#else
+static struct inet6_protocol xfrm6_tunnel_protocol = {
+	.handler = ip6ip6_rcv,
+	.err_handler = ip6ip6_err,
+	.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static inline int ip6ip6_register(void)
+{
+	return inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
+}
+
+static inline int ip6ip6_unregister(void)
+{
+	return inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
+}
+#endif
+
 /**
  * ip6_tunnel_init - register protocol and reserve needed resources
  *
@@ -1124,7 +1153,7 @@ static int __init ip6_tunnel_init(void)
 {
 	int err;
 
-	if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) {
+	if (ip6ip6_register() < 0) {
 		printk(KERN_ERR "ip6ip6 init: can't register tunnel\n");
 		return -EAGAIN;
 	}
@@ -1143,7 +1172,7 @@ static int __init ip6_tunnel_init(void)
 	}
 	return 0;
 fail:
-	xfrm6_tunnel_deregister(&ip6ip6_handler);
+	ip6ip6_unregister();
 	return err;
 }
 
@@ -1153,7 +1182,7 @@ fail:
 
 static void __exit ip6_tunnel_cleanup(void)
 {
-	if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0)
+	if (ip6ip6_unregister() < 0)
 		printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n");
 
 	unregister_netdev(ip6ip6_fb_tnl_dev);
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 6cde5310cd76..423feb46ccc0 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -234,14 +234,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
 	t->props.mode = 1;
 	memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
 
-	t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family);
-	if (t->type == NULL)
+	if (xfrm_init_state(t))
 		goto error;
 
-	if (t->type->init_state(t, NULL))
-		goto error;
-
-	t->km.state = XFRM_STATE_VALID;
 	atomic_set(&t->tunnel_users, 1);
 
 out:
@@ -420,7 +415,7 @@ static void ipcomp6_destroy(struct xfrm_state *x)
 	xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
 }
 
-static int ipcomp6_init_state(struct xfrm_state *x, void *args)
+static int ipcomp6_init_state(struct xfrm_state *x)
 {
 	int err;
 	struct ipcomp_data *ipcd;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 279ab86be662..3bc144a79fa5 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -423,11 +423,12 @@ done:
 		psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
 		retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
 					 &psin6->sin6_addr);
-		if (retv)
+		/* prior join w/ different source is ok */
+		if (retv && retv != -EADDRINUSE)
 			break;
 		omode = MCAST_INCLUDE;
 		add = 1;
-	} else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+	} else /* MCAST_LEAVE_SOURCE_GROUP */ {
 		omode = MCAST_INCLUDE;
 		add = 0;
 	}
@@ -503,6 +504,9 @@ done:
 		break;
 	case IPV6_IPSEC_POLICY:
 	case IPV6_XFRM_POLICY:
+		retv = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
 		retv = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
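Two behavioural changes sit in the ipv6_sockglue.c hunks above: IPV6_IPSEC_POLICY/IPV6_XFRM_POLICY now require CAP_NET_ADMIN, and MCAST_JOIN_SOURCE_GROUP tolerates -EADDRINUSE, so a socket can add a second source to a group it already joined even though the new duplicate-membership check in mcast.c (next file) makes the inner ipv6_sock_mc_join() fail. A hedged userspace sketch of the latter; the interface index and all addresses are placeholders:

#define _GNU_SOURCE
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int join_source(int fd, unsigned int ifindex,
		       const char *group, const char *source)
{
	struct group_source_req req;
	struct sockaddr_in6 *g = (struct sockaddr_in6 *)&req.gsr_group;
	struct sockaddr_in6 *s = (struct sockaddr_in6 *)&req.gsr_source;

	memset(&req, 0, sizeof(req));
	req.gsr_interface = ifindex;
	g->sin6_family = AF_INET6;
	inet_pton(AF_INET6, group, &g->sin6_addr);
	s->sin6_family = AF_INET6;
	inet_pton(AF_INET6, source, &s->sin6_addr);
	return setsockopt(fd, IPPROTO_IPV6, MCAST_JOIN_SOURCE_GROUP,
			  &req, sizeof(req));
}

int main(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (join_source(fd, 2, "ff15::1234", "2001:db8::1") < 0)
		perror("first join");
	/* Same group, second source: the -EADDRINUSE from the duplicate
	 * group membership is now swallowed, so this also succeeds. */
	if (join_source(fd, 2, "ff15::1234", "2001:db8::2") < 0)
		perror("second join");
	return 0;
}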
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 393b6e6f50a9..29fed6e58d0a 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -188,6 +188,16 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
 	if (!ipv6_addr_is_multicast(addr))
 		return -EINVAL;
 
+	read_lock_bh(&ipv6_sk_mc_lock);
+	for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) {
+		if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
+		    ipv6_addr_equal(&mc_lst->addr, addr)) {
+			read_unlock_bh(&ipv6_sk_mc_lock);
+			return -EADDRINUSE;
+		}
+	}
+	read_unlock_bh(&ipv6_sk_mc_lock);
+
 	mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
 
 	if (mc_lst == NULL)
@@ -271,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
 	}
 	write_unlock_bh(&ipv6_sk_mc_lock);
 
-	return -ENOENT;
+	return -EADDRNOTAVAIL;
 }
 
 static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex)
@@ -349,6 +359,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 	struct ipv6_pinfo *inet6 = inet6_sk(sk);
 	struct ip6_sf_socklist *psl;
 	int i, j, rv;
+	int leavegroup = 0;
 	int err;
 
 	if (pgsr->gsr_group.ss_family != AF_INET6 ||
@@ -368,18 +379,23 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 
 	err = -EADDRNOTAVAIL;
 
+	read_lock_bh(&ipv6_sk_mc_lock);
 	for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
 		if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
 			continue;
 		if (ipv6_addr_equal(&pmc->addr, group))
 			break;
 	}
-	if (!pmc)		/* must have a prior join */
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
 		goto done;
+	}
 	/* if a source filter was set, must be the same mode as before */
 	if (pmc->sflist) {
-		if (pmc->sfmode != omode)
+		if (pmc->sfmode != omode) {
+			err = -EINVAL;
 			goto done;
+		}
 	} else if (pmc->sfmode != omode) {
 		/* allow mode switches for empty-set filters */
 		ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
@@ -390,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 	psl = pmc->sflist;
 	if (!add) {
 		if (!psl)
-			goto done;
+			goto done;	/* err = -EADDRNOTAVAIL */
 		rv = !0;
 		for (i=0; i<psl->sl_count; i++) {
 			rv = memcmp(&psl->sl_addr[i], source,
@@ -399,7 +415,13 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 				break;
 		}
 		if (rv)	/* source not found */
+			goto done;	/* err = -EADDRNOTAVAIL */
+
+		/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+		if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+			leavegroup = 1;
 			goto done;
+		}
 
 		/* update the interface filter */
 		ip6_mc_del_src(idev, group, omode, 1, source, 1);
@@ -453,9 +475,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 	/* update the interface list */
 	ip6_mc_add_src(idev, group, omode, 1, source, 1);
 done:
+	read_unlock_bh(&ipv6_sk_mc_lock);
 	read_unlock_bh(&idev->lock);
 	in6_dev_put(idev);
 	dev_put(dev);
+	if (leavegroup)
+		return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
 	return err;
 }
 
@@ -467,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
 	struct inet6_dev *idev;
 	struct ipv6_pinfo *inet6 = inet6_sk(sk);
 	struct ip6_sf_socklist *newpsl, *psl;
+	int leavegroup = 0;
 	int i, err;
 
 	group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
@@ -482,7 +508,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
 	if (!idev)
 		return -ENODEV;
 	dev = idev->dev;
-	err = -EADDRNOTAVAIL;
+
+	err = 0;
+	if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
+		leavegroup = 1;
+		goto done;
+	}
 
 	for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
 		if (pmc->ifindex != gsf->gf_interface)
@@ -490,8 +521,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
 		if (ipv6_addr_equal(&pmc->addr, group))
 			break;
 	}
-	if (!pmc)		/* must have a prior join */
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
 		goto done;
+	}
 	if (gsf->gf_numsrc) {
 		newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk,
 				IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC);
@@ -523,10 +556,13 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
 		(void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
 	pmc->sflist = newpsl;
 	pmc->sfmode = gsf->gf_fmode;
+	err = 0;
 done:
 	read_unlock_bh(&idev->lock);
 	in6_dev_put(idev);
 	dev_put(dev);
+	if (leavegroup)
+		err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
 	return err;
 }
 
@@ -1280,15 +1316,6 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
 		return NULL;
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
-	if (dev->hard_header) {
-		unsigned char ha[MAX_ADDR_LEN];
-
-		ndisc_mc_map(&mld2_all_mcr, ha, dev, 1);
-		if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) {
-			kfree_skb(skb);
-			return NULL;
-		}
-	}
 
 	if (ipv6_get_lladdr(dev, &addr_buf)) {
 		/* <draft-ietf-magma-mld-source-05.txt>:
@@ -1312,6 +1339,30 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
 	return skb;
 }
 
+static inline int mld_dev_queue_xmit2(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+
+	if (dev->hard_header) {
+		unsigned char ha[MAX_ADDR_LEN];
+		int err;
+
+		ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1);
+		err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len);
+		if (err < 0) {
+			kfree_skb(skb);
+			return err;
+		}
+	}
+	return dev_queue_xmit(skb);
+}
+
+static inline int mld_dev_queue_xmit(struct sk_buff *skb)
+{
+	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
+		       mld_dev_queue_xmit2);
+}
+
 static void mld_sendpack(struct sk_buff *skb)
 {
 	struct ipv6hdr *pip6 = skb->nh.ipv6h;
@@ -1329,7 +1380,7 @@ static void mld_sendpack(struct sk_buff *skb)
 	pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
 		IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0));
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
-		dev_queue_xmit);
+		mld_dev_queue_xmit);
 	if (!err) {
 		ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
@@ -1635,12 +1686,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	}
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
-	if (dev->hard_header) {
-		unsigned char ha[MAX_ADDR_LEN];
-		ndisc_mc_map(snd_addr, ha, dev, 1);
-		if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0)
-			goto out;
-	}
 
 	if (ipv6_get_lladdr(dev, &addr_buf)) {
 		/* <draft-ietf-magma-mld-source-05.txt>:
@@ -1668,7 +1713,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	idev = in6_dev_get(skb->dev);
 
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
-		dev_queue_xmit);
+		mld_dev_queue_xmit);
 	if (!err) {
 		if (type == ICMPV6_MGM_REDUCTION)
 			ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
@@ -1682,10 +1727,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	if (likely(idev != NULL))
 		in6_dev_put(idev);
 	return;
-
-out:
-	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
-	kfree_skb(skb);
 }
 
 static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
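The ip6_mc_source()/ip6_mc_msfilter() hunks above implement the RFC 3678 rule that an INCLUDE-mode filter with an empty source list is equivalent to leaving the group: both paths set leavegroup and finish by calling ipv6_sock_mc_drop(). A hedged userspace sketch of the msfilter variant; the interface index and group address below are placeholders:

#define _GNU_SOURCE
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
	struct group_req greq;
	struct group_filter gf;
	struct sockaddr_in6 *g;

	/* Join ff15::1234 on ifindex 2 (both illustrative). */
	memset(&greq, 0, sizeof(greq));
	greq.gr_interface = 2;
	g = (struct sockaddr_in6 *)&greq.gr_group;
	g->sin6_family = AF_INET6;
	inet_pton(AF_INET6, "ff15::1234", &g->sin6_addr);
	setsockopt(fd, IPPROTO_IPV6, MCAST_JOIN_GROUP, &greq, sizeof(greq));

	/* INCLUDE mode with zero sources: "accept nothing", which the
	 * kernel now treats as a full leave of the group. */
	memset(&gf, 0, sizeof(gf));
	gf.gf_interface = 2;
	gf.gf_fmode = MCAST_INCLUDE;
	gf.gf_numsrc = 0;
	memcpy(&gf.gf_group, &greq.gr_group, sizeof(gf.gf_group));
	return setsockopt(fd, IPPROTO_IPV6, MCAST_MSFILTER, &gf, sizeof(gf));
}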
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7c291f4e9edc..7ae72d4c9bd2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -955,7 +955,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 		struct rt6_info *rt;
 		rt = rt6_get_dflt_router(saddr, dev);
 		if (rt)
-			ip6_del_rt(rt, NULL, NULL);
+			ip6_del_rt(rt, NULL, NULL, NULL);
 	}
 
 out:
@@ -1096,7 +1096,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
 	if (rt && lifetime == 0) {
 		neigh_clone(neigh);
-		ip6_del_rt(rt, NULL, NULL);
+		ip6_del_rt(rt, NULL, NULL, NULL);
 		rt = NULL;
 	}
 
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 750943e2d34e..5493180f0d44 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -76,7 +76,9 @@ static DECLARE_MUTEX(ipqnl_sem);
 static void
 ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
 {
+	local_bh_disable();
 	nf_reinject(entry->skb, entry->info, verdict);
+	local_bh_enable();
 	kfree(entry);
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c735276fdd5f..73034511c8db 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -71,7 +71,6 @@ static DECLARE_MUTEX(ip6t_mutex);
 /* Must have mutex */
 #define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
 #define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index bfc3d0185d19..a692e26a4fa3 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -366,8 +366,6 @@ ip6t_log_packet(unsigned int hooknum,
 		   const char *level_string,
 		   const char *prefix)
 {
-	struct ipv6hdr *ipv6h = skb->nh.ipv6h;
-
 	spin_lock_bh(&log_lock);
 	printk(level_string);
 	printk("%sIN=%s OUT=%s ",
@@ -375,41 +373,30 @@ ip6t_log_packet(unsigned int hooknum,
 	       in ? in->name : "",
 	       out ? out->name : "");
 	if (in && !out) {
+		unsigned int len;
 		/* MAC logging for input chain only. */
 		printk("MAC=");
-		if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) {
-			if (skb->dev->type != ARPHRD_SIT){
-				int i;
-				unsigned char *p = skb->mac.raw;
-				for (i = 0; i < skb->dev->hard_header_len; i++,p++)
-					printk("%02x%c", *p,
-					       i==skb->dev->hard_header_len - 1
-					       ? ' ':':');
-			} else {
-				int i;
-				unsigned char *p = skb->mac.raw;
-				if ( p - (ETH_ALEN*2+2) > skb->head ){
-					p -= (ETH_ALEN+2);
-					for (i = 0; i < (ETH_ALEN); i++,p++)
-						printk("%02x%s", *p,
-						       i == ETH_ALEN-1 ? "->" : ":");
-					p -= (ETH_ALEN*2);
-					for (i = 0; i < (ETH_ALEN); i++,p++)
-						printk("%02x%c", *p,
-						       i == ETH_ALEN-1 ? ' ' : ':');
-				}
-
-				if ((skb->dev->addr_len == 4) &&
-				    skb->dev->hard_header_len > 20){
-					printk("TUNNEL=");
-					p = skb->mac.raw + 12;
-					for (i = 0; i < 4; i++,p++)
-						printk("%3d%s", *p,
-						       i == 3 ? "->" : ".");
-					for (i = 0; i < 4; i++,p++)
-						printk("%3d%c", *p,
-						       i == 3 ? ' ' : '.');
-				}
-			}
+		if (skb->dev && (len = skb->dev->hard_header_len) &&
+		    skb->mac.raw != skb->nh.raw) {
+			unsigned char *p = skb->mac.raw;
+			int i;
+
+			if (skb->dev->type == ARPHRD_SIT &&
+			    (p -= ETH_HLEN) < skb->head)
+				p = NULL;
+
+			if (p != NULL) {
+				for (i = 0; i < len; i++)
+					printk("%02x%s", p[i],
+					       i == len - 1 ? "" : ":");
+			}
+			printk(" ");
+
+			if (skb->dev->type == ARPHRD_SIT) {
+				struct iphdr *iph = (struct iphdr *)skb->mac.raw;
+				printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ",
+				       NIPQUAD(iph->saddr),
+				       NIPQUAD(iph->daddr));
+			}
 		}
 	} else
 		printk(" ");
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 71407beaf790..c2982efd14af 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -129,13 +129,15 @@ static struct nf_hook_ops ip6t_ops[] = {
 		.hook = ip6t_hook,
 		.pf = PF_INET6,
 		.hooknum = NF_IP6_PRE_ROUTING,
-		.priority = NF_IP6_PRI_FIRST
+		.priority = NF_IP6_PRI_FIRST,
+		.owner = THIS_MODULE,
 	},
 	{
 		.hook = ip6t_hook,
 		.pf = PF_INET6,
 		.hooknum = NF_IP6_LOCAL_OUT,
-		.priority = NF_IP6_PRI_FIRST
+		.priority = NF_IP6_PRI_FIRST,
+		.owner = THIS_MODULE,
 	},
 };
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 617645bc5ed6..1d4d75b34d32 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -328,6 +328,8 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
 
 	if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
 		if (skb->ip_summed == CHECKSUM_HW) {
+			skb_postpull_rcsum(skb, skb->nh.raw,
+					   skb->h.raw - skb->nh.raw);
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 			if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
 					    &skb->nh.ipv6h->daddr,
@@ -434,12 +436,12 @@ csum_copy_err:
 	/* Clear queue. */
 	if (flags&MSG_PEEK) {
 		int clear = 0;
-		spin_lock_irq(&sk->sk_receive_queue.lock);
+		spin_lock_bh(&sk->sk_receive_queue.lock);
 		if (skb == skb_peek(&sk->sk_receive_queue)) {
 			__skb_unlink(skb, &sk->sk_receive_queue);
 			clear = 1;
 		}
-		spin_unlock_irq(&sk->sk_receive_queue.lock);
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
 		if (clear)
 			kfree_skb(skb);
 	}
@@ -971,11 +973,11 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
 		struct sk_buff *skb;
 		int amount = 0;
 
-		spin_lock_irq(&sk->sk_receive_queue.lock);
+		spin_lock_bh(&sk->sk_receive_queue.lock);
 		skb = skb_peek(&sk->sk_receive_queue);
 		if (skb != NULL)
 			amount = skb->tail - skb->h.raw;
-		spin_unlock_irq(&sk->sk_receive_queue.lock);
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
 		return put_user(amount, (int __user *)arg);
 	}
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3bf8a0254f81..878789b3122d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -384,12 +384,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
 	be destroyed.
  */
 
-int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
+		void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 
 	write_lock_bh(&rt6_lock);
-	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr);
+	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
 	write_unlock_bh(&rt6_lock);
 
 	return err;
@@ -400,7 +401,7 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
  */
 
 static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
-				struct in6_addr *saddr)
+				struct in6_addr *saddr, struct netlink_skb_parms *req)
 {
 	int err;
 	struct rt6_info *rt;
@@ -432,7 +433,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
 
 	dst_hold(&rt->u.dst);
 
-	err = ip6_ins_rt(rt, NULL, NULL);
+	err = ip6_ins_rt(rt, NULL, NULL, req);
 	if (err == 0)
 		return rt;
 
@@ -491,7 +492,8 @@ restart:
 	read_unlock_bh(&rt6_lock);
 
 	nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
-		      &skb->nh.ipv6h->saddr);
+		      &skb->nh.ipv6h->saddr,
+		      &NETLINK_CB(skb));
 
 	dst_release(&rt->u.dst);
 	rt = nrt;
@@ -551,7 +553,7 @@ restart:
 	dst_hold(&rt->u.dst);
 	read_unlock_bh(&rt6_lock);
 
-	nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src);
+	nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src, NULL);
 
 	dst_release(&rt->u.dst);
 	rt = nrt;
@@ -598,7 +600,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE)
-			ip6_del_rt(rt, NULL, NULL);
+			ip6_del_rt(rt, NULL, NULL, NULL);
 		else
 			dst_release(dst);
 	}
@@ -787,7 +789,8 @@ int ipv6_get_hoplimit(struct net_device *dev)
  *
  */
 
-int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
+		void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 	struct rtmsg *r;
@@ -974,7 +977,7 @@ install_route:
 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
 	rt->u.dst.dev = dev;
 	rt->rt6i_idev = idev;
-	return ip6_ins_rt(rt, nlh, _rtattr);
+	return ip6_ins_rt(rt, nlh, _rtattr, req);
 
 out:
 	if (dev)
@@ -986,7 +989,7 @@ out:
 	return err;
 }
 
-int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 
@@ -994,7 +997,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
 
 	rt6_reset_dflt_pointer(NULL);
 
-	err = fib6_del(rt, nlh, _rtattr);
+	err = fib6_del(rt, nlh, _rtattr, req);
 	dst_release(&rt->u.dst);
 
 	write_unlock_bh(&rt6_lock);
@@ -1002,7 +1005,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
 	return err;
 }
 
-static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt;
@@ -1029,7 +1032,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r
 			dst_hold(&rt->u.dst);
 			read_unlock_bh(&rt6_lock);
 
-			return ip6_del_rt(rt, nlh, _rtattr);
+			return ip6_del_rt(rt, nlh, _rtattr, req);
 		}
 	}
 	read_unlock_bh(&rt6_lock);
@@ -1136,11 +1139,11 @@ source_ok:
 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
 
-	if (ip6_ins_rt(nrt, NULL, NULL))
+	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
 		goto out;
 
 	if (rt->rt6i_flags&RTF_CACHE) {
-		ip6_del_rt(rt, NULL, NULL);
+		ip6_del_rt(rt, NULL, NULL, NULL);
 		return;
 	}
 
@@ -1204,7 +1207,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
 	 */
 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
-		nrt = rt6_cow(rt, daddr, saddr);
+		nrt = rt6_cow(rt, daddr, saddr, NULL);
 		if (!nrt->u.dst.error) {
 			nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
 			if (allfrag)
@@ -1232,7 +1235,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
 		if (allfrag)
 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
-		ip6_ins_rt(nrt, NULL, NULL);
+		ip6_ins_rt(nrt, NULL, NULL, NULL);
 	}
 
 out:
@@ -1305,7 +1308,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
 
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 	return rt6_get_dflt_router(gwaddr, dev);
 }
 
@@ -1323,7 +1326,7 @@ restart:
 
 			read_unlock_bh(&rt6_lock);
 
-			ip6_del_rt(rt, NULL, NULL);
+			ip6_del_rt(rt, NULL, NULL, NULL);
 
 			goto restart;
 		}
@@ -1349,10 +1352,10 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
 	rtnl_lock();
 	switch (cmd) {
 	case SIOCADDRT:
-		err = ip6_route_add(&rtmsg, NULL, NULL);
+		err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
 		break;
 	case SIOCDELRT:
-		err = ip6_route_del(&rtmsg, NULL, NULL);
+		err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
 		break;
 	default:
 		err = -EINVAL;
@@ -1546,7 +1549,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_del(&rtmsg, nlh, arg);
+	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
 }
 
 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1556,7 +1559,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_add(&rtmsg, nlh, arg);
+	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
 }
 
 struct rt6_rtnl_dump_arg
@@ -1566,11 +1569,9 @@ struct rt6_rtnl_dump_arg
 };
 
 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
-			 struct in6_addr *dst,
-			 struct in6_addr *src,
-			 int iif,
-			 int type, u32 pid, u32 seq,
-			 struct nlmsghdr *in_nlh, int prefix)
+			 struct in6_addr *dst, struct in6_addr *src,
+			 int iif, int type, u32 pid, u32 seq,
+			 int prefix, unsigned int flags)
 {
 	struct rtmsg *rtm;
 	struct nlmsghdr *nlh;
@@ -1584,11 +1585,7 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 		}
 	}
 
-	if (!pid && in_nlh) {
-		pid = in_nlh->nlmsg_pid;
-	}
-
-	nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
+	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
 	rtm = NLMSG_DATA(nlh);
 	rtm->rtm_family = AF_INET6;
 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
@@ -1674,7 +1671,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 
 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
-		     NULL, prefix);
+		     prefix, NLM_F_MULTI);
 }
 
 static int fib6_dump_node(struct fib6_walker_t *w)
@@ -1822,7 +1819,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 			    &fl.fl6_dst, &fl.fl6_src,
 			    iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
-			    nlh->nlmsg_seq, nlh, 0);
+			    nlh->nlmsg_seq, 0, 0);
 	if (err < 0) {
 		err = -EMSGSIZE;
 		goto out_free;
@@ -1838,17 +1835,25 @@ out_free:
 	goto out;
 }
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
+		     struct netlink_skb_parms *req)
 {
 	struct sk_buff *skb;
 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
+	u32 pid = current->pid;
+	u32 seq = 0;
 
+	if (req)
+		pid = req->pid;
+	if (nlh)
+		seq = nlh->nlmsg_seq;
+
 	skb = alloc_skb(size, gfp_any());
 	if (!skb) {
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
 		return;
 	}
-	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) {
+	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
 		return;
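The route.c changes thread a struct netlink_skb_parms from the rtnetlink handlers down through ip6_route_add()/ip6_route_del() and ip6_ins_rt()/ip6_del_rt(), so inet6_rt_notify() can stamp RTM_NEWROUTE/RTM_DELROUTE notifications with the requester's pid and sequence number instead of zeros, and rt6_fill_node() can mark dump replies with NLM_F_MULTI. A hedged userspace sketch of a listener that can now correlate notifications with its own requests (error handling omitted):

#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	struct sockaddr_nl snl;
	char buf[4096];

	memset(&snl, 0, sizeof(snl));
	snl.nl_family = AF_NETLINK;
	snl.nl_groups = RTMGRP_IPV6_ROUTE;
	bind(fd, (struct sockaddr *)&snl, sizeof(snl));

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nlh;

		for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
		     nlh = NLMSG_NEXT(nlh, len))
			/* With the pid/seq threading above, these fields
			 * identify the requesting socket; they used to be 0. */
			printf("type=%u pid=%u seq=%u\n", nlh->nlmsg_type,
			       nlh->nlmsg_pid, nlh->nlmsg_seq);
	}
}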
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b788f55e139b..e553e5b80d6e 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -195,7 +195,6 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int
 	dev_hold(dev);
 
 	ipip6_tunnel_link(nt);
-	/* Do not decrement MOD_USE_COUNT here. */
 	return nt;
 
 failed:
@@ -794,10 +793,28 @@ static struct net_protocol sit_protocol = {
 	.err_handler = ipip6_err,
 };
 
+static void __exit sit_destroy_tunnels(void)
+{
+	int prio;
+
+	for (prio = 1; prio < 4; prio++) {
+		int h;
+		for (h = 0; h < HASH_SIZE; h++) {
+			struct ip_tunnel *t;
+			while ((t = tunnels[prio][h]) != NULL)
+				unregister_netdevice(t->dev);
+		}
+	}
+}
+
 void __exit sit_cleanup(void)
 {
 	inet_del_protocol(&sit_protocol, IPPROTO_IPV6);
-	unregister_netdev(ipip6_fb_tunnel_dev);
+
+	rtnl_lock();
+	sit_destroy_tunnels();
+	unregister_netdevice(ipip6_fb_tunnel_dev);
+	rtnl_unlock();
 }
 
 int __init sit_init(void)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0f69e800a0ad..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -65,7 +65,7 @@
 #include <linux/seq_file.h>
 
 static void tcp_v6_send_reset(struct sk_buff *skb);
-static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req);
+static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
 static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len,
 			      struct sk_buff *skb);
 
@@ -394,24 +394,26 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
 	return c & (TCP_SYNQ_HSIZE - 1);
 }
 
-static struct open_request *tcp_v6_search_req(struct tcp_sock *tp,
-					      struct open_request ***prevp,
+static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp,
+					      struct request_sock ***prevp,
 					      __u16 rport,
 					      struct in6_addr *raddr,
 					      struct in6_addr *laddr,
 					      int iif)
 {
-	struct tcp_listen_opt *lopt = tp->listen_opt;
-	struct open_request *req, **prev;
+	struct listen_sock *lopt = tp->accept_queue.listen_opt;
+	struct request_sock *req, **prev;
 
 	for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
 	     (req = *prev) != NULL;
 	     prev = &req->dl_next) {
-		if (req->rmt_port == rport &&
-		    req->class->family == AF_INET6 &&
-		    ipv6_addr_equal(&req->af.v6_req.rmt_addr, raddr) &&
-		    ipv6_addr_equal(&req->af.v6_req.loc_addr, laddr) &&
-		    (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) {
+		const struct tcp6_request_sock *treq = tcp6_rsk(req);
+
+		if (inet_rsk(req)->rmt_port == rport &&
+		    req->rsk_ops->family == AF_INET6 &&
+		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
+		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
+		    (!treq->iif || treq->iif == iif)) {
 			BUG_TRAP(req->sk == NULL);
 			*prevp = prev;
 			return req;
@@ -906,9 +908,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	icmpv6_err_convert(type, code, &err);
 
-	/* Might be for an open_request */
+	/* Might be for an request_sock */
 	switch (sk->sk_state) {
-		struct open_request *req, **prev;
+		struct request_sock *req, **prev;
 	case TCP_LISTEN:
 		if (sock_owned_by_user(sk))
 			goto out;
@@ -923,7 +925,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		 */
 		BUG_TRAP(req->sk == NULL);
 
-		if (seq != req->snt_isn) {
+		if (seq != tcp_rsk(req)->snt_isn) {
 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 			goto out;
 		}
@@ -957,9 +959,10 @@ out:
 }
 
 
-static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
+static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 			      struct dst_entry *dst)
 {
+	struct tcp6_request_sock *treq = tcp6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff * skb;
 	struct ipv6_txoptions *opt = NULL;
@@ -969,19 +972,19 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
 
 	memset(&fl, 0, sizeof(fl));
 	fl.proto = IPPROTO_TCP;
-	ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
-	ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+	ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
+	ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
 	fl.fl6_flowlabel = 0;
-	fl.oif = req->af.v6_req.iif;
-	fl.fl_ip_dport = req->rmt_port;
+	fl.oif = treq->iif;
+	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 	fl.fl_ip_sport = inet_sk(sk)->sport;
 
 	if (dst == NULL) {
 		opt = np->opt;
 		if (opt == NULL &&
 		    np->rxopt.bits.srcrt == 2 &&
-		    req->af.v6_req.pktopts) {
-			struct sk_buff *pktopts = req->af.v6_req.pktopts;
+		    treq->pktopts) {
+			struct sk_buff *pktopts = treq->pktopts;
 			struct inet6_skb_parm *rxopt = IP6CB(pktopts);
 			if (rxopt->srcrt)
 				opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt));
@@ -1008,10 +1011,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
 		struct tcphdr *th = skb->h.th;
 
 		th->check = tcp_v6_check(th, skb->len,
-					 &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr,
+					 &treq->loc_addr, &treq->rmt_addr,
 					 csum_partial((char *)th, skb->len, skb->csum));
 
-		ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+		ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
 		err = ip6_xmit(sk, skb, &fl, opt, 0);
 		if (err == NET_XMIT_CN)
 			err = 0;
@@ -1024,17 +1027,18 @@ done:
 	return err;
 }
 
-static void tcp_v6_or_free(struct open_request *req)
+static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
-	if (req->af.v6_req.pktopts)
-		kfree_skb(req->af.v6_req.pktopts);
+	if (tcp6_rsk(req)->pktopts)
+		kfree_skb(tcp6_rsk(req)->pktopts);
 }
 
-static struct or_calltable or_ipv6 = {
+static struct request_sock_ops tcp6_request_sock_ops = {
 	.family		=	AF_INET6,
+	.obj_size	=	sizeof(struct tcp6_request_sock),
 	.rtx_syn_ack	=	tcp_v6_send_synack,
-	.send_ack	=	tcp_v6_or_send_ack,
-	.destructor	=	tcp_v6_or_free,
+	.send_ack	=	tcp_v6_reqsk_send_ack,
+	.destructor	=	tcp_v6_reqsk_destructor,
 	.send_reset	=	tcp_v6_send_reset
 };
 
@@ -1219,15 +1223,15 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	tcp_tw_put(tw);
 }
 
-static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req)
+static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 {
-	tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent);
+	tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent);
 }
 
 
 static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 {
-	struct open_request *req, **prev;
+	struct request_sock *req, **prev;
 	struct tcphdr *th = skb->h.th;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sock *nsk;
@@ -1260,21 +1264,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 	return sk;
 }
 
-static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
+static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->listen_opt;
-	u32 h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
-
-	req->sk = NULL;
-	req->expires = jiffies + TCP_TIMEOUT_INIT;
-	req->retrans = 0;
-	req->dl_next = lopt->syn_table[h];
-
-	write_lock(&tp->syn_wait_lock);
-	lopt->syn_table[h] = req;
-	write_unlock(&tp->syn_wait_lock);
+	struct listen_sock *lopt = tp->accept_queue.listen_opt;
+	u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
+	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
 	tcp_synq_added(sk);
 }
 
@@ -1284,10 +1280,11 @@ static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
  */
 static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp6_request_sock *treq;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct open_request *req = NULL;
+	struct request_sock *req = NULL;
 	__u32 isn = TCP_SKB_CB(skb)->when;
 
 	if (skb->protocol == htons(ETH_P_IP))
@@ -1308,7 +1305,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
 		goto drop;
 
-	req = tcp_openreq_alloc();
+	req = reqsk_alloc(&tcp6_request_sock_ops);
 	if (req == NULL)
 		goto drop;
 
@@ -1321,28 +1318,28 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	req->class = &or_ipv6;
-	ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr);
-	ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr);
+	treq = tcp6_rsk(req);
+	ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
+	ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
 	TCP_ECN_create_request(req, skb->h.th);
-	req->af.v6_req.pktopts = NULL;
+	treq->pktopts = NULL;
 	if (ipv6_opt_accepted(sk, skb) ||
 	    np->rxopt.bits.rxinfo ||
 	    np->rxopt.bits.rxhlim) {
 		atomic_inc(&skb->users);
-		req->af.v6_req.pktopts = skb;
+		treq->pktopts = skb;
 	}
-	req->af.v6_req.iif = sk->sk_bound_dev_if;
+	treq->iif = sk->sk_bound_dev_if;
 
 	/* So that link locals have meaning */
 	if (!sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&req->af.v6_req.rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		req->af.v6_req.iif = tcp_v6_iif(skb);
+	    ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		treq->iif = tcp_v6_iif(skb);
 
 	if (isn == 0)
 		isn = tcp_v6_init_sequence(sk,skb);
 
-	req->snt_isn = isn;
+	tcp_rsk(req)->snt_isn = isn;
 
 	if (tcp_v6_send_synack(sk, req, NULL))
 		goto drop;
@@ -1353,16 +1350,17 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 
 drop:
 	if (req)
-		tcp_openreq_free(req);
+		reqsk_free(req);
 
 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 	return 0; /* don't send reset */
 }
 
 static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
-					  struct open_request *req,
+					  struct request_sock *req,
 					  struct dst_entry *dst)
 {
+	struct tcp6_request_sock *treq = tcp6_rsk(req);
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct tcp6_sock *newtcp6sk;
 	struct inet_sock *newinet;
@@ -1426,10 +1424,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		goto out_overflow;
 
 	if (np->rxopt.bits.srcrt == 2 &&
-	    opt == NULL && req->af.v6_req.pktopts) {
-		struct inet6_skb_parm *rxopt = IP6CB(req->af.v6_req.pktopts);
+	    opt == NULL && treq->pktopts) {
+		struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts);
 		if (rxopt->srcrt)
-			opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt));
+			opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr *)(treq->pktopts->nh.raw + rxopt->srcrt));
 	}
 
 	if (dst == NULL) {
@@ -1438,16 +1436,16 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_TCP;
-		ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+		ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
 		if (opt && opt->srcrt) {
 			struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
 			ipv6_addr_copy(&final, &fl.fl6_dst);
 			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
 			final_p = &final;
 		}
-		ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+		ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
-		fl.fl_ip_dport = req->rmt_port;
+		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 		fl.fl_ip_sport = inet_sk(sk)->sport;
 
 		if (ip6_dst_lookup(sk, &dst, &fl))
@@ -1482,10 +1480,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-	ipv6_addr_copy(&newnp->daddr, &req->af.v6_req.rmt_addr);
-	ipv6_addr_copy(&newnp->saddr, &req->af.v6_req.loc_addr);
-	ipv6_addr_copy(&newnp->rcv_saddr, &req->af.v6_req.loc_addr);
-	newsk->sk_bound_dev_if = req->af.v6_req.iif;
+	ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr);
+	ipv6_addr_copy(&newnp->saddr, &treq->loc_addr);
+	ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr);
+	newsk->sk_bound_dev_if = treq->iif;
 
 	/* Now IPv6 options...
 
@@ -1498,11 +1496,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	/* Clone pktoptions received with SYN */
 	newnp->pktoptions = NULL;
-	if (req->af.v6_req.pktopts) {
-		newnp->pktoptions = skb_clone(req->af.v6_req.pktopts,
-					      GFP_ATOMIC);
-		kfree_skb(req->af.v6_req.pktopts);
-		req->af.v6_req.pktopts = NULL;
+	if (treq->pktopts != NULL) {
+		newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
+		kfree_skb(treq->pktopts);
+		treq->pktopts = NULL;
 		if (newnp->pktoptions)
 			skb_set_owner_r(newnp->pktoptions, newsk);
 	}
@@ -2021,14 +2018,14 @@ static int tcp_v6_init_sock(struct sock *sk)
 	 */
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache_std = tp->mss_cache = 536;
+	tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
 
 	sk->sk_state = TCP_CLOSE;
 
 	tp->af_specific = &ipv6_specific;
-
+	tp->ca_ops = &tcp_init_congestion_ops;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
@@ -2050,7 +2047,7 @@ static int tcp_v6_destroy_sock(struct sock *sk)
 
 /* Proc filesystem TCPv6 sock list dumping. */
 static void get_openreq6(struct seq_file *seq,
-			 struct sock *sk, struct open_request *req, int i, int uid)
+			 struct sock *sk, struct request_sock *req, int i, int uid)
 {
 	struct in6_addr *dest, *src;
 	int ttd = req->expires - jiffies;
@@ -2058,8 +2055,8 @@ static void get_openreq6(struct seq_file *seq,
 	if (ttd < 0)
 		ttd = 0;
 
-	src = &req->af.v6_req.loc_addr;
-	dest = &req->af.v6_req.rmt_addr;
+	src = &tcp6_rsk(req)->loc_addr;
+	dest = &tcp6_rsk(req)->rmt_addr;
 	seq_printf(seq,
 		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -2069,7 +2066,7 @@ static void get_openreq6(struct seq_file *seq,
 		   ntohs(inet_sk(sk)->sport),
 		   dest->s6_addr32[0], dest->s6_addr32[1],
 		   dest->s6_addr32[2], dest->s6_addr32[3],
-		   ntohs(req->rmt_port),
+		   ntohs(inet_rsk(req)->rmt_port),
 		   TCP_SYN_RECV,
 		   0,0, /* could print option size, but that is af dependent. */
 		   1,   /* timers active (only the expire timer) */
@@ -2239,6 +2236,7 @@ struct proto tcpv6_prot = {
2239 .sysctl_rmem = sysctl_tcp_rmem, 2236 .sysctl_rmem = sysctl_tcp_rmem,
2240 .max_header = MAX_TCP_HEADER, 2237 .max_header = MAX_TCP_HEADER,
2241 .obj_size = sizeof(struct tcp6_sock), 2238 .obj_size = sizeof(struct tcp6_sock),
2239 .rsk_prot = &tcp6_request_sock_ops,
2242}; 2240};
2243 2241
2244static struct inet6_protocol tcpv6_protocol = { 2242static struct inet6_protocol tcpv6_protocol = {
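Note: the tcp_v6 hunks above complete the conversion from the old open_request union (req->af.v6_req) to the generic struct request_sock with typed downcasts (inet_rsk(), tcp6_rsk()), and register the per-family request table through .rsk_prot. A minimal sketch of that downcast pattern, assuming the 2.6-era convention that each more specific request structure embeds the generic one as its first member; field names here are illustrative, not the exact kernel definitions:

        #include <stdint.h>

        struct request_sock {                   /* generic, protocol-independent part */
                struct request_sock *dl_next;
                unsigned long        expires;
        };

        struct inet_request_sock {              /* IPv4/IPv6 common part */
                struct request_sock  req;       /* must stay first for the cast */
                uint16_t             rmt_port;
        };

        struct tcp6_request_sock {              /* TCPv6-specific part */
                struct inet_request_sock ireq;  /* must stay first for the cast */
                int                  iif;
        };

        /* Downcasts are plain pointer casts, valid because of the embedding. */
        static inline struct inet_request_sock *inet_rsk(struct request_sock *req)
        {
                return (struct inet_request_sock *)req;
        }

        static inline struct tcp6_request_sock *tcp6_rsk(struct request_sock *req)
        {
                return (struct tcp6_request_sock *)req;
        }

With this layout, tcp6_rsk(req)->iif is a zero-cost cast rather than a union member lookup, and the generic code never needs to know any per-family layout.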
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e251d0ba4f39..eff050ac7049 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -300,12 +300,12 @@ csum_copy_err:
300 /* Clear queue. */ 300 /* Clear queue. */
301 if (flags&MSG_PEEK) { 301 if (flags&MSG_PEEK) {
302 int clear = 0; 302 int clear = 0;
303 spin_lock_irq(&sk->sk_receive_queue.lock); 303 spin_lock_bh(&sk->sk_receive_queue.lock);
304 if (skb == skb_peek(&sk->sk_receive_queue)) { 304 if (skb == skb_peek(&sk->sk_receive_queue)) {
305 __skb_unlink(skb, &sk->sk_receive_queue); 305 __skb_unlink(skb, &sk->sk_receive_queue);
306 clear = 1; 306 clear = 1;
307 } 307 }
308 spin_unlock_irq(&sk->sk_receive_queue.lock); 308 spin_unlock_bh(&sk->sk_receive_queue.lock);
309 if (clear) 309 if (clear)
310 kfree_skb(skb); 310 kfree_skb(skb);
311 } 311 }
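Note: the udp.c hunk relaxes the receive-queue locking from spin_lock_irq() to spin_lock_bh(). The enqueue side of a UDP socket's receive queue runs in softirq context rather than hard-IRQ context, so disabling bottom halves is sufficient and cheaper than disabling interrupts. A sketch of the peek-then-unlink idiom under the BH lock, assuming kernel headers and the two-argument __skb_unlink() introduced in this same merge:

        #include <linux/skbuff.h>
        #include <net/sock.h>

        /* Drop a peeked skb from the head of the receive queue if it is
         * still there; a sketch, assuming softirq-only writers. */
        static void drop_peeked_skb(struct sock *sk, struct sk_buff *skb)
        {
                int unlinked = 0;

                spin_lock_bh(&sk->sk_receive_queue.lock);
                if (skb == skb_peek(&sk->sk_receive_queue)) {
                        __skb_unlink(skb, &sk->sk_receive_queue);
                        unlinked = 1;
                }
                spin_unlock_bh(&sk->sk_receive_queue.lock);

                if (unlinked)
                        kfree_skb(skb);         /* free outside the lock */
        }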
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index ffcadd68b951..60c26c87277e 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -466,7 +466,7 @@ static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
466 return; 466 return;
467} 467}
468 468
469static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args) 469static int xfrm6_tunnel_init_state(struct xfrm_state *x)
470{ 470{
471 if (!x->props.mode) 471 if (!x->props.mode)
472 return -EINVAL; 472 return -EINVAL;
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index a16237c0e783..980a826f5d02 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -1,6 +1,39 @@
1# 1#
2# IPX configuration 2# IPX configuration
3# 3#
4config IPX
5 tristate "The IPX protocol"
6 select LLC
7 ---help---
8 This is support for the Novell networking protocol, IPX, commonly
9 used for local networks of Windows machines. You need it if you
10 want to access Novell NetWare file or print servers using the Linux
11 Novell client ncpfs (available from
12 <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
13 within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>). In order
15 to do the former, you'll also have to say Y to "NCP file system
16 support", below.
17
18 IPX is similar in scope to IP, while SPX, which runs on top of IPX,
19 is similar to TCP. There is also experimental support for SPX in
20 Linux (see "SPX networking", below).
21
22 To turn your Linux box into a fully featured NetWare file server and
23 IPX router, say Y here and fetch either lwared from
24 <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
25 mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
26 information, read the IPX-HOWTO available from
27 <http://www.tldp.org/docs.html#howto>.
28
29 General information about how to connect Linux, Windows machines and
30 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
31
32 The IPX driver would enlarge your kernel by about 16 KB. To compile
33 this driver as a module, choose M here: the module will be called ipx.
34 Unless you want to integrate your Linux box with a local Novell
35 network, say N.
36
4config IPX_INTERN 37config IPX_INTERN
5 bool "IPX: Full internal IPX network" 38 bool "IPX: Full internal IPX network"
6 depends on IPX 39 depends on IPX
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
index 046ad0750e48..7029618f5719 100644
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self)
445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;); 445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
446 446
447 /* Don't disconnect until all data frames are successfully sent */ 447 /* Don't disconnect until all data frames are successfully sent */
448 if (skb_queue_len(&self->txq) > 0) { 448 if (!skb_queue_empty(&self->txq)) {
449 self->disconnect_pending = TRUE; 449 self->disconnect_pending = TRUE;
450
451 return; 450 return;
452 } 451 }
453 452
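Note: this hunk and the irlap_event.c, irlap_frame.c, irttp.c, llc and netlink hunks that follow all replace skb_queue_len(&q) > 0 with !skb_queue_empty(&q). Both are lockless reads, but skb_queue_empty() only compares the list head's next pointer against the head itself, while skb_queue_len() reads the separately maintained qlen counter; the emptiness test states the intent directly and does not depend on the counter. A minimal model of the two predicates, assuming the circular-list layout of sk_buff_head:

        /* Simplified model of the sk_buff_head list header: when the queue
         * is empty, next points back at the head itself. */
        struct skb_head_model {
                struct skb_head_model *next;
                struct skb_head_model *prev;
                unsigned int           qlen;
        };

        static inline int model_queue_empty(const struct skb_head_model *list)
        {
                return list->next == list;      /* what skb_queue_empty() checks */
        }

        static inline unsigned int model_queue_len(const struct skb_head_model *list)
        {
                return list->qlen;              /* what skb_queue_len() returns */
        }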
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
index 1cd89f5f3b75..a505b5457608 100644
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
191 * Send out the RR frames faster if our own transmit queue is empty, or 191 * Send out the RR frames faster if our own transmit queue is empty, or
192 * if the peer is busy. The effect is a much faster conversation 192 * if the peer is busy. The effect is a much faster conversation
193 */ 193 */
194 if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) { 194 if (skb_queue_empty(&self->txq) || self->remote_busy) {
195 if (self->fast_RR == TRUE) { 195 if (self->fast_RR == TRUE) {
196 /* 196 /*
197 * Assert that the fast poll timer has not reached the 197 * Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, 263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
264 skb_queue_len(&self->txq)); 264 skb_queue_len(&self->txq));
265 265
266 if (skb_queue_len(&self->txq)) { 266 if (!skb_queue_empty(&self->txq)) {
267 /* Prevent race conditions with irlap_data_request() */ 267 /* Prevent race conditions with irlap_data_request() */
268 self->local_busy = TRUE; 268 self->local_busy = TRUE;
269 269
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1075 /* Window has been adjusted for the max packet 1075 /* Window has been adjusted for the max packet
1076 * size, so much simpler... - Jean II */ 1076 * size, so much simpler... - Jean II */
1077 nextfit = (skb_queue_len(&self->txq) > 0); 1077 nextfit = !skb_queue_empty(&self->txq);
1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1079 /* 1079 /*
1080 * Send data with poll bit cleared only if window > 1 1080 * Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1815 /* Window has been adjusted for the max packet 1815 /* Window has been adjusted for the max packet
1816 * size, so much simpler... - Jean II */ 1816 * size, so much simpler... - Jean II */
1817 nextfit = (skb_queue_len(&self->txq) > 0); 1817 nextfit = !skb_queue_empty(&self->txq);
1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1819 /* 1819 /*
1820 * Send data with final bit cleared only if window > 1 1820 * Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
1937 irlap_data_indication(self, skb, FALSE); 1937 irlap_data_indication(self, skb, FALSE);
1938 1938
1939 /* Any pending data requests? */ 1939 /* Any pending data requests? */
1940 if ((skb_queue_len(&self->txq) > 0) && 1940 if (!skb_queue_empty(&self->txq) &&
1941 (self->window > 0)) 1941 (self->window > 0))
1942 { 1942 {
1943 self->ack_required = TRUE; 1943 self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2038 /* 2038 /*
2039 * Any pending data requests? 2039 * Any pending data requests?
2040 */ 2040 */
2041 if ((skb_queue_len(&self->txq) > 0) && 2041 if (!skb_queue_empty(&self->txq) &&
2042 (self->window > 0) && !self->remote_busy) 2042 (self->window > 0) && !self->remote_busy)
2043 { 2043 {
2044 irlap_data_indication(self, skb, TRUE); 2044 irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2069 */ 2069 */
2070 nr_status = irlap_validate_nr_received(self, info->nr); 2070 nr_status = irlap_validate_nr_received(self, info->nr);
2071 if (nr_status == NR_EXPECTED) { 2071 if (nr_status == NR_EXPECTED) {
2072 if ((skb_queue_len( &self->txq) > 0) && 2072 if (!skb_queue_empty(&self->txq) &&
2073 (self->window > 0)) { 2073 (self->window > 0)) {
2074 self->remote_busy = FALSE; 2074 self->remote_busy = FALSE;
2075 2075
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 040abe714aa3..6dafbb43b529 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1018 /* 1018 /*
1019 * We can now fill the window with additional data frames 1019 * We can now fill the window with additional data frames
1020 */ 1020 */
1021 while (skb_queue_len( &self->txq) > 0) { 1021 while (!skb_queue_empty(&self->txq)) {
1022 1022
1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__); 1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
1024 if ((skb_queue_len( &self->txq) > 0) && 1024 if (self->window > 0) {
1025 (self->window > 0)) {
1026 skb = skb_dequeue( &self->txq); 1025 skb = skb_dequeue( &self->txq);
1027 IRDA_ASSERT(skb != NULL, return;); 1026 IRDA_ASSERT(skb != NULL, return;);
1028 1027
@@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1031 * bit cleared 1030 * bit cleared
1032 */ 1031 */
1033 if ((self->window > 1) && 1032 if ((self->window > 1) &&
1034 skb_queue_len(&self->txq) > 0) 1033 !skb_queue_empty(&self->txq)) {
1035 {
1036 irlap_send_data_primary(self, skb); 1034 irlap_send_data_primary(self, skb);
1037 } else { 1035 } else {
1038 irlap_send_data_primary_poll(self, skb); 1036 irlap_send_data_primary_poll(self, skb);
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index d091ccf773b3..6602d901f8b1 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
1513 /* 1513 /*
1514 * Check if there is still data segments in the transmit queue 1514 * Check if there is still data segments in the transmit queue
1515 */ 1515 */
1516 if (skb_queue_len(&self->tx_queue) > 0) { 1516 if (!skb_queue_empty(&self->tx_queue)) {
1517 if (priority == P_HIGH) { 1517 if (priority == P_HIGH) {
1518 /* 1518 /*
1519 * No need to send the queued data, if we are 1519 * No need to send the queued data, if we are
diff --git a/net/key/af_key.c b/net/key/af_key.c
index ce980aa94ed8..4879743b945a 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -656,13 +656,18 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
656 sa->sadb_sa_exttype = SADB_EXT_SA; 656 sa->sadb_sa_exttype = SADB_EXT_SA;
657 sa->sadb_sa_spi = x->id.spi; 657 sa->sadb_sa_spi = x->id.spi;
658 sa->sadb_sa_replay = x->props.replay_window; 658 sa->sadb_sa_replay = x->props.replay_window;
659 sa->sadb_sa_state = SADB_SASTATE_DYING; 659 switch (x->km.state) {
660 if (x->km.state == XFRM_STATE_VALID && !x->km.dying) 660 case XFRM_STATE_VALID:
661 sa->sadb_sa_state = SADB_SASTATE_MATURE; 661 sa->sadb_sa_state = x->km.dying ?
662 else if (x->km.state == XFRM_STATE_ACQ) 662 SADB_SASTATE_DYING : SADB_SASTATE_MATURE;
663 break;
664 case XFRM_STATE_ACQ:
663 sa->sadb_sa_state = SADB_SASTATE_LARVAL; 665 sa->sadb_sa_state = SADB_SASTATE_LARVAL;
664 else if (x->km.state == XFRM_STATE_EXPIRED) 666 break;
667 default:
665 sa->sadb_sa_state = SADB_SASTATE_DEAD; 668 sa->sadb_sa_state = SADB_SASTATE_DEAD;
669 break;
670 }
666 sa->sadb_sa_auth = 0; 671 sa->sadb_sa_auth = 0;
667 if (x->aalg) { 672 if (x->aalg) {
668 struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); 673 struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
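Note: the hunk above replaces an if/else chain that defaulted sadb_sa_state to DYING with a switch on x->km.state, making the VALID-but-dying case explicit and sending every unknown state to DEAD. The same mapping as a standalone helper; a sketch, with illustrative enum values standing in for the real XFRM and PF_KEYv2 constants:

        #include <stdint.h>

        enum { XFRM_STATE_VALID, XFRM_STATE_ACQ, XFRM_STATE_EXPIRED };
        enum { SADB_SASTATE_LARVAL, SADB_SASTATE_MATURE,
               SADB_SASTATE_DYING, SADB_SASTATE_DEAD };

        static uint8_t sadb_state(int km_state, int dying)
        {
                switch (km_state) {
                case XFRM_STATE_VALID:
                        return dying ? SADB_SASTATE_DYING : SADB_SASTATE_MATURE;
                case XFRM_STATE_ACQ:
                        return SADB_SASTATE_LARVAL;
                default:                        /* EXPIRED and anything else */
                        return SADB_SASTATE_DEAD;
                }
        }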
@@ -685,6 +690,8 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
685 sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; 690 sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
686 if (x->props.flags & XFRM_STATE_DECAP_DSCP) 691 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
687 sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; 692 sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
693 if (x->props.flags & XFRM_STATE_NOPMTUDISC)
694 sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
688 695
689 /* hard time */ 696 /* hard time */
690 if (hsc & 2) { 697 if (hsc & 2) {
@@ -969,6 +976,8 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
969 x->props.flags |= XFRM_STATE_NOECN; 976 x->props.flags |= XFRM_STATE_NOECN;
970 if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) 977 if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP)
971 x->props.flags |= XFRM_STATE_DECAP_DSCP; 978 x->props.flags |= XFRM_STATE_DECAP_DSCP;
979 if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
980 x->props.flags |= XFRM_STATE_NOPMTUDISC;
972 981
973 lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1]; 982 lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
974 if (lifetime != NULL) { 983 if (lifetime != NULL) {
@@ -1091,17 +1100,11 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
1091 } 1100 }
1092 } 1101 }
1093 1102
1094 x->type = xfrm_get_type(proto, x->props.family); 1103 err = xfrm_init_state(x);
1095 if (x->type == NULL) { 1104 if (err)
1096 err = -ENOPROTOOPT;
1097 goto out;
1098 }
1099 if (x->type->init_state(x, NULL)) {
1100 err = -EINVAL;
1101 goto out; 1105 goto out;
1102 } 1106
1103 x->km.seq = hdr->sadb_msg_seq; 1107 x->km.seq = hdr->sadb_msg_seq;
1104 x->km.state = XFRM_STATE_VALID;
1105 return x; 1108 return x;
1106 1109
1107out: 1110out:
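Note: this hunk folds the open-coded sequence (xfrm_get_type() lookup, the type's init_state() hook, then marking the state XFRM_STATE_VALID) into a single xfrm_init_state() call, so every keying interface shares one initialization path. A sketch of such a consolidated helper, under the assumption that it performs the same three steps as the code it replaces; this is not the literal kernel function:

        /* Hypothetical consolidation of the removed open-coded sequence. */
        static int init_state_sketch(struct xfrm_state *x)
        {
                x->type = xfrm_get_type(x->id.proto, x->props.family);
                if (x->type == NULL)
                        return -ENOPROTOOPT;

                if (x->type->init_state(x))     /* argument list per the new API */
                        return -EINVAL;

                x->km.state = XFRM_STATE_VALID;
                return 0;
        }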
@@ -1240,13 +1243,78 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
1240 return 0; 1243 return 0;
1241} 1244}
1242 1245
1246static inline int event2poltype(int event)
1247{
1248 switch (event) {
1249 case XFRM_MSG_DELPOLICY:
1250 return SADB_X_SPDDELETE;
1251 case XFRM_MSG_NEWPOLICY:
1252 return SADB_X_SPDADD;
1253 case XFRM_MSG_UPDPOLICY:
1254 return SADB_X_SPDUPDATE;
1255 case XFRM_MSG_POLEXPIRE:
1256 // return SADB_X_SPDEXPIRE;
1257 default:
1258 printk("pfkey: Unknown policy event %d\n", event);
1259 break;
1260 }
1261
1262 return 0;
1263}
1264
1265static inline int event2keytype(int event)
1266{
1267 switch (event) {
1268 case XFRM_MSG_DELSA:
1269 return SADB_DELETE;
1270 case XFRM_MSG_NEWSA:
1271 return SADB_ADD;
1272 case XFRM_MSG_UPDSA:
1273 return SADB_UPDATE;
1274 case XFRM_MSG_EXPIRE:
1275 return SADB_EXPIRE;
1276 default:
1277 printk("pfkey: Unknown SA event %d\n", event);
1278 break;
1279 }
1280
1281 return 0;
1282}
1283
1284/* ADD/UPD/DEL */
1285static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
1286{
1287 struct sk_buff *skb;
1288 struct sadb_msg *hdr;
1289 int hsc = 3;
1290
1291 if (c->event == XFRM_MSG_DELSA)
1292 hsc = 0;
1293
1294 skb = pfkey_xfrm_state2msg(x, 0, hsc);
1295
1296 if (IS_ERR(skb))
1297 return PTR_ERR(skb);
1298
1299 hdr = (struct sadb_msg *) skb->data;
1300 hdr->sadb_msg_version = PF_KEY_V2;
1301 hdr->sadb_msg_type = event2keytype(c->event);
1302 hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
1303 hdr->sadb_msg_errno = 0;
1304 hdr->sadb_msg_reserved = 0;
1305 hdr->sadb_msg_seq = c->seq;
1306 hdr->sadb_msg_pid = c->pid;
1307
1308 pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
1309
1310 return 0;
1311}
1243 1312
1244static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1313static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1245{ 1314{
1246 struct sk_buff *out_skb;
1247 struct sadb_msg *out_hdr;
1248 struct xfrm_state *x; 1315 struct xfrm_state *x;
1249 int err; 1316 int err;
1317 struct km_event c;
1250 1318
1251 xfrm_probe_algs(); 1319 xfrm_probe_algs();
1252 1320
@@ -1254,6 +1322,7 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
1254 if (IS_ERR(x)) 1322 if (IS_ERR(x))
1255 return PTR_ERR(x); 1323 return PTR_ERR(x);
1256 1324
1325 xfrm_state_hold(x);
1257 if (hdr->sadb_msg_type == SADB_ADD) 1326 if (hdr->sadb_msg_type == SADB_ADD)
1258 err = xfrm_state_add(x); 1327 err = xfrm_state_add(x);
1259 else 1328 else
@@ -1262,30 +1331,26 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
1262 if (err < 0) { 1331 if (err < 0) {
1263 x->km.state = XFRM_STATE_DEAD; 1332 x->km.state = XFRM_STATE_DEAD;
1264 xfrm_state_put(x); 1333 xfrm_state_put(x);
1265 return err; 1334 goto out;
1266 } 1335 }
1267 1336
1268 out_skb = pfkey_xfrm_state2msg(x, 0, 3); 1337 if (hdr->sadb_msg_type == SADB_ADD)
1269 if (IS_ERR(out_skb)) 1338 c.event = XFRM_MSG_NEWSA;
1270 return PTR_ERR(out_skb); /* XXX Should we return 0 here ? */ 1339 else
1271 1340 c.event = XFRM_MSG_UPDSA;
1272 out_hdr = (struct sadb_msg *) out_skb->data; 1341 c.seq = hdr->sadb_msg_seq;
1273 out_hdr->sadb_msg_version = hdr->sadb_msg_version; 1342 c.pid = hdr->sadb_msg_pid;
1274 out_hdr->sadb_msg_type = hdr->sadb_msg_type; 1343 km_state_notify(x, &c);
1275 out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); 1344out:
1276 out_hdr->sadb_msg_errno = 0; 1345 xfrm_state_put(x);
1277 out_hdr->sadb_msg_reserved = 0; 1346 return err;
1278 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
1279 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
1280
1281 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
1282
1283 return 0;
1284} 1347}
1285 1348
1286static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1349static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1287{ 1350{
1288 struct xfrm_state *x; 1351 struct xfrm_state *x;
1352 struct km_event c;
1353 int err;
1289 1354
1290 if (!ext_hdrs[SADB_EXT_SA-1] || 1355 if (!ext_hdrs[SADB_EXT_SA-1] ||
1291 !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 1356 !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
@@ -1301,13 +1366,19 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
1301 return -EPERM; 1366 return -EPERM;
1302 } 1367 }
1303 1368
1304 xfrm_state_delete(x); 1369 err = xfrm_state_delete(x);
1305 xfrm_state_put(x); 1370 if (err < 0) {
1371 xfrm_state_put(x);
1372 return err;
1373 }
1306 1374
1307 pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, 1375 c.seq = hdr->sadb_msg_seq;
1308 BROADCAST_ALL, sk); 1376 c.pid = hdr->sadb_msg_pid;
1377 c.event = XFRM_MSG_DELSA;
1378 km_state_notify(x, &c);
1379 xfrm_state_put(x);
1309 1380
1310 return 0; 1381 return err;
1311} 1382}
1312 1383
1313static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1384static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
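Note: the rewritten pfkey_add() and pfkey_delete() above no longer format and broadcast their own replies; they fill a struct km_event with the event type, sequence number and pid, then call km_state_notify(), which fans the event out to every registered key manager (key_notify_sa() earlier in this file is the PF_KEY handler on the receiving end). A sketch of that fan-out, assuming a simple singly linked list of managers; all names here are illustrative:

        #include <stdint.h>

        struct xfrm_state;                      /* opaque for this sketch */

        struct km_event_sketch {
                int      event;
                uint32_t seq;
                uint32_t pid;
        };

        struct mgr_sketch {
                struct mgr_sketch *next;
                int (*notify)(struct xfrm_state *x, struct km_event_sketch *c);
        };

        static struct mgr_sketch *mgr_list;

        static void state_notify_sketch(struct xfrm_state *x,
                                        struct km_event_sketch *c)
        {
                struct mgr_sketch *km;

                /* Each manager formats and delivers its own message. */
                for (km = mgr_list; km != NULL; km = km->next)
                        km->notify(x, c);
        }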
@@ -1445,28 +1516,42 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg
1445 return 0; 1516 return 0;
1446} 1517}
1447 1518
1519static int key_notify_sa_flush(struct km_event *c)
1520{
1521 struct sk_buff *skb;
1522 struct sadb_msg *hdr;
1523
1524 skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
1525 if (!skb)
1526 return -ENOBUFS;
1527 hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
1528 hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto);
1529 hdr->sadb_msg_seq = c->seq;
1530 hdr->sadb_msg_pid = c->pid;
1531 hdr->sadb_msg_version = PF_KEY_V2;
1532 hdr->sadb_msg_errno = (uint8_t) 0;
1533 hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
1534
1535 pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
1536
1537 return 0;
1538}
1539
1448static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1540static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1449{ 1541{
1450 unsigned proto; 1542 unsigned proto;
1451 struct sk_buff *skb_out; 1543 struct km_event c;
1452 struct sadb_msg *hdr_out;
1453 1544
1454 proto = pfkey_satype2proto(hdr->sadb_msg_satype); 1545 proto = pfkey_satype2proto(hdr->sadb_msg_satype);
1455 if (proto == 0) 1546 if (proto == 0)
1456 return -EINVAL; 1547 return -EINVAL;
1457 1548
1458 skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
1459 if (!skb_out)
1460 return -ENOBUFS;
1461
1462 xfrm_state_flush(proto); 1549 xfrm_state_flush(proto);
1463 1550 c.data.proto = proto;
1464 hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg)); 1551 c.seq = hdr->sadb_msg_seq;
1465 pfkey_hdr_dup(hdr_out, hdr); 1552 c.pid = hdr->sadb_msg_pid;
1466 hdr_out->sadb_msg_errno = (uint8_t) 0; 1553 c.event = XFRM_MSG_FLUSHSA;
1467 hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); 1554 km_state_notify(NULL, &c);
1468
1469 pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
1470 1555
1471 return 0; 1556 return 0;
1472} 1557}
@@ -1859,6 +1944,35 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
1859 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); 1944 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
1860} 1945}
1861 1946
1947static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
1948{
1949 struct sk_buff *out_skb;
1950 struct sadb_msg *out_hdr;
1951 int err;
1952
1953 out_skb = pfkey_xfrm_policy2msg_prep(xp);
1954 if (IS_ERR(out_skb)) {
1955 err = PTR_ERR(out_skb);
1956 goto out;
1957 }
1958 pfkey_xfrm_policy2msg(out_skb, xp, dir);
1959
1960 out_hdr = (struct sadb_msg *) out_skb->data;
1961 out_hdr->sadb_msg_version = PF_KEY_V2;
1962
1963 if (c->data.byid && c->event == XFRM_MSG_DELPOLICY)
1964 out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
1965 else
1966 out_hdr->sadb_msg_type = event2poltype(c->event);
1967 out_hdr->sadb_msg_errno = 0;
1968 out_hdr->sadb_msg_seq = c->seq;
1969 out_hdr->sadb_msg_pid = c->pid;
1970 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
1971out:
1972 return 0;
1973
1974}
1975
1862static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 1976static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1863{ 1977{
1864 int err; 1978 int err;
@@ -1866,8 +1980,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
1866 struct sadb_address *sa; 1980 struct sadb_address *sa;
1867 struct sadb_x_policy *pol; 1981 struct sadb_x_policy *pol;
1868 struct xfrm_policy *xp; 1982 struct xfrm_policy *xp;
1869 struct sk_buff *out_skb; 1983 struct km_event c;
1870 struct sadb_msg *out_hdr;
1871 1984
1872 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 1985 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
1873 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 1986 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -1935,31 +2048,23 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
1935 (err = parse_ipsecrequests(xp, pol)) < 0) 2048 (err = parse_ipsecrequests(xp, pol)) < 0)
1936 goto out; 2049 goto out;
1937 2050
1938 out_skb = pfkey_xfrm_policy2msg_prep(xp);
1939 if (IS_ERR(out_skb)) {
1940 err = PTR_ERR(out_skb);
1941 goto out;
1942 }
1943
1944 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, 2051 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
1945 hdr->sadb_msg_type != SADB_X_SPDUPDATE); 2052 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
1946 if (err) { 2053 if (err) {
1947 kfree_skb(out_skb); 2054 kfree(xp);
1948 goto out; 2055 return err;
1949 } 2056 }
1950 2057
1951 pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1); 2058 if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
2059 c.event = XFRM_MSG_UPDPOLICY;
2060 else
2061 c.event = XFRM_MSG_NEWPOLICY;
1952 2062
1953 xfrm_pol_put(xp); 2063 c.seq = hdr->sadb_msg_seq;
2064 c.pid = hdr->sadb_msg_pid;
1954 2065
1955 out_hdr = (struct sadb_msg *) out_skb->data; 2066 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
1956 out_hdr->sadb_msg_version = hdr->sadb_msg_version; 2067 xfrm_pol_put(xp);
1957 out_hdr->sadb_msg_type = hdr->sadb_msg_type;
1958 out_hdr->sadb_msg_satype = 0;
1959 out_hdr->sadb_msg_errno = 0;
1960 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
1961 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
1962 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
1963 return 0; 2068 return 0;
1964 2069
1965out: 2070out:
@@ -1973,9 +2078,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
1973 struct sadb_address *sa; 2078 struct sadb_address *sa;
1974 struct sadb_x_policy *pol; 2079 struct sadb_x_policy *pol;
1975 struct xfrm_policy *xp; 2080 struct xfrm_policy *xp;
1976 struct sk_buff *out_skb;
1977 struct sadb_msg *out_hdr;
1978 struct xfrm_selector sel; 2081 struct xfrm_selector sel;
2082 struct km_event c;
1979 2083
1980 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 2084 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
1981 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 2085 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2010,25 +2114,40 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
2010 2114
2011 err = 0; 2115 err = 0;
2012 2116
2117 c.seq = hdr->sadb_msg_seq;
2118 c.pid = hdr->sadb_msg_pid;
2119 c.event = XFRM_MSG_DELPOLICY;
2120 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
2121
2122 xfrm_pol_put(xp);
2123 return err;
2124}
2125
2126static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, struct sadb_msg *hdr, int dir)
2127{
2128 int err;
2129 struct sk_buff *out_skb;
2130 struct sadb_msg *out_hdr;
2131 err = 0;
2132
2013 out_skb = pfkey_xfrm_policy2msg_prep(xp); 2133 out_skb = pfkey_xfrm_policy2msg_prep(xp);
2014 if (IS_ERR(out_skb)) { 2134 if (IS_ERR(out_skb)) {
2015 err = PTR_ERR(out_skb); 2135 err = PTR_ERR(out_skb);
2016 goto out; 2136 goto out;
2017 } 2137 }
2018 pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1); 2138 pfkey_xfrm_policy2msg(out_skb, xp, dir);
2019 2139
2020 out_hdr = (struct sadb_msg *) out_skb->data; 2140 out_hdr = (struct sadb_msg *) out_skb->data;
2021 out_hdr->sadb_msg_version = hdr->sadb_msg_version; 2141 out_hdr->sadb_msg_version = hdr->sadb_msg_version;
2022 out_hdr->sadb_msg_type = SADB_X_SPDDELETE; 2142 out_hdr->sadb_msg_type = hdr->sadb_msg_type;
2023 out_hdr->sadb_msg_satype = 0; 2143 out_hdr->sadb_msg_satype = 0;
2024 out_hdr->sadb_msg_errno = 0; 2144 out_hdr->sadb_msg_errno = 0;
2025 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; 2145 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
2026 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; 2146 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
2027 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk); 2147 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk);
2028 err = 0; 2148 err = 0;
2029 2149
2030out: 2150out:
2031 xfrm_pol_put(xp);
2032 return err; 2151 return err;
2033} 2152}
2034 2153
@@ -2037,8 +2156,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2037 int err; 2156 int err;
2038 struct sadb_x_policy *pol; 2157 struct sadb_x_policy *pol;
2039 struct xfrm_policy *xp; 2158 struct xfrm_policy *xp;
2040 struct sk_buff *out_skb; 2159 struct km_event c;
2041 struct sadb_msg *out_hdr;
2042 2160
2043 if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) 2161 if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
2044 return -EINVAL; 2162 return -EINVAL;
@@ -2050,24 +2168,16 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2050 2168
2051 err = 0; 2169 err = 0;
2052 2170
2053 out_skb = pfkey_xfrm_policy2msg_prep(xp); 2171 c.seq = hdr->sadb_msg_seq;
2054 if (IS_ERR(out_skb)) { 2172 c.pid = hdr->sadb_msg_pid;
2055 err = PTR_ERR(out_skb); 2173 if (hdr->sadb_msg_type == SADB_X_SPDDELETE2) {
2056 goto out; 2174 c.data.byid = 1;
2175 c.event = XFRM_MSG_DELPOLICY;
2176 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
2177 } else {
2178 err = key_pol_get_resp(sk, xp, hdr, pol->sadb_x_policy_dir-1);
2057 } 2179 }
2058 pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
2059 2180
2060 out_hdr = (struct sadb_msg *) out_skb->data;
2061 out_hdr->sadb_msg_version = hdr->sadb_msg_version;
2062 out_hdr->sadb_msg_type = hdr->sadb_msg_type;
2063 out_hdr->sadb_msg_satype = 0;
2064 out_hdr->sadb_msg_errno = 0;
2065 out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
2066 out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
2067 pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
2068 err = 0;
2069
2070out:
2071 xfrm_pol_put(xp); 2181 xfrm_pol_put(xp);
2072 return err; 2182 return err;
2073} 2183}
@@ -2102,22 +2212,34 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
2102 return xfrm_policy_walk(dump_sp, &data); 2212 return xfrm_policy_walk(dump_sp, &data);
2103} 2213}
2104 2214
2105static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 2215static int key_notify_policy_flush(struct km_event *c)
2106{ 2216{
2107 struct sk_buff *skb_out; 2217 struct sk_buff *skb_out;
2108 struct sadb_msg *hdr_out; 2218 struct sadb_msg *hdr;
2109 2219
2110 skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); 2220 skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
2111 if (!skb_out) 2221 if (!skb_out)
2112 return -ENOBUFS; 2222 return -ENOBUFS;
2223 hdr = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
2224 hdr->sadb_msg_seq = c->seq;
2225 hdr->sadb_msg_pid = c->pid;
2226 hdr->sadb_msg_version = PF_KEY_V2;
2227 hdr->sadb_msg_errno = (uint8_t) 0;
2228 hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
2229 pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL);
2230 return 0;
2113 2231
2114 xfrm_policy_flush(); 2232}
2115 2233
2116 hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg)); 2234static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
2117 pfkey_hdr_dup(hdr_out, hdr); 2235{
2118 hdr_out->sadb_msg_errno = (uint8_t) 0; 2236 struct km_event c;
2119 hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); 2237
2120 pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL); 2238 xfrm_policy_flush();
2239 c.event = XFRM_MSG_FLUSHPOLICY;
2240 c.pid = hdr->sadb_msg_pid;
2241 c.seq = hdr->sadb_msg_seq;
2242 km_policy_notify(NULL, 0, &c);
2121 2243
2122 return 0; 2244 return 0;
2123} 2245}
@@ -2317,11 +2439,23 @@ static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
2317 } 2439 }
2318} 2440}
2319 2441
2320static int pfkey_send_notify(struct xfrm_state *x, int hard) 2442static int key_notify_policy_expire(struct xfrm_policy *xp, struct km_event *c)
2443{
2444 return 0;
2445}
2446
2447static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
2321{ 2448{
2322 struct sk_buff *out_skb; 2449 struct sk_buff *out_skb;
2323 struct sadb_msg *out_hdr; 2450 struct sadb_msg *out_hdr;
2324 int hsc = (hard ? 2 : 1); 2451 int hard;
2452 int hsc;
2453
2454 hard = c->data.hard;
2455 if (hard)
2456 hsc = 2;
2457 else
2458 hsc = 1;
2325 2459
2326 out_skb = pfkey_xfrm_state2msg(x, 0, hsc); 2460 out_skb = pfkey_xfrm_state2msg(x, 0, hsc);
2327 if (IS_ERR(out_skb)) 2461 if (IS_ERR(out_skb))
@@ -2340,6 +2474,44 @@ static int pfkey_send_notify(struct xfrm_state *x, int hard)
2340 return 0; 2474 return 0;
2341} 2475}
2342 2476
2477static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
2478{
2479 switch (c->event) {
2480 case XFRM_MSG_EXPIRE:
2481 return key_notify_sa_expire(x, c);
2482 case XFRM_MSG_DELSA:
2483 case XFRM_MSG_NEWSA:
2484 case XFRM_MSG_UPDSA:
2485 return key_notify_sa(x, c);
2486 case XFRM_MSG_FLUSHSA:
2487 return key_notify_sa_flush(c);
2488 default:
2489 printk("pfkey: Unknown SA event %d\n", c->event);
2490 break;
2491 }
2492
2493 return 0;
2494}
2495
2496static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
2497{
2498 switch (c->event) {
2499 case XFRM_MSG_POLEXPIRE:
2500 return key_notify_policy_expire(xp, c);
2501 case XFRM_MSG_DELPOLICY:
2502 case XFRM_MSG_NEWPOLICY:
2503 case XFRM_MSG_UPDPOLICY:
2504 return key_notify_policy(xp, dir, c);
2505 case XFRM_MSG_FLUSHPOLICY:
2506 return key_notify_policy_flush(c);
2507 default:
2508 printk("pfkey: Unknown policy event %d\n", c->event);
2509 break;
2510 }
2511
2512 return 0;
2513}
2514
2343static u32 get_acqseq(void) 2515static u32 get_acqseq(void)
2344{ 2516{
2345 u32 res; 2517 u32 res;
@@ -2856,6 +3028,7 @@ static struct xfrm_mgr pfkeyv2_mgr =
2856 .acquire = pfkey_send_acquire, 3028 .acquire = pfkey_send_acquire,
2857 .compile_policy = pfkey_compile_policy, 3029 .compile_policy = pfkey_compile_policy,
2858 .new_mapping = pfkey_send_new_mapping, 3030 .new_mapping = pfkey_send_new_mapping,
3031 .notify_policy = pfkey_send_policy_notify,
2859}; 3032};
2860 3033
2861static void __exit ipsec_pfkey_exit(void) 3034static void __exit ipsec_pfkey_exit(void)
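Note: the final af_key hunk plugs the new policy handler into the pfkeyv2_mgr callback table; each key-management frontend exports the same operations, and the xfrm core delivers every event to all registered tables. A sketch of such a table and its registration, with illustrative names:

        struct xfrm_mgr_sketch {
                struct xfrm_mgr_sketch *next;
                int (*notify)(void *x, int event);
                int (*notify_policy)(void *xp, int dir, int event);
        };

        static struct xfrm_mgr_sketch *managers;

        /* Registration just links the table in; callers provide static
         * tables analogous to pfkeyv2_mgr. */
        static void register_mgr_sketch(struct xfrm_mgr_sketch *m)
        {
                m->next = managers;
                managers = m;
        }

A frontend would declare one static table at file scope and register it once from its module init function.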
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 000000000000..f0b5efb31a00
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,22 @@
1#
 2# LAPB Data Link Driver
3#
4
5config LAPB
6 tristate "LAPB Data Link Driver (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
10 the lower) part of the X.25 protocol. It offers a reliable
11 connection service to exchange data frames with one other host, and
12 it is used to transport higher level protocols (mostly X.25 Packet
13 Layer, the higher part of X.25, but others are possible as well).
14 Usually, LAPB is used with specialized X.21 network cards, but Linux
15 currently supports LAPB only over Ethernet connections. If you want
16 to use LAPB connections over Ethernet, say Y here and to "LAPB over
17 Ethernet driver" below. Read
18 <file:Documentation/networking/lapb-module.txt> for technical
19 details.
20
21 To compile this driver as a module, choose M here: the
22 module will be called lapb. If unsure, say N.
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index cd130c3b72bc..d5bdb53a348f 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
84 if (llc->dev->flags & IFF_LOOPBACK) 84 if (llc->dev->flags & IFF_LOOPBACK)
85 goto out; 85 goto out;
86 rc = 1; 86 rc = 1;
87 if (!skb_queue_len(&llc->pdu_unack_q)) 87 if (skb_queue_empty(&llc->pdu_unack_q))
88 goto out; 88 goto out;
89 skb = skb_peek(&llc->pdu_unack_q); 89 skb = skb_peek(&llc->pdu_unack_q);
90 pdu = llc_pdu_sn_hdr(skb); 90 pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e41ce458c2a9..ff774a06c89d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -315,8 +315,8 @@ err:
315static void netlink_remove(struct sock *sk) 315static void netlink_remove(struct sock *sk)
316{ 316{
317 netlink_table_grab(); 317 netlink_table_grab();
318 nl_table[sk->sk_protocol].hash.entries--; 318 if (sk_del_node_init(sk))
319 sk_del_node_init(sk); 319 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 320 if (nlk_sk(sk)->groups)
321 __sk_del_bind_node(sk); 321 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 322 netlink_table_ungrab();
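Note: the netlink_remove() fix decrements the hash-table entry count only when sk_del_node_init() reports that the socket was actually on a hash chain, so removing an already-unhashed socket can no longer skew the count. A model of that delete-if-linked contract, assuming hlist-style nodes whose pprev pointer is NULL while unlinked:

        struct node_sketch {
                struct node_sketch  *next;
                struct node_sketch **pprev;     /* NULL when not on any list */
        };

        /* Unlink the node and report whether it was linked; a model of
         * the sk_del_node_init() contract used above. */
        static int del_node_init_sketch(struct node_sketch *n)
        {
                if (n->pprev == NULL)
                        return 0;               /* already unhashed: no accounting */

                *n->pprev = n->next;
                if (n->next)
                        n->next->pprev = n->pprev;
                n->pprev = NULL;                /* re-initialize for reuse */
                return 1;
        }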
@@ -429,7 +429,12 @@ retry:
429 err = netlink_insert(sk, pid); 429 err = netlink_insert(sk, pid);
430 if (err == -EADDRINUSE) 430 if (err == -EADDRINUSE)
431 goto retry; 431 goto retry;
432 return 0; 432
433 /* If 2 threads race to autobind, that is fine. */
434 if (err == -EBUSY)
435 err = 0;
436
437 return err;
433} 438}
434 439
435static inline int netlink_capable(struct socket *sock, unsigned int flag) 440static inline int netlink_capable(struct socket *sock, unsigned int flag)
@@ -643,7 +648,8 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
643 sock_put(sk); 648 sock_put(sk);
644} 649}
645 650
646static inline struct sk_buff *netlink_trim(struct sk_buff *skb, int allocation) 651static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
652 unsigned int __nocast allocation)
647{ 653{
648 int delta; 654 int delta;
649 655
@@ -712,7 +718,7 @@ struct netlink_broadcast_data {
712 int failure; 718 int failure;
713 int congested; 719 int congested;
714 int delivered; 720 int delivered;
715 int allocation; 721 unsigned int allocation;
716 struct sk_buff *skb, *skb2; 722 struct sk_buff *skb, *skb2;
717}; 723};
718 724
@@ -853,7 +859,7 @@ static inline void netlink_rcv_wake(struct sock *sk)
853{ 859{
854 struct netlink_sock *nlk = nlk_sk(sk); 860 struct netlink_sock *nlk = nlk_sk(sk);
855 861
856 if (!skb_queue_len(&sk->sk_receive_queue)) 862 if (skb_queue_empty(&sk->sk_receive_queue))
857 clear_bit(0, &nlk->state); 863 clear_bit(0, &nlk->state);
858 if (!test_bit(0, &nlk->state)) 864 if (!test_bit(0, &nlk->state))
859 wake_up_interruptible(&nlk->wait); 865 wake_up_interruptible(&nlk->wait);
@@ -1095,8 +1101,7 @@ static int netlink_dump(struct sock *sk)
1095 return 0; 1101 return 0;
1096 } 1102 }
1097 1103
1098 nlh = __nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLMSG_DONE, sizeof(int)); 1104 nlh = NLMSG_NEW_ANSWER(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
1099 nlh->nlmsg_flags |= NLM_F_MULTI;
1100 memcpy(NLMSG_DATA(nlh), &len, sizeof(len)); 1105 memcpy(NLMSG_DATA(nlh), &len, sizeof(len));
1101 skb_queue_tail(&sk->sk_receive_queue, skb); 1106 skb_queue_tail(&sk->sk_receive_queue, skb);
1102 sk->sk_data_ready(sk, skb->len); 1107 sk->sk_data_ready(sk, skb->len);
@@ -1107,6 +1112,9 @@ static int netlink_dump(struct sock *sk)
1107 1112
1108 netlink_destroy_callback(cb); 1113 netlink_destroy_callback(cb);
1109 return 0; 1114 return 0;
1115
1116nlmsg_failure:
1117 return -ENOBUFS;
1110} 1118}
1111 1119
1112int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, 1120int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
@@ -1178,7 +1186,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
1178 } 1186 }
1179 1187
1180 rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 1188 rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1181 NLMSG_ERROR, sizeof(struct nlmsgerr)); 1189 NLMSG_ERROR, sizeof(struct nlmsgerr), 0);
1182 errmsg = NLMSG_DATA(rep); 1190 errmsg = NLMSG_DATA(rep);
1183 errmsg->error = err; 1191 errmsg->error = err;
1184 memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr)); 1192 memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr));
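Note: both netlink hunks above (and the net/sched hunks below) replace NLMSG_PUT() followed by a separate nlmsg_flags assignment with a put-style helper that takes the flags up front, so the header is never queued half-initialized. A sketch of the pattern behind __nlmsg_put()/NLMSG_NEW(), assuming kernel headers; the real NLMSG_NEW macro additionally jumps to a local nlmsg_failure label when the skb has no room:

        #include <linux/netlink.h>
        #include <linux/skbuff.h>

        /* Append a netlink header with the flags set together with the
         * rest of the header fields. */
        static struct nlmsghdr *put_nlmsg_sketch(struct sk_buff *skb, u32 pid,
                                                 u32 seq, int type, int len,
                                                 int flags)
        {
                struct nlmsghdr *nlh;
                int size = NLMSG_LENGTH(len);

                nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size));
                nlh->nlmsg_type  = type;
                nlh->nlmsg_len   = size;
                nlh->nlmsg_flags = flags;
                nlh->nlmsg_pid   = pid;
                nlh->nlmsg_seq   = seq;
                return nlh;
        }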
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 000000000000..34ff93ff894d
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,26 @@
1#
2# Packet configuration
3#
4
5config PACKET
6 tristate "Packet socket"
7 ---help---
8 The Packet protocol is used by applications which communicate
9 directly with network devices without an intermediate network
10 protocol implemented in the kernel, e.g. tcpdump. If you want them
11 to work, choose Y.
12
13 To compile this driver as a module, choose M here: the module will
14 be called af_packet.
15
16 If unsure, say Y.
17
18config PACKET_MMAP
19 bool "Packet socket: mmapped IO"
20 depends on PACKET
21 help
22 If you say Y here, the Packet protocol driver will use an IO
23 mechanism that results in faster communication.
24
25 If unsure, say N.
26
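Note: PACKET_MMAP replaces the per-packet recvfrom() copy with a receive ring shared between the kernel and userspace. A userspace sketch of setting one up, using the tpacket_req layout from the 2.6-era <linux/if_packet.h>; error handling is trimmed and the ring geometry is arbitrary:

        #include <sys/socket.h>
        #include <sys/mman.h>
        #include <linux/if_packet.h>

        static void *map_rx_ring(int fd, struct tpacket_req *req)
        {
                /* Ring geometry: 64 blocks of 4 KiB, two 2 KiB frames each. */
                req->tp_block_size = 4096;
                req->tp_block_nr   = 64;
                req->tp_frame_size = 2048;
                req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size)
                                     * req->tp_block_nr;

                if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
                               req, sizeof(*req)) < 0)
                        return NULL;

                return mmap(NULL, req->tp_block_size * req->tp_block_nr,
                            PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        }

fd here would come from socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); the caller then walks the mapped frame slots, checking each slot's tp_status before reading it.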
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0269616e75a1..c9d5980aa4de 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct
274 dst_release(skb->dst); 274 dst_release(skb->dst);
275 skb->dst = NULL; 275 skb->dst = NULL;
276 276
277 /* drop conntrack reference */
278 nf_reset(skb);
279
277 spkt = (struct sockaddr_pkt*)skb->cb; 280 spkt = (struct sockaddr_pkt*)skb->cb;
278 281
279 skb_push(skb, skb->data-skb->mac.raw); 282 skb_push(skb, skb->data-skb->mac.raw);
@@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
517 dst_release(skb->dst); 520 dst_release(skb->dst);
518 skb->dst = NULL; 521 skb->dst = NULL;
519 522
523 /* drop conntrack reference */
524 nf_reset(skb);
525
520 spin_lock(&sk->sk_receive_queue.lock); 526 spin_lock(&sk->sk_receive_queue.lock);
521 po->stats.tp_packets++; 527 po->stats.tp_packets++;
522 __skb_queue_tail(&sk->sk_receive_queue, skb); 528 __skb_queue_tail(&sk->sk_receive_queue, skb);
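Note: both receive paths in af_packet.c now call nf_reset() before handing the skb to the socket, so a packet parked on a sniffer's receive queue no longer pins a conntrack entry (and with it the conntrack module). In trees of this era nf_reset() is roughly the following inline; a sketch, since the exact fields present depend on the configuration:

        /* Sketch of nf_reset(): release the conntrack reference held by
         * the skb, if any. */
        static inline void nf_reset_sketch(struct sk_buff *skb)
        {
        #ifdef CONFIG_NETFILTER
                nf_conntrack_put(skb->nfct);
                skb->nfct = NULL;
        #endif
        }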
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a17..dada34a77b21 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
138 138
139 _debug("### End Work"); 139 _debug("### End Work");
140 140
141 try_to_freeze(PF_FREEZE); 141 try_to_freeze();
142 142
143 /* discard pending signals */ 143 /* discard pending signals */
144 rxrpc_discard_my_signals(); 144 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d9228..1aadd026d354 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
107 107
108 _debug("### End Inbound Calls"); 108 _debug("### End Inbound Calls");
109 109
110 try_to_freeze(PF_FREEZE); 110 try_to_freeze();
111 111
112 /* discard pending signals */ 112 /* discard pending signals */
113 rxrpc_discard_my_signals(); 113 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290bb..3ac81cdd1211 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
90 complete_and_exit(&krxtimod_dead, 0); 90 complete_and_exit(&krxtimod_dead, 0);
91 } 91 }
92 92
93 try_to_freeze(PF_FREEZE); 93 try_to_freeze();
94 94
95 /* discard pending signals */ 95 /* discard pending signals */
96 rxrpc_discard_my_signals(); 96 rxrpc_discard_my_signals();
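Note: all three rxrpc daemons drop the PF_FREEZE argument because try_to_freeze() now takes no flags and checks the calling task's freeze request itself. The surrounding service-loop shape is unchanged; a sketch of the idiom (not the rxrpc code; the work and wait helpers are hypothetical):

        /* Typical kernel-thread service loop cooperating with the freezer. */
        static int service_thread_sketch(void *arg)
        {
                while (!kthread_should_stop()) {
                        do_pending_work();      /* hypothetical work function */

                        try_to_freeze();        /* park here during suspend */

                        /* hypothetical wait queue and predicate */
                        wait_event_interruptible(work_queue_wait,
                                                 work_pending_sketch());
                }
                return 0;
        }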
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b0941186f867..59d3e71f8b85 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,43 @@
1# 1#
2# Traffic control configuration. 2# Traffic control configuration.
3# 3#
4
5menuconfig NET_SCHED
6 bool "QoS and/or fair queueing"
7 ---help---
8 When the kernel has several packets to send out over a network
9 device, it has to decide which ones to send first, which ones to
10 delay, and which ones to drop. This is the job of the packet
11 scheduler, and several different algorithms for how to do this
12 "fairly" have been proposed.
13
14 If you say N here, you will get the standard packet scheduler, which
15 is a FIFO (first come, first served). If you say Y here, you will be
16 able to choose from among several alternative algorithms which can
17 then be attached to different network devices. This is useful for
18 example if some of your network devices are real time devices that
19 need a certain minimum data flow rate, or if you need to limit the
20 maximum data flow rate for traffic which matches specified criteria.
21 This code is considered to be experimental.
22
23 To administer these schedulers, you'll need the user-level utilities
24 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
25 That package also contains some documentation; for more, check out
26 <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
27
28 This Quality of Service (QoS) support will enable you to use
29 Differentiated Services (diffserv) and Resource Reservation Protocol
30 (RSVP) on your Linux router if you also say Y to "QoS support",
31 "Packet classifier API" and to some classifiers below. Documentation
32 and software is at <http://diffserv.sourceforge.net/>.
33
34 If you say Y here and to "/proc file system" below, you will be able
35 to read status information about packet schedulers from the file
36 /proc/net/psched.
37
38 The available schedulers are listed in the following questions; you
39 can say Y to as many as you like. If unsure, say N now.
40
4choice 41choice
5 prompt "Packet scheduler clock source" 42 prompt "Packet scheduler clock source"
6 depends on NET_SCHED 43 depends on NET_SCHED
@@ -405,7 +442,7 @@ config NET_EMATCH_STACK
405 ---help--- 442 ---help---
406 Size of the local stack variable used while evaluating the tree of 443 Size of the local stack variable used while evaluating the tree of
407 ematches. Limits the depth of the tree, i.e. the number of 444 ematches. Limits the depth of the tree, i.e. the number of
408 encapsulated precedences. Every level requires 4 bytes of addtional 445 encapsulated precedences. Every level requires 4 bytes of additional
409 stack space. 446 stack space.
410 447
411config NET_EMATCH_CMP 448config NET_EMATCH_CMP
@@ -449,6 +486,19 @@ config NET_EMATCH_META
449 To compile this code as a module, choose M here: the 486 To compile this code as a module, choose M here: the
450 module will be called em_meta. 487 module will be called em_meta.
451 488
489config NET_EMATCH_TEXT
490 tristate "Textsearch"
491 depends on NET_EMATCH
492 select TEXTSEARCH
493 select TEXTSEARCH_KMP
494 select TEXTSEARCH_FSM
495 ---help---
 496	  Say Y here if you want to be able to classify packets based on
497 textsearch comparisons.
498
499 To compile this code as a module, choose M here: the
500 module will be called em_text.
501
452config NET_CLS_ACT 502config NET_CLS_ACT
453 bool "Packet ACTION" 503 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS 504 depends on EXPERIMENTAL && NET_CLS && NET_QOS
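Note: the new NET_EMATCH_TEXT option builds on the textsearch infrastructure it selects: an algorithm ("kmp" here, or the FSM engine) is compiled into a struct ts_config once, then run against packet payloads with skb_find_text(). A sketch of that call sequence, assuming the 2.6-era API; error handling is trimmed:

        #include <linux/textsearch.h>
        #include <linux/skbuff.h>
        #include <linux/err.h>

        /* Return nonzero if `pattern` occurs in the skb payload; a sketch
         * of the textsearch usage behind em_text. */
        static int payload_contains_sketch(struct sk_buff *skb,
                                           const char *pattern,
                                           unsigned int len)
        {
                struct ts_config *conf;
                struct ts_state state;
                unsigned int pos;

                conf = textsearch_prepare("kmp", pattern, len,
                                          GFP_KERNEL, TS_AUTOLOAD);
                if (IS_ERR(conf))
                        return 0;

                pos = skb_find_text(skb, 0, skb->len, conf, &state);
                textsearch_destroy(conf);

                return pos != UINT_MAX;         /* UINT_MAX means no match */
        }

In em_text itself the ts_config would be prepared once at classifier setup and reused for every packet, since preparation is the expensive step.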
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba8..e48d0d456b3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y := sch_generic.o 5obj-y := sch_generic.o
6 6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o 7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
8obj-$(CONFIG_NET_CLS) += cls_api.o 8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o 9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o 10obj-$(CONFIG_NET_ACT_POLICE) += police.o
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o 40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o 41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o 42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
43obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index cafcb084098d..249c61936ea0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -428,17 +428,19 @@ errout:
428 428
429static int 429static int
430tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, 430tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
431 unsigned flags, int event, int bind, int ref) 431 u16 flags, int event, int bind, int ref)
432{ 432{
433 struct tcamsg *t; 433 struct tcamsg *t;
434 struct nlmsghdr *nlh; 434 struct nlmsghdr *nlh;
435 unsigned char *b = skb->tail; 435 unsigned char *b = skb->tail;
436 struct rtattr *x; 436 struct rtattr *x;
437 437
438 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); 438 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
439 nlh->nlmsg_flags = flags; 439
440 t = NLMSG_DATA(nlh); 440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC; 441 t->tca_family = AF_UNSPEC;
442 t->tca__pad1 = 0;
443 t->tca__pad2 = 0;
442 444
443 x = (struct rtattr*) skb->tail; 445 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 446 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -580,6 +582,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); 582 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh); 583 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC; 584 t->tca_family = AF_UNSPEC;
585 t->tca__pad1 = 0;
586 t->tca__pad2 = 0;
583 587
584 x = (struct rtattr *) skb->tail; 588 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 589 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -669,7 +673,7 @@ err:
669} 673}
670 674
671static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, 675static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
672 unsigned flags) 676 u16 flags)
673{ 677{
674 struct tcamsg *t; 678 struct tcamsg *t;
675 struct nlmsghdr *nlh; 679 struct nlmsghdr *nlh;
@@ -684,11 +688,12 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
684 688
685 b = (unsigned char *)skb->tail; 689 b = (unsigned char *)skb->tail;
686 690
687 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); 691 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
688 nlh->nlmsg_flags = flags;
689 t = NLMSG_DATA(nlh); 692 t = NLMSG_DATA(nlh);
690 t->tca_family = AF_UNSPEC; 693 t->tca_family = AF_UNSPEC;
691 694 t->tca__pad1 = 0;
695 t->tca__pad2 = 0;
696
692 x = (struct rtattr*) skb->tail; 697 x = (struct rtattr*) skb->tail;
693 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 698 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
694 699
@@ -843,6 +848,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
843 cb->nlh->nlmsg_type, sizeof(*t)); 848 cb->nlh->nlmsg_type, sizeof(*t));
844 t = NLMSG_DATA(nlh); 849 t = NLMSG_DATA(nlh);
845 t->tca_family = AF_UNSPEC; 850 t->tca_family = AF_UNSPEC;
851 t->tca__pad1 = 0;
852 t->tca__pad2 = 0;
846 853
847 x = (struct rtattr *) skb->tail; 854 x = (struct rtattr *) skb->tail;
848 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 855 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -881,7 +888,7 @@ static int __init tc_action_init(void)
881 link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; 888 link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action;
882 } 889 }
883 890
884 printk("TC classifier action (bugs to netdev@oss.sgi.com cc " 891 printk("TC classifier action (bugs to netdev@vger.kernel.org cc "
885 "hadi@cyberus.ca)\n"); 892 "hadi@cyberus.ca)\n");
886 return 0; 893 return 0;
887} 894}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 56e66c3fe0fa..3b5714ef4d1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -322,16 +322,17 @@ errout:
322 322
323static int 323static int
324tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, 324tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
325 u32 pid, u32 seq, unsigned flags, int event) 325 u32 pid, u32 seq, u16 flags, int event)
326{ 326{
327 struct tcmsg *tcm; 327 struct tcmsg *tcm;
328 struct nlmsghdr *nlh; 328 struct nlmsghdr *nlh;
329 unsigned char *b = skb->tail; 329 unsigned char *b = skb->tail;
330 330
331 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
332 nlh->nlmsg_flags = flags;
333 tcm = NLMSG_DATA(nlh); 332 tcm = NLMSG_DATA(nlh);
334 tcm->tcm_family = AF_UNSPEC; 333 tcm->tcm_family = AF_UNSPEC;
334 tcm->tcm__pad1 = 0;
 335	tcm->tcm__pad2 = 0;
335 tcm->tcm_ifindex = tp->q->dev->ifindex; 336 tcm->tcm_ifindex = tp->q->dev->ifindex;
336 tcm->tcm_parent = tp->classid; 337 tcm->tcm_parent = tp->classid;
337 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); 338 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 0d2d4415f334..dfb300bb6baa 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -261,6 +261,9 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh,
261 rta = (struct rtattr *) b; 261 rta = (struct rtattr *) b;
262 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 262 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
263 263
264 if (f->res.classid)
265 RTA_PUT(skb, TCA_BASIC_CLASSID, sizeof(u32), &f->res.classid);
266
264 if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || 267 if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
265 tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) 268 tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
266 goto rtattr_failure; 269 goto rtattr_failure;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb9196810..006168d69376 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
618 pinfo.protocol = s->protocol; 618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid; 619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr; 620 pinfo.tunnelhdr = f->tunnelhdr;
621 pinfo.pad = 0;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); 622 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid) 623 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); 624 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index f1eeaf65cee5..00eae5f9a01a 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -27,17 +27,17 @@
27 * lvalue rvalue 27 * lvalue rvalue
28 * +-----------+ +-----------+ 28 * +-----------+ +-----------+
29 * | type: INT | | type: INT | 29 * | type: INT | | type: INT |
30 * def | id: INDEV | | id: VALUE | 30 * def | id: DEV | | id: VALUE |
31 * | data: | | data: 3 | 31 * | data: | | data: 3 |
32 * +-----------+ +-----------+ 32 * +-----------+ +-----------+
33 * | | 33 * | |
34 * ---> meta_ops[INT][INDEV](...) | 34 * ---> meta_ops[INT][DEV](...) |
35 * | | 35 * | |
36 * ----------- | 36 * ----------- |
37 * V V 37 * V V
38 * +-----------+ +-----------+ 38 * +-----------+ +-----------+
39 * | type: INT | | type: INT | 39 * | type: INT | | type: INT |
40 * obj | id: INDEV | | id: VALUE | 40 * obj | id: DEV | | id: VALUE |
41 * | data: 2 |<--data got filled out | data: 3 | 41 * | data: 2 |<--data got filled out | data: 3 |
42 * +-----------+ +-----------+ 42 * +-----------+ +-----------+
43 * | | 43 * | |
@@ -70,6 +70,7 @@
70#include <net/dst.h> 70#include <net/dst.h>
71#include <net/route.h> 71#include <net/route.h>
72#include <net/pkt_cls.h> 72#include <net/pkt_cls.h>
73#include <net/sock.h>
73 74
74struct meta_obj 75struct meta_obj
75{ 76{
@@ -169,26 +170,6 @@ META_COLLECTOR(var_dev)
169 *err = var_dev(skb->dev, dst); 170 *err = var_dev(skb->dev, dst);
170} 171}
171 172
172META_COLLECTOR(int_indev)
173{
174 *err = int_dev(skb->input_dev, dst);
175}
176
177META_COLLECTOR(var_indev)
178{
179 *err = var_dev(skb->input_dev, dst);
180}
181
182META_COLLECTOR(int_realdev)
183{
184 *err = int_dev(skb->real_dev, dst);
185}
186
187META_COLLECTOR(var_realdev)
188{
189 *err = var_dev(skb->real_dev, dst);
190}
191
192/************************************************************************** 173/**************************************************************************
193 * skb attributes 174 * skb attributes
194 **************************************************************************/ 175 **************************************************************************/
@@ -204,11 +185,6 @@ META_COLLECTOR(int_protocol)
204 dst->value = skb->protocol; 185 dst->value = skb->protocol;
205} 186}
206 187
207META_COLLECTOR(int_security)
208{
209 dst->value = skb->security;
210}
211
212META_COLLECTOR(int_pkttype) 188META_COLLECTOR(int_pkttype)
213{ 189{
214 dst->value = skb->pkt_type; 190 dst->value = skb->pkt_type;
@@ -233,12 +209,14 @@ META_COLLECTOR(int_maclen)
233 * Netfilter 209 * Netfilter
234 **************************************************************************/ 210 **************************************************************************/
235 211
236#ifdef CONFIG_NETFILTER
237META_COLLECTOR(int_nfmark) 212META_COLLECTOR(int_nfmark)
238{ 213{
214#ifdef CONFIG_NETFILTER
239 dst->value = skb->nfmark; 215 dst->value = skb->nfmark;
240} 216#else
217 dst->value = 0;
241#endif 218#endif
219}
242 220
243/************************************************************************** 221/**************************************************************************
244 * Traffic Control 222 * Traffic Control
@@ -249,31 +227,21 @@ META_COLLECTOR(int_tcindex)
249 dst->value = skb->tc_index; 227 dst->value = skb->tc_index;
250} 228}
251 229
252#ifdef CONFIG_NET_CLS_ACT
253META_COLLECTOR(int_tcverd)
254{
255 dst->value = skb->tc_verd;
256}
257
258META_COLLECTOR(int_tcclassid)
259{
260 dst->value = skb->tc_classid;
261}
262#endif
263
264/************************************************************************** 230/**************************************************************************
265 * Routing 231 * Routing
266 **************************************************************************/ 232 **************************************************************************/
267 233
268#ifdef CONFIG_NET_CLS_ROUTE
269META_COLLECTOR(int_rtclassid) 234META_COLLECTOR(int_rtclassid)
270{ 235{
271 if (unlikely(skb->dst == NULL)) 236 if (unlikely(skb->dst == NULL))
272 *err = -1; 237 *err = -1;
273 else 238 else
239#ifdef CONFIG_NET_CLS_ROUTE
274 dst->value = skb->dst->tclassid; 240 dst->value = skb->dst->tclassid;
275} 241#else
242 dst->value = 0;
276#endif 243#endif
244}
277 245
278META_COLLECTOR(int_rtiif) 246META_COLLECTOR(int_rtiif)
279{ 247{
@@ -284,6 +252,214 @@ META_COLLECTOR(int_rtiif)
284} 252}
285 253
286/************************************************************************** 254/**************************************************************************
255 * Socket Attributes
256 **************************************************************************/
257
258#define SKIP_NONLOCAL(skb) \
259 if (unlikely(skb->sk == NULL)) { \
260 *err = -1; \
261 return; \
262 }
263
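The SKIP_NONLOCAL() guard above lets every socket collector bail out on packets that carry no local socket (forwarded traffic, for instance), reporting a failed match through *err. As a sketch, assuming the META_COLLECTOR() signature used throughout this file, the first collector below expands to roughly:

    static void meta_int_sk_family(struct sk_buff *skb, struct tcf_pkt_info *info,
                                   struct meta_value *v, struct meta_obj *dst,
                                   int *err)
    {
            if (unlikely(skb->sk == NULL)) {    /* no local socket attached */
                    *err = -1;                  /* treat the match as failed */
                    return;
            }
            dst->value = skb->sk->sk_family;    /* e.g. AF_INET */
    }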
264META_COLLECTOR(int_sk_family)
265{
266 SKIP_NONLOCAL(skb);
267 dst->value = skb->sk->sk_family;
268}
269
270META_COLLECTOR(int_sk_state)
271{
272 SKIP_NONLOCAL(skb);
273 dst->value = skb->sk->sk_state;
274}
275
276META_COLLECTOR(int_sk_reuse)
277{
278 SKIP_NONLOCAL(skb);
279 dst->value = skb->sk->sk_reuse;
280}
281
282META_COLLECTOR(int_sk_bound_if)
283{
284 SKIP_NONLOCAL(skb);
285 /* A bound_dev_if of 0 is not an error; it is a legal value for userspace to check */
286 dst->value = skb->sk->sk_bound_dev_if;
287}
288
289META_COLLECTOR(var_sk_bound_if)
290{
291 SKIP_NONLOCAL(skb);
292
293 if (skb->sk->sk_bound_dev_if == 0) {
294 dst->value = (unsigned long) "any";
295 dst->len = 3;
296 } else {
297 struct net_device *dev;
298
299 dev = dev_get_by_index(skb->sk->sk_bound_dev_if);
300 *err = var_dev(dev, dst);
301 if (dev)
302 dev_put(dev);
303 }
304}
305
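Note the reference discipline in var_sk_bound_if above: dev_get_by_index() (which at this point takes only an ifindex) returns the device with a reference held, so the collector pairs it with dev_put() once var_dev() has copied the name out. The same pattern in isolation, as a sketch with a hypothetical helper name:

    static int copy_bound_dev_name(int ifindex, char *buf, size_t len)
    {
            struct net_device *dev = dev_get_by_index(ifindex); /* ref held or NULL */

            if (dev == NULL)
                    return -ENODEV;
            strlcpy(buf, dev->name, len);
            dev_put(dev);                       /* balance the reference */
            return 0;
    }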
306META_COLLECTOR(int_sk_refcnt)
307{
308 SKIP_NONLOCAL(skb);
309 dst->value = atomic_read(&skb->sk->sk_refcnt);
310}
311
312META_COLLECTOR(int_sk_rcvbuf)
313{
314 SKIP_NONLOCAL(skb);
315 dst->value = skb->sk->sk_rcvbuf;
316}
317
318META_COLLECTOR(int_sk_shutdown)
319{
320 SKIP_NONLOCAL(skb);
321 dst->value = skb->sk->sk_shutdown;
322}
323
324META_COLLECTOR(int_sk_proto)
325{
326 SKIP_NONLOCAL(skb);
327 dst->value = skb->sk->sk_protocol;
328}
329
330META_COLLECTOR(int_sk_type)
331{
332 SKIP_NONLOCAL(skb);
333 dst->value = skb->sk->sk_type;
334}
335
336META_COLLECTOR(int_sk_rmem_alloc)
337{
338 SKIP_NONLOCAL(skb);
339 dst->value = atomic_read(&skb->sk->sk_rmem_alloc);
340}
341
342META_COLLECTOR(int_sk_wmem_alloc)
343{
344 SKIP_NONLOCAL(skb);
345 dst->value = atomic_read(&skb->sk->sk_wmem_alloc);
346}
347
348META_COLLECTOR(int_sk_omem_alloc)
349{
350 SKIP_NONLOCAL(skb);
351 dst->value = atomic_read(&skb->sk->sk_omem_alloc);
352}
353
354META_COLLECTOR(int_sk_rcv_qlen)
355{
356 SKIP_NONLOCAL(skb);
357 dst->value = skb->sk->sk_receive_queue.qlen;
358}
359
360META_COLLECTOR(int_sk_snd_qlen)
361{
362 SKIP_NONLOCAL(skb);
363 dst->value = skb->sk->sk_write_queue.qlen;
364}
365
366META_COLLECTOR(int_sk_wmem_queued)
367{
368 SKIP_NONLOCAL(skb);
369 dst->value = skb->sk->sk_wmem_queued;
370}
371
372META_COLLECTOR(int_sk_fwd_alloc)
373{
374 SKIP_NONLOCAL(skb);
375 dst->value = skb->sk->sk_forward_alloc;
376}
377
378META_COLLECTOR(int_sk_sndbuf)
379{
380 SKIP_NONLOCAL(skb);
381 dst->value = skb->sk->sk_sndbuf;
382}
383
384META_COLLECTOR(int_sk_alloc)
385{
386 SKIP_NONLOCAL(skb);
387 dst->value = skb->sk->sk_allocation;
388}
389
390META_COLLECTOR(int_sk_route_caps)
391{
392 SKIP_NONLOCAL(skb);
393 dst->value = skb->sk->sk_route_caps;
394}
395
396META_COLLECTOR(int_sk_hashent)
397{
398 SKIP_NONLOCAL(skb);
399 dst->value = skb->sk->sk_hashent;
400}
401
402META_COLLECTOR(int_sk_lingertime)
403{
404 SKIP_NONLOCAL(skb);
405 dst->value = skb->sk->sk_lingertime / HZ;
406}
407
408META_COLLECTOR(int_sk_err_qlen)
409{
410 SKIP_NONLOCAL(skb);
411 dst->value = skb->sk->sk_error_queue.qlen;
412}
413
414META_COLLECTOR(int_sk_ack_bl)
415{
416 SKIP_NONLOCAL(skb);
417 dst->value = skb->sk->sk_ack_backlog;
418}
419
420META_COLLECTOR(int_sk_max_ack_bl)
421{
422 SKIP_NONLOCAL(skb);
423 dst->value = skb->sk->sk_max_ack_backlog;
424}
425
426META_COLLECTOR(int_sk_prio)
427{
428 SKIP_NONLOCAL(skb);
429 dst->value = skb->sk->sk_priority;
430}
431
432META_COLLECTOR(int_sk_rcvlowat)
433{
434 SKIP_NONLOCAL(skb);
435 dst->value = skb->sk->sk_rcvlowat;
436}
437
438META_COLLECTOR(int_sk_rcvtimeo)
439{
440 SKIP_NONLOCAL(skb);
441 dst->value = skb->sk->sk_rcvtimeo / HZ;
442}
443
444META_COLLECTOR(int_sk_sndtimeo)
445{
446 SKIP_NONLOCAL(skb);
447 dst->value = skb->sk->sk_sndtimeo / HZ;
448}
449
450META_COLLECTOR(int_sk_sendmsg_off)
451{
452 SKIP_NONLOCAL(skb);
453 dst->value = skb->sk->sk_sndmsg_off;
454}
455
456META_COLLECTOR(int_sk_write_pend)
457{
458 SKIP_NONLOCAL(skb);
459 dst->value = skb->sk->sk_write_pending;
460}
461
462/**************************************************************************
287 * Meta value collectors assignment table 463 * Meta value collectors assignment table
288 **************************************************************************/ 464 **************************************************************************/
289 465
@@ -293,41 +469,62 @@ struct meta_ops
293 struct meta_value *, struct meta_obj *, int *); 469 struct meta_value *, struct meta_obj *, int *);
294}; 470};
295 471
472#define META_ID(name) TCF_META_ID_##name
473#define META_FUNC(name) { .get = meta_##name }
474
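These two helpers compress the table below considerably, and since META_COLLECTOR() pastes the same meta_ prefix onto the function name, a table entry and its collector definition stay in lockstep. For example, per the definitions above,

    [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),

expands to

    [TCF_META_ID_SK_FAMILY] = { .get = meta_int_sk_family },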
296/* Meta value operations table listing all meta value collectors and 475/* Meta value operations table listing all meta value collectors and
297 * assigns them to a type and meta id. */ 476 * assigns them to a type and meta id. */
298static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { 477static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
299 [TCF_META_TYPE_VAR] = { 478 [TCF_META_TYPE_VAR] = {
300 [TCF_META_ID_DEV] = { .get = meta_var_dev }, 479 [META_ID(DEV)] = META_FUNC(var_dev),
301 [TCF_META_ID_INDEV] = { .get = meta_var_indev }, 480 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
302 [TCF_META_ID_REALDEV] = { .get = meta_var_realdev }
303 }, 481 },
304 [TCF_META_TYPE_INT] = { 482 [TCF_META_TYPE_INT] = {
305 [TCF_META_ID_RANDOM] = { .get = meta_int_random }, 483 [META_ID(RANDOM)] = META_FUNC(int_random),
306 [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 }, 484 [META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0),
307 [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 }, 485 [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
308 [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 }, 486 [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
309 [TCF_META_ID_DEV] = { .get = meta_int_dev }, 487 [META_ID(DEV)] = META_FUNC(int_dev),
310 [TCF_META_ID_INDEV] = { .get = meta_int_indev }, 488 [META_ID(PRIORITY)] = META_FUNC(int_priority),
311 [TCF_META_ID_REALDEV] = { .get = meta_int_realdev }, 489 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
312 [TCF_META_ID_PRIORITY] = { .get = meta_int_priority }, 490 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
313 [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol }, 491 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
314 [TCF_META_ID_SECURITY] = { .get = meta_int_security }, 492 [META_ID(DATALEN)] = META_FUNC(int_datalen),
315 [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype }, 493 [META_ID(MACLEN)] = META_FUNC(int_maclen),
316 [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen }, 494 [META_ID(NFMARK)] = META_FUNC(int_nfmark),
317 [TCF_META_ID_DATALEN] = { .get = meta_int_datalen }, 495 [META_ID(TCINDEX)] = META_FUNC(int_tcindex),
318 [TCF_META_ID_MACLEN] = { .get = meta_int_maclen }, 496 [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
319#ifdef CONFIG_NETFILTER 497 [META_ID(RTIIF)] = META_FUNC(int_rtiif),
320 [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark }, 498 [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
321#endif 499 [META_ID(SK_STATE)] = META_FUNC(int_sk_state),
322 [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex }, 500 [META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse),
323#ifdef CONFIG_NET_CLS_ACT 501 [META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if),
324 [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd }, 502 [META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt),
325 [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid }, 503 [META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf),
326#endif 504 [META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf),
327#ifdef CONFIG_NET_CLS_ROUTE 505 [META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown),
328 [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid }, 506 [META_ID(SK_PROTO)] = META_FUNC(int_sk_proto),
329#endif 507 [META_ID(SK_TYPE)] = META_FUNC(int_sk_type),
330 [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif } 508 [META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc),
509 [META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc),
510 [META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc),
511 [META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued),
512 [META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen),
513 [META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen),
514 [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
515 [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
516 [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
517 [META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps),
518 [META_ID(SK_HASHENT)] = META_FUNC(int_sk_hashent),
519 [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
520 [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
521 [META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
522 [META_ID(SK_PRIO)] = META_FUNC(int_sk_prio),
523 [META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat),
524 [META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo),
525 [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
526 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
527 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
331 } 528 }
332}; 529};
333 530
@@ -396,9 +593,9 @@ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
396 /* Let gcc optimize it; the unlikely() is not based on 593 /* Let gcc optimize it; the unlikely() is not based on
397 * measurements, but jump-free code for mismatches seems 594 * measurements, but jump-free code for mismatches seems
398 * more logical. */ 595 * more logical. */
399 if (unlikely(a == b)) 596 if (unlikely(a->value == b->value))
400 return 0; 597 return 0;
401 else if (a < b) 598 else if (a->value < b->value)
402 return -1; 599 return -1;
403 else 600 else
404 return 1; 601 return 1;
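This last hunk is a genuine bug fix rather than cleanup: the old comparator compared the struct meta_obj pointers a and b, which are distinct addresses on every invocation, so integer ematches could never compare equal. The fix orders the collected payloads instead; a standalone sketch of the corrected behaviour:

    static int meta_int_compare_sketch(struct meta_obj *a, struct meta_obj *b)
    {
            if (a->value == b->value)
                    return 0;                       /* "eq" can now succeed */
            return a->value < b->value ? -1 : 1;    /* memcmp-style ordering */
    }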
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000000..77beabc91fa3
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,154 @@
1/*
2 * net/sched/em_text.c Textsearch ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/textsearch.h>
20#include <linux/tc_ematch/tc_em_text.h>
21#include <net/pkt_cls.h>
22
23struct text_match
24{
25 u16 from_offset;
26 u16 to_offset;
27 u8 from_layer;
28 u8 to_layer;
29 struct ts_config *config;
30};
31
32#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
33
34static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
35 struct tcf_pkt_info *info)
36{
37 struct text_match *tm = EM_TEXT_PRIV(m);
38 int from, to;
39 struct ts_state state;
40
41 from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
42 from += tm->from_offset;
43
44 to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
45 to += tm->to_offset;
46
47 return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
48}
49
50static int em_text_change(struct tcf_proto *tp, void *data, int len,
51 struct tcf_ematch *m)
52{
53 struct text_match *tm;
54 struct tcf_em_text *conf = data;
55 struct ts_config *ts_conf;
56 int flags = 0;
57
58 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
59 return -EINVAL;
60
61 if (conf->from_layer > conf->to_layer)
62 return -EINVAL;
63
64 if (conf->from_layer == conf->to_layer &&
65 conf->from_offset > conf->to_offset)
66 return -EINVAL;
67
68retry:
69 ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
70 conf->pattern_len, GFP_KERNEL, flags);
71
72 if (flags & TS_AUTOLOAD)
73 rtnl_lock();
74
75 if (IS_ERR(ts_conf)) {
76 if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
77 rtnl_unlock();
78 flags |= TS_AUTOLOAD;
79 goto retry;
80 } else
81 return PTR_ERR(ts_conf);
82 } else if (flags & TS_AUTOLOAD) {
83 textsearch_destroy(ts_conf);
84 return -EAGAIN;
85 }
86
87 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
88 if (tm == NULL) {
89 textsearch_destroy(ts_conf);
90 return -ENOBUFS;
91 }
92
93 tm->from_offset = conf->from_offset;
94 tm->to_offset = conf->to_offset;
95 tm->from_layer = conf->from_layer;
96 tm->to_layer = conf->to_layer;
97 tm->config = ts_conf;
98
99 m->datalen = sizeof(*tm);
100 m->data = (unsigned long) tm;
101
102 return 0;
103}
104
105static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
106{
107 textsearch_destroy(EM_TEXT_PRIV(m)->config);
108}
109
110static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
111{
112 struct text_match *tm = EM_TEXT_PRIV(m);
113 struct tcf_em_text conf;
114
115 strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
116 conf.from_offset = tm->from_offset;
117 conf.to_offset = tm->to_offset;
118 conf.from_layer = tm->from_layer;
119 conf.to_layer = tm->to_layer;
120 conf.pattern_len = textsearch_get_pattern_len(tm->config);
121 conf.pad = 0;
122
123 RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
124 RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
125 return 0;
126
127rtattr_failure:
128 return -1;
129}
130
131static struct tcf_ematch_ops em_text_ops = {
132 .kind = TCF_EM_TEXT,
133 .change = em_text_change,
134 .match = em_text_match,
135 .destroy = em_text_destroy,
136 .dump = em_text_dump,
137 .owner = THIS_MODULE,
138 .link = LIST_HEAD_INIT(em_text_ops.link)
139};
140
141static int __init init_em_text(void)
142{
143 return tcf_em_register(&em_text_ops);
144}
145
146static void __exit exit_em_text(void)
147{
148 tcf_em_unregister(&em_text_ops);
149}
150
151MODULE_LICENSE("GPL");
152
153module_init(init_em_text);
154module_exit(exit_em_text);
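The retry dance in em_text_change() exists because textsearch_prepare() only attempts module autoloading when TS_AUTOLOAD is set: the first pass runs under the RTNL lock, so on -ENOENT the code drops the lock, retries with autoloading so the algorithm module can be pulled in, and returns -EAGAIN so the whole configuration request is replayed. A minimal sketch of the underlying textsearch API from process context, with a hypothetical function name and no lock juggling:

    #include <linux/skbuff.h>
    #include <linux/textsearch.h>

    static int skb_contains_get(struct sk_buff *skb)
    {
            struct ts_config *conf;
            struct ts_state state;
            unsigned int pos;

            /* KMP search for "GET "; TS_AUTOLOAD may modprobe ts_kmp. */
            conf = textsearch_prepare("kmp", "GET ", 4, GFP_KERNEL, TS_AUTOLOAD);
            if (IS_ERR(conf))
                    return PTR_ERR(conf);

            pos = skb_find_text(skb, 0, skb->len, conf, &state);
            textsearch_destroy(conf);

            return pos != UINT_MAX;     /* 1 if the pattern occurred */
    }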
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 07977f8f2679..b9a069af4a02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{ 399{
400 int err; 400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1]; 401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch; 402 struct Qdisc *sch;
404 struct Qdisc_ops *ops; 403 struct Qdisc_ops *ops;
405 int size;
406 404
407 ops = qdisc_lookup_ops(kind); 405 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD 406#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
437 if (ops == NULL) 435 if (ops == NULL)
438 goto err_out; 436 goto err_out;
439 437
440 /* ensure that the Qdisc and the private data are 32-byte aligned */ 438 sch = qdisc_alloc(dev, ops);
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 439 if (IS_ERR(sch)) {
442 size += ops->priv_size + QDISC_ALIGN_CONST; 440 err = PTR_ERR(sch);
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2; 441 goto err_out2;
448 memset(p, 0, size); 442 }
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455 443
456 if (handle == TC_H_INGRESS) 444 if (handle == TC_H_INGRESS) {
457 sch->flags |= TCQ_F_INGRESS; 445 sch->flags |= TCQ_F_INGRESS;
458 446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
459 sch->ops = ops; 447 } else if (handle == 0) {
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev); 448 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM; 449 err = -ENOMEM;
469 if (handle == 0) 450 if (handle == 0)
470 goto err_out3; 451 goto err_out3;
471 } 452 }
472 453
473 if (handle == TC_H_INGRESS) 454 sch->handle = handle;
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477 455
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
479 qdisc_lock_tree(dev); 474 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list); 475 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev); 476 qdisc_unlock_tree(dev);
482 477
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch; 478 return sch;
489 } 479 }
490err_out3: 480err_out3:
491 dev_put(dev); 481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
492err_out2: 483err_out2:
493 module_put(ops->owner); 484 module_put(ops->owner);
494err_out: 485err_out:
495 *errp = err; 486 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL; 487 return NULL;
499} 488}
500 489
@@ -760,17 +749,18 @@ graft:
760} 749}
761 750
762static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, 751static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
763 u32 pid, u32 seq, unsigned flags, int event) 752 u32 pid, u32 seq, u16 flags, int event)
764{ 753{
765 struct tcmsg *tcm; 754 struct tcmsg *tcm;
766 struct nlmsghdr *nlh; 755 struct nlmsghdr *nlh;
767 unsigned char *b = skb->tail; 756 unsigned char *b = skb->tail;
768 struct gnet_dump d; 757 struct gnet_dump d;
769 758
770 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
771 nlh->nlmsg_flags = flags;
772 tcm = NLMSG_DATA(nlh); 760 tcm = NLMSG_DATA(nlh);
773 tcm->tcm_family = AF_UNSPEC; 761 tcm->tcm_family = AF_UNSPEC;
762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
774 tcm->tcm_ifindex = q->dev->ifindex; 764 tcm->tcm_ifindex = q->dev->ifindex;
775 tcm->tcm_parent = clid; 765 tcm->tcm_parent = clid;
776 tcm->tcm_handle = q->handle; 766 tcm->tcm_handle = q->handle;
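Two fixes ride along in this hunk: NLMSG_NEW() replaces the NLMSG_PUT()-plus-manual-assignment pair so nlmsg_flags is set as part of message construction, and the explicit writes to tcm__pad1/tcm__pad2 keep uninitialized kernel stack bytes from leaking to userspace, since the tcmsg header is copied out verbatim. An equivalent belt-and-braces form (sketch):

    tcm = NLMSG_DATA(nlh);
    memset(tcm, 0, sizeof(*tcm));   /* padding included */
    tcm->tcm_family = AF_UNSPEC;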
@@ -997,7 +987,7 @@ out:
997 987
998static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, 988static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
999 unsigned long cl, 989 unsigned long cl,
1000 u32 pid, u32 seq, unsigned flags, int event) 990 u32 pid, u32 seq, u16 flags, int event)
1001{ 991{
1002 struct tcmsg *tcm; 992 struct tcmsg *tcm;
1003 struct nlmsghdr *nlh; 993 struct nlmsghdr *nlh;
@@ -1005,8 +995,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1005 struct gnet_dump d; 995 struct gnet_dump d;
1006 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; 996 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1007 997
1008 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 998 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1009 nlh->nlmsg_flags = flags;
1010 tcm = NLMSG_DATA(nlh); 999 tcm = NLMSG_DATA(nlh);
1011 tcm->tcm_family = AF_UNSPEC; 1000 tcm->tcm_family = AF_UNSPEC;
1012 tcm->tcm_ifindex = q->dev->ifindex; 1001 tcm->tcm_ifindex = q->dev->ifindex;
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000000..81f0b8346d17
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
1/*
2 * net/sched/sch_blackhole.c Black hole queue
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * Note: Quantum tunneling is not supported.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h>
20#include <net/pkt_sched.h>
21
22static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
23{
24 qdisc_drop(skb, sch);
25 return NET_XMIT_SUCCESS;
26}
27
28static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
29{
30 return NULL;
31}
32
33static struct Qdisc_ops blackhole_qdisc_ops = {
34 .id = "blackhole",
35 .priv_size = 0,
36 .enqueue = blackhole_enqueue,
37 .dequeue = blackhole_dequeue,
38 .owner = THIS_MODULE,
39};
40
41static int __init blackhole_module_init(void)
42{
43 return register_qdisc(&blackhole_qdisc_ops);
44}
45
46static void __exit blackhole_module_exit(void)
47{
48 unregister_qdisc(&blackhole_qdisc_ops);
49}
50
51module_init(blackhole_module_init)
52module_exit(blackhole_module_exit)
53
54MODULE_LICENSE("GPL");
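sch_blackhole is intentionally trivial: enqueue frees every packet yet reports NET_XMIT_SUCCESS, so callers neither requeue nor account congestion, and dequeue never produces anything. The qdisc_drop() helper it leans on looks roughly like this (sketch of the inline helper this patch series adds):

    static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
    {
            kfree_skb(skb);             /* the packet is gone for good */
            sch->qstats.drops++;        /* ... but the drop is accounted */
            return NET_XMIT_DROP;       /* blackhole overrides this result */
    }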
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d43e3b8cbf6a..09453f997d8c 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1528 1528
1529 opt.strategy = cl->ovl_strategy; 1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1; 1530 opt.priority2 = cl->priority2+1;
1531 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ; 1532 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1533 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len; 1534 return skb->len;
@@ -1563,6 +1564,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1563 1564
1564 if (cl->police) { 1565 if (cl->police) {
1565 opt.police = cl->police; 1566 opt.police = cl->police;
1567 opt.__res1 = 0;
1568 opt.__res2 = 0;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1569 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 } 1570 }
1568 return skb->len; 1571 return skb->len;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index d8bd2a569c7c..13e0e7b3856b 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -31,7 +31,7 @@
31#endif 31#endif
32 32
33 33
34#define PRIV(sch) qdisc_priv(sch) 34#define PRIV(sch) ((struct dsmark_qdisc_data *) qdisc_priv(sch))
35 35
36 36
37/* 37/*
@@ -55,24 +55,38 @@
55struct dsmark_qdisc_data { 55struct dsmark_qdisc_data {
56 struct Qdisc *q; 56 struct Qdisc *q;
57 struct tcf_proto *filter_list; 57 struct tcf_proto *filter_list;
58 __u8 *mask; /* "owns" the array */ 58 u8 *mask; /* "owns" the array */
59 __u8 *value; 59 u8 *value;
60 __u16 indices; 60 u16 indices;
61 __u32 default_index; /* index range is 0...0xffff */ 61 u32 default_index; /* index range is 0...0xffff */
62 int set_tc_index; 62 int set_tc_index;
63}; 63};
64 64
65static inline int dsmark_valid_indices(u16 indices)
66{
67 while (indices != 1) {
68 if (indices & 1)
69 return 0;
70 indices >>= 1;
71 }
72
73 return 1;
74}
65 75
66/* ------------------------- Class/flow operations ------------------------- */ 76static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
77{
78 return (index <= p->indices && index > 0);
79}
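dsmark_valid_indices() insists that indices be a power of two because dsmark_dequeue() masks with (p->indices - 1), and dsmark_valid_index() centralizes the 1..indices range check that used to be open-coded at each call site. An equivalent branch-free form of the power-of-two test, as a sketch:

    static inline int dsmark_valid_indices_sketch(u16 indices)
    {
            /* a non-zero power of two has exactly one bit set */
            return indices != 0 && (indices & (indices - 1)) == 0;
    }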
67 80
81/* ------------------------- Class/flow operations ------------------------- */
68 82
69static int dsmark_graft(struct Qdisc *sch,unsigned long arg, 83static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
70 struct Qdisc *new,struct Qdisc **old) 84 struct Qdisc *new, struct Qdisc **old)
71{ 85{
72 struct dsmark_qdisc_data *p = PRIV(sch); 86 struct dsmark_qdisc_data *p = PRIV(sch);
73 87
74 DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, 88 DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
75 old); 89 sch, p, new, old);
76 90
77 if (new == NULL) { 91 if (new == NULL) {
78 new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); 92 new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
@@ -81,91 +95,95 @@ static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
81 } 95 }
82 96
83 sch_tree_lock(sch); 97 sch_tree_lock(sch);
84 *old = xchg(&p->q,new); 98 *old = xchg(&p->q, new);
85 if (*old) 99 qdisc_reset(*old);
86 qdisc_reset(*old);
87 sch->q.qlen = 0; 100 sch->q.qlen = 0;
88 sch_tree_unlock(sch); /* @@@ move up ? */ 101 sch_tree_unlock(sch);
102
89 return 0; 103 return 0;
90} 104}
91 105
92
93static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) 106static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
94{ 107{
95 struct dsmark_qdisc_data *p = PRIV(sch); 108 return PRIV(sch)->q;
96
97 return p->q;
98} 109}
99 110
100 111static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
101static unsigned long dsmark_get(struct Qdisc *sch,u32 classid)
102{ 112{
103 struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch); 113 DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
114 sch, PRIV(sch), classid);
104 115
105 DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); 116 return TC_H_MIN(classid) + 1;
106 return TC_H_MIN(classid)+1;
107} 117}
108 118
109
110static unsigned long dsmark_bind_filter(struct Qdisc *sch, 119static unsigned long dsmark_bind_filter(struct Qdisc *sch,
111 unsigned long parent, u32 classid) 120 unsigned long parent, u32 classid)
112{ 121{
113 return dsmark_get(sch,classid); 122 return dsmark_get(sch, classid);
114} 123}
115 124
116
117static void dsmark_put(struct Qdisc *sch, unsigned long cl) 125static void dsmark_put(struct Qdisc *sch, unsigned long cl)
118{ 126{
119} 127}
120 128
121
122static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, 129static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
123 struct rtattr **tca, unsigned long *arg) 130 struct rtattr **tca, unsigned long *arg)
124{ 131{
125 struct dsmark_qdisc_data *p = PRIV(sch); 132 struct dsmark_qdisc_data *p = PRIV(sch);
126 struct rtattr *opt = tca[TCA_OPTIONS-1]; 133 struct rtattr *opt = tca[TCA_OPTIONS-1];
127 struct rtattr *tb[TCA_DSMARK_MAX]; 134 struct rtattr *tb[TCA_DSMARK_MAX];
135 int err = -EINVAL;
136 u8 mask = 0;
128 137
129 DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," 138 DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
130 "arg 0x%lx\n",sch,p,classid,parent,*arg); 139 "arg 0x%lx\n", sch, p, classid, parent, *arg);
131 if (*arg > p->indices) 140
132 return -ENOENT; 141 if (!dsmark_valid_index(p, *arg)) {
133 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) 142 err = -ENOENT;
134 return -EINVAL; 143 goto rtattr_failure;
135 if (tb[TCA_DSMARK_MASK-1]) {
136 if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1]))
137 return -EINVAL;
138 p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]);
139 }
140 if (tb[TCA_DSMARK_VALUE-1]) {
141 if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1]))
142 return -EINVAL;
143 p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]);
144 } 144 }
145 return 0;
146}
147 145
146 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
147 goto rtattr_failure;
148
149 if (tb[TCA_DSMARK_MASK-1])
150 mask = RTA_GET_U8(tb[TCA_DSMARK_MASK-1]);
151
152 if (tb[TCA_DSMARK_VALUE-1])
153 p->value[*arg-1] = RTA_GET_U8(tb[TCA_DSMARK_VALUE-1]);
154
155 if (tb[TCA_DSMARK_MASK-1])
156 p->mask[*arg-1] = mask;
157
158 err = 0;
148 159
149static int dsmark_delete(struct Qdisc *sch,unsigned long arg) 160rtattr_failure:
161 return err;
162}
163
164static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
150{ 165{
151 struct dsmark_qdisc_data *p = PRIV(sch); 166 struct dsmark_qdisc_data *p = PRIV(sch);
152 167
153 if (!arg || arg > p->indices) 168 if (!dsmark_valid_index(p, arg))
154 return -EINVAL; 169 return -EINVAL;
170
155 p->mask[arg-1] = 0xff; 171 p->mask[arg-1] = 0xff;
156 p->value[arg-1] = 0; 172 p->value[arg-1] = 0;
173
157 return 0; 174 return 0;
158} 175}
159 176
160
161static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) 177static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
162{ 178{
163 struct dsmark_qdisc_data *p = PRIV(sch); 179 struct dsmark_qdisc_data *p = PRIV(sch);
164 int i; 180 int i;
165 181
166 DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); 182 DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
183
167 if (walker->stop) 184 if (walker->stop)
168 return; 185 return;
186
169 for (i = 0; i < p->indices; i++) { 187 for (i = 0; i < p->indices; i++) {
170 if (p->mask[i] == 0xff && !p->value[i]) 188 if (p->mask[i] == 0xff && !p->value[i])
171 goto ignore; 189 goto ignore;
@@ -180,26 +198,20 @@ ignore:
180 } 198 }
181} 199}
182 200
183
184static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) 201static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl)
185{ 202{
186 struct dsmark_qdisc_data *p = PRIV(sch); 203 return &PRIV(sch)->filter_list;
187
188 return &p->filter_list;
189} 204}
190 205
191
192/* --------------------------- Qdisc operations ---------------------------- */ 206/* --------------------------- Qdisc operations ---------------------------- */
193 207
194
195static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) 208static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
196{ 209{
197 struct dsmark_qdisc_data *p = PRIV(sch); 210 struct dsmark_qdisc_data *p = PRIV(sch);
198 struct tcf_result res; 211 int err;
199 int result; 212
200 int ret = NET_XMIT_POLICED; 213 D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
201 214
202 D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
203 if (p->set_tc_index) { 215 if (p->set_tc_index) {
204 /* FIXME: Safe with non-linear skbs? --RR */ 216 /* FIXME: Safe with non-linear skbs? --RR */
205 switch (skb->protocol) { 217 switch (skb->protocol) {
@@ -216,17 +228,21 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
216 break; 228 break;
217 }; 229 };
218 } 230 }
219 result = TC_POLICE_OK; /* be nice to gcc */ 231
220 if (TC_H_MAJ(skb->priority) == sch->handle) { 232 if (TC_H_MAJ(skb->priority) == sch->handle)
221 skb->tc_index = TC_H_MIN(skb->priority); 233 skb->tc_index = TC_H_MIN(skb->priority);
222 } else { 234 else {
223 result = tc_classify(skb,p->filter_list,&res); 235 struct tcf_result res;
224 D2PRINTK("result %d class 0x%04x\n",result,res.classid); 236 int result = tc_classify(skb, p->filter_list, &res);
237
238 D2PRINTK("result %d class 0x%04x\n", result, res.classid);
239
225 switch (result) { 240 switch (result) {
226#ifdef CONFIG_NET_CLS_POLICE 241#ifdef CONFIG_NET_CLS_POLICE
227 case TC_POLICE_SHOT: 242 case TC_POLICE_SHOT:
228 kfree_skb(skb); 243 kfree_skb(skb);
229 break; 244 sch->qstats.drops++;
245 return NET_XMIT_POLICED;
230#if 0 246#if 0
231 case TC_POLICE_RECLASSIFY: 247 case TC_POLICE_RECLASSIFY:
232 /* FIXME: what to do here ??? */ 248 /* FIXME: what to do here ??? */
@@ -243,43 +259,45 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
243 break; 259 break;
244 }; 260 };
245 } 261 }
246 if (
247#ifdef CONFIG_NET_CLS_POLICE
248 result == TC_POLICE_SHOT ||
249#endif
250 262
251 ((ret = p->q->enqueue(skb,p->q)) != 0)) { 263 err = p->q->enqueue(skb,p->q);
264 if (err != NET_XMIT_SUCCESS) {
252 sch->qstats.drops++; 265 sch->qstats.drops++;
253 return ret; 266 return err;
254 } 267 }
268
255 sch->bstats.bytes += skb->len; 269 sch->bstats.bytes += skb->len;
256 sch->bstats.packets++; 270 sch->bstats.packets++;
257 sch->q.qlen++; 271 sch->q.qlen++;
258 return ret;
259}
260 272
273 return NET_XMIT_SUCCESS;
274}
261 275
262static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) 276static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
263{ 277{
264 struct dsmark_qdisc_data *p = PRIV(sch); 278 struct dsmark_qdisc_data *p = PRIV(sch);
265 struct sk_buff *skb; 279 struct sk_buff *skb;
266 int index; 280 u32 index;
281
282 D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);
267 283
268 D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p);
269 skb = p->q->ops->dequeue(p->q); 284 skb = p->q->ops->dequeue(p->q);
270 if (!skb) 285 if (skb == NULL)
271 return NULL; 286 return NULL;
287
272 sch->q.qlen--; 288 sch->q.qlen--;
273 index = skb->tc_index & (p->indices-1); 289
274 D2PRINTK("index %d->%d\n",skb->tc_index,index); 290 index = skb->tc_index & (p->indices - 1);
291 D2PRINTK("index %d->%d\n", skb->tc_index, index);
292
275 switch (skb->protocol) { 293 switch (skb->protocol) {
276 case __constant_htons(ETH_P_IP): 294 case __constant_htons(ETH_P_IP):
277 ipv4_change_dsfield(skb->nh.iph, 295 ipv4_change_dsfield(skb->nh.iph, p->mask[index],
278 p->mask[index],p->value[index]); 296 p->value[index]);
279 break; 297 break;
280 case __constant_htons(ETH_P_IPV6): 298 case __constant_htons(ETH_P_IPV6):
281 ipv6_change_dsfield(skb->nh.ipv6h, 299 ipv6_change_dsfield(skb->nh.ipv6h, p->mask[index],
282 p->mask[index],p->value[index]); 300 p->value[index]);
283 break; 301 break;
284 default: 302 default:
285 /* 303 /*
@@ -293,152 +311,162 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
293 htons(skb->protocol)); 311 htons(skb->protocol));
294 break; 312 break;
295 }; 313 };
314
296 return skb; 315 return skb;
297} 316}
298 317
299
300static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) 318static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch)
301{ 319{
302 int ret;
303 struct dsmark_qdisc_data *p = PRIV(sch); 320 struct dsmark_qdisc_data *p = PRIV(sch);
321 int err;
304 322
305 D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); 323 D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
306 if ((ret = p->q->ops->requeue(skb, p->q)) == 0) { 324
307 sch->q.qlen++; 325 err = p->q->ops->requeue(skb, p->q);
308 sch->qstats.requeues++; 326 if (err != NET_XMIT_SUCCESS) {
309 return 0; 327 sch->qstats.drops++;
328 return err;
310 } 329 }
311 sch->qstats.drops++;
312 return ret;
313}
314 330
331 sch->q.qlen++;
332 sch->qstats.requeues++;
333
334 return NET_XMIT_SUCCESS;
335}
315 336
316static unsigned int dsmark_drop(struct Qdisc *sch) 337static unsigned int dsmark_drop(struct Qdisc *sch)
317{ 338{
318 struct dsmark_qdisc_data *p = PRIV(sch); 339 struct dsmark_qdisc_data *p = PRIV(sch);
319 unsigned int len; 340 unsigned int len;
320 341
321 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); 342 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
322 if (!p->q->ops->drop) 343
323 return 0; 344 if (p->q->ops->drop == NULL)
324 if (!(len = p->q->ops->drop(p->q)))
325 return 0; 345 return 0;
326 sch->q.qlen--; 346
347 len = p->q->ops->drop(p->q);
348 if (len)
349 sch->q.qlen--;
350
327 return len; 351 return len;
328} 352}
329 353
330 354static int dsmark_init(struct Qdisc *sch, struct rtattr *opt)
331static int dsmark_init(struct Qdisc *sch,struct rtattr *opt)
332{ 355{
333 struct dsmark_qdisc_data *p = PRIV(sch); 356 struct dsmark_qdisc_data *p = PRIV(sch);
334 struct rtattr *tb[TCA_DSMARK_MAX]; 357 struct rtattr *tb[TCA_DSMARK_MAX];
335 __u16 tmp; 358 int err = -EINVAL;
336 359 u32 default_index = NO_DEFAULT_INDEX;
337 DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); 360 u16 indices;
338 if (!opt || 361 u8 *mask;
339 rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 || 362
340 !tb[TCA_DSMARK_INDICES-1] || 363 DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
341 RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) 364
342 return -EINVAL; 365 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt) < 0)
343 p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); 366 goto errout;
344 if (!p->indices) 367
345 return -EINVAL; 368 indices = RTA_GET_U16(tb[TCA_DSMARK_INDICES-1]);
346 for (tmp = p->indices; tmp != 1; tmp >>= 1) { 369 if (!indices || !dsmark_valid_indices(indices))
347 if (tmp & 1) 370 goto errout;
348 return -EINVAL; 371
349 } 372 if (tb[TCA_DSMARK_DEFAULT_INDEX-1])
350 p->default_index = NO_DEFAULT_INDEX; 373 default_index = RTA_GET_U16(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
351 if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) { 374
352 if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16)) 375 mask = kmalloc(indices * 2, GFP_KERNEL);
353 return -EINVAL; 376 if (mask == NULL) {
354 p->default_index = 377 err = -ENOMEM;
355 *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]); 378 goto errout;
356 } 379 }
357 p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1]; 380
358 p->mask = kmalloc(p->indices*2,GFP_KERNEL); 381 p->mask = mask;
359 if (!p->mask) 382 memset(p->mask, 0xff, indices);
360 return -ENOMEM; 383
361 p->value = p->mask+p->indices; 384 p->value = p->mask + indices;
362 memset(p->mask,0xff,p->indices); 385 memset(p->value, 0, indices);
363 memset(p->value,0,p->indices); 386
364 if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) 387 p->indices = indices;
388 p->default_index = default_index;
389 p->set_tc_index = RTA_GET_FLAG(tb[TCA_DSMARK_SET_TC_INDEX-1]);
390
391 p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
392 if (p->q == NULL)
365 p->q = &noop_qdisc; 393 p->q = &noop_qdisc;
366 DPRINTK("dsmark_init: qdisc %p\n",&p->q);
367 return 0;
368}
369 394
395 DPRINTK("dsmark_init: qdisc %p\n", p->q);
396
397 err = 0;
398errout:
399rtattr_failure:
400 return err;
401}
370 402
371static void dsmark_reset(struct Qdisc *sch) 403static void dsmark_reset(struct Qdisc *sch)
372{ 404{
373 struct dsmark_qdisc_data *p = PRIV(sch); 405 struct dsmark_qdisc_data *p = PRIV(sch);
374 406
375 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); 407 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
376 qdisc_reset(p->q); 408 qdisc_reset(p->q);
377 sch->q.qlen = 0; 409 sch->q.qlen = 0;
378} 410}
379 411
380
381static void dsmark_destroy(struct Qdisc *sch) 412static void dsmark_destroy(struct Qdisc *sch)
382{ 413{
383 struct dsmark_qdisc_data *p = PRIV(sch); 414 struct dsmark_qdisc_data *p = PRIV(sch);
384 struct tcf_proto *tp; 415 struct tcf_proto *tp;
385 416
386 DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p); 417 DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
418
387 while (p->filter_list) { 419 while (p->filter_list) {
388 tp = p->filter_list; 420 tp = p->filter_list;
389 p->filter_list = tp->next; 421 p->filter_list = tp->next;
390 tcf_destroy(tp); 422 tcf_destroy(tp);
391 } 423 }
424
392 qdisc_destroy(p->q); 425 qdisc_destroy(p->q);
393 kfree(p->mask); 426 kfree(p->mask);
394} 427}
395 428
396
397static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, 429static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
398 struct sk_buff *skb, struct tcmsg *tcm) 430 struct sk_buff *skb, struct tcmsg *tcm)
399{ 431{
400 struct dsmark_qdisc_data *p = PRIV(sch); 432 struct dsmark_qdisc_data *p = PRIV(sch);
401 unsigned char *b = skb->tail; 433 struct rtattr *opts = NULL;
402 struct rtattr *rta; 434
435 DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl);
403 436
404 DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl); 437 if (!dsmark_valid_index(p, cl))
405 if (!cl || cl > p->indices)
406 return -EINVAL; 438 return -EINVAL;
407 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1); 439
408 rta = (struct rtattr *) b; 440 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1);
409 RTA_PUT(skb,TCA_OPTIONS,0,NULL); 441
410 RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]); 442 opts = RTA_NEST(skb, TCA_OPTIONS);
411 RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]); 443 RTA_PUT_U8(skb,TCA_DSMARK_MASK, p->mask[cl-1]);
412 rta->rta_len = skb->tail-b; 444 RTA_PUT_U8(skb,TCA_DSMARK_VALUE, p->value[cl-1]);
413 return skb->len; 445
446 return RTA_NEST_END(skb, opts);
414 447
415rtattr_failure: 448rtattr_failure:
416 skb_trim(skb,b-skb->data); 449 return RTA_NEST_CANCEL(skb, opts);
417 return -1;
418} 450}
419 451
420static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) 452static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
421{ 453{
422 struct dsmark_qdisc_data *p = PRIV(sch); 454 struct dsmark_qdisc_data *p = PRIV(sch);
423 unsigned char *b = skb->tail; 455 struct rtattr *opts = NULL;
424 struct rtattr *rta;
425 456
426 rta = (struct rtattr *) b; 457 opts = RTA_NEST(skb, TCA_OPTIONS);
427 RTA_PUT(skb,TCA_OPTIONS,0,NULL); 458 RTA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);
428 RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices); 459
429 if (p->default_index != NO_DEFAULT_INDEX) { 460 if (p->default_index != NO_DEFAULT_INDEX)
430 __u16 tmp = p->default_index; 461 RTA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
431 462
432 RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp);
433 }
434 if (p->set_tc_index) 463 if (p->set_tc_index)
435 RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL); 464 RTA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
436 rta->rta_len = skb->tail-b; 465
437 return skb->len; 466 return RTA_NEST_END(skb, opts);
438 467
439rtattr_failure: 468rtattr_failure:
440 skb_trim(skb,b-skb->data); 469 return RTA_NEST_CANCEL(skb, opts);
441 return -1;
442} 470}
443 471
444static struct Qdisc_class_ops dsmark_class_ops = { 472static struct Qdisc_class_ops dsmark_class_ops = {
@@ -476,10 +504,13 @@ static int __init dsmark_module_init(void)
476{ 504{
477 return register_qdisc(&dsmark_qdisc_ops); 505 return register_qdisc(&dsmark_qdisc_ops);
478} 506}
507
479static void __exit dsmark_module_exit(void) 508static void __exit dsmark_module_exit(void)
480{ 509{
481 unregister_qdisc(&dsmark_qdisc_ops); 510 unregister_qdisc(&dsmark_qdisc_ops);
482} 511}
512
483module_init(dsmark_module_init) 513module_init(dsmark_module_init)
484module_exit(dsmark_module_exit) 514module_exit(dsmark_module_exit)
515
485MODULE_LICENSE("GPL"); 516MODULE_LICENSE("GPL");
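Throughout the rewritten dsmark dump paths, hand-rolled RTA_PUT() sequences with manual rta_len fixups give way to the typed nesting helpers. The general shape, as a sketch (the macros come from this era's linux/rtnetlink.h and all jump to a local rtattr_failure label when the skb runs out of tailroom):

    static int example_dump(struct sk_buff *skb, u8 mask, u8 value)
    {
            struct rtattr *opts = RTA_NEST(skb, TCA_OPTIONS);   /* open nest */

            RTA_PUT_U8(skb, TCA_DSMARK_MASK, mask);
            RTA_PUT_U8(skb, TCA_DSMARK_VALUE, value);
            return RTA_NEST_END(skb, opts);     /* fix rta_len, return skb->len */

    rtattr_failure:
            return RTA_NEST_CANCEL(skb, opts);  /* trim the partial nest */
    }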
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 4888305c96da..033083bf0e74 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -11,131 +11,38 @@
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h> 14#include <linux/types.h>
18#include <linux/kernel.h> 15#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h> 16#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h> 17#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <net/ip.h>
33#include <net/route.h>
34#include <linux/skbuff.h> 18#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h> 19#include <net/pkt_sched.h>
37 20
38/* 1 band FIFO pseudo-"scheduler" */ 21/* 1 band FIFO pseudo-"scheduler" */
39 22
40struct fifo_sched_data 23struct fifo_sched_data
41{ 24{
42 unsigned limit; 25 u32 limit;
43}; 26};
44 27
45static int 28static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
46bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
47{ 29{
48 struct fifo_sched_data *q = qdisc_priv(sch); 30 struct fifo_sched_data *q = qdisc_priv(sch);
49 31
50 if (sch->qstats.backlog + skb->len <= q->limit) { 32 if (likely(sch->qstats.backlog + skb->len <= q->limit))
51 __skb_queue_tail(&sch->q, skb); 33 return qdisc_enqueue_tail(skb, sch);
52 sch->qstats.backlog += skb->len;
53 sch->bstats.bytes += skb->len;
54 sch->bstats.packets++;
55 return 0;
56 }
57 sch->qstats.drops++;
58#ifdef CONFIG_NET_CLS_POLICE
59 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
60#endif
61 kfree_skb(skb);
62 return NET_XMIT_DROP;
63}
64
65static int
66bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
67{
68 __skb_queue_head(&sch->q, skb);
69 sch->qstats.backlog += skb->len;
70 sch->qstats.requeues++;
71 return 0;
72}
73
74static struct sk_buff *
75bfifo_dequeue(struct Qdisc* sch)
76{
77 struct sk_buff *skb;
78 34
79 skb = __skb_dequeue(&sch->q); 35 return qdisc_reshape_fail(skb, sch);
80 if (skb)
81 sch->qstats.backlog -= skb->len;
82 return skb;
83} 36}
84 37
85static unsigned int 38static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
86fifo_drop(struct Qdisc* sch)
87{
88 struct sk_buff *skb;
89
90 skb = __skb_dequeue_tail(&sch->q);
91 if (skb) {
92 unsigned int len = skb->len;
93 sch->qstats.backlog -= len;
94 kfree_skb(skb);
95 return len;
96 }
97 return 0;
98}
99
100static void
101fifo_reset(struct Qdisc* sch)
102{
103 skb_queue_purge(&sch->q);
104 sch->qstats.backlog = 0;
105}
106
107static int
108pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
109{ 39{
110 struct fifo_sched_data *q = qdisc_priv(sch); 40 struct fifo_sched_data *q = qdisc_priv(sch);
111 41
112 if (sch->q.qlen < q->limit) { 42 if (likely(skb_queue_len(&sch->q) < q->limit))
113 __skb_queue_tail(&sch->q, skb); 43 return qdisc_enqueue_tail(skb, sch);
114 sch->bstats.bytes += skb->len;
115 sch->bstats.packets++;
116 return 0;
117 }
118 sch->qstats.drops++;
119#ifdef CONFIG_NET_CLS_POLICE
120 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
121#endif
122 kfree_skb(skb);
123 return NET_XMIT_DROP;
124}
125
126static int
127pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
128{
129 __skb_queue_head(&sch->q, skb);
130 sch->qstats.requeues++;
131 return 0;
132}
133
134 44
135static struct sk_buff * 45 return qdisc_reshape_fail(skb, sch);
136pfifo_dequeue(struct Qdisc* sch)
137{
138 return __skb_dequeue(&sch->q);
139} 46}
140 47
141static int fifo_init(struct Qdisc *sch, struct rtattr *opt) 48static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
@@ -143,66 +50,59 @@ static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
143 struct fifo_sched_data *q = qdisc_priv(sch); 50 struct fifo_sched_data *q = qdisc_priv(sch);
144 51
145 if (opt == NULL) { 52 if (opt == NULL) {
146 unsigned int limit = sch->dev->tx_queue_len ? : 1; 53 u32 limit = sch->dev->tx_queue_len ? : 1;
147 54
148 if (sch->ops == &bfifo_qdisc_ops) 55 if (sch->ops == &bfifo_qdisc_ops)
149 q->limit = limit*sch->dev->mtu; 56 limit *= sch->dev->mtu;
150 else 57
151 q->limit = limit; 58 q->limit = limit;
152 } else { 59 } else {
153 struct tc_fifo_qopt *ctl = RTA_DATA(opt); 60 struct tc_fifo_qopt *ctl = RTA_DATA(opt);
154 if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) 61
62 if (RTA_PAYLOAD(opt) < sizeof(*ctl))
155 return -EINVAL; 63 return -EINVAL;
64
156 q->limit = ctl->limit; 65 q->limit = ctl->limit;
157 } 66 }
67
158 return 0; 68 return 0;
159} 69}
160 70
161static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) 71static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
162{ 72{
163 struct fifo_sched_data *q = qdisc_priv(sch); 73 struct fifo_sched_data *q = qdisc_priv(sch);
164 unsigned char *b = skb->tail; 74 struct tc_fifo_qopt opt = { .limit = q->limit };
165 struct tc_fifo_qopt opt;
166 75
167 opt.limit = q->limit;
168 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); 76 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
169
170 return skb->len; 77 return skb->len;
171 78
172rtattr_failure: 79rtattr_failure:
173 skb_trim(skb, b - skb->data);
174 return -1; 80 return -1;
175} 81}
176 82
177struct Qdisc_ops pfifo_qdisc_ops = { 83struct Qdisc_ops pfifo_qdisc_ops = {
178 .next = NULL,
179 .cl_ops = NULL,
180 .id = "pfifo", 84 .id = "pfifo",
181 .priv_size = sizeof(struct fifo_sched_data), 85 .priv_size = sizeof(struct fifo_sched_data),
182 .enqueue = pfifo_enqueue, 86 .enqueue = pfifo_enqueue,
183 .dequeue = pfifo_dequeue, 87 .dequeue = qdisc_dequeue_head,
184 .requeue = pfifo_requeue, 88 .requeue = qdisc_requeue,
185 .drop = fifo_drop, 89 .drop = qdisc_queue_drop,
186 .init = fifo_init, 90 .init = fifo_init,
187 .reset = fifo_reset, 91 .reset = qdisc_reset_queue,
188 .destroy = NULL,
189 .change = fifo_init, 92 .change = fifo_init,
190 .dump = fifo_dump, 93 .dump = fifo_dump,
191 .owner = THIS_MODULE, 94 .owner = THIS_MODULE,
192}; 95};
193 96
194struct Qdisc_ops bfifo_qdisc_ops = { 97struct Qdisc_ops bfifo_qdisc_ops = {
195 .next = NULL,
196 .cl_ops = NULL,
197 .id = "bfifo", 98 .id = "bfifo",
198 .priv_size = sizeof(struct fifo_sched_data), 99 .priv_size = sizeof(struct fifo_sched_data),
199 .enqueue = bfifo_enqueue, 100 .enqueue = bfifo_enqueue,
200 .dequeue = bfifo_dequeue, 101 .dequeue = qdisc_dequeue_head,
201 .requeue = bfifo_requeue, 102 .requeue = qdisc_requeue,
202 .drop = fifo_drop, 103 .drop = qdisc_queue_drop,
203 .init = fifo_init, 104 .init = fifo_init,
204 .reset = fifo_reset, 105 .reset = qdisc_reset_queue,
205 .destroy = NULL,
206 .change = fifo_init, 106 .change = fifo_init,
207 .dump = fifo_dump, 107 .dump = fifo_dump,
208 .owner = THIS_MODULE, 108 .owner = THIS_MODULE,
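The fifo rewrite is mostly mechanical: enqueue, dequeue, requeue, drop and reset all collapse onto generic helpers that this series adds to include/net/sch_generic.h, keeping the backlog and byte/packet counters consistent in one place. Simplified sketches of the two helpers the enqueue paths rely on:

    static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
    {
            __skb_queue_tail(&sch->q, skb);
            sch->qstats.backlog += skb->len;
            sch->bstats.bytes += skb->len;
            sch->bstats.packets++;
            return NET_XMIT_SUCCESS;
    }

    static inline int qdisc_reshape_fail(struct sk_buff *skb, struct Qdisc *sch)
    {
            sch->qstats.drops++;
    #ifdef CONFIG_NET_CLS_POLICE
            if (sch->reshape_fail && !sch->reshape_fail(skb, sch))
                    return NET_XMIT_SUCCESS;    /* a policer reshaped it */
    #endif
            kfree_skb(skb);
            return NET_XMIT_DROP;
    }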
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 87e48a4e1051..8edefd5d095d 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -243,31 +243,27 @@ static void dev_watchdog_down(struct net_device *dev)
243 cheaper. 243 cheaper.
244 */ 244 */
245 245
246static int 246static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
247noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
248{ 247{
249 kfree_skb(skb); 248 kfree_skb(skb);
250 return NET_XMIT_CN; 249 return NET_XMIT_CN;
251} 250}
252 251
253static struct sk_buff * 252static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
254noop_dequeue(struct Qdisc * qdisc)
255{ 253{
256 return NULL; 254 return NULL;
257} 255}
258 256
259static int 257static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
260noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
261{ 258{
262 if (net_ratelimit()) 259 if (net_ratelimit())
263 printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); 260 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
261 skb->dev->name);
264 kfree_skb(skb); 262 kfree_skb(skb);
265 return NET_XMIT_CN; 263 return NET_XMIT_CN;
266} 264}
267 265
268struct Qdisc_ops noop_qdisc_ops = { 266struct Qdisc_ops noop_qdisc_ops = {
269 .next = NULL,
270 .cl_ops = NULL,
271 .id = "noop", 267 .id = "noop",
272 .priv_size = 0, 268 .priv_size = 0,
273 .enqueue = noop_enqueue, 269 .enqueue = noop_enqueue,
@@ -285,8 +281,6 @@ struct Qdisc noop_qdisc = {
285}; 281};
286 282
287static struct Qdisc_ops noqueue_qdisc_ops = { 283static struct Qdisc_ops noqueue_qdisc_ops = {
288 .next = NULL,
289 .cl_ops = NULL,
290 .id = "noqueue", 284 .id = "noqueue",
291 .priv_size = 0, 285 .priv_size = 0,
292 .enqueue = noop_enqueue, 286 .enqueue = noop_enqueue,
@@ -311,97 +305,86 @@ static const u8 prio2band[TC_PRIO_MAX+1] =
311 generic prio+fifo combination. 305 generic prio+fifo combination.
312 */ 306 */
313 307
314static int 308#define PFIFO_FAST_BANDS 3
315pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) 309
310static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
311 struct Qdisc *qdisc)
316{ 312{
317 struct sk_buff_head *list = qdisc_priv(qdisc); 313 struct sk_buff_head *list = qdisc_priv(qdisc);
314 return list + prio2band[skb->priority & TC_PRIO_MAX];
315}
318 316
319 list += prio2band[skb->priority&TC_PRIO_MAX]; 317static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
318{
319 struct sk_buff_head *list = prio2list(skb, qdisc);
320 320
321 if (list->qlen < qdisc->dev->tx_queue_len) { 321 if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
322 __skb_queue_tail(list, skb);
323 qdisc->q.qlen++; 322 qdisc->q.qlen++;
324 qdisc->bstats.bytes += skb->len; 323 return __qdisc_enqueue_tail(skb, qdisc, list);
325 qdisc->bstats.packets++;
326 return 0;
327 } 324 }
328 qdisc->qstats.drops++; 325
329 kfree_skb(skb); 326 return qdisc_drop(skb, qdisc);
330 return NET_XMIT_DROP;
331} 327}
332 328
333static struct sk_buff * 329static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
334pfifo_fast_dequeue(struct Qdisc* qdisc)
335{ 330{
336 int prio; 331 int prio;
337 struct sk_buff_head *list = qdisc_priv(qdisc); 332 struct sk_buff_head *list = qdisc_priv(qdisc);
338 struct sk_buff *skb;
339 333
340 for (prio = 0; prio < 3; prio++, list++) { 334 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
341 skb = __skb_dequeue(list); 335 if (!skb_queue_empty(list + prio)) {
342 if (skb) {
343 qdisc->q.qlen--; 336 qdisc->q.qlen--;
344 return skb; 337 return __qdisc_dequeue_head(qdisc, list + prio);
345 } 338 }
346 } 339 }
340
347 return NULL; 341 return NULL;
348} 342}
349 343
350static int 344static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
351pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
352{ 345{
353 struct sk_buff_head *list = qdisc_priv(qdisc);
354
355 list += prio2band[skb->priority&TC_PRIO_MAX];
356
357 __skb_queue_head(list, skb);
358 qdisc->q.qlen++; 346 qdisc->q.qlen++;
359 qdisc->qstats.requeues++; 347 return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
360 return 0;
361} 348}
362 349
363static void 350static void pfifo_fast_reset(struct Qdisc* qdisc)
364pfifo_fast_reset(struct Qdisc* qdisc)
365{ 351{
366 int prio; 352 int prio;
367 struct sk_buff_head *list = qdisc_priv(qdisc); 353 struct sk_buff_head *list = qdisc_priv(qdisc);
368 354
369 for (prio=0; prio < 3; prio++) 355 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
370 skb_queue_purge(list+prio); 356 __qdisc_reset_queue(qdisc, list + prio);
357
358 qdisc->qstats.backlog = 0;
371 qdisc->q.qlen = 0; 359 qdisc->q.qlen = 0;
372} 360}
373 361
374static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) 362static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
375{ 363{
376 unsigned char *b = skb->tail; 364 struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
377 struct tc_prio_qopt opt;
378 365
379 opt.bands = 3;
380 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); 366 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
381 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); 367 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
382 return skb->len; 368 return skb->len;
383 369
384rtattr_failure: 370rtattr_failure:
385 skb_trim(skb, b - skb->data);
386 return -1; 371 return -1;
387} 372}
388 373
389static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) 374static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
390{ 375{
391 int i; 376 int prio;
392 struct sk_buff_head *list = qdisc_priv(qdisc); 377 struct sk_buff_head *list = qdisc_priv(qdisc);
393 378
394 for (i=0; i<3; i++) 379 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
395 skb_queue_head_init(list+i); 380 skb_queue_head_init(list + prio);
396 381
397 return 0; 382 return 0;
398} 383}
399 384
400static struct Qdisc_ops pfifo_fast_ops = { 385static struct Qdisc_ops pfifo_fast_ops = {
401 .next = NULL,
402 .cl_ops = NULL,
403 .id = "pfifo_fast", 386 .id = "pfifo_fast",
404 .priv_size = 3 * sizeof(struct sk_buff_head), 387 .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
405 .enqueue = pfifo_fast_enqueue, 388 .enqueue = pfifo_fast_enqueue,
406 .dequeue = pfifo_fast_dequeue, 389 .dequeue = pfifo_fast_dequeue,
407 .requeue = pfifo_fast_requeue, 390 .requeue = pfifo_fast_requeue,
@@ -411,24 +394,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
411 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
412}; 395};
413 396
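pfifo_fast keeps its three strict-priority bands, now named through PFIFO_FAST_BANDS, with the band lookup factored into prio2list(). Band selection is a simple table walk: TC_PRIO_MAX is 15, so the low four bits of skb->priority index prio2band[], and since dequeue scans band 0 first, higher bands are starved by design. As a sketch:

    static inline int pfifo_fast_band(struct sk_buff *skb)
    {
            return prio2band[skb->priority & TC_PRIO_MAX];  /* 0 = highest */
    }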
414struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) 397struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
415{ 398{
416 void *p; 399 void *p;
417 struct Qdisc *sch; 400 struct Qdisc *sch;
418 int size; 401 unsigned int size;
402 int err = -ENOBUFS;
419 403
420 /* ensure that the Qdisc and the private data are 32-byte aligned */ 404 /* ensure that the Qdisc and the private data are 32-byte aligned */
421 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 405 size = QDISC_ALIGN(sizeof(*sch));
422 size += ops->priv_size + QDISC_ALIGN_CONST; 406 size += ops->priv_size + (QDISC_ALIGNTO - 1);
423 407
424 p = kmalloc(size, GFP_KERNEL); 408 p = kmalloc(size, GFP_KERNEL);
425 if (!p) 409 if (!p)
426 return NULL; 410 goto errout;
427 memset(p, 0, size); 411 memset(p, 0, size);
428 412 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
429 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 413 sch->padded = (char *) sch - (char *) p;
430 & ~QDISC_ALIGN_CONST);
431 sch->padded = (char *)sch - (char *)p;
432 414
433 INIT_LIST_HEAD(&sch->list); 415 INIT_LIST_HEAD(&sch->list);
434 skb_queue_head_init(&sch->q); 416 skb_queue_head_init(&sch->q);
@@ -439,11 +421,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
439 dev_hold(dev); 421 dev_hold(dev);
440 sch->stats_lock = &dev->queue_lock; 422 sch->stats_lock = &dev->queue_lock;
441 atomic_set(&sch->refcnt, 1); 423 atomic_set(&sch->refcnt, 1);
424
425 return sch;
426errout:
427 return ERR_PTR(-err);
428}
429
430struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
431{
432 struct Qdisc *sch;
433
434 sch = qdisc_alloc(dev, ops);
435 if (IS_ERR(sch))
436 goto errout;
437
442 if (!ops->init || ops->init(sch, NULL) == 0) 438 if (!ops->init || ops->init(sch, NULL) == 0)
443 return sch; 439 return sch;
444 440
445 dev_put(dev); 441errout:
446 kfree(p);
447 return NULL; 442 return NULL;
448} 443}
449 444
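qdisc_alloc() factors the padded allocation out of qdisc_create_dflt() and reports failure as an error pointer instead of NULL (callers test with IS_ERR()). The alignment trick deserves spelling out: allocate enough slack to round the object up to a 32-byte boundary, then record the offset so the original kmalloc pointer can be recovered at free time. A self-contained sketch with hypothetical names:

    #define ALIGNTO         32UL
    #define ALIGN_UP(x)     (((x) + ALIGNTO - 1) & ~(ALIGNTO - 1))

    struct obj {
            int padded;             /* offset back to the kmalloc pointer */
            /* ... object and private data follow ... */
    };

    static struct obj *obj_alloc(size_t priv_size)
    {
            /* struct + private area + worst-case padding */
            size_t size = ALIGN_UP(sizeof(struct obj)) + priv_size + ALIGNTO - 1;
            void *p = kmalloc(size, GFP_KERNEL);
            struct obj *o;

            if (!p)
                    return ERR_PTR(-ENOBUFS);
            memset(p, 0, size);
            o = (struct obj *)ALIGN_UP((unsigned long)p);
            o->padded = (char *)o - (char *)p;
            return o;
    }

    static void obj_free(struct obj *o)
    {
            kfree((char *)o - o->padded);   /* undo the alignment padding */
    }

Two quirks of the hunk above are worth noting: with err initialized to -ENOBUFS, the ERR_PTR(-err) at errout encodes a positive value that IS_ERR() will not recognize, and the init-failure path in qdisc_create_dflt() now returns NULL without freeing the freshly allocated qdisc or dropping the device reference. Both were tightened up in later kernels.
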
@@ -607,6 +602,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
607EXPORT_SYMBOL(noop_qdisc); 602EXPORT_SYMBOL(noop_qdisc);
608EXPORT_SYMBOL(noop_qdisc_ops); 603EXPORT_SYMBOL(noop_qdisc_ops);
609EXPORT_SYMBOL(qdisc_create_dflt); 604EXPORT_SYMBOL(qdisc_create_dflt);
605EXPORT_SYMBOL(qdisc_alloc);
610EXPORT_SYMBOL(qdisc_destroy); 606EXPORT_SYMBOL(qdisc_destroy);
611EXPORT_SYMBOL(qdisc_reset); 607EXPORT_SYMBOL(qdisc_reset);
612EXPORT_SYMBOL(qdisc_restart); 608EXPORT_SYMBOL(qdisc_restart);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 664d0e47374f..7845d045eec4 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt)
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); 385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 386
387 q->qcount = -1; 387 q->qcount = -1;
388 if (skb_queue_len(&sch->q) == 0) 388 if (skb_queue_empty(&sch->q))
389 PSCHED_SET_PASTPERFECT(q->qidlestart); 389 PSCHED_SET_PASTPERFECT(q->qidlestart);
390 sch_tree_unlock(sch); 390 sch_tree_unlock(sch);
391 return 0; 391 return 0;
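The red_change() tweak is purely idiomatic: both tests are O(1), but skb_queue_empty() states the intent directly. From the skbuff.h of this era, roughly:

    static inline int skb_queue_empty(const struct sk_buff_head *list)
    {
            /* an empty queue's head points back at itself */
            return list->next == (struct sk_buff *)list;
    }
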
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 663843d97a92..5b24ae0650d3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
71 const struct sctp_endpoint *ep, 71 const struct sctp_endpoint *ep,
72 const struct sock *sk, 72 const struct sock *sk,
73 sctp_scope_t scope, 73 sctp_scope_t scope,
74 int gfp) 74 unsigned int __nocast gfp)
75{ 75{
76 struct sctp_sock *sp; 76 struct sctp_sock *sp;
77 int i; 77 int i;
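The recurring int -> unsigned int __nocast change in the SCTP allocation signatures (here and throughout the rest of this patch) is a sparse annotation, a no-op under plain gcc, that makes the checker warn on implicit conversions so a stray int can no longer masquerade as GFP_* allocation flags; it is the precursor of the dedicated gfp_t type. Roughly:

    #ifdef __CHECKER__
    # define __nocast __attribute__((nocast))
    #else
    # define __nocast
    #endif

    /* sparse flags callers that pass a plain int as 'gfp' */
    void *sctp_alloc_example(size_t len, unsigned int __nocast gfp)
    {
            return kmalloc(len, gfp);
    }
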
@@ -191,10 +191,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
191 asoc->last_cwr_tsn = asoc->ctsn_ack_point; 191 asoc->last_cwr_tsn = asoc->ctsn_ack_point;
192 asoc->unack_data = 0; 192 asoc->unack_data = 0;
193 193
194 SCTP_DEBUG_PRINTK("myctsnap for %s INIT as 0x%x.\n",
195 asoc->ep->debug_name,
196 asoc->ctsn_ack_point);
197
198 /* ADDIP Section 4.1 Asconf Chunk Procedures 194 /* ADDIP Section 4.1 Asconf Chunk Procedures
199 * 195 *
200 * When an endpoint has an ASCONF signaled change to be sent to the 196 * When an endpoint has an ASCONF signaled change to be sent to the
@@ -207,10 +203,11 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
207 */ 203 */
208 asoc->addip_serial = asoc->c.initial_tsn; 204 asoc->addip_serial = asoc->c.initial_tsn;
209 205
210 skb_queue_head_init(&asoc->addip_chunks); 206 INIT_LIST_HEAD(&asoc->addip_chunk_list);
211 207
212 /* Make an empty list of remote transport addresses. */ 208 /* Make an empty list of remote transport addresses. */
213 INIT_LIST_HEAD(&asoc->peer.transport_addr_list); 209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
210 asoc->peer.transport_count = 0;
214 211
215 /* RFC 2960 5.1 Normal Establishment of an Association 212 /* RFC 2960 5.1 Normal Establishment of an Association
216 * 213 *
@@ -275,7 +272,8 @@ fail_init:
275/* Allocate and initialize a new association */ 272/* Allocate and initialize a new association */
276struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, 273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
277 const struct sock *sk, 274 const struct sock *sk,
278 sctp_scope_t scope, int gfp) 275 sctp_scope_t scope,
276 unsigned int __nocast gfp)
279{ 277{
280 struct sctp_association *asoc; 278 struct sctp_association *asoc;
281 279
@@ -288,6 +286,7 @@ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
288 286
289 asoc->base.malloced = 1; 287 asoc->base.malloced = 1;
290 SCTP_DBG_OBJCNT_INC(assoc); 288 SCTP_DBG_OBJCNT_INC(assoc);
289 SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc);
291 290
292 return asoc; 291 return asoc;
293 292
@@ -356,6 +355,8 @@ void sctp_association_free(struct sctp_association *asoc)
356 sctp_transport_free(transport); 355 sctp_transport_free(transport);
357 } 356 }
358 357
358 asoc->peer.transport_count = 0;
359
359 /* Free any cached ASCONF_ACK chunk. */ 360 /* Free any cached ASCONF_ACK chunk. */
360 if (asoc->addip_last_asconf_ack) 361 if (asoc->addip_last_asconf_ack)
361 sctp_chunk_free(asoc->addip_last_asconf_ack); 362 sctp_chunk_free(asoc->addip_last_asconf_ack);
@@ -400,7 +401,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
400 /* If the primary path is changing, assume that the 401 /* If the primary path is changing, assume that the
401 * user wants to use this new path. 402 * user wants to use this new path.
402 */ 403 */
403 if (transport->active) 404 if (transport->state != SCTP_INACTIVE)
404 asoc->peer.active_path = transport; 405 asoc->peer.active_path = transport;
405 406
406 /* 407 /*
@@ -428,10 +429,58 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
428 transport->cacc.next_tsn_at_change = asoc->next_tsn; 429 transport->cacc.next_tsn_at_change = asoc->next_tsn;
429} 430}
430 431
432/* Remove a transport from an association. */
433void sctp_assoc_rm_peer(struct sctp_association *asoc,
434 struct sctp_transport *peer)
435{
436 struct list_head *pos;
437 struct sctp_transport *transport;
438
439 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_rm_peer:association %p addr: ",
440 " port: %d\n",
441 asoc,
442 (&peer->ipaddr),
443 peer->ipaddr.v4.sin_port);
444
445 /* If we are to remove the current retran_path, update it
446 * to the next peer before removing this peer from the list.
447 */
448 if (asoc->peer.retran_path == peer)
449 sctp_assoc_update_retran_path(asoc);
450
451 /* Remove this peer from the list. */
452 list_del(&peer->transports);
453
454 /* Get the first transport of asoc. */
455 pos = asoc->peer.transport_addr_list.next;
456 transport = list_entry(pos, struct sctp_transport, transports);
457
458 /* Update any entries that match the peer to be deleted. */
459 if (asoc->peer.primary_path == peer)
460 sctp_assoc_set_primary(asoc, transport);
461 if (asoc->peer.active_path == peer)
462 asoc->peer.active_path = transport;
463 if (asoc->peer.last_data_from == peer)
464 asoc->peer.last_data_from = transport;
465
466 /* If we remove the transport an INIT was last sent to, set it to
467 * NULL. Combined with the update of the retran path above, this
468 * will cause the next INIT to be sent to the next available
469 * transport, maintaining the cycle.
470 */
471 if (asoc->init_last_sent_to == peer)
472 asoc->init_last_sent_to = NULL;
473
474 asoc->peer.transport_count--;
475
476 sctp_transport_free(peer);
477}
478
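sctp_assoc_rm_peer() centralizes bookkeeping that sctp_assoc_del_peer() used to open-code (see the later hunk, which now simply delegates): every cached pointer into the transport list is repointed at a survivor before the element is freed, so no stale reference outlives it. The general shape, with hypothetical names; like the patch, it assumes at least one entry remains after removal:

    struct peer {
            struct list_head node;
            /* ... */
    };

    struct assoc_cache {
            struct list_head peers;
            struct peer *primary, *active, *last_data_from;
            int count;
    };

    static void cache_rm_peer(struct assoc_cache *c, struct peer *victim)
    {
            struct peer *survivor;

            list_del(&victim->node);
            /* first remaining entry becomes the fallback */
            survivor = list_entry(c->peers.next, struct peer, node);

            if (c->primary == victim)
                    c->primary = survivor;
            if (c->active == victim)
                    c->active = survivor;
            if (c->last_data_from == victim)
                    c->last_data_from = survivor;

            c->count--;
            kfree(victim);
    }
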
431/* Add a transport address to an association. */ 479/* Add a transport address to an association. */
432struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, 480struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
433 const union sctp_addr *addr, 481 const union sctp_addr *addr,
434 int gfp) 482 const unsigned int __nocast gfp,
483 const int peer_state)
435{ 484{
436 struct sctp_transport *peer; 485 struct sctp_transport *peer;
437 struct sctp_sock *sp; 486 struct sctp_sock *sp;
@@ -442,14 +491,25 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
442 /* AF_INET and AF_INET6 share common port field. */ 491 /* AF_INET and AF_INET6 share common port field. */
443 port = addr->v4.sin_port; 492 port = addr->v4.sin_port;
444 493
494 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_add_peer:association %p addr: ",
495 " port: %d state:%s\n",
496 asoc,
497 addr,
498 addr->v4.sin_port,
499 peer_state == SCTP_UNKNOWN?"UNKNOWN":"ACTIVE");
500
445 /* Set the port if it has not been set yet. */ 501 /* Set the port if it has not been set yet. */
446 if (0 == asoc->peer.port) 502 if (0 == asoc->peer.port)
447 asoc->peer.port = port; 503 asoc->peer.port = port;
448 504
449 /* Check to see if this is a duplicate. */ 505 /* Check to see if this is a duplicate. */
450 peer = sctp_assoc_lookup_paddr(asoc, addr); 506 peer = sctp_assoc_lookup_paddr(asoc, addr);
451 if (peer) 507 if (peer) {
508 if (peer_state == SCTP_ACTIVE &&
509 peer->state == SCTP_UNKNOWN)
510 peer->state = SCTP_ACTIVE;
452 return peer; 511 return peer;
512 }
453 513
454 peer = sctp_transport_new(addr, gfp); 514 peer = sctp_transport_new(addr, gfp);
455 if (!peer) 515 if (!peer)
@@ -516,8 +576,12 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
516 /* Set the transport's RTO.initial value */ 576 /* Set the transport's RTO.initial value */
517 peer->rto = asoc->rto_initial; 577 peer->rto = asoc->rto_initial;
518 578
579 /* Set the peer's active state. */
580 peer->state = peer_state;
581
519 /* Attach the remote transport to our asoc. */ 582 /* Attach the remote transport to our asoc. */
520 list_add_tail(&peer->transports, &asoc->peer.transport_addr_list); 583 list_add_tail(&peer->transports, &asoc->peer.transport_addr_list);
584 asoc->peer.transport_count++;
521 585
522 /* If we do not yet have a primary path, set one. */ 586 /* If we do not yet have a primary path, set one. */
523 if (!asoc->peer.primary_path) { 587 if (!asoc->peer.primary_path) {
@@ -525,8 +589,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
525 asoc->peer.retran_path = peer; 589 asoc->peer.retran_path = peer;
526 } 590 }
527 591
528 if (asoc->peer.active_path == asoc->peer.retran_path) 592 if (asoc->peer.active_path == asoc->peer.retran_path) {
529 asoc->peer.retran_path = peer; 593 asoc->peer.retran_path = peer;
594 }
530 595
531 return peer; 596 return peer;
532} 597}
@@ -537,37 +602,16 @@ void sctp_assoc_del_peer(struct sctp_association *asoc,
537{ 602{
538 struct list_head *pos; 603 struct list_head *pos;
539 struct list_head *temp; 604 struct list_head *temp;
540 struct sctp_transport *peer = NULL;
541 struct sctp_transport *transport; 605 struct sctp_transport *transport;
542 606
543 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { 607 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
544 transport = list_entry(pos, struct sctp_transport, transports); 608 transport = list_entry(pos, struct sctp_transport, transports);
545 if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { 609 if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) {
546 peer = transport; 610 /* Do book keeping for removing the peer and free it. */
547 list_del(pos); 611 sctp_assoc_rm_peer(asoc, transport);
548 break; 612 break;
549 } 613 }
550 } 614 }
551
552 /* The address we want delete is not in the association. */
553 if (!peer)
554 return;
555
556 /* Get the first transport of asoc. */
557 pos = asoc->peer.transport_addr_list.next;
558 transport = list_entry(pos, struct sctp_transport, transports);
559
560 /* Update any entries that match the peer to be deleted. */
561 if (asoc->peer.primary_path == peer)
562 sctp_assoc_set_primary(asoc, transport);
563 if (asoc->peer.active_path == peer)
564 asoc->peer.active_path = transport;
565 if (asoc->peer.retran_path == peer)
566 asoc->peer.retran_path = transport;
567 if (asoc->peer.last_data_from == peer)
568 asoc->peer.last_data_from = transport;
569
570 sctp_transport_free(peer);
571} 615}
572 616
573/* Lookup a transport by address. */ 617/* Lookup a transport by address. */
@@ -608,12 +652,12 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
608 /* Record the transition on the transport. */ 652 /* Record the transition on the transport. */
609 switch (command) { 653 switch (command) {
610 case SCTP_TRANSPORT_UP: 654 case SCTP_TRANSPORT_UP:
611 transport->active = SCTP_ACTIVE; 655 transport->state = SCTP_ACTIVE;
612 spc_state = SCTP_ADDR_AVAILABLE; 656 spc_state = SCTP_ADDR_AVAILABLE;
613 break; 657 break;
614 658
615 case SCTP_TRANSPORT_DOWN: 659 case SCTP_TRANSPORT_DOWN:
616 transport->active = SCTP_INACTIVE; 660 transport->state = SCTP_INACTIVE;
617 spc_state = SCTP_ADDR_UNREACHABLE; 661 spc_state = SCTP_ADDR_UNREACHABLE;
618 break; 662 break;
619 663
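Throughout these hunks the boolean transport->active becomes a small state machine, so addresses reported by the peer but never yet confirmed by a heartbeat can be told apart from confirmed-active and failed ones. A hypothetical sketch of the three states involved (the real constants live in the SCTP headers):

    typedef enum {
            SCTP_INACTIVE,  /* error threshold exceeded; avoid */
            SCTP_ACTIVE,    /* confirmed reachable */
            SCTP_UNKNOWN,   /* reported by the peer, unverified */
    } sctp_transport_state_t;

Note the idiom the checks settle on: usable is spelled state != SCTP_INACTIVE, so UNKNOWN transports still qualify for traffic until proven dead.
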
@@ -643,7 +687,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
643 list_for_each(pos, &asoc->peer.transport_addr_list) { 687 list_for_each(pos, &asoc->peer.transport_addr_list) {
644 t = list_entry(pos, struct sctp_transport, transports); 688 t = list_entry(pos, struct sctp_transport, transports);
645 689
646 if (!t->active) 690 if (t->state == SCTP_INACTIVE)
647 continue; 691 continue;
648 if (!first || t->last_time_heard > first->last_time_heard) { 692 if (!first || t->last_time_heard > first->last_time_heard) {
649 second = first; 693 second = first;
@@ -663,7 +707,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
663 * [If the primary is active but not most recent, bump the most 707 * [If the primary is active but not most recent, bump the most
664 * recently used transport.] 708 * recently used transport.]
665 */ 709 */
666 if (asoc->peer.primary_path->active && 710 if (asoc->peer.primary_path->state != SCTP_INACTIVE &&
667 first != asoc->peer.primary_path) { 711 first != asoc->peer.primary_path) {
668 second = first; 712 second = first;
669 first = asoc->peer.primary_path; 713 first = asoc->peer.primary_path;
@@ -958,7 +1002,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
958 transports); 1002 transports);
959 if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr)) 1003 if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr))
960 sctp_assoc_add_peer(asoc, &trans->ipaddr, 1004 sctp_assoc_add_peer(asoc, &trans->ipaddr,
961 GFP_ATOMIC); 1005 GFP_ATOMIC, SCTP_ACTIVE);
962 } 1006 }
963 1007
964 asoc->ctsn_ack_point = asoc->next_tsn - 1; 1008 asoc->ctsn_ack_point = asoc->next_tsn - 1;
@@ -998,7 +1042,7 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
998 1042
999 /* Try to find an active transport. */ 1043 /* Try to find an active transport. */
1000 1044
1001 if (t->active) { 1045 if (t->state != SCTP_INACTIVE) {
1002 break; 1046 break;
1003 } else { 1047 } else {
1004 /* Keep track of the next transport in case 1048 /* Keep track of the next transport in case
@@ -1019,6 +1063,40 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
1019 } 1063 }
1020 1064
1021 asoc->peer.retran_path = t; 1065 asoc->peer.retran_path = t;
1066
1067 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
1068 " %p addr: ",
1069 " port: %d\n",
1070 asoc,
1071 (&t->ipaddr),
1072 t->ipaddr.v4.sin_port);
1073}
1074
1075/* Choose the transport for sending an INIT packet. */
1076struct sctp_transport *sctp_assoc_choose_init_transport(
1077 struct sctp_association *asoc)
1078{
1079 struct sctp_transport *t;
1080
1081 /* Use the retran path. If the last INIT was sent over the
1082 * retran path, update the retran path and use it.
1083 */
1084 if (!asoc->init_last_sent_to) {
1085 t = asoc->peer.active_path;
1086 } else {
1087 if (asoc->init_last_sent_to == asoc->peer.retran_path)
1088 sctp_assoc_update_retran_path(asoc);
1089 t = asoc->peer.retran_path;
1090 }
1091
1092 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
1093 " %p addr: ",
1094 " port: %d\n",
1095 asoc,
1096 (&t->ipaddr),
1097 t->ipaddr.v4.sin_port);
1098
1099 return t;
1022} 1100}
1023 1101
1024/* Choose the transport for sending a SHUTDOWN packet. */ 1102/* Choose the transport for sending a SHUTDOWN packet. */
@@ -1152,7 +1230,8 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
1152/* Build the bind address list for the association based on info from the 1230/* Build the bind address list for the association based on info from the
1153 * local endpoint and the remote peer. 1231 * local endpoint and the remote peer.
1154 */ 1232 */
1155int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) 1233int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
1234 unsigned int __nocast gfp)
1156{ 1235{
1157 sctp_scope_t scope; 1236 sctp_scope_t scope;
1158 int flags; 1237 int flags;
@@ -1174,7 +1253,8 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp)
1174 1253
1175/* Build the association's bind address list from the cookie. */ 1254/* Build the association's bind address list from the cookie. */
1176int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, 1255int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
1177 struct sctp_cookie *cookie, int gfp) 1256 struct sctp_cookie *cookie,
1257 unsigned int __nocast gfp)
1178{ 1258{
1179 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); 1259 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
1180 int var_size3 = cookie->raw_addr_list_len; 1260 int var_size3 = cookie->raw_addr_list_len;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f90eadfb60a2..f71549710f2e 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,8 @@
53 53
54/* Forward declarations for internal helpers. */ 54/* Forward declarations for internal helpers. */
55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, 55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
56 sctp_scope_t scope, int gfp, int flags); 56 sctp_scope_t scope, unsigned int __nocast gfp,
57 int flags);
57static void sctp_bind_addr_clean(struct sctp_bind_addr *); 58static void sctp_bind_addr_clean(struct sctp_bind_addr *);
58 59
59/* First Level Abstractions. */ 60/* First Level Abstractions. */
@@ -63,7 +64,8 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
63 */ 64 */
64int sctp_bind_addr_copy(struct sctp_bind_addr *dest, 65int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
65 const struct sctp_bind_addr *src, 66 const struct sctp_bind_addr *src,
66 sctp_scope_t scope, int gfp, int flags) 67 sctp_scope_t scope, unsigned int __nocast gfp,
68 int flags)
67{ 69{
68 struct sctp_sockaddr_entry *addr; 70 struct sctp_sockaddr_entry *addr;
69 struct list_head *pos; 71 struct list_head *pos;
@@ -144,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
144 146
145/* Add an address to the bind address list in the SCTP_bind_addr structure. */ 147/* Add an address to the bind address list in the SCTP_bind_addr structure. */
146int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, 148int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
147 int gfp) 149 unsigned int __nocast gfp)
148{ 150{
149 struct sctp_sockaddr_entry *addr; 151 struct sctp_sockaddr_entry *addr;
150 152
@@ -197,7 +199,8 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
197 * The second argument is the return value for the length. 199 * The second argument is the return value for the length.
198 */ 200 */
199union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, 201union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
200 int *addrs_len, int gfp) 202 int *addrs_len,
203 unsigned int __nocast gfp)
201{ 204{
202 union sctp_params addrparms; 205 union sctp_params addrparms;
203 union sctp_params retval; 206 union sctp_params retval;
@@ -249,7 +252,7 @@ end_raw:
249 * address parameters). 252 * address parameters).
250 */ 253 */
251int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, 254int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
252 int addrs_len, __u16 port, int gfp) 255 int addrs_len, __u16 port, unsigned int __nocast gfp)
253{ 256{
254 union sctp_addr_param *rawaddr; 257 union sctp_addr_param *rawaddr;
255 struct sctp_paramhdr *param; 258 struct sctp_paramhdr *param;
@@ -347,7 +350,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
347/* Copy out addresses from the global local address list. */ 350/* Copy out addresses from the global local address list. */
348static int sctp_copy_one_addr(struct sctp_bind_addr *dest, 351static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
349 union sctp_addr *addr, 352 union sctp_addr *addr,
350 sctp_scope_t scope, int gfp, int flags) 353 sctp_scope_t scope, unsigned int __nocast gfp,
354 int flags)
351{ 355{
352 int error = 0; 356 int error = 0;
353 357
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0c2ab7885058..61da2937e641 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
62} 62}
63 63
64/* Allocate and initialize datamsg. */ 64/* Allocate and initialize datamsg. */
65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp) 65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
66{ 66{
67 struct sctp_datamsg *msg; 67 struct sctp_datamsg *msg;
68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp); 68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 334f61773e6d..e47ac0d1a6d6 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -67,7 +67,8 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
67 * Initialize the base fields of the endpoint structure. 67 * Initialize the base fields of the endpoint structure.
68 */ 68 */
69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, 69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
70 struct sock *sk, int gfp) 70 struct sock *sk,
71 unsigned int __nocast gfp)
71{ 72{
72 struct sctp_sock *sp = sctp_sk(sk); 73 struct sctp_sock *sp = sctp_sk(sk);
73 memset(ep, 0, sizeof(struct sctp_endpoint)); 74 memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -102,9 +103,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
102 /* Set up the base timeout information. */ 103 /* Set up the base timeout information. */
103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; 104 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = 105 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
105 SCTP_DEFAULT_TIMEOUT_T1_COOKIE; 106 msecs_to_jiffies(sp->rtoinfo.srto_initial);
106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = 107 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
107 SCTP_DEFAULT_TIMEOUT_T1_INIT; 108 msecs_to_jiffies(sp->rtoinfo.srto_initial);
108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = 109 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
109 msecs_to_jiffies(sp->rtoinfo.srto_initial); 110 msecs_to_jiffies(sp->rtoinfo.srto_initial);
110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; 111 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +118,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] 118 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); 119 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
119 120
120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 121 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
121 SCTP_DEFAULT_TIMEOUT_HEARTBEAT; 122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 123 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
123 SCTP_DEFAULT_TIMEOUT_SACK;
124 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
125 sp->autoclose * HZ;
126 124
127 /* Use SCTP specific send buffer space queues. */ 125 /* Use SCTP specific send buffer space queues. */
128 ep->sndbuf_policy = sctp_sndbuf_policy; 126 ep->sndbuf_policy = sctp_sndbuf_policy;
@@ -134,14 +132,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
134 ep->last_key = ep->current_key = 0; 132 ep->last_key = ep->current_key = 0;
135 ep->key_changed_at = jiffies; 133 ep->key_changed_at = jiffies;
136 134
137 ep->debug_name = "unnamedEndpoint";
138 return ep; 135 return ep;
139} 136}
140 137
141/* Create a sctp_endpoint with all that boring stuff initialized. 138/* Create a sctp_endpoint with all that boring stuff initialized.
142 * Returns NULL if there isn't enough memory. 139 * Returns NULL if there isn't enough memory.
143 */ 140 */
144struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp) 141struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
142 unsigned int __nocast gfp)
145{ 143{
146 struct sctp_endpoint *ep; 144 struct sctp_endpoint *ep;
147 145
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b719a77d66b4..742be9171b7d 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); 115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
116} 116}
117 117
118struct sctp_input_cb {
119 union {
120 struct inet_skb_parm h4;
121#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
122 struct inet6_skb_parm h6;
123#endif
124 } header;
125 struct sctp_chunk *chunk;
126};
127#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
128
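struct sctp_input_cb overlays the skb's control buffer, the standard way to carry per-layer metadata alongside a packet: keep the lower layer's parm struct at the front, since IP still expects to find it there, and append your own fields. skb->cb is a small fixed-size scratch area, so the overlay must fit; a compile-time guard (hypothetical, not part of this patch) would be:

    BUILD_BUG_ON(sizeof(struct sctp_input_cb) >
                 sizeof(((struct sk_buff *)0)->cb));

This is what lets the receive path below queue the skb itself to the socket backlog and recover the chunk via SCTP_INPUT_CB(skb)->chunk, rather than the old trick of casting a struct sctp_chunk to struct sk_buff.
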
118/* 129/*
119 * This is the routine which IP calls when receiving an SCTP packet. 130 * This is the routine which IP calls when receiving an SCTP packet.
120 */ 131 */
@@ -178,6 +189,37 @@ int sctp_rcv(struct sk_buff *skb)
178 189
179 asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); 190 asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
180 191
192 if (!asoc)
193 ep = __sctp_rcv_lookup_endpoint(&dest);
194
195 /* Retrieve the common input handling substructure. */
196 rcvr = asoc ? &asoc->base : &ep->base;
197 sk = rcvr->sk;
198
199 /*
200 * If a frame arrives on an interface and the receiving socket is
201 * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
202 */
203 if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb)))
204 {
205 sock_put(sk);
206 if (asoc) {
207 sctp_association_put(asoc);
208 asoc = NULL;
209 } else {
210 sctp_endpoint_put(ep);
211 ep = NULL;
212 }
213 sk = sctp_get_ctl_sock();
214 ep = sctp_sk(sk)->ep;
215 sctp_endpoint_hold(ep);
216 sock_hold(sk);
217 rcvr = &ep->base;
218 }
219
220 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
221 goto discard_release;
222
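The block just added gives SCTP the SO_BINDTODEVICE semantics the comment describes: a packet that matched an association or endpoint bound to a different interface is redirected to the global control socket, which then applies ordinary OOTB handling. The reference discipline is the subtle part; in miniature, with the rationale as comments:

    sock_put(sk);                   /* drop the hold the lookup took */
    if (asoc)
            sctp_association_put(asoc);
    else
            sctp_endpoint_put(ep);

    sk = sctp_get_ctl_sock();       /* fall back to the control socket */
    ep = sctp_sk(sk)->ep;
    sctp_endpoint_hold(ep);         /* fresh holds before 'rcvr' is re-pointed */
    sock_hold(sk);
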
181 /* 223 /*
182 * RFC 2960, 8.4 - Handle "Out of the blue" Packets. 224 * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
183 * An SCTP packet is called an "out of the blue" (OOTB) 225 * An SCTP packet is called an "out of the blue" (OOTB)
@@ -187,22 +229,12 @@ int sctp_rcv(struct sk_buff *skb)
187 * packet belongs. 229 * packet belongs.
188 */ 230 */
189 if (!asoc) { 231 if (!asoc) {
190 ep = __sctp_rcv_lookup_endpoint(&dest);
191 if (sctp_rcv_ootb(skb)) { 232 if (sctp_rcv_ootb(skb)) {
192 SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES); 233 SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES);
193 goto discard_release; 234 goto discard_release;
194 } 235 }
195 } 236 }
196 237
197 /* Retrieve the common input handling substructure. */
198 rcvr = asoc ? &asoc->base : &ep->base;
199 sk = rcvr->sk;
200
201 if ((sk) && (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)) {
202 goto discard_release;
203 }
204
205
206 /* SCTP seems to always need a timestamp right now (FIXME) */ 238 /* SCTP seems to always need a timestamp right now (FIXME) */
207 if (skb->stamp.tv_sec == 0) { 239 if (skb->stamp.tv_sec == 0) {
208 do_gettimeofday(&skb->stamp); 240 do_gettimeofday(&skb->stamp);
@@ -222,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb)
222 ret = -ENOMEM; 254 ret = -ENOMEM;
223 goto discard_release; 255 goto discard_release;
224 } 256 }
257 SCTP_INPUT_CB(skb)->chunk = chunk;
225 258
226 sctp_rcv_set_owner_r(skb,sk); 259 sctp_rcv_set_owner_r(skb,sk);
227 260
@@ -244,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb)
244 sctp_bh_lock_sock(sk); 277 sctp_bh_lock_sock(sk);
245 278
246 if (sock_owned_by_user(sk)) 279 if (sock_owned_by_user(sk))
247 sk_add_backlog(sk, (struct sk_buff *) chunk); 280 sk_add_backlog(sk, skb);
248 else 281 else
249 sctp_backlog_rcv(sk, (struct sk_buff *) chunk); 282 sctp_backlog_rcv(sk, skb);
250 283
251 /* Release the sock and any reference counts we took in the 284 /* Release the sock and any reference counts we took in the
252 * lookup calls. 285 * lookup calls.
@@ -265,13 +298,11 @@ discard_it:
265 298
266discard_release: 299discard_release:
267 /* Release any structures we may be holding. */ 300 /* Release any structures we may be holding. */
268 if (asoc) { 301 sock_put(sk);
269 sock_put(asoc->base.sk); 302 if (asoc)
270 sctp_association_put(asoc); 303 sctp_association_put(asoc);
271 } else { 304 else
272 sock_put(ep->base.sk);
273 sctp_endpoint_put(ep); 305 sctp_endpoint_put(ep);
274 }
275 306
276 goto discard_it; 307 goto discard_it;
277} 308}
@@ -283,14 +314,8 @@ discard_release:
283 */ 314 */
284int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) 315int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
285{ 316{
286 struct sctp_chunk *chunk; 317 struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
287 struct sctp_inq *inqueue; 318 struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
288
289 /* One day chunk will live inside the skb, but for
290 * now this works.
291 */
292 chunk = (struct sctp_chunk *) skb;
293 inqueue = &chunk->rcvr->inqueue;
294 319
295 sctp_inq_push(inqueue, chunk); 320 sctp_inq_push(inqueue, chunk);
296 return 0; 321 return 0;
@@ -326,7 +351,6 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
326 * 351 *
327 */ 352 */
328void sctp_icmp_proto_unreachable(struct sock *sk, 353void sctp_icmp_proto_unreachable(struct sock *sk,
329 struct sctp_endpoint *ep,
330 struct sctp_association *asoc, 354 struct sctp_association *asoc,
331 struct sctp_transport *t) 355 struct sctp_transport *t)
332{ 356{
@@ -334,7 +358,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
334 358
335 sctp_do_sm(SCTP_EVENT_T_OTHER, 359 sctp_do_sm(SCTP_EVENT_T_OTHER,
336 SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), 360 SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
337 asoc->state, asoc->ep, asoc, NULL, 361 asoc->state, asoc->ep, asoc, t,
338 GFP_ATOMIC); 362 GFP_ATOMIC);
339 363
340} 364}
@@ -342,7 +366,6 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
342/* Common lookup code for icmp/icmpv6 error handler. */ 366/* Common lookup code for icmp/icmpv6 error handler. */
343struct sock *sctp_err_lookup(int family, struct sk_buff *skb, 367struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
344 struct sctphdr *sctphdr, 368 struct sctphdr *sctphdr,
345 struct sctp_endpoint **epp,
346 struct sctp_association **app, 369 struct sctp_association **app,
347 struct sctp_transport **tpp) 370 struct sctp_transport **tpp)
348{ 371{
@@ -350,11 +373,10 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
350 union sctp_addr daddr; 373 union sctp_addr daddr;
351 struct sctp_af *af; 374 struct sctp_af *af;
352 struct sock *sk = NULL; 375 struct sock *sk = NULL;
353 struct sctp_endpoint *ep = NULL;
354 struct sctp_association *asoc = NULL; 376 struct sctp_association *asoc = NULL;
355 struct sctp_transport *transport = NULL; 377 struct sctp_transport *transport = NULL;
356 378
357 *app = NULL; *epp = NULL; *tpp = NULL; 379 *app = NULL; *tpp = NULL;
358 380
359 af = sctp_get_af_specific(family); 381 af = sctp_get_af_specific(family);
360 if (unlikely(!af)) { 382 if (unlikely(!af)) {
@@ -369,26 +391,15 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
369 * packet. 391 * packet.
370 */ 392 */
371 asoc = __sctp_lookup_association(&saddr, &daddr, &transport); 393 asoc = __sctp_lookup_association(&saddr, &daddr, &transport);
372 if (!asoc) { 394 if (!asoc)
373 /* If there is no matching association, see if it matches any 395 return NULL;
374 * endpoint. This may happen for an ICMP error generated in
375 * response to an INIT_ACK.
376 */
377 ep = __sctp_rcv_lookup_endpoint(&daddr);
378 if (!ep) {
379 return NULL;
380 }
381 }
382 396
383 if (asoc) { 397 sk = asoc->base.sk;
384 sk = asoc->base.sk;
385 398
386 if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) { 399 if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) {
387 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); 400 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
388 goto out; 401 goto out;
389 } 402 }
390 } else
391 sk = ep->base.sk;
392 403
393 sctp_bh_lock_sock(sk); 404 sctp_bh_lock_sock(sk);
394 405
@@ -398,7 +409,6 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
398 if (sock_owned_by_user(sk)) 409 if (sock_owned_by_user(sk))
399 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); 410 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
400 411
401 *epp = ep;
402 *app = asoc; 412 *app = asoc;
403 *tpp = transport; 413 *tpp = transport;
404 return sk; 414 return sk;
@@ -407,21 +417,16 @@ out:
407 sock_put(sk); 417 sock_put(sk);
408 if (asoc) 418 if (asoc)
409 sctp_association_put(asoc); 419 sctp_association_put(asoc);
410 if (ep)
411 sctp_endpoint_put(ep);
412 return NULL; 420 return NULL;
413} 421}
414 422
415/* Common cleanup code for icmp/icmpv6 error handler. */ 423/* Common cleanup code for icmp/icmpv6 error handler. */
416void sctp_err_finish(struct sock *sk, struct sctp_endpoint *ep, 424void sctp_err_finish(struct sock *sk, struct sctp_association *asoc)
417 struct sctp_association *asoc)
418{ 425{
419 sctp_bh_unlock_sock(sk); 426 sctp_bh_unlock_sock(sk);
420 sock_put(sk); 427 sock_put(sk);
421 if (asoc) 428 if (asoc)
422 sctp_association_put(asoc); 429 sctp_association_put(asoc);
423 if (ep)
424 sctp_endpoint_put(ep);
425} 430}
426 431
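With the endpoint fallback removed, an ICMP error that matches no association is simply dropped rather than matched against a bare endpoint, and the lookup/cleanup pair only ever juggles the association reference. The calling convention, roughly as the v4 and v6 handlers below use it:

    sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport);
    if (!sk)
            return;                 /* no matching association */

    /* ... translate the ICMP error into an SCTP event ... */

    sctp_err_finish(sk, asoc);      /* unlock the sock, drop the holds */
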
427/* 432/*
@@ -446,7 +451,6 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
446 int type = skb->h.icmph->type; 451 int type = skb->h.icmph->type;
447 int code = skb->h.icmph->code; 452 int code = skb->h.icmph->code;
448 struct sock *sk; 453 struct sock *sk;
449 struct sctp_endpoint *ep;
450 struct sctp_association *asoc; 454 struct sctp_association *asoc;
451 struct sctp_transport *transport; 455 struct sctp_transport *transport;
452 struct inet_sock *inet; 456 struct inet_sock *inet;
@@ -463,7 +467,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
463 savesctp = skb->h.raw; 467 savesctp = skb->h.raw;
464 skb->nh.iph = iph; 468 skb->nh.iph = iph;
465 skb->h.raw = (char *)sh; 469 skb->h.raw = (char *)sh;
466 sk = sctp_err_lookup(AF_INET, skb, sh, &ep, &asoc, &transport); 470 sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport);
467 /* Put back, the original pointers. */ 471 /* Put back, the original pointers. */
468 skb->nh.raw = saveip; 472 skb->nh.raw = saveip;
469 skb->h.raw = savesctp; 473 skb->h.raw = savesctp;
@@ -490,7 +494,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
490 } 494 }
491 else { 495 else {
492 if (ICMP_PROT_UNREACH == code) { 496 if (ICMP_PROT_UNREACH == code) {
493 sctp_icmp_proto_unreachable(sk, ep, asoc, 497 sctp_icmp_proto_unreachable(sk, asoc,
494 transport); 498 transport);
495 goto out_unlock; 499 goto out_unlock;
496 } 500 }
@@ -519,7 +523,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
519 } 523 }
520 524
521out_unlock: 525out_unlock:
522 sctp_err_finish(sk, ep, asoc); 526 sctp_err_finish(sk, asoc);
523} 527}
524 528
525/* 529/*
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cedf4351556c..2d33922c044b 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -50,7 +50,7 @@
50/* Initialize an SCTP inqueue. */ 50/* Initialize an SCTP inqueue. */
51void sctp_inq_init(struct sctp_inq *queue) 51void sctp_inq_init(struct sctp_inq *queue)
52{ 52{
53 skb_queue_head_init(&queue->in); 53 INIT_LIST_HEAD(&queue->in_chunk_list);
54 queue->in_progress = NULL; 54 queue->in_progress = NULL;
55 55
56 /* Create a task for delivering data. */ 56 /* Create a task for delivering data. */
@@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue)
62/* Release the memory associated with an SCTP inqueue. */ 62/* Release the memory associated with an SCTP inqueue. */
63void sctp_inq_free(struct sctp_inq *queue) 63void sctp_inq_free(struct sctp_inq *queue)
64{ 64{
65 struct sctp_chunk *chunk; 65 struct sctp_chunk *chunk, *tmp;
66 66
67 /* Empty the queue. */ 67 /* Empty the queue. */
68 while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL) 68 list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
69 list_del_init(&chunk->list);
69 sctp_chunk_free(chunk); 70 sctp_chunk_free(chunk);
71 }
70 72
71 /* If there is a packet which is currently being worked on, 73 /* If there is a packet which is currently being worked on,
72 * free it as well. 74 * free it as well.
@@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
92 * Eventually, we should clean up inqueue to not rely 94 * Eventually, we should clean up inqueue to not rely
93 * on the BH related data structures. 95 * on the BH related data structures.
94 */ 96 */
95 skb_queue_tail(&(q->in), (struct sk_buff *) packet); 97 list_add_tail(&packet->list, &q->in_chunk_list);
96 q->immediate.func(q->immediate.data); 98 q->immediate.func(q->immediate.data);
97} 99}
98 100
@@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
131 133
132 /* Do we need to take the next packet out of the queue to process? */ 134 /* Do we need to take the next packet out of the queue to process? */
133 if (!chunk) { 135 if (!chunk) {
136 struct list_head *entry;
137
134 /* Is the queue empty? */ 138 /* Is the queue empty? */
135 if (skb_queue_empty(&queue->in)) 139 if (list_empty(&queue->in_chunk_list))
136 return NULL; 140 return NULL;
137 141
142 entry = queue->in_chunk_list.next;
138 chunk = queue->in_progress = 143 chunk = queue->in_progress =
139 (struct sctp_chunk *) skb_dequeue(&queue->in); 144 list_entry(entry, struct sctp_chunk, list);
145 list_del_init(entry);
140 146
141 /* This is the first chunk in the packet. */ 147 /* This is the first chunk in the packet. */
142 chunk->singleton = 1; 148 chunk->singleton = 1;
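This inqueue conversion is one instance of a theme running through the rest of the patch (packet->chunk_list here, out_chunk_list and control_chunk_list in the outqueue): SCTP chunks were previously laid out so they could be cast onto sk_buff queues, and every dequeue needed a cast. Now the chunk embeds its own list_head and the queues become ordinary kernel lists. The resulting walk-and-free idiom, as used above:

    struct sctp_chunk *chunk, *tmp;

    list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
            list_del_init(&chunk->list);    /* unlink before freeing */
            sctp_chunk_free(chunk);
    }
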
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c9d9ea064734..e9b2fd480d61 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -91,7 +91,6 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
91 struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; 91 struct ipv6hdr *iph = (struct ipv6hdr *)skb->data;
92 struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); 92 struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
93 struct sock *sk; 93 struct sock *sk;
94 struct sctp_endpoint *ep;
95 struct sctp_association *asoc; 94 struct sctp_association *asoc;
96 struct sctp_transport *transport; 95 struct sctp_transport *transport;
97 struct ipv6_pinfo *np; 96 struct ipv6_pinfo *np;
@@ -105,7 +104,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
105 savesctp = skb->h.raw; 104 savesctp = skb->h.raw;
106 skb->nh.ipv6h = iph; 105 skb->nh.ipv6h = iph;
107 skb->h.raw = (char *)sh; 106 skb->h.raw = (char *)sh;
108 sk = sctp_err_lookup(AF_INET6, skb, sh, &ep, &asoc, &transport); 107 sk = sctp_err_lookup(AF_INET6, skb, sh, &asoc, &transport);
109 /* Put back, the original pointers. */ 108 /* Put back, the original pointers. */
110 skb->nh.raw = saveip; 109 skb->nh.raw = saveip;
111 skb->h.raw = savesctp; 110 skb->h.raw = savesctp;
@@ -124,7 +123,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
124 goto out_unlock; 123 goto out_unlock;
125 case ICMPV6_PARAMPROB: 124 case ICMPV6_PARAMPROB:
126 if (ICMPV6_UNK_NEXTHDR == code) { 125 if (ICMPV6_UNK_NEXTHDR == code) {
127 sctp_icmp_proto_unreachable(sk, ep, asoc, transport); 126 sctp_icmp_proto_unreachable(sk, asoc, transport);
128 goto out_unlock; 127 goto out_unlock;
129 } 128 }
130 break; 129 break;
@@ -142,7 +141,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
142 } 141 }
143 142
144out_unlock: 143out_unlock:
145 sctp_err_finish(sk, ep, asoc); 144 sctp_err_finish(sk, asoc);
146out: 145out:
147 if (likely(idev != NULL)) 146 if (likely(idev != NULL))
148 in6_dev_put(idev); 147 in6_dev_put(idev);
@@ -812,26 +811,23 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
812 if (addr->sa.sa_family != AF_INET6) 811 if (addr->sa.sa_family != AF_INET6)
813 af = sctp_get_af_specific(addr->sa.sa_family); 812 af = sctp_get_af_specific(addr->sa.sa_family);
814 else { 813 else {
815 struct sock *sk;
816 int type = ipv6_addr_type(&addr->v6.sin6_addr); 814 int type = ipv6_addr_type(&addr->v6.sin6_addr);
817 sk = sctp_opt2sk(opt); 815 struct net_device *dev;
816
818 if (type & IPV6_ADDR_LINKLOCAL) { 817 if (type & IPV6_ADDR_LINKLOCAL) {
819 /* Note: Behavior similar to af_inet6.c: 818 if (!addr->v6.sin6_scope_id)
820 * 1) Overrides previous bound_dev_if 819 return 0;
821 * 2) Destructive even if bind isn't successful. 820 dev = dev_get_by_index(addr->v6.sin6_scope_id);
822 */ 821 if (!dev)
823
824 if (addr->v6.sin6_scope_id)
825 sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
826 if (!sk->sk_bound_dev_if)
827 return 0; 822 return 0;
823 dev_put(dev);
828 } 824 }
829 af = opt->pf->af; 825 af = opt->pf->af;
830 } 826 }
831 return af->available(addr, opt); 827 return af->available(addr, opt);
832} 828}
833 829
834/* Verify that the provided sockaddr looks bindable. Common verification, 830/* Verify that the provided sockaddr looks sendable. Common verification,
835 * has already been taken care of. 831 * has already been taken care of.
836 */ 832 */
837static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) 833static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
@@ -842,19 +838,16 @@ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
842 if (addr->sa.sa_family != AF_INET6) 838 if (addr->sa.sa_family != AF_INET6)
843 af = sctp_get_af_specific(addr->sa.sa_family); 839 af = sctp_get_af_specific(addr->sa.sa_family);
844 else { 840 else {
845 struct sock *sk;
846 int type = ipv6_addr_type(&addr->v6.sin6_addr); 841 int type = ipv6_addr_type(&addr->v6.sin6_addr);
847 sk = sctp_opt2sk(opt); 842 struct net_device *dev;
843
848 if (type & IPV6_ADDR_LINKLOCAL) { 844 if (type & IPV6_ADDR_LINKLOCAL) {
849 /* Note: Behavior similar to af_inet6.c: 845 if (!addr->v6.sin6_scope_id)
850 * 1) Overrides previous bound_dev_if 846 return 0;
851 * 2) Destructive even if bind isn't successful. 847 dev = dev_get_by_index(addr->v6.sin6_scope_id);
852 */ 848 if (!dev)
853
854 if (addr->v6.sin6_scope_id)
855 sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
856 if (!sk->sk_bound_dev_if)
857 return 0; 849 return 0;
850 dev_put(dev);
858 } 851 }
859 af = opt->pf->af; 852 af = opt->pf->af;
860 } 853 }
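Both verify hooks used to set sk->sk_bound_dev_if as a side effect, destructively and even when the bind later failed, as the deleted comments admit. The replacement merely checks that a link-local address names a real device via its scope id. Pulled out as a standalone helper (hypothetical name; dev_get_by_index() takes a device reference that must be dropped):

    static int sctp_v6_scope_id_valid(int ifindex)
    {
            struct net_device *dev;

            if (!ifindex)
                    return 0;       /* link-local requires an explicit scope */
            dev = dev_get_by_index(ifindex);
            if (!dev)
                    return 0;
            dev_put(dev);           /* only existence matters here */
            return 1;
    }
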
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 0781e5d509fd..8ff588f0d76a 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -127,8 +127,12 @@ done:
127/* Initialize the objcount in the proc filesystem. */ 127/* Initialize the objcount in the proc filesystem. */
128void sctp_dbg_objcnt_init(void) 128void sctp_dbg_objcnt_init(void)
129{ 129{
130 create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp, 130 struct proc_dir_entry *ent;
131 ent = create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp,
131 sctp_dbg_objcnt_read, NULL); 132 sctp_dbg_objcnt_read, NULL);
133 if (!ent)
134 printk(KERN_WARNING
135 "sctp_dbg_objcnt: Unable to create /proc entry.\n");
132} 136}
133 137
134/* Cleanup the objcount entry in the proc filesystem. */ 138/* Cleanup the objcount entry in the proc filesystem. */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 84b5b370b09d..931371633464 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
108 packet->transport = transport; 108 packet->transport = transport;
109 packet->source_port = sport; 109 packet->source_port = sport;
110 packet->destination_port = dport; 110 packet->destination_port = dport;
111 skb_queue_head_init(&packet->chunks); 111 INIT_LIST_HEAD(&packet->chunk_list);
112 if (asoc) { 112 if (asoc) {
113 struct sctp_sock *sp = sctp_sk(asoc->base.sk); 113 struct sctp_sock *sp = sctp_sk(asoc->base.sk);
114 overhead = sp->pf->af->net_header_len; 114 overhead = sp->pf->af->net_header_len;
@@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
129/* Free a packet. */ 129/* Free a packet. */
130void sctp_packet_free(struct sctp_packet *packet) 130void sctp_packet_free(struct sctp_packet *packet)
131{ 131{
132 struct sctp_chunk *chunk; 132 struct sctp_chunk *chunk, *tmp;
133 133
134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
135 135
136 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) 136 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
137 list_del_init(&chunk->list);
137 sctp_chunk_free(chunk); 138 sctp_chunk_free(chunk);
139 }
138 140
139 if (packet->malloced) 141 if (packet->malloced)
140 kfree(packet); 142 kfree(packet);
@@ -276,7 +278,7 @@ append:
276 packet->has_sack = 1; 278 packet->has_sack = 1;
277 279
278 /* It is OK to send this chunk. */ 280 /* It is OK to send this chunk. */
279 __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk); 281 list_add_tail(&chunk->list, &packet->chunk_list);
280 packet->size += chunk_len; 282 packet->size += chunk_len;
281 chunk->transport = packet->transport; 283 chunk->transport = packet->transport;
282finish: 284finish:
@@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
295 struct sctphdr *sh; 297 struct sctphdr *sh;
296 __u32 crc32; 298 __u32 crc32;
297 struct sk_buff *nskb; 299 struct sk_buff *nskb;
298 struct sctp_chunk *chunk; 300 struct sctp_chunk *chunk, *tmp;
299 struct sock *sk; 301 struct sock *sk;
300 int err = 0; 302 int err = 0;
301 int padding; /* How much padding do we need? */ 303 int padding; /* How much padding do we need? */
@@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet)
305 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 307 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
306 308
307 /* Do NOT generate a chunkless packet. */ 309 /* Do NOT generate a chunkless packet. */
308 chunk = (struct sctp_chunk *)skb_peek(&packet->chunks); 310 if (list_empty(&packet->chunk_list))
309 if (unlikely(!chunk))
310 return err; 311 return err;
311 312
312 /* Set up convenience variables... */ 313 /* Set up convenience variables... */
314 chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
313 sk = chunk->skb->sk; 315 sk = chunk->skb->sk;
314 316
315 /* Allocate the new skb. */ 317 /* Allocate the new skb. */
@@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
370 * [This whole comment explains WORD_ROUND() below.] 372 * [This whole comment explains WORD_ROUND() below.]
371 */ 373 */
372 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); 374 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n");
373 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 375 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
376 list_del_init(&chunk->list);
374 if (sctp_chunk_is_data(chunk)) { 377 if (sctp_chunk_is_data(chunk)) {
375 378
376 if (!chunk->has_tsn) { 379 if (!chunk->has_tsn) {
@@ -511,7 +514,8 @@ err:
511 * will get resent or dropped later. 514 * will get resent or dropped later.
512 */ 515 */
513 516
514 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 517 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
518 list_del_init(&chunk->list);
515 if (!sctp_chunk_is_data(chunk)) 519 if (!sctp_chunk_is_data(chunk))
516 sctp_chunk_free(chunk); 520 sctp_chunk_free(chunk);
517 } 521 }
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 1b2d4adc4ddb..efb72faba20c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
75static inline void sctp_outq_head_data(struct sctp_outq *q, 75static inline void sctp_outq_head_data(struct sctp_outq *q,
76 struct sctp_chunk *ch) 76 struct sctp_chunk *ch)
77{ 77{
78 __skb_queue_head(&q->out, (struct sk_buff *)ch); 78 list_add(&ch->list, &q->out_chunk_list);
79 q->out_qlen += ch->skb->len; 79 q->out_qlen += ch->skb->len;
80 return; 80 return;
81} 81}
@@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
83/* Take data from the front of the queue. */ 83/* Take data from the front of the queue. */
84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) 84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
85{ 85{
86 struct sctp_chunk *ch; 86 struct sctp_chunk *ch = NULL;
87 ch = (struct sctp_chunk *)__skb_dequeue(&q->out); 87
88 if (ch) 88 if (!list_empty(&q->out_chunk_list)) {
89 struct list_head *entry = q->out_chunk_list.next;
90
91 ch = list_entry(entry, struct sctp_chunk, list);
92 list_del_init(entry);
89 q->out_qlen -= ch->skb->len; 93 q->out_qlen -= ch->skb->len;
94 }
90 return ch; 95 return ch;
91} 96}
92/* Add data chunk to the end of the queue. */ 97/* Add data chunk to the end of the queue. */
93static inline void sctp_outq_tail_data(struct sctp_outq *q, 98static inline void sctp_outq_tail_data(struct sctp_outq *q,
94 struct sctp_chunk *ch) 99 struct sctp_chunk *ch)
95{ 100{
96 __skb_queue_tail(&q->out, (struct sk_buff *)ch); 101 list_add_tail(&ch->list, &q->out_chunk_list);
97 q->out_qlen += ch->skb->len; 102 q->out_qlen += ch->skb->len;
98 return; 103 return;
99} 104}
@@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary,
197void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) 202void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
198{ 203{
199 q->asoc = asoc; 204 q->asoc = asoc;
200 skb_queue_head_init(&q->out); 205 INIT_LIST_HEAD(&q->out_chunk_list);
201 skb_queue_head_init(&q->control); 206 INIT_LIST_HEAD(&q->control_chunk_list);
202 INIT_LIST_HEAD(&q->retransmit); 207 INIT_LIST_HEAD(&q->retransmit);
203 INIT_LIST_HEAD(&q->sacked); 208 INIT_LIST_HEAD(&q->sacked);
204 INIT_LIST_HEAD(&q->abandoned); 209 INIT_LIST_HEAD(&q->abandoned);
@@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q)
217{ 222{
218 struct sctp_transport *transport; 223 struct sctp_transport *transport;
219 struct list_head *lchunk, *pos, *temp; 224 struct list_head *lchunk, *pos, *temp;
220 struct sctp_chunk *chunk; 225 struct sctp_chunk *chunk, *tmp;
221 226
222 /* Throw away unacknowledged chunks. */ 227 /* Throw away unacknowledged chunks. */
223 list_for_each(pos, &q->asoc->peer.transport_addr_list) { 228 list_for_each(pos, &q->asoc->peer.transport_addr_list) {
@@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q)
269 q->error = 0; 274 q->error = 0;
270 275
271 /* Throw away any leftover control chunks. */ 276 /* Throw away any leftover control chunks. */
272 while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL) 277 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
278 list_del_init(&chunk->list);
273 sctp_chunk_free(chunk); 279 sctp_chunk_free(chunk);
280 }
274} 281}
275 282
276/* Free the outqueue structure and any related pending chunks. */ 283/* Free the outqueue structure and any related pending chunks. */
@@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
333 break; 340 break;
334 }; 341 };
335 } else { 342 } else {
336 __skb_queue_tail(&q->control, (struct sk_buff *) chunk); 343 list_add_tail(&chunk->list, &q->control_chunk_list);
337 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 344 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
338 } 345 }
339 346
@@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
650 __u16 sport = asoc->base.bind_addr.port; 657 __u16 sport = asoc->base.bind_addr.port;
651 __u16 dport = asoc->peer.port; 658 __u16 dport = asoc->peer.port;
652 __u32 vtag = asoc->peer.i.init_tag; 659 __u32 vtag = asoc->peer.i.init_tag;
653 struct sk_buff_head *queue;
654 struct sctp_transport *transport = NULL; 660 struct sctp_transport *transport = NULL;
655 struct sctp_transport *new_transport; 661 struct sctp_transport *new_transport;
656 struct sctp_chunk *chunk; 662 struct sctp_chunk *chunk, *tmp;
657 sctp_xmit_t status; 663 sctp_xmit_t status;
658 int error = 0; 664 int error = 0;
659 int start_timer = 0; 665 int start_timer = 0;
@@ -675,16 +681,17 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
675 * ... 681 * ...
676 */ 682 */
677 683
678 queue = &q->control; 684 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
679 while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) { 685 list_del_init(&chunk->list);
686
680 /* Pick the right transport to use. */ 687 /* Pick the right transport to use. */
681 new_transport = chunk->transport; 688 new_transport = chunk->transport;
682 689
683 if (!new_transport) { 690 if (!new_transport) {
684 new_transport = asoc->peer.active_path; 691 new_transport = asoc->peer.active_path;
685 } else if (!new_transport->active) { 692 } else if (new_transport->state == SCTP_INACTIVE) {
686 /* If the chunk is Heartbeat or Heartbeat Ack, 693 /* If the chunk is Heartbeat or Heartbeat Ack,
687 * send it to chunk->transport, even if it's 694 * send it to chunk->transport, even if it's
688 * inactive. 695 * inactive.
689 * 696 *
690 * 3.3.6 Heartbeat Acknowledgement: 697 * 3.3.6 Heartbeat Acknowledgement:
@@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
814 821
815 /* Finally, transmit new packets. */ 822 /* Finally, transmit new packets. */
816 start_timer = 0; 823 start_timer = 0;
817 queue = &q->out;
818
819 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { 824 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
820 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid 825 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
821 * stream identifier. 826 * stream identifier.
@@ -840,7 +845,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
840 * Otherwise, we want to use the active path. 845 * Otherwise, we want to use the active path.
841 */ 846 */
842 new_transport = chunk->transport; 847 new_transport = chunk->transport;
843 if (!new_transport || !new_transport->active) 848 if (!new_transport ||
849 new_transport->state == SCTP_INACTIVE)
844 new_transport = asoc->peer.active_path; 850 new_transport = asoc->peer.active_path;
845 851
846 /* Change packets if necessary. */ 852 /* Change packets if necessary. */
@@ -1148,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
1148 /* See if all chunks are acked. 1154 /* See if all chunks are acked.
1149 * Make sure the empty queue handler will get run later. 1155 * Make sure the empty queue handler will get run later.
1150 */ 1156 */
1151 q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) && 1157 q->empty = (list_empty(&q->out_chunk_list) &&
1152 list_empty(&q->retransmit); 1158 list_empty(&q->control_chunk_list) &&
1159 list_empty(&q->retransmit));
1153 if (!q->empty) 1160 if (!q->empty)
1154 goto finish; 1161 goto finish;
1155 1162
@@ -1454,7 +1461,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1454 /* Mark the destination transport address as 1461 /* Mark the destination transport address as
1455 * active if it is not so marked. 1462 * active if it is not so marked.
1456 */ 1463 */
1457 if (!transport->active) { 1464 if (transport->state == SCTP_INACTIVE) {
1458 sctp_assoc_control_transport( 1465 sctp_assoc_control_transport(
1459 transport->asoc, 1466 transport->asoc,
1460 transport, 1467 transport,
@@ -1678,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1678 if (TSN_lte(tsn, ctsn)) { 1685 if (TSN_lte(tsn, ctsn)) {
1679 list_del_init(lchunk); 1686 list_del_init(lchunk);
1680 if (!chunk->tsn_gap_acked) { 1687 if (!chunk->tsn_gap_acked) {
1681 chunk->transport->flight_size -= 1688 chunk->transport->flight_size -=
1682 sctp_data_size(chunk); 1689 sctp_data_size(chunk);
1683 q->outstanding_bytes -= sctp_data_size(chunk); 1690 q->outstanding_bytes -= sctp_data_size(chunk);
1684 } 1691 }
1685 sctp_chunk_free(chunk); 1692 sctp_chunk_free(chunk);
1686 } else { 1693 } else {
@@ -1728,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1728 nskips, &ftsn_skip_arr[0]); 1735 nskips, &ftsn_skip_arr[0]);
1729 1736
1730 if (ftsn_chunk) { 1737 if (ftsn_chunk) {
1731 __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk); 1738 list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
1732 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 1739 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
1733 } 1740 }
1734} 1741}
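
The outqueue changes above are part of a conversion from sk_buff-based
queues (q->out, q->control) to plain list_head chunk lists. A minimal
sketch of the pattern, with simplified names (the demo_* types are
illustrative, not the kernel's):

    #include <linux/list.h>

    struct demo_chunk {
        struct list_head list;            /* replaces the sk_buff link */
        int tsn;
    };

    struct demo_outq {
        struct list_head out_chunk_list;  /* was: struct sk_buff_head out */
    };

    static void demo_enqueue(struct demo_outq *q, struct demo_chunk *c)
    {
        /* was: __skb_queue_tail(&q->out, (struct sk_buff *)c); */
        list_add_tail(&c->list, &q->out_chunk_list);
    }

    static struct demo_chunk *demo_dequeue(struct demo_outq *q)
    {
        struct demo_chunk *c;

        if (list_empty(&q->out_chunk_list))   /* was: skb_queue_empty() */
            return NULL;
        c = list_entry(q->out_chunk_list.next, struct demo_chunk, list);
        list_del_init(&c->list);    /* leaves c->list empty, not dangling */
        return c;
    }

The list_del_init() on dequeue is what makes the new
BUG_ON(!list_empty(&chunk->list)) assertion in sctp_chunk_free(),
further down in this patch, safe.
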
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index e42fd8c2916b..98d49ec9b74b 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -132,14 +132,25 @@ void sctp_snmp_proc_exit(void)
132static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) 132static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
133{ 133{
134 struct list_head *pos; 134 struct list_head *pos;
135 struct sctp_association *asoc;
135 struct sctp_sockaddr_entry *laddr; 136 struct sctp_sockaddr_entry *laddr;
136 union sctp_addr *addr; 137 struct sctp_transport *peer;
138 union sctp_addr *addr, *primary = NULL;
137 struct sctp_af *af; 139 struct sctp_af *af;
138 140
141 if (epb->type == SCTP_EP_TYPE_ASSOCIATION) {
142 asoc = sctp_assoc(epb);
143 peer = asoc->peer.primary_path;
144 primary = &peer->saddr;
145 }
146
139 list_for_each(pos, &epb->bind_addr.address_list) { 147 list_for_each(pos, &epb->bind_addr.address_list) {
140 laddr = list_entry(pos, struct sctp_sockaddr_entry, list); 148 laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
141 addr = (union sctp_addr *)&laddr->a; 149 addr = (union sctp_addr *)&laddr->a;
142 af = sctp_get_af_specific(addr->sa.sa_family); 150 af = sctp_get_af_specific(addr->sa.sa_family);
151 if (primary && af->cmp_addr(addr, primary)) {
152 seq_printf(seq, "*");
153 }
143 af->seq_dump_addr(seq, addr); 154 af->seq_dump_addr(seq, addr);
144 } 155 }
145} 156}
@@ -149,17 +160,54 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
149{ 160{
150 struct list_head *pos; 161 struct list_head *pos;
151 struct sctp_transport *transport; 162 struct sctp_transport *transport;
152 union sctp_addr *addr; 163 union sctp_addr *addr, *primary;
153 struct sctp_af *af; 164 struct sctp_af *af;
154 165
166 primary = &(assoc->peer.primary_addr);
155 list_for_each(pos, &assoc->peer.transport_addr_list) { 167 list_for_each(pos, &assoc->peer.transport_addr_list) {
156 transport = list_entry(pos, struct sctp_transport, transports); 168 transport = list_entry(pos, struct sctp_transport, transports);
157 addr = (union sctp_addr *)&transport->ipaddr; 169 addr = (union sctp_addr *)&transport->ipaddr;
158 af = sctp_get_af_specific(addr->sa.sa_family); 170 af = sctp_get_af_specific(addr->sa.sa_family);
171 if (af->cmp_addr(addr, primary)) {
172 seq_printf(seq, "*");
173 }
159 af->seq_dump_addr(seq, addr); 174 af->seq_dump_addr(seq, addr);
160 } 175 }
161} 176}
162 177
178static void * sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
179{
180 if (*pos > sctp_ep_hashsize)
181 return NULL;
182
183 if (*pos < 0)
184 *pos = 0;
185
186 if (*pos == 0)
187 seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n");
188
189 ++*pos;
190
191 return (void *)pos;
192}
193
194static void sctp_eps_seq_stop(struct seq_file *seq, void *v)
195{
196 return;
197}
198
199
200static void * sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos)
201{
202 if (*pos > sctp_ep_hashsize)
203 return NULL;
204
205 ++*pos;
206
207 return pos;
208}
209
210
163/* Display sctp endpoints (/proc/net/sctp/eps). */ 211/* Display sctp endpoints (/proc/net/sctp/eps). */
164static int sctp_eps_seq_show(struct seq_file *seq, void *v) 212static int sctp_eps_seq_show(struct seq_file *seq, void *v)
165{ 213{
@@ -167,38 +215,50 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v)
167 struct sctp_ep_common *epb; 215 struct sctp_ep_common *epb;
168 struct sctp_endpoint *ep; 216 struct sctp_endpoint *ep;
169 struct sock *sk; 217 struct sock *sk;
170 int hash; 218 int hash = *(int *)v;
171 219
172 seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT LADDRS\n"); 220 if (hash > sctp_ep_hashsize)
173 for (hash = 0; hash < sctp_ep_hashsize; hash++) { 221 return -ENOMEM;
174 head = &sctp_ep_hashtable[hash]; 222
175 read_lock(&head->lock); 223 head = &sctp_ep_hashtable[hash-1];
176 for (epb = head->chain; epb; epb = epb->next) { 224 sctp_local_bh_disable();
177 ep = sctp_ep(epb); 225 read_lock(&head->lock);
178 sk = epb->sk; 226 for (epb = head->chain; epb; epb = epb->next) {
179 seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d ", ep, sk, 227 ep = sctp_ep(epb);
180 sctp_sk(sk)->type, sk->sk_state, hash, 228 sk = epb->sk;
181 epb->bind_addr.port); 229 seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk,
182 sctp_seq_dump_local_addrs(seq, epb); 230 sctp_sk(sk)->type, sk->sk_state, hash-1,
183 seq_printf(seq, "\n"); 231 epb->bind_addr.port,
184 } 232 sock_i_uid(sk), sock_i_ino(sk));
185 read_unlock(&head->lock); 233
234 sctp_seq_dump_local_addrs(seq, epb);
235 seq_printf(seq, "\n");
186 } 236 }
237 read_unlock(&head->lock);
238 sctp_local_bh_enable();
187 239
188 return 0; 240 return 0;
189} 241}
190 242
243static struct seq_operations sctp_eps_ops = {
244 .start = sctp_eps_seq_start,
245 .next = sctp_eps_seq_next,
246 .stop = sctp_eps_seq_stop,
247 .show = sctp_eps_seq_show,
248};
249
250
191/* Initialize the seq file operations for 'eps' object. */ 251/* Initialize the seq file operations for 'eps' object. */
192static int sctp_eps_seq_open(struct inode *inode, struct file *file) 252static int sctp_eps_seq_open(struct inode *inode, struct file *file)
193{ 253{
194 return single_open(file, sctp_eps_seq_show, NULL); 254 return seq_open(file, &sctp_eps_ops);
195} 255}
196 256
197static struct file_operations sctp_eps_seq_fops = { 257static struct file_operations sctp_eps_seq_fops = {
198 .open = sctp_eps_seq_open, 258 .open = sctp_eps_seq_open,
199 .read = seq_read, 259 .read = seq_read,
200 .llseek = seq_lseek, 260 .llseek = seq_lseek,
201 .release = single_release, 261 .release = seq_release,
202}; 262};
203 263
204/* Set up the proc fs entry for 'eps' object. */ 264/* Set up the proc fs entry for 'eps' object. */
@@ -221,6 +281,40 @@ void sctp_eps_proc_exit(void)
221 remove_proc_entry("eps", proc_net_sctp); 281 remove_proc_entry("eps", proc_net_sctp);
222} 282}
223 283
284
285static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
286{
287 if (*pos > sctp_assoc_hashsize)
288 return NULL;
289
290 if (*pos < 0)
291 *pos = 0;
292
293 if (*pos == 0)
294 seq_printf(seq, " ASSOC SOCK STY SST ST HBKT ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
295 "RPORT LADDRS <-> RADDRS\n");
296
297 ++*pos;
298
299 return (void *)pos;
300}
301
302static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
303{
304 return;
305}
306
307
308static void * sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
309{
310 if (*pos > sctp_assoc_hashsize)
311 return NULL;
312
313 ++*pos;
314
315 return pos;
316}
317
224/* Display sctp associations (/proc/net/sctp/assocs). */ 318/* Display sctp associations (/proc/net/sctp/assocs). */
225static int sctp_assocs_seq_show(struct seq_file *seq, void *v) 319static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
226{ 320{
@@ -228,43 +322,57 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
228 struct sctp_ep_common *epb; 322 struct sctp_ep_common *epb;
229 struct sctp_association *assoc; 323 struct sctp_association *assoc;
230 struct sock *sk; 324 struct sock *sk;
231 int hash; 325 int hash = *(int *)v;
232 326
233 seq_printf(seq, " ASSOC SOCK STY SST ST HBKT LPORT RPORT " 327 if (hash > sctp_assoc_hashsize)
234 "LADDRS <-> RADDRS\n"); 328 return -ENOMEM;
235 for (hash = 0; hash < sctp_assoc_hashsize; hash++) { 329
236 head = &sctp_assoc_hashtable[hash]; 330 head = &sctp_assoc_hashtable[hash-1];
237 read_lock(&head->lock); 331 sctp_local_bh_disable();
238 for (epb = head->chain; epb; epb = epb->next) { 332 read_lock(&head->lock);
239 assoc = sctp_assoc(epb); 333 for (epb = head->chain; epb; epb = epb->next) {
240 sk = epb->sk; 334 assoc = sctp_assoc(epb);
241 seq_printf(seq, 335 sk = epb->sk;
242 "%8p %8p %-3d %-3d %-2d %-4d %-5d %-5d ", 336 seq_printf(seq,
243 assoc, sk, sctp_sk(sk)->type, sk->sk_state, 337 "%8p %8p %-3d %-3d %-2d %-4d %4d %8d %8d %7d %5lu %-5d %5d ",
244 assoc->state, hash, epb->bind_addr.port, 338 assoc, sk, sctp_sk(sk)->type, sk->sk_state,
245 assoc->peer.port); 339 assoc->state, hash-1, assoc->assoc_id,
246 sctp_seq_dump_local_addrs(seq, epb); 340 (sk->sk_rcvbuf - assoc->rwnd),
247 seq_printf(seq, "<-> "); 341 assoc->sndbuf_used,
248 sctp_seq_dump_remote_addrs(seq, assoc); 342 sock_i_uid(sk), sock_i_ino(sk),
249 seq_printf(seq, "\n"); 343 epb->bind_addr.port,
250 } 344 assoc->peer.port);
251 read_unlock(&head->lock); 345
346 seq_printf(seq, " ");
347 sctp_seq_dump_local_addrs(seq, epb);
348 seq_printf(seq, "<-> ");
349 sctp_seq_dump_remote_addrs(seq, assoc);
350 seq_printf(seq, "\n");
252 } 351 }
352 read_unlock(&head->lock);
353 sctp_local_bh_enable();
253 354
254 return 0; 355 return 0;
255} 356}
256 357
358static struct seq_operations sctp_assoc_ops = {
359 .start = sctp_assocs_seq_start,
360 .next = sctp_assocs_seq_next,
361 .stop = sctp_assocs_seq_stop,
362 .show = sctp_assocs_seq_show,
363};
364
257/* Initialize the seq file operations for 'assocs' object. */ 365/* Initialize the seq file operations for 'assocs' object. */
258static int sctp_assocs_seq_open(struct inode *inode, struct file *file) 366static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
259{ 367{
260 return single_open(file, sctp_assocs_seq_show, NULL); 368 return seq_open(file, &sctp_assoc_ops);
261} 369}
262 370
263static struct file_operations sctp_assocs_seq_fops = { 371static struct file_operations sctp_assocs_seq_fops = {
264 .open = sctp_assocs_seq_open, 372 .open = sctp_assocs_seq_open,
265 .read = seq_read, 373 .read = seq_read,
266 .llseek = seq_lseek, 374 .llseek = seq_lseek,
267 .release = single_release, 375 .release = seq_release,
268}; 376};
269 377
270/* Set up the proc fs entry for 'assocs' object. */ 378/* Set up the proc fs entry for 'assocs' object. */
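
Both /proc/net/sctp files above move from single_open(), which dumps the
whole table from one show() call, to a real seq_file iterator that visits
one hash bucket per step. A compressed sketch of that shape (demo_* names
are illustrative; the real handlers also disable bottom halves and take
the per-bucket read lock):

    #include <linux/seq_file.h>

    static int demo_hashsize = 64;        /* stands in for sctp_ep_hashsize */

    static void *demo_seq_start(struct seq_file *seq, loff_t *pos)
    {
        if (*pos > demo_hashsize)
            return NULL;
        if (*pos == 0)
            seq_puts(seq, "HEADER\n");    /* header printed exactly once */
        ++*pos;
        return pos;    /* non-NULL token; the bucket index rides in *pos */
    }

    static void *demo_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    {
        if (*pos > demo_hashsize)
            return NULL;
        ++*pos;
        return pos;
    }

    static void demo_seq_stop(struct seq_file *seq, void *v)
    {
    }

    static int demo_seq_show(struct seq_file *seq, void *v)
    {
        int hash = (int)*(loff_t *)v;    /* 1-based; the bucket is hash - 1 */

        /* lock bucket hash - 1, walk its chain, seq_printf() each entry */
        return 0;
    }

Note that the patch reads the cookie with *(int *)v even though v points
at a loff_t; the explicit loff_t dereference above is the byte-order-safe
spelling of the same thing.
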
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2e1f9c3556f5..ce9245e71fca 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
219 219
220/* Copy the local addresses which are valid for 'scope' into 'bp'. */ 220/* Copy the local addresses which are valid for 'scope' into 'bp'. */
221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, 221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
222 int gfp, int copy_flags) 222 unsigned int __nocast gfp, int copy_flags)
223{ 223{
224 struct sctp_sockaddr_entry *addr; 224 struct sctp_sockaddr_entry *addr;
225 int error = 0; 225 int error = 0;
@@ -378,10 +378,13 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
378{ 378{
379 int ret = inet_addr_type(addr->v4.sin_addr.s_addr); 379 int ret = inet_addr_type(addr->v4.sin_addr.s_addr);
380 380
381 /* FIXME: ip_nonlocal_bind sysctl support. */
382 381
383 if (addr->v4.sin_addr.s_addr != INADDR_ANY && ret != RTN_LOCAL) 382 if (addr->v4.sin_addr.s_addr != INADDR_ANY &&
383 ret != RTN_LOCAL &&
384 !sp->inet.freebind &&
385 !sysctl_ip_nonlocal_bind)
384 return 0; 386 return 0;
387
385 return 1; 388 return 1;
386} 389}
387 390
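
The sctp_v4_available() change is easier to read with De Morgan applied:
a candidate address is now bindable when any one of four conditions
holds. A condensed, illustrative form:

    /* Nonzero means bind() may proceed; each flag mirrors one test in
     * the patch: wildcard address, locally-owned address (RTN_LOCAL),
     * IP_FREEBIND set on the socket, or the ip_nonlocal_bind sysctl.
     */
    static int demo_v4_available(int is_wildcard, int is_local,
                                 int freebind, int nonlocal_bind)
    {
        return is_wildcard || is_local || freebind || nonlocal_bind;
    }
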
@@ -1047,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void)
1047 sctp_sndbuf_policy = 0; 1050 sctp_sndbuf_policy = 0;
1048 1051
1049 /* HB.interval - 30 seconds */ 1052 /* HB.interval - 30 seconds */
1050 sctp_hb_interval = 30 * HZ; 1053 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
1054
1055 /* delayed SACK timeout */
1056 sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
1051 1057
1052 /* Implementation specific variables. */ 1058 /* Implementation specific variables. */
1053 1059
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 33ac8bf47b0e..00d32b7c8266 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
78static int sctp_process_param(struct sctp_association *asoc, 78static int sctp_process_param(struct sctp_association *asoc,
79 union sctp_params param, 79 union sctp_params param,
80 const union sctp_addr *peer_addr, 80 const union sctp_addr *peer_addr,
81 int gfp); 81 unsigned int __nocast gfp);
82 82
83/* What was the inbound interface for this chunk? */ 83/* What was the inbound interface for this chunk? */
84int sctp_chunk_iif(const struct sctp_chunk *chunk) 84int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
174 */ 174 */
175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, 175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
176 const struct sctp_bind_addr *bp, 176 const struct sctp_bind_addr *bp,
177 int gfp, int vparam_len) 177 unsigned int __nocast gfp, int vparam_len)
178{ 178{
179 sctp_inithdr_t init; 179 sctp_inithdr_t init;
180 union sctp_params addrs; 180 union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
261 261
262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, 262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
263 const struct sctp_chunk *chunk, 263 const struct sctp_chunk *chunk,
264 int gfp, int unkparam_len) 264 unsigned int __nocast gfp, int unkparam_len)
265{ 265{
266 sctp_inithdr_t initack; 266 sctp_inithdr_t initack;
267 struct sctp_chunk *retval; 267 struct sctp_chunk *retval;
@@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); 1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb);
1004 } 1004 }
1005 1005
1006 INIT_LIST_HEAD(&retval->list);
1006 retval->skb = skb; 1007 retval->skb = skb;
1007 retval->asoc = (struct sctp_association *)asoc; 1008 retval->asoc = (struct sctp_association *)asoc;
1008 retval->resent = 0; 1009 retval->resent = 0;
@@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk)
1116/* Possibly, free the chunk. */ 1117/* Possibly, free the chunk. */
1117void sctp_chunk_free(struct sctp_chunk *chunk) 1118void sctp_chunk_free(struct sctp_chunk *chunk)
1118{ 1119{
1119 /* Make sure that we are not on any list. */ 1120 BUG_ON(!list_empty(&chunk->list));
1120 skb_unlink((struct sk_buff *) chunk);
1121 list_del_init(&chunk->transmitted_list); 1121 list_del_init(&chunk->transmitted_list);
1122 1122
1123 /* Release our reference on the message tracker. */ 1123 /* Release our reference on the message tracker. */
@@ -1233,7 +1233,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
1233 1233
1234/* Create a CLOSED association to use with an incoming packet. */ 1234/* Create a CLOSED association to use with an incoming packet. */
1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, 1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
1236 struct sctp_chunk *chunk, int gfp) 1236 struct sctp_chunk *chunk,
1237 unsigned int __nocast gfp)
1237{ 1238{
1238 struct sctp_association *asoc; 1239 struct sctp_association *asoc;
1239 struct sk_buff *skb; 1240 struct sk_buff *skb;
@@ -1348,7 +1349,7 @@ nodata:
1348struct sctp_association *sctp_unpack_cookie( 1349struct sctp_association *sctp_unpack_cookie(
1349 const struct sctp_endpoint *ep, 1350 const struct sctp_endpoint *ep,
1350 const struct sctp_association *asoc, 1351 const struct sctp_association *asoc,
1351 struct sctp_chunk *chunk, int gfp, 1352 struct sctp_chunk *chunk, unsigned int __nocast gfp,
1352 int *error, struct sctp_chunk **errp) 1353 int *error, struct sctp_chunk **errp)
1353{ 1354{
1354 struct sctp_association *retval = NULL; 1355 struct sctp_association *retval = NULL;
@@ -1812,7 +1813,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
1812 */ 1813 */
1813int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, 1814int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1814 const union sctp_addr *peer_addr, 1815 const union sctp_addr *peer_addr,
1815 sctp_init_chunk_t *peer_init, int gfp) 1816 sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
1816{ 1817{
1817 union sctp_params param; 1818 union sctp_params param;
1818 struct sctp_transport *transport; 1819 struct sctp_transport *transport;
@@ -1830,7 +1831,7 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1830 * be a better choice than any of the embedded addresses. 1831 * be a better choice than any of the embedded addresses.
1831 */ 1832 */
1832 if (peer_addr) 1833 if (peer_addr)
1833 if(!sctp_assoc_add_peer(asoc, peer_addr, gfp)) 1834 if(!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
1834 goto nomem; 1835 goto nomem;
1835 1836
1836 /* Process the initialization parameters. */ 1837 /* Process the initialization parameters. */
@@ -1841,6 +1842,14 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1841 goto clean_up; 1842 goto clean_up;
1842 } 1843 }
1843 1844
1845 /* Walk list of transports, removing transports in the UNKNOWN state. */
1846 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
1847 transport = list_entry(pos, struct sctp_transport, transports);
1848 if (transport->state == SCTP_UNKNOWN) {
1849 sctp_assoc_rm_peer(asoc, transport);
1850 }
1851 }
1852
1844 /* The fixed INIT headers are always in network byte 1853 /* The fixed INIT headers are always in network byte
1845 * order. 1854 * order.
1846 */ 1855 */
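
The UNKNOWN-transport cleanup above unlinks entries while walking the
list, which is only safe with the _safe iterator. The pattern in
isolation (demo_* names are illustrative):

    #include <linux/list.h>
    #include <linux/slab.h>

    struct demo_peer {
        struct list_head transports;
        int state;                    /* 0 stands in for SCTP_UNKNOWN */
    };

    static void demo_prune_unknown(struct list_head *head)
    {
        struct list_head *pos, *temp;
        struct demo_peer *p;

        /* list_for_each_safe() caches the successor in temp, so the
         * current entry may be unlinked and freed mid-walk.
         */
        list_for_each_safe(pos, temp, head) {
            p = list_entry(pos, struct demo_peer, transports);
            if (p->state == 0) {
                list_del(pos);
                kfree(p);
            }
        }
    }
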
@@ -1906,7 +1915,8 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1906 * stream sequence number shall be set to 0. 1915 * stream sequence number shall be set to 0.
1907 */ 1916 */
1908 1917
1909 /* Allocate storage for the negotiated streams if it is not a temporary * association. 1918 /* Allocate storage for the negotiated streams if it is not a temporary
1919 * association.
1910 */ 1920 */
1911 if (!asoc->temp) { 1921 if (!asoc->temp) {
1912 int assoc_id; 1922 int assoc_id;
@@ -1952,6 +1962,9 @@ clean_up:
1952 list_del_init(pos); 1962 list_del_init(pos);
1953 sctp_transport_free(transport); 1963 sctp_transport_free(transport);
1954 } 1964 }
1965
1966 asoc->peer.transport_count = 0;
1967
1955nomem: 1968nomem:
1956 return 0; 1969 return 0;
1957} 1970}
@@ -1971,7 +1984,7 @@ nomem:
1971static int sctp_process_param(struct sctp_association *asoc, 1984static int sctp_process_param(struct sctp_association *asoc,
1972 union sctp_params param, 1985 union sctp_params param,
1973 const union sctp_addr *peer_addr, 1986 const union sctp_addr *peer_addr,
1974 int gfp) 1987 unsigned int __nocast gfp)
1975{ 1988{
1976 union sctp_addr addr; 1989 union sctp_addr addr;
1977 int i; 1990 int i;
@@ -1995,7 +2008,7 @@ static int sctp_process_param(struct sctp_association *asoc,
1995 af->from_addr_param(&addr, param.addr, asoc->peer.port, 0); 2008 af->from_addr_param(&addr, param.addr, asoc->peer.port, 0);
1996 scope = sctp_scope(peer_addr); 2009 scope = sctp_scope(peer_addr);
1997 if (sctp_in_scope(&addr, scope)) 2010 if (sctp_in_scope(&addr, scope))
1998 if (!sctp_assoc_add_peer(asoc, &addr, gfp)) 2011 if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_ACTIVE))
1999 return 0; 2012 return 0;
2000 break; 2013 break;
2001 2014
@@ -2396,7 +2409,7 @@ static __u16 sctp_process_asconf_param(struct sctp_association *asoc,
2396 * Due to Resource Shortage'. 2409 * Due to Resource Shortage'.
2397 */ 2410 */
2398 2411
2399 peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC); 2412 peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_ACTIVE);
2400 if (!peer) 2413 if (!peer)
2401 return SCTP_ERROR_RSRC_LOW; 2414 return SCTP_ERROR_RSRC_LOW;
2402 2415
@@ -2727,8 +2740,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
2727 asoc->addip_last_asconf = NULL; 2740 asoc->addip_last_asconf = NULL;
2728 2741
2729 /* Send the next asconf chunk from the addip chunk queue. */ 2742 /* Send the next asconf chunk from the addip chunk queue. */
2730 asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks); 2743 if (!list_empty(&asoc->addip_chunk_list)) {
2731 if (asconf) { 2744 struct list_head *entry = asoc->addip_chunk_list.next;
2745 asconf = list_entry(entry, struct sctp_chunk, list);
2746
2747 list_del_init(entry);
2748
2732 /* Hold the chunk until an ASCONF_ACK is received. */ 2749 /* Hold the chunk until an ASCONF_ACK is received. */
2733 sctp_chunk_hold(asconf); 2750 sctp_chunk_hold(asconf);
2734 if (sctp_primitive_ASCONF(asoc, asconf)) 2751 if (sctp_primitive_ASCONF(asoc, asconf))
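
The recurring "int gfp" to "unsigned int __nocast gfp" signature change
in this file (and the ones below) is a sparse annotation rather than a
behavioral change: __nocast asks the checker to warn when a plain
integer is silently converted into the flags argument. Roughly how it is
wired up (a sketch; the real definition lives in
include/linux/compiler.h of this era):

    #ifdef __CHECKER__
    # define __nocast __attribute__((nocast))   /* seen only by sparse */
    #else
    # define __nocast                           /* compiles away for gcc */
    #endif

    /* Callers must now pass GFP_KERNEL, GFP_ATOMIC, etc., not bare ints. */
    struct sctp_chunk *demo_make_chunk(unsigned int __nocast gfp);
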
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f65fa441952f..39c970b5b198 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
63 void *event_arg, 63 void *event_arg,
64 sctp_disposition_t status, 64 sctp_disposition_t status,
65 sctp_cmd_seq_t *commands, 65 sctp_cmd_seq_t *commands,
66 int gfp); 66 unsigned int __nocast gfp);
67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, 67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
68 sctp_state_t state, 68 sctp_state_t state,
69 struct sctp_endpoint *ep, 69 struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
71 void *event_arg, 71 void *event_arg,
72 sctp_disposition_t status, 72 sctp_disposition_t status,
73 sctp_cmd_seq_t *commands, 73 sctp_cmd_seq_t *commands,
74 int gfp); 74 unsigned int __nocast gfp);
75 75
76/******************************************************************** 76/********************************************************************
77 * Helper functions 77 * Helper functions
@@ -414,11 +414,13 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
414 */ 414 */
415 asoc->overall_error_count++; 415 asoc->overall_error_count++;
416 416
417 if (transport->active && 417 if (transport->state != SCTP_INACTIVE &&
418 (transport->error_count++ >= transport->max_retrans)) { 418 (transport->error_count++ >= transport->max_retrans)) {
419 SCTP_DEBUG_PRINTK("transport_strike: transport " 419 SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
420 "IP:%d.%d.%d.%d failed.\n", 420 " transport IP: port:%d failed.\n",
421 NIPQUAD(transport->ipaddr.v4.sin_addr)); 421 asoc,
422 (&transport->ipaddr),
423 transport->ipaddr.v4.sin_port);
422 sctp_assoc_control_transport(asoc, transport, 424 sctp_assoc_control_transport(asoc, transport,
423 SCTP_TRANSPORT_DOWN, 425 SCTP_TRANSPORT_DOWN,
424 SCTP_FAILED_THRESHOLD); 426 SCTP_FAILED_THRESHOLD);
@@ -495,7 +497,8 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
495static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, 497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
496 struct sctp_association *asoc, 498 struct sctp_association *asoc,
497 struct sctp_chunk *chunk, 499 struct sctp_chunk *chunk,
498 sctp_init_chunk_t *peer_init, int gfp) 500 sctp_init_chunk_t *peer_init,
501 unsigned int __nocast gfp)
499{ 502{
500 int error; 503 int error;
501 504
@@ -593,7 +596,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
593 /* Mark the destination transport address as active if it is not so 596 /* Mark the destination transport address as active if it is not so
594 * marked. 597 * marked.
595 */ 598 */
596 if (!t->active) 599 if (t->state == SCTP_INACTIVE)
597 sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, 600 sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
598 SCTP_HEARTBEAT_SUCCESS); 601 SCTP_HEARTBEAT_SUCCESS);
599 602
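
Several hunks in this series replace the boolean transport->active with
a small state machine; the new SCTP_UNKNOWN value (visible in the
sctp_assoc_add_peer() callers) marks peer addresses that have not yet
been confirmed. A compressed, illustrative model:

    enum demo_transport_state {
        DEMO_INACTIVE,   /* confirmed down; mirrors SCTP_INACTIVE */
        DEMO_ACTIVE,     /* confirmed up;   mirrors SCTP_ACTIVE   */
        DEMO_UNKNOWN,    /* never heard from; new in this series  */
    };

    /* Old predicate: if (!t->active) ...
     * New predicate: if (t->state == SCTP_INACTIVE) ...
     * so an UNKNOWN transport is no longer lumped in with failed ones.
     */
    static int demo_needs_up_notification(enum demo_transport_state s)
    {
        return s == DEMO_INACTIVE;
    }
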
@@ -665,8 +668,11 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
665 668
666 asoc->state = state; 669 asoc->state = state;
667 670
671 SCTP_DEBUG_PRINTK("sctp_cmd_new_state: asoc %p[%s]\n",
672 asoc, sctp_state_tbl[state]);
673
668 if (sctp_style(sk, TCP)) { 674 if (sctp_style(sk, TCP)) {
669 /* Change the sk->sk_state of a TCP-style socket that has 675 /* Change the sk->sk_state of a TCP-style socket that has
670 * successfully completed a connect() call. 676 * successfully completed a connect() call.
671 */ 677 */
672 if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) 678 if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
@@ -678,6 +684,16 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
678 sk->sk_shutdown |= RCV_SHUTDOWN; 684 sk->sk_shutdown |= RCV_SHUTDOWN;
679 } 685 }
680 686
687 if (sctp_state(asoc, COOKIE_WAIT)) {
688 /* Reset init timeouts since they may have been
689 * increased due to timer expirations.
690 */
691 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
692 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT];
693 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
694 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE];
695 }
696
681 if (sctp_state(asoc, ESTABLISHED) || 697 if (sctp_state(asoc, ESTABLISHED) ||
682 sctp_state(asoc, CLOSED) || 698 sctp_state(asoc, CLOSED) ||
683 sctp_state(asoc, SHUTDOWN_RECEIVED)) { 699 sctp_state(asoc, SHUTDOWN_RECEIVED)) {
@@ -837,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
837 struct sctp_endpoint *ep, 853 struct sctp_endpoint *ep,
838 struct sctp_association *asoc, 854 struct sctp_association *asoc,
839 void *event_arg, 855 void *event_arg,
840 int gfp) 856 unsigned int __nocast gfp)
841{ 857{
842 sctp_cmd_seq_t commands; 858 sctp_cmd_seq_t commands;
843 const sctp_sm_table_entry_t *state_fn; 859 const sctp_sm_table_entry_t *state_fn;
@@ -882,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
882 void *event_arg, 898 void *event_arg,
883 sctp_disposition_t status, 899 sctp_disposition_t status,
884 sctp_cmd_seq_t *commands, 900 sctp_cmd_seq_t *commands,
885 int gfp) 901 unsigned int __nocast gfp)
886{ 902{
887 int error; 903 int error;
888 904
@@ -970,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
970 void *event_arg, 986 void *event_arg,
971 sctp_disposition_t status, 987 sctp_disposition_t status,
972 sctp_cmd_seq_t *commands, 988 sctp_cmd_seq_t *commands,
973 int gfp) 989 unsigned int __nocast gfp)
974{ 990{
975 int error = 0; 991 int error = 0;
976 int force; 992 int force;
@@ -1120,10 +1136,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1120 * to be executed only during failed attempts of 1136 * to be executed only during failed attempts of
1121 * association establishment. 1137 * association establishment.
1122 */ 1138 */
1123 if ((asoc->peer.retran_path != 1139 if ((asoc->peer.retran_path !=
1124 asoc->peer.primary_path) && 1140 asoc->peer.primary_path) &&
1125 (asoc->counters[SCTP_COUNTER_INIT_ERROR] > 0)) { 1141 (asoc->init_err_counter > 0)) {
1126 sctp_add_cmd_sf(commands, 1142 sctp_add_cmd_sf(commands,
1127 SCTP_CMD_FORCE_PRIM_RETRAN, 1143 SCTP_CMD_FORCE_PRIM_RETRAN,
1128 SCTP_NULL()); 1144 SCTP_NULL());
1129 } 1145 }
@@ -1237,18 +1253,67 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1237 sctp_association_put(asoc); 1253 sctp_association_put(asoc);
1238 break; 1254 break;
1239 1255
1256 case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
1257 chunk = cmd->obj.ptr;
1258 t = sctp_assoc_choose_init_transport(asoc);
1259 asoc->init_last_sent_to = t;
1260 chunk->transport = t;
1261 t->init_sent_count++;
1262 break;
1263
1240 case SCTP_CMD_INIT_RESTART: 1264 case SCTP_CMD_INIT_RESTART:
1241 /* Do the needed accounting and updates 1265 /* Do the needed accounting and updates
1242 * associated with restarting an initialization 1266 * associated with restarting an initialization
1243 * timer. 1267 * timer. Only multiply the timeout by two if
1268 * all transports have been tried at the current
1269 * timeout.
1270 */
1271 t = asoc->init_last_sent_to;
1272 asoc->init_err_counter++;
1273
1274 if (t->init_sent_count > (asoc->init_cycle + 1)) {
1275 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] *= 2;
1276 if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] >
1277 asoc->max_init_timeo) {
1278 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
1279 asoc->max_init_timeo;
1280 }
1281 asoc->init_cycle++;
1282 SCTP_DEBUG_PRINTK(
1283 "T1 INIT Timeout adjustment"
1284 " init_err_counter: %d"
1285 " cycle: %d"
1286 " timeout: %d\n",
1287 asoc->init_err_counter,
1288 asoc->init_cycle,
1289 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]);
1290 }
1291
1292 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
1293 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
1294 break;
1295
1296 case SCTP_CMD_COOKIEECHO_RESTART:
1297 /* Do the needed accounting and updates
1298 * associated with restarting an initialization
1299 * timer. Only multiply the timeout by two if
1300 * all transports have been tried at the current
1301 * timeout.
1244 */ 1302 */
1245 asoc->counters[SCTP_COUNTER_INIT_ERROR]++; 1303 asoc->init_err_counter++;
1246 asoc->timeouts[cmd->obj.to] *= 2; 1304
1247 if (asoc->timeouts[cmd->obj.to] > 1305 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] *= 2;
1306 if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] >
1248 asoc->max_init_timeo) { 1307 asoc->max_init_timeo) {
1249 asoc->timeouts[cmd->obj.to] = 1308 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
1250 asoc->max_init_timeo; 1309 asoc->max_init_timeo;
1251 } 1310 }
1311 SCTP_DEBUG_PRINTK(
1312 "T1 COOKIE Timeout adjustment"
1313 " init_err_counter: %d"
1314 " timeout: %d\n",
1315 asoc->init_err_counter,
1316 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]);
1252 1317
1253 /* If we've sent any data bundled with 1318 /* If we've sent any data bundled with
1254 * COOKIE-ECHO we need to resend. 1319 * COOKIE-ECHO we need to resend.
@@ -1261,7 +1326,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1261 1326
1262 sctp_add_cmd_sf(commands, 1327 sctp_add_cmd_sf(commands,
1263 SCTP_CMD_TIMER_RESTART, 1328 SCTP_CMD_TIMER_RESTART,
1264 SCTP_TO(cmd->obj.to)); 1329 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
1265 break; 1330 break;
1266 1331
1267 case SCTP_CMD_INIT_FAILED: 1332 case SCTP_CMD_INIT_FAILED:
@@ -1273,12 +1338,13 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1273 subtype, chunk, cmd->obj.u32); 1338 subtype, chunk, cmd->obj.u32);
1274 break; 1339 break;
1275 1340
1276 case SCTP_CMD_COUNTER_INC: 1341 case SCTP_CMD_INIT_COUNTER_INC:
1277 asoc->counters[cmd->obj.counter]++; 1342 asoc->init_err_counter++;
1278 break; 1343 break;
1279 1344
1280 case SCTP_CMD_COUNTER_RESET: 1345 case SCTP_CMD_INIT_COUNTER_RESET:
1281 asoc->counters[cmd->obj.counter] = 0; 1346 asoc->init_err_counter = 0;
1347 asoc->init_cycle = 0;
1282 break; 1348 break;
1283 1349
1284 case SCTP_CMD_REPORT_DUP: 1350 case SCTP_CMD_REPORT_DUP:
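
The reworked SCTP_CMD_INIT_RESTART arm above implements a guarded
exponential backoff: the T1-INIT timeout doubles only once every
transport has carried an INIT at the current value, and it is clamped to
max_init_timeo. Distilled into one function (illustrative names):

    static unsigned long demo_t1_restart(unsigned long timeout,
                                         unsigned long max_init_timeo,
                                         int init_sent_count,
                                         int init_cycle)
    {
        /* A completed cycle means every transport was tried at this
         * timeout; only then is the retransmission interval doubled.
         */
        if (init_sent_count > init_cycle + 1) {
            timeout *= 2;
            if (timeout > max_init_timeo)
                timeout = max_init_timeo;
        }
        return timeout;
    }
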
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8e01b8f09ac2..86073df418f5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
92 sctp_cmd_seq_t *commands); 92 sctp_cmd_seq_t *commands);
93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); 93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
94 94
95static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
96 __u16 error,
97 const struct sctp_association *asoc,
98 struct sctp_transport *transport);
99
100static sctp_disposition_t sctp_sf_violation_chunklen(
101 const struct sctp_endpoint *ep,
102 const struct sctp_association *asoc,
103 const sctp_subtype_t type,
104 void *arg,
105 sctp_cmd_seq_t *commands);
95 106
96/* Small helper function that checks if the chunk length 107/* Small helper function that checks if the chunk length
97 * is of the appropriate length. The 'required_length' argument 108 * is of the appropriate length. The 'required_length' argument
@@ -533,6 +544,9 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
533 sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT, 544 sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
534 SCTP_PEER_INIT(initchunk)); 545 SCTP_PEER_INIT(initchunk));
535 546
547 /* Reset init error count upon receipt of INIT-ACK. */
548 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
549
536 /* 5.1 C) "A" shall stop the T1-init timer and leave 550 /* 5.1 C) "A" shall stop the T1-init timer and leave
537 * COOKIE-WAIT state. "A" shall then ... start the T1-cookie 551 * COOKIE-WAIT state. "A" shall then ... start the T1-cookie
538 * timer, and enter the COOKIE-ECHOED state. 552 * timer, and enter the COOKIE-ECHOED state.
@@ -775,8 +789,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep,
775 * from the COOKIE-ECHOED state to the COOKIE-WAIT 789 * from the COOKIE-ECHOED state to the COOKIE-WAIT
776 * state is performed. 790 * state is performed.
777 */ 791 */
778 sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_RESET, 792 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
779 SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
780 793
781 /* RFC 2960 5.1 Normal Establishment of an Association 794 /* RFC 2960 5.1 Normal Establishment of an Association
782 * 795 *
@@ -1019,10 +1032,22 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
1019 link = sctp_assoc_lookup_paddr(asoc, &from_addr); 1032 link = sctp_assoc_lookup_paddr(asoc, &from_addr);
1020 1033
1021 /* This should never happen, but lets log it if so. */ 1034 /* This should never happen, but lets log it if so. */
1022 if (!link) { 1035 if (unlikely(!link)) {
1023 printk(KERN_WARNING 1036 if (from_addr.sa.sa_family == AF_INET6) {
1024 "%s: Could not find address %d.%d.%d.%d\n", 1037 printk(KERN_WARNING
1025 __FUNCTION__, NIPQUAD(from_addr.v4.sin_addr)); 1038 "%s association %p could not find address "
1039 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
1040 __FUNCTION__,
1041 asoc,
1042 NIP6(from_addr.v6.sin6_addr));
1043 } else {
1044 printk(KERN_WARNING
1045 "%s association %p could not find address "
1046 "%u.%u.%u.%u\n",
1047 __FUNCTION__,
1048 asoc,
1049 NIPQUAD(from_addr.v4.sin_addr.s_addr));
1050 }
1026 return SCTP_DISPOSITION_DISCARD; 1051 return SCTP_DISPOSITION_DISCARD;
1027 } 1052 }
1028 1053
@@ -2095,9 +2120,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
2095 sctp_errhdr_t *err; 2120 sctp_errhdr_t *err;
2096 struct sctp_chunk *reply; 2121 struct sctp_chunk *reply;
2097 struct sctp_bind_addr *bp; 2122 struct sctp_bind_addr *bp;
2098 int attempts; 2123 int attempts = asoc->init_err_counter + 1;
2099
2100 attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
2101 2124
2102 if (attempts >= asoc->max_init_attempts) { 2125 if (attempts >= asoc->max_init_attempts) {
2103 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 2126 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -2157,8 +2180,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
2157 /* Cast away the const modifier, as we want to just 2180 /* Cast away the const modifier, as we want to just
2158 * rerun it through as a side effect. 2181 * rerun it through as a side effect.
2159 */ 2182 */
2160 sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_INC, 2183 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());
2161 SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
2162 2184
2163 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, 2185 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
2164 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); 2186 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
@@ -2281,8 +2303,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep,
2281 if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) 2303 if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
2282 error = ((sctp_errhdr_t *)chunk->skb->data)->cause; 2304 error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
2283 2305
2284 sctp_stop_t1_and_abort(commands, error); 2306 return sctp_stop_t1_and_abort(commands, error, asoc, chunk->transport);
2285 return SCTP_DISPOSITION_ABORT;
2286} 2307}
2287 2308
2288/* 2309/*
@@ -2294,8 +2315,8 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep
2294 void *arg, 2315 void *arg,
2295 sctp_cmd_seq_t *commands) 2316 sctp_cmd_seq_t *commands)
2296{ 2317{
2297 sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR); 2318 return sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR, asoc,
2298 return SCTP_DISPOSITION_ABORT; 2319 (struct sctp_transport *)arg);
2299} 2320}
2300 2321
2301/* 2322/*
@@ -2318,8 +2339,12 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
2318 * 2339 *
2319 * This is common code called by several sctp_sf_*_abort() functions above. 2340 * This is common code called by several sctp_sf_*_abort() functions above.
2320 */ 2341 */
2321void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error) 2342static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
2343 __u16 error,
2344 const struct sctp_association *asoc,
2345 struct sctp_transport *transport)
2322{ 2346{
2347 SCTP_DEBUG_PRINTK("ABORT received (INIT).\n");
2323 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, 2348 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
2324 SCTP_STATE(SCTP_STATE_CLOSED)); 2349 SCTP_STATE(SCTP_STATE_CLOSED));
2325 SCTP_INC_STATS(SCTP_MIB_ABORTEDS); 2350 SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
@@ -2328,6 +2353,7 @@ void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error)
2328 /* CMD_INIT_FAILED will DELETE_TCB. */ 2353 /* CMD_INIT_FAILED will DELETE_TCB. */
2329 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 2354 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
2330 SCTP_U32(error)); 2355 SCTP_U32(error));
2356 return SCTP_DISPOSITION_ABORT;
2331} 2357}
2332 2358
2333/* 2359/*
@@ -3672,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
3672 * 3698 *
3673 * Generate an ABORT chunk and terminate the association. 3699 * Generate an ABORT chunk and terminate the association.
3674 */ 3700 */
3675sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, 3701static sctp_disposition_t sctp_sf_violation_chunklen(
3702 const struct sctp_endpoint *ep,
3676 const struct sctp_association *asoc, 3703 const struct sctp_association *asoc,
3677 const sctp_subtype_t type, 3704 const sctp_subtype_t type,
3678 void *arg, 3705 void *arg,
@@ -3805,6 +3832,10 @@ sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
3805 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, 3832 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC,
3806 SCTP_ASOC((struct sctp_association *) asoc)); 3833 SCTP_ASOC((struct sctp_association *) asoc));
3807 3834
3835 /* Choose transport for INIT. */
3836 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
3837 SCTP_CHUNK(repl));
3838
3808 /* After sending the INIT, "A" starts the T1-init timer and 3839 /* After sending the INIT, "A" starts the T1-init timer and
3809 * enters the COOKIE-WAIT state. 3840 * enters the COOKIE-WAIT state.
3810 */ 3841 */
@@ -4589,7 +4620,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4589} 4620}
4590 4621
4591/* 4622/*
4592 * sctp_sf_t1_timer_expire 4623 * sctp_sf_t1_init_timer_expire
4593 * 4624 *
4594 * Section: 4 Note: 2 4625 * Section: 4 Note: 2
4595 * Verification Tag: 4626 * Verification Tag:
@@ -4603,7 +4634,59 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4603 * endpoint MUST abort the initialization process and report the 4634 * endpoint MUST abort the initialization process and report the
4604 * error to SCTP user. 4635 * error to SCTP user.
4605 * 4636 *
4606 * 3) If the T1-cookie timer expires, the endpoint MUST retransmit 4637 * Outputs
4638 * (timers, events)
4639 *
4640 */
4641sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep,
4642 const struct sctp_association *asoc,
4643 const sctp_subtype_t type,
4644 void *arg,
4645 sctp_cmd_seq_t *commands)
4646{
4647 struct sctp_chunk *repl = NULL;
4648 struct sctp_bind_addr *bp;
4649 int attempts = asoc->init_err_counter + 1;
4650
4651 SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n");
4652
4653 if (attempts < asoc->max_init_attempts) {
4654 bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
4655 repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
4656 if (!repl)
4657 return SCTP_DISPOSITION_NOMEM;
4658
4659 /* Choose transport for INIT. */
4660 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
4661 SCTP_CHUNK(repl));
4662
4663 /* Issue a sideeffect to do the needed accounting. */
4664 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
4665 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
4666
4667 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
4668 } else {
4669 SCTP_DEBUG_PRINTK("Giving up on INIT, attempts: %d"
4670 " max_init_attempts: %d\n",
4671 attempts, asoc->max_init_attempts);
4672 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
4673 SCTP_U32(SCTP_ERROR_NO_ERROR));
4674 return SCTP_DISPOSITION_DELETE_TCB;
4675 }
4676
4677 return SCTP_DISPOSITION_CONSUME;
4678}
4679
4680/*
4681 * sctp_sf_t1_cookie_timer_expire
4682 *
4683 * Section: 4 Note: 2
4684 * Verification Tag:
4685 * Inputs
4686 * (endpoint, asoc)
4687 *
4688 * RFC 2960 Section 4 Notes
4689 * 3) If the T1-cookie timer expires, the endpoint MUST retransmit
4607 * COOKIE ECHO and re-start the T1-cookie timer without changing 4690 * COOKIE ECHO and re-start the T1-cookie timer without changing
4608 * state. This MUST be repeated up to 'Max.Init.Retransmits' times. 4691 * state. This MUST be repeated up to 'Max.Init.Retransmits' times.
4609 * After that, the endpoint MUST abort the initialization process and 4692 * After that, the endpoint MUST abort the initialization process and
@@ -4613,46 +4696,26 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4613 * (timers, events) 4696 * (timers, events)
4614 * 4697 *
4615 */ 4698 */
4616sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep, 4699sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep,
4617 const struct sctp_association *asoc, 4700 const struct sctp_association *asoc,
4618 const sctp_subtype_t type, 4701 const sctp_subtype_t type,
4619 void *arg, 4702 void *arg,
4620 sctp_cmd_seq_t *commands) 4703 sctp_cmd_seq_t *commands)
4621{ 4704{
4622 struct sctp_chunk *repl; 4705 struct sctp_chunk *repl = NULL;
4623 struct sctp_bind_addr *bp; 4706 int attempts = asoc->init_err_counter + 1;
4624 sctp_event_timeout_t timer = (sctp_event_timeout_t) arg;
4625 int timeout;
4626 int attempts;
4627 4707
4628 timeout = asoc->timeouts[timer]; 4708 SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n");
4629 attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
4630 repl = NULL;
4631
4632 SCTP_DEBUG_PRINTK("Timer T1 expired.\n");
4633 4709
4634 if (attempts < asoc->max_init_attempts) { 4710 if (attempts < asoc->max_init_attempts) {
4635 switch (timer) { 4711 repl = sctp_make_cookie_echo(asoc, NULL);
4636 case SCTP_EVENT_TIMEOUT_T1_INIT:
4637 bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
4638 repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
4639 break;
4640
4641 case SCTP_EVENT_TIMEOUT_T1_COOKIE:
4642 repl = sctp_make_cookie_echo(asoc, NULL);
4643 break;
4644
4645 default:
4646 BUG();
4647 break;
4648 };
4649
4650 if (!repl) 4712 if (!repl)
4651 goto nomem; 4713 return SCTP_DISPOSITION_NOMEM;
4652 4714
4653 /* Issue a sideeffect to do the needed accounting. */ 4715 /* Issue a sideeffect to do the needed accounting. */
4654 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART, 4716 sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
4655 SCTP_TO(timer)); 4717 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
4718
4656 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); 4719 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
4657 } else { 4720 } else {
4658 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 4721 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -4661,9 +4724,6 @@ sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep,
4661 } 4724 }
4662 4725
4663 return SCTP_DISPOSITION_CONSUME; 4726 return SCTP_DISPOSITION_CONSUME;
4664
4665nomem:
4666 return SCTP_DISPOSITION_NOMEM;
4667} 4727}
4668 4728
4669/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN 4729/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 8967846f69e8..75ef10408764 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -783,7 +783,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
783 /* SCTP_STATE_COOKIE_WAIT */ \ 783 /* SCTP_STATE_COOKIE_WAIT */ \
784 {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ 784 {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
785 /* SCTP_STATE_COOKIE_ECHOED */ \ 785 /* SCTP_STATE_COOKIE_ECHOED */ \
786 {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ 786 {.fn = sctp_sf_t1_cookie_timer_expire, \
787 .name = "sctp_sf_t1_cookie_timer_expire"}, \
787 /* SCTP_STATE_ESTABLISHED */ \ 788 /* SCTP_STATE_ESTABLISHED */ \
788 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 789 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
789 /* SCTP_STATE_SHUTDOWN_PENDING */ \ 790 /* SCTP_STATE_SHUTDOWN_PENDING */ \
@@ -802,7 +803,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
802 /* SCTP_STATE_CLOSED */ \ 803 /* SCTP_STATE_CLOSED */ \
803 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 804 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
804 /* SCTP_STATE_COOKIE_WAIT */ \ 805 /* SCTP_STATE_COOKIE_WAIT */ \
805 {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ 806 {.fn = sctp_sf_t1_init_timer_expire, \
807 .name = "sctp_sf_t1_init_timer_expire"}, \
806 /* SCTP_STATE_COOKIE_ECHOED */ \ 808 /* SCTP_STATE_COOKIE_ECHOED */ \
807 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 809 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
808 /* SCTP_STATE_ESTABLISHED */ \ 810 /* SCTP_STATE_ESTABLISHED */ \
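
This table change is why the timer split in sm_statefuns.c works: each
(timeout, state) cell now binds a dedicated handler, so the handlers no
longer need a runtime switch on the timer type or its BUG() default. The
dispatch shape, reduced to essentials (demo_* names are illustrative):

    typedef int (*demo_sm_fn)(void *arg);

    struct demo_sm_entry {
        demo_sm_fn fn;
        const char *name;
    };

    static int demo_t1_init_expire(void *arg)   { return 0; } /* resend INIT */
    static int demo_t1_cookie_expire(void *arg) { return 0; } /* resend COOKIE ECHO */

    /* One handler per state cell: the event is encoded by position and
     * never rediscovered at runtime.
     */
    static const struct demo_sm_entry demo_t1_cells[] = {
        { .fn = demo_t1_init_expire,   .name = "demo_t1_init_expire"   },
        { .fn = demo_t1_cookie_expire, .name = "demo_t1_cookie_expire" },
    };
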
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0b338eca6dc0..091a66f06a35 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -262,18 +262,18 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
262 * sockaddr_in6 [RFC 2553]), 262 * sockaddr_in6 [RFC 2553]),
263 * addr_len - the size of the address structure. 263 * addr_len - the size of the address structure.
264 */ 264 */
265SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) 265SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
266{ 266{
267 int retval = 0; 267 int retval = 0;
268 268
269 sctp_lock_sock(sk); 269 sctp_lock_sock(sk);
270 270
271 SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, uaddr: %p, addr_len: %d)\n", 271 SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, addr: %p, addr_len: %d)\n",
272 sk, uaddr, addr_len); 272 sk, addr, addr_len);
273 273
274 /* Disallow binding twice. */ 274 /* Disallow binding twice. */
275 if (!sctp_sk(sk)->ep->base.bind_addr.port) 275 if (!sctp_sk(sk)->ep->base.bind_addr.port)
276 retval = sctp_do_bind(sk, (union sctp_addr *)uaddr, 276 retval = sctp_do_bind(sk, (union sctp_addr *)addr,
277 addr_len); 277 addr_len);
278 else 278 else
279 retval = -EINVAL; 279 retval = -EINVAL;
@@ -318,23 +318,27 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
318 unsigned short snum; 318 unsigned short snum;
319 int ret = 0; 319 int ret = 0;
320 320
321 SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d)\n",
322 sk, addr, len);
323
324 /* Common sockaddr verification. */ 321 /* Common sockaddr verification. */
325 af = sctp_sockaddr_af(sp, addr, len); 322 af = sctp_sockaddr_af(sp, addr, len);
326 if (!af) 323 if (!af) {
324 SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d) EINVAL\n",
325 sk, addr, len);
327 return -EINVAL; 326 return -EINVAL;
327 }
328
329 snum = ntohs(addr->v4.sin_port);
330
331 SCTP_DEBUG_PRINTK_IPADDR("sctp_do_bind(sk: %p, new addr: ",
332 ", port: %d, new port: %d, len: %d)\n",
333 sk,
334 addr,
335 bp->port, snum,
336 len);
328 337
329 /* PF specific bind() address verification. */ 338 /* PF specific bind() address verification. */
330 if (!sp->pf->bind_verify(sp, addr)) 339 if (!sp->pf->bind_verify(sp, addr))
331 return -EADDRNOTAVAIL; 340 return -EADDRNOTAVAIL;
332 341
333 snum= ntohs(addr->v4.sin_port);
334
335 SCTP_DEBUG_PRINTK("sctp_do_bind: port: %d, new port: %d\n",
336 bp->port, snum);
337
338 /* We must either be unbound, or bind to the same port. */ 342 /* We must either be unbound, or bind to the same port. */
339 if (bp->port && (snum != bp->port)) { 343 if (bp->port && (snum != bp->port)) {
340 SCTP_DEBUG_PRINTK("sctp_do_bind:" 344 SCTP_DEBUG_PRINTK("sctp_do_bind:"
@@ -402,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc,
402 * transmission. 406 * transmission.
403 */ 407 */
404 if (asoc->addip_last_asconf) { 408 if (asoc->addip_last_asconf) {
405 __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk); 409 list_add_tail(&chunk->list, &asoc->addip_chunk_list);
406 goto out; 410 goto out;
407 } 411 }
408 412
@@ -816,7 +820,8 @@ out:
816 * 820 *
817 * Basically do nothing but copying the addresses from user to kernel 821 * Basically do nothing but copying the addresses from user to kernel
818 * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk. 822 * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk.
819 * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace. 823 * This is used for tunneling the sctp_bindx() request through sctp_setsockopt()
824 * from userspace.
820 * 825 *
821 * We don't use copy_from_user() for optimization: we first do the 826 * We don't use copy_from_user() for optimization: we first do the
822 * sanity checks (buffer size -fast- and access check-healthy 827 * sanity checks (buffer size -fast- and access check-healthy
@@ -913,6 +918,243 @@ out:
913 return err; 918 return err;
914} 919}
915 920
921/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
922 *
923 * Common routine for handling connect() and sctp_connectx().
924 * Connect will come in with just a single address.
925 */
926static int __sctp_connect(struct sock* sk,
927 struct sockaddr *kaddrs,
928 int addrs_size)
929{
930 struct sctp_sock *sp;
931 struct sctp_endpoint *ep;
932 struct sctp_association *asoc = NULL;
933 struct sctp_association *asoc2;
934 struct sctp_transport *transport;
935 union sctp_addr to;
936 struct sctp_af *af;
937 sctp_scope_t scope;
938 long timeo;
939 int err = 0;
940 int addrcnt = 0;
941 int walk_size = 0;
942 struct sockaddr *sa_addr;
943 void *addr_buf;
944
945 sp = sctp_sk(sk);
946 ep = sp->ep;
947
948 /* connect() cannot be done on a socket that is already in ESTABLISHED
949 * state - UDP-style peeled off socket or a TCP-style socket that
950 * is already connected.
951 * It cannot be done even on a TCP-style listening socket.
952 */
953 if (sctp_sstate(sk, ESTABLISHED) ||
954 (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
955 err = -EISCONN;
956 goto out_free;
957 }
958
959 /* Walk through the addrs buffer and count the number of addresses. */
960 addr_buf = kaddrs;
961 while (walk_size < addrs_size) {
962 sa_addr = (struct sockaddr *)addr_buf;
963 af = sctp_get_af_specific(sa_addr->sa_family);
964
965 /* If the address family is not supported or if this address
966 * causes the address buffer to overflow return EINVAL.
967 */
968 if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
969 err = -EINVAL;
970 goto out_free;
971 }
972
973 err = sctp_verify_addr(sk, (union sctp_addr *)sa_addr,
974 af->sockaddr_len);
975 if (err)
976 goto out_free;
977
978 memcpy(&to, sa_addr, af->sockaddr_len);
979 to.v4.sin_port = ntohs(to.v4.sin_port);
980
981 /* Check if there already is a matching association on the
982 * endpoint (other than the one created here).
983 */
984 asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
985 if (asoc2 && asoc2 != asoc) {
986 if (asoc2->state >= SCTP_STATE_ESTABLISHED)
987 err = -EISCONN;
988 else
989 err = -EALREADY;
990 goto out_free;
991 }
992
993 /* If we could not find a matching association on the endpoint,
994 * make sure that there is no peeled-off association matching
995 * the peer address even on another socket.
996 */
997 if (sctp_endpoint_is_peeled_off(ep, &to)) {
998 err = -EADDRNOTAVAIL;
999 goto out_free;
1000 }
1001
1002 if (!asoc) {
1003 /* If a bind() or sctp_bindx() is not called prior to
1004 * an sctp_connectx() call, the system picks an
1005 * ephemeral port and will choose an address set
1006 * equivalent to binding with a wildcard address.
1007 */
1008 if (!ep->base.bind_addr.port) {
1009 if (sctp_autobind(sk)) {
1010 err = -EAGAIN;
1011 goto out_free;
1012 }
1013 }
1014
1015 scope = sctp_scope(&to);
1016 asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
1017 if (!asoc) {
1018 err = -ENOMEM;
1019 goto out_free;
1020 }
1021 }
1022
1023 /* Prime the peer's transport structures. */
1024 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
1025 SCTP_UNKNOWN);
1026 if (!transport) {
1027 err = -ENOMEM;
1028 goto out_free;
1029 }
1030
1031 addrcnt++;
1032 addr_buf += af->sockaddr_len;
1033 walk_size += af->sockaddr_len;
1034 }
1035
1036 err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
1037 if (err < 0) {
1038 goto out_free;
1039 }
1040
1041 err = sctp_primitive_ASSOCIATE(asoc, NULL);
1042 if (err < 0) {
1043 goto out_free;
1044 }
1045
1046 /* Initialize sk's dport and daddr for getpeername() */
1047 inet_sk(sk)->dport = htons(asoc->peer.port);
1048 af = sctp_get_af_specific(to.sa.sa_family);
1049 af->to_sk_daddr(&to, sk);
1050
1051 timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
1052 err = sctp_wait_for_connect(asoc, &timeo);
1053
1054 /* Don't free association on exit. */
1055 asoc = NULL;
1056
1057out_free:
1058
1059 SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p"
1060 " kaddrs: %p err: %d\n",
1061 asoc, kaddrs, err);
1062 if (asoc)
1063 sctp_association_free(asoc);
1064 return err;
1065}
1066
1067/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
1068 *
1069 * API 8.9
1070 * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt);
1071 *
1072 * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
1073 * If the sd is an IPv6 socket, the addresses passed can either be IPv4
1074 * or IPv6 addresses.
1075 *
1076 * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
1077 * Section 3.1.2 for this usage.
1078 *
1079 * addrs is a pointer to an array of one or more socket addresses. Each
1080 * address is contained in its appropriate structure (i.e. struct
1081 * sockaddr_in or struct sockaddr_in6); the family of the address type
1082 * must be used to distinguish the address length (note that this
1083 * representation is termed a "packed array" of addresses). The caller
1084 * specifies the number of addresses in the array with addrcnt.
1085 *
1086 * On success, sctp_connectx() returns 0. On failure, sctp_connectx() returns
1087 * -1, and sets errno to the appropriate error code.
1088 *
1089 * For SCTP, the port given in each socket address must be the same, or
1090 * sctp_connectx() will fail, setting errno to EINVAL.
1091 *
1092 * An application can use sctp_connectx to initiate an association with
1093 * an endpoint that is multi-homed. Much like sctp_bindx() this call
1094 * allows a caller to specify multiple addresses at which a peer can be
1095 * reached. The way the SCTP stack uses the list of addresses to set up
1096 * the association is implementation dependent. This function only
1097 * specifies that the stack will try to make use of all the addresses in
1098 * the list when needed.
1099 *
1100 * Note that the list of addresses passed in is only used for setting up
1101 * the association. It does not necessarily equal the set of addresses
1102 * the peer uses for the resulting association. If the caller wants to
1103 * find out the set of peer addresses, it must use sctp_getpaddrs() to
1104 * retrieve them after the association has been set up.
1105 *
1106 * Basically do nothing but copying the addresses from user to kernel
1107 * land and invoking sctp_connectx(). This is used for tunneling
1108 * the sctp_connectx() request through sctp_setsockopt() from userspace.
1109 *
1110 * We don't use copy_from_user() as an optimization: we first do the
1111 * cheap sanity checks (buffer size, then access_ok() on the user
1112 * pointer); only if both succeed do we allocate the memory
1113 * (an expensive operation) needed to copy the data into the kernel.
1114 * Then we do the copy without re-checking the user space area
1115 * (__copy_from_user()).
1116 *
1117 * On exit there is no need to do sockfd_put(), sys_setsockopt() does
1118 * it.
1119 *
1120 * sk The sk of the socket
1121 * addrs The pointer to the addresses in user land
1122 * addrs_size Size of the addrs buffer
1123 *
1124 * Returns 0 if ok, <0 errno code on error.
1125 */
1126SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk,
1127 struct sockaddr __user *addrs,
1128 int addrs_size)
1129{
1130 int err = 0;
1131 struct sockaddr *kaddrs;
1132
1133 SCTP_DEBUG_PRINTK("%s - sk %p addrs %p addrs_size %d\n",
1134 __FUNCTION__, sk, addrs, addrs_size);
1135
1136 if (unlikely(addrs_size <= 0))
1137 return -EINVAL;
1138
1139 /* Check the user passed a healthy pointer. */
1140 if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
1141 return -EFAULT;
1142
1143 /* Alloc space for the address array in kernel memory. */
1144 kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL);
1145 if (unlikely(!kaddrs))
1146 return -ENOMEM;
1147
1148 if (__copy_from_user(kaddrs, addrs, addrs_size)) {
1149 err = -EFAULT;
1150 } else {
1151 err = __sctp_connect(sk, kaddrs, addrs_size);
1152 }
1153
1154 kfree(kaddrs);
1155 return err;
1156}
1157
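To make the packed-array contract above concrete, here is a minimal userspace sketch of the tunneling path: pack the sockaddrs back to back, then push them through setsockopt(). It assumes IPPROTO_SCTP and the SCTP_SOCKOPT_CONNECTX option id are visible from the kernel/lksctp headers of this era; the production wrapper lives in lksctp-tools, so treat this as illustration only.

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>	/* assumed: SCTP_SOCKOPT_CONNECTX */

static int my_connectx(int sd, const struct sockaddr_in *addrs, int addrcnt)
{
	if (addrcnt <= 0)
		return -1;

	char buf[addrcnt * sizeof(struct sockaddr_in)];
	int i;

	/* Build the "packed array" described in the API text: each
	 * sockaddr immediately follows the previous one. */
	for (i = 0; i < addrcnt; i++)
		memcpy(buf + i * sizeof(struct sockaddr_in),
		       &addrs[i], sizeof(struct sockaddr_in));

	/* sctp_setsockopt() recognizes SCTP_SOCKOPT_CONNECTX and hands
	 * the buffer to __sctp_connect() above. */
	return setsockopt(sd, IPPROTO_SCTP, SCTP_SOCKOPT_CONNECTX,
			  buf, sizeof(buf));
}
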
916/* API 3.1.4 close() - UDP Style Syntax 1158/* API 3.1.4 close() - UDP Style Syntax
917 * Applications use close() to perform graceful shutdown (as described in 1159 * Applications use close() to perform graceful shutdown (as described in
918 * Section 10.1 of [SCTP]) on ALL the associations currently represented 1160 * Section 10.1 of [SCTP]) on ALL the associations currently represented
@@ -1095,7 +1337,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
1095 sp = sctp_sk(sk); 1337 sp = sctp_sk(sk);
1096 ep = sp->ep; 1338 ep = sp->ep;
1097 1339
1098 SCTP_DEBUG_PRINTK("Using endpoint: %s.\n", ep->debug_name); 1340 SCTP_DEBUG_PRINTK("Using endpoint: %p.\n", ep);
1099 1341
1100 /* We cannot send a message over a TCP-style listening socket. */ 1342 /* We cannot send a message over a TCP-style listening socket. */
1101 if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) { 1343 if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
@@ -1306,7 +1548,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
1306 } 1548 }
1307 1549
1308 /* Prime the peer's transport structures. */ 1550 /* Prime the peer's transport structures. */
1309 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL); 1551 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
1310 if (!transport) { 1552 if (!transport) {
1311 err = -ENOMEM; 1553 err = -ENOMEM;
1312 goto out_free; 1554 goto out_free;
@@ -2208,6 +2450,12 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
2208 optlen, SCTP_BINDX_REM_ADDR); 2450 optlen, SCTP_BINDX_REM_ADDR);
2209 break; 2451 break;
2210 2452
2453 case SCTP_SOCKOPT_CONNECTX:
2454 /* 'optlen' is the size of the addresses buffer. */
2455 retval = sctp_setsockopt_connectx(sk, (struct sockaddr __user *)optval,
2456 optlen);
2457 break;
2458
2211 case SCTP_DISABLE_FRAGMENTS: 2459 case SCTP_DISABLE_FRAGMENTS:
2212 retval = sctp_setsockopt_disable_fragments(sk, optval, optlen); 2460 retval = sctp_setsockopt_disable_fragments(sk, optval, optlen);
2213 break; 2461 break;
@@ -2283,112 +2531,29 @@ out_nounlock:
2283 * 2531 *
2284 * len: the size of the address. 2532 * len: the size of the address.
2285 */ 2533 */
2286SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *uaddr, 2534SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *addr,
2287 int addr_len) 2535 int addr_len)
2288{ 2536{
2289 struct sctp_sock *sp;
2290 struct sctp_endpoint *ep;
2291 struct sctp_association *asoc;
2292 struct sctp_transport *transport;
2293 union sctp_addr to;
2294 struct sctp_af *af;
2295 sctp_scope_t scope;
2296 long timeo;
2297 int err = 0; 2537 int err = 0;
2538 struct sctp_af *af;
2298 2539
2299 sctp_lock_sock(sk); 2540 sctp_lock_sock(sk);
2300 2541
2301 SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d)\n", 2542 SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d\n",
2302 __FUNCTION__, sk, uaddr, addr_len); 2543 __FUNCTION__, sk, addr, addr_len);
2303
2304 sp = sctp_sk(sk);
2305 ep = sp->ep;
2306
2307 /* connect() cannot be done on a socket that is already in ESTABLISHED
2308 * state - UDP-style peeled off socket or a TCP-style socket that
2309 * is already connected.
2310 * It cannot be done even on a TCP-style listening socket.
2311 */
2312 if (sctp_sstate(sk, ESTABLISHED) ||
2313 (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
2314 err = -EISCONN;
2315 goto out_unlock;
2316 }
2317
2318 err = sctp_verify_addr(sk, (union sctp_addr *)uaddr, addr_len);
2319 if (err)
2320 goto out_unlock;
2321 2544
2322 if (addr_len > sizeof(to)) 2545 /* Validate addr_len before calling common connect/connectx routine. */
2323 addr_len = sizeof(to); 2546 af = sctp_get_af_specific(addr->sa_family);
2324 memcpy(&to, uaddr, addr_len); 2547 if (!af || addr_len < af->sockaddr_len) {
2325 to.v4.sin_port = ntohs(to.v4.sin_port); 2548 err = -EINVAL;
2326 2549 } else {
2327 asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); 2550 /* Pass correct addr len to common routine (so it knows there
2328 if (asoc) { 2551 * is only one address being passed).
2329 if (asoc->state >= SCTP_STATE_ESTABLISHED) 2552 */
2330 err = -EISCONN; 2553 err = __sctp_connect(sk, addr, af->sockaddr_len);
2331 else
2332 err = -EALREADY;
2333 goto out_unlock;
2334 }
2335
2336 /* If we could not find a matching association on the endpoint,
2337 * make sure that there is no peeled-off association matching the
2338 * peer address even on another socket.
2339 */
2340 if (sctp_endpoint_is_peeled_off(ep, &to)) {
2341 err = -EADDRNOTAVAIL;
2342 goto out_unlock;
2343 }
2344
2345 /* If a bind() or sctp_bindx() is not called prior to a connect()
2346 * call, the system picks an ephemeral port and will choose an address
2347 * set equivalent to binding with a wildcard address.
2348 */
2349 if (!ep->base.bind_addr.port) {
2350 if (sctp_autobind(sk)) {
2351 err = -EAGAIN;
2352 goto out_unlock;
2353 }
2354 }
2355
2356 scope = sctp_scope(&to);
2357 asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
2358 if (!asoc) {
2359 err = -ENOMEM;
2360 goto out_unlock;
2361 }
2362
2363 /* Prime the peer's transport structures. */
2364 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL);
2365 if (!transport) {
2366 sctp_association_free(asoc);
2367 goto out_unlock;
2368 }
2369 err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
2370 if (err < 0) {
2371 sctp_association_free(asoc);
2372 goto out_unlock;
2373 }
2374
2375 err = sctp_primitive_ASSOCIATE(asoc, NULL);
2376 if (err < 0) {
2377 sctp_association_free(asoc);
2378 goto out_unlock;
2379 } 2554 }
2380 2555
2381 /* Initialize sk's dport and daddr for getpeername() */
2382 inet_sk(sk)->dport = htons(asoc->peer.port);
2383 af = sctp_get_af_specific(to.sa.sa_family);
2384 af->to_sk_daddr(&to, sk);
2385
2386 timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
2387 err = sctp_wait_for_connect(asoc, &timeo);
2388
2389out_unlock:
2390 sctp_release_sock(sk); 2556 sctp_release_sock(sk);
2391
2392 return err; 2557 return err;
2393} 2558}
2394 2559
@@ -2677,12 +2842,15 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
2677 /* Map ipv4 address into v4-mapped-on-v6 address. */ 2842 /* Map ipv4 address into v4-mapped-on-v6 address. */
2678 sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), 2843 sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
2679 (union sctp_addr *)&status.sstat_primary.spinfo_address); 2844 (union sctp_addr *)&status.sstat_primary.spinfo_address);
2680 status.sstat_primary.spinfo_state = transport->active; 2845 status.sstat_primary.spinfo_state = transport->state;
2681 status.sstat_primary.spinfo_cwnd = transport->cwnd; 2846 status.sstat_primary.spinfo_cwnd = transport->cwnd;
2682 status.sstat_primary.spinfo_srtt = transport->srtt; 2847 status.sstat_primary.spinfo_srtt = transport->srtt;
2683 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); 2848 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
2684 status.sstat_primary.spinfo_mtu = transport->pmtu; 2849 status.sstat_primary.spinfo_mtu = transport->pmtu;
2685 2850
2851 if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
2852 status.sstat_primary.spinfo_state = SCTP_ACTIVE;
2853
2686 if (put_user(len, optlen)) { 2854 if (put_user(len, optlen)) {
2687 retval = -EFAULT; 2855 retval = -EFAULT;
2688 goto out; 2856 goto out;
@@ -2733,12 +2901,15 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
2733 return -EINVAL; 2901 return -EINVAL;
2734 2902
2735 pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); 2903 pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
2736 pinfo.spinfo_state = transport->active; 2904 pinfo.spinfo_state = transport->state;
2737 pinfo.spinfo_cwnd = transport->cwnd; 2905 pinfo.spinfo_cwnd = transport->cwnd;
2738 pinfo.spinfo_srtt = transport->srtt; 2906 pinfo.spinfo_srtt = transport->srtt;
2739 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); 2907 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
2740 pinfo.spinfo_mtu = transport->pmtu; 2908 pinfo.spinfo_mtu = transport->pmtu;
2741 2909
2910 if (pinfo.spinfo_state == SCTP_UNKNOWN)
2911 pinfo.spinfo_state = SCTP_ACTIVE;
2912
2742 if (put_user(len, optlen)) { 2913 if (put_user(len, optlen)) {
2743 retval = -EFAULT; 2914 retval = -EFAULT;
2744 goto out; 2915 goto out;
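
Both hunks map the new SCTP_UNKNOWN transport state back to SCTP_ACTIVE so the user-visible API is unchanged. For context, a hedged userspace sketch of reading that state, assuming the sctp_paddrinfo layout and the SCTP_GET_PEER_ADDR_INFO option id from contemporary headers:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>	/* assumed: sctp_paddrinfo, option id */

static void show_peer_state(int sd, const struct sockaddr *peer,
			    socklen_t peer_len)
{
	struct sctp_paddrinfo info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	memcpy(&info.spinfo_address, peer, peer_len);
	if (getsockopt(sd, IPPROTO_SCTP, SCTP_GET_PEER_ADDR_INFO,
		       &info, &len) == 0)
		printf("state=%d cwnd=%u srtt=%u rto=%u mtu=%u\n",
		       info.spinfo_state, info.spinfo_cwnd,
		       info.spinfo_srtt, info.spinfo_rto,
		       info.spinfo_mtu);
}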
@@ -3591,7 +3762,8 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
3591 int retval = 0; 3762 int retval = 0;
3592 int len; 3763 int len;
3593 3764
3594 SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p, ...)\n", sk); 3765 SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p... optname: %d)\n",
3766 sk, optname);
3595 3767
3596 /* I can hardly begin to describe how wrong this is. This is 3768 /* I can hardly begin to describe how wrong this is. This is
3597 * so broken as to be worse than useless. The API draft 3769 * so broken as to be worse than useless. The API draft
@@ -4368,15 +4540,11 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
4368 * However, this function was corrent in any case. 8) 4540 * However, this function was corrent in any case. 8)
4369 */ 4541 */
4370 if (flags & MSG_PEEK) { 4542 if (flags & MSG_PEEK) {
4371 unsigned long cpu_flags; 4543 spin_lock_bh(&sk->sk_receive_queue.lock);
4372
4373 sctp_spin_lock_irqsave(&sk->sk_receive_queue.lock,
4374 cpu_flags);
4375 skb = skb_peek(&sk->sk_receive_queue); 4544 skb = skb_peek(&sk->sk_receive_queue);
4376 if (skb) 4545 if (skb)
4377 atomic_inc(&skb->users); 4546 atomic_inc(&skb->users);
4378 sctp_spin_unlock_irqrestore(&sk->sk_receive_queue.lock, 4547 spin_unlock_bh(&sk->sk_receive_queue.lock);
4379 cpu_flags);
4380 } else { 4548 } else {
4381 skb = skb_dequeue(&sk->sk_receive_queue); 4549 skb = skb_dequeue(&sk->sk_receive_queue);
4382 } 4550 }
@@ -4600,8 +4768,7 @@ out:
4600 return err; 4768 return err;
4601 4769
4602do_error: 4770do_error:
4603 if (asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1 >= 4771 if (asoc->init_err_counter + 1 >= asoc->max_init_attempts)
4604 asoc->max_init_attempts)
4605 err = -ETIMEDOUT; 4772 err = -ETIMEDOUT;
4606 else 4773 else
4607 err = -ECONNREFUSED; 4774 err = -ECONNREFUSED;
@@ -4686,6 +4853,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
4686 struct sctp_endpoint *newep = newsp->ep; 4853 struct sctp_endpoint *newep = newsp->ep;
4687 struct sk_buff *skb, *tmp; 4854 struct sk_buff *skb, *tmp;
4688 struct sctp_ulpevent *event; 4855 struct sctp_ulpevent *event;
4856 int flags = 0;
4689 4857
4690 /* Migrate socket buffer sizes and all the socket level options to the 4858 /* Migrate socket buffer sizes and all the socket level options to the
4691 * new socket. 4859 * new socket.
@@ -4707,6 +4875,17 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
4707 sctp_sk(newsk)->bind_hash = pp; 4875 sctp_sk(newsk)->bind_hash = pp;
4708 inet_sk(newsk)->num = inet_sk(oldsk)->num; 4876 inet_sk(newsk)->num = inet_sk(oldsk)->num;
4709 4877
4878 /* Copy the bind_addr list from the original endpoint to the new
4879 * endpoint so that we can handle restarts properly
4880 */
4881 if (assoc->peer.ipv4_address)
4882 flags |= SCTP_ADDR4_PEERSUPP;
4883 if (assoc->peer.ipv6_address)
4884 flags |= SCTP_ADDR6_PEERSUPP;
4885 sctp_bind_addr_copy(&newsp->ep->base.bind_addr,
4886 &oldsp->ep->base.bind_addr,
4887 SCTP_SCOPE_GLOBAL, GFP_KERNEL, flags);
4888
4710 /* Move any messages in the old socket's receive queue that are for the 4889 /* Move any messages in the old socket's receive queue that are for the
4711 * peeled off association to the new socket's receive queue. 4890 * peeled off association to the new socket's receive queue.
4712 */ 4891 */
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index e627d2b451b6..25037daf3fa0 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -57,7 +57,8 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
57/* Create a new sctp_ssnmap. 57/* Create a new sctp_ssnmap.
58 * Allocate room to store at least 'len' contiguous TSNs. 58 * Allocate room to store at least 'len' contiguous TSNs.
59 */ 59 */
60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp) 60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
61 unsigned int __nocast gfp)
61{ 62{
62 struct sctp_ssnmap *retval; 63 struct sctp_ssnmap *retval;
63 int size; 64 int size;
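
The `unsigned int __nocast gfp` signature change seen here recurs throughout the remaining SCTP hunks. For readers unfamiliar with the annotation, a short sketch of its definition and effect, based on the kernel's compiler.h of this era: it is a no-op for gcc and only meaningful to sparse, which then warns when a plain integer is implicitly converted into the annotated parameter (catching callers that pass a bare 0 where a gfp mask belongs).

/* No-op for gcc; an attribute only the sparse checker understands. */
#ifdef __CHECKER__
# define __nocast	__attribute__((nocast))
#else
# define __nocast
#endif

/* sparse flags implicit conversions into 'gfp'; an explicit mask
 * such as GFP_KERNEL passes through cleanly. */
void *example_alloc(unsigned long size, unsigned int __nocast gfp);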
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc31849312b..dc4893474f18 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -47,6 +47,8 @@
47static ctl_handler sctp_sysctl_jiffies_ms; 47static ctl_handler sctp_sysctl_jiffies_ms;
48static long rto_timer_min = 1; 48static long rto_timer_min = 1;
49static long rto_timer_max = 86400000; /* One day */ 49static long rto_timer_max = 86400000; /* One day */
50static long sack_timer_min = 1;
51static long sack_timer_max = 500;
50 52
51static ctl_table sctp_table[] = { 53static ctl_table sctp_table[] = {
52 { 54 {
@@ -187,6 +189,17 @@ static ctl_table sctp_table[] = {
187 .mode = 0644, 189 .mode = 0644,
188 .proc_handler = &proc_dointvec 190 .proc_handler = &proc_dointvec
189 }, 191 },
192 {
193 .ctl_name = NET_SCTP_SACK_TIMEOUT,
194 .procname = "sack_timeout",
195 .data = &sctp_sack_timeout,
196 .maxlen = sizeof(long),
197 .mode = 0644,
198 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
199 .strategy = &sctp_sysctl_jiffies_ms,
200 .extra1 = &sack_timer_min,
201 .extra2 = &sack_timer_max,
202 },
190 { .ctl_name = 0 } 203 { .ctl_name = 0 }
191}; 204};
192 205
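
The new table entry exposes the delayed-SACK timer as a millisecond value, clamped to [1, 500] by sack_timer_min/sack_timer_max and converted to jiffies by the handler pair. A small userspace sketch of tuning it, assuming the conventional /proc/sys mapping of the procname:

#include <stdio.h>

int main(void)
{
	/* Value is in milliseconds; the handler rejects anything
	 * outside [1, 500]. */
	FILE *f = fopen("/proc/sys/net/sctp/sack_timeout", "w");

	if (!f)
		return 1;
	fprintf(f, "200\n");
	return fclose(f) ? 1 : 0;
}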
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index f30882e1e96a..d2f04ebe5081 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
57/* Initialize a new transport from provided memory. */ 57/* Initialize a new transport from provided memory. */
58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, 58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
59 const union sctp_addr *addr, 59 const union sctp_addr *addr,
60 int gfp) 60 unsigned int __nocast gfp)
61{ 61{
62 /* Copy in the address. */ 62 /* Copy in the address. */
63 peer->ipaddr = *addr; 63 peer->ipaddr = *addr;
@@ -83,7 +83,9 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
83 peer->last_time_used = jiffies; 83 peer->last_time_used = jiffies;
84 peer->last_time_ecne_reduced = jiffies; 84 peer->last_time_ecne_reduced = jiffies;
85 85
86 peer->active = SCTP_ACTIVE; 86 peer->init_sent_count = 0;
87
88 peer->state = SCTP_ACTIVE;
87 peer->hb_allowed = 0; 89 peer->hb_allowed = 0;
88 90
89 /* Initialize the default path max_retrans. */ 91 /* Initialize the default path max_retrans. */
@@ -101,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
101 103
102 /* Set up the heartbeat timer. */ 104 /* Set up the heartbeat timer. */
103 init_timer(&peer->hb_timer); 105 init_timer(&peer->hb_timer);
104 peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
105 peer->hb_timer.function = sctp_generate_heartbeat_event; 106 peer->hb_timer.function = sctp_generate_heartbeat_event;
106 peer->hb_timer.data = (unsigned long)peer; 107 peer->hb_timer.data = (unsigned long)peer;
107 108
@@ -120,7 +121,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
120} 121}
121 122
122/* Allocate and initialize a new transport. */ 123/* Allocate and initialize a new transport. */
123struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp) 124struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
125 unsigned int __nocast gfp)
124{ 126{
125 struct sctp_transport *transport; 127 struct sctp_transport *transport;
126 128
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 17d0ff534735..0abd5101107c 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
74 74
75/* Create a new sctp_ulpevent. */ 75/* Create a new sctp_ulpevent. */
76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, 76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
77 int gfp) 77 unsigned int __nocast gfp)
78{ 78{
79 struct sctp_ulpevent *event; 79 struct sctp_ulpevent *event;
80 struct sk_buff *skb; 80 struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( 136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
137 const struct sctp_association *asoc, 137 const struct sctp_association *asoc,
138 __u16 flags, __u16 state, __u16 error, __u16 outbound, 138 __u16 flags, __u16 state, __u16 error, __u16 outbound,
139 __u16 inbound, int gfp) 139 __u16 inbound, unsigned int __nocast gfp)
140{ 140{
141 struct sctp_ulpevent *event; 141 struct sctp_ulpevent *event;
142 struct sctp_assoc_change *sac; 142 struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( 237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
238 const struct sctp_association *asoc, 238 const struct sctp_association *asoc,
239 const struct sockaddr_storage *aaddr, 239 const struct sockaddr_storage *aaddr,
240 int flags, int state, int error, int gfp) 240 int flags, int state, int error, unsigned int __nocast gfp)
241{ 241{
242 struct sctp_ulpevent *event; 242 struct sctp_ulpevent *event;
243 struct sctp_paddr_change *spc; 243 struct sctp_paddr_change *spc;
@@ -350,7 +350,7 @@ fail:
350 */ 350 */
351struct sctp_ulpevent *sctp_ulpevent_make_remote_error( 351struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
352 const struct sctp_association *asoc, struct sctp_chunk *chunk, 352 const struct sctp_association *asoc, struct sctp_chunk *chunk,
353 __u16 flags, int gfp) 353 __u16 flags, unsigned int __nocast gfp)
354{ 354{
355 struct sctp_ulpevent *event; 355 struct sctp_ulpevent *event;
356 struct sctp_remote_error *sre; 356 struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
448 */ 448 */
449struct sctp_ulpevent *sctp_ulpevent_make_send_failed( 449struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
450 const struct sctp_association *asoc, struct sctp_chunk *chunk, 450 const struct sctp_association *asoc, struct sctp_chunk *chunk,
451 __u16 flags, __u32 error, int gfp) 451 __u16 flags, __u32 error, unsigned int __nocast gfp)
452{ 452{
453 struct sctp_ulpevent *event; 453 struct sctp_ulpevent *event;
454 struct sctp_send_failed *ssf; 454 struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
557 */ 557 */
558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( 558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
559 const struct sctp_association *asoc, 559 const struct sctp_association *asoc,
560 __u16 flags, int gfp) 560 __u16 flags, unsigned int __nocast gfp)
561{ 561{
562 struct sctp_ulpevent *event; 562 struct sctp_ulpevent *event;
563 struct sctp_shutdown_event *sse; 563 struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
620 * 5.3.1.6 SCTP_ADAPTION_INDICATION 620 * 5.3.1.6 SCTP_ADAPTION_INDICATION
621 */ 621 */
622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication( 622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
623 const struct sctp_association *asoc, int gfp) 623 const struct sctp_association *asoc, unsigned int __nocast gfp)
624{ 624{
625 struct sctp_ulpevent *event; 625 struct sctp_ulpevent *event;
626 struct sctp_adaption_event *sai; 626 struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
657 */ 657 */
658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, 658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
659 struct sctp_chunk *chunk, 659 struct sctp_chunk *chunk,
660 int gfp) 660 unsigned int __nocast gfp)
661{ 661{
662 struct sctp_ulpevent *event = NULL; 662 struct sctp_ulpevent *event = NULL;
663 struct sk_buff *skb; 663 struct sk_buff *skb;
@@ -718,7 +718,8 @@ fail:
718 * various events. 718 * various events.
719 */ 719 */
720struct sctp_ulpevent *sctp_ulpevent_make_pdapi( 720struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
721 const struct sctp_association *asoc, __u32 indication, int gfp) 721 const struct sctp_association *asoc, __u32 indication,
722 unsigned int __nocast gfp)
722{ 723{
723 struct sctp_ulpevent *event; 724 struct sctp_ulpevent *event;
724 struct sctp_pdapi_event *pd; 725 struct sctp_pdapi_event *pd;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d5dd2cf7ac4a..8bbc279d6c99 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
100 100
101/* Process an incoming DATA chunk. */ 101/* Process an incoming DATA chunk. */
102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
103 int gfp) 103 unsigned int __nocast gfp)
104{ 104{
105 struct sk_buff_head temp; 105 struct sk_buff_head temp;
106 sctp_data_chunk_t *hdr; 106 sctp_data_chunk_t *hdr;
@@ -778,7 +778,8 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
778 778
779/* Partial deliver the first message as there is pressure on rwnd. */ 779/* Partial deliver the first message as there is pressure on rwnd. */
780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, 780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
781 struct sctp_chunk *chunk, int gfp) 781 struct sctp_chunk *chunk,
782 unsigned int __nocast gfp)
782{ 783{
783 struct sctp_ulpevent *event; 784 struct sctp_ulpevent *event;
784 struct sctp_association *asoc; 785 struct sctp_association *asoc;
@@ -802,7 +803,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
802 803
803/* Renege some packets to make room for an incoming chunk. */ 804/* Renege some packets to make room for an incoming chunk. */
804void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 805void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
805 int gfp) 806 unsigned int __nocast gfp)
806{ 807{
807 struct sctp_association *asoc; 808 struct sctp_association *asoc;
808 __u16 needed, freed; 809 __u16 needed, freed;
@@ -841,7 +842,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
841/* Notify the application if an association is aborted and in 842/* Notify the application if an association is aborted and in
842 * partial delivery mode. Send up any pending received messages. 843 * partial delivery mode. Send up any pending received messages.
843 */ 844 */
844void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp) 845void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
845{ 846{
846 struct sctp_ulpevent *ev = NULL; 847 struct sctp_ulpevent *ev = NULL;
847 struct sock *sk; 848 struct sock *sk;
diff --git a/net/socket.c b/net/socket.c
index cec0cb38b9ce..6f2a17881972 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -81,6 +81,7 @@
81#include <linux/syscalls.h> 81#include <linux/syscalls.h>
82#include <linux/compat.h> 82#include <linux/compat.h>
83#include <linux/kmod.h> 83#include <linux/kmod.h>
84#include <linux/audit.h>
84 85
85#ifdef CONFIG_NET_RADIO 86#ifdef CONFIG_NET_RADIO
86#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ 87#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
@@ -226,7 +227,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
226 return 0; 227 return 0;
227 if(copy_from_user(kaddr,uaddr,ulen)) 228 if(copy_from_user(kaddr,uaddr,ulen))
228 return -EFAULT; 229 return -EFAULT;
229 return 0; 230 return audit_sockaddr(ulen, kaddr);
230} 231}
231 232
232/** 233/**
@@ -382,9 +383,8 @@ int sock_map_fd(struct socket *sock)
382 goto out; 383 goto out;
383 } 384 }
384 385
385 sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino); 386 this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
386 this.name = name; 387 this.name = name;
387 this.len = strlen(name);
388 this.hash = SOCK_INODE(sock)->i_ino; 388 this.hash = SOCK_INODE(sock)->i_ino;
389 389
390 file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this); 390 file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
@@ -1906,7 +1906,11 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
1906 /* copy_from_user should be SMP safe. */ 1906 /* copy_from_user should be SMP safe. */
1907 if (copy_from_user(a, args, nargs[call])) 1907 if (copy_from_user(a, args, nargs[call]))
1908 return -EFAULT; 1908 return -EFAULT;
1909 1909
1910 err = audit_socketcall(nargs[call]/sizeof(unsigned long), a);
1911 if (err)
1912 return err;
1913
1910 a0=a[0]; 1914 a0=a[0];
1911 a1=a[1]; 1915 a1=a[1];
1912 1916
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 9bcec9b927b9..505e2d4b3d62 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -66,10 +66,10 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
66 u32 flavor = pseudoflavor_to_flavor(pseudoflavor); 66 u32 flavor = pseudoflavor_to_flavor(pseudoflavor);
67 67
68 if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor])) 68 if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor]))
69 return NULL; 69 return ERR_PTR(-EINVAL);
70 auth = ops->create(clnt, pseudoflavor); 70 auth = ops->create(clnt, pseudoflavor);
71 if (!auth) 71 if (IS_ERR(auth))
72 return NULL; 72 return auth;
73 if (clnt->cl_auth) 73 if (clnt->cl_auth)
74 rpcauth_destroy(clnt->cl_auth); 74 rpcauth_destroy(clnt->cl_auth);
75 clnt->cl_auth = auth; 75 clnt->cl_auth = auth;
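
rpcauth_create() now reports failure through the pointer itself instead of returning NULL, letting callers distinguish an unknown flavor (-EINVAL) from an allocation failure. A minimal sketch of the caller-side idiom, with a simplified prototype standing in for the real header:

#include <linux/err.h>

struct rpc_clnt;
struct rpc_auth;
/* Simplified for illustration; the real prototype takes an
 * rpc_authflavor_t. */
struct rpc_auth *rpcauth_create(unsigned int flavor, struct rpc_clnt *clnt);

static int example_setup_auth(struct rpc_clnt *clnt, unsigned int flavor)
{
	struct rpc_auth *auth = rpcauth_create(flavor, clnt);

	if (IS_ERR(auth))
		return PTR_ERR(auth);	/* -EINVAL, -ENOMEM, ... */
	/* ... auth is valid from here on ... */
	return 0;
}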
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a33b627cbef4..2f7b867161d2 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -660,14 +660,16 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
660{ 660{
661 struct gss_auth *gss_auth; 661 struct gss_auth *gss_auth;
662 struct rpc_auth * auth; 662 struct rpc_auth * auth;
663 int err = -ENOMEM; /* XXX? */
663 664
664 dprintk("RPC: creating GSS authenticator for client %p\n",clnt); 665 dprintk("RPC: creating GSS authenticator for client %p\n",clnt);
665 666
666 if (!try_module_get(THIS_MODULE)) 667 if (!try_module_get(THIS_MODULE))
667 return NULL; 668 return ERR_PTR(err);
668 if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) 669 if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
669 goto out_dec; 670 goto out_dec;
670 gss_auth->client = clnt; 671 gss_auth->client = clnt;
672 err = -EINVAL;
671 gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); 673 gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
672 if (!gss_auth->mech) { 674 if (!gss_auth->mech) {
673 printk(KERN_WARNING "%s: Pseudoflavor %d not found!", 675 printk(KERN_WARNING "%s: Pseudoflavor %d not found!",
@@ -675,9 +677,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
675 goto err_free; 677 goto err_free;
676 } 678 }
677 gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); 679 gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
678 /* FIXME: Will go away once privacy support is merged in */ 680 if (gss_auth->service == 0)
679 if (gss_auth->service == RPC_GSS_SVC_PRIVACY) 681 goto err_put_mech;
680 gss_auth->service = RPC_GSS_SVC_INTEGRITY;
681 INIT_LIST_HEAD(&gss_auth->upcalls); 682 INIT_LIST_HEAD(&gss_auth->upcalls);
682 spin_lock_init(&gss_auth->lock); 683 spin_lock_init(&gss_auth->lock);
683 auth = &gss_auth->rpc_auth; 684 auth = &gss_auth->rpc_auth;
@@ -687,15 +688,18 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
687 auth->au_flavor = flavor; 688 auth->au_flavor = flavor;
688 atomic_set(&auth->au_count, 1); 689 atomic_set(&auth->au_count, 1);
689 690
690 if (rpcauth_init_credcache(auth, GSS_CRED_EXPIRE) < 0) 691 err = rpcauth_init_credcache(auth, GSS_CRED_EXPIRE);
692 if (err)
691 goto err_put_mech; 693 goto err_put_mech;
692 694
693 snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", 695 snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s",
694 clnt->cl_pathname, 696 clnt->cl_pathname,
695 gss_auth->mech->gm_name); 697 gss_auth->mech->gm_name);
696 gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); 698 gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
697 if (IS_ERR(gss_auth->dentry)) 699 if (IS_ERR(gss_auth->dentry)) {
700 err = PTR_ERR(gss_auth->dentry);
698 goto err_put_mech; 701 goto err_put_mech;
702 }
699 703
700 return auth; 704 return auth;
701err_put_mech: 705err_put_mech:
@@ -704,7 +708,7 @@ err_free:
704 kfree(gss_auth); 708 kfree(gss_auth);
705out_dec: 709out_dec:
706 module_put(THIS_MODULE); 710 module_put(THIS_MODULE);
707 return NULL; 711 return ERR_PTR(err);
708} 712}
709 713
710static void 714static void
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 02bc029d46fe..f17e6153b688 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,12 +97,13 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
97 * made to sleep too long. 97 * made to sleep too long.
98 */ 98 */
99struct rpc_clnt * 99struct rpc_clnt *
100rpc_create_client(struct rpc_xprt *xprt, char *servname, 100rpc_new_client(struct rpc_xprt *xprt, char *servname,
101 struct rpc_program *program, u32 vers, 101 struct rpc_program *program, u32 vers,
102 rpc_authflavor_t flavor) 102 rpc_authflavor_t flavor)
103{ 103{
104 struct rpc_version *version; 104 struct rpc_version *version;
105 struct rpc_clnt *clnt = NULL; 105 struct rpc_clnt *clnt = NULL;
106 struct rpc_auth *auth;
106 int err; 107 int err;
107 int len; 108 int len;
108 109
@@ -157,10 +158,11 @@ rpc_create_client(struct rpc_xprt *xprt, char *servname,
157 if (err < 0) 158 if (err < 0)
158 goto out_no_path; 159 goto out_no_path;
159 160
160 err = -ENOMEM; 161 auth = rpcauth_create(flavor, clnt);
161 if (!rpcauth_create(flavor, clnt)) { 162 if (IS_ERR(auth)) {
162 printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", 163 printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
163 flavor); 164 flavor);
165 err = PTR_ERR(auth);
164 goto out_no_auth; 166 goto out_no_auth;
165 } 167 }
166 168
@@ -178,6 +180,37 @@ out_no_path:
178 kfree(clnt->cl_server); 180 kfree(clnt->cl_server);
179 kfree(clnt); 181 kfree(clnt);
180out_err: 182out_err:
183 xprt_destroy(xprt);
184 return ERR_PTR(err);
185}
186
187/**
188 * rpc_create_client - create an RPC client
189 * @xprt - pointer to xprt struct
190 * @servname - name of server
191 * @info - rpc_program
192 * @version - rpc_program version
193 * @authflavor - rpc_auth flavour to use
194 *
195 * Creates an RPC client structure, then pings the server in order to
196 * determine if it is up, and if it supports this program and version.
197 *
198 * This function should never be called by asynchronous tasks such as
199 * the portmapper.
200 */
201struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
202 struct rpc_program *info, u32 version, rpc_authflavor_t authflavor)
203{
204 struct rpc_clnt *clnt;
205 int err;
206
207 clnt = rpc_new_client(xprt, servname, info, version, authflavor);
208 if (IS_ERR(clnt))
209 return clnt;
210 err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
211 if (err == 0)
212 return clnt;
213 rpc_shutdown_client(clnt);
181 return ERR_PTR(err); 214 return ERR_PTR(err);
182} 215}
183 216
@@ -208,6 +241,8 @@ rpc_clone_client(struct rpc_clnt *clnt)
208 rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); 241 rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
209 if (new->cl_auth) 242 if (new->cl_auth)
210 atomic_inc(&new->cl_auth->au_count); 243 atomic_inc(&new->cl_auth->au_count);
244 new->cl_pmap = &new->cl_pmap_default;
245 rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
211 return new; 246 return new;
212out_no_clnt: 247out_no_clnt:
213 printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__); 248 printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
@@ -296,6 +331,44 @@ rpc_release_client(struct rpc_clnt *clnt)
296 rpc_destroy_client(clnt); 331 rpc_destroy_client(clnt);
297} 332}
298 333
334/**
335 * rpc_bind_new_program - bind a new RPC program to an existing client
336 * @old - old rpc_client
337 * @program - rpc program to set
338 * @vers - rpc program version
339 *
340 * Clones the rpc client and sets up a new RPC program. This is mainly
341 * of use for enabling different RPC programs to share the same transport.
342 * The Sun NFSv2/v3 ACL protocol can do this.
343 */
344struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
345 struct rpc_program *program,
346 int vers)
347{
348 struct rpc_clnt *clnt;
349 struct rpc_version *version;
350 int err;
351
352 BUG_ON(vers >= program->nrvers || !program->version[vers]);
353 version = program->version[vers];
354 clnt = rpc_clone_client(old);
355 if (IS_ERR(clnt))
356 goto out;
357 clnt->cl_procinfo = version->procs;
358 clnt->cl_maxproc = version->nrprocs;
359 clnt->cl_protname = program->name;
360 clnt->cl_prog = program->number;
361 clnt->cl_vers = version->number;
362 clnt->cl_stats = program->stats;
363 err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
364 if (err != 0) {
365 rpc_shutdown_client(clnt);
366 clnt = ERR_PTR(err);
367 }
368out:
369 return clnt;
370}
371
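A hypothetical caller in the spirit of the NFSv3 ACL case the comment names, reusing an existing client's transport for a second program. nfsacl_program and the version number are illustrative, not part of this patch:

#include <linux/err.h>
#include <linux/sunrpc/clnt.h>

extern struct rpc_program nfsacl_program;	/* hypothetical */

static struct rpc_clnt *example_make_acl_client(struct rpc_clnt *nfs_clnt)
{
	struct rpc_clnt *acl_clnt;

	/* Shares nfs_clnt's transport; comes back as an ERR_PTR if the
	 * server fails the new program's NULL-procedure ping. */
	acl_clnt = rpc_bind_new_program(nfs_clnt, &nfsacl_program, 3);
	if (IS_ERR(acl_clnt))
		return NULL;
	return acl_clnt;
}
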
299/* 372/*
300 * Default callback for async RPC calls 373 * Default callback for async RPC calls
301 */ 374 */
@@ -305,38 +378,41 @@ rpc_default_callback(struct rpc_task *task)
305} 378}
306 379
307/* 380/*
308 * Export the signal mask handling for aysnchronous code that 381 * Export the signal mask handling for synchronous code that
309 * sleeps on RPC calls 382 * sleeps on RPC calls
310 */ 383 */
384#define RPC_INTR_SIGNALS (sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGKILL))
311 385
386static void rpc_save_sigmask(sigset_t *oldset, int intr)
387{
388 unsigned long sigallow = 0;
389 sigset_t sigmask;
390
391 /* Block all signals except those listed in sigallow */
392 if (intr)
393 sigallow |= RPC_INTR_SIGNALS;
394 siginitsetinv(&sigmask, sigallow);
395 sigprocmask(SIG_BLOCK, &sigmask, oldset);
396}
397
398static inline void rpc_task_sigmask(struct rpc_task *task, sigset_t *oldset)
399{
400 rpc_save_sigmask(oldset, !RPC_TASK_UNINTERRUPTIBLE(task));
401}
402
403static inline void rpc_restore_sigmask(sigset_t *oldset)
404{
405 sigprocmask(SIG_SETMASK, oldset, NULL);
406}
407
312void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset) 408void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
313{ 409{
314 unsigned long sigallow = sigmask(SIGKILL); 410 rpc_save_sigmask(oldset, clnt->cl_intr);
315 unsigned long irqflags;
316
317 /* Turn off various signals */
318 if (clnt->cl_intr) {
319 struct k_sigaction *action = current->sighand->action;
320 if (action[SIGINT-1].sa.sa_handler == SIG_DFL)
321 sigallow |= sigmask(SIGINT);
322 if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL)
323 sigallow |= sigmask(SIGQUIT);
324 }
325 spin_lock_irqsave(&current->sighand->siglock, irqflags);
326 *oldset = current->blocked;
327 siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]);
328 recalc_sigpending();
329 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
330} 411}
331 412
332void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset) 413void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
333{ 414{
334 unsigned long irqflags; 415 rpc_restore_sigmask(oldset);
335
336 spin_lock_irqsave(&current->sighand->siglock, irqflags);
337 current->blocked = *oldset;
338 recalc_sigpending();
339 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
340} 416}
341 417
342/* 418/*
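
rpc_save_sigmask()/rpc_restore_sigmask() reduce the old per-handler inspection to a simple block-all-but pattern. The userspace equivalent, for readers tracing the logic (siginitsetinv has no libc twin, so sigfillset plus sigdelset stands in):

#include <signal.h>

static void example_blocking_work(void)
{
	sigset_t mask, old;

	/* Block everything except the allow set, mirroring
	 * RPC_INTR_SIGNALS above; SIGKILL is unblockable anyway. */
	sigfillset(&mask);
	sigdelset(&mask, SIGINT);
	sigdelset(&mask, SIGQUIT);
	sigdelset(&mask, SIGKILL);
	sigprocmask(SIG_BLOCK, &mask, &old);

	/* ... sleep; only the allowed signals can interrupt ... */

	sigprocmask(SIG_SETMASK, &old, NULL);
}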
@@ -354,26 +430,26 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
354 430
355 BUG_ON(flags & RPC_TASK_ASYNC); 431 BUG_ON(flags & RPC_TASK_ASYNC);
356 432
357 rpc_clnt_sigmask(clnt, &oldset);
358
359 status = -ENOMEM; 433 status = -ENOMEM;
360 task = rpc_new_task(clnt, NULL, flags); 434 task = rpc_new_task(clnt, NULL, flags);
361 if (task == NULL) 435 if (task == NULL)
362 goto out; 436 goto out;
363 437
438 /* Mask signals on RPC calls _and_ GSS_AUTH upcalls */
439 rpc_task_sigmask(task, &oldset);
440
364 rpc_call_setup(task, msg, 0); 441 rpc_call_setup(task, msg, 0);
365 442
366 /* Set up the call info struct and execute the task */ 443 /* Set up the call info struct and execute the task */
367 if (task->tk_status == 0) 444 if (task->tk_status == 0) {
368 status = rpc_execute(task); 445 status = rpc_execute(task);
369 else { 446 } else {
370 status = task->tk_status; 447 status = task->tk_status;
371 rpc_release_task(task); 448 rpc_release_task(task);
372 } 449 }
373 450
451 rpc_restore_sigmask(&oldset);
374out: 452out:
375 rpc_clnt_sigunmask(clnt, &oldset);
376
377 return status; 453 return status;
378} 454}
379 455
@@ -394,8 +470,6 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
394 470
395 flags |= RPC_TASK_ASYNC; 471 flags |= RPC_TASK_ASYNC;
396 472
397 rpc_clnt_sigmask(clnt, &oldset);
398
399 /* Create/initialize a new RPC task */ 473 /* Create/initialize a new RPC task */
400 if (!callback) 474 if (!callback)
401 callback = rpc_default_callback; 475 callback = rpc_default_callback;
@@ -404,6 +478,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
404 goto out; 478 goto out;
405 task->tk_calldata = data; 479 task->tk_calldata = data;
406 480
481 /* Mask signals on GSS_AUTH upcalls */
482 rpc_task_sigmask(task, &oldset);
483
407 rpc_call_setup(task, msg, 0); 484 rpc_call_setup(task, msg, 0);
408 485
409 /* Set up the call info struct and execute the task */ 486 /* Set up the call info struct and execute the task */
@@ -413,9 +490,8 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
413 else 490 else
414 rpc_release_task(task); 491 rpc_release_task(task);
415 492
493 rpc_restore_sigmask(&oldset);
416out: 494out:
417 rpc_clnt_sigunmask(clnt, &oldset);
418
419 return status; 495 return status;
420} 496}
421 497
@@ -593,7 +669,7 @@ call_allocate(struct rpc_task *task)
593 return; 669 return;
594 printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); 670 printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task);
595 671
596 if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) { 672 if (RPC_IS_ASYNC(task) || !signalled()) {
597 xprt_release(task); 673 xprt_release(task);
598 task->tk_action = call_reserve; 674 task->tk_action = call_reserve;
599 rpc_delay(task, HZ>>4); 675 rpc_delay(task, HZ>>4);
@@ -957,7 +1033,9 @@ call_header(struct rpc_task *task)
957 *p++ = htonl(clnt->cl_prog); /* program number */ 1033 *p++ = htonl(clnt->cl_prog); /* program number */
958 *p++ = htonl(clnt->cl_vers); /* program version */ 1034 *p++ = htonl(clnt->cl_vers); /* program version */
959 *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */ 1035 *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */
960 return rpcauth_marshcred(task, p); 1036 p = rpcauth_marshcred(task, p);
1037 req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
1038 return p;
961} 1039}
962 1040
963/* 1041/*
@@ -986,10 +1064,11 @@ call_verify(struct rpc_task *task)
986 case RPC_AUTH_ERROR: 1064 case RPC_AUTH_ERROR:
987 break; 1065 break;
988 case RPC_MISMATCH: 1066 case RPC_MISMATCH:
989 printk(KERN_WARNING "%s: RPC call version mismatch!\n", __FUNCTION__); 1067 dprintk("%s: RPC call version mismatch!\n", __FUNCTION__);
990 goto out_eio; 1068 error = -EPROTONOSUPPORT;
1069 goto out_err;
991 default: 1070 default:
992 printk(KERN_WARNING "%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n); 1071 dprintk("%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
993 goto out_eio; 1072 goto out_eio;
994 } 1073 }
995 if (--len < 0) 1074 if (--len < 0)
@@ -1040,23 +1119,26 @@ call_verify(struct rpc_task *task)
1040 case RPC_SUCCESS: 1119 case RPC_SUCCESS:
1041 return p; 1120 return p;
1042 case RPC_PROG_UNAVAIL: 1121 case RPC_PROG_UNAVAIL:
1043 printk(KERN_WARNING "RPC: call_verify: program %u is unsupported by server %s\n", 1122 dprintk("RPC: call_verify: program %u is unsupported by server %s\n",
1044 (unsigned int)task->tk_client->cl_prog, 1123 (unsigned int)task->tk_client->cl_prog,
1045 task->tk_client->cl_server); 1124 task->tk_client->cl_server);
1046 goto out_eio; 1125 error = -EPFNOSUPPORT;
1126 goto out_err;
1047 case RPC_PROG_MISMATCH: 1127 case RPC_PROG_MISMATCH:
1048 printk(KERN_WARNING "RPC: call_verify: program %u, version %u unsupported by server %s\n", 1128 dprintk("RPC: call_verify: program %u, version %u unsupported by server %s\n",
1049 (unsigned int)task->tk_client->cl_prog, 1129 (unsigned int)task->tk_client->cl_prog,
1050 (unsigned int)task->tk_client->cl_vers, 1130 (unsigned int)task->tk_client->cl_vers,
1051 task->tk_client->cl_server); 1131 task->tk_client->cl_server);
1052 goto out_eio; 1132 error = -EPROTONOSUPPORT;
1133 goto out_err;
1053 case RPC_PROC_UNAVAIL: 1134 case RPC_PROC_UNAVAIL:
1054 printk(KERN_WARNING "RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n", 1135 dprintk("RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
1055 task->tk_msg.rpc_proc, 1136 task->tk_msg.rpc_proc,
1056 task->tk_client->cl_prog, 1137 task->tk_client->cl_prog,
1057 task->tk_client->cl_vers, 1138 task->tk_client->cl_vers,
1058 task->tk_client->cl_server); 1139 task->tk_client->cl_server);
1059 goto out_eio; 1140 error = -EOPNOTSUPP;
1141 goto out_err;
1060 case RPC_GARBAGE_ARGS: 1142 case RPC_GARBAGE_ARGS:
1061 dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__); 1143 dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__);
1062 break; /* retry */ 1144 break; /* retry */
@@ -1069,7 +1151,7 @@ out_retry:
1069 task->tk_client->cl_stats->rpcgarbage++; 1151 task->tk_client->cl_stats->rpcgarbage++;
1070 if (task->tk_garb_retry) { 1152 if (task->tk_garb_retry) {
1071 task->tk_garb_retry--; 1153 task->tk_garb_retry--;
1072 dprintk(KERN_WARNING "RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid); 1154 dprintk("RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
1073 task->tk_action = call_bind; 1155 task->tk_action = call_bind;
1074 return NULL; 1156 return NULL;
1075 } 1157 }
@@ -1083,3 +1165,30 @@ out_overflow:
1083 printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__); 1165 printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__);
1084 goto out_retry; 1166 goto out_retry;
1085} 1167}
1168
1169static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj)
1170{
1171 return 0;
1172}
1173
1174static int rpcproc_decode_null(void *rqstp, u32 *data, void *obj)
1175{
1176 return 0;
1177}
1178
1179static struct rpc_procinfo rpcproc_null = {
1180 .p_encode = rpcproc_encode_null,
1181 .p_decode = rpcproc_decode_null,
1182};
1183
1184int rpc_ping(struct rpc_clnt *clnt, int flags)
1185{
1186 struct rpc_message msg = {
1187 .rpc_proc = &rpcproc_null,
1188 };
1189 int err;
1190 msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
1191 err = rpc_call_sync(clnt, &msg, flags);
1192 put_rpccred(msg.rpc_cred);
1193 return err;
1194}
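
rpc_ping() rounds out the client split above: it issues the RPC NULL procedure (procedure 0, encoded and decoded as nothing) under a null credential, so rpc_create_client() can confirm the server answers before returning. A trivial usage sketch, assuming rpc_ping() is visible to the caller:

#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>

/* Probe once: soft (bounded retries) and not interruptible by signals;
 * the same flags rpc_create_client() passes above. */
static int example_probe(struct rpc_clnt *clnt)
{
	return rpc_ping(clnt, RPC_TASK_SOFT | RPC_TASK_NOINTR);
}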
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index d0b1d2c34a4d..4e81f2766923 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -53,6 +53,9 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
53 task->tk_pid, clnt->cl_server, 53 task->tk_pid, clnt->cl_server,
54 map->pm_prog, map->pm_vers, map->pm_prot); 54 map->pm_prog, map->pm_vers, map->pm_prot);
55 55
56 /* Autobind on cloned rpc clients is discouraged */
57 BUG_ON(clnt->cl_parent != clnt);
58
56 spin_lock(&pmap_lock); 59 spin_lock(&pmap_lock);
57 if (map->pm_binding) { 60 if (map->pm_binding) {
58 rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL); 61 rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL);
@@ -207,12 +210,10 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
207 xprt->addr.sin_port = htons(RPC_PMAP_PORT); 210 xprt->addr.sin_port = htons(RPC_PMAP_PORT);
208 211
209 /* printk("pmap: create clnt\n"); */ 212 /* printk("pmap: create clnt\n"); */
210 clnt = rpc_create_client(xprt, hostname, 213 clnt = rpc_new_client(xprt, hostname,
211 &pmap_program, RPC_PMAP_VERSION, 214 &pmap_program, RPC_PMAP_VERSION,
212 RPC_AUTH_UNIX); 215 RPC_AUTH_UNIX);
213 if (IS_ERR(clnt)) { 216 if (!IS_ERR(clnt)) {
214 xprt_destroy(xprt);
215 } else {
216 clnt->cl_softrtry = 1; 217 clnt->cl_softrtry = 1;
217 clnt->cl_chatty = 1; 218 clnt->cl_chatty = 1;
218 clnt->cl_oneshot = 1; 219 clnt->cl_oneshot = 1;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index c06614d0e31d..2d9eb7fbd521 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -290,7 +290,7 @@ static void rpc_make_runnable(struct rpc_task *task)
290 return; 290 return;
291 } 291 }
292 } else 292 } else
293 wake_up(&task->u.tk_wait.waitq); 293 wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
294} 294}
295 295
296/* 296/*
@@ -555,6 +555,38 @@ __rpc_atrun(struct rpc_task *task)
555} 555}
556 556
557/* 557/*
558 * Helper that calls task->tk_exit if it exists and then returns
559 * true if we should exit __rpc_execute.
560 */
561static inline int __rpc_do_exit(struct rpc_task *task)
562{
563 if (task->tk_exit != NULL) {
564 lock_kernel();
565 task->tk_exit(task);
566 unlock_kernel();
567 /* If tk_action is non-null, we should restart the call */
568 if (task->tk_action != NULL) {
569 if (!RPC_ASSASSINATED(task)) {
570 /* Release RPC slot and buffer memory */
571 xprt_release(task);
572 rpc_free(task);
573 return 0;
574 }
575 printk(KERN_ERR "RPC: dead task tried to walk away.\n");
576 }
577 }
578 return 1;
579}
580
581static int rpc_wait_bit_interruptible(void *word)
582{
583 if (signal_pending(current))
584 return -ERESTARTSYS;
585 schedule();
586 return 0;
587}
588
589/*
558 * This is the RPC `scheduler' (or rather, the finite state machine). 590 * This is the RPC `scheduler' (or rather, the finite state machine).
559 */ 591 */
560static int __rpc_execute(struct rpc_task *task) 592static int __rpc_execute(struct rpc_task *task)
@@ -566,8 +598,7 @@ static int __rpc_execute(struct rpc_task *task)
566 598
567 BUG_ON(RPC_IS_QUEUED(task)); 599 BUG_ON(RPC_IS_QUEUED(task));
568 600
569 restarted: 601 for (;;) {
570 while (1) {
571 /* 602 /*
572 * Garbage collection of pending timers... 603 * Garbage collection of pending timers...
573 */ 604 */
@@ -600,11 +631,12 @@ static int __rpc_execute(struct rpc_task *task)
600 * by someone else. 631 * by someone else.
601 */ 632 */
602 if (!RPC_IS_QUEUED(task)) { 633 if (!RPC_IS_QUEUED(task)) {
603 if (!task->tk_action) 634 if (task->tk_action != NULL) {
635 lock_kernel();
636 task->tk_action(task);
637 unlock_kernel();
638 } else if (__rpc_do_exit(task))
604 break; 639 break;
605 lock_kernel();
606 task->tk_action(task);
607 unlock_kernel();
608 } 640 }
609 641
610 /* 642 /*
@@ -624,44 +656,26 @@ static int __rpc_execute(struct rpc_task *task)
624 656
625 /* sync task: sleep here */ 657 /* sync task: sleep here */
626 dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); 658 dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
627 if (RPC_TASK_UNINTERRUPTIBLE(task)) { 659 /* Note: Caller should be using rpc_clnt_sigmask() */
628 __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task)); 660 status = out_of_line_wait_on_bit(&task->tk_runstate,
629 } else { 661 RPC_TASK_QUEUED, rpc_wait_bit_interruptible,
630 __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status); 662 TASK_INTERRUPTIBLE);
663 if (status == -ERESTARTSYS) {
631 /* 664 /*
632 * When a sync task receives a signal, it exits with 665 * When a sync task receives a signal, it exits with
633 * -ERESTARTSYS. In order to catch any callbacks that 666 * -ERESTARTSYS. In order to catch any callbacks that
634 * clean up after sleeping on some queue, we don't 667 * clean up after sleeping on some queue, we don't
635 * break the loop here, but go around once more. 668 * break the loop here, but go around once more.
636 */ 669 */
637 if (status == -ERESTARTSYS) { 670 dprintk("RPC: %4d got signal\n", task->tk_pid);
638 dprintk("RPC: %4d got signal\n", task->tk_pid); 671 task->tk_flags |= RPC_TASK_KILLED;
639 task->tk_flags |= RPC_TASK_KILLED; 672 rpc_exit(task, -ERESTARTSYS);
640 rpc_exit(task, -ERESTARTSYS); 673 rpc_wake_up_task(task);
641 rpc_wake_up_task(task);
642 }
643 } 674 }
644 rpc_set_running(task); 675 rpc_set_running(task);
645 dprintk("RPC: %4d sync task resuming\n", task->tk_pid); 676 dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
646 } 677 }
647 678
648 if (task->tk_exit) {
649 lock_kernel();
650 task->tk_exit(task);
651 unlock_kernel();
652 /* If tk_action is non-null, the user wants us to restart */
653 if (task->tk_action) {
654 if (!RPC_ASSASSINATED(task)) {
655 /* Release RPC slot and buffer memory */
656 if (task->tk_rqstp)
657 xprt_release(task);
658 rpc_free(task);
659 goto restarted;
660 }
661 printk(KERN_ERR "RPC: dead task tries to walk away.\n");
662 }
663 }
664
665 dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status); 679 dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status);
666 status = task->tk_status; 680 status = task->tk_status;
667 681
@@ -759,8 +773,6 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
759 773
760 /* Initialize workqueue for async tasks */ 774 /* Initialize workqueue for async tasks */
761 task->tk_workqueue = rpciod_workqueue; 775 task->tk_workqueue = rpciod_workqueue;
762 if (!RPC_IS_ASYNC(task))
763 init_waitqueue_head(&task->u.tk_wait.waitq);
764 776
765 if (clnt) { 777 if (clnt) {
766 atomic_inc(&clnt->cl_users); 778 atomic_inc(&clnt->cl_users);
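
The net effect of the sched.c rework: a synchronous task no longer owns a waitqueue head; it sleeps on the RPC_TASK_QUEUED bit of tk_runstate, and the waker clears the bit and calls wake_up_bit() on the same word. The generic shape of that pattern, assuming the 2.6-era bit-wait API:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

static int example_wait_action(void *word)
{
	if (signal_pending(current))
		return -ERESTARTSYS;
	schedule();
	return 0;
}

/* Sleeper: returns 0 once 'bit' is clear, -ERESTARTSYS on a signal. */
static int example_wait(unsigned long *word, int bit)
{
	return out_of_line_wait_on_bit(word, bit, example_wait_action,
				       TASK_INTERRUPTIBLE);
}

/* Waker: clear the bit, then wake anyone sleeping on it. */
static void example_wake(unsigned long *word, int bit)
{
	clear_bit(bit, word);
	smp_mb__after_clear_bit();	/* order the clear before the wake */
	wake_up_bit(word, bit);
}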
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d4f26bf9e732..62a073495276 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,7 +41,9 @@ EXPORT_SYMBOL(rpc_release_task);
41 41
42/* RPC client functions */ 42/* RPC client functions */
43EXPORT_SYMBOL(rpc_create_client); 43EXPORT_SYMBOL(rpc_create_client);
44EXPORT_SYMBOL(rpc_new_client);
44EXPORT_SYMBOL(rpc_clone_client); 45EXPORT_SYMBOL(rpc_clone_client);
46EXPORT_SYMBOL(rpc_bind_new_program);
45EXPORT_SYMBOL(rpc_destroy_client); 47EXPORT_SYMBOL(rpc_destroy_client);
46EXPORT_SYMBOL(rpc_shutdown_client); 48EXPORT_SYMBOL(rpc_shutdown_client);
47EXPORT_SYMBOL(rpc_release_client); 49EXPORT_SYMBOL(rpc_release_client);
@@ -61,7 +63,6 @@ EXPORT_SYMBOL(rpc_mkpipe);
61 63
62/* Client transport */ 64/* Client transport */
63EXPORT_SYMBOL(xprt_create_proto); 65EXPORT_SYMBOL(xprt_create_proto);
64EXPORT_SYMBOL(xprt_destroy);
65EXPORT_SYMBOL(xprt_set_timeout); 66EXPORT_SYMBOL(xprt_set_timeout);
66EXPORT_SYMBOL(xprt_udp_slot_table_entries); 67EXPORT_SYMBOL(xprt_udp_slot_table_entries);
67EXPORT_SYMBOL(xprt_tcp_slot_table_entries); 68EXPORT_SYMBOL(xprt_tcp_slot_table_entries);
@@ -129,6 +130,10 @@ EXPORT_SYMBOL(xdr_encode_netobj);
129EXPORT_SYMBOL(xdr_encode_pages); 130EXPORT_SYMBOL(xdr_encode_pages);
130EXPORT_SYMBOL(xdr_inline_pages); 131EXPORT_SYMBOL(xdr_inline_pages);
131EXPORT_SYMBOL(xdr_shift_buf); 132EXPORT_SYMBOL(xdr_shift_buf);
133EXPORT_SYMBOL(xdr_encode_word);
134EXPORT_SYMBOL(xdr_decode_word);
135EXPORT_SYMBOL(xdr_encode_array2);
136EXPORT_SYMBOL(xdr_decode_array2);
132EXPORT_SYMBOL(xdr_buf_from_iov); 137EXPORT_SYMBOL(xdr_buf_from_iov);
133EXPORT_SYMBOL(xdr_buf_subsegment); 138EXPORT_SYMBOL(xdr_buf_subsegment);
134EXPORT_SYMBOL(xdr_buf_read_netobj); 139EXPORT_SYMBOL(xdr_buf_read_netobj);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bb2d99f33315..e9bd91265f70 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -35,20 +35,24 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
35 if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) 35 if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL)))
36 return NULL; 36 return NULL;
37 memset(serv, 0, sizeof(*serv)); 37 memset(serv, 0, sizeof(*serv));
38 serv->sv_name = prog->pg_name;
38 serv->sv_program = prog; 39 serv->sv_program = prog;
39 serv->sv_nrthreads = 1; 40 serv->sv_nrthreads = 1;
40 serv->sv_stats = prog->pg_stats; 41 serv->sv_stats = prog->pg_stats;
41 serv->sv_bufsz = bufsize? bufsize : 4096; 42 serv->sv_bufsz = bufsize? bufsize : 4096;
42 prog->pg_lovers = prog->pg_nvers-1;
43 xdrsize = 0; 43 xdrsize = 0;
44 for (vers=0; vers<prog->pg_nvers ; vers++) 44 while (prog) {
45 if (prog->pg_vers[vers]) { 45 prog->pg_lovers = prog->pg_nvers-1;
46 prog->pg_hivers = vers; 46 for (vers=0; vers<prog->pg_nvers ; vers++)
47 if (prog->pg_lovers > vers) 47 if (prog->pg_vers[vers]) {
48 prog->pg_lovers = vers; 48 prog->pg_hivers = vers;
49 if (prog->pg_vers[vers]->vs_xdrsize > xdrsize) 49 if (prog->pg_lovers > vers)
50 xdrsize = prog->pg_vers[vers]->vs_xdrsize; 50 prog->pg_lovers = vers;
51 } 51 if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
52 xdrsize = prog->pg_vers[vers]->vs_xdrsize;
53 }
54 prog = prog->pg_next;
55 }
52 serv->sv_xdrsize = xdrsize; 56 serv->sv_xdrsize = xdrsize;
53 INIT_LIST_HEAD(&serv->sv_threads); 57 INIT_LIST_HEAD(&serv->sv_threads);
54 INIT_LIST_HEAD(&serv->sv_sockets); 58 INIT_LIST_HEAD(&serv->sv_sockets);
@@ -56,8 +60,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
56 INIT_LIST_HEAD(&serv->sv_permsocks); 60 INIT_LIST_HEAD(&serv->sv_permsocks);
57 spin_lock_init(&serv->sv_lock); 61 spin_lock_init(&serv->sv_lock);
58 62
59 serv->sv_name = prog->pg_name;
60
61 /* Remove any stale portmap registrations */ 63 /* Remove any stale portmap registrations */
62 svc_register(serv, 0, 0); 64 svc_register(serv, 0, 0);
63 65
@@ -281,6 +283,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
281 rqstp->rq_res.len = 0; 283 rqstp->rq_res.len = 0;
282 rqstp->rq_res.page_base = 0; 284 rqstp->rq_res.page_base = 0;
283 rqstp->rq_res.page_len = 0; 285 rqstp->rq_res.page_len = 0;
286 rqstp->rq_res.buflen = PAGE_SIZE;
284 rqstp->rq_res.tail[0].iov_len = 0; 287 rqstp->rq_res.tail[0].iov_len = 0;
285 /* tcp needs a space for the record length... */ 288 /* tcp needs a space for the record length... */
286 if (rqstp->rq_prot == IPPROTO_TCP) 289 if (rqstp->rq_prot == IPPROTO_TCP)
@@ -338,7 +341,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
338 goto sendit; 341 goto sendit;
339 } 342 }
340 343
341 if (prog != progp->pg_prog) 344 for (progp = serv->sv_program; progp; progp = progp->pg_next)
345 if (prog == progp->pg_prog)
346 break;
347 if (progp == NULL)
342 goto err_bad_prog; 348 goto err_bad_prog;
343 349
344 if (vers >= progp->pg_nvers || 350 if (vers >= progp->pg_nvers ||
@@ -451,11 +457,7 @@ err_bad_auth:
451 goto sendit; 457 goto sendit;
452 458
453err_bad_prog: 459err_bad_prog:
454#ifdef RPC_PARANOIA 460 dprintk("svc: unknown program %d\n", prog);
455 if (prog != 100227 || progp->pg_prog != 100003)
456 printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog);
457 /* else it is just a Solaris client seeing if ACLs are supported */
458#endif
459 serv->sv_stats->rpcbadfmt++; 461 serv->sv_stats->rpcbadfmt++;
460 svc_putu32(resv, rpc_prog_unavail); 462 svc_putu32(resv, rpc_prog_unavail);
461 goto sendit; 463 goto sendit;
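
With the pg_next link, one svc_serv can now host a chain of RPC programs, and svc_process() walks the chain for the requested program number. A hypothetical two-program chain; field names follow the hunks, the program definitions themselves are illustrative:

#include <linux/sunrpc/svc.h>

extern struct svc_version *nfsacl_versions[];	/* hypothetical */

static struct svc_program nfsacl_program = {
	.pg_prog  = 100227,		/* NFS_ACL, per the removed comment */
	.pg_nvers = 4,
	.pg_vers  = nfsacl_versions,
	.pg_name  = "nfsacl",
};

static struct svc_program nfsd_program = {
	.pg_prog  = 100003,		/* NFS */
	.pg_next  = &nfsacl_program,	/* searched by svc_process() */
	/* ... remaining fields as before ... */
};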
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2b99b4028d31..d6baf6fdf8a9 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -8,6 +8,7 @@
 #include <linux/err.h>
 #include <linux/seq_file.h>
 #include <linux/hash.h>
+#include <linux/string.h>
 
 #define RPCDBG_FACILITY	RPCDBG_AUTH
 
@@ -20,14 +21,6 @@
  */
 
 
-static char *strdup(char *s)
-{
-	char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
-	if (rv)
-		strcpy(rv, s);
-	return rv;
-}
-
 struct unix_domain {
 	struct auth_domain	h;
 	int	addr_changes;
@@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name)
 	if (new == NULL)
 		return NULL;
 	cache_init(&new->h.h);
-	new->h.name = strdup(name);
+	new->h.name = kstrdup(name, GFP_KERNEL);
 	new->h.flavour = RPC_AUTH_UNIX;
 	new->addr_changes = 0;
 	new->h.h.expiry_time = NEVER;
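
The private strdup() is gone in favour of the shared kstrdup() helper, which takes an explicit allocation-flag argument. A rough sketch of what kstrdup() does, written against this era's gfp flags (the exact prototype in the shared header is an assumption here):

	#include <linux/slab.h>
	#include <linux/string.h>

	char *kstrdup_sketch(const char *s, unsigned int gfp)
	{
		size_t len;
		char *buf;

		if (!s)
			return NULL;
		len = strlen(s) + 1;		/* include the trailing NUL */
		buf = kmalloc(len, gfp);
		if (buf)
			memcpy(buf, s, len);
		return buf;
	}

Unlike the removed local copy, which would have dereferenced a NULL input, the shared helper tolerates NULL, so callers need no extra guard.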
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc96..d0c3120d0233 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -586,7 +586,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
 	}
 	if (skb->stamp.tv_sec == 0) {
 		skb->stamp.tv_sec = xtime.tv_sec;
-		skb->stamp.tv_usec = xtime.tv_nsec * 1000;
+		skb->stamp.tv_usec = xtime.tv_nsec / NSEC_PER_USEC;
 		/* Don't enable netstamp, sunrpc doesn't
 		   need that much accuracy */
 	}
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 	arg->page_len = (pages-2)*PAGE_SIZE;
 	arg->len = (pages-1)*PAGE_SIZE;
 	arg->tail[0].iov_len = 0;
 
-	try_to_freeze(PF_FREEZE);
+	try_to_freeze();
 	if (signalled())
 		return -EINTR;
 
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 
 	schedule_timeout(timeout);
 
-	try_to_freeze(PF_FREEZE);
+	try_to_freeze();
 
 	spin_lock_bh(&serv->sv_lock);
 	remove_wait_queue(&rqstp->rq_wait, &wait);
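
The timestamp fix above corrects a unit error: xtime.tv_nsec holds nanoseconds and tv_usec expects microseconds, so the conversion must divide by 1000 (NSEC_PER_USEC), where the old code multiplied. A throwaway user-space check of the arithmetic:

	#include <assert.h>

	#define NSEC_PER_USEC	1000LL	/* same value the kernel defines */

	int main(void)
	{
		long long nsec = 123456789LL;			/* 123.456789 ms */

		assert(nsec / NSEC_PER_USEC == 123456LL);	/* the fixed code: divide */
		assert(nsec * 1000LL == 123456789000LL);	/* the old code: value inflated a million-fold */
		return 0;
	}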
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 67b9f035ba86..fde16f40a581 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,21 +176,23 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 	xdr->buflen += len;
 }
 
-void
+ssize_t
 xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		skb_reader_t *desc,
 		skb_read_actor_t copy_actor)
 {
 	struct page **ppage = xdr->pages;
 	unsigned int len, pglen = xdr->page_len;
+	ssize_t copied = 0;
 	int ret;
 
 	len = xdr->head[0].iov_len;
 	if (base < len) {
 		len -= base;
 		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return;
+			goto out;
 		base = 0;
 	} else
 		base -= len;
@@ -210,6 +212,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 	do {
 		char *kaddr;
 
+		/* ACL likes to be lazy in allocating pages - ACLs
+		 * are small by default but can get huge. */
+		if (unlikely(*ppage == NULL)) {
+			*ppage = alloc_page(GFP_ATOMIC);
+			if (unlikely(*ppage == NULL)) {
+				if (copied == 0)
+					copied = -ENOMEM;
+				goto out;
+			}
+		}
+
 		len = PAGE_CACHE_SIZE;
 		kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
 		if (base) {
@@ -225,14 +238,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		}
 		flush_dcache_page(*ppage);
 		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return;
+			goto out;
 		ppage++;
 	} while ((pglen -= len) != 0);
 copy_tail:
 	len = xdr->tail[0].iov_len;
 	if (base < len)
-		copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+		copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+	return copied;
 }
 
 
@@ -616,12 +632,24 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
 void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p)
 {
 	struct kvec *iov = buf->head;
+	int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
 
+	BUG_ON(scratch_len < 0);
 	xdr->buf = buf;
 	xdr->iov = iov;
-	xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
-	buf->len = iov->iov_len = (char *)p - (char *)iov->iov_base;
-	xdr->p = p;
+	xdr->p = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
+	xdr->end = (uint32_t *)((char *)iov->iov_base + scratch_len);
+	BUG_ON(iov->iov_len > scratch_len);
+
+	if (p != xdr->p && p != NULL) {
+		size_t len;
+
+		BUG_ON(p < xdr->p || p > xdr->end);
+		len = (char *)p - (char *)xdr->p;
+		xdr->p = p;
+		buf->len += len;
+		iov->iov_len += len;
+	}
 }
 EXPORT_SYMBOL(xdr_init_encode);
 
@@ -859,8 +887,34 @@ out:
 	return status;
 }
 
-static int
-read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+/* obj is assumed to point to allocated memory of size at least len: */
+int
+write_bytes_to_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
+{
+	struct xdr_buf subbuf;
+	int this_len;
+	int status;
+
+	status = xdr_buf_subsegment(buf, &subbuf, base, len);
+	if (status)
+		goto out;
+	this_len = min(len, (int)subbuf.head[0].iov_len);
+	memcpy(subbuf.head[0].iov_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.page_len);
+	if (this_len)
+		_copy_to_pages(subbuf.pages, subbuf.page_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.tail[0].iov_len);
+	memcpy(subbuf.tail[0].iov_base, obj, this_len);
+out:
+	return status;
+}
+
+int
+xdr_decode_word(struct xdr_buf *buf, int base, u32 *obj)
 {
 	u32	raw;
 	int	status;
@@ -872,6 +926,14 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
 	return 0;
 }
 
+int
+xdr_encode_word(struct xdr_buf *buf, int base, u32 obj)
+{
+	u32	raw = htonl(obj);
+
+	return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
+}
+
 /* If the netobj starting offset bytes from the start of xdr_buf is contained
  * entirely in the head or the tail, set object to point to it; otherwise
  * try to find space for it at the end of the tail, copy it there, and
@@ -882,7 +944,7 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
 	u32	tail_offset = buf->head[0].iov_len + buf->page_len;
 	u32	obj_end_offset;
 
-	if (read_u32_from_xdr_buf(buf, offset, &obj->len))
+	if (xdr_decode_word(buf, offset, &obj->len))
 		goto out;
 	obj_end_offset = offset + 4 + obj->len;
 
@@ -915,3 +977,220 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
 out:
 	return -1;
 }
+
+/* Returns 0 on success, or else a negative error code. */
+static int
+xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
+		 struct xdr_array2_desc *desc, int encode)
+{
+	char *elem = NULL, *c;
+	unsigned int copied = 0, todo, avail_here;
+	struct page **ppages = NULL;
+	int err;
+
+	if (encode) {
+		if (xdr_encode_word(buf, base, desc->array_len) != 0)
+			return -EINVAL;
+	} else {
+		if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+		    desc->array_len > desc->array_maxlen ||
+		    (unsigned long) base + 4 + desc->array_len *
+				    desc->elem_size > buf->len)
+			return -EINVAL;
+	}
+	base += 4;
+
+	if (!desc->xcode)
+		return 0;
+
+	todo = desc->array_len * desc->elem_size;
+
+	/* process head */
+	if (todo && base < buf->head->iov_len) {
+		c = buf->head->iov_base + base;
+		avail_here = min_t(unsigned int, todo,
+				   buf->head->iov_len - base);
+		todo -= avail_here;
+
+		while (avail_here >= desc->elem_size) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			avail_here -= desc->elem_size;
+		}
+		if (avail_here) {
+			if (!elem) {
+				elem = kmalloc(desc->elem_size, GFP_KERNEL);
+				err = -ENOMEM;
+				if (!elem)
+					goto out;
+			}
+			if (encode) {
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+				memcpy(c, elem, avail_here);
+			} else
+				memcpy(elem, c, avail_here);
+			copied = avail_here;
+		}
+		base = buf->head->iov_len;  /* align to start of pages */
+	}
+
+	/* process pages array */
+	base -= buf->head->iov_len;
+	if (todo && base < buf->page_len) {
+		unsigned int avail_page;
+
+		avail_here = min(todo, buf->page_len - base);
+		todo -= avail_here;
+
+		base += buf->page_base;
+		ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
+		base &= ~PAGE_CACHE_MASK;
+		avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+					avail_here);
+		c = kmap(*ppages) + base;
+
+		while (avail_here) {
+			avail_here -= avail_page;
+			if (copied || avail_page < desc->elem_size) {
+				unsigned int l = min(avail_page,
+					desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+				avail_page -= l;
+				c += l;
+			}
+			while (avail_page >= desc->elem_size) {
+				err = desc->xcode(desc, c);
+				if (err)
+					goto out;
+				c += desc->elem_size;
+				avail_page -= desc->elem_size;
+			}
+			if (avail_page) {
+				unsigned int l = min(avail_page,
+					desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+			}
+			if (avail_here) {
+				kunmap(*ppages);
+				ppages++;
+				c = kmap(*ppages);
+			}
+
+			avail_page = min(avail_here,
+				 (unsigned int) PAGE_CACHE_SIZE);
+		}
+		base = buf->page_len;  /* align to start of tail */
+	}
+
+	/* process tail */
+	base -= buf->page_len;
+	if (todo) {
+		c = buf->tail->iov_base + base;
+		if (copied) {
+			unsigned int l = desc->elem_size - copied;
+
+			if (encode)
+				memcpy(c, elem + copied, l);
+			else {
+				memcpy(elem + copied, c, l);
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+			}
+			todo -= l;
+			c += l;
+		}
+		while (todo) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			todo -= desc->elem_size;
+		}
+	}
+	err = 0;
+
+out:
+	if (elem)
+		kfree(elem);
+	if (ppages)
+		kunmap(*ppages);
+	return err;
+}
+
+int
+xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if (base >= buf->len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 0);
+}
+
+int
+xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
+	    buf->head->iov_len + buf->page_len + buf->tail->iov_len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 1);
+}
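
The new xdr_encode_word()/xdr_decode_word() pair gives callers a symmetric way to poke a 32-bit value into an arbitrary offset of an xdr_buf and read it back, with byte order handled internally. A minimal round-trip sketch (the function name and the assumption that the buffer's head covers offset 0 are mine):

	static int xdr_word_roundtrip(struct xdr_buf *buf)
	{
		u32 val;

		if (xdr_encode_word(buf, 0, 42))	/* stores htonl(42) at offset 0 */
			return -EINVAL;
		if (xdr_decode_word(buf, 0, &val))	/* reads and ntohl()s it back */
			return -EINVAL;
		return val == 42 ? 0 : -EIO;
	}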
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c74a6bb94074..3c654e06b084 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) {
 		if (task == xprt->snd_task)
 			return 1;
-		if (task == NULL)
-			return 0;
 		goto out_sleep;
 	}
 	if (xprt->nocong || __xprt_get_cong(xprt, task)) {
@@ -569,8 +567,11 @@ void xprt_connect(struct rpc_task *task)
 		if (xprt->sock != NULL)
 			schedule_delayed_work(&xprt->sock_connect,
 					RPC_REESTABLISH_TIMEOUT);
-		else
+		else {
 			schedule_work(&xprt->sock_connect);
+			if (!RPC_IS_ASYNC(task))
+				flush_scheduled_work();
+		}
 	}
 	return;
  out_write:
@@ -725,7 +726,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		goto no_checksum;
 
 	desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
+		return -1;
 	if (desc.offset != skb->len) {
 		unsigned int csum2;
 		csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
@@ -737,7 +739,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		return -1;
 	return 0;
 no_checksum:
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
+		return -1;
 	if (desc.count)
 		return -1;
 	return 0;
@@ -821,10 +824,15 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
 {
 	if (len > desc->count)
 		len = desc->count;
-	if (skb_copy_bits(desc->skb, desc->offset, p, len))
+	if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
+		dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
+				len, desc->count);
 		return 0;
+	}
 	desc->offset += len;
 	desc->count -= len;
+	dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
+			len, desc->count);
 	return len;
 }
 
830 838
@@ -863,6 +871,8 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
 static void
 tcp_check_recm(struct rpc_xprt *xprt)
 {
+	dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
 	if (xprt->tcp_offset == xprt->tcp_reclen) {
 		xprt->tcp_flags |= XPRT_COPY_RECM;
 		xprt->tcp_offset = 0;
@@ -907,6 +917,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 	struct rpc_rqst *req;
 	struct xdr_buf *rcvbuf;
 	size_t len;
+	ssize_t r;
 
 	/* Find and lock the request corresponding to this xid */
 	spin_lock(&xprt->sock_lock);
@@ -927,15 +938,40 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		len = xprt->tcp_reclen - xprt->tcp_offset;
 		memcpy(&my_desc, desc, sizeof(my_desc));
 		my_desc.count = len;
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  &my_desc, tcp_copy_data);
-		desc->count -= len;
-		desc->offset += len;
+		desc->count -= r;
+		desc->offset += r;
 	} else
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  desc, tcp_copy_data);
-	xprt->tcp_copied += len;
-	xprt->tcp_offset += len;
+
+	if (r > 0) {
+		xprt->tcp_copied += r;
+		xprt->tcp_offset += r;
+	}
+	if (r != len) {
+		/* Error when copying to the receive buffer,
+		 * usually because we weren't able to allocate
+		 * additional buffer pages. All we can do now
+		 * is turn off XPRT_COPY_DATA, so the request
+		 * will not receive any additional updates,
+		 * and time out.
+		 * Any remaining data from this record will
+		 * be discarded.
+		 */
+		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		dprintk("RPC: XID %08x truncated request\n",
+				ntohl(xprt->tcp_xid));
+		dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+				xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
+		goto out;
+	}
+
+	dprintk("RPC: XID %08x read %Zd bytes\n",
+			ntohl(xprt->tcp_xid), r);
+	dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
 
 	if (xprt->tcp_copied == req->rq_private_buf.buflen)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
@@ -944,6 +980,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	}
 
+out:
 	if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
 		dprintk("RPC: %4d received reply complete\n",
 				req->rq_task->tk_pid);
@@ -967,6 +1004,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
 	desc->count -= len;
 	desc->offset += len;
 	xprt->tcp_offset += len;
+	dprintk("RPC: discarded %Zu bytes\n", len);
 	tcp_check_recm(xprt);
 }
 
972 1010
@@ -1064,8 +1102,7 @@ tcp_state_change(struct sock *sk)
 	case TCP_SYN_RECV:
 		break;
 	default:
-		if (xprt_test_and_clear_connected(xprt))
-			rpc_wake_up_status(&xprt->pending, -ENOTCONN);
+		xprt_disconnect(xprt);
 		break;
 	}
  out:
@@ -1203,6 +1240,8 @@ xprt_transmit(struct rpc_task *task)
 			list_add_tail(&req->rq_list, &xprt->recv);
 			spin_unlock_bh(&xprt->sock_lock);
 			xprt_reset_majortimeo(req);
+			/* Turn off autodisconnect */
+			del_singleshot_timer_sync(&xprt->timer);
 		}
 	} else if (!req->rq_bytes_sent)
 		return;
@@ -1333,8 +1372,6 @@ xprt_reserve(struct rpc_task *task)
 		spin_lock(&xprt->xprt_lock);
 		do_xprt_reserve(task);
 		spin_unlock(&xprt->xprt_lock);
-		if (task->tk_rqstp)
-			del_timer_sync(&xprt->timer);
 	}
 }
 
@@ -1649,6 +1686,10 @@ xprt_shutdown(struct rpc_xprt *xprt)
 	rpc_wake_up(&xprt->backlog);
 	wake_up(&xprt->cong_wait);
 	del_timer_sync(&xprt->timer);
+
+	/* synchronously wait for connect worker to finish */
+	cancel_delayed_work(&xprt->sock_connect);
+	flush_scheduled_work();
 }
 
 /*
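
Several call sites in this file change because xdr_partial_copy_from_skb() now returns a ssize_t: the number of bytes actually copied, or a negative errno (for instance -ENOMEM from the lazy page allocation added in xdr.c). A sketch of the new caller contract, with hypothetical local names:

	static ssize_t example_fill(struct xdr_buf *rcvbuf, skb_reader_t *desc,
				    size_t want)
	{
		ssize_t r;

		r = xdr_partial_copy_from_skb(rcvbuf, 0, desc, tcp_copy_data);
		if (r < 0)
			return r;		/* e.g. -ENOMEM: give up on this record */
		if ((size_t)r != want)
			return -EAGAIN;		/* short copy: record gets discarded */
		return r;			/* advance offsets by r, not by want */
	}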
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 000000000000..5a69733bcdad
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,21 @@
+#
+# Unix Domain Sockets
+#
+
+config UNIX
+	tristate "Unix domain sockets"
+	---help---
+	  If you say Y here, you will include support for Unix domain sockets;
+	  sockets are the standard Unix mechanism for establishing and
+	  accessing network connections. Many commonly used programs such as
+	  the X Window system and syslog use these sockets even if your
+	  machine is not connected to any network. Unless you are working on
+	  an embedded system or something similar, you therefore definitely
+	  want to say Y here.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called unix. Note that several important services won't work
+	  correctly if you say M here and then neglect to load the module.
+
+	  Say Y unless you know what you are doing.
+
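
As the help text says, local services talk over these sockets without any network configuration. A minimal user-space client sketch (the socket path is hypothetical):

	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <sys/un.h>
	#include <unistd.h>

	int main(void)
	{
		struct sockaddr_un addr;
		int fd = socket(AF_UNIX, SOCK_STREAM, 0);

		if (fd < 0) {
			perror("socket");
			return 1;
		}
		memset(&addr, 0, sizeof(addr));
		addr.sun_family = AF_UNIX;
		strncpy(addr.sun_path, "/tmp/demo.sock", sizeof(addr.sun_path) - 1);
		if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
			perror("connect");	/* fails unless a server is listening */
		close(fd);
		return 0;
	}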
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c420eba4876b..d403e34088ad 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk)
  * may receive messages only from that peer. */
 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 {
-	if (skb_queue_len(&sk->sk_receive_queue)) {
+	if (!skb_queue_empty(&sk->sk_receive_queue)) {
 		skb_queue_purge(&sk->sk_receive_queue);
 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo)
 	for (;;) {
 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 
-		if (skb_queue_len(&sk->sk_receive_queue) ||
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
 		    sk->sk_err ||
 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
 		    signal_pending(current) ||
diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig
new file mode 100644
index 000000000000..1debe1cb054e
--- /dev/null
+++ b/net/wanrouter/Kconfig
@@ -0,0 +1,29 @@
+#
+# Configuration for WAN router
+#
+
+config WAN_ROUTER
+	tristate "WAN router"
+	depends on EXPERIMENTAL
+	---help---
+	  Wide Area Networks (WANs), such as X.25, frame relay and leased
+	  lines, are used to interconnect Local Area Networks (LANs) over vast
+	  distances with data transfer rates significantly higher than those
+	  achievable with commonly used asynchronous modem connections.
+	  Usually, a quite expensive external device called a `WAN router' is
+	  needed to connect to a WAN.
+
+	  As an alternative, WAN routing can be built into the Linux kernel.
+	  With relatively inexpensive WAN interface cards available on the
+	  market, a perfectly usable router can be built for less than half
+	  the price of an external router. If you have one of those cards and
+	  wish to use your Linux box as a WAN router, say Y here and also to
+	  the WAN driver for your card, below. You will then need the
+	  wan-tools package which is available from <ftp://ftp.sangoma.com/>.
+	  Read <file:Documentation/networking/wan-router.txt> for more
+	  information.
+
+	  To compile WAN routing support as a module, choose M here: the
+	  module will be called wanrouter.
+
+	  If unsure, say N.
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index d6844ac226f5..13b650ad22e2 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
  */
 
 
-unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	int cnt = skb->data[0] ? 0 : 1;	/* there may be a pad present */
-	unsigned short ethertype;
+	__be16 ethertype;
 
 	switch (skb->data[cnt]) {
 	case NLPID_IP:		/* IP datagramm */
@@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
 				skb->data[cnt+3], dev->name);
 			return 0;
 		}
-		ethertype = *((unsigned short*)&skb->data[cnt+4]);
+		ethertype = *((__be16*)&skb->data[cnt+4]);
 		cnt += 6;
 		break;
 
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 000000000000..e6759c9660bb
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,36 @@
+#
+# CCITT X.25 Packet Layer
+#
+
+config X25
+	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  X.25 is a set of standardized network protocols, similar in scope to
+	  frame relay; the one physical line from your box to the X.25 network
+	  entry point can carry several logical point-to-point connections
+	  (called "virtual circuits") to other computers connected to the X.25
+	  network. Governments, banks, and other organizations tend to use it
+	  to connect to each other or to form Wide Area Networks (WANs). Many
+	  countries have public X.25 networks. X.25 consists of two
+	  protocols: the higher level Packet Layer Protocol (PLP) (say Y here
+	  if you want that) and the lower level data link layer protocol LAPB
+	  (say Y to "LAPB Data Link Driver" below if you want that).
+
+	  You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
+	  <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
+	  Information about X.25 for Linux is contained in the files
+	  <file:Documentation/networking/x25.txt> and
+	  <file:Documentation/networking/x25-iface.txt>.
+
+	  One connects to an X.25 network either with a dedicated network card
+	  using the X.21 protocol (not yet supported by Linux) or one can do
+	  X.25 over a standard telephone line using an ordinary modem (say Y
+	  to "X.25 async driver" below) or over Ethernet using an ordinary
+	  Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
+	  Driver" and "LAPB over Ethernet driver" below).
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called x25. If unsure, say N.
+
+
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 2a24b243b841..04bec047fa9a 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -29,6 +29,10 @@
  *	2000-11-14	Henner Eisen	Closing datalink from NETDEV_GOING_DOWN
  *	2002-10-06	Arnaldo C. Melo	Get rid of cli/sti, move proc stuff to
  *					x25_proc.c, using seq_file
+ *	2005-04-02	Shaun Pereira	Selective sub address matching
+ *					with call user data
+ *	2005-04-15	Shaun Pereira	Fast select with no restriction on
+ *					response
  */
 
 #include <linux/config.h>
@@ -219,7 +223,8 @@ static void x25_insert_socket(struct sock *sk)
  *	Note: if a listening socket has cud set it must only get calls
  *	with matching cud.
  */
-static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata)
+static struct sock *x25_find_listener(struct x25_address *addr,
+					struct sk_buff *skb)
 {
 	struct sock *s;
 	struct sock *next_best;
@@ -230,22 +235,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu
 
 	sk_for_each(s, node, &x25_list)
 		if ((!strcmp(addr->x25_addr,
 			x25_sk(s)->source_addr.x25_addr) ||
 				!strcmp(addr->x25_addr,
 					null_x25_address.x25_addr)) &&
 					s->sk_state == TCP_LISTEN) {
-
 			/*
 			 * Found a listening socket, now check the incoming
 			 * call user data vs this sockets call user data
 			 */
-			if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) {
-				sock_hold(s);
-				goto found;
-			}
-			if (x25_sk(s)->calluserdata.cudlength == 0) {
+			if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) {
+				if((memcmp(x25_sk(s)->calluserdata.cuddata,
+					skb->data,
+					x25_sk(s)->cudmatchlength)) == 0) {
+					sock_hold(s);
+					goto found;
+				}
+			} else
 				next_best = s;
-			}
 		}
 	if (next_best) {
 		s = next_best;
@@ -497,6 +503,9 @@ static int x25_create(struct socket *sock, int protocol)
 	x25->t23   = sysctl_x25_clear_request_timeout;
 	x25->t2    = sysctl_x25_ack_holdback_timeout;
 	x25->state = X25_STATE_0;
+	x25->cudmatchlength = 0;
+	x25->accptapprv = X25_DENY_ACCPT_APPRV;		/* normally no cud */
+							/* on call accept */
 
 	x25->facilities.winsize_in  = X25_DEFAULT_WINDOW_SIZE;
 	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -545,6 +554,8 @@ static struct sock *x25_make_new(struct sock *osk)
 	x25->t2        = ox25->t2;
 	x25->facilities = ox25->facilities;
 	x25->qbitincl  = ox25->qbitincl;
+	x25->cudmatchlength = ox25->cudmatchlength;
+	x25->accptapprv = ox25->accptapprv;
 
 	x25_init_timers(sk);
 out:
@@ -822,7 +833,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	struct x25_sock *makex25;
 	struct x25_address source_addr, dest_addr;
 	struct x25_facilities facilities;
-	struct x25_calluserdata calluserdata;
 	int len, rc;
 
 	/*
@@ -845,19 +855,10 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	skb_pull(skb,len);
 
 	/*
-	 *	Incoming Call User Data.
-	 */
-	if (skb->len >= 0) {
-		memcpy(calluserdata.cuddata, skb->data, skb->len);
-		calluserdata.cudlength = skb->len;
-	}
-
-	skb_push(skb,len);
-
-	/*
 	 *	Find a listener for the particular address/cud pair.
 	 */
-	sk = x25_find_listener(&source_addr,&calluserdata);
+	sk = x25_find_listener(&source_addr,skb);
+	skb_push(skb,len);
 
 	/*
 	 *	We can't accept the Call Request.
@@ -900,11 +901,23 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	makex25->neighbour     = nb;
 	makex25->facilities    = facilities;
 	makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
-	makex25->calluserdata  = calluserdata;
-
-	x25_write_internal(make, X25_CALL_ACCEPTED);
+	/* ensure no reverse facil on accept */
+	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
+	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
+
+	/* Normally all calls are accepted immediatly */
+	if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) {
+		x25_write_internal(make, X25_CALL_ACCEPTED);
+		makex25->state = X25_STATE_3;
+	}
 
-	makex25->state = X25_STATE_3;
+	/*
+	 *	Incoming Call User Data.
+	 */
+	if (skb->len >= 0) {
+		memcpy(makex25->calluserdata.cuddata, skb->data, skb->len);
+		makex25->calluserdata.cudlength = skb->len;
+	}
 
 	sk->sk_ack_backlog++;
 
@@ -1288,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		if (facilities.throughput < 0x03 ||
 		    facilities.throughput > 0xDD)
 			break;
-		if (facilities.reverse && facilities.reverse != 1)
+		if (facilities.reverse &&
+			(facilities.reverse | 0x81)!= 0x81)
 			break;
 		x25->facilities = facilities;
 		rc = 0;
@@ -1325,6 +1339,44 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		break;
 	}
 
+	case SIOCX25SCUDMATCHLEN: {
+		struct x25_subaddr sub_addr;
+		rc = -EINVAL;
+		if(sk->sk_state != TCP_CLOSE)
+			break;
+		rc = -EFAULT;
+		if (copy_from_user(&sub_addr, argp,
+				sizeof(sub_addr)))
+			break;
+		rc = -EINVAL;
+		if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
+			break;
+		x25->cudmatchlength = sub_addr.cudmatchlength;
+		rc = 0;
+		break;
+	}
+
+	case SIOCX25CALLACCPTAPPRV: {
+		rc = -EINVAL;
+		if (sk->sk_state != TCP_CLOSE)
+			break;
+		x25->accptapprv = X25_ALLOW_ACCPT_APPRV;
+		rc = 0;
+		break;
+	}
+
+	case SIOCX25SENDCALLACCPT: {
+		rc = -EINVAL;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			break;
+		if (x25->accptapprv)	/* must call accptapprv above */
+			break;
+		x25_write_internal(sk, X25_CALL_ACCEPTED);
+		x25->state = X25_STATE_3;
+		rc = 0;
+		break;
+	}
+
 	default:
 		rc = dev_ioctl(cmd, argp);
 		break;
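
Taken together, the three new ioctls let user space defer the Call Accepted packet until it has inspected the incoming call user data. A hedged user-space sketch of the flow (error handling elided; the CUD match length of 4 is arbitrary):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <linux/x25.h>

	int x25_accept_manually(int listener)
	{
		struct x25_subaddr sub;
		int conn;

		memset(&sub, 0, sizeof(sub));
		sub.cudmatchlength = 4;
		/* both must be issued while the socket is still closed */
		ioctl(listener, SIOCX25SCUDMATCHLEN, &sub);
		ioctl(listener, SIOCX25CALLACCPTAPPRV);
		/* ... bind(), listen() ... the accepted socket inherits
		 * accptapprv via x25_make_new() */
		conn = accept(listener, NULL, NULL);
		/* examine the call user data first, then accept the call */
		ioctl(conn, SIOCX25SENDCALLACCPT);
		return conn;
	}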
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21bdb95f9a8..54278b962f4c 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -17,6 +17,8 @@
  *	X.25 001	Split from x25_subr.c
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities
  *					  negotiation.
+ *	apr/14/05	Shaun Pereira - Allow fast select with no restriction
+ *					on response.
  */
 
 #include <linux/kernel.h>
@@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb,
 		case X25_FAC_CLASS_A:
 			switch (*p) {
 			case X25_FAC_REVERSE:
-				facilities->reverse = p[1] & 0x01;
-				*vc_fac_mask |= X25_MASK_REVERSE;
-				break;
+				if((p[1] & 0x81) == 0x81) {
+					facilities->reverse = p[1] & 0x81;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x01) == 0x01) {
+					facilities->reverse = p[1] & 0x01;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x80) == 0x80) {
+					facilities->reverse = p[1] & 0x80;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if(p[1] == 0x00) {
+					facilities->reverse
+						= X25_DEFAULT_REVERSE;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
 			case X25_FAC_THROUGHPUT:
 				facilities->throughput = p[1];
 				*vc_fac_mask |= X25_MASK_THROUGHPUT;
@@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer,
 
 	if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) {
 		*p++ = X25_FAC_REVERSE;
-		*p++ = !!facilities->reverse;
+		*p++ = facilities->reverse;
 	}
 
 	if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) {
@@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
 	/*
 	 *	They want reverse charging, we won't accept it.
 	 */
-	if (theirs.reverse && ours->reverse) {
+	if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) {
 		SOCK_DEBUG(sk, "X.25: rejecting reverse charging request");
 		return -1;
 	}
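
The facility parameter byte is now treated as a small bit field rather than a boolean: bit 0 requests reverse charging and bit 7 requests fast select with no restriction on response (an interpretation drawn from the masks in this patch, not from a spec citation). A sketch of the decoding, with hypothetical helper names:

	#define FAC_BIT_REVERSE_CHARGING	0x01	/* bit 0, tested as (fac & 0x01) above */
	#define FAC_BIT_FAST_SELECT		0x80	/* bit 7, tested as (fac & 0x80) above */

	static int wants_reverse_charging(unsigned char fac)
	{
		return (fac & FAC_BIT_REVERSE_CHARGING) != 0;
	}

	static int wants_fast_select(unsigned char fac)
	{
		return (fac & FAC_BIT_FAST_SELECT) != 0;
	}

This is also why x25_create_facilities() now emits facilities->reverse verbatim instead of collapsing it to !!facilities->reverse: collapsing would have destroyed the fast-select bit.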
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 183fea3bba67..7fd872ad0c20 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -19,6 +19,8 @@
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities
  *					  negotiation.
  *	jun/24/01	Arnaldo C. Melo	  use skb_queue_purge, cleanups
+ *	apr/04/15	Shaun Pereira	  Fast select with no
+ *					  restriction on response.
  */
 
 #include <linux/kernel.h>
@@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype)
 			len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN +
 			       X25_MAX_CUD_LEN;
 			break;
-		case X25_CALL_ACCEPTED:
-			len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+		case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
+			if(x25->facilities.reverse & 0x80) {
+				len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+			} else {
+				len += 1 + X25_MAX_FAC_LEN;
+			}
 			break;
 		case X25_CLEAR_REQUEST:
 		case X25_RESET_REQUEST:
@@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype)
 					x25->vc_facil_mask);
 		dptr = skb_put(skb, len);
 		memcpy(dptr, facilities, len);
-		dptr = skb_put(skb, x25->calluserdata.cudlength);
-		memcpy(dptr, x25->calluserdata.cuddata,
-		       x25->calluserdata.cudlength);
+
+		/* fast select with no restriction on response
+			allows call user data. Userland must
+			ensure it is ours and not theirs */
+		if(x25->facilities.reverse & 0x80) {
+			dptr = skb_put(skb,
+				x25->calluserdata.cudlength);
+			memcpy(dptr, x25->calluserdata.cuddata,
+				x25->calluserdata.cudlength);
+		}
 		x25->calluserdata.cudlength = 0;
 		break;
 
@@ -354,21 +367,3 @@ void x25_check_rbuf(struct sock *sk)
 	}
 }
 
-/*
- *	Compare 2 calluserdata structures, used to find correct listening sockets
- *	when call user data is used.
- */
-int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs)
-{
-	int i;
-	if (ours->cudlength != theirs->cudlength)
-		return 0;
-
-	for (i=0;i<ours->cudlength;i++) {
-		if (ours->cuddata[i] != theirs->cuddata[i]) {
-			return 0;
-		}
-	}
-	return 1;
-}
-
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 58ca6a972c48..0c1c04322baf 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -1,6 +1,10 @@
 #
 # XFRM configuration
 #
+config XFRM
+	bool
+	depends on NET
+
 config XFRM_USER
 	tristate "IPsec user configuration interface"
 	depends on INET && XFRM
@@ -10,3 +14,14 @@ config XFRM_USER
 
 	  If unsure, say Y.
 
+config NET_KEY
+	tristate "PF_KEY sockets"
+	select XFRM
+	---help---
+	  PF_KEYv2 socket family, compatible to KAME ones.
+	  They are required if you are going to use IPsec tools ported
+	  from KAME.
+
+	  Say Y unless you know what you are doing.
+
+
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d07f5ce31824..d65ed8684fc1 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -118,7 +118,6 @@ retry:
 	xfrm_policy_put_afinfo(afinfo);
 	return type;
 }
-EXPORT_SYMBOL(xfrm_get_type);
 
 int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
 		    unsigned short family)
@@ -216,8 +215,8 @@ out:
 
 expired:
 	read_unlock(&xp->lock);
-	km_policy_expired(xp, dir, 1);
-	xfrm_policy_delete(xp, dir);
+	if (!xfrm_policy_delete(xp, dir))
+		km_policy_expired(xp, dir, 1);
 	xfrm_pol_put(xp);
 }
 
@@ -555,7 +554,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 	return NULL;
 }
 
-void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
+int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 {
 	write_lock_bh(&xfrm_policy_lock);
 	pol = __xfrm_policy_unlink(pol, dir);
@@ -564,7 +563,9 @@ void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 		if (dir < XFRM_POLICY_MAX)
 			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
+		return 0;
 	}
+	return -ENOENT;
 }
 
 int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d11747c2a763..9d206c282cf1 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -50,7 +50,7 @@ static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
 static int xfrm_state_gc_flush_bundles;
 
-static void __xfrm_state_delete(struct xfrm_state *x);
+static int __xfrm_state_delete(struct xfrm_state *x);
 
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
@@ -154,6 +154,7 @@ static void xfrm_timer_handler(unsigned long data)
 			next = tmo;
 	}
 
+	x->km.dying = warn;
 	if (warn)
 		km_state_expired(x, 0);
 resched:
@@ -169,9 +170,8 @@ expired:
 		next = 2;
 		goto resched;
 	}
-	if (x->id.spi != 0)
+	if (!__xfrm_state_delete(x) && x->id.spi)
 		km_state_expired(x, 1);
-	__xfrm_state_delete(x);
 
 out:
 	spin_unlock(&x->lock);
@@ -215,8 +215,10 @@ void __xfrm_state_destroy(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(__xfrm_state_destroy);
 
-static void __xfrm_state_delete(struct xfrm_state *x)
+static int __xfrm_state_delete(struct xfrm_state *x)
 {
+	int err = -ESRCH;
+
 	if (x->km.state != XFRM_STATE_DEAD) {
 		x->km.state = XFRM_STATE_DEAD;
 		spin_lock(&xfrm_state_lock);
@@ -245,14 +247,21 @@ static void __xfrm_state_delete(struct xfrm_state *x)
 		 * is what we are dropping here.
 		 */
 		atomic_dec(&x->refcnt);
+		err = 0;
 	}
+
+	return err;
 }
 
-void xfrm_state_delete(struct xfrm_state *x)
+int xfrm_state_delete(struct xfrm_state *x)
 {
+	int err;
+
 	spin_lock_bh(&x->lock);
-	__xfrm_state_delete(x);
+	err = __xfrm_state_delete(x);
 	spin_unlock_bh(&x->lock);
+
+	return err;
 }
 EXPORT_SYMBOL(xfrm_state_delete);
 
@@ -557,16 +566,18 @@ int xfrm_state_check_expire(struct xfrm_state *x)
 
 	if (x->curlft.bytes >= x->lft.hard_byte_limit ||
 	    x->curlft.packets >= x->lft.hard_packet_limit) {
-		km_state_expired(x, 1);
-		if (!mod_timer(&x->timer, jiffies + XFRM_ACQ_EXPIRES*HZ))
+		x->km.state = XFRM_STATE_EXPIRED;
+		if (!mod_timer(&x->timer, jiffies))
 			xfrm_state_hold(x);
 		return -EINVAL;
 	}
 
 	if (!x->km.dying &&
 	    (x->curlft.bytes >= x->lft.soft_byte_limit ||
-	     x->curlft.packets >= x->lft.soft_packet_limit))
+	     x->curlft.packets >= x->lft.soft_packet_limit)) {
+		x->km.dying = 1;
 		km_state_expired(x, 0);
+	}
 	return 0;
 }
 EXPORT_SYMBOL(xfrm_state_check_expire);
@@ -796,34 +807,56 @@ EXPORT_SYMBOL(xfrm_replay_advance);
 static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list);
 static DEFINE_RWLOCK(xfrm_km_lock);
 
-static void km_state_expired(struct xfrm_state *x, int hard)
+void km_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
 	struct xfrm_mgr *km;
 
-	if (hard)
-		x->km.state = XFRM_STATE_EXPIRED;
-	else
-		x->km.dying = 1;
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list)
+		if (km->notify_policy)
+			km->notify_policy(xp, dir, c);
+	read_unlock(&xfrm_km_lock);
+}
 
+void km_state_notify(struct xfrm_state *x, struct km_event *c)
+{
+	struct xfrm_mgr *km;
 	read_lock(&xfrm_km_lock);
 	list_for_each_entry(km, &xfrm_km_list, list)
-		km->notify(x, hard);
+		if (km->notify)
+			km->notify(x, c);
 	read_unlock(&xfrm_km_lock);
+}
+
+EXPORT_SYMBOL(km_policy_notify);
+EXPORT_SYMBOL(km_state_notify);
+
+static void km_state_expired(struct xfrm_state *x, int hard)
+{
+	struct km_event c;
+
+	c.data.hard = hard;
+	c.event = XFRM_MSG_EXPIRE;
+	km_state_notify(x, &c);
 
 	if (hard)
 		wake_up(&km_waitq);
 }
 
+/*
+ * We send to all registered managers regardless of failure
+ * We are happy with one success
+*/
 static int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
 {
-	int err = -EINVAL;
+	int err = -EINVAL, acqret;
 	struct xfrm_mgr *km;
 
 	read_lock(&xfrm_km_lock);
 	list_for_each_entry(km, &xfrm_km_list, list) {
-		err = km->acquire(x, t, pol, XFRM_POLICY_OUT);
-		if (!err)
-			break;
+		acqret = km->acquire(x, t, pol, XFRM_POLICY_OUT);
+		if (!acqret)
+			err = acqret;
 	}
 	read_unlock(&xfrm_km_lock);
 	return err;
@@ -848,13 +881,11 @@ EXPORT_SYMBOL(km_new_mapping);
 
 void km_policy_expired(struct xfrm_policy *pol, int dir, int hard)
 {
-	struct xfrm_mgr *km;
+	struct km_event c;
 
-	read_lock(&xfrm_km_lock);
-	list_for_each_entry(km, &xfrm_km_list, list)
-		if (km->notify_policy)
-			km->notify_policy(pol, dir, hard);
-	read_unlock(&xfrm_km_lock);
+	c.data.hard = hard;
+	c.event = XFRM_MSG_POLEXPIRE;
+	km_policy_notify(pol, dir, &c);
 
 	if (hard)
 		wake_up(&km_waitq);
@@ -1024,6 +1055,43 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 }
 
 EXPORT_SYMBOL(xfrm_state_mtu);
+
+int xfrm_init_state(struct xfrm_state *x)
+{
+	struct xfrm_state_afinfo *afinfo;
+	int family = x->props.family;
+	int err;
+
+	err = -EAFNOSUPPORT;
+	afinfo = xfrm_state_get_afinfo(family);
+	if (!afinfo)
+		goto error;
+
+	err = 0;
+	if (afinfo->init_flags)
+		err = afinfo->init_flags(x);
+
+	xfrm_state_put_afinfo(afinfo);
+
+	if (err)
+		goto error;
+
+	err = -EPROTONOSUPPORT;
+	x->type = xfrm_get_type(x->id.proto, family);
+	if (x->type == NULL)
+		goto error;
+
+	err = x->type->init_state(x);
+	if (err)
+		goto error;
+
+	x->km.state = XFRM_STATE_VALID;
+
+error:
+	return err;
+}
+
+EXPORT_SYMBOL(xfrm_init_state);
 
 void __init xfrm_state_init(void)
 {
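
The notification rework above funnels every keying event through one struct km_event, so new event types only need to fill in the common fields. A sketch of how a caller reports an SA event after this patch (this mirrors km_state_expired() above; the function name is hypothetical):

	static void example_report_expire(struct xfrm_state *x, int hard)
	{
		struct km_event c;

		c.data.hard = hard;		/* hard vs. soft lifetime expiry */
		c.event = XFRM_MSG_EXPIRE;	/* netlink message type doubles as the event id */
		km_state_notify(x, &c);		/* fans out to every registered xfrm_mgr */
	}

Netlink-originated events additionally carry c.seq and c.pid copied from the triggering nlmsghdr, as xfrm_add_sa() and friends do below.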
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 97509011c274..8da3e25b2c4c 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -249,17 +249,10 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
 	if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1])))
 		goto error;
 
-	err = -ENOENT;
-	x->type = xfrm_get_type(x->id.proto, x->props.family);
-	if (x->type == NULL)
-		goto error;
-
-	err = x->type->init_state(x, NULL);
+	err = xfrm_init_state(x);
 	if (err)
 		goto error;
 
-	x->curlft.add_time = (unsigned long) xtime.tv_sec;
-	x->km.state = XFRM_STATE_VALID;
 	x->km.seq = p->seq;
 
 	return x;
@@ -277,6 +270,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	struct xfrm_usersa_info *p = NLMSG_DATA(nlh);
 	struct xfrm_state *x;
 	int err;
+	struct km_event c;
 
 	err = verify_newsa_info(p, (struct rtattr **) xfrma);
 	if (err)
@@ -286,6 +280,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (!x)
 		return err;
 
+	xfrm_state_hold(x);
 	if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
 		err = xfrm_state_add(x);
 	else
@@ -294,14 +289,24 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
 		xfrm_state_put(x);
+		goto out;
 	}
 
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	c.event = nlh->nlmsg_type;
+
+	km_state_notify(x, &c);
+out:
+	xfrm_state_put(x);
 	return err;
 }
 
 static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
 	struct xfrm_state *x;
+	int err;
+	struct km_event c;
 	struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
 
 	x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
@@ -313,10 +318,19 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 		return -EPERM;
 	}
 
-	xfrm_state_delete(x);
+	err = xfrm_state_delete(x);
+	if (err < 0) {
+		xfrm_state_put(x);
+		return err;
+	}
+
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	c.event = nlh->nlmsg_type;
+	km_state_notify(x, &c);
 	xfrm_state_put(x);
 
-	return 0;
+	return err;
 }
 
 static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
@@ -681,6 +695,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 {
 	struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh);
 	struct xfrm_policy *xp;
+	struct km_event c;
 	int err;
 	int excl;
 
@@ -692,6 +707,10 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	if (!xp)
 		return err;
 
+	/* shouldnt excl be based on nlh flags??
+	 * Aha! this is anti-netlink really i.e  more pfkey derived
+	 * in netlink excl is a flag and you wouldnt need
+	 * a type XFRM_MSG_UPDPOLICY - JHS */
 	excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
 	err = xfrm_policy_insert(p->dir, xp, excl);
 	if (err) {
@@ -699,6 +718,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 		return err;
 	}
 
+	c.event = nlh->nlmsg_type;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	km_policy_notify(xp, p->dir, &c);
+
 	xfrm_pol_put(xp);
 
 	return 0;
@@ -816,6 +840,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	struct xfrm_policy *xp;
 	struct xfrm_userpolicy_id *p;
 	int err;
+	struct km_event c;
 	int delete;
 
 	p = NLMSG_DATA(nlh);
@@ -843,6 +868,12 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 					      NETLINK_CB(skb).pid,
 					      MSG_DONTWAIT);
 		}
+	} else {
+		c.data.byid = p->index;
+		c.event = nlh->nlmsg_type;
+		c.seq = nlh->nlmsg_seq;
+		c.pid = nlh->nlmsg_pid;
+		km_policy_notify(xp, p->dir, &c);
 	}
 
 	xfrm_pol_put(xp);
@@ -852,15 +883,28 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 
 static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
+	struct km_event c;
 	struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
 
 	xfrm_state_flush(p->proto);
+	c.data.proto = p->proto;
+	c.event = nlh->nlmsg_type;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	km_state_notify(NULL, &c);
+
 	return 0;
 }
 
 static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
+	struct km_event c;
+
 	xfrm_policy_flush();
+	c.event = nlh->nlmsg_type;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	km_policy_notify(NULL, 0, &c);
 	return 0;
 }
 
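Both flush handlers pass a NULL object and let c.event carry the meaning, so the notification entry points must tolerate a missing state or policy. km_state_notify() and km_policy_notify() themselves are thin fan-out loops over the registered key managers; a sketch of that dispatch, assuming the xfrm_km_list/xfrm_km_lock names used in net/xfrm/xfrm_state.c:

    /* Sketch of the fan-out these handlers rely on: every registered
     * key manager (af_key's pfkey code, and the netlink_mgr at the end
     * of this file) sees the event, including x == NULL for flushes. */
    void km_state_notify(struct xfrm_state *x, struct km_event *c)
    {
    	struct xfrm_mgr *km;
    
    	read_lock(&xfrm_km_lock);
    	list_for_each_entry(km, &xfrm_km_list, list)
    		if (km->notify)
    			km->notify(x, c);
    	read_unlock(&xfrm_km_lock);
    }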
@@ -1069,15 +1113,16 @@ nlmsg_failure:
 	return -1;
 }
 
-static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
+static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
 {
 	struct sk_buff *skb;
+	int len = NLMSG_LENGTH(sizeof(struct xfrm_user_expire));
 
-	skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC);
+	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
 
-	if (build_expire(skb, x, hard) < 0)
+	if (build_expire(skb, x, c->data.hard) < 0)
 		BUG();
 
 	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
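Replacing the hand-tuned `sizeof(struct xfrm_user_expire) + 16` with NLMSG_LENGTH() sizes the skb from the actual netlink framing instead of a guessed slack. For reference, the sizing macros from <linux/netlink.h> that this and the following hunks lean on (paraphrased):

    #define NLMSG_ALIGNTO	4
    #define NLMSG_ALIGN(len)  (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1))
    #define NLMSG_LENGTH(len) ((len) + NLMSG_ALIGN(sizeof(struct nlmsghdr)))
    #define NLMSG_SPACE(len)  NLMSG_ALIGN(NLMSG_LENGTH(len))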
@@ -1085,6 +1130,131 @@ static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
 }
 
+static int xfrm_notify_sa_flush(struct km_event *c)
+{
+	struct xfrm_usersa_flush *p;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush));
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq,
+			XFRM_MSG_FLUSHSA, sizeof(*p));
+	nlh->nlmsg_flags = 0;
+
+	p = NLMSG_DATA(nlh);
+	p->proto = c->data.proto;
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int inline xfrm_sa_len(struct xfrm_state *x)
+{
+	int l = 0;
+	if (x->aalg)
+		l += RTA_SPACE(sizeof(*x->aalg) + (x->aalg->alg_key_len+7)/8);
+	if (x->ealg)
+		l += RTA_SPACE(sizeof(*x->ealg) + (x->ealg->alg_key_len+7)/8);
+	if (x->calg)
+		l += RTA_SPACE(sizeof(*x->calg));
+	if (x->encap)
+		l += RTA_SPACE(sizeof(*x->encap));
+
+	return l;
+}
+
+static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
+{
+	struct xfrm_usersa_info *p;
+	struct xfrm_usersa_id *id;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = xfrm_sa_len(x);
+	int headlen;
+
+	headlen = sizeof(*p);
+	if (c->event == XFRM_MSG_DELSA) {
+		len += RTA_SPACE(headlen);
+		headlen = sizeof(*id);
+	}
+	len += NLMSG_SPACE(headlen);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
+	nlh->nlmsg_flags = 0;
+
+	p = NLMSG_DATA(nlh);
+	if (c->event == XFRM_MSG_DELSA) {
+		id = NLMSG_DATA(nlh);
+		memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
+		id->spi = x->id.spi;
+		id->family = x->props.family;
+		id->proto = x->id.proto;
+
+		p = RTA_DATA(__RTA_PUT(skb, XFRMA_SA, sizeof(*p)));
+	}
+
+	copy_to_user_state(x, p);
+
+	if (x->aalg)
+		RTA_PUT(skb, XFRMA_ALG_AUTH,
+			sizeof(*(x->aalg))+(x->aalg->alg_key_len+7)/8, x->aalg);
+	if (x->ealg)
+		RTA_PUT(skb, XFRMA_ALG_CRYPT,
+			sizeof(*(x->ealg))+(x->ealg->alg_key_len+7)/8, x->ealg);
+	if (x->calg)
+		RTA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
+
+	if (x->encap)
+		RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+rtattr_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_send_state_notify(struct xfrm_state *x, struct km_event *c)
+{
+
+	switch (c->event) {
+	case XFRM_MSG_EXPIRE:
+		return xfrm_exp_state_notify(x, c);
+	case XFRM_MSG_DELSA:
+	case XFRM_MSG_UPDSA:
+	case XFRM_MSG_NEWSA:
+		return xfrm_notify_sa(x, c);
+	case XFRM_MSG_FLUSHSA:
+		return xfrm_notify_sa_flush(c);
+	default:
+		printk("xfrm_user: Unknown SA event %d\n", c->event);
+		break;
+	}
+
+	return 0;
+
+}
+
 static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 			 struct xfrm_tmpl *xt, struct xfrm_policy *xp,
 			 int dir)
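Note the asymmetric layouts xfrm_notify_sa() builds: XFRM_MSG_NEWSA/UPDSA carry a bare xfrm_usersa_info payload, while XFRM_MSG_DELSA leads with the small xfrm_usersa_id and tucks the full info behind an XFRMA_SA attribute. A hypothetical userspace decode of both shapes (process_sa() is an illustrative placeholder, not a real helper):

    /* Hypothetical decode of the two SA-event layouts built above;
     * needs <linux/netlink.h>, <linux/rtnetlink.h>, <linux/xfrm.h>. */
    static void decode_sa_event(struct nlmsghdr *nlh)
    {
    	if (nlh->nlmsg_type == XFRM_MSG_DELSA) {
    		struct xfrm_usersa_id *id = NLMSG_DATA(nlh);
    		int len = NLMSG_PAYLOAD(nlh, sizeof(*id));
    		struct rtattr *rta =
    			(struct rtattr *)((char *)id + NLMSG_ALIGN(sizeof(*id)));
    
    		for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len))
    			if (rta->rta_type == XFRMA_SA)
    				process_sa(RTA_DATA(rta));	/* placeholder */
    	} else {	/* NEWSA/UPDSA: the info is the payload itself */
    		process_sa(NLMSG_DATA(nlh));
    	}
    }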
@@ -1180,6 +1350,9 @@ static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
 	if (nr > XFRM_MAX_DEPTH)
 		return NULL;
 
+	if (p->dir > XFRM_POLICY_OUT)
+		return NULL;
+
 	xp = xfrm_policy_alloc(GFP_KERNEL);
 	if (xp == NULL) {
 		*dir = -ENOBUFS;
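The new bounds check matters because p->dir is used downstream as an index into per-direction policy state, and only the two socket-policy directions are meaningful at this entry point. The direction constants, as defined in the xfrm headers of this era:

    #define XFRM_POLICY_IN	0
    #define XFRM_POLICY_OUT	1
    #define XFRM_POLICY_FWD	2	/* rejected here, along with garbage values */
    #define XFRM_POLICY_MAX	3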
@@ -1218,7 +1391,7 @@ nlmsg_failure:
 	return -1;
 }
 
-static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
+static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
 	struct sk_buff *skb;
 	size_t len;
@@ -1229,7 +1402,7 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
 	if (skb == NULL)
 		return -ENOMEM;
 
-	if (build_polexpire(skb, xp, dir, hard) < 0)
+	if (build_polexpire(skb, xp, dir, c->data.hard) < 0)
 		BUG();
 
 	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
@@ -1237,6 +1410,103 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
 }
 
+static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+	struct xfrm_userpolicy_info *p;
+	struct xfrm_userpolicy_id *id;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	int headlen;
+
+	headlen = sizeof(*p);
+	if (c->event == XFRM_MSG_DELPOLICY) {
+		len += RTA_SPACE(headlen);
+		headlen = sizeof(*id);
+	}
+	len += NLMSG_SPACE(headlen);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
+
+	p = NLMSG_DATA(nlh);
+	if (c->event == XFRM_MSG_DELPOLICY) {
+		id = NLMSG_DATA(nlh);
+		memset(id, 0, sizeof(*id));
+		id->dir = dir;
+		if (c->data.byid)
+			id->index = xp->index;
+		else
+			memcpy(&id->sel, &xp->selector, sizeof(id->sel));
+
+		p = RTA_DATA(__RTA_PUT(skb, XFRMA_POLICY, sizeof(*p)));
+	}
+
+	nlh->nlmsg_flags = 0;
+
+	copy_to_user_policy(xp, p, dir);
+	if (copy_to_user_tmpl(xp, skb) < 0)
+		goto nlmsg_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+rtattr_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_notify_policy_flush(struct km_event *c)
+{
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = NLMSG_LENGTH(0);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, XFRM_MSG_FLUSHPOLICY, 0);
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+
+	switch (c->event) {
+	case XFRM_MSG_NEWPOLICY:
+	case XFRM_MSG_UPDPOLICY:
+	case XFRM_MSG_DELPOLICY:
+		return xfrm_notify_policy(xp, dir, c);
+	case XFRM_MSG_FLUSHPOLICY:
+		return xfrm_notify_policy_flush(c);
+	case XFRM_MSG_POLEXPIRE:
+		return xfrm_exp_policy_notify(xp, dir, c);
+	default:
+		printk("xfrm_user: Unknown Policy event %d\n", c->event);
+	}
+
+	return 0;
+
+}
+
 static struct xfrm_mgr netlink_mgr = {
 	.id = "netlink",
 	.notify = xfrm_send_state_notify,
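The .notify assignment above is why both send functions became event dispatchers: the xfrm_mgr callbacks now receive a km_event instead of a bare `hard` flag, so one hook per object type serves expire, add, update, delete, and flush alike. A sketch of the ops-table shape after this patch (see include/net/xfrm.h for the authoritative version):

    struct xfrm_mgr {
    	struct list_head	list;
    	char			*id;
    	int	(*notify)(struct xfrm_state *x, struct km_event *c);
    	int	(*acquire)(struct xfrm_state *x, struct xfrm_tmpl *t,
    			   struct xfrm_policy *xp, int dir);
    	struct xfrm_policy *(*compile_policy)(u16 family, int opt,
    					      u8 *data, int len, int *dir);
    	int	(*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr,
    			       u16 sport);
    	int	(*notify_policy)(struct xfrm_policy *xp, int dir,
    				 struct km_event *c);
    };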