path: root/net
author	Dave Kleikamp <shaggy@austin.ibm.com>	2005-07-13 09:57:38 -0400
committer	Dave Kleikamp <shaggy@austin.ibm.com>	2005-07-13 09:57:38 -0400
commit	f7f24758ac98a506770bc5910d33567610fa3403 (patch)
tree	ff7fad3d01bf9dc2e2e54b908f9fca4891e1ee72 /net
parent	b38a3ab3d1bb0dc3288f73903d4dc4672b5cd2d0 (diff)
parent	c32511e2718618f0b53479eb36e07439aa363a74 (diff)
Merge with /home/shaggy/git/linus-clean/
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Diffstat (limited to 'net')
-rw-r--r--  net/802/fddi.c | 4
-rw-r--r--  net/8021q/Kconfig | 19
-rw-r--r--  net/8021q/vlan.c | 8
-rw-r--r--  net/Kconfig | 456
-rw-r--r--  net/atm/Kconfig | 74
-rw-r--r--  net/atm/br2684.c | 3
-rw-r--r--  net/bluetooth/cmtp/core.c | 6
-rw-r--r--  net/bluetooth/hidp/core.c | 5
-rw-r--r--  net/bluetooth/rfcomm/sock.c | 7
-rw-r--r--  net/bluetooth/rfcomm/tty.c | 2
-rw-r--r--  net/bridge/Kconfig | 31
-rw-r--r--  net/bridge/br_netfilter.c | 2
-rw-r--r--  net/bridge/netfilter/ebt_log.c | 6
-rw-r--r--  net/core/dev.c | 132
-rw-r--r--  net/core/filter.c | 104
-rw-r--r--  net/core/neighbour.c | 6
-rw-r--r--  net/core/pktgen.c | 29
-rw-r--r--  net/core/rtnetlink.c | 2
-rw-r--r--  net/core/skbuff.c | 176
-rw-r--r--  net/core/sock.c | 11
-rw-r--r--  net/core/sysctl_net_core.c | 46
-rw-r--r--  net/core/wireless.c | 1
-rw-r--r--  net/decnet/Kconfig | 23
-rw-r--r--  net/decnet/af_decnet.c | 10
-rw-r--r--  net/decnet/dn_fib.c | 3
-rw-r--r--  net/decnet/dn_nsp_out.c | 3
-rw-r--r--  net/econet/Kconfig | 36
-rw-r--r--  net/ethernet/eth.c | 9
-rw-r--r--  net/ipv4/Kconfig | 180
-rw-r--r--  net/ipv4/Makefile | 10
-rw-r--r--  net/ipv4/af_inet.c | 11
-rw-r--r--  net/ipv4/fib_trie.c | 256
-rw-r--r--  net/ipv4/icmp.c | 3
-rw-r--r--  net/ipv4/igmp.c | 96
-rw-r--r--  net/ipv4/ip_input.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 19
-rw-r--r--  net/ipv4/ip_sockglue.c | 6
-rw-r--r--  net/ipv4/ipconfig.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 10
-rw-r--r--  net/ipv4/ipvs/Kconfig | 4
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c | 31
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 17
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c | 7
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 7
-rw-r--r--  net/ipv4/route.c | 137
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 114
-rw-r--r--  net/ipv4/tcp.c | 85
-rw-r--r--  net/ipv4/tcp_bic.c | 331
-rw-r--r--  net/ipv4/tcp_cong.c | 237
-rw-r--r--  net/ipv4/tcp_diag.c | 34
-rw-r--r--  net/ipv4/tcp_highspeed.c | 181
-rw-r--r--  net/ipv4/tcp_htcp.c | 289
-rw-r--r--  net/ipv4/tcp_hybla.c | 187
-rw-r--r--  net/ipv4/tcp_input.c | 824
-rw-r--r--  net/ipv4/tcp_ipv4.c | 5
-rw-r--r--  net/ipv4/tcp_minisocks.c | 4
-rw-r--r--  net/ipv4/tcp_output.c | 569
-rw-r--r--  net/ipv4/tcp_scalable.c | 68
-rw-r--r--  net/ipv4/tcp_timer.c | 5
-rw-r--r--  net/ipv4/tcp_vegas.c | 411
-rw-r--r--  net/ipv4/tcp_westwood.c | 259
-rw-r--r--  net/ipv6/Kconfig | 22
-rw-r--r--  net/ipv6/addrconf.c | 19
-rw-r--r--  net/ipv6/af_inet6.c | 4
-rw-r--r--  net/ipv6/ip6_flowlabel.c | 1
-rw-r--r--  net/ipv6/ip6_output.c | 1
-rw-r--r--  net/ipv6/mcast.c | 29
-rw-r--r--  net/ipv6/tcp_ipv6.c | 4
-rw-r--r--  net/ipx/Kconfig | 33
-rw-r--r--  net/irda/irlap.c | 3
-rw-r--r--  net/irda/irlap_event.c | 14
-rw-r--r--  net/irda/irlap_frame.c | 8
-rw-r--r--  net/irda/irttp.c | 2
-rw-r--r--  net/lapb/Kconfig | 22
-rw-r--r--  net/llc/llc_c_ev.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 13
-rw-r--r--  net/packet/Kconfig | 26
-rw-r--r--  net/packet/af_packet.c | 6
-rw-r--r--  net/rxrpc/krxiod.c | 2
-rw-r--r--  net/rxrpc/krxsecd.c | 2
-rw-r--r--  net/rxrpc/krxtimod.c | 2
-rw-r--r--  net/sched/Kconfig | 50
-rw-r--r--  net/sched/Makefile | 3
-rw-r--r--  net/sched/act_api.c | 10
-rw-r--r--  net/sched/cls_api.c | 2
-rw-r--r--  net/sched/cls_rsvp.h | 1
-rw-r--r--  net/sched/em_meta.c | 6
-rw-r--r--  net/sched/em_text.c | 157
-rw-r--r--  net/sched/sch_api.c | 65
-rw-r--r--  net/sched/sch_blackhole.c | 54
-rw-r--r--  net/sched/sch_cbq.c | 3
-rw-r--r--  net/sched/sch_generic.c | 35
-rw-r--r--  net/sched/sch_red.c | 2
-rw-r--r--  net/sctp/associola.c | 15
-rw-r--r--  net/sctp/bind_addr.c | 16
-rw-r--r--  net/sctp/chunk.c | 2
-rw-r--r--  net/sctp/endpointola.c | 19
-rw-r--r--  net/sctp/input.c | 26
-rw-r--r--  net/sctp/inqueue.c | 18
-rw-r--r--  net/sctp/output.c | 22
-rw-r--r--  net/sctp/outqueue.c | 50
-rw-r--r--  net/sctp/protocol.c | 7
-rw-r--r--  net/sctp/sm_make_chunk.c | 27
-rw-r--r--  net/sctp/sm_sideeffect.c | 13
-rw-r--r--  net/sctp/sm_statefuns.c | 16
-rw-r--r--  net/sctp/socket.c | 2
-rw-r--r--  net/sctp/ssnmap.c | 3
-rw-r--r--  net/sctp/sysctl.c | 13
-rw-r--r--  net/sctp/transport.c | 6
-rw-r--r--  net/sctp/ulpevent.c | 19
-rw-r--r--  net/sctp/ulpqueue.c | 9
-rw-r--r--  net/sunrpc/sunrpc_syms.c | 1
-rw-r--r--  net/sunrpc/svcsock.c | 6
-rw-r--r--  net/sunrpc/xprt.c | 6
-rw-r--r--  net/unix/Kconfig | 21
-rw-r--r--  net/unix/af_unix.c | 4
-rw-r--r--  net/wanrouter/Kconfig | 29
-rw-r--r--  net/wanrouter/wanmain.c | 6
-rw-r--r--  net/x25/Kconfig | 36
-rw-r--r--  net/xfrm/Kconfig | 15
121 files changed, 4455 insertions(+), 2180 deletions(-)
diff --git a/net/802/fddi.c b/net/802/fddi.c
index ebcf4830d6f1..5ce24c4bb840 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff *skb)
  * the proper pointer to the start of packet data (skb->data).
  */
 
-unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct fddihdr *fddi = (struct fddihdr *)skb->data;
-	unsigned short type;
+	__be16 type;
 
 	/*
 	 * Set mac.raw field to point to FC byte, set data field to point
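The change above is a sparse endianness annotation: __be16 marks a 16-bit value that stays in network byte order, so mixing it with host-order integers can be flagged at build time. A minimal sketch of the idiom follows; the helper name is hypothetical and not part of this commit.

#include <linux/types.h>	/* __be16 */
#include <linux/if_ether.h>	/* ETH_P_IP */
#include <asm/byteorder.h>	/* htons() */

/* Hypothetical consumer: compare in network byte order, so the
 * packet field itself never needs converting at run time. */
static int frame_is_ipv4(__be16 proto)
{
	return proto == htons(ETH_P_IP);
}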
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 000000000000..c4a382e450e2
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,19 @@
+#
+# Configuration for 802.1Q VLAN support
+#
+
+config VLAN_8021Q
+	tristate "802.1Q VLAN Support"
+	---help---
+	  Select this and you will be able to create 802.1Q VLAN interfaces
+	  on your ethernet interfaces. 802.1Q VLAN supports almost
+	  everything a regular ethernet interface does, including
+	  firewalling, bridging, and of course IP traffic. You will need
+	  the 'vconfig' tool from the VLAN project in order to effectively
+	  use VLANs. See the VLAN web page for more information:
+	  <http://www.candelatech.com/~greear/vlan.html>
+
+	  To compile this code as a module, choose M here: the module
+	  will be called 8021q.
+
+	  If unsure, say N.
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1f6d31670bc7..91e412b0ab00 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -578,6 +578,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		if (!vlandev)
 			continue;
 
+		if (netif_carrier_ok(dev)) {
+			if (!netif_carrier_ok(vlandev))
+				netif_carrier_on(vlandev);
+		} else {
+			if (netif_carrier_ok(vlandev))
+				netif_carrier_off(vlandev);
+		}
+
 		if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) {
 			vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK)
 						| flgs;
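The lines added here keep a VLAN device's carrier state in step with its underlying device. A condensed sketch of the same logic as a standalone helper (the function name is hypothetical, not a kernel API):

#include <linux/netdevice.h>

/* Mirror the lower device's carrier state onto the VLAN device
 * layered on top of it; sketch of the logic added above. */
static void vlan_propagate_carrier(struct net_device *dev,
				   struct net_device *vlandev)
{
	if (netif_carrier_ok(dev)) {
		if (!netif_carrier_ok(vlandev))
			netif_carrier_on(vlandev);
	} else {
		if (netif_carrier_ok(vlandev))
			netif_carrier_off(vlandev);
	}
}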
diff --git a/net/Kconfig b/net/Kconfig
index 9251b28e8d5d..2684e809a649 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -2,7 +2,7 @@
 # Network configuration
 #
 
-menu "Networking support"
+menu "Networking"
 
 config NET
 	bool "Networking support"
@@ -10,7 +10,9 @@ config NET
 	  Unless you really know what you are doing, you should say Y here.
 	  The reason is that some programs need kernel networking support even
 	  when running on a stand-alone machine that isn't connected to any
-	  other computer. If you are upgrading from an older kernel, you
+	  other computer.
+
+	  If you are upgrading from an older kernel, you
 	  should consider updating your networking tools too because changes
 	  in the kernel and the tools often go hand in hand. The tools are
 	  contained in the package net-tools, the location and version number
@@ -20,57 +22,14 @@ config NET
 	  recommended to read the NET-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
-menu "Networking options"
-	depends on NET
-
-config PACKET
-	tristate "Packet socket"
-	---help---
-	  The Packet protocol is used by applications which communicate
-	  directly with network devices without an intermediate network
-	  protocol implemented in the kernel, e.g. tcpdump. If you want them
-	  to work, choose Y.
+# Make sure that all config symbols are dependent on NET
+if NET
 
-	  To compile this driver as a module, choose M here: the module will
-	  be called af_packet.
-
-	  If unsure, say Y.
-
-config PACKET_MMAP
-	bool "Packet socket: mmapped IO"
-	depends on PACKET
-	help
-	  If you say Y here, the Packet protocol driver will use an IO
-	  mechanism that results in faster communication.
-
-	  If unsure, say N.
-
-config UNIX
-	tristate "Unix domain sockets"
-	---help---
-	  If you say Y here, you will include support for Unix domain sockets;
-	  sockets are the standard Unix mechanism for establishing and
-	  accessing network connections. Many commonly used programs such as
-	  the X Window system and syslog use these sockets even if your
-	  machine is not connected to any network. Unless you are working on
-	  an embedded system or something similar, you therefore definitely
-	  want to say Y here.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called unix. Note that several important services won't work
-	  correctly if you say M here and then neglect to load the module.
-
-	  Say Y unless you know what you are doing.
-
-config NET_KEY
-	tristate "PF_KEY sockets"
-	select XFRM
-	---help---
-	  PF_KEYv2 socket family, compatible to KAME ones.
-	  They are required if you are going to use IPsec tools ported
-	  from KAME.
+menu "Networking options"
 
-	  Say Y unless you know what you are doing.
+source "net/packet/Kconfig"
+source "net/unix/Kconfig"
+source "net/xfrm/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
@@ -94,30 +53,12 @@ config INET
 
 	  Short answer: say Y.
 
+if INET
 source "net/ipv4/Kconfig"
-
-# IPv6 as module will cause a CRASH if you try to unload it
-config IPV6
-	tristate "The IPv6 protocol"
-	depends on INET
-	default m
-	select CRYPTO if IPV6_PRIVACY
-	select CRYPTO_MD5 if IPV6_PRIVACY
-	---help---
-	  This is complemental support for the IP version 6.
-	  You will still be able to do traditional IPv4 networking as well.
-
-	  For general information about IPv6, see
-	  <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
-	  For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
-	  For specific information about IPv6 under Linux, read the HOWTO at
-	  <http://www.bieringer.de/linux/IPv6/>.
-
-	  To compile this protocol support as a module, choose M here: the
-	  module will be called ipv6.
-
 source "net/ipv6/Kconfig"
 
+endif # if INET
+
 menuconfig NETFILTER
 	bool "Network packet filtering (replaces ipchains)"
 	---help---
@@ -206,269 +147,16 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
-config XFRM
-	bool
-	depends on NET
-
-source "net/xfrm/Kconfig"
-
 source "net/sctp/Kconfig"
-
-config ATM
-	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  ATM is a high-speed networking technology for Local Area Networks
-	  and Wide Area Networks. It uses a fixed packet size and is
-	  connection oriented, allowing for the negotiation of minimum
-	  bandwidth requirements.
-
-	  In order to participate in an ATM network, your Linux box needs an
-	  ATM networking card. If you have that, say Y here and to the driver
-	  of your ATM card below.
-
-	  Note that you need a set of user-space programs to actually make use
-	  of ATM. See the file <file:Documentation/networking/atm.txt> for
-	  further details.
-
-config ATM_CLIP
-	tristate "Classical IP over ATM (EXPERIMENTAL)"
-	depends on ATM && INET
-	help
-	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
-	  ATMARP. If you want to communication with other IP hosts on your ATM
-	  network, you will typically either say Y here or to "LAN Emulation
-	  (LANE)" below.
-
-config ATM_CLIP_NO_ICMP
-	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
-	depends on ATM_CLIP
-	help
-	  Normally, an "ICMP host unreachable" message is sent if a neighbour
-	  cannot be reached because there is no VC to it in the kernel's
-	  ATMARP table. This may cause problems when ATMARP table entries are
-	  briefly removed during revalidation. If you say Y here, packets to
-	  such neighbours are silently discarded instead.
-
-config ATM_LANE
-	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
-	depends on ATM
-	help
-	  LAN Emulation emulates services of existing LANs across an ATM
-	  network. Besides operating as a normal ATM end station client, Linux
-	  LANE client can also act as an proxy client bridging packets between
-	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
-
-config ATM_MPOA
-	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
-	depends on ATM && INET && ATM_LANE!=n
-	help
-	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
-	  bridges and ATM attached hosts establish direct ATM VCs across
-	  subnetwork boundaries. These shortcut connections bypass routers
-	  enhancing overall network performance.
-
-config ATM_BR2684
-	tristate "RFC1483/2684 Bridged protocols"
-	depends on ATM && INET
-	help
-	  ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
-	  This device will act like an ethernet from the kernels point of view,
-	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
-	  This is sometimes used over DSL lines. If in doubt, say N.
-
-config ATM_BR2684_IPFILTER
-	bool "Per-VC IP filter kludge"
-	depends on ATM_BR2684
-	help
-	  This is an experimental mechanism for users who need to terminating a
-	  large number of IP-only vcc's. Do not enable this unless you are sure
-	  you know what you are doing.
-
-config BRIDGE
-	tristate "802.1d Ethernet Bridging"
-	---help---
-	  If you say Y here, then your Linux box will be able to act as an
-	  Ethernet bridge, which means that the different Ethernet segments it
-	  is connected to will appear as one Ethernet to the participants.
-	  Several such bridges can work together to create even larger
-	  networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
-	  As this is a standard, Linux bridges will cooperate properly with
-	  other third party bridge products.
-
-	  In order to use the Ethernet bridge, you'll need the bridge
-	  configuration tools; see <file:Documentation/networking/bridge.txt>
-	  for location. Please read the Bridge mini-HOWTO for more
-	  information.
-
-	  If you enable iptables support along with the bridge support then you
-	  turn your bridge into a bridging IP firewall.
-	  iptables will then see the IP packets being bridged, so you need to
-	  take this into account when setting up your firewall rules.
-	  Enabling arptables support when bridging will let arptables see
-	  bridged ARP traffic in the arptables FORWARD chain.
-
-	  To compile this code as a module, choose M here: the module
-	  will be called bridge.
-
-	  If unsure, say N.
-
-config VLAN_8021Q
-	tristate "802.1Q VLAN Support"
-	---help---
-	  Select this and you will be able to create 802.1Q VLAN interfaces
-	  on your ethernet interfaces. 802.1Q VLAN supports almost
-	  everything a regular ethernet interface does, including
-	  firewalling, bridging, and of course IP traffic. You will need
-	  the 'vconfig' tool from the VLAN project in order to effectively
-	  use VLANs. See the VLAN web page for more information:
-	  <http://www.candelatech.com/~greear/vlan.html>
-
-	  To compile this code as a module, choose M here: the module
-	  will be called 8021q.
-
-	  If unsure, say N.
-
-config DECNET
-	tristate "DECnet Support"
-	---help---
-	  The DECnet networking protocol was used in many products made by
-	  Digital (now Compaq). It provides reliable stream and sequenced
-	  packet communications over which run a variety of services similar
-	  to those which run over TCP/IP.
-
-	  To find some tools to use with the kernel layer support, please
-	  look at Patrick Caulfield's web site:
-	  <http://linux-decnet.sourceforge.net/>.
-
-	  More detailed documentation is available in
-	  <file:Documentation/networking/decnet.txt>.
-
-	  Be sure to say Y to "/proc file system support" and "Sysctl support"
-	  below when using DECnet, since you will need sysctl support to aid
-	  in configuration at run time.
-
-	  The DECnet code is also available as a module ( = code which can be
-	  inserted in and removed from the running kernel whenever you want).
-	  The module is called decnet.
-
+source "net/atm/Kconfig"
+source "net/bridge/Kconfig"
+source "net/8021q/Kconfig"
 source "net/decnet/Kconfig"
-
 source "net/llc/Kconfig"
-
-config IPX
-	tristate "The IPX protocol"
-	select LLC
-	---help---
-	  This is support for the Novell networking protocol, IPX, commonly
-	  used for local networks of Windows machines. You need it if you
-	  want to access Novell NetWare file or print servers using the Linux
-	  Novell client ncpfs (available from
-	  <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
-	  within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>). In order
-	  to do the former, you'll also have to say Y to "NCP file system
-	  support", below.
-
-	  IPX is similar in scope to IP, while SPX, which runs on top of IPX,
-	  is similar to TCP. There is also experimental support for SPX in
-	  Linux (see "SPX networking", below).
-
-	  To turn your Linux box into a fully featured NetWare file server and
-	  IPX router, say Y here and fetch either lwared from
-	  <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
-	  mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
-	  information, read the IPX-HOWTO available from
-	  <http://www.tldp.org/docs.html#howto>.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  The IPX driver would enlarge your kernel by about 16 KB. To compile
-	  this driver as a module, choose M here: the module will be called ipx.
-	  Unless you want to integrate your Linux box with a local Novell
-	  network, say N.
-
 source "net/ipx/Kconfig"
-
-config ATALK
-	tristate "Appletalk protocol support"
-	select LLC
-	---help---
-	  AppleTalk is the protocol that Apple computers can use to communicate
-	  on a network. If your Linux box is connected to such a network and you
-	  wish to connect to it, say Y. You will need to use the netatalk package
-	  so that your Linux box can act as a print and file server for Macs as
-	  well as access AppleTalk printers. Check out
-	  <http://www.zettabyte.net/netatalk/> on the WWW for details.
-	  EtherTalk is the name used for AppleTalk over Ethernet and the
-	  cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
-	  network using serial links. EtherTalk and LocalTalk are fully
-	  supported by Linux.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. The
-	  NET-3-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, contains valuable
-	  information as well.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called appletalk. You almost certainly want to compile it as a
-	  module so you can restart your AppleTalk stack without rebooting
-	  your machine. I hear that the GNU boycott of Apple is over, so
-	  even politically correct people are allowed to say Y here.
-
 source "drivers/net/appletalk/Kconfig"
-
-config X25
-	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  X.25 is a set of standardized network protocols, similar in scope to
-	  frame relay; the one physical line from your box to the X.25 network
-	  entry point can carry several logical point-to-point connections
-	  (called "virtual circuits") to other computers connected to the X.25
-	  network. Governments, banks, and other organizations tend to use it
-	  to connect to each other or to form Wide Area Networks (WANs). Many
-	  countries have public X.25 networks. X.25 consists of two
-	  protocols: the higher level Packet Layer Protocol (PLP) (say Y here
-	  if you want that) and the lower level data link layer protocol LAPB
-	  (say Y to "LAPB Data Link Driver" below if you want that).
-
-	  You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
-	  <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
-	  Information about X.25 for Linux is contained in the files
-	  <file:Documentation/networking/x25.txt> and
-	  <file:Documentation/networking/x25-iface.txt>.
-
-	  One connects to an X.25 network either with a dedicated network card
-	  using the X.21 protocol (not yet supported by Linux) or one can do
-	  X.25 over a standard telephone line using an ordinary modem (say Y
-	  to "X.25 async driver" below) or over Ethernet using an ordinary
-	  Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
-	  Driver" and "LAPB over Ethernet driver" below).
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called x25. If unsure, say N.
-
-config LAPB
-	tristate "LAPB Data Link Driver (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
-	  the lower) part of the X.25 protocol. It offers a reliable
-	  connection service to exchange data frames with one other host, and
-	  it is used to transport higher level protocols (mostly X.25 Packet
-	  Layer, the higher part of X.25, but others are possible as well).
-	  Usually, LAPB is used with specialized X.21 network cards, but Linux
-	  currently supports LAPB only over Ethernet connections. If you want
-	  to use LAPB connections over Ethernet, say Y here and to "LAPB over
-	  Ethernet driver" below. Read
-	  <file:Documentation/networking/lapb-module.txt> for technical
-	  details.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called lapb. If unsure, say N.
+source "net/x25/Kconfig"
+source "net/lapb/Kconfig"
 
 config NET_DIVERT
 	bool "Frame Diverter (EXPERIMENTAL)"
@@ -496,107 +184,10 @@ config NET_DIVERT
 
 	  If unsure, say N.
 
-config ECONET
-	tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && INET
-	---help---
-	  Econet is a fairly old and slow networking protocol mainly used by
-	  Acorn computers to access file and print servers. It uses native
-	  Econet network cards. AUN is an implementation of the higher level
-	  parts of Econet that runs over ordinary Ethernet connections, on
-	  top of the UDP packet protocol, which in turn runs on top of the
-	  Internet protocol IP.
-
-	  If you say Y here, you can choose with the next two options whether
-	  to send Econet/AUN traffic over a UDP Ethernet connection or over
-	  a native Econet network card.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called econet.
-
-config ECONET_AUNUDP
-	bool "AUN over UDP"
-	depends on ECONET
-	help
-	  Say Y here if you want to send Econet/AUN traffic over a UDP
-	  connection (UDP is a packet based protocol that runs on top of the
-	  Internet protocol IP) using an ordinary Ethernet network card.
-
-config ECONET_NATIVE
-	bool "Native Econet"
-	depends on ECONET
-	help
-	  Say Y here if you have a native Econet network card installed in
-	  your computer.
-
-config WAN_ROUTER
-	tristate "WAN router"
-	depends on EXPERIMENTAL
-	---help---
-	  Wide Area Networks (WANs), such as X.25, frame relay and leased
-	  lines, are used to interconnect Local Area Networks (LANs) over vast
-	  distances with data transfer rates significantly higher than those
-	  achievable with commonly used asynchronous modem connections.
-	  Usually, a quite expensive external device called a `WAN router' is
-	  needed to connect to a WAN.
-
-	  As an alternative, WAN routing can be built into the Linux kernel.
-	  With relatively inexpensive WAN interface cards available on the
-	  market, a perfectly usable router can be built for less than half
-	  the price of an external router. If you have one of those cards and
-	  wish to use your Linux box as a WAN router, say Y here and also to
-	  the WAN driver for your card, below. You will then need the
-	  wan-tools package which is available from <ftp://ftp.sangoma.com/>.
-	  Read <file:Documentation/networking/wan-router.txt> for more
-	  information.
-
-	  To compile WAN routing support as a module, choose M here: the
-	  module will be called wanrouter.
-
-	  If unsure, say N.
-
-menu "QoS and/or fair queueing"
-
-config NET_SCHED
-	bool "QoS and/or fair queueing"
-	---help---
-	  When the kernel has several packets to send out over a network
-	  device, it has to decide which ones to send first, which ones to
-	  delay, and which ones to drop. This is the job of the packet
-	  scheduler, and several different algorithms for how to do this
-	  "fairly" have been proposed.
-
-	  If you say N here, you will get the standard packet scheduler, which
-	  is a FIFO (first come, first served). If you say Y here, you will be
-	  able to choose from among several alternative algorithms which can
-	  then be attached to different network devices. This is useful for
-	  example if some of your network devices are real time devices that
-	  need a certain minimum data flow rate, or if you need to limit the
-	  maximum data flow rate for traffic which matches specified criteria.
-	  This code is considered to be experimental.
-
-	  To administer these schedulers, you'll need the user-level utilities
-	  from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
-	  That package also contains some documentation; for more, check out
-	  <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
-
-	  This Quality of Service (QoS) support will enable you to use
-	  Differentiated Services (diffserv) and Resource Reservation Protocol
-	  (RSVP) on your Linux router if you also say Y to "QoS support",
-	  "Packet classifier API" and to some classifiers below. Documentation
-	  and software is at <http://diffserv.sourceforge.net/>.
-
-	  If you say Y here and to "/proc file system" below, you will be able
-	  to read status information about packet schedulers from the file
-	  /proc/net/psched.
-
-	  The available schedulers are listed in the following questions; you
-	  can say Y to as many as you like. If unsure, say N now.
-
+source "net/econet/Kconfig"
+source "net/wanrouter/Kconfig"
 source "net/sched/Kconfig"
 
-endmenu
-
 menu "Network testing"
 
 config NET_PKTGEN
@@ -635,12 +226,9 @@ config NET_POLL_CONTROLLER
 	def_bool NETPOLL
 
 source "net/ax25/Kconfig"
-
 source "net/irda/Kconfig"
-
 source "net/bluetooth/Kconfig"
 
-source "drivers/net/Kconfig"
-
-endmenu
+endif # if NET
+endmenu # Networking
 
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 000000000000..bea2426229b1
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,74 @@
+#
+# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)
+#
+
+config ATM
+	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  ATM is a high-speed networking technology for Local Area Networks
+	  and Wide Area Networks. It uses a fixed packet size and is
+	  connection oriented, allowing for the negotiation of minimum
+	  bandwidth requirements.
+
+	  In order to participate in an ATM network, your Linux box needs an
+	  ATM networking card. If you have that, say Y here and to the driver
+	  of your ATM card below.
+
+	  Note that you need a set of user-space programs to actually make use
+	  of ATM. See the file <file:Documentation/networking/atm.txt> for
+	  further details.
+
+config ATM_CLIP
+	tristate "Classical IP over ATM (EXPERIMENTAL)"
+	depends on ATM && INET
+	help
+	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
+	  ATMARP. If you want to communication with other IP hosts on your ATM
+	  network, you will typically either say Y here or to "LAN Emulation
+	  (LANE)" below.
+
+config ATM_CLIP_NO_ICMP
+	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
+	depends on ATM_CLIP
+	help
+	  Normally, an "ICMP host unreachable" message is sent if a neighbour
+	  cannot be reached because there is no VC to it in the kernel's
+	  ATMARP table. This may cause problems when ATMARP table entries are
+	  briefly removed during revalidation. If you say Y here, packets to
+	  such neighbours are silently discarded instead.
+
+config ATM_LANE
+	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
+	depends on ATM
+	help
+	  LAN Emulation emulates services of existing LANs across an ATM
+	  network. Besides operating as a normal ATM end station client, Linux
+	  LANE client can also act as an proxy client bridging packets between
+	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
+
+config ATM_MPOA
+	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
+	depends on ATM && INET && ATM_LANE!=n
+	help
+	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
+	  bridges and ATM attached hosts establish direct ATM VCs across
+	  subnetwork boundaries. These shortcut connections bypass routers
+	  enhancing overall network performance.
+
+config ATM_BR2684
+	tristate "RFC1483/2684 Bridged protocols"
+	depends on ATM && INET
+	help
+	  ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
+	  This device will act like an ethernet from the kernels point of view,
+	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
+	  This is sometimes used over DSL lines. If in doubt, say N.
+
+config ATM_BR2684_IPFILTER
+	bool "Per-VC IP filter kludge"
+	depends on ATM_BR2684
+	help
+	  This is an experimental mechanism for users who need to terminating a
+	  large number of IP-only vcc's. Do not enable this unless you are sure
+	  you know what you are doing.
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index e6954cf1459d..289956c4dd3e 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -289,8 +289,7 @@ xmit will add the additional header part in that case */
  * This is similar to eth_type_trans, which cannot be used because of
  * our dev->hard_header_len
  */
-static inline unsigned short br_type_trans(struct sk_buff *skb,
-					   struct net_device *dev)
+static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 2e341de3e763..901eff7ebe74 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int cmtp_process_transmit(struct cmtp_session *session)
+static void cmtp_process_transmit(struct cmtp_session *session)
 {
 	struct sk_buff *skb, *nskb;
 	unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session)
 
 	if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
 		BT_ERR("Can't allocate memory for new frame");
-		return -ENOMEM;
+		return;
 	}
 
 	while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session)
 	cmtp_send_frame(session, nskb->data, nskb->len);
 
 	kfree_skb(nskb);
-
-	return skb_queue_len(&session->transmit);
 }
 
 static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index affbc55462e8..de8af5f42394 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int hidp_process_transmit(struct hidp_session *session)
+static void hidp_process_transmit(struct hidp_session *session)
 {
 	struct sk_buff *skb;
 
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session)
 		hidp_set_timer(session);
 		kfree_skb(skb);
 	}
-
-	return skb_queue_len(&session->ctrl_transmit) +
-				skb_queue_len(&session->intr_transmit);
 }
 
 static int hidp_session(void *arg)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index f3f6355a2786..63a123c5c41b 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
-				signal_pending(current) || !timeo)
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    sk->sk_err ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current) ||
+		    !timeo)
 			break;
 
 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6d689200bcf3..6304590fd36a 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
 
 	BT_DBG("tty %p dev %p", tty, dev);
 
-	if (skb_queue_len(&dlc->tx_queue))
+	if (!skb_queue_empty(&dlc->tx_queue))
 		return dlc->mtu;
 
 	return 0;
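Both rfcomm hunks replace truth tests on skb_queue_len() with skb_queue_empty(). The conditions are equivalent, but skb_queue_empty() states the intent and only inspects the list head. A sketch of the preferred idiom (the helper is hypothetical, assuming the usual <net/sock.h> definitions):

#include <net/sock.h>	/* struct sock, sk_receive_queue */

/* Hypothetical helper: "is there anything to read?" expressed with
 * skb_queue_empty() rather than by reading the queue length. */
static int rx_data_pending(struct sock *sk)
{
	return !skb_queue_empty(&sk->sk_receive_queue);
}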
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 000000000000..db23d59746cf
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,31 @@
+#
+# 802.1d Ethernet Bridging
+#
+
+config BRIDGE
+	tristate "802.1d Ethernet Bridging"
+	---help---
+	  If you say Y here, then your Linux box will be able to act as an
+	  Ethernet bridge, which means that the different Ethernet segments it
+	  is connected to will appear as one Ethernet to the participants.
+	  Several such bridges can work together to create even larger
+	  networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
+	  As this is a standard, Linux bridges will cooperate properly with
+	  other third party bridge products.
+
+	  In order to use the Ethernet bridge, you'll need the bridge
+	  configuration tools; see <file:Documentation/networking/bridge.txt>
+	  for location. Please read the Bridge mini-HOWTO for more
+	  information.
+
+	  If you enable iptables support along with the bridge support then you
+	  turn your bridge into a bridging IP firewall.
+	  iptables will then see the IP packets being bridged, so you need to
+	  take this into account when setting up your firewall rules.
+	  Enabling arptables support when bridging will let arptables see
+	  bridged ARP traffic in the arptables FORWARD chain.
+
+	  To compile this code as a module, choose M here: the module
+	  will be called bridge.
+
+	  If unsure, say N.
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 03ae4edddac3..2d52fee63a8c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -844,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
 	 * doesn't use the bridge parent of the indev by using
 	 * the BRNF_DONT_TAKE_PARENT mask. */
 	if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
-		nf_bridge->mask &= BRNF_DONT_TAKE_PARENT;
+		nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
 		nf_bridge->physindev = (struct net_device *)in;
 	}
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
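The one-character fix above matters: `&=` keeps only the bits already present in BRNF_DONT_TAKE_PARENT, wiping the rest of the mask and never actually setting the flag, while `|=` sets it and preserves the others. A userspace demonstration with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned int mask = 0x5;	/* some flags already set */
	unsigned int flag = 0x2;	/* flag we intend to add */

	/* Buggy form: &= intersects, losing everything. */
	printf("mask &= flag -> 0x%x\n", mask & flag);	/* 0x0 */
	/* Fixed form: |= adds the flag, keeps the rest. */
	printf("mask |= flag -> 0x%x\n", mask | flag);	/* 0x7 */
	return 0;
}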
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e4ae34b88925..662975be3d1d 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 {
 	struct ebt_log_info *info = (struct ebt_log_info *)data;
 	char level_string[4] = "< >";
-	union {struct iphdr iph; struct tcpudphdr ports;
-	       struct arphdr arph; struct arppayload arpp;} u;
 
 	level_string[1] = '0' + info->loglevel;
 	spin_lock_bh(&ebt_log_lock);
@@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	}
 	printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
 	       NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
-	printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos,
+	printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
 	       ih->protocol);
 	if (ih->protocol == IPPROTO_TCP ||
 	    ih->protocol == IPPROTO_UDP) {
@@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	    ah->ar_pln == sizeof(uint32_t)) {
 		struct arppayload _arpp, *ap;
 
-		ap = skb_header_pointer(skb, sizeof(u.arph),
+		ap = skb_header_pointer(skb, sizeof(_arph),
 					sizeof(_arpp), &_arpp);
 		if (ap == NULL) {
 			printk(" INCOMPLETE ARP payload");
diff --git a/net/core/dev.c b/net/core/dev.c
index ab935778ce81..ff9dc029233a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate. It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  * The list of packet types we will receive (as opposed to discard)
  * and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;	/* Taps */
 
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain;
  * Device drivers call our routines to queue packets here. We empty the
  * queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -1144,7 +1127,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 extern void skb_release_data(struct sk_buff *);
 
 /* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, int gfp_mask)
+int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	unsigned int size;
 	u8 *data;
@@ -1363,71 +1346,13 @@ out:
 			Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 300;
+int netdev_max_backlog = 1000;
+int netdev_budget = 300;
 int weight_p = 64;	/* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1448,7 +1373,6 @@ static void sample_queue(unsigned long dummy)
 
 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
 	struct softnet_data *queue;
 	unsigned long flags;
 
@@ -1464,38 +1388,22 @@ int netif_rx(struct sk_buff *skb)
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	this_cpu = smp_processor_id();
 	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
 			local_irq_restore(flags);
-			return queue->cng_level;
+			return NET_RX_SUCCESS;
 		}
 
-		if (queue->throttle)
-			queue->throttle = 0;
-
 		netif_rx_schedule(&queue->backlog_dev);
 		goto enqueue;
 	}
 
-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
-	}
-
-drop:
 	__get_cpu_var(netdev_rx_stat).dropped++;
 	local_irq_restore(flags);
 
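With the throttle and congestion-level machinery gone, netif_rx() reduces to a single test against netdev_max_backlog. A simplified sketch of the resulting flow; IRQ masking and statistics are omitted and the function name is invented, so this is not the verbatim kernel code:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Simplified shape of netif_rx() after this hunk. */
static int netif_rx_sketch(struct softnet_data *queue, struct sk_buff *skb)
{
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (skb_queue_empty(&queue->input_pkt_queue))
			netif_rx_schedule(&queue->backlog_dev); /* arm softirq */
		dev_hold(skb->dev);
		__skb_queue_tail(&queue->input_pkt_queue, skb);
		return NET_RX_SUCCESS;	/* the only success value now */
	}
	kfree_skb(skb);
	return NET_RX_DROP;
}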
@@ -1780,8 +1688,6 @@ job_done:
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);
 
-	if (queue->throttle)
-		queue->throttle = 0;
 	local_irq_enable();
 	return 0;
 }
@@ -1790,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
-	int budget = netdev_max_backlog;
-
+	int budget = netdev_budget;
 
 	local_irq_disable();
 
@@ -2055,15 +1960,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 	struct netif_rx_stats *s = v;
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		   );
+		   s->total, s->dropped, s->time_squeeze, 0,
+		   0, 0, 0, 0, /* was fastroute */
+		   s->cpu_collision );
 	return 0;
 }
 
@@ -2190,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
 
-	dev->flags |= IFF_PROMISC;
 	if ((dev->promiscuity += inc) == 0)
 		dev->flags &= ~IFF_PROMISC;
-	if (dev->flags ^ old_flags) {
+	else
+		dev->flags |= IFF_PROMISC;
+	if (dev->flags != old_flags) {
 		dev_mc_upload(dev);
 		printk(KERN_INFO "device %s %s promiscuous mode\n",
 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
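The reordering above makes dev_set_promiscuity() derive IFF_PROMISC from the reference count after the count is adjusted, so the log message fires exactly on real flag changes. The same control flow lifted into a self-contained userspace sketch (the flag value is borrowed for illustration):

#include <stdio.h>

#define IFF_PROMISC 0x100	/* value borrowed for illustration */

/* Sketch of the corrected ordering: adjust the count first, derive
 * the flag from it second, then compare against the saved flags. */
static void set_promiscuity(unsigned short *flags, int *count, int inc)
{
	unsigned short old_flags = *flags;

	if ((*count += inc) == 0)
		*flags &= ~IFF_PROMISC;
	else
		*flags |= IFF_PROMISC;
	if (*flags != old_flags)
		printf("device %s promiscuous mode\n",
		       (*flags & IFF_PROMISC) ? "entered" : "left");
}

int main(void)
{
	unsigned short flags = 0;
	int count = 0;

	set_promiscuity(&flags, &count, 1);	/* prints "entered" */
	set_promiscuity(&flags, &count, -1);	/* prints "left" */
	return 0;
}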
@@ -3305,9 +3205,6 @@ static int __init net_dev_init(void)
 
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3316,11 +3213,6 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
-
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ace2..cd91a24f9720 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
 #include <linux/filter.h>
 
 /* No hurry in this branch */
-static u8 *load_pointer(struct sk_buff *skb, int k)
+static void *__load_pointer(struct sk_buff *skb, int k)
 {
 	u8 *ptr = NULL;
 
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
 	return NULL;
 }
 
+static inline void *load_pointer(struct sk_buff *skb, int k,
+				 unsigned int size, void *buffer)
+{
+	if (k >= 0)
+		return skb_header_pointer(skb, k, size, buffer);
+	else {
+		if (k >= SKF_AD_OFF)
+			return NULL;
+		return __load_pointer(skb, k);
+	}
+}
+
 /**
  *	sk_run_filter - run a filter on a socket
  *	@skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
 
 int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 {
-	unsigned char *data = skb->data;
-	/* len is UNSIGNED. Byte wide insns relies only on implicit
-	   type casts to prevent reading arbitrary memory locations.
-	 */
-	unsigned int len = skb->len-skb->data_len;
 	struct sock_filter *fentry;	/* We walk down these */
+	void *ptr;
 	u32 A = 0;			/* Accumulator */
 	u32 X = 0;			/* Index Register */
 	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
+	u32 tmp;
 	int k;
 	int pc;
 
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 		case BPF_LD|BPF_W|BPF_ABS:
 			k = fentry->k;
 load_w:
-			if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) {
-				A = ntohl(*(u32*)&data[k]);
+			ptr = load_pointer(skb, k, 4, &tmp);
+			if (ptr != NULL) {
+				A = ntohl(*(u32 *)ptr);
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = ntohl(*(u32*)ptr);
-					continue;
-				}
-			} else {
-				u32 _tmp, *p;
-				p = skb_header_pointer(skb, k, 4, &_tmp);
-				if (p != NULL) {
-					A = ntohl(*p);
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_H|BPF_ABS:
 			k = fentry->k;
 load_h:
-			if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) {
-				A = ntohs(*(u16*)&data[k]);
+			ptr = load_pointer(skb, k, 2, &tmp);
+			if (ptr != NULL) {
+				A = ntohs(*(u16 *)ptr);
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = ntohs(*(u16*)ptr);
-					continue;
-				}
-			} else {
-				u16 _tmp, *p;
-				p = skb_header_pointer(skb, k, 2, &_tmp);
-				if (p != NULL) {
-					A = ntohs(*p);
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_B|BPF_ABS:
 			k = fentry->k;
load_b:
-			if (k >= 0 && (unsigned int)k < len) {
-				A = data[k];
+			ptr = load_pointer(skb, k, 1, &tmp);
+			if (ptr != NULL) {
+				A = *(u8 *)ptr;
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = *ptr;
-					continue;
-				}
-			} else {
-				u8 _tmp, *p;
-				p = skb_header_pointer(skb, k, 1, &_tmp);
-				if (p != NULL) {
-					A = *p;
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_W|BPF_LEN:
-			A = len;
+			A = skb->len;
 			continue;
 		case BPF_LDX|BPF_W|BPF_LEN:
-			X = len;
+			X = skb->len;
 			continue;
 		case BPF_LD|BPF_W|BPF_IND:
 			k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
 			k = X + fentry->k;
 			goto load_b;
 		case BPF_LDX|BPF_B|BPF_MSH:
-			if (fentry->k >= len)
-				return 0;
-			X = (data[fentry->k] & 0xf) << 2;
+			ptr = load_pointer(skb, fentry->k, 1, &tmp);
+			if (ptr != NULL) {
+				X = (*(u8 *)ptr & 0xf) << 2;
 				continue;
+			}
+			return 0;
 		case BPF_LD|BPF_IMM:
 			A = fentry->k;
 			continue;
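The rewrite funnels every absolute load through the new load_pointer(): non-negative offsets go to skb_header_pointer(), which returns a direct pointer when the bytes sit in the linear skb area and otherwise copies them into the caller's buffer, while negative offsets keep their ancillary-data meaning via __load_pointer(). A usage sketch of the underlying primitive; the helper name is hypothetical and not from this commit:

#include <linux/skbuff.h>	/* skb_header_pointer(), ntohs() via byteorder */

/* Read a 16-bit big-endian field at offset k, coping with data that
 * may live in fragments rather than the linear skb area. */
static int read_be16(struct sk_buff *skb, int k, u16 *val)
{
	u16 tmp, *p;

	p = skb_header_pointer(skb, k, sizeof(tmp), &tmp);
	if (p == NULL)
		return -1;	/* offset runs past the packet */
	*val = ntohs(*p);
	return 0;
}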
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 851eb927ed97..1beb782ac41b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1598,6 +1598,8 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
 
 	read_lock_bh(&tbl->lock);
 	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1   = 0;
+	ndtmsg->ndtm_pad2   = 0;
 
 	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
 	RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
@@ -1683,6 +1685,8 @@ static int neightbl_fill_param_info(struct neigh_table *tbl,
 
 	read_lock_bh(&tbl->lock);
 	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1   = 0;
+	ndtmsg->ndtm_pad2   = 0;
 	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
 
 	if (neightbl_fill_parms(skb, parms) < 0)
@@ -1872,6 +1876,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
1872 struct ndmsg *ndm = NLMSG_DATA(nlh); 1876 struct ndmsg *ndm = NLMSG_DATA(nlh);
1873 1877
1874 ndm->ndm_family = n->ops->family; 1878 ndm->ndm_family = n->ops->family;
1879 ndm->ndm_pad1 = 0;
1880 ndm->ndm_pad2 = 0;
1875 ndm->ndm_flags = n->flags; 1881 ndm->ndm_flags = n->flags;
1876 ndm->ndm_type = n->type; 1882 ndm->ndm_type = n->type;
1877 ndm->ndm_ifindex = n->dev->ifindex; 1883 ndm->ndm_ifindex = n->dev->ifindex;
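These pad fields exist purely for alignment, but the message header is copied to userspace verbatim, so leaving them uninitialized leaks stale kernel memory. For reference, the layout being filled (as in this era's linux/neighbour.h; shown for illustration):

	struct ndmsg {
		__u8	ndm_family;
		__u8	ndm_pad1;	/* must be zeroed before the skb is sent */
		__u16	ndm_pad2;	/* likewise */
		__s32	ndm_ifindex;
		__u16	ndm_state;
		__u8	ndm_flags;
		__u8	ndm_type;
	};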
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06bc79f3..975d651312dc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
 #include <asm/timex.h>
 
 
-#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n"
+#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
 
 /* #define PG_DEBUG(a) a */
 #define PG_DEBUG(a)
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	struct iphdr *iph;
 	struct pktgen_hdr *pgh = NULL;
 
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	mod_cur_headers(pkt_dev);
+
 	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
 	if (!skb) {
 		sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
 	udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
 
-	/* Update any of the values, used when we're incrementing various
-	 * fields.
-	 */
-	mod_cur_headers(pkt_dev);
-
 	memcpy(eth, pkt_dev->hh, 12);
 	*(u16*)&eth[12] = __constant_htons(ETH_P_IP);
 
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	int datalen;
 	struct ipv6hdr *iph;
 	struct pktgen_hdr *pgh = NULL;
 
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	mod_cur_headers(pkt_dev);
+
 	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
 	if (!skb) {
 		sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
 	udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
 
-
-	/* Update any of the values, used when we're incrementing various
-	 * fields.
-	 */
-	mod_cur_headers(pkt_dev);
-
-
 	memcpy(eth, pkt_dev->hh, 12);
 	*(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
 
-
 	datalen = pkt_dev->cur_pkt_size-14-
 		  sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
 
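Both fill_packet variants hoist mod_cur_headers() above alloc_skb(). The ordering matters: mod_cur_headers() is what advances cur_pkt_size (along with the incrementing ports and addresses), and the allocation consumes that value, so with the old placement every skb was sized from the previous iteration's state. In miniature:

	/* Intended order of operations; pkt_dev and mod_cur_headers are
	 * the pktgen names used in the hunks above. */
	mod_cur_headers(pkt_dev);	/* 1. advance cur_pkt_size, ports, ... */
	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
					/* 2. size the buffer from fresh values */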
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e013d836a7ab..4b1bb30e6381 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -126,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
 	rta->rta_type = attrtype;
 	rta->rta_len = size;
 	memcpy(RTA_DATA(rta), data, attrlen);
+	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
 }
 
 size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
@@ -188,6 +189,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
 	r = NLMSG_DATA(nlh);
 	r->ifi_family = AF_UNSPEC;
+	r->__ifi_pad = 0;
 	r->ifi_type = dev->type;
 	r->ifi_index = dev->ifindex;
 	r->ifi_flags = dev_get_flags(dev);
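The new memset in __rta_fill() clears the RTA_ALIGN() gap between the attribute payload and the next attribute, which previously held whatever the skb tail happened to contain. A worked example of the arithmetic (values are illustrative):

	/* For a 6-byte payload:
	 *   size = RTA_LENGTH(6) = attribute header (4) + 6 = 10
	 *   RTA_ALIGN(10)        = 12
	 * so two trailing pad bytes must be cleared explicitly: */
	int attrlen = 6;
	int size = RTA_LENGTH(attrlen);
	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);	/* 2 bytes */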
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6d68c03bc051..d9f7b06fe886 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
+struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *skb;
 	u8 *data;
@@ -182,7 +182,8 @@ nodata:
  *	%GFP_ATOMIC.
  */
 struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
-				     unsigned int size, int gfp_mask)
+				     unsigned int size,
+				     unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *skb;
 	u8 *data;
@@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb)
  *	%GFP_ATOMIC.
  */
 
-struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 
@@ -357,7 +358,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
 	C(ip_summed);
 	C(priority);
 	C(protocol);
-	C(security);
 	n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
 	C(nfmark);
@@ -422,7 +422,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->pkt_type	= old->pkt_type;
 	new->stamp	= old->stamp;
 	new->destructor = NULL;
-	new->security	= old->security;
 #ifdef CONFIG_NETFILTER
 	new->nfmark	= old->nfmark;
 	new->nfcache	= old->nfcache;
@@ -462,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  *	header is going to be modified. Use pskb_copy() instead.
  */
 
-struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	int headerlen = skb->data - skb->head;
 	/*
@@ -501,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
  *	The returned buffer has a reference count of 1.
  */
 
-struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
@@ -559,7 +558,8 @@ out:
  *	reloaded after call to this function.
  */
 
-int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+		     unsigned int __nocast gfp_mask)
 {
 	int i;
 	u8 *data;
@@ -649,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
  *	only by netfilter in the cases when checksum is recalculated? --ANK
  */
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
-				int newheadroom, int newtailroom, int gfp_mask)
+				int newheadroom, int newtailroom,
+				unsigned int __nocast gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
@@ -1500,6 +1501,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 	skb_split_no_header(skb, skb1, len, pos);
 }
 
1504/**
1505 * skb_prepare_seq_read - Prepare a sequential read of skb data
1506 * @skb: the buffer to read
1507 * @from: lower offset of data to be read
1508 * @to: upper offset of data to be read
1509 * @st: state variable
1510 *
1511 * Initializes the specified state variable. Must be called before
1512 * invoking skb_seq_read() for the first time.
1513 */
1514void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
1515 unsigned int to, struct skb_seq_state *st)
1516{
1517 st->lower_offset = from;
1518 st->upper_offset = to;
1519 st->root_skb = st->cur_skb = skb;
1520 st->frag_idx = st->stepped_offset = 0;
1521 st->frag_data = NULL;
1522}
1523
1524/**
1525 * skb_seq_read - Sequentially read skb data
1526 * @consumed: number of bytes consumed by the caller so far
1527 * @data: destination pointer for data to be returned
1528 * @st: state variable
1529 *
1530 * Reads a block of skb data at &consumed relative to the
1531 * lower offset specified to skb_prepare_seq_read(). Assigns
1532 * the head of the data block to &data and returns the length
1533 * of the block or 0 if the end of the skb data or the upper
1534 * offset has been reached.
1535 *
1536 * The caller is not required to consume all of the data
1537 * returned, i.e. &consumed is typically set to the number
1538 * of bytes already consumed and the next call to
1539 * skb_seq_read() will return the remaining part of the block.
1540 *
1541 *	Note: The size of each block of data returned can be arbitrary,
1542 *	      this limitation is the cost for zerocopy sequential
1543 *	      reads of potentially non-linear data.
1544 *
1545 * Note: Fragment lists within fragments are not implemented
1546 * at the moment, state->root_skb could be replaced with
1547 * a stack for this purpose.
1548 */
1549unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
1550 struct skb_seq_state *st)
1551{
1552 unsigned int block_limit, abs_offset = consumed + st->lower_offset;
1553 skb_frag_t *frag;
1554
1555 if (unlikely(abs_offset >= st->upper_offset))
1556 return 0;
1557
1558next_skb:
1559 block_limit = skb_headlen(st->cur_skb);
1560
1561 if (abs_offset < block_limit) {
1562 *data = st->cur_skb->data + abs_offset;
1563 return block_limit - abs_offset;
1564 }
1565
1566 if (st->frag_idx == 0 && !st->frag_data)
1567 st->stepped_offset += skb_headlen(st->cur_skb);
1568
1569 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
1570 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
1571 block_limit = frag->size + st->stepped_offset;
1572
1573 if (abs_offset < block_limit) {
1574 if (!st->frag_data)
1575 st->frag_data = kmap_skb_frag(frag);
1576
1577 *data = (u8 *) st->frag_data + frag->page_offset +
1578 (abs_offset - st->stepped_offset);
1579
1580 return block_limit - abs_offset;
1581 }
1582
1583 if (st->frag_data) {
1584 kunmap_skb_frag(st->frag_data);
1585 st->frag_data = NULL;
1586 }
1587
1588 st->frag_idx++;
1589 st->stepped_offset += frag->size;
1590 }
1591
1592 if (st->cur_skb->next) {
1593 st->cur_skb = st->cur_skb->next;
1594 st->frag_idx = 0;
1595 goto next_skb;
1596 } else if (st->root_skb == st->cur_skb &&
1597 skb_shinfo(st->root_skb)->frag_list) {
1598 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
1599 goto next_skb;
1600 }
1601
1602 return 0;
1603}
1604
1605/**
1606 * skb_abort_seq_read - Abort a sequential read of skb data
1607 * @st: state variable
1608 *
1609 * Must be called if skb_seq_read() was not called until it
1610 * returned 0.
1611 */
1612void skb_abort_seq_read(struct skb_seq_state *st)
1613{
1614 if (st->frag_data)
1615 kunmap_skb_frag(st->frag_data);
1616}
1617
1618#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
1619
1620static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
1621 struct ts_config *conf,
1622 struct ts_state *state)
1623{
1624 return skb_seq_read(offset, text, TS_SKB_CB(state));
1625}
1626
1627static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
1628{
1629 skb_abort_seq_read(TS_SKB_CB(state));
1630}
1631
1632/**
1633 * skb_find_text - Find a text pattern in skb data
1634 * @skb: the buffer to look in
1635 * @from: search offset
1636 * @to: search limit
1637 * @config: textsearch configuration
1638 * @state: uninitialized textsearch state variable
1639 *
1640 * Finds a pattern in the skb data according to the specified
1641 * textsearch configuration. Use textsearch_next() to retrieve
1642 * subsequent occurrences of the pattern. Returns the offset
1643 * to the first occurrence or UINT_MAX if no match was found.
1644 */
1645unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
1646 unsigned int to, struct ts_config *config,
1647 struct ts_state *state)
1648{
1649 config->get_next_block = skb_ts_get_next_block;
1650 config->finish = skb_ts_finish;
1651
1652 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
1653
1654 return textsearch_find(config, state);
1655}
1656
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1538,3 +1692,7 @@ EXPORT_SYMBOL(skb_queue_tail);
 EXPORT_SYMBOL(skb_unlink);
 EXPORT_SYMBOL(skb_append);
 EXPORT_SYMBOL(skb_split);
+EXPORT_SYMBOL(skb_prepare_seq_read);
+EXPORT_SYMBOL(skb_seq_read);
+EXPORT_SYMBOL(skb_abort_seq_read);
+EXPORT_SYMBOL(skb_find_text);
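The exports above round out the new zerocopy reader. A typical consumer, following the kerneldoc in this hunk, looks roughly like this (the function and its byte-summing logic are hypothetical):

	static unsigned int sum_skb_bytes(struct sk_buff *skb,
					  unsigned int from, unsigned int to)
	{
		struct skb_seq_state st;
		const u8 *data;
		unsigned int len, consumed = 0, sum = 0, i;

		skb_prepare_seq_read(skb, from, to, &st);
		while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
			for (i = 0; i < len; i++)
				sum += data[i];
			consumed += len;	/* consuming less is allowed too */
		}
		/* the loop ended because skb_seq_read() returned 0, so no
		 * skb_abort_seq_read() is required here */
		return sum;
	}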
diff --git a/net/core/sock.c b/net/core/sock.c
index a6ec3ada7f9e..8b35ccdc2b3b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -622,7 +622,8 @@ lenout:
  *	@prot: struct proto associated with this new sock instance
  *	@zero_it: if we should zero the newly allocated sock
  */
-struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
+struct sock *sk_alloc(int family, unsigned int __nocast priority,
+		      struct proto *prot, int zero_it)
 {
 	struct sock *sk = NULL;
 	kmem_cache_t *slab = prot->slab;
@@ -750,7 +751,8 @@ unsigned long sock_i_ino(struct sock *sk)
 /*
  * Allocate a skb from the socket's send buffer.
  */
-struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+			     unsigned int __nocast priority)
 {
 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 		struct sk_buff * skb = alloc_skb(size, priority);
@@ -765,7 +767,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
 /*
  * Allocate a skb from the socket's receive buffer.
  */
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+			     unsigned int __nocast priority)
 {
 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 		struct sk_buff *skb = alloc_skb(size, priority);
@@ -780,7 +783,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
 /*
  * Allocate a memory block from the socket's option memory buffer.
  */
-void *sock_kmalloc(struct sock *sk, int size, int priority)
+void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
 {
 	if ((unsigned)size <= sysctl_optmem_max &&
 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
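All of these signature changes swap a bare int for unsigned int __nocast on allocation-priority arguments. __nocast is a sparse annotation: ordinary builds compile it away, while sparse refuses silent conversions to and from the annotated type, catching callers that pass an arbitrary integer where a GFP mask belongs. Roughly how it is wired up (a sketch of the linux/compiler.h arrangement of this era, not copied from this tree):

	#ifdef __CHECKER__
	# define __nocast	__attribute__((nocast))	/* sparse-only attribute */
	#else
	# define __nocast				/* no effect on gcc */
	#endif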
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 880a88815211..8f817ad9f546 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,12 +13,8 @@
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
+extern int netdev_budget;
 extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
-extern int netdev_fastroute;
 extern int net_msg_cost;
 extern int net_msg_burst;
 
@@ -86,38 +82,6 @@ ctl_table core_table[] = {
 		.proc_handler	= &proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_NO_CONG_THRESH,
-		.procname	= "no_cong_thresh",
-		.data		= &no_cong_thresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG,
-		.procname	= "no_cong",
-		.data		= &no_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_LO_CONG,
-		.procname	= "lo_cong",
-		.data		= &lo_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_MOD_CONG,
-		.procname	= "mod_cong",
-		.data		= &mod_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
 		.ctl_name	= NET_CORE_MSG_COST,
 		.procname	= "message_cost",
 		.data		= &net_msg_cost,
@@ -161,6 +125,14 @@ ctl_table core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_CORE_BUDGET,
+		.procname	= "netdev_budget",
+		.data		= &netdev_budget,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/core/wireless.c b/net/core/wireless.c
index b2fe378dfbf8..3ff5639c0b78 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -1102,6 +1102,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
 	nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
 	r = NLMSG_DATA(nlh);
 	r->ifi_family = AF_UNSPEC;
+	r->__ifi_pad = 0;
 	r->ifi_type = dev->type;
 	r->ifi_index = dev->ifindex;
 	r->ifi_flags = dev->flags;
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index 2101da542ba8..92f2ec46fd22 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -1,6 +1,29 @@
 #
 # DECnet configuration
 #
+config DECNET
+	tristate "DECnet Support"
+	---help---
+	  The DECnet networking protocol was used in many products made by
+	  Digital (now Compaq).  It provides reliable stream and sequenced
+	  packet communications over which run a variety of services similar
+	  to those which run over TCP/IP.
+
+	  To find some tools to use with the kernel layer support, please
+	  look at Patrick Caulfield's web site:
+	  <http://linux-decnet.sourceforge.net/>.
+
+	  More detailed documentation is available in
+	  <file:Documentation/networking/decnet.txt>.
+
+	  Be sure to say Y to "/proc file system support" and "Sysctl support"
+	  below when using DECnet, since you will need sysctl support to aid
+	  in configuration at run time.
+
+	  The DECnet code is also available as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want).
+	  The module is called decnet.
+
 config DECNET_ROUTER
 	bool "DECnet: router support (EXPERIMENTAL)"
 	depends on DECNET && EXPERIMENTAL
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 29bb3cd21965..96a02800cd28 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk)
536 * we are double checking that we are not sending too 536 * we are double checking that we are not sending too
537 * many of these keepalive frames. 537 * many of these keepalive frames.
538 */ 538 */
539 if (skb_queue_len(&scp->other_xmit_queue) == 0) 539 if (skb_queue_empty(&scp->other_xmit_queue))
540 dn_nsp_send_link(sk, DN_NOCHANGE, 0); 540 dn_nsp_send_link(sk, DN_NOCHANGE, 0);
541} 541}
542 542
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table
1191 struct dn_scp *scp = DN_SK(sk); 1191 struct dn_scp *scp = DN_SK(sk);
1192 int mask = datagram_poll(file, sock, wait); 1192 int mask = datagram_poll(file, sock, wait);
1193 1193
1194 if (skb_queue_len(&scp->other_receive_queue)) 1194 if (!skb_queue_empty(&scp->other_receive_queue))
1195 mask |= POLLRDBAND; 1195 mask |= POLLRDBAND;
1196 1196
1197 return mask; 1197 return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1214 1214
1215 case SIOCATMARK: 1215 case SIOCATMARK:
1216 lock_sock(sk); 1216 lock_sock(sk);
1217 val = (skb_queue_len(&scp->other_receive_queue) != 0); 1217 val = !skb_queue_empty(&scp->other_receive_queue);
1218 if (scp->state != DN_RUN) 1218 if (scp->state != DN_RUN)
1219 val = -ENOTCONN; 1219 val = -ENOTCONN;
1220 release_sock(sk); 1220 release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int
1630 int len = 0; 1630 int len = 0;
1631 1631
1632 if (flags & MSG_OOB) 1632 if (flags & MSG_OOB)
1633 return skb_queue_len(q) ? 1 : 0; 1633 return !skb_queue_empty(q) ? 1 : 0;
1634 1634
1635 while(skb != (struct sk_buff *)q) { 1635 while(skb != (struct sk_buff *)q) {
1636 struct dn_skb_cb *cb = DN_SKB_CB(skb); 1636 struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
1707 if (sk->sk_err) 1707 if (sk->sk_err)
1708 goto out; 1708 goto out;
1709 1709
1710 if (skb_queue_len(&scp->other_receive_queue)) { 1710 if (!skb_queue_empty(&scp->other_receive_queue)) {
1711 if (!(flags & MSG_OOB)) { 1711 if (!(flags & MSG_OOB)) {
1712 msg->msg_flags |= MSG_OOB; 1712 msg->msg_flags |= MSG_OOB;
1713 if (!scp->other_report) { 1713 if (!scp->other_report) {
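Every conversion in this file replaces a skb_queue_len() comparison with skb_queue_empty(). The two are equivalent for emptiness tests, but the latter states the intent and only inspects the list head pointer rather than the qlen counter. Its 2.6-era definition in linux/skbuff.h is essentially:

	static inline int skb_queue_empty(const struct sk_buff_head *list)
	{
		return list->next == (struct sk_buff *)list;
	}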
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720e4..99bc061759c3 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (t < s_t)
 			continue;
 		if (t > s_t)
-			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+			memset(&cb->args[1], 0,
+			       sizeof(cb->args) - sizeof(cb->args[0]));
 		tb = dn_fib_get_table(t, 0);
 		if (tb == NULL)
 			continue;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 42abbf3f524f..8cce1fdbda90 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
 
 	dn_nsp_output(sk);
 
-	if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue))
+	if (!skb_queue_empty(&scp->data_xmit_queue) ||
+	    !skb_queue_empty(&scp->other_xmit_queue))
 		scp->persist = dn_nsp_persist(sk);
 
 	return 0;
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
new file mode 100644
index 000000000000..39a2d2975e0e
--- /dev/null
+++ b/net/econet/Kconfig
@@ -0,0 +1,36 @@
1#
2# Acorn Econet/AUN protocols
3#
4
5config ECONET
6 tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
7 depends on EXPERIMENTAL && INET
8 ---help---
9 Econet is a fairly old and slow networking protocol mainly used by
10 Acorn computers to access file and print servers. It uses native
11 Econet network cards. AUN is an implementation of the higher level
12 parts of Econet that runs over ordinary Ethernet connections, on
13 top of the UDP packet protocol, which in turn runs on top of the
14 Internet protocol IP.
15
16 If you say Y here, you can choose with the next two options whether
17 to send Econet/AUN traffic over a UDP Ethernet connection or over
18 a native Econet network card.
19
20 To compile this driver as a module, choose M here: the module
21 will be called econet.
22
23config ECONET_AUNUDP
24 bool "AUN over UDP"
25 depends on ECONET
26 help
27 Say Y here if you want to send Econet/AUN traffic over a UDP
28 connection (UDP is a packet based protocol that runs on top of the
29 Internet protocol IP) using an ordinary Ethernet network card.
30
31config ECONET_NATIVE
32 bool "Native Econet"
33 depends on ECONET
34 help
35 Say Y here if you have a native Econet network card installed in
36 your computer.
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 6617ea47d365..f6dbfb99b14d 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -92,10 +92,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
92 * Set the source hardware address. 92 * Set the source hardware address.
93 */ 93 */
94 94
95 if(saddr) 95 if(!saddr)
96 memcpy(eth->h_source,saddr,dev->addr_len); 96 saddr = dev->dev_addr;
97 else 97 memcpy(eth->h_source,saddr,dev->addr_len);
98 memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
99 98
100 /* 99 /*
101 * Anyway, the loopback-device should never use this function... 100 * Anyway, the loopback-device should never use this function...
@@ -156,7 +155,7 @@ int eth_rebuild_header(struct sk_buff *skb)
156 * This is normal practice and works for any 'now in use' protocol. 155 * This is normal practice and works for any 'now in use' protocol.
157 */ 156 */
158 157
159unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev) 158__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
160{ 159{
161 struct ethhdr *eth; 160 struct ethhdr *eth;
162 unsigned char *rawp; 161 unsigned char *rawp;
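Retyping eth_type_trans() from unsigned short to __be16 documents that the value is big-endian wire format. __be16 is bitwise-identical to u16 but sparse-checked, so a caller mixing it with host-order constants gets flagged; comparisons should go through htons(). A hypothetical caller:

	__be16 proto = eth_type_trans(skb, dev);

	if (proto == htons(ETH_P_IP))	/* compare in network byte order */
		handle_ip(skb);		/* hypothetical consumer */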
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c349..df5386885a90 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,35 +1,8 @@
 #
 # IP configuration
 #
-choice
-	prompt "Choose IP: FIB lookup"
-	depends on INET
-	default IP_FIB_HASH
-
-config IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algoritm.
-	  This improves lookup performance
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
-endchoice
-
 config IP_MULTICAST
 	bool "IP: multicasting"
-	depends on INET
 	help
 	  This is code for addressing several networked computers at once,
 	  enlarging your kernel by about 2 KB. You need multicasting if you
@@ -43,7 +16,6 @@ config IP_MULTICAST
 
 config IP_ADVANCED_ROUTER
 	bool "IP: advanced router"
-	depends on INET
 	---help---
 	  If you intend to run your Linux box mostly as a router, i.e. as a
 	  computer that forwards and redistributes network packets, say Y; you
@@ -79,6 +51,44 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
81 53
54choice
55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
56 depends on IP_ADVANCED_ROUTER
57 default IP_FIB_HASH
58
59config IP_FIB_HASH
60 bool "FIB_HASH"
61 ---help---
62 Current FIB is very proven and good enough for most users.
63
64config IP_FIB_TRIE
65 bool "FIB_TRIE"
66 ---help---
67	  Use new experimental LC-trie as FIB lookup algorithm.
68 This improves lookup performance if you have a large
69 number of routes.
70
71 LC-trie is a longest matching prefix lookup algorithm which
72 performs better than FIB_HASH for large routing tables.
73 But, it consumes more memory and is more complex.
74
75 LC-trie is described in:
76
77 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
78 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
79 An experimental study of compression methods for dynamic tries
80 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
81 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
82
83endchoice
84
85# If the user does not enable advanced routing, he gets the safe
86# default of the fib-hash algorithm.
87config IP_FIB_HASH
88 bool
89 depends on !IP_ADVANCED_ROUTER
90 default y
91
 config IP_MULTIPLE_TABLES
 	bool "IP: policy routing"
 	depends on IP_ADVANCED_ROUTER
@@ -171,7 +181,6 @@ config IP_ROUTE_VERBOSE
 
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
-	depends on INET
 	help
 	  This enables automatic configuration of IP addresses of devices and
 	  of the routing table during kernel boot, based on either information
@@ -230,7 +239,6 @@ config IP_PNP_RARP
 #   bool '    IP: ARP support' CONFIG_IP_PNP_ARP
 config NET_IPIP
 	tristate "IP: tunneling"
-	depends on INET
 	select INET_TUNNEL
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
@@ -248,7 +256,6 @@ config NET_IPIP
 
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
-	depends on INET
 	select XFRM
 	help
 	  Tunneling means encapsulating data of one protocol type within
@@ -307,7 +314,7 @@ config IP_PIMSM_V2
 
 config ARPD
 	bool "IP: ARP daemon support (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	---help---
 	  Normally, the kernel maintains an internal cache which maps IP
 	  addresses to hardware addresses on the local network, so that
@@ -332,7 +339,6 @@ config ARPD
 
 config SYN_COOKIES
 	bool "IP: TCP syncookie support (disabled per default)"
-	depends on INET
 	---help---
 	  Normal TCP/IP networking is open to an attack known as "SYN
 	  flooding". This denial-of-service attack prevents legitimate remote
@@ -369,7 +375,6 @@ config SYN_COOKIES
 
 config INET_AH
 	tristate "IP: AH transformation"
-	depends on INET
 	select XFRM
 	select CRYPTO
 	select CRYPTO_HMAC
@@ -382,7 +387,6 @@ config INET_AH
 
 config INET_ESP
 	tristate "IP: ESP transformation"
-	depends on INET
 	select XFRM
 	select CRYPTO
 	select CRYPTO_HMAC
@@ -396,7 +400,6 @@ config INET_ESP
 
 config INET_IPCOMP
 	tristate "IP: IPComp transformation"
-	depends on INET
 	select XFRM
 	select INET_TUNNEL
 	select CRYPTO
@@ -409,7 +412,6 @@ config INET_IPCOMP
 
 config INET_TUNNEL
 	tristate "IP: tunnel transformation"
-	depends on INET
 	select XFRM
 	---help---
 	  Support for generic IP tunnel transformation, which is required by
@@ -419,7 +421,6 @@ config INET_TUNNEL
 
 config IP_TCPDIAG
 	tristate "IP: TCP socket monitoring interface"
-	depends on INET
 	default y
 	---help---
 	  Support for TCP socket monitoring interface used by native Linux
@@ -433,5 +434,108 @@ config IP_TCPDIAG
 config IP_TCPDIAG_IPV6
 	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
 
437config TCP_CONG_ADVANCED
438 bool "TCP: advanced congestion control"
439 ---help---
440 Support for selection of various TCP congestion control
441 modules.
442
443 Nearly all users can safely say no here, and a safe default
444 selection will be made (BIC-TCP with new Reno as a fallback).
445
446 If unsure, say N.
447
448# TCP Reno is builtin (required as fallback)
449menu "TCP congestion control"
450 depends on TCP_CONG_ADVANCED
451
452config TCP_CONG_BIC
453 tristate "Binary Increase Congestion (BIC) control"
454 default y
455 ---help---
456 BIC-TCP is a sender-side only change that ensures a linear RTT
457 fairness under large windows while offering both scalability and
458 bounded TCP-friendliness. The protocol combines two schemes
459 called additive increase and binary search increase. When the
460 congestion window is large, additive increase with a large
461 increment ensures linear RTT fairness as well as good
462 scalability. Under small congestion windows, binary search
463 increase provides TCP friendliness.
464 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
465
466config TCP_CONG_WESTWOOD
467 tristate "TCP Westwood+"
468 default m
469 ---help---
470 TCP Westwood+ is a sender-side only modification of the TCP Reno
471 protocol stack that optimizes the performance of TCP congestion
472 control. It is based on end-to-end bandwidth estimation to set
473 congestion window and slow start threshold after a congestion
474 episode. Using this estimation, TCP Westwood+ adaptively sets a
475 slow start threshold and a congestion window which takes into
476 account the bandwidth used at the time congestion is experienced.
477 TCP Westwood+ significantly increases fairness wrt TCP Reno in
478 wired networks and throughput over wireless links.
479
480config TCP_CONG_HTCP
481 tristate "H-TCP"
482 default m
483 ---help---
484	  H-TCP is a sender-side only modification of the TCP Reno
485 protocol stack that optimizes the performance of TCP
486 congestion control for high speed network links. It uses a
487	  mode switch to change the alpha and beta parameters of TCP Reno
488	  based on network conditions, in a way that is fair with
489 other Reno and H-TCP flows.
490
491config TCP_CONG_HSTCP
492 tristate "High Speed TCP"
493 depends on EXPERIMENTAL
494 default n
495 ---help---
496 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
497 A modification to TCP's congestion control mechanism for use
498 with large congestion windows. A table indicates how much to
499 increase the congestion window by when an ACK is received.
500 For more detail see http://www.icir.org/floyd/hstcp.html
501
502config TCP_CONG_HYBLA
503 tristate "TCP-Hybla congestion control algorithm"
504 depends on EXPERIMENTAL
505 default n
506 ---help---
507 TCP-Hybla is a sender-side only change that eliminates penalization of
508 long-RTT, large-bandwidth connections, like when satellite legs are
509	  involved, especially when sharing a common bottleneck with normal
510 terrestrial connections.
511
512config TCP_CONG_VEGAS
513 tristate "TCP Vegas"
514 depends on EXPERIMENTAL
515 default n
516 ---help---
517 TCP Vegas is a sender-side only change to TCP that anticipates
518 the onset of congestion by estimating the bandwidth. TCP Vegas
519 adjusts the sending rate by modifying the congestion
520 window. TCP Vegas should provide less packet loss, but it is
521 not as aggressive as TCP Reno.
522
523config TCP_CONG_SCALABLE
524 tristate "Scalable TCP"
525 depends on EXPERIMENTAL
526 default n
527 ---help---
528 Scalable TCP is a sender-side only change to TCP which uses a
529	  MIMD congestion control algorithm that has some nice scaling
530	  properties, though it is known to have fairness issues.
531 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
532
533endmenu
534
535config TCP_CONG_BIC
536 tristate
537 depends on !TCP_CONG_ADVANCED
538 default y
539
436source "net/ipv4/ipvs/Kconfig" 540source "net/ipv4/ipvs/Kconfig"
437 541
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1add..5718cdb3a61e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,7 +5,8 @@
 obj-y     := utils.o route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o \
-	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
 
@@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 	xfrm4_output.o
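Each of the new objects above is a self-registering module built on the tcp_cong.c framework added by this merge. A minimal module has roughly this shape (the ops fields and register/unregister calls follow the 2.6.13-era API; "mycc" and its choice of the exported Reno helpers are placeholders, not one of the real modules):

	#include <linux/module.h>
	#include <net/tcp.h>

	/* Placeholder algorithm: defer to the Reno building blocks that
	 * tcp_cong.c exports for exactly this purpose. */
	static struct tcp_congestion_ops tcp_mycc = {
		.name		= "mycc",
		.owner		= THIS_MODULE,
		.ssthresh	= tcp_reno_ssthresh,
		.cong_avoid	= tcp_reno_cong_avoid,
		.min_cwnd	= tcp_reno_min_cwnd,
	};

	static int __init tcp_mycc_register(void)
	{
		return tcp_register_congestion_control(&tcp_mycc);
	}

	static void __exit tcp_mycc_unregister(void)
	{
		tcp_unregister_congestion_control(&tcp_mycc);
	}

	module_init(tcp_mycc_register);
	module_exit(tcp_mycc_unregister);
	MODULE_LICENSE("GPL");
	MODULE_DESCRIPTION("Example TCP congestion control");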
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
 static int ipv4_proc_init(void);
 extern void ipfrag_init(void);
 
+/*
+ *	IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+	.type = __constant_htons(ETH_P_IP),
+	.func = ip_rcv,
+};
+
 static int __init inet_init(void)
 {
 	struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
 
 	ipfrag_init();
 
+	dev_add_pack(&ip_packet_type);
+
 	rc = 0;
 out:
 	return rc;
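Registering ip_packet_type is deliberately the last step of inet_init(): dev_add_pack() makes the handler live immediately, so deferring it guarantees no IP frame is delivered before routing, fragment handling and the protocol array are ready. The same pattern applies to any protocol tap (a sketch; my_rcv is hypothetical, with the three-argument handler prototype this tree uses):

	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
			  struct packet_type *pt);	/* hypothetical handler */

	static struct packet_type my_packet_type = {
		.type = __constant_htons(ETH_P_ALL),	/* tap all protocols */
		.func = my_rcv,
	};

	/* ... initialise everything my_rcv depends on, then: */
	dev_add_pack(&my_packet_type);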
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0671569ee6f0..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.323"
+#define VERSION "0.325"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
 	unsigned int semantic_match_passed;
 	unsigned int semantic_match_miss;
 	unsigned int null_node_hit;
+	unsigned int resize_node_skipped;
 };
 #endif
 
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
 static int tnode_child_length(struct tnode *tn);
 static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
 static void tnode_free(struct tnode *tn);
 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -341,8 +342,10 @@ static struct leaf *leaf_new(void)
 static struct leaf_info *leaf_info_new(int plen)
 {
 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
-	li->plen = plen;
-	INIT_LIST_HEAD(&li->falh);
+	if(li) {
+		li->plen = plen;
+		INIT_LIST_HEAD(&li->falh);
+	}
 	return li;
 }
 
@@ -356,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
 	kfree(li);
 }
 
+static struct tnode *tnode_alloc(unsigned int size)
+{
+	if (size <= PAGE_SIZE) {
+		return kmalloc(size, GFP_KERNEL);
+	} else {
+		return (struct tnode *)
+		       __get_free_pages(GFP_KERNEL, get_order(size));
+	}
+}
+
+static void __tnode_free(struct tnode *tn)
+{
+	unsigned int size = sizeof(struct tnode) +
+			    (1<<tn->bits) * sizeof(struct node *);
+
+	if (size <= PAGE_SIZE)
+		kfree(tn);
+	else
+		free_pages((unsigned long)tn, get_order(size));
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
 	int nchildren = 1<<bits;
 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
-	struct tnode *tn = kmalloc(sz, GFP_KERNEL);
+	struct tnode *tn = tnode_alloc(sz);
 
 	if(tn)  {
 		memset(tn, 0, sz);
@@ -388,7 +412,7 @@ static void tnode_free(struct tnode *tn)
 		printk("FL %p \n", tn);
 	}
 	else if(IS_TNODE(tn)) {
-		kfree(tn);
+		__tnode_free(tn);
 		if(trie_debug > 0 )
 			printk("FT %p \n", tn);
 	}
@@ -458,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
 static struct node *resize(struct trie *t, struct tnode *tn)
 {
 	int i;
+	int err = 0;
 
 	if (!tn)
 		return NULL;
@@ -554,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 */
 
 	check_tnode(tn);
 
+	err = 0;
 	while ((tn->full_children > 0 &&
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 				inflate_threshold * tnode_child_length(tn))) {
 
-		tn = inflate(t, tn);
+		tn = inflate(t, tn, &err);
+
+		if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
 	}
 
 	check_tnode(tn);
@@ -568,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 * Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
+
+	err = 0;
 	while (tn->bits > 1 &&
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold * tnode_child_length(tn))
+	       halve_threshold * tnode_child_length(tn)) {
+
+		tn = halve(t, tn, &err);
+
+		if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
+	}
 
-		tn = halve(t, tn);
 
 	/* Only one child remains */
 
@@ -597,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	return (struct node *) tn;
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 {
 	struct tnode *inode;
 	struct tnode *oldtnode = tn;
@@ -609,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 
-	if (!tn)
-		trie_bug("tnode_new failed");
+	if (!tn) {
+		*err = -ENOMEM;
+		return oldtnode;
+	}
659 }
660
661 /*
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664	 * fails. In case of failure we return the old node and the
665	 * inflate of the tnode is ignored.
666 */
667
668 for(i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671 if (inode &&
672 IS_TNODE(inode) &&
673 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) {
675 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1);
681
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1);
689
690 if(!right) {
691 *err = -ENOMEM;
692 break;
693 }
694
695 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right);
697 }
698 }
699
700 if(*err) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
 
 	for(i = 0; i < olen; i++) {
 		struct node *node = tnode_get_child(oldtnode, i);
@@ -623,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
 		   tn->pos + tn->bits - 1) {
-			if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
+			if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
 			   1) == 0)
 				put_child(t, tn, 2*i, node);
 			else
@@ -663,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 			 * the position (inode->pos)
 			 */
 
-			t_key m = TKEY_GET_MASK(inode->pos, 1);
-
 			/* Use the old key, but set the new significant
 			 * bit to zero.
 			 */
-			left = tnode_new(inode->key&(~m), inode->pos + 1,
-					 inode->bits - 1);
 
-			if(!left)
-				trie_bug("tnode_new failed");
+			left = (struct tnode *) tnode_get_child(tn, 2*i);
+			put_child(t, tn, 2*i, NULL);
 
+			if(!left)
+				BUG();
 
-			/* Use the old key, but set the new significant
-			 * bit to one.
-			 */
-			right = tnode_new(inode->key|m, inode->pos + 1,
-					  inode->bits - 1);
+			right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+			put_child(t, tn, 2*i+1, NULL);
 
-			if(!right)
-				trie_bug("tnode_new failed");
+			if(!right)
+				BUG();
 
 			size = tnode_child_length(left);
 			for(j = 0; j < size; j++) {
 				put_child(t, left, j, inode->child[j]);
@@ -699,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 	return tn;
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
 {
 	struct tnode *oldtnode = tn;
 	struct node *left, *right;
@@ -710,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 
 	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 
-	if(!tn)
-		trie_bug("tnode_new failed");
+	if (!tn) {
+		*err = -ENOMEM;
+		return oldtnode;
+	}
811
812 /*
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815	 * fails. In case of failure we return the old node and the
816	 * halve of the tnode is ignored.
817 */
818
819 for(i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1);
822
823 /* Two nonempty children */
824 if( left && right) {
825 struct tnode *newBinNode =
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827
828 if(!newBinNode) {
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835
836 if(*err) {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if( tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 }
 
 	for(i = 0; i < olen; i += 2) {
 		left = tnode_get_child(oldtnode, i);
@@ -728,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 		/* Two nonempty children */
 		else {
 			struct tnode *newBinNode =
-				tnode_new(left->key, tn->pos + tn->bits, 1);
+				(struct tnode *) tnode_get_child(tn, i/2);
+			put_child(t, tn, i/2, NULL);
 
 			if(!newBinNode)
-				trie_bug("tnode_new failed");
+				BUG();
 
 			put_child(t, newBinNode, 0, left);
 			put_child(t, newBinNode, 1, right);
@@ -879,8 +1014,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	return (struct node*) tn;
 }
 
 static struct list_head *
-fib_insert_node(struct trie *t, u32 key, int plen)
+fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 {
 	int pos, newpos;
 	struct tnode *tp = NULL, *tn = NULL;
@@ -940,7 +1075,6 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 	if(tp && IS_LEAF(tp))
 		BUG();
 
-	t->revision++;
 
945 /* Case 1: n is a leaf. Compare prefixes */ 1079 /* Case 1: n is a leaf. Compare prefixes */
946 1080
@@ -949,8 +1083,10 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 
 		li = leaf_info_new(plen);
 
-		if(! li)
-			BUG();
+		if(! li) {
+			*err = -ENOMEM;
+			goto err;
+		}
 
 		fa_head = &li->falh;
 		insert_leaf_info(&l->list, li);
@@ -959,14 +1095,19 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 		t->size++;
 		l = leaf_new();
 
-		if(! l)
-			BUG();
+		if(! l) {
+			*err = -ENOMEM;
+			goto err;
+		}
 
 		l->key = key;
 		li = leaf_info_new(plen);
 
-		if(! li)
-			BUG();
+		if(! li) {
+			tnode_free((struct tnode *) l);
+			*err = -ENOMEM;
+			goto err;
+		}
 
 		fa_head = &li->falh;
 		insert_leaf_info(&l->list, li);
@@ -1003,9 +1144,14 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 			newpos = 0;
 			tn = tnode_new(key, newpos, 1); /* First tnode */
 		}
-		if(!tn)
-			trie_bug("tnode_pfx_new failed");
 
+		if(!tn) {
+			free_leaf_info(li);
+			tnode_free((struct tnode *) l);
+			*err = -ENOMEM;
+			goto err;
+		}
+
 		NODE_SET_PARENT(tn, tp);
 
 		missbit=tkey_extract_bits(key, newpos, 1);
@@ -1027,7 +1173,9 @@ fib_insert_node(struct trie *t, u32 key, int plen)
1027 } 1173 }
1028 /* Rebalance the trie */ 1174 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp); 1175 t->trie = trie_rebalance(t, tp);
1030done:; 1176done:
1177 t->revision++;
1178err:;
1031 return fa_head; 1179 return fa_head;
1032} 1180}
1033 1181
@@ -1156,8 +1304,12 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1156 * Insert new entry to the list. 1304 * Insert new entry to the list.
1157 */ 1305 */
1158 1306
1159 if(!fa_head) 1307 if(!fa_head) {
1160 fa_head = fib_insert_node(t, key, plen); 1308 fa_head = fib_insert_node(t, &err, key, plen);
1309 err = 0;
1310 if(err)
1311 goto out_free_new_fa;
1312 }
1161 1313
1162 write_lock_bh(&fib_lock); 1314 write_lock_bh(&fib_lock);
1163 1315
@@ -1170,6 +1322,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1322 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded: 1323succeeded:
1172 return 0; 1324 return 0;
1325
1326out_free_new_fa:
1327 kmem_cache_free(fn_alias_kmem, new_fa);
1173out: 1328out:
1174 fib_release_info(fi); 1329 fib_release_info(fi);
1175err:; 1330err:;
@@ -2279,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2282#ifdef CLEAR_STATS 2438#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats)); 2439 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif 2440#endif
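
The fib_trie.c hunks above convert hard BUG() calls on allocation failure into a recoverable path: fib_insert_node() gains an int *err out-parameter, frees whatever leaf and leaf_info it had built before failing, and inflate()/halve() unwind their partially filled child arrays before handing back the old node. A minimal userspace sketch of that out-parameter pattern, assuming illustrative names (node_new/build_pair are not kernel APIs):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node { struct node *left, *right; };

    static struct node *node_new(void)
    {
        return calloc(1, sizeof(struct node));
    }

    static struct node *build_pair(int *err)
    {
        struct node *n = node_new();

        *err = 0;
        if (!n)
            goto nomem;
        n->left = node_new();
        n->right = node_new();
        if (!n->left || !n->right) {
            /* unwind the partially built structure before reporting */
            free(n->left);
            free(n->right);
            free(n);
            goto nomem;
        }
        return n;

    nomem:
        *err = -ENOMEM;
        return NULL;
    }

    int main(void)
    {
        int err;
        struct node *n = build_pair(&err);

        if (!n) {
            printf("build_pair failed: %d\n", err);
            return 1;
        }
        printf("built ok\n");
        free(n->left);
        free(n->right);
        free(n);
        return 0;
    }
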
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cb759484979d..279f57abfecb 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -970,7 +970,8 @@ int icmp_rcv(struct sk_buff *skb)
970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently 970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
971 * discarded if to broadcast/multicast. 971 * discarded if to broadcast/multicast.
972 */ 972 */
973 if (icmph->type == ICMP_ECHO && 973 if ((icmph->type == ICMP_ECHO ||
974 icmph->type == ICMP_TIMESTAMP) &&
974 sysctl_icmp_echo_ignore_broadcasts) { 975 sysctl_icmp_echo_ignore_broadcasts) {
975 goto error; 976 goto error;
976 } 977 }
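
This icmp.c hunk extends the broadcast-ignore test from ICMP_ECHO alone to ICMP_ECHO and ICMP_TIMESTAMP, matching the RFC 1122 MAY quoted in the context. The added parentheses matter because && binds tighter than ||. A tiny illustrative comparison (userspace; the enum values mirror the usual ICMP type numbers):

    #include <stdio.h>

    enum { ECHO = 8, TIMESTAMP = 13 };

    static int drop_new(int type, int ignore_bcast)
    {
        return (type == ECHO || type == TIMESTAMP) && ignore_bcast;
    }

    static int drop_unparenthesized(int type, int ignore_bcast)
    {
        /* && binds tighter than ||, so this parses as
         * type == ECHO || (type == TIMESTAMP && ignore_bcast) */
        return type == ECHO || type == TIMESTAMP && ignore_bcast;
    }

    int main(void)
    {
        /* sysctl off: a broadcast ECHO must NOT be dropped */
        printf("parenthesized:   %d\n", drop_new(ECHO, 0));             /* 0 */
        printf("unparenthesized: %d\n", drop_unparenthesized(ECHO, 0)); /* 1 */
        return 0;
    }
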
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a90..5088f90835ae 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1615{ 1615{
1616 int err; 1616 int err;
1617 u32 addr = imr->imr_multiaddr.s_addr; 1617 u32 addr = imr->imr_multiaddr.s_addr;
1618 struct ip_mc_socklist *iml, *i; 1618 struct ip_mc_socklist *iml=NULL, *i;
1619 struct in_device *in_dev; 1619 struct in_device *in_dev;
1620 struct inet_sock *inet = inet_sk(sk); 1620 struct inet_sock *inet = inet_sk(sk);
1621 int ifindex;
1621 int count = 0; 1622 int count = 0;
1622 1623
1623 if (!MULTICAST(addr)) 1624 if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1633 goto done; 1634 goto done;
1634 } 1635 }
1635 1636
1636 iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1637
1638 err = -EADDRINUSE; 1637 err = -EADDRINUSE;
1638 ifindex = imr->imr_ifindex;
1639 for (i = inet->mc_list; i; i = i->next) { 1639 for (i = inet->mc_list; i; i = i->next) {
1640 if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { 1640 if (i->multi.imr_multiaddr.s_addr == addr &&
1641 /* New style additions are reference counted */ 1641 i->multi.imr_ifindex == ifindex)
1642 if (imr->imr_address.s_addr == 0) {
1643 i->count++;
1644 err = 0;
1645 }
1646 goto done; 1642 goto done;
1647 }
1648 count++; 1643 count++;
1649 } 1644 }
1650 err = -ENOBUFS; 1645 err = -ENOBUFS;
1651 if (iml == NULL || count >= sysctl_igmp_max_memberships) 1646 if (count >= sysctl_igmp_max_memberships)
1647 goto done;
1648 iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
1649 if (iml == NULL)
1652 goto done; 1650 goto done;
1651
1653 memcpy(&iml->multi, imr, sizeof(*imr)); 1652 memcpy(&iml->multi, imr, sizeof(*imr));
1654 iml->next = inet->mc_list; 1653 iml->next = inet->mc_list;
1655 iml->count = 1;
1656 iml->sflist = NULL; 1654 iml->sflist = NULL;
1657 iml->sfmode = MCAST_EXCLUDE; 1655 iml->sfmode = MCAST_EXCLUDE;
1658 inet->mc_list = iml; 1656 inet->mc_list = iml;
1659 ip_mc_inc_group(in_dev, addr); 1657 ip_mc_inc_group(in_dev, addr);
1660 iml = NULL;
1661 err = 0; 1658 err = 0;
1662
1663done: 1659done:
1664 rtnl_shunlock(); 1660 rtnl_shunlock();
1665 if (iml)
1666 sock_kfree_s(sk, iml, sizeof(*iml));
1667 return err; 1661 return err;
1668} 1662}
1669 1663
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1693{ 1687{
1694 struct inet_sock *inet = inet_sk(sk); 1688 struct inet_sock *inet = inet_sk(sk);
1695 struct ip_mc_socklist *iml, **imlp; 1689 struct ip_mc_socklist *iml, **imlp;
1690 struct in_device *in_dev;
1691 u32 group = imr->imr_multiaddr.s_addr;
1692 u32 ifindex;
1696 1693
1697 rtnl_lock(); 1694 rtnl_lock();
1695 in_dev = ip_mc_find_dev(imr);
1696 if (!in_dev) {
1697 rtnl_unlock();
1698 return -ENODEV;
1699 }
1700 ifindex = imr->imr_ifindex;
1698 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1701 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
1699 if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && 1702 if (iml->multi.imr_multiaddr.s_addr == group &&
1700 iml->multi.imr_address.s_addr==imr->imr_address.s_addr && 1703 iml->multi.imr_ifindex == ifindex) {
1701 (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { 1704 (void) ip_mc_leave_src(sk, iml, in_dev);
1702 struct in_device *in_dev;
1703
1704 in_dev = inetdev_by_index(iml->multi.imr_ifindex);
1705 if (in_dev)
1706 (void) ip_mc_leave_src(sk, iml, in_dev);
1707 if (--iml->count) {
1708 rtnl_unlock();
1709 if (in_dev)
1710 in_dev_put(in_dev);
1711 return 0;
1712 }
1713 1705
1714 *imlp = iml->next; 1706 *imlp = iml->next;
1715 1707
1716 if (in_dev) { 1708 ip_mc_dec_group(in_dev, group);
1717 ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
1718 in_dev_put(in_dev);
1719 }
1720 rtnl_unlock(); 1709 rtnl_unlock();
1721 sock_kfree_s(sk, iml, sizeof(*iml)); 1710 sock_kfree_s(sk, iml, sizeof(*iml));
1722 return 0; 1711 return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1736 struct in_device *in_dev = NULL; 1725 struct in_device *in_dev = NULL;
1737 struct inet_sock *inet = inet_sk(sk); 1726 struct inet_sock *inet = inet_sk(sk);
1738 struct ip_sf_socklist *psl; 1727 struct ip_sf_socklist *psl;
1728 int leavegroup = 0;
1739 int i, j, rv; 1729 int i, j, rv;
1740 1730
1741 if (!MULTICAST(addr)) 1731 if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1755 err = -EADDRNOTAVAIL; 1745 err = -EADDRNOTAVAIL;
1756 1746
1757 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1747 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1758 if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) 1748 if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
1749 && pmc->multi.imr_ifindex == imr.imr_ifindex)
1759 break; 1750 break;
1760 } 1751 }
1761 if (!pmc) /* must have a prior join */ 1752 if (!pmc) { /* must have a prior join */
1753 err = -EINVAL;
1762 goto done; 1754 goto done;
1755 }
1763 /* if a source filter was set, must be the same mode as before */ 1756 /* if a source filter was set, must be the same mode as before */
1764 if (pmc->sflist) { 1757 if (pmc->sflist) {
1765 if (pmc->sfmode != omode) 1758 if (pmc->sfmode != omode) {
1759 err = -EINVAL;
1766 goto done; 1760 goto done;
1761 }
1767 } else if (pmc->sfmode != omode) { 1762 } else if (pmc->sfmode != omode) {
1768 /* allow mode switches for empty-set filters */ 1763 /* allow mode switches for empty-set filters */
1769 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); 1764 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1775 psl = pmc->sflist; 1770 psl = pmc->sflist;
1776 if (!add) { 1771 if (!add) {
1777 if (!psl) 1772 if (!psl)
1778 goto done; 1773 goto done; /* err = -EADDRNOTAVAIL */
1779 rv = !0; 1774 rv = !0;
1780 for (i=0; i<psl->sl_count; i++) { 1775 for (i=0; i<psl->sl_count; i++) {
1781 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, 1776 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1784 break; 1779 break;
1785 } 1780 }
1786 if (rv) /* source not found */ 1781 if (rv) /* source not found */
1782 goto done; /* err = -EADDRNOTAVAIL */
1783
1784 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1785 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
1786 leavegroup = 1;
1787 goto done; 1787 goto done;
1788 }
1788 1789
1789 /* update the interface filter */ 1790 /* update the interface filter */
1790 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 1791 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1842 &mreqs->imr_sourceaddr, 1); 1843 &mreqs->imr_sourceaddr, 1);
1843done: 1844done:
1844 rtnl_shunlock(); 1845 rtnl_shunlock();
1846 if (leavegroup)
1847 return ip_mc_leave_group(sk, &imr);
1845 return err; 1848 return err;
1846} 1849}
1847 1850
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 1851int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{ 1852{
1850 int err; 1853 int err = 0;
1851 struct ip_mreqn imr; 1854 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr; 1855 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc; 1856 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev; 1857 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk); 1858 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl; 1859 struct ip_sf_socklist *newpsl, *psl;
1860 int leavegroup = 0;
1857 1861
1858 if (!MULTICAST(addr)) 1862 if (!MULTICAST(addr))
1859 return -EINVAL; 1863 return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1872 err = -ENODEV; 1876 err = -ENODEV;
1873 goto done; 1877 goto done;
1874 } 1878 }
1875 err = -EADDRNOTAVAIL; 1879
1880 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1881 if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
1882 leavegroup = 1;
1883 goto done;
1884 }
1876 1885
1877 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1886 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1878 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 1887 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1879 pmc->multi.imr_ifindex == imr.imr_ifindex) 1888 pmc->multi.imr_ifindex == imr.imr_ifindex)
1880 break; 1889 break;
1881 } 1890 }
1882 if (!pmc) /* must have a prior join */ 1891 if (!pmc) { /* must have a prior join */
1892 err = -EINVAL;
1883 goto done; 1893 goto done;
1894 }
1884 if (msf->imsf_numsrc) { 1895 if (msf->imsf_numsrc) {
1885 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, 1896 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1886 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); 1897 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1909 0, NULL, 0); 1920 0, NULL, 0);
1910 pmc->sflist = newpsl; 1921 pmc->sflist = newpsl;
1911 pmc->sfmode = msf->imsf_fmode; 1922 pmc->sfmode = msf->imsf_fmode;
1923 err = 0;
1912done: 1924done:
1913 rtnl_shunlock(); 1925 rtnl_shunlock();
1926 if (leavegroup)
1927 err = ip_mc_leave_group(sk, &imr);
1914 return err; 1928 return err;
1915} 1929}
1916 1930
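
Two patterns run through the igmp.c hunks: allocate only after the duplicate and membership-limit checks pass (the sock_kmalloc() call moved below the mc_list scan), and record deferred work in a flag — both ip_mc_source() and ip_mc_msfilter() now translate (INCLUDE, empty source list) into a group leave, but only after rtnl_shunlock(). A userspace sketch of the deferred-leave shape, with illustrative names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;
    static int members = 1;

    static void leave_group(void)
    {
        pthread_mutex_lock(&rtnl);
        members--;
        pthread_mutex_unlock(&rtnl);
    }

    static void set_filter(int fmode_include, int numsrc)
    {
        int leavegroup = 0;

        pthread_mutex_lock(&rtnl);
        /* (INCLUDE, empty) == LEAVE_GROUP: record the decision instead
         * of calling leave_group() here, which would re-take the lock */
        if (fmode_include && numsrc == 0)
            leavegroup = 1;
        pthread_mutex_unlock(&rtnl);

        if (leavegroup)
            leave_group();
    }

    int main(void)
    {
        set_filter(1, 0);
        printf("members = %d\n", members);    /* 0 */
        return 0;
    }
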
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index af2ec88bbb2f..c703528e0bcd 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -283,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
283{ 283{
284 struct net_device *dev = skb->dev; 284 struct net_device *dev = skb->dev;
285 struct iphdr *iph = skb->nh.iph; 285 struct iphdr *iph = skb->nh.iph;
286 int err;
286 287
287 /* 288 /*
288 * Initialise the virtual path cache for the packet. It describes 289 * Initialise the virtual path cache for the packet. It describes
289 * how the packet travels inside Linux networking. 290 * how the packet travels inside Linux networking.
290 */ 291 */
291 if (skb->dst == NULL) { 292 if (skb->dst == NULL) {
292 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) 293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
294 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
293 goto drop; 296 goto drop;
297 }
294 } 298 }
295 299
296#ifdef CONFIG_NET_CLS_ROUTE 300#ifdef CONFIG_NET_CLS_ROUTE
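
The ip_rcv_finish() change keeps a single drop path but captures the routing error so -EHOSTUNREACH can bump IPSTATS_MIB_INADDRERRORS; it pairs with the e_hostunreach exit added to ip_route_input_slow() later in this diff. A minimal sketch of the shape (helper and counter names are illustrative):

    #include <errno.h>
    #include <stdio.h>

    static unsigned long in_addr_errors;    /* stand-in for IPSTATS_MIB_INADDRERRORS */

    static int route_input(int reachable)
    {
        return reachable ? 0 : -EHOSTUNREACH;
    }

    static int rcv_finish(int reachable)
    {
        int err = route_input(reachable);

        if (err) {
            if (err == -EHOSTUNREACH)
                in_addr_errors++;    /* count, then fall through to drop */
            return -1;               /* drop */
        }
        return 0;
    }

    int main(void)
    {
        rcv_finish(0);
        printf("InAddrErrors = %lu\n", in_addr_errors);
        return 0;
    }
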
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ee07aec215a0..80d13103b2b0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,7 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110 nf_reset(newskb);
111 netif_rx(newskb); 110 netif_rx(newskb);
112 return 0; 111 return 0;
113} 112}
@@ -188,8 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 187 skb = skb2;
189 } 188 }
190 189
191 nf_reset(skb);
192
193 if (hh) { 190 if (hh) {
194 int hh_alen; 191 int hh_alen;
195 192
@@ -383,7 +380,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
383 to->pkt_type = from->pkt_type; 380 to->pkt_type = from->pkt_type;
384 to->priority = from->priority; 381 to->priority = from->priority;
385 to->protocol = from->protocol; 382 to->protocol = from->protocol;
386 to->security = from->security;
387 dst_release(to->dst); 383 dst_release(to->dst);
388 to->dst = dst_clone(from->dst); 384 to->dst = dst_clone(from->dst);
389 to->dev = from->dev; 385 to->dev = from->dev;
@@ -1323,23 +1319,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1323 ip_rt_put(rt); 1319 ip_rt_put(rt);
1324} 1320}
1325 1321
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void) 1322void __init ip_init(void)
1340{ 1323{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init(); 1324 ip_rt_init();
1344 inet_initpeers(); 1325 inet_initpeers();
1345 1326
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f8b172f89811..fc7c481d0d79 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
677 mreq.imr_address.s_addr = mreqs.imr_interface; 677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0; 678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq); 679 err = ip_mc_join_group(sk, &mreq);
680 if (err) 680 if (err && err != -EADDRINUSE)
681 break; 681 break;
682 omode = MCAST_INCLUDE; 682 omode = MCAST_INCLUDE;
683 add = 1; 683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 684 } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE; 685 omode = MCAST_INCLUDE;
686 add = 0; 686 add = 0;
687 } 687 }
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
754 mreq.imr_address.s_addr = 0; 754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface; 755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq); 756 err = ip_mc_join_group(sk, &mreq);
757 if (err) 757 if (err && err != -EADDRINUSE)
758 break; 758 break;
759 greqs.gsr_interface = mreq.imr_ifindex; 759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE; 760 omode = MCAST_INCLUDE;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f2509034ce72..d2bf8e1930a3 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void)
1149 ic_rarp_cleanup(); 1149 ic_rarp_cleanup();
1150#endif 1150#endif
1151 1151
1152 if (!ic_got_reply) 1152 if (!ic_got_reply) {
1153 ic_myaddr = INADDR_NONE;
1153 return -1; 1154 return -1;
1155 }
1154 1156
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", 1157 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP" 1158 ((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e4f809a93f47..7833d920bdba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
297static void ipmr_destroy_unres(struct mfc_cache *c) 297static void ipmr_destroy_unres(struct mfc_cache *c)
298{ 298{
299 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct nlmsgerr *e;
300 301
301 atomic_dec(&cache_resolve_queue_len); 302 atomic_dec(&cache_resolve_queue_len);
302 303
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
306 nlh->nlmsg_type = NLMSG_ERROR; 307 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 308 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len); 309 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; 310 e = NLMSG_DATA(nlh);
311 e->error = -ETIMEDOUT;
312 memset(&e->msg, 0, sizeof(e->msg));
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 313 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else 314 } else
312 kfree_skb(skb); 315 kfree_skb(skb);
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 502static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{ 503{
501 struct sk_buff *skb; 504 struct sk_buff *skb;
505 struct nlmsgerr *e;
502 506
503 /* 507 /*
504 * Play the pending entries through our router 508 * Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
515 nlh->nlmsg_type = NLMSG_ERROR; 519 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 520 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len); 521 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; 522 e = NLMSG_DATA(nlh);
523 e->error = -EMSGSIZE;
524 memset(&e->msg, 0, sizeof(e->msg));
519 } 525 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 526 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else 527 } else
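
Both ipmr.c hunks plug the same leak: struct nlmsgerr embeds a copy of the offending nlmsghdr, and previously only the error field was written before netlink_unicast() shipped the skb to userspace, so the embedded header carried stale kernel memory. A userspace analogue of the rule — zero any embedded struct you do not fully fill before it crosses a trust boundary (field names here are illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct msg_hdr { unsigned int len, type, flags, seq, pid; };
    struct msg_err { int error; struct msg_hdr msg; };  /* shape of nlmsgerr */

    static void fill_error(struct msg_err *e, int error)
    {
        e->error = error;
        /* zero the embedded header: unwritten bytes would otherwise
         * carry whatever was previously in this memory */
        memset(&e->msg, 0, sizeof(e->msg));
    }

    int main(void)
    {
        struct msg_err e;

        fill_error(&e, -ETIMEDOUT);
        printf("error=%d len=%u\n", e.error, e.msg.len);
        return 0;
    }
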
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64bb..c9820bfc493a 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
2# IP Virtual Server configuration 2# IP Virtual Server configuration
3# 3#
4menu "IP: Virtual Server Configuration" 4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER 5 depends on NETFILTER
6 6
7config IP_VS 7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)" 8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER 9 depends on NETFILTER
10 ---help--- 10 ---help---
11 IP Virtual Server support will let you build a high-performance 11 IP Virtual Server support will let you build a high-performance
12 virtual server based on cluster of two or more real servers. This 12 virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5499fe..d0145a8b1551 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{ 548{
549 if (del_timer(&cp->timer)) 549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies); 550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552} 551}
553 552
554 553
@@ -759,12 +758,11 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
759 return 1; 758 return 1;
760} 759}
761 760
762 761/* Called from keventd and must protect itself from softirqs */
763void ip_vs_random_dropentry(void) 762void ip_vs_random_dropentry(void)
764{ 763{
765 int idx; 764 int idx;
766 struct ip_vs_conn *cp; 765 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768 766
769 /* 767 /*
770 * Randomly scan 1/32 of the whole table every second 768 * Randomly scan 1/32 of the whole table every second
@@ -775,7 +773,7 @@ void ip_vs_random_dropentry(void)
775 /* 773 /*
776 * Lock is actually needed in this loop. 774 * Lock is actually needed in this loop.
777 */ 775 */
778 ct_write_lock(hash); 776 ct_write_lock_bh(hash);
779 777
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) 779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -801,23 +799,14 @@ void ip_vs_random_dropentry(void)
801 continue; 799 continue;
802 } 800 }
803 801
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n"); 802 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp); 803 ip_vs_conn_expire_now(cp);
814 if (ct) { 804 if (cp->control) {
815 IP_VS_DBG(4, "del conn template\n"); 805 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct); 806 ip_vs_conn_expire_now(cp->control);
817 } 807 }
818 ct_write_lock(hash);
819 } 808 }
820 ct_write_unlock(hash); 809 ct_write_unlock_bh(hash);
821 } 810 }
822} 811}
823 812
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
829{ 818{
830 int idx; 819 int idx;
831 struct ip_vs_conn *cp; 820 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833 821
834 flush_again: 822 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 823 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
839 ct_write_lock_bh(idx); 827 ct_write_lock_bh(idx);
840 828
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 829 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844 830
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n"); 831 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp); 832 ip_vs_conn_expire_now(cp);
849 if (ct) { 833 if (cp->control) {
850 IP_VS_DBG(4, "del conn template\n"); 834 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct); 835 ip_vs_conn_expire_now(cp->control);
852 } 836 }
853 ct_write_lock(idx);
854 } 837 }
855 ct_write_unlock_bh(idx); 838 ct_write_unlock_bh(idx);
856 } 839 }
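
The ip_vs_conn.c rewrite drops the unlock/relock dance inside the bucket walk: ip_vs_conn_expire_now() no longer puts a reference, so the walker can hold the bucket lock across the whole list_for_each_entry(), and since ip_vs_random_dropentry() is called from keventd it now uses the _bh lock variants to keep softirq users out. A userspace sketch of why holding the lock across the walk removes the refcount juggling (names are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    struct conn { struct conn *next; int expired; };

    static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

    static void scan_bucket(struct conn *head)
    {
        struct conn *cp;

        pthread_mutex_lock(&bucket_lock);      /* ct_write_lock_bh() */
        for (cp = head; cp; cp = cp->next) {
            /* never unlock mid-walk, so cp->next stays valid without
             * taking an extra reference on cp */
            cp->expired = 1;
        }
        pthread_mutex_unlock(&bucket_lock);    /* ct_write_unlock_bh() */
    }

    int main(void)
    {
        struct conn c2 = { NULL, 0 }, c1 = { &c2, 0 };

        scan_bucket(&c1);
        printf("%d %d\n", c1.expired, c2.expired);    /* 1 1 */
        return 0;
    }
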
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d9701036e..7d99ede2ef79 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
90#endif 90#endif
91 91
92/* 92/*
93 * update_defense_level is called from keventd and from sysctl. 93 * update_defense_level is called from keventd and from sysctl,
94 * so it needs to protect itself from softirqs
94 */ 95 */
95static void update_defense_level(void) 96static void update_defense_level(void)
96{ 97{
@@ -110,6 +111,8 @@ static void update_defense_level(void)
110 111
111 nomem = (availmem < sysctl_ip_vs_amemthresh); 112 nomem = (availmem < sysctl_ip_vs_amemthresh);
112 113
114 local_bh_disable();
115
113 /* drop_entry */ 116 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock); 117 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) { 118 switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
206 if (to_change >= 0) 209 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 210 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock); 211 write_unlock(&__ip_vs_securetcp_lock);
212
213 local_bh_enable();
209} 214}
210 215
211 216
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360 /* Restore the correct value */ 1365 /* Restore the correct value */
1361 *valp = val; 1366 *valp = val;
1362 } else { 1367 } else {
1363 local_bh_disable();
1364 update_defense_level(); 1368 update_defense_level();
1365 local_bh_enable();
1366 } 1369 }
1367 } 1370 }
1368 return rc; 1371 return rc;
@@ -2059,7 +2062,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 dst->addr = src->addr; 2062 dst->addr = src->addr;
2060 dst->port = src->port; 2063 dst->port = src->port;
2061 dst->fwmark = src->fwmark; 2064 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name); 2065 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags; 2066 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ; 2067 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask; 2068 dst->netmask = src->netmask;
@@ -2080,6 +2083,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2083 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services) 2084 if (count >= get->num_services)
2082 goto out; 2085 goto out;
2086 memset(&entry, 0, sizeof(entry));
2083 ip_vs_copy_service(&entry, svc); 2087 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count], 2088 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) { 2089 &entry, sizeof(entry))) {
@@ -2094,6 +2098,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2098 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services) 2099 if (count >= get->num_services)
2096 goto out; 2100 goto out;
2101 memset(&entry, 0, sizeof(entry));
2097 ip_vs_copy_service(&entry, svc); 2102 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count], 2103 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) { 2104 &entry, sizeof(entry))) {
@@ -2304,12 +2309,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2304 memset(&d, 0, sizeof(d)); 2309 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2310 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER; 2311 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); 2312 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2308 d[0].syncid = ip_vs_master_syncid; 2313 d[0].syncid = ip_vs_master_syncid;
2309 } 2314 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2315 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP; 2316 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); 2317 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2313 d[1].syncid = ip_vs_backup_syncid; 2318 d[1].syncid = ip_vs_backup_syncid;
2314 } 2319 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0) 2320 if (copy_to_user(user, &d, sizeof(d)) != 0)
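
The ip_vs_ctl.c hunks above (and the ip_vs_sync.c hunk that follows) share a hardening theme: strlcpy() bounds each scheduler and interface name copy by its destination size, and zeroing entry before ip_vs_copy_service() keeps copy_to_user() from shipping uninitialized stack bytes. strlcpy is a kernel/BSD helper rather than ISO C, so this sketch carries a minimal local version for illustration:

    #include <stdio.h>
    #include <string.h>

    /* minimal strlcpy: always NUL-terminates, returns strlen(src) so
     * the caller can detect truncation (return >= size) */
    static size_t my_strlcpy(char *dst, const char *src, size_t size)
    {
        size_t len = strlen(src);

        if (size) {
            size_t n = len >= size ? size - 1 : len;

            memcpy(dst, src, n);
            dst[n] = '\0';
        }
        return len;
    }

    int main(void)
    {
        char ifname[8];

        if (my_strlcpy(ifname, "very-long-interface-name", sizeof(ifname))
            >= sizeof(ifname))
            fprintf(stderr, "name truncated\n");
        printf("%s\n", ifname);
        return 0;
    }
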
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a32..574d1f509b46 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
839 839
840 ip_vs_sync_state |= state; 840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) { 841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn); 842 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
843 ip_vs_master_syncid = syncid; 843 ip_vs_master_syncid = syncid;
844 } else { 844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); 845 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid; 846 ip_vs_backup_syncid = syncid;
847 } 847 }
848 848
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 42dc95102873..1dd824f3cf0a 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
432 const struct net_device *out, 432 const struct net_device *out,
433 int (*okfn)(struct sk_buff *)) 433 int (*okfn)(struct sk_buff *))
434{ 434{
435#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
436 /* Previously seen (loopback)? Ignore. Do this before
437 fragment check. */
438 if ((*pskb)->nfct)
439 return NF_ACCEPT;
440#endif
441
435 /* Gather fragments. */ 442 /* Gather fragments. */
436 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 443 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
437 *pskb = ip_ct_gather_frags(*pskb, 444 *pskb = ip_ct_gather_frags(*pskb,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 9cde8c61f525..6706d3a1bc4f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -30,7 +30,7 @@
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32 32
33#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.7"
34 34
35#define DEBUG_CLUSTERIP 35#define DEBUG_CLUSTERIP
36 36
@@ -524,8 +524,9 @@ arp_mangle(unsigned int hook,
524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
525 return NF_ACCEPT; 525 return NF_ACCEPT;
526 526
527 /* we only want to mangle arp replies */ 527 /* we only want to mangle arp requests and replies */
528 if (arp->ar_op != htons(ARPOP_REPLY)) 528 if (arp->ar_op != htons(ARPOP_REPLY)
529 && arp->ar_op != htons(ARPOP_REQUEST))
529 return NF_ACCEPT; 530 return NF_ACCEPT;
530 531
531 payload = (void *)(arp+1); 532 payload = (void *)(arp+1);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 80cf633d9f4a..d675ff80b04d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648 "%u.%u.%u.%u, on dev %s\n", 1686 "%u.%u.%u.%u, on dev %s\n",
1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650 if (dev->hard_header_len) { 1688 if (dev->hard_header_len && skb->mac.raw) {
1651 int i; 1689 int i;
1652 unsigned char *p = skb->mac.raw; 1690 unsigned char *p = skb->mac.raw;
1653 printk(KERN_WARNING "ll header: "); 1691 printk(KERN_WARNING "ll header: ");
@@ -1909,7 +1947,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1909 */ 1947 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev)) 1949 if (!IN_DEV_FORWARD(in_dev))
1912 goto e_inval; 1950 goto e_hostunreach;
1913 goto no_route; 1951 goto no_route;
1914 } 1952 }
1915 free_res = 1; 1953 free_res = 1;
@@ -1933,7 +1971,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1933 } 1971 }
1934 1972
1935 if (!IN_DEV_FORWARD(in_dev)) 1973 if (!IN_DEV_FORWARD(in_dev))
1936 goto e_inval; 1974 goto e_hostunreach;
1937 if (res.type != RTN_UNICAST) 1975 if (res.type != RTN_UNICAST)
1938 goto martian_destination; 1976 goto martian_destination;
1939 1977
@@ -2025,6 +2063,11 @@ martian_destination:
2025 "%u.%u.%u.%u, dev %s\n", 2063 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif 2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2028e_inval: 2071e_inval:
2029 err = -EINVAL; 2072 err = -EINVAL;
2030 goto done; 2073 goto done;
@@ -3068,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3068 3111
3069int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3070{ 3113{
3071 int i, order, goal, rc = 0; 3114 int rc = 0;
3072 3115
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3075 3118
3076#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3077 for (order = 0; 3122 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */; 3124 /* NOTHING */;
@@ -3081,6 +3126,7 @@ int __init ip_rt_init(void)
3081 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3084#endif 3130#endif
3085 3131
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3091,36 +3137,19 @@ int __init ip_rt_init(void)
3091 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3093 3139
3094 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3095 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3097 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3098 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3099 3145 (27 - PAGE_SHIFT) :
3100 do { 3146 (29 - PAGE_SHIFT),
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3102 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3103 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3104 rt_hash_mask--; 3150 0);
3105 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3106 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124 3153
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
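
The largest route.c change replaces the per-bucket spinlock with lock striping: struct rt_hash_bucket shrinks to a single chain pointer, and a power-of-two table of at most 4096 spinlocks (scaled by NR_CPUS) is shared by all buckets through rt_hash_lock_addr(slot); on UP builds the table compiles away entirely. Distinct buckets may map to the same lock, which stays correct as long as every path locks a bucket through the same mapping. A userspace sketch with illustrative sizes:

    #include <pthread.h>
    #include <stdio.h>

    #define HASH_SZ 1024
    #define LOCK_SZ 256    /* power of two; many buckets share one lock */

    struct bucket { void *chain; };    /* one pointer, like the new rt_hash_bucket */

    static struct bucket table[HASH_SZ];
    static pthread_mutex_t locks[LOCK_SZ];

    static pthread_mutex_t *lock_addr(unsigned int slot)
    {
        return &locks[slot & (LOCK_SZ - 1)];    /* rt_hash_lock_addr() */
    }

    int main(void)
    {
        unsigned int i, slot = 777;

        for (i = 0; i < LOCK_SZ; i++)    /* rt_hash_lock_init() */
            pthread_mutex_init(&locks[i], NULL);

        pthread_mutex_lock(lock_addr(slot));
        table[slot].chain = NULL;    /* mutate the bucket under its stripe */
        pthread_mutex_unlock(lock_addr(slot));

        printf("bucket %u -> lock %u\n", slot, slot & (LOCK_SZ - 1));
        return 0;
    }
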
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf0b..e32894532416 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
118 return 1; 118 return 1;
119} 119}
120 120
121static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
122 void __user *buffer, size_t *lenp, loff_t *ppos)
123{
124 char val[TCP_CA_NAME_MAX];
125 ctl_table tbl = {
126 .data = val,
127 .maxlen = TCP_CA_NAME_MAX,
128 };
129 int ret;
130
131 tcp_get_default_congestion_control(val);
132
133 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
134 if (write && ret == 0)
135 ret = tcp_set_default_congestion_control(val);
136 return ret;
137}
138
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
140 void __user *oldval, size_t __user *oldlenp,
141 void __user *newval, size_t newlen,
142 void **context)
143{
144 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = {
146 .data = val,
147 .maxlen = TCP_CA_NAME_MAX,
148 };
149 int ret;
150
151 tcp_get_default_congestion_control(val);
152 ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
153 context);
154 if (ret == 0 && newval && newlen)
155 ret = tcp_set_default_congestion_control(val);
156 return ret;
157}
158
159
121ctl_table ipv4_table[] = { 160ctl_table ipv4_table[] = {
122 { 161 {
123 .ctl_name = NET_IPV4_TCP_TIMESTAMPS, 162 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -612,70 +651,6 @@ ctl_table ipv4_table[] = {
612 .proc_handler = &proc_dointvec, 651 .proc_handler = &proc_dointvec,
613 }, 652 },
614 { 653 {
615 .ctl_name = NET_TCP_WESTWOOD,
616 .procname = "tcp_westwood",
617 .data = &sysctl_tcp_westwood,
618 .maxlen = sizeof(int),
619 .mode = 0644,
620 .proc_handler = &proc_dointvec,
621 },
622 {
623 .ctl_name = NET_TCP_VEGAS,
624 .procname = "tcp_vegas_cong_avoid",
625 .data = &sysctl_tcp_vegas_cong_avoid,
626 .maxlen = sizeof(int),
627 .mode = 0644,
628 .proc_handler = &proc_dointvec,
629 },
630 {
631 .ctl_name = NET_TCP_VEGAS_ALPHA,
632 .procname = "tcp_vegas_alpha",
633 .data = &sysctl_tcp_vegas_alpha,
634 .maxlen = sizeof(int),
635 .mode = 0644,
636 .proc_handler = &proc_dointvec,
637 },
638 {
639 .ctl_name = NET_TCP_VEGAS_BETA,
640 .procname = "tcp_vegas_beta",
641 .data = &sysctl_tcp_vegas_beta,
642 .maxlen = sizeof(int),
643 .mode = 0644,
644 .proc_handler = &proc_dointvec,
645 },
646 {
647 .ctl_name = NET_TCP_VEGAS_GAMMA,
648 .procname = "tcp_vegas_gamma",
649 .data = &sysctl_tcp_vegas_gamma,
650 .maxlen = sizeof(int),
651 .mode = 0644,
652 .proc_handler = &proc_dointvec,
653 },
654 {
655 .ctl_name = NET_TCP_BIC,
656 .procname = "tcp_bic",
657 .data = &sysctl_tcp_bic,
658 .maxlen = sizeof(int),
659 .mode = 0644,
660 .proc_handler = &proc_dointvec,
661 },
662 {
663 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
664 .procname = "tcp_bic_fast_convergence",
665 .data = &sysctl_tcp_bic_fast_convergence,
666 .maxlen = sizeof(int),
667 .mode = 0644,
668 .proc_handler = &proc_dointvec,
669 },
670 {
671 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
672 .procname = "tcp_bic_low_window",
673 .data = &sysctl_tcp_bic_low_window,
674 .maxlen = sizeof(int),
675 .mode = 0644,
676 .proc_handler = &proc_dointvec,
677 },
678 {
679 .ctl_name = NET_TCP_MODERATE_RCVBUF, 654 .ctl_name = NET_TCP_MODERATE_RCVBUF,
680 .procname = "tcp_moderate_rcvbuf", 655 .procname = "tcp_moderate_rcvbuf",
681 .data = &sysctl_tcp_moderate_rcvbuf, 656 .data = &sysctl_tcp_moderate_rcvbuf,
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
692 .proc_handler = &proc_dointvec, 667 .proc_handler = &proc_dointvec,
693 }, 668 },
694 { 669 {
695 .ctl_name = NET_TCP_BIC_BETA, 670 .ctl_name = NET_TCP_CONG_CONTROL,
696 .procname = "tcp_bic_beta", 671 .procname = "tcp_congestion_control",
697 .data = &sysctl_tcp_bic_beta,
698 .maxlen = sizeof(int),
699 .mode = 0644, 672 .mode = 0644,
700 .proc_handler = &proc_dointvec, 673 .maxlen = TCP_CA_NAME_MAX,
674 .proc_handler = &proc_tcp_congestion_control,
675 .strategy = &sysctl_tcp_congestion_control,
701 }, 676 },
677
702 { .ctl_name = 0 } 678 { .ctl_name = 0 }
703}; 679};
704 680
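
The sysctl rework above collapses eight per-algorithm integer knobs into one string, net.ipv4.tcp_congestion_control, whose proc handler validates the name through tcp_set_default_congestion_control(). Selecting the default algorithm then becomes a plain write to procfs, as in this usage sketch (needs root, and the named algorithm must be built in or loadable):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_congestion_control", "w");

        if (!f) {
            perror("fopen");
            return 1;
        }
        fputs("reno\n", f);    /* tcp_set_default_congestion_control("reno") */
        return fclose(f) ? 1 : 0;
    }
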
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd36..ddb6ce4ecff2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
1101 struct sk_buff *skb; 1105 struct sk_buff *skb;
1102 struct tcp_sock *tp = tcp_sk(sk); 1106 struct tcp_sock *tp = tcp_sk(sk);
1103 1107
1104 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); 1108 NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1105 1109
1106 /* RX process wants to run with disabled BHs, though it is not 1110 /* RX process wants to run with disabled BHs, though it is not
1107 * necessary */ 1111 * necessary */
@@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1365 * is not empty. It is more elegant, but eats cycles, 1369 * is not empty. It is more elegant, but eats cycles,
1366 * unfortunately. 1370 * unfortunately.
1367 */ 1371 */
1368 if (skb_queue_len(&tp->ucopy.prequeue)) 1372 if (!skb_queue_empty(&tp->ucopy.prequeue))
1369 goto do_prequeue; 1373 goto do_prequeue;
1370 1374
1371 /* __ Set realtime policy in scheduler __ */ 1375 /* __ Set realtime policy in scheduler __ */
@@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1390 } 1394 }
1391 1395
1392 if (tp->rcv_nxt == tp->copied_seq && 1396 if (tp->rcv_nxt == tp->copied_seq &&
1393 skb_queue_len(&tp->ucopy.prequeue)) { 1397 !skb_queue_empty(&tp->ucopy.prequeue)) {
1394do_prequeue: 1398do_prequeue:
1395 tcp_prequeue_process(sk); 1399 tcp_prequeue_process(sk);
1396 1400
@@ -1472,7 +1476,7 @@ skip_copy:
1472 } while (len > 0); 1476 } while (len > 0);
1473 1477
1474 if (user_recv) { 1478 if (user_recv) {
1475 if (skb_queue_len(&tp->ucopy.prequeue)) { 1479 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1476 int chunk; 1480 int chunk;
1477 1481
1478 tp->ucopy.len = copied > 0 ? len : 0; 1482 tp->ucopy.len = copied > 0 ? len : 0;
@@ -1927,6 +1931,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1927 return tp->af_specific->setsockopt(sk, level, optname, 1931 return tp->af_specific->setsockopt(sk, level, optname,
1928 optval, optlen); 1932 optval, optlen);
1929 1933
1934 /* This is a string value, all the others are int's */
1935 if (optname == TCP_CONGESTION) {
1936 char name[TCP_CA_NAME_MAX];
1937
1938 if (optlen < 1)
1939 return -EINVAL;
1940
1941 val = strncpy_from_user(name, optval,
1942 min(TCP_CA_NAME_MAX-1, optlen));
1943 if (val < 0)
1944 return -EFAULT;
1945 name[val] = 0;
1946
1947 lock_sock(sk);
1948 err = tcp_set_congestion_control(tp, name);
1949 release_sock(sk);
1950 return err;
1951 }
1952
1930 if (optlen < sizeof(int)) 1953 if (optlen < sizeof(int))
1931 return -EINVAL; 1954 return -EINVAL;
1932 1955
@@ -2109,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2109 2132
2110 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2111 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2112 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2113 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2114 2137
2115 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2159,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2159 2182
2160 switch (optname) { 2183 switch (optname) {
2161 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2162 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2163 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2164 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2165 break; 2188 break;
@@ -2211,6 +2234,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2211 case TCP_QUICKACK: 2234 case TCP_QUICKACK:
2212 val = !tp->ack.pingpong; 2235 val = !tp->ack.pingpong;
2213 break; 2236 break;
2237
2238 case TCP_CONGESTION:
2239 if (get_user(len, optlen))
2240 return -EFAULT;
2241 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 if (put_user(len, optlen))
2243 return -EFAULT;
2244 if (copy_to_user(optval, tp->ca_ops->name, len))
2245 return -EFAULT;
2246 return 0;
2214 default: 2247 default:
2215 return -ENOPROTOOPT; 2248 return -ENOPROTOOPT;
2216 }; 2249 };
@@ -2224,7 +2257,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2224 2257
2225 2258
2226extern void __skb_cb_too_small_for_tcp(int, int); 2259extern void __skb_cb_too_small_for_tcp(int, int);
2227extern void tcpdiag_init(void); 2260extern struct tcp_congestion_ops tcp_reno;
2228 2261
2229static __initdata unsigned long thash_entries; 2262static __initdata unsigned long thash_entries;
2230static int __init set_thash_entries(char *str) 2263static int __init set_thash_entries(char *str)
@@ -2333,6 +2366,8 @@ void __init tcp_init(void)
2333 printk(KERN_INFO "TCP: Hash tables configured " 2366 printk(KERN_INFO "TCP: Hash tables configured "
2334 "(established %d bind %d)\n", 2367 "(established %d bind %d)\n",
2335 tcp_ehash_size << 1, tcp_bhash_size); 2368 tcp_ehash_size << 1, tcp_bhash_size);
2369
2370 tcp_register_congestion_control(&tcp_reno);
2336} 2371}
2337 2372
2338EXPORT_SYMBOL(tcp_accept); 2373EXPORT_SYMBOL(tcp_accept);
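
The TCP_CONGESTION option added above takes a string rather than an int. A minimal userspace sketch (not part of this patch) of how it might be exercised; the fallback define mirrors the kernel's value, the 16-byte buffer matches TCP_CA_NAME_MAX, and "bic" assumes the tcp_bic module introduced below is available:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13	/* mirrors include/linux/tcp.h */
#endif

int main(void)
{
	char name[16];			/* TCP_CA_NAME_MAX */
	socklen_t len = sizeof(name);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;

	/* Request BIC; fails with ENOENT if tcp_bic is unavailable. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bic", 3) < 0)
		perror("setsockopt TCP_CONGESTION");

	/* Read back whichever congestion control is actually in use. */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("congestion control: %.*s\n", (int)len, name);
	return 0;
}
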
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000000..ec38d45d6649
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
1/*
2 * Binary Increase Congestion control for TCP
3 *
4 * This is from the implementation of BICTCP in
 5 * Lisong Xu, Khaled Harfoush, and Injong Rhee.
6 * "Binary Increase Congestion Control for Fast, Long Distance
 7 * Networks" in IEEE INFOCOM 2004
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
10 *
11 * Unless BIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
 21#define BICTCP_BETA_SCALE 1024 /* Scale factor for beta calculation:
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28
29static int fast_convergence = 1;
30static int max_increment = 32;
31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100;
36static int smooth_part = 20;
37
38module_param(fast_convergence, int, 0644);
39MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
40module_param(max_increment, int, 0644);
41MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
42module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644);
 45MODULE_PARM_DESC(beta, "beta for multiplicative decrease");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
 49MODULE_PARM_DESC(low_utilization_period, "seconds the low-delay condition must hold before entering low utilization mode");
50module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644);
53MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
54
55
56/* BIC TCP Parameters */
57struct bictcp {
 58	u32	cnt;		/* increase cwnd by 1 after this many ACKs */
59 u32 last_max_cwnd; /* last maximum snd_cwnd */
60 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
71};
72
73static inline void bictcp_reset(struct bictcp *ca)
74{
75 ca->cnt = 0;
76 ca->last_max_cwnd = 0;
77 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0;
79 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87}
88
89static void bictcp_init(struct tcp_sock *tp)
90{
91 bictcp_reset(tcp_ca(tp));
92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh;
94}
95
96/*
97 * Compute congestion window to use.
98 */
99static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
100{
101 if (ca->last_cwnd == cwnd &&
102 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
103 return;
104
105 ca->last_cwnd = cwnd;
106 ca->last_time = tcp_time_stamp;
107
108 if (ca->epoch_start == 0) /* record the beginning of an epoch */
109 ca->epoch_start = tcp_time_stamp;
110
111 /* start off normal */
112 if (cwnd <= low_window) {
113 ca->cnt = cwnd;
114 return;
115 }
116
117 /* binary increase */
118 if (cwnd < ca->last_max_cwnd) {
119 __u32 dist = (ca->last_max_cwnd - cwnd)
120 / BICTCP_B;
121
122 if (dist > max_increment)
123 /* linear increase */
124 ca->cnt = cwnd / max_increment;
125 else if (dist <= 1U)
126 /* binary search increase */
127 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
128 else
129 /* binary search increase */
130 ca->cnt = cwnd / dist;
131 } else {
132		/* slow start and linear increase */
133 if (cwnd < ca->last_max_cwnd + BICTCP_B)
134 /* slow start */
135 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
136 else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
137 /* slow start */
138 ca->cnt = (cwnd * (BICTCP_B-1))
139				/ (cwnd - ca->last_max_cwnd);
140 else
141 /* linear increase */
142 ca->cnt = cwnd / max_increment;
143 }
144
145 /* if in slow start or link utilization is very low */
146	if (ca->loss_cwnd == 0 ||
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20;
150 }
151
152 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
153 if (ca->cnt == 0) /* cannot be zero */
154 ca->cnt = 1;
155}
156
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
160{
161 struct bictcp *ca = tcp_ca(tp);
162 u32 dist, delay;
163
164 /* No time stamp */
165 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
166 /* Discard delay samples right after fast recovery */
167 tcp_time_stamp < ca->epoch_start + HZ ||
168	    /* these delay samples may not be accurate */
169 flag == 0) {
170 ca->last_delay = 0;
171 goto notlow;
172 }
173
174 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
175 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
176 if (delay == 0) /* no previous delay sample */
177 goto notlow;
178
179 /* first time call or link delay decreases */
180 if (ca->delay_min == 0 || ca->delay_min > delay) {
181 ca->delay_min = ca->delay_max = delay;
182 goto notlow;
183 }
184
185 if (ca->delay_max < delay)
186 ca->delay_max = delay;
187
188 /* utilization is low, if avg delay < dist*threshold
189	   for low_utilization_period time */
190 dist = ca->delay_max - ca->delay_min;
191 if (dist <= ca->delay_min>>6 ||
192 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
193 goto notlow;
194
195 if (ca->low_utilization_start == 0) {
196 ca->low_utilization = 0;
197 ca->low_utilization_start = tcp_time_stamp;
198 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
199 > low_utilization_period*HZ) {
200 ca->low_utilization = 1;
201 }
202
203 return;
204
205 notlow:
206 ca->low_utilization = 0;
207 ca->low_utilization_start = 0;
208
209}
210
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked)
213{
214 struct bictcp *ca = tcp_ca(tp);
215
216 bictcp_low_utilization(tp, data_acked);
217
218 if (in_flight < tp->snd_cwnd)
219 return;
220
221 if (tp->snd_cwnd <= tp->snd_ssthresh) {
222 /* In "safe" area, increase. */
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 } else {
226 bictcp_update(ca, tp->snd_cwnd);
227
228 /* In dangerous area, increase slowly.
229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
230 */
231 if (tp->snd_cwnd_cnt >= ca->cnt) {
232 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
233 tp->snd_cwnd++;
234 tp->snd_cwnd_cnt = 0;
235 } else
236 tp->snd_cwnd_cnt++;
237 }
238
239}
240
241/*
242 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly
244 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
246{
247 struct bictcp *ca = tcp_ca(tp);
248
249 ca->epoch_start = 0; /* end of epoch */
250
251 /* in case of wrong delay_max*/
252 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
253 ca->delay_max = ca->delay_min
254 + ((ca->delay_max - ca->delay_min)* 90) / 100;
255
256 /* Wmax and fast convergence */
257 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
258 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
259 / (2 * BICTCP_BETA_SCALE);
260 else
261 ca->last_max_cwnd = tp->snd_cwnd;
262
263 ca->loss_cwnd = tp->snd_cwnd;
264
265
266 if (tp->snd_cwnd <= low_window)
267 return max(tp->snd_cwnd >> 1U, 2U);
268 else
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270}
271
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
273{
274 struct bictcp *ca = tcp_ca(tp);
275
276 return max(tp->snd_cwnd, ca->last_max_cwnd);
277}
278
279static u32 bictcp_min_cwnd(struct tcp_sock *tp)
280{
281 return tp->snd_ssthresh;
282}
283
284static void bictcp_state(struct tcp_sock *tp, u8 new_state)
285{
286 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp));
288}
289
290/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16
292 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
294{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
296 struct bictcp *ca = tcp_ca(tp);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt;
299 }
300}
301
302
303static struct tcp_congestion_ops bictcp = {
304 .init = bictcp_init,
305 .ssthresh = bictcp_recalc_ssthresh,
306 .cong_avoid = bictcp_cong_avoid,
307 .set_state = bictcp_state,
308 .undo_cwnd = bictcp_undo_cwnd,
309 .min_cwnd = bictcp_min_cwnd,
310 .pkts_acked = bictcp_acked,
311 .owner = THIS_MODULE,
312 .name = "bic",
313};
314
315static int __init bictcp_register(void)
316{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp);
319}
320
321static void __exit bictcp_unregister(void)
322{
323 tcp_unregister_congestion_control(&bictcp);
324}
325
326module_init(bictcp_register);
327module_exit(bictcp_unregister);
328
329MODULE_AUTHOR("Stephen Hemminger");
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("BIC TCP");
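
To make the window-growth rules above concrete, the following standalone sketch (ordinary userspace C, not kernel code) mirrors the cnt selection in bictcp_update() with the default module parameters; the delayed-ACK scaling and the low-utilization clamp are omitted for brevity. snd_cwnd then grows by one segment for every cnt ACKs:

#include <stdio.h>

#define BICTCP_B	4
static const unsigned max_increment = 32, low_window = 14, smooth_part = 20;

/* Mirror of the cnt selection in bictcp_update() above. */
static unsigned bic_cnt(unsigned cwnd, unsigned last_max_cwnd)
{
	unsigned dist;

	if (cwnd <= low_window)
		return cwnd;				/* Reno-like region */
	if (cwnd < last_max_cwnd) {
		dist = (last_max_cwnd - cwnd) / BICTCP_B;
		if (dist > max_increment)
			return cwnd / max_increment;	/* linear increase */
		if (dist <= 1)
			return (cwnd * smooth_part) / BICTCP_B;
		return cwnd / dist;			/* binary search */
	}
	if (cwnd < last_max_cwnd + BICTCP_B)
		return (cwnd * smooth_part) / BICTCP_B;	/* slow start */
	if (cwnd < last_max_cwnd + max_increment * (BICTCP_B - 1))
		return (cwnd * (BICTCP_B - 1)) / (cwnd - last_max_cwnd);
	return cwnd / max_increment;			/* max probing */
}

int main(void)
{
	unsigned cwnd;

	for (cwnd = 100; cwnd <= 1600; cwnd *= 2)
		printf("cwnd %4u, last_max 1000 -> +1 every %u ACKs\n",
		       cwnd, bic_cnt(cwnd, 1000));
	return 0;
}
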
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000000..4970d10a7785
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,237 @@
1/*
 2 * Pluggable TCP congestion control support and NewReno
 3 * congestion control.
 4 * Based on ideas from I/O scheduler support and Web100.
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/mm.h>
12#include <linux/types.h>
13#include <linux/list.h>
14#include <net/tcp.h>
15
16static DEFINE_SPINLOCK(tcp_cong_list_lock);
17static LIST_HEAD(tcp_cong_list);
18
19/* Simple linear search, don't expect many entries! */
20static struct tcp_congestion_ops *tcp_ca_find(const char *name)
21{
22 struct tcp_congestion_ops *e;
23
24 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
25 if (strcmp(e->name, name) == 0)
26 return e;
27 }
28
29 return NULL;
30}
31
32/*
 33 * Attach a new congestion control algorithm to the list
34 * of available options.
35 */
36int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
37{
38 int ret = 0;
39
40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name);
44 return -EINVAL;
45 }
46
47 spin_lock(&tcp_cong_list_lock);
48 if (tcp_ca_find(ca->name)) {
49 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
50 ret = -EEXIST;
51 } else {
52 list_add_rcu(&ca->list, &tcp_cong_list);
53 printk(KERN_INFO "TCP %s registered\n", ca->name);
54 }
55 spin_unlock(&tcp_cong_list_lock);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
60
61/*
62 * Remove congestion control algorithm, called from
63 * the module's remove function. Module ref counts are used
64 * to ensure that this can't be done till all sockets using
65 * that method are closed.
66 */
67void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
68{
69 spin_lock(&tcp_cong_list_lock);
70 list_del_rcu(&ca->list);
71 spin_unlock(&tcp_cong_list_lock);
72}
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74
75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp)
77{
78 struct tcp_congestion_ops *ca;
79
80 if (tp->ca_ops != &tcp_init_congestion_ops)
81 return;
82
83 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca;
87 break;
88 }
89
90 }
91 rcu_read_unlock();
92
93 if (tp->ca_ops->init)
94 tp->ca_ops->init(tp);
95}
96
97/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp)
99{
100 if (tp->ca_ops->release)
101 tp->ca_ops->release(tp);
102 module_put(tp->ca_ops->owner);
103}
104
105/* Used by sysctl to change default congestion control */
106int tcp_set_default_congestion_control(const char *name)
107{
108 struct tcp_congestion_ops *ca;
109 int ret = -ENOENT;
110
111 spin_lock(&tcp_cong_list_lock);
112 ca = tcp_ca_find(name);
113#ifdef CONFIG_KMOD
114 if (!ca) {
115 spin_unlock(&tcp_cong_list_lock);
116
117 request_module("tcp_%s", name);
118 spin_lock(&tcp_cong_list_lock);
119 ca = tcp_ca_find(name);
120 }
121#endif
122
123 if (ca) {
124 list_move(&ca->list, &tcp_cong_list);
125 ret = 0;
126 }
127 spin_unlock(&tcp_cong_list_lock);
128
129 return ret;
130}
131
132/* Get current default congestion control */
133void tcp_get_default_congestion_control(char *name)
134{
135 struct tcp_congestion_ops *ca;
136 /* We will always have reno... */
137 BUG_ON(list_empty(&tcp_cong_list));
138
139 rcu_read_lock();
140 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
141 strncpy(name, ca->name, TCP_CA_NAME_MAX);
142 rcu_read_unlock();
143}
144
145/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
147{
148 struct tcp_congestion_ops *ca;
149 int err = 0;
150
151 rcu_read_lock();
152 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops)
154 goto out;
155
156 if (!ca)
157 err = -ENOENT;
158
159 else if (!try_module_get(ca->owner))
160 err = -EBUSY;
161
162 else {
163 tcp_cleanup_congestion_control(tp);
164 tp->ca_ops = ca;
165 if (tp->ca_ops->init)
166 tp->ca_ops->init(tp);
167 }
168 out:
169 rcu_read_unlock();
170 return err;
171}
172
173/*
174 * TCP Reno congestion control
175 * This is a special case, used for fallback as well.
176 */
177/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328.
179 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
181 int flag)
182{
183 if (in_flight < tp->snd_cwnd)
184 return;
185
186 if (tp->snd_cwnd <= tp->snd_ssthresh) {
187 /* In "safe" area, increase. */
188 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
189 tp->snd_cwnd++;
190 } else {
191 /* In dangerous area, increase slowly.
192 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
193 */
194 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
195 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
196 tp->snd_cwnd++;
197 tp->snd_cwnd_cnt = 0;
198 } else
199 tp->snd_cwnd_cnt++;
200 }
201}
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203
204/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp)
206{
207 return max(tp->snd_cwnd >> 1U, 2U);
208}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210
211/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
213{
214 return tp->snd_ssthresh/2;
215}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
217
218struct tcp_congestion_ops tcp_reno = {
219 .name = "reno",
220 .owner = THIS_MODULE,
221 .ssthresh = tcp_reno_ssthresh,
222 .cong_avoid = tcp_reno_cong_avoid,
223 .min_cwnd = tcp_reno_min_cwnd,
224};
225
226/* Initial congestion control used (until SYN).
227 * Really Reno under another name so we can tell the difference
228 * during tcp_set_default_congestion_control().
229 */
230struct tcp_congestion_ops tcp_init_congestion_ops = {
231 .name = "",
232 .owner = THIS_MODULE,
233 .ssthresh = tcp_reno_ssthresh,
234 .cong_avoid = tcp_reno_cong_avoid,
235 .min_cwnd = tcp_reno_min_cwnd,
236};
237EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
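
Everything an out-of-tree congestion control needs is exported above. A minimal sketch of such a module, which simply re-badges the exported Reno helpers under a new name ("myreno" and the file name tcp_myreno.c are hypothetical):

#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_myreno = {
	.name		= "myreno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,	/* exported above */
	.cong_avoid	= tcp_reno_cong_avoid,	/* exported above */
	.min_cwnd	= tcp_reno_min_cwnd,	/* exported above */
};

static int __init myreno_register(void)
{
	/* Fails with -EEXIST if the name is already taken. */
	return tcp_register_congestion_control(&tcp_myreno);
}

static void __exit myreno_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_myreno);
}

module_init(myreno_register);
module_exit(myreno_unregister);
MODULE_LICENSE("GPL");

Once loaded, the module is selectable per socket via the TCP_CONGESTION option shown earlier, or system-wide through tcp_set_default_congestion_control().
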
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 634befc07921..f66945cb158f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,15 +42,8 @@ struct tcpdiag_entry
42 42
43static struct sock *tcpnl; 43static struct sock *tcpnl;
44 44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 45#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \ 46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54 47
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 49 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
61 struct nlmsghdr *nlh; 54 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL; 55 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL; 56 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail; 57 unsigned char *b = skb->tail;
66 58
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); 59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
73 if (ext & (1<<(TCPDIAG_INFO-1))) 65 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); 66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75 67
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) 68 if (ext & (1<<(TCPDIAG_CONG-1))) {
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) 69 size_t len = strlen(tp->ca_ops->name);
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); 70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
79 } 73 }
80 r->tcpdiag_family = sk->sk_family; 74 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state; 75 r->tcpdiag_state = sk->sk_state;
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
166 if (info) 160 if (info)
167 tcp_get_info(sk, info); 161 tcp_get_info(sk, info);
168 162
169 if (vinfo) { 163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
170 if (tcp_is_vegas(tp)) { 164 tp->ca_ops->get_info(tp, ext, skb);
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182 165
183 nlh->nlmsg_len = skb->tail - b; 166 nlh->nlmsg_len = skb->tail - b;
184 return skb->len; 167 return skb->len;
185 168
169rtattr_failure:
186nlmsg_failure: 170nlmsg_failure:
187 skb_trim(skb, b - skb->data); 171 skb_trim(skb, b - skb->data);
188 return -1; 172 return -1;
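
The per-algorithm get_info() hook invoked above replaces the inline Vegas/Westwood code that was removed. A hedged sketch of how a Vegas-style module might implement it, reusing the tcpvegas_info layout from the deleted lines ("myca" is a placeholder and the field values are illustrative):

#include <linux/rtnetlink.h>
#include <linux/tcp_diag.h>
#include <net/tcp.h>

static void myca_get_info(struct tcp_sock *tp, u32 ext, struct sk_buff *skb)
{
	if (ext & (1 << (TCPDIAG_VEGASINFO - 1))) {
		struct tcpvegas_info *info;

		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
					  sizeof(*info)));
		info->tcpv_enabled = 0;				/* illustrative */
		info->tcpv_rttcnt = 0;				/* illustrative */
		info->tcpv_rtt = jiffies_to_usecs(tp->srtt >> 3);
		info->tcpv_minrtt = 0;				/* illustrative */
	rtattr_failure: ;	/* jumped to by __RTA_PUT if the skb is full */
	}
}
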
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000000..36c51f8136bf
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
1/*
2 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
3 *
4 * See http://www.icir.org/floyd/hstcp.html
5 *
6 * John Heffner <jheffner@psc.edu>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <net/tcp.h>
12
13
14/* From AIMD tables from RFC 3649 appendix B,
15 * with fixed-point MD scaled <<8.
16 */
17static const struct hstcp_aimd_val {
18 unsigned int cwnd;
19 unsigned int md;
20} hstcp_aimd_vals[] = {
21 { 38, 128, /* 0.50 */ },
22 { 118, 112, /* 0.44 */ },
23 { 221, 104, /* 0.41 */ },
24 { 347, 98, /* 0.38 */ },
25 { 495, 93, /* 0.37 */ },
26 { 663, 89, /* 0.35 */ },
27 { 851, 86, /* 0.34 */ },
28 { 1058, 83, /* 0.33 */ },
29 { 1284, 81, /* 0.32 */ },
30 { 1529, 78, /* 0.31 */ },
31 { 1793, 76, /* 0.30 */ },
32 { 2076, 74, /* 0.29 */ },
33 { 2378, 72, /* 0.28 */ },
34 { 2699, 71, /* 0.28 */ },
35 { 3039, 69, /* 0.27 */ },
36 { 3399, 68, /* 0.27 */ },
37 { 3778, 66, /* 0.26 */ },
38 { 4177, 65, /* 0.26 */ },
39 { 4596, 64, /* 0.25 */ },
40 { 5036, 62, /* 0.25 */ },
41 { 5497, 61, /* 0.24 */ },
42 { 5979, 60, /* 0.24 */ },
43 { 6483, 59, /* 0.23 */ },
44 { 7009, 58, /* 0.23 */ },
45 { 7558, 57, /* 0.22 */ },
46 { 8130, 56, /* 0.22 */ },
47 { 8726, 55, /* 0.22 */ },
48 { 9346, 54, /* 0.21 */ },
49 { 9991, 53, /* 0.21 */ },
50 { 10661, 52, /* 0.21 */ },
51 { 11358, 52, /* 0.20 */ },
52 { 12082, 51, /* 0.20 */ },
53 { 12834, 50, /* 0.20 */ },
54 { 13614, 49, /* 0.19 */ },
55 { 14424, 48, /* 0.19 */ },
56 { 15265, 48, /* 0.19 */ },
57 { 16137, 47, /* 0.19 */ },
58 { 17042, 46, /* 0.18 */ },
59 { 17981, 45, /* 0.18 */ },
60 { 18955, 45, /* 0.18 */ },
61 { 19965, 44, /* 0.17 */ },
62 { 21013, 43, /* 0.17 */ },
63 { 22101, 43, /* 0.17 */ },
64 { 23230, 42, /* 0.17 */ },
65 { 24402, 41, /* 0.16 */ },
66 { 25618, 41, /* 0.16 */ },
67 { 26881, 40, /* 0.16 */ },
68 { 28193, 39, /* 0.16 */ },
69 { 29557, 39, /* 0.15 */ },
70 { 30975, 38, /* 0.15 */ },
71 { 32450, 38, /* 0.15 */ },
72 { 33986, 37, /* 0.15 */ },
73 { 35586, 36, /* 0.14 */ },
74 { 37253, 36, /* 0.14 */ },
75 { 38992, 35, /* 0.14 */ },
76 { 40808, 35, /* 0.14 */ },
77 { 42707, 34, /* 0.13 */ },
78 { 44694, 33, /* 0.13 */ },
79 { 46776, 33, /* 0.13 */ },
80 { 48961, 32, /* 0.13 */ },
81 { 51258, 32, /* 0.13 */ },
82 { 53677, 31, /* 0.12 */ },
83 { 56230, 30, /* 0.12 */ },
84 { 58932, 30, /* 0.12 */ },
85 { 61799, 29, /* 0.12 */ },
86 { 64851, 28, /* 0.11 */ },
87 { 68113, 28, /* 0.11 */ },
88 { 71617, 27, /* 0.11 */ },
89 { 75401, 26, /* 0.10 */ },
90 { 79517, 26, /* 0.10 */ },
91 { 84035, 25, /* 0.10 */ },
92 { 89053, 24, /* 0.10 */ },
93};
94
95#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
96
97struct hstcp {
98 u32 ai;
99};
100
101static void hstcp_init(struct tcp_sock *tp)
102{
103 struct hstcp *ca = tcp_ca(tp);
104
105 ca->ai = 0;
106
107 /* Ensure the MD arithmetic works. This is somewhat pedantic,
108 * since I don't think we will see a cwnd this large. :) */
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110}
111
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
113 u32 in_flight, int good)
114{
115 struct hstcp *ca = tcp_ca(tp);
116
117 if (in_flight < tp->snd_cwnd)
118 return;
119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) {
121 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
122 tp->snd_cwnd++;
123 } else {
124 /* Update AIMD parameters */
125		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
126			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
127			       ca->ai < HSTCP_AIMD_MAX - 1)
128				ca->ai++;
129		} else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
130			while (ca->ai &&
131			       tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
132				ca->ai--;
133		}
134
135 /* Do additive increase */
136 if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
137 tp->snd_cwnd_cnt += ca->ai;
138 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
139 tp->snd_cwnd++;
140 tp->snd_cwnd_cnt -= tp->snd_cwnd;
141 }
142 }
143 }
144}
145
146static u32 hstcp_ssthresh(struct tcp_sock *tp)
147{
148 struct hstcp *ca = tcp_ca(tp);
149
150 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
152}
153
154
155static struct tcp_congestion_ops tcp_highspeed = {
156 .init = hstcp_init,
157 .ssthresh = hstcp_ssthresh,
158 .cong_avoid = hstcp_cong_avoid,
159 .min_cwnd = tcp_reno_min_cwnd,
160
161 .owner = THIS_MODULE,
162 .name = "highspeed"
163};
164
165static int __init hstcp_register(void)
166{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed);
169}
170
171static void __exit hstcp_unregister(void)
172{
173 tcp_unregister_congestion_control(&tcp_highspeed);
174}
175
176module_init(hstcp_register);
177module_exit(hstcp_unregister);
178
179MODULE_AUTHOR("John Heffner");
180MODULE_LICENSE("GPL");
181MODULE_DESCRIPTION("High Speed TCP");
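
A small standalone sketch (not kernel code) of what the AIMD table above yields: given a cwnd, the index walk from hstcp_cong_avoid() and the multiplicative decrease applied by hstcp_ssthresh(). The table is truncated to its first five rows for brevity:

#include <stdio.h>

struct row { unsigned cwnd, md; };
static const struct row vals[] = {
	{ 38, 128 }, { 118, 112 }, { 221, 104 }, { 347, 98 }, { 495, 93 },
};
#define N (sizeof(vals) / sizeof(vals[0]))

int main(void)
{
	unsigned cwnd = 300, ai = 0;

	/* Same upward walk as hstcp_cong_avoid(). */
	while (ai < N - 1 && cwnd > vals[ai].cwnd)
		ai++;

	/* Same arithmetic as hstcp_ssthresh(): cwnd - cwnd*md/256. */
	printf("cwnd %u: ai=%u, after loss -> %u\n",
	       cwnd, ai, cwnd - ((cwnd * vals[ai].md) >> 8));
	return 0;
}
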
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000000..40168275acf9
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
1/*
2 * H-TCP congestion control. The algorithm is detailed in:
3 * R.N.Shorten, D.J.Leith:
4 * "H-TCP: TCP for high-speed and long-distance networks"
5 * Proc. PFLDnet, Argonne, 2004.
6 * http://www.hamilton.ie/net/htcp3.pdf
7 */
8
9#include <linux/config.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <net/tcp.h>
13
14#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
15#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
16#define BETA_MAX 102 /* 0.8 with shift << 7 */
17
18static int use_rtt_scaling = 1;
19module_param(use_rtt_scaling, int, 0644);
20MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
21
22static int use_bandwidth_switch = 1;
23module_param(use_bandwidth_switch, int, 0644);
24MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
25
26struct htcp {
27 u16 alpha; /* Fixed point arith, << 7 */
28 u8 beta; /* Fixed point arith, << 7 */
 29	u8	modeswitch;	/* Delay modeswitch until we have had at least one congestion event */
30 u8 ccount; /* Number of RTTs since last congestion event */
31 u8 undo_ccount;
32 u16 packetcount;
33 u32 minRTT;
34 u32 maxRTT;
35 u32 snd_cwnd_cnt2;
36
37 u32 undo_maxRTT;
38 u32 undo_old_maxB;
39
40 /* Bandwidth estimation */
41 u32 minB;
42 u32 maxB;
43 u32 old_maxB;
44 u32 Bi;
45 u32 lasttime;
46};
47
48static inline void htcp_reset(struct htcp *ca)
49{
50 ca->undo_ccount = ca->ccount;
51 ca->undo_maxRTT = ca->maxRTT;
52 ca->undo_old_maxB = ca->old_maxB;
53
54 ca->ccount = 0;
55 ca->snd_cwnd_cnt2 = 0;
56}
57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp)
59{
60 struct htcp *ca = tcp_ca(tp);
61 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65}
66
67static inline void measure_rtt(struct tcp_sock *tp)
68{
69 struct htcp *ca = tcp_ca(tp);
70 u32 srtt = tp->srtt>>3;
71
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */
73 if (ca->minRTT > srtt || !ca->minRTT)
74 ca->minRTT = srtt;
75
76 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
81 ca->maxRTT = srtt;
82 }
83}
84
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
86{
87 struct htcp *ca = tcp_ca(tp);
88 u32 now = tcp_time_stamp;
89
90 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0;
93 ca->lasttime = now;
94 return;
95 }
96
97 ca->packetcount += pkts_acked;
98
99 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
100 && now - ca->lasttime >= ca->minRTT
101 && ca->minRTT > 0) {
102 __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
103 if (ca->ccount <= 3) {
104 /* just after backoff */
105 ca->minB = ca->maxB = ca->Bi = cur_Bi;
106 } else {
107 ca->Bi = (3*ca->Bi + cur_Bi)/4;
108 if (ca->Bi > ca->maxB)
109 ca->maxB = ca->Bi;
110 if (ca->minB > ca->maxB)
111 ca->minB = ca->maxB;
112 }
113 ca->packetcount = 0;
114 ca->lasttime = now;
115 }
116}
117
118static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
119{
120 if (use_bandwidth_switch) {
121 u32 maxB = ca->maxB;
122 u32 old_maxB = ca->old_maxB;
123 ca->old_maxB = ca->maxB;
124
125 if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
126 ca->beta = BETA_MIN;
127 ca->modeswitch = 0;
128 return;
129 }
130 }
131
132 if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
133 ca->beta = (minRTT<<7)/maxRTT;
134 if (ca->beta < BETA_MIN)
135 ca->beta = BETA_MIN;
136 else if (ca->beta > BETA_MAX)
137 ca->beta = BETA_MAX;
138 } else {
139 ca->beta = BETA_MIN;
140 ca->modeswitch = 1;
141 }
142}
143
144static inline void htcp_alpha_update(struct htcp *ca)
145{
146 u32 minRTT = ca->minRTT;
147 u32 factor = 1;
148 u32 diff = ca->ccount * minRTT; /* time since last backoff */
149
150 if (diff > HZ) {
151 diff -= HZ;
152 factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
153 }
154
155 if (use_rtt_scaling && minRTT) {
156 u32 scale = (HZ<<3)/(10*minRTT);
157 scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
158 factor = (factor<<3)/scale;
159 if (!factor)
160 factor = 1;
161 }
162
163 ca->alpha = 2*factor*((1<<7)-ca->beta);
164 if (!ca->alpha)
165 ca->alpha = ALPHA_BASE;
166}
167
168/* After we have the rtt data to calculate beta, we'd still prefer to wait one
169 * rtt before we adjust our beta to ensure we are working from
170 * consistent data.
171 *
172 * This function should be called when we hit a congestion event since only at
173 * that point do we really have a sense of maxRTT (the queues along the
174 * path have just become too full).
175 */
176static void htcp_param_update(struct tcp_sock *tp)
177{
178 struct htcp *ca = tcp_ca(tp);
179 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT;
181
182 htcp_beta_update(ca, minRTT, maxRTT);
183 htcp_alpha_update(ca);
184
185 /* add slowly fading memory for maxRTT to accommodate routing changes etc */
186 if (minRTT > 0 && maxRTT > minRTT)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188}
189
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
191{
192 struct htcp *ca = tcp_ca(tp);
193 htcp_param_update(tp);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195}
196
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked)
199{
200 struct htcp *ca = tcp_ca(tp);
201
202 if (in_flight < tp->snd_cwnd)
203 return;
204
205 if (tp->snd_cwnd <= tp->snd_ssthresh) {
206 /* In "safe" area, increase. */
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++;
209 } else {
210 measure_rtt(tp);
211
212 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
214 ca->ccount++;
215 ca->snd_cwnd_cnt2 = 0;
216 htcp_alpha_update(ca);
217 }
218
219 /* In dangerous area, increase slowly.
220 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
221 */
222 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 tp->snd_cwnd_cnt = 0;
226 ca->ccount++;
227 }
228 }
229}
230
231/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp)
233{
234 return tp->snd_ssthresh;
235}
236
237
238static void htcp_init(struct tcp_sock *tp)
239{
240 struct htcp *ca = tcp_ca(tp);
241
242 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN;
245}
246
247static void htcp_state(struct tcp_sock *tp, u8 new_state)
248{
249 switch (new_state) {
250 case TCP_CA_CWR:
251 case TCP_CA_Recovery:
252 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp));
254 break;
255 }
256}
257
258static struct tcp_congestion_ops htcp = {
259 .init = htcp_init,
260 .ssthresh = htcp_recalc_ssthresh,
261 .min_cwnd = htcp_min_cwnd,
262 .cong_avoid = htcp_cong_avoid,
263 .set_state = htcp_state,
264 .undo_cwnd = htcp_cwnd_undo,
265 .pkts_acked = measure_achieved_throughput,
266 .owner = THIS_MODULE,
267 .name = "htcp",
268};
269
270static int __init htcp_register(void)
271{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL;
276 return tcp_register_congestion_control(&htcp);
277}
278
279static void __exit htcp_unregister(void)
280{
281 tcp_unregister_congestion_control(&htcp);
282}
283
284module_init(htcp_register);
285module_exit(htcp_unregister);
286
287MODULE_AUTHOR("Baruch Even");
288MODULE_LICENSE("GPL");
289MODULE_DESCRIPTION("H-TCP");
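
A standalone sketch (not kernel code) of the alpha schedule implemented in htcp_alpha_update() above, with RTT scaling disabled and beta fixed at BETA_MIN. After the first second since backoff, factor is approximately 1 + 10t + t^2/4, where t is the number of seconds beyond that first second; alpha is kept in <<7 fixed point:

#include <stdio.h>

#define HZ 1000
#define BETA_MIN (1 << 6)	/* 0.5 in <<7 fixed point */

static unsigned alpha(unsigned diff)	/* diff = ccount*minRTT, jiffies */
{
	unsigned factor = 1;

	/* Same arithmetic as htcp_alpha_update() above. */
	if (diff > HZ) {
		diff -= HZ;
		factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
	}
	return 2 * factor * ((1 << 7) - BETA_MIN);
}

int main(void)
{
	unsigned t;

	for (t = 0; t <= 4; t++)	/* seconds since last backoff */
		printf("t=%us alpha=%u (~%u segments per RTT)\n",
		       t, alpha(t * HZ), alpha(t * HZ) >> 7);
	return 0;
}
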
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000000..13a66342c304
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
1/*
2 * TCP HYBLA
3 *
4 * TCP-HYBLA Congestion control algorithm, based on:
 5 * C. Caini, R. Firrincieli, "TCP-Hybla: A TCP Enhancement
 6 * for Heterogeneous Networks",
 7 * International Journal of Satellite Communications and Networking,
 8 * September 2004
9 * Daniele Lacamera
10 * root at danielinux.net
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <net/tcp.h>
16
17/* Tcp Hybla structure. */
18struct hybla {
19 u8 hybla_en;
20 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
21 u32 rho; /* Rho parameter, integer part */
22 u32 rho2; /* Rho * Rho, integer part */
23 u32 rho_3ls; /* Rho parameter, <<3 */
24 u32 rho2_7ls; /* Rho^2, <<7 */
25 u32 minrtt; /* Minimum smoothed round trip time value seen */
26};
27
28/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
29 expressed in jiffies */
30static int rtt0 = 25;
31module_param(rtt0, int, 0644);
 32MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33
34
35/* This is called to refresh values for hybla parameters */
 36static inline void hybla_recalc_param(struct tcp_sock *tp)
37{
38 struct hybla *ca = tcp_ca(tp);
39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7;
44}
45
46static void hybla_init(struct tcp_sock *tp)
47{
48 struct hybla *ca = tcp_ca(tp);
49
50 ca->rho = 0;
51 ca->rho2 = 0;
52 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1;
56 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535;
58
59 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp);
61
62 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho;
65}
66
67static void hybla_state(struct tcp_sock *tp, u8 ca_state)
68{
69 struct hybla *ca = tcp_ca(tp);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open);
72}
73
74static inline u32 hybla_fraction(u32 odds)
75{
76 static const u32 fractions[] = {
77 128, 139, 152, 165, 181, 197, 215, 234,
78 };
79
80 return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
81}
82
83/* TCP Hybla main routine.
84 * This is the algorithm behavior:
85 * o Recalc Hybla parameters if min_rtt has changed
86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1
88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
90 u32 in_flight, int flag)
91{
92 struct hybla *ca = tcp_ca(tp);
93 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0;
95
96 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp);
99 ca->minrtt = tp->srtt;
100 }
101
102 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
104
105 if (in_flight < tp->snd_cwnd)
106 return;
107
108 if (ca->rho == 0)
109 hybla_recalc_param(tp);
110
111 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112
113 if (tp->snd_cwnd < tp->snd_ssthresh) {
114 /*
115 * slow start
116 * INC = 2^RHO - 1
117 * This is done by splitting the rho parameter
118 * into 2 parts: an integer part and a fraction part.
119	 * Increment<<7 is estimated by doing:
120 * [2^(int+fract)]<<7
121 * that is equal to:
122 * (2^int) * [(2^fract) <<7]
123	 * 2^int is computed directly as 1<<int,
124	 * while hybla_fraction() is used to
125	 * calculate 2^fract as a <<7 value.
126 */
127 is_slowstart = 1;
128 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
129 - 128;
130 } else {
131 /*
132 * congestion avoidance
133 * INC = RHO^2 / W
134 * as long as increment is estimated as (rho<<7)/window
135 * it already is <<7 and we can easily count its fractions.
136 */
137 increment = ca->rho2_7ls / tp->snd_cwnd;
138 if (increment < 128)
139 tp->snd_cwnd_cnt++;
140 }
141
142 odd = increment % 128;
143 tp->snd_cwnd += increment >> 7;
144 ca->snd_cwnd_cents += odd;
145
146	/* carry the fraction: each time it reaches >= 128, increase cwnd by 1. */
147	while (ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0;
151 }
152
153 /* clamp down slowstart cwnd to ssthresh value. */
154 if (is_slowstart)
155 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
156
157 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
158}
159
160static struct tcp_congestion_ops tcp_hybla = {
161 .init = hybla_init,
162 .ssthresh = tcp_reno_ssthresh,
163 .min_cwnd = tcp_reno_min_cwnd,
164 .cong_avoid = hybla_cong_avoid,
165 .set_state = hybla_state,
166
167 .owner = THIS_MODULE,
168 .name = "hybla"
169};
170
171static int __init hybla_register(void)
172{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla);
175}
176
177static void __exit hybla_unregister(void)
178{
179 tcp_unregister_congestion_control(&tcp_hybla);
180}
181
182module_init(hybla_register);
183module_exit(hybla_unregister);
184
185MODULE_AUTHOR("Daniele Lacamera");
186MODULE_LICENSE("GPL");
187MODULE_DESCRIPTION("TCP Hybla");
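
A standalone sketch (not kernel code) of the Hybla fixed-point arithmetic above: rho = srtt/rtt0 kept <<3, rho^2 kept <<7, and the congestion-avoidance increment rho^2/cwnd with fractional carry as in hybla_cong_avoid(); the snd_cwnd_cnt bookkeeping and the clamps are omitted:

#include <stdio.h>

int main(void)
{
	unsigned srtt_ms = 200, rtt0_ms = 25, cwnd = 20, cents = 0;
	unsigned rho_3ls, rho2_7ls, increment, ack;

	rho_3ls = (srtt_ms << 3) / rtt0_ms;	/* rho << 3, here rho = 8 */
	rho2_7ls = (rho_3ls * rho_3ls) << 1;	/* rho^2 << 7 */

	/* One RTT worth of ACKs in congestion avoidance. */
	for (ack = 0; ack < 20; ack++) {
		increment = rho2_7ls / cwnd;	/* rho^2/cwnd, << 7 */
		cwnd += increment >> 7;		/* integer part */
		cents += increment % 128;	/* fractional part */
		while (cents >= 128) {		/* carry fractions */
			cwnd++;
			cents -= 128;
		}
	}
	printf("rho=%u: cwnd 20 -> %u after one RTT\n", rho_3ls >> 3, cwnd);
	return 0;
}
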
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..53a8a5399f1e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission 61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found. 62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */ 64 */
66 65
67#include <linux/config.h> 66#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE; 87int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto; 88int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93 90
94int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
95 92
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 93#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 95#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
333 tp->snd_cwnd_stamp = tcp_time_stamp; 318 tp->snd_cwnd_stamp = tcp_time_stamp;
334} 319}
335 320
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */ 321/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{ 323{
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
558 tcp_grow_window(sk, tp, skb); 534 tcp_grow_window(sk, tp, skb);
559} 535}
560 536
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this 537/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were 538 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge 539 * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
606 * To save cycles in the RFC 1323 implementation it was better to break 543 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics 544 * it up into three procedures. -- erics
608 */ 545 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) 546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
610{ 547{
611 long m = mrtt; /* RTT */ 548 long m = mrtt; /* RTT */
612 549
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's 550 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev 551 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation. 552 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
670 tp->rtt_seq = tp->snd_nxt; 604 tp->rtt_seq = tp->snd_nxt;
671 } 605 }
672 606
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3); 607 if (tp->ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt);
674} 609}
675 610
676/* Calculate rto without backoff. This is the second half of Van Jacobson's 611/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -805,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
805 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
806 741
807 if (!cwnd) { 742 if (!cwnd) {
808 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
809 cwnd = 2; 744 cwnd = 2;
810 else 745 else
811 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
812 } 747 }
813 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
814} 749}
@@ -979,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
979 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
980 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
981 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
982 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
983 } 918 }
984 919
985 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1142,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1142 (IsFack(tp) || 1077 (IsFack(tp) ||
1143 !before(lost_retrans, 1078 !before(lost_retrans,
1144 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1145 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1146 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1147 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1148 1083
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
1185 tp->snd_una == tp->high_seq || 1120 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1122 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp)) 1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1124 tcp_ca_event(tp, CA_EVENT_FRTO);
1190 } 1125 }
1191 1126
1192 /* Have to clear retransmission markers here to keep the bookkeeping 1127 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
1252 tcp_set_ca_state(tp, TCP_CA_Loss); 1187 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark; 1188 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp); 1189 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257} 1190}
1258 1191
1259void tcp_clear_retrans(struct tcp_sock *tp) 1192void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1218 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1220 tcp_ca_event(tp, CA_EVENT_LOSS);
1287 } 1221 }
1288 tp->snd_cwnd = 1; 1222 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0; 1223 tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1596} 1530}
1597 1531
1598/* Decrease cwnd each second ack. */ 1532/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp) 1533static void tcp_cwnd_down(struct tcp_sock *tp)
1601{ 1534{
1602 int decr = tp->snd_cwnd_cnt + 1; 1535 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
1608 * in packets we use mss_cache). If sysctl_tcp_westwood is off
1609 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616 1536
1617 tp->snd_cwnd_cnt = decr&1; 1537 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1; 1538 decr >>= 1;
1619 1539
1620 if (decr && tp->snd_cwnd > limit) 1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
1621 tp->snd_cwnd -= decr; 1541 tp->snd_cwnd -= decr;
1622 1542
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{ 1575{
1656 if (tp->prior_ssthresh) { 1576 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp)) 1577 if (tp->ca_ops->undo_cwnd)
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); 1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
1659 else 1579 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661 1581
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1767 1687
1768static inline void tcp_complete_cwr(struct tcp_sock *tp) 1688static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{ 1689{
1770 if (tcp_westwood_cwnd(tp)) 1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp; 1691 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
1775} 1693}
1776 1694
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1946 if (tp->ca_state < TCP_CA_CWR) { 1864 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE)) 1865 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1866 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp); 1868 TCP_ECN_queue_cwr(tp);
1951 } 1869 }
1952 1870
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1963/* Read draft-ietf-tcplw-high-performance before mucking 1881/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Superceeds RFC1323) 1882 * with this code. (Superceeds RFC1323)
1965 */ 1883 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) 1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1967{ 1885{
1968 __u32 seq_rtt; 1886 __u32 seq_rtt;
1969 1887
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1983 * in window is lost... Voila. --ANK (010210) 1901 * in window is lost... Voila. --ANK (010210)
1984 */ 1902 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt); 1904 tcp_rtt_estimator(tp, seq_rtt, usrtt);
1987 tcp_set_rto(tp); 1905 tcp_set_rto(tp);
1988 tp->backoff = 0; 1906 tp->backoff = 0;
1989 tcp_bound_rto(tp); 1907 tcp_bound_rto(tp);
1990} 1908}
1991 1909
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) 1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
1993{ 1911{
1994 /* We don't have a timestamp. Can only use 1912 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine 1913 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
2003 if (flag & FLAG_RETRANS_DATA_ACKED) 1921 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return; 1922 return;
2005 1923
2006 tcp_rtt_estimator(tp, seq_rtt); 1924 tcp_rtt_estimator(tp, seq_rtt, usrtt);
2007 tcp_set_rto(tp); 1925 tcp_set_rto(tp);
2008 tp->backoff = 0; 1926 tp->backoff = 0;
2009 tcp_bound_rto(tp); 1927 tcp_bound_rto(tp);
2010} 1928}
2011 1929
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt) 1931 int flag, s32 seq_rtt, u32 *usrtt)
2014{ 1932{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag); 1935 tcp_ack_saw_tstamp(tp, usrtt, flag);
2018 else if (seq_rtt >= 0) 1936 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag); 1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
2020}
2021
2022/*
2023 * Compute congestion window to use.
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 * Networks" in InfoComm 2004
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and congestion window is large
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037 /* orignal Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068 /* slow start and linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083} 1938}
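The removed helper above implements the paper's binary-search increase. As a rough standalone sketch of just that search (the constants below are illustrative stand-ins for BICTCP_B, BICTCP_MAX_INCREMENT and BICTCP_FUNC_OF_MIN_INCR, whose real values live elsewhere in the tree, and the slow-start phase past the old maximum is folded into the linear case):

    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's BIC constants. */
    #define B              4
    #define MAX_INCREMENT 16
    #define MIN_INCR_NUM   5

    /* Per-ACK divisor: cwnd grows by about cwnd/cnt segments per RTT,
     * mirroring the regimes of the removed bictcp_cwnd(). */
    static unsigned int bic_cnt(unsigned int cwnd, unsigned int last_max)
    {
            if (cwnd < last_max) {
                    unsigned int dist = (last_max - cwnd) / B;

                    if (dist > MAX_INCREMENT)   /* far below old max: linear */
                            return cwnd / MAX_INCREMENT;
                    if (dist <= 1)              /* very close: creep slowly */
                            return cwnd * MIN_INCR_NUM / B;
                    return cwnd / dist;         /* binary search increase */
            }
            return cwnd / MAX_INCREMENT;        /* past old max: linear */
    }

    int main(void)
    {
            unsigned int cwnd;

            for (cwnd = 100; cwnd <= 200; cwnd += 25)
                    printf("cwnd=%3u -> cnt=%u\n", cwnd, bic_cnt(cwnd, 200));
            return 0;
    }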
2084 1939
2085/* This is Jacobson's slow start and congestion avoidance. 1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
2086 * SIGCOMM '88, p. 328. 1941 u32 in_flight, int good)
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{ 1942{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) { 1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
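The Reno behaviour now reached through tp->ca_ops->cong_avoid adds one segment per ACK below ssthresh and roughly one segment per window above it. A toy userspace model of that growth, assuming one ACK per in-flight segment per RTT and no loss:

    #include <stdio.h>

    int main(void)
    {
            unsigned int cwnd = 1, ssthresh = 16, cnt = 0, rtt, i;

            for (rtt = 1; rtt <= 12; rtt++) {
                    unsigned int acks = cwnd;       /* one ACK per segment */

                    for (i = 0; i < acks; i++) {
                            if (cwnd <= ssthresh) {
                                    cwnd++;                 /* slow start: +1 per ACK */
                            } else if (++cnt >= cwnd) {     /* CA: +1 per window */
                                    cwnd++;
                                    cnt = 0;
                            }
                    }
                    printf("after rtt %2u: cwnd=%u\n", rtt, cwnd);
            }
            return 0;
    }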
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
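The whole once-per-RTT adjustment above boils down to comparing diff = cwnd - actual_rate * baseRTT, kept in V_PARAM_SHIFT fixed point, against the alpha/beta thresholds. A minimal sketch of the congestion-avoidance branch, with made-up threshold values standing in for the sysctls:

    #include <stdio.h>

    #define V_PARAM_SHIFT 1                     /* fixed point, as above */
    #define ALPHA (2 << V_PARAM_SHIFT)          /* hypothetical sysctl values */
    #define BETA  (4 << V_PARAM_SHIFT)

    /* Next cwnd given last RTT's window, the propagation delay
     * estimate (baseRTT) and the min RTT seen during that window. */
    static unsigned int vegas_next_cwnd(unsigned int old_wnd,
                                        unsigned int base_rtt,
                                        unsigned int min_rtt)
    {
            unsigned int target = ((old_wnd * base_rtt) << V_PARAM_SHIFT) / min_rtt;
            unsigned int diff = (old_wnd << V_PARAM_SHIFT) - target;

            if (diff > BETA)        /* too much queued in the path: back off */
                    return old_wnd - 1;
            if (diff < ALPHA)       /* too little in flight: probe for more */
                    return old_wnd + 1;
            return old_wnd;         /* just right */
    }

    int main(void)
    {
            /* baseRTT 100ms, last min RTT 150ms, window of 20 segments */
            printf("next cwnd = %u\n", vegas_next_cwnd(20, 100, 150));
            return 0;
    }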
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
@@ -2348,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2348 } 1957 }
2349} 1958}
2350 1959
2351/* There is one downside to this scheme. Although we keep the
2352 * ACK clock ticking, adjusting packet counters and advancing
2353 * congestion window, we do not liberate socket send buffer
2354 * space.
2355 *
2356 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
2357 * then making a write space wakeup callback is a possible
2358 * future enhancement. WARNING: it is not trivial to make.
2359 */
2360static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2361 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
2362{ 1962{
@@ -2415,13 +2015,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2015
2416 2016
2417/* Remove acknowledged frames from the retransmission queue. */ 2017/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2018static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2019{
2420 struct tcp_sock *tp = tcp_sk(sk); 2020 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2021 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2022 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2023 int acked = 0;
2424 __s32 seq_rtt = -1; 2024 __s32 seq_rtt = -1;
2025 struct timeval usnow;
2026 u32 pkts_acked = 0;
2027
2028 if (seq_usrtt)
2029 do_gettimeofday(&usnow);
2425 2030
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2031 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2032 skb != sk->sk_send_head) {
@@ -2433,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2433 * the other end. 2038 * the other end.
2434 */ 2039 */
2435 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2436 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2437 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2438 now, &seq_rtt); 2044 now, &seq_rtt);
2439 break; 2045 break;
@@ -2448,6 +2054,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2054 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2055 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2056 acked |= FLAG_DATA_ACKED;
2057 ++pkts_acked;
2451 } else { 2058 } else {
2452 acked |= FLAG_SYN_ACKED; 2059 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2060 tp->retrans_stamp = 0;
@@ -2461,6 +2068,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2068 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2069 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2070 seq_rtt = now - scb->when;
2071 if (seq_usrtt)
2072 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2073 + (usnow.tv_usec - skb->stamp.tv_usec);
2074
2464 if (sacked & TCPCB_SACKED_ACKED) 2075 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2076 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2077 if (sacked & TCPCB_LOST)
@@ -2479,8 +2090,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2090 }
2480 2091
2481 if (acked&FLAG_ACKED) { 2092 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2093 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2094 tcp_ack_packets_out(sk, tp);
2095
2096 if (tp->ca_ops->pkts_acked)
2097 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2098 }
2485 2099
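The tp->ca_ops indirection introduced here replaces per-algorithm branches with a callback table; optional hooks such as pkts_acked are tested for NULL before dispatch, as just above. A rough userspace sketch of that shape (the field names only follow the calls visible in this diff; the authoritative struct is defined in the tcp_cong.c/tcp.h changes, not reproduced here):

    #include <stdio.h>

    struct fake_tcp_sock;   /* stand-in for struct tcp_sock */

    /* Minimal sketch of a pluggable congestion-ops table. */
    struct cong_ops {
            void (*cong_avoid)(struct fake_tcp_sock *tp, unsigned int ack,
                               unsigned int rtt, unsigned int in_flight,
                               int good);
            void (*pkts_acked)(struct fake_tcp_sock *tp,
                               unsigned int num_acked);
    };

    static void reno_avoid(struct fake_tcp_sock *tp, unsigned int ack,
                           unsigned int rtt, unsigned int in_flight, int good)
    {
            printf("reno: ack=%u in_flight=%u\n", ack, in_flight);
    }

    static const struct cong_ops reno_ops = {
            .cong_avoid = reno_avoid,
            /* .pkts_acked left NULL: callers must test before dispatch,
             * as tcp_clean_rtx_queue() does above. */
    };

    int main(void)
    {
            const struct cong_ops *ca_ops = &reno_ops;

            ca_ops->cong_avoid(NULL, 1000, 0, 5, 1);
            if (ca_ops->pkts_acked)
                    ca_ops->pkts_acked(NULL, 3);
            return 0;
    }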
2486#if FASTRETRANS_DEBUG > 0 2100#if FASTRETRANS_DEBUG > 0
@@ -2624,257 +2238,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2238 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2239}
2626 2240
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get any information about RTTmin at this time so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
2636 * conservative, since that way we're sure it will be updated in a consistent
2637 * way as soon as possible. It will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coefficients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
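westwood_do_filter() below is a fixed-gain low-pass filter, new = (7*old + sample)/8, i.e. an exponentially weighted moving average with gain 1/8. A quick standalone check of how such a filter tracks a step change in the bandwidth samples:

    #include <stdio.h>

    static unsigned int filt(unsigned int a, unsigned int b)
    {
            return (7 * a + b) >> 3;    /* same form as westwood_do_filter() */
    }

    int main(void)
    {
            unsigned int est = 100, i;

            /* Samples jump to 200; the estimate closes roughly 1/8 of
             * the remaining gap on each update. */
            for (i = 1; i <= 24; i++) {
                    est = filt(est, 200);
                    if (i % 8 == 0)
                            printf("after %2u samples: %u\n", i, est);
            }
            return 0;
    }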
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * An RTT-window has passed. Be careful since if RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was chosen because estimating over such small
2730 * time intervals is best avoided.
2731 * Obviously on a LAN we reasonably will always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in fast path. In particular it is called when
2771 * header prediction is successful. In that case the update is in fact
2772 * straightforward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something is going wrong... even if there could
2858 * be no problems! In fact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such a case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2241/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2242static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2243{
@@ -2884,6 +2247,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2247 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2248 u32 prior_in_flight;
2886 s32 seq_rtt; 2249 s32 seq_rtt;
2250 s32 seq_usrtt = 0;
2887 int prior_packets; 2251 int prior_packets;
2888 2252
2889 /* If the ack is newer than sent or older than previous acks 2253 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2266,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2266 */
2903 tcp_update_wl(tp, ack, ack_seq); 2267 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2268 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2269 flag |= FLAG_WIN_UPDATE;
2907 2270
2271 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2272
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2273 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2274 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2275 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2285,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2285 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2286 flag |= FLAG_ECE;
2922 2287
2923 tcp_westwood_slow_bw(sk,skb); 2288 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2289 }
2925 2290
2926 /* We passed data and got it acked, remove any soft error 2291 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2300,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2300 prior_in_flight = tcp_packets_in_flight(tp);
2936 2301
2937 /* See if we can take anything off of the retransmit queue. */ 2302 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2303 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2304 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2305
2940 if (tp->frto_counter) 2306 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2307 tcp_process_frto(sk, prior_snd_una);
2942 2308
2943 if (tcp_ack_is_dubious(tp, flag)) { 2309 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advance CWND, if state allows this. */ 2310 /* Advance CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2311 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2312 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2313 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2314 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2315 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2316 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2317 }
2955 2318
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2319 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -3439,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
3439 int this_sack; 2802 int this_sack;
3440 2803
3441 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 2804 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
3442 if (skb_queue_len(&tp->out_of_order_queue) == 0) { 2805 if (skb_queue_empty(&tp->out_of_order_queue)) {
3443 tp->rx_opt.num_sacks = 0; 2806 tp->rx_opt.num_sacks = 0;
3444 tp->rx_opt.eff_sacks = tp->rx_opt.dsack; 2807 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
3445 return; 2808 return;
@@ -3572,13 +2935,13 @@ queue_and_out:
3572 if(th->fin) 2935 if(th->fin)
3573 tcp_fin(skb, sk, th); 2936 tcp_fin(skb, sk, th);
3574 2937
3575 if (skb_queue_len(&tp->out_of_order_queue)) { 2938 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3576 tcp_ofo_queue(sk); 2939 tcp_ofo_queue(sk);
3577 2940
3578 /* RFC2581. 4.2. SHOULD send immediate ACK, when 2941 /* RFC2581. 4.2. SHOULD send immediate ACK, when
3579 * gap in queue is filled. 2942 * gap in queue is filled.
3580 */ 2943 */
3581 if (!skb_queue_len(&tp->out_of_order_queue)) 2944 if (skb_queue_empty(&tp->out_of_order_queue))
3582 tp->ack.pingpong = 0; 2945 tp->ack.pingpong = 0;
3583 } 2946 }
3584 2947
@@ -3886,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
3886 * This must not ever occur. */ 3249 * This must not ever occur. */
3887 3250
3888 /* First, purge the out_of_order queue. */ 3251 /* First, purge the out_of_order queue. */
3889 if (skb_queue_len(&tp->out_of_order_queue)) { 3252 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3890 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 3253 NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
3891 skb_queue_len(&tp->out_of_order_queue));
3892 __skb_queue_purge(&tp->out_of_order_queue); 3254 __skb_queue_purge(&tp->out_of_order_queue);
3893 3255
3894 /* Reset SACK state. A conforming SACK implementation will 3256 /* Reset SACK state. A conforming SACK implementation will
@@ -3937,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3937 tp->snd_cwnd_stamp = tcp_time_stamp; 3299 tp->snd_cwnd_stamp = tcp_time_stamp;
3938} 3300}
3939 3301
3302static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3303{
3304 /* If the user specified a specific send buffer setting, do
3305 * not modify it.
3306 */
3307 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3308 return 0;
3309
3310 /* If we are under global TCP memory pressure, do not expand. */
3311 if (tcp_memory_pressure)
3312 return 0;
3313
3314 /* If we are under soft global TCP memory pressure, do not expand. */
3315 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3316 return 0;
3317
3318 /* If we filled the congestion window, do not expand. */
3319 if (tp->packets_out >= tp->snd_cwnd)
3320 return 0;
3321
3322 return 1;
3323}
3940 3324
3941/* When incoming ACK allowed to free some skb from write_queue, 3325/* When incoming ACK allowed to free some skb from write_queue,
3942 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3326 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3948,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
3948{ 3332{
3949 struct tcp_sock *tp = tcp_sk(sk); 3333 struct tcp_sock *tp = tcp_sk(sk);
3950 3334
3951 if (tp->packets_out < tp->snd_cwnd && 3335 if (tcp_should_expand_sndbuf(sk, tp)) {
3952 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3336 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3953 !tcp_memory_pressure &&
3954 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3955 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3956 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3337 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3957 demanded = max_t(unsigned int, tp->snd_cwnd, 3338 demanded = max_t(unsigned int, tp->snd_cwnd,
3958 tp->reordering + 1); 3339 tp->reordering + 1);
@@ -3975,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
3975 } 3356 }
3976} 3357}
3977 3358
3978static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3359static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3979{ 3360{
3980 struct tcp_sock *tp = tcp_sk(sk); 3361 tcp_push_pending_frames(sk, tp);
3981
3982 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3983 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3984 tcp_write_xmit(sk, tp->nonagle))
3985 tcp_check_probe_timer(sk, tp);
3986}
3987
3988static __inline__ void tcp_data_snd_check(struct sock *sk)
3989{
3990 struct sk_buff *skb = sk->sk_send_head;
3991
3992 if (skb != NULL)
3993 __tcp_data_snd_check(sk, skb);
3994 tcp_check_space(sk); 3362 tcp_check_space(sk);
3995} 3363}
3996 3364
@@ -4284,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4284 */ 3652 */
4285 tcp_ack(sk, skb, 0); 3653 tcp_ack(sk, skb, 0);
4286 __kfree_skb(skb); 3654 __kfree_skb(skb);
4287 tcp_data_snd_check(sk); 3655 tcp_data_snd_check(sk, tp);
4288 return 0; 3656 return 0;
4289 } else { /* Header too small */ 3657 } else { /* Header too small */
4290 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3658 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -4350,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4350 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3718 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4351 /* Well, only one small jumplet in fast path... */ 3719 /* Well, only one small jumplet in fast path... */
4352 tcp_ack(sk, skb, FLAG_DATA); 3720 tcp_ack(sk, skb, FLAG_DATA);
4353 tcp_data_snd_check(sk); 3721 tcp_data_snd_check(sk, tp);
4354 if (!tcp_ack_scheduled(tp)) 3722 if (!tcp_ack_scheduled(tp))
4355 goto no_ack; 3723 goto no_ack;
4356 } 3724 }
@@ -4428,7 +3796,7 @@ step5:
4428 /* step 7: process the segment text */ 3796 /* step 7: process the segment text */
4429 tcp_data_queue(sk, skb); 3797 tcp_data_queue(sk, skb);
4430 3798
4431 tcp_data_snd_check(sk); 3799 tcp_data_snd_check(sk, tp);
4432 tcp_ack_snd_check(sk); 3800 tcp_ack_snd_check(sk);
4433 return 0; 3801 return 0;
4434 3802
@@ -4552,6 +3920,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3920
4553 tcp_init_metrics(sk); 3921 tcp_init_metrics(sk);
4554 3922
3923 tcp_init_congestion_control(tp);
3924
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3925 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3926 * packet.
4557 */ 3927 */
@@ -4708,9 +4078,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4078 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4079 return 1;
4710 4080
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4081 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4082 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4083 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4099,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4099 goto discard;
4733 4100
4734 case TCP_SYN_SENT: 4101 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4102 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4103 if (queued >= 0)
4740 return queued; 4104 return queued;
@@ -4742,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4742 /* Do step6 onward by hand. */ 4106 /* Do step6 onward by hand. */
4743 tcp_urg(sk, skb, th); 4107 tcp_urg(sk, skb, th);
4744 __kfree_skb(skb); 4108 __kfree_skb(skb);
4745 tcp_data_snd_check(sk); 4109 tcp_data_snd_check(sk, tp);
4746 return 0; 4110 return 0;
4747 } 4111 }
4748 4112
@@ -4816,7 +4180,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4180 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4181 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4182 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4183 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4184
4821 if (tp->rx_opt.tstamp_ok) 4185 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4186 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4192,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4192
4829 tcp_init_metrics(sk); 4193 tcp_init_metrics(sk);
4830 4194
4195 tcp_init_congestion_control(tp);
4196
4831 /* Prevent spurious tcp_cwnd_restart() on 4197 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4198 * first data packet.
4833 */ 4199 */
@@ -4931,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4931 4297
4932 /* tcp_data could move socket to TIME-WAIT */ 4298 /* tcp_data could move socket to TIME-WAIT */
4933 if (sk->sk_state != TCP_CLOSE) { 4299 if (sk->sk_state != TCP_CLOSE) {
4934 tcp_data_snd_check(sk); 4300 tcp_data_snd_check(sk, tp);
4935 tcp_ack_snd_check(sk); 4301 tcp_ack_snd_check(sk);
4936 } 4302 }
4937 4303
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2d41d5d6ad19..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,9 +2045,10 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2045 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2047 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops;
2051 2052
2052 sk->sk_state = TCP_CLOSE; 2053 sk->sk_state = TCP_CLOSE;
2053 2054
@@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2070 2071
2071 tcp_clear_xmit_timers(sk); 2072 tcp_clear_xmit_timers(sk);
2072 2073
2074 tcp_cleanup_congestion_control(tp);
2075
2073 /* Cleanup up the write buffer. */ 2076 /* Cleanup up the write buffer. */
2074 sk_stream_writequeue_purge(sk); 2077 sk_stream_writequeue_purge(sk);
2075 2078
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b3943e7562f3..f42a284164b7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
774 newtp->frto_counter = 0; 774 newtp->frto_counter = 0;
775 newtp->frto_highmark = 0; 775 newtp->frto_highmark = 0;
776 776
777 newtp->ca_ops = &tcp_reno;
778
777 tcp_set_ca_state(newtp, TCP_CA_Open); 779 tcp_set_ca_state(newtp, TCP_CA_Open);
778 tcp_init_xmit_timers(newsk); 780 tcp_init_xmit_timers(newsk);
779 skb_queue_head_init(&newtp->out_of_order_queue); 781 skb_queue_head_init(&newtp->out_of_order_queue);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
842 if (newtp->ecn_flags&TCP_ECN_OK) 844 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND); 845 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844 846
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); 847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 } 848 }
849 return newsk; 849 return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e337..e3f8ea1bfa9c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -141,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
141 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
142} 141}
143 142
144static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 146
148 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150} 149}
151 150
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
@@ -361,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
361 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
362 356
363 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
365 359
366 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -409,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
409 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
410} 404}
411 405
412static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{ 407{
427 struct tcp_sock *tp = tcp_sk(sk); 408 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429 409
430 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { 410 if (skb->len <= tp->mss_cache ||
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
444{
445 struct tcp_sock *tp = tcp_sk(sk);
446
447 if (skb->len <= tp->mss_cache_std ||
448 !(sk->sk_route_caps & NETIF_F_TSO)) { 411 !(sk->sk_route_caps & NETIF_F_TSO)) {
449 /* Avoid the costly divide in the normal 412 /* Avoid the costly divide in the normal
450 * non-TSO case. 413 * non-TSO case.
@@ -454,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
454 } else { 417 } else {
455 unsigned int factor; 418 unsigned int factor;
456 419
457 factor = skb->len + (tp->mss_cache_std - 1); 420 factor = skb->len + (tp->mss_cache - 1);
458 factor /= tp->mss_cache_std; 421 factor /= tp->mss_cache;
459 skb_shinfo(skb)->tso_segs = factor; 422 skb_shinfo(skb)->tso_segs = factor;
460 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 423 skb_shinfo(skb)->tso_size = tp->mss_cache;
461 } 424 }
462} 425}
463 426
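The factor computed above is a ceiling division, segments = ceil(len / mss), obtained by adding mss - 1 before dividing. For instance:

    #include <stdio.h>

    int main(void)
    {
            unsigned int len = 4000, mss = 1448;
            unsigned int factor = (len + mss - 1) / mss;    /* = 3 */

            printf("%u bytes at mss %u -> %u segments\n", len, mss, factor);
            return 0;
    }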
@@ -521,6 +484,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 484 * skbs, which it never sent before. --ANK
522 */ 485 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 486 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
487 buff->stamp = skb->stamp;
524 488
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 489 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 490 tp->lost_out -= tcp_skb_pcount(skb);
@@ -542,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
542 } 506 }
543 507
544 /* Link BUFF into the send queue. */ 508 /* Link BUFF into the send queue. */
509 skb_header_release(buff);
545 __skb_append(skb, buff); 510 __skb_append(skb, buff);
546 511
547 return 0; 512 return 0;
@@ -662,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
662 627
663 /* And store cached results */ 628 /* And store cached results */
664 tp->pmtu_cookie = pmtu; 629 tp->pmtu_cookie = pmtu;
665 tp->mss_cache = tp->mss_cache_std = mss_now; 630 tp->mss_cache = mss_now;
666 631
667 return mss_now; 632 return mss_now;
668} 633}
@@ -674,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
674 * cannot be large. However, taking into account rare use of URG, this 639 * cannot be large. However, taking into account rare use of URG, this
675 * is not a big flaw. 640 * is not a big flaw.
676 */ 641 */
677 642unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
678unsigned int tcp_current_mss(struct sock *sk, int large)
679{ 643{
680 struct tcp_sock *tp = tcp_sk(sk); 644 struct tcp_sock *tp = tcp_sk(sk);
681 struct dst_entry *dst = __sk_dst_get(sk); 645 struct dst_entry *dst = __sk_dst_get(sk);
682 unsigned int do_large, mss_now; 646 u32 mss_now;
647 u16 xmit_size_goal;
648 int doing_tso = 0;
649
650 mss_now = tp->mss_cache;
651
652 if (large_allowed &&
653 (sk->sk_route_caps & NETIF_F_TSO) &&
654 !tp->urg_mode)
655 doing_tso = 1;
683 656
684 mss_now = tp->mss_cache_std;
685 if (dst) { 657 if (dst) {
686 u32 mtu = dst_mtu(dst); 658 u32 mtu = dst_mtu(dst);
687 if (mtu != tp->pmtu_cookie) 659 if (mtu != tp->pmtu_cookie)
688 mss_now = tcp_sync_mss(sk, mtu); 660 mss_now = tcp_sync_mss(sk, mtu);
689 } 661 }
690 662
691 do_large = (large && 663 if (tp->rx_opt.eff_sacks)
692 (sk->sk_route_caps & NETIF_F_TSO) && 664 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
693 !tp->urg_mode); 665 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
694 666
695 if (do_large) { 667 xmit_size_goal = mss_now;
696 unsigned int large_mss, factor, limit;
697 668
698 large_mss = 65535 - tp->af_specific->net_header_len - 669 if (doing_tso) {
670 xmit_size_goal = 65535 -
671 tp->af_specific->net_header_len -
699 tp->ext_header_len - tp->tcp_header_len; 672 tp->ext_header_len - tp->tcp_header_len;
700 673
701 if (tp->max_window && large_mss > (tp->max_window>>1)) 674 if (tp->max_window &&
702 large_mss = max((tp->max_window>>1), 675 (xmit_size_goal > (tp->max_window >> 1)))
703 68U - tp->tcp_header_len); 676 xmit_size_goal = max((tp->max_window >> 1),
677 68U - tp->tcp_header_len);
704 678
705 factor = large_mss / mss_now; 679 xmit_size_goal -= (xmit_size_goal % mss_now);
680 }
681 tp->xmit_size_goal = xmit_size_goal;
706 682
707 /* Always keep large mss multiple of real mss, but 683 return mss_now;
708 * do not exceed 1/tso_win_divisor of the congestion window 684}
709 * so we can keep the ACK clock ticking and minimize 685
710 * bursting. 686/* Congestion window validation. (RFC2861) */
711 */ 687
712 limit = tp->snd_cwnd; 688static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
713 if (sysctl_tcp_tso_win_divisor) 689{
714 limit /= sysctl_tcp_tso_win_divisor; 690 __u32 packets_out = tp->packets_out;
715 limit = max(1U, limit);
716 if (factor > limit)
717 factor = limit;
718 691
719 tp->mss_cache = mss_now * factor; 692 if (packets_out >= tp->snd_cwnd) {
693 /* Network is fed fully. */
694 tp->snd_cwnd_used = 0;
695 tp->snd_cwnd_stamp = tcp_time_stamp;
696 } else {
697 /* Network starves. */
698 if (tp->packets_out > tp->snd_cwnd_used)
699 tp->snd_cwnd_used = tp->packets_out;
720 700
721 mss_now = tp->mss_cache; 701 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
702 tcp_cwnd_application_limited(sk);
722 } 703 }
704}
723 705
724 if (tp->rx_opt.eff_sacks) 706static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
725 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 707{
726 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 708 u32 window, cwnd_len;
727 return mss_now; 709
710 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
711 cwnd_len = mss_now * cwnd;
712 return min(window, cwnd_len);
713}
714
715/* Can at least one segment of SKB be sent right now, according to the
716 * congestion window rules? If so, return how many segments are allowed.
717 */
718static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
719{
720 u32 in_flight, cwnd;
721
722 /* Don't be strict about the congestion window for the final FIN. */
723 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
724 return 1;
725
726 in_flight = tcp_packets_in_flight(tp);
727 cwnd = tp->snd_cwnd;
728 if (in_flight < cwnd)
729 return (cwnd - in_flight);
730
731 return 0;
732}
733
734/* This must be invoked the first time we consider transmitting
735 * SKB onto the wire.
736 */
737static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
738{
739 int tso_segs = tcp_skb_pcount(skb);
740
741 if (!tso_segs) {
742 tcp_set_skb_tso_segs(sk, skb);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violating Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
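tcp_nagle_check() above holds back a sub-MSS segment only when the socket is corked, or when Nagle is enabled and an earlier small packet is still unacknowledged (Minshall's rule, via tcp_minshall_check()). A standalone sketch of the same predicate, with a hypothetical flag value standing in for TCP_NAGLE_CORK:

    #include <stdio.h>

    #define NAGLE_CORK 2    /* hypothetical flag value for this sketch */

    /* Nonzero when the segment must wait (mirrors tcp_nagle_check). */
    static int nagle_holds(unsigned int len, unsigned int mss, int nonagle,
                           int packets_out, int small_pkt_unacked)
    {
            return len < mss &&
                   ((nonagle & NAGLE_CORK) ||
                    (!nonagle && packets_out && small_pkt_unacked));
    }

    int main(void)
    {
            printf("full-sized: %d\n", nagle_holds(1448, 1448, 0, 1, 1));
            printf("small, idle: %d\n", nagle_holds(100, 1448, 0, 0, 0));
            printf("small, unacked small pkt: %d\n",
                   nagle_holds(100, 1448, 0, 3, 1));
            return 0;
    }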
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames which sit in the middle of the
780 * write_queue (they have no chance to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 BUG_ON(skb->len != skb->data_len);
865
866 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
867 if (unlikely(buff == NULL))
868 return -ENOMEM;
869
870 buff->truesize = nlen;
871 skb->truesize -= nlen;
872
873 /* Correct the sequence numbers. */
874 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
875 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
876 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
877
878 /* PSH and FIN should only be set in the second packet. */
879 flags = TCP_SKB_CB(skb)->flags;
880 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
881 TCP_SKB_CB(buff)->flags = flags;
882
883 /* This packet was never sent out yet, so no SACK bits. */
884 TCP_SKB_CB(buff)->sacked = 0;
885
886 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
887 skb_split(skb, buff, len);
888
889 /* Fix up tso_factor for both original and new SKB. */
890 tcp_set_skb_tso_segs(sk, skb);
891 tcp_set_skb_tso_segs(sk, buff);
892
893 /* Link BUFF into the send queue. */
894 skb_header_release(buff);
895 __skb_append(skb, buff);
896
897 return 0;
898}
899
900/* Try to defer sending, if possible, in order to minimize the amount
901 * of TSO splitting we do. View it as a kind of TSO Nagle test.
902 *
903 * This algorithm is from John Heffner.
904 */
905static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
906{
907 u32 send_win, cong_win, limit, in_flight;
908
909 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
910 return 0;
911
912 if (tp->ca_state != TCP_CA_Open)
913 return 0;
914
915 in_flight = tcp_packets_in_flight(tp);
916
917 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
918 (tp->snd_cwnd <= in_flight));
919
920 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
921
922 /* From in_flight test above, we know that cwnd > in_flight. */
923 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
924
925 limit = min(send_win, cong_win);
926
927 /* If sk_send_head can be sent fully now, just do it. */
928 if (skb->len <= limit)
929 return 0;
930
931 if (sysctl_tcp_tso_win_divisor) {
932 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
933
934 /* If at least some fraction of a window is available,
935 * just use it.
936 */
937 chunk /= sysctl_tcp_tso_win_divisor;
938 if (limit >= chunk)
939 return 0;
940 } else {
941 /* Different approach, try not to defer past a single
942 * ACK. Receiver should ACK every other full sized
943 * frame, so if we have space for more than 3 frames
944 * then send now.
945 */
946 if (limit > tcp_max_burst(tp) * tp->mss_cache)
947 return 0;
948 }
949
950 /* Ok, it looks like it is advisable to defer. */
951 return 1;
728} 952}
729 953
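In the sysctl-enabled branch, the deferral decision above weighs what could go out now (the smaller of the send window and cwnd headroom) against a 1/tso_win_divisor share of the window, with the divisor now defaulting to 3 per the change at the top of this file. A standalone sketch of that branch with made-up numbers (the MSS and window sizes below are arbitrary):

    #include <stdio.h>

    #define MSS 1448
    #define TSO_WIN_DIVISOR 3   /* new default from this patch */

    /* Returns 1 to defer, 0 to send now; mirrors the heuristic above
     * for the sysctl-enabled branch only. */
    static int should_defer(unsigned int skb_len, unsigned int send_win,
                            unsigned int cwnd, unsigned int in_flight,
                            unsigned int snd_wnd)
    {
            unsigned int cong_win = (cwnd - in_flight) * MSS;
            unsigned int limit = send_win < cong_win ? send_win : cong_win;
            unsigned int chunk;

            if (skb_len <= limit)       /* fits entirely: send now */
                    return 0;

            chunk = snd_wnd < cwnd * MSS ? snd_wnd : cwnd * MSS;
            chunk /= TSO_WIN_DIVISOR;
            return limit < chunk;       /* defer unless a decent share fits */
    }

    int main(void)
    {
            /* 64KB super-packet, 20-segment cwnd with 15 in flight */
            printf("defer=%d\n", should_defer(65536, 48 * 1024,
                                              20, 15, 64 * 1024));
            return 0;
    }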
730/* This routine writes packets to the network. It advances the 954/* This routine writes packets to the network. It advances the
@@ -734,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
734 * Returns 1, if no segments are in flight and we have queued segments, but 958 * Returns 1, if no segments are in flight and we have queued segments, but
735 * cannot send anything now because of SWS or another problem. 959 * cannot send anything now because of SWS or another problem.
736 */ 960 */
737int tcp_write_xmit(struct sock *sk, int nonagle) 961static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
738{ 962{
739 struct tcp_sock *tp = tcp_sk(sk); 963 struct tcp_sock *tp = tcp_sk(sk);
740 unsigned int mss_now; 964 struct sk_buff *skb;
965 unsigned int tso_segs, sent_pkts;
966 int cwnd_quota;
741 967
742 /* If we are closed, the bytes will have to remain here. 968 /* If we are closed, the bytes will have to remain here.
743 * In time closedown will finish, we empty the write queue and all 969 * In time closedown will finish, we empty the write queue and all
744 * will be happy. 970 * will be happy.
745 */ 971 */
746 if (sk->sk_state != TCP_CLOSE) { 972 if (unlikely(sk->sk_state == TCP_CLOSE))
747 struct sk_buff *skb; 973 return 0;
748 int sent_pkts = 0; 974
975 skb = sk->sk_send_head;
976 if (unlikely(!skb))
977 return 0;
978
979 tso_segs = tcp_init_tso_segs(sk, skb);
980 cwnd_quota = tcp_cwnd_test(tp, skb);
981 if (unlikely(!cwnd_quota))
982 goto out;
983
984 sent_pkts = 0;
985 while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
986 BUG_ON(!tso_segs);
987
988 if (tso_segs == 1) {
989 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
990 (tcp_skb_is_last(sk, skb) ?
991 nonagle : TCP_NAGLE_PUSH))))
992 break;
993 } else {
994 if (tcp_tso_should_defer(sk, tp, skb))
995 break;
996 }
749 997
750 /* Account for SACKS, we may need to fragment due to this. 998 if (tso_segs > 1) {
751 * It is just like the real MSS changing on us midstream. 999 u32 limit = tcp_window_allows(tp, skb,
752 * We also handle things correctly when the user adds some 1000 mss_now, cwnd_quota);
753 * IP options mid-stream. Silly to do, but cover it. 1001
754 */ 1002 if (skb->len < limit) {
755 mss_now = tcp_current_mss(sk, 1); 1003 unsigned int trim = skb->len % mss_now;
756 1004
757 while ((skb = sk->sk_send_head) && 1005 if (trim)
758 tcp_snd_test(sk, skb, mss_now, 1006 limit = skb->len - trim;
759 tcp_skb_is_last(sk, skb) ? nonagle : 1007 }
760 TCP_NAGLE_PUSH)) { 1008 if (skb->len > limit) {
761 if (skb->len > mss_now) { 1009 if (tso_fragment(sk, skb, limit))
762 if (tcp_fragment(sk, skb, mss_now))
763 break; 1010 break;
764 } 1011 }
765 1012 } else if (unlikely(skb->len > mss_now)) {
766 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1013 if (unlikely(tcp_fragment(sk, skb, mss_now)))
767 tcp_tso_set_push(skb);
768 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
769 break; 1014 break;
1015 }
770 1016
771 /* Advance the send_head. This one is sent out. 1017 TCP_SKB_CB(skb)->when = tcp_time_stamp;
772 * This call will increment packets_out. 1018
773 */ 1019 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
774 update_send_head(sk, tp, skb); 1020 break;
1021
1022 /* Advance the send_head. This one is sent out.
1023 * This call will increment packets_out.
1024 */
1025 update_send_head(sk, tp, skb);
1026
1027 tcp_minshall_update(tp, mss_now, skb);
1028 sent_pkts++;
1029
1030 /* Do not optimize this to use tso_segs. If we chopped up
1031 * the packet above, tso_segs will no longer be valid.
1032 */
1033 cwnd_quota -= tcp_skb_pcount(skb);
1034
1035 BUG_ON(cwnd_quota < 0);
1036 if (!cwnd_quota)
1037 break;
1038
1039 skb = sk->sk_send_head;
1040 if (!skb)
1041 break;
1042 tso_segs = tcp_init_tso_segs(sk, skb);
1043 }
1044
1045 if (likely(sent_pkts)) {
1046 tcp_cwnd_validate(sk, tp);
1047 return 0;
1048 }
1049out:
1050 return !tp->packets_out && sk->sk_send_head;
1051}
1052
1053/* Push out any pending frames which were held back due to
1054 * TCP_CORK or attempt at coalescing tiny packets.
1055 * The socket must be locked by the caller.
1056 */
1057void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1058 unsigned int cur_mss, int nonagle)
1059{
1060 struct sk_buff *skb = sk->sk_send_head;
1061
1062 if (skb) {
1063 if (tcp_write_xmit(sk, cur_mss, nonagle))
1064 tcp_check_probe_timer(sk, tp);
1065 }
1066}
1067
1068/* Send _single_ skb sitting at the send head. This function requires
1069 * true push pending frames to setup probe timer etc.
1070 */
1071void tcp_push_one(struct sock *sk, unsigned int mss_now)
1072{
1073 struct tcp_sock *tp = tcp_sk(sk);
1074 struct sk_buff *skb = sk->sk_send_head;
1075 unsigned int tso_segs, cwnd_quota;
1076
1077 BUG_ON(!skb || skb->len < mss_now);
1078
1079 tso_segs = tcp_init_tso_segs(sk, skb);
1080 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1081
1082 if (likely(cwnd_quota)) {
1083 BUG_ON(!tso_segs);
775 1084
776 tcp_minshall_update(tp, mss_now, skb); 1085 if (tso_segs > 1) {
777 sent_pkts = 1; 1086 u32 limit = tcp_window_allows(tp, skb,
1087 mss_now, cwnd_quota);
1088
1089 if (skb->len < limit) {
1090 unsigned int trim = skb->len % mss_now;
1091
1092 if (trim)
1093 limit = skb->len - trim;
1094 }
1095 if (skb->len > limit) {
1096 if (unlikely(tso_fragment(sk, skb, limit)))
1097 return;
1098 }
1099 } else if (unlikely(skb->len > mss_now)) {
1100 if (unlikely(tcp_fragment(sk, skb, mss_now)))
1101 return;
778 } 1102 }
779 1103
780 if (sent_pkts) { 1104 /* Send it out now. */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106
1107 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1108 update_send_head(sk, tp, skb);
781 tcp_cwnd_validate(sk, tp); 1109 tcp_cwnd_validate(sk, tp);
782 return 0; 1110 return;
783 } 1111 }
784
785 return !tp->packets_out && sk->sk_send_head;
786 } 1112 }
787 return 0;
788} 1113}
789 1114
790/* This function returns the amount that we can raise the 1115/* This function returns the amount that we can raise the
@@ -1044,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1044 if (sk->sk_route_caps & NETIF_F_TSO) { 1369 if (sk->sk_route_caps & NETIF_F_TSO) {
1045 sk->sk_route_caps &= ~NETIF_F_TSO; 1370 sk->sk_route_caps &= ~NETIF_F_TSO;
1046 sock_set_flag(sk, SOCK_NO_LARGESEND); 1371 sock_set_flag(sk, SOCK_NO_LARGESEND);
1047 tp->mss_cache = tp->mss_cache_std;
1048 } 1372 }
1049 1373
1050 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1374 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1106,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1106 * is still in somebody's hands, else make a clone. 1430 * is still in somebody's hands, else make a clone.
1107 */ 1431 */
1108 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1109 tcp_tso_set_push(skb);
1110 1433
1111 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1434 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1112 pskb_copy(skb, GFP_ATOMIC): 1435 pskb_copy(skb, GFP_ATOMIC):
@@ -1290,7 +1613,7 @@ void tcp_send_fin(struct sock *sk)
1290 * was unread data in the receive queue. This behavior is recommended 1613 * was unread data in the receive queue. This behavior is recommended
1291 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM 1614 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1292 */ 1615 */
1293void tcp_send_active_reset(struct sock *sk, int priority) 1616void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
1294{ 1617{
1295 struct tcp_sock *tp = tcp_sk(sk); 1618 struct tcp_sock *tp = tcp_sk(sk);
1296 struct sk_buff *skb; 1619 struct sk_buff *skb;
@@ -1449,7 +1772,6 @@ static inline void tcp_connect_init(struct sock *sk)
1449 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1772 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1450 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1773 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451 tcp_initialize_rcv_mss(sk); 1774 tcp_initialize_rcv_mss(sk);
1452 tcp_ca_init(tp);
1453 1775
1454 tcp_select_initial_window(tcp_full_space(sk), 1776 tcp_select_initial_window(tcp_full_space(sk),
1455 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1777 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1825,6 @@ int tcp_connect(struct sock *sk)
1503 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1825 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1504 tp->snd_nxt = tp->write_seq; 1826 tp->snd_nxt = tp->write_seq;
1505 tp->pushed_seq = tp->write_seq; 1827 tp->pushed_seq = tp->write_seq;
1506 tcp_ca_init(tp);
1507 1828
1508 /* Send it off. */ 1829 /* Send it off. */
1509 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1830 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1677,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
1677 if (sk->sk_route_caps & NETIF_F_TSO) { 1998 if (sk->sk_route_caps & NETIF_F_TSO) {
1678 sock_set_flag(sk, SOCK_NO_LARGESEND); 1999 sock_set_flag(sk, SOCK_NO_LARGESEND);
1679 sk->sk_route_caps &= ~NETIF_F_TSO; 2000 sk->sk_route_caps &= ~NETIF_F_TSO;
1680 tp->mss_cache = tp->mss_cache_std;
1681 } 2001 }
1682 } else if (!tcp_skb_pcount(skb)) 2002 } else if (!tcp_skb_pcount(skb))
1683 tcp_set_skb_tso_segs(sk, skb); 2003 tcp_set_skb_tso_segs(sk, skb);
1684 2004
1685 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2005 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1686 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2006 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1687 tcp_tso_set_push(skb);
1688 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 2007 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1689 if (!err) { 2008 if (!err) {
1690 update_send_head(sk, tp, skb); 2009 update_send_head(sk, tp, skb);
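
The tcp_tso_should_defer() heuristic added above reduces to window arithmetic: take the smaller of the free send window and the free congestion window; send immediately if the skb fits entirely, if at least a 1/tcp_tso_win_divisor fraction of the maximum possible window is free, or (with the divisor unset) if there is room for more than tcp_max_burst() full-sized frames; otherwise hold the skb back in the hope of building a larger TSO frame. A standalone user-space sketch of that decision, with a hypothetical snapshot struct standing in for tcp_sock state and simplified send-window accounting (illustrative values, not kernel code):

    #include <stdio.h>

    /* Hypothetical snapshot of sender state; field names mirror
     * tcp_sock members, but this is only an illustration. */
    struct snap {
            unsigned int snd_wnd;     /* advertised window left (bytes)    */
            unsigned int snd_cwnd;    /* congestion window (packets)       */
            unsigned int in_flight;   /* packets sent but not yet acked    */
            unsigned int mss;         /* sender MSS (bytes)                */
            unsigned int skb_len;     /* length of skb at the send head    */
            unsigned int win_divisor; /* sysctl_tcp_tso_win_divisor, 0=off */
            unsigned int max_burst;   /* tcp_max_burst(), typically 3      */
    };

    /* Returns 1 to defer transmission, 0 to send now. */
    static int tso_should_defer(const struct snap *s)
    {
            unsigned int cong_win = (s->snd_cwnd - s->in_flight) * s->mss;
            unsigned int limit = s->snd_wnd < cong_win ? s->snd_wnd : cong_win;

            if (s->skb_len <= limit)        /* fits fully: just send it */
                    return 0;

            if (s->win_divisor) {
                    unsigned int chunk = s->snd_cwnd * s->mss;

                    if (chunk > s->snd_wnd)
                            chunk = s->snd_wnd;
                    chunk /= s->win_divisor;
                    if (limit >= chunk)     /* enough of a window is free */
                            return 0;
            } else if (limit > s->max_burst * s->mss) {
                    return 0;               /* room for > 3 frames: send */
            }
            return 1;                       /* defer, hope to coalesce */
    }

    int main(void)
    {
            struct snap s = { .snd_wnd = 65535, .snd_cwnd = 20,
                              .in_flight = 18, .mss = 1448,
                              .skb_len = 64000, .win_divisor = 0,
                              .max_burst = 3 };

            /* cong_win = 2 * 1448 bytes, under the 3-frame burst limit,
             * and the 64000-byte skb does not fit: the sketch defers. */
            printf("defer = %d\n", tso_should_defer(&s));
            return 0;
    }
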
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000000..70e108e15c71
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
1/* Tom Kelly's Scalable TCP
2 *
3 * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
4 *
5 * John Heffner <jheffner@psc.edu>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <net/tcp.h>
11
12/* These factors are derived from the recommended values in the paper:
13 * .01 and 7/8. We use 50 instead of 100 to account for
14 * delayed ack.
15 */
16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3
18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
20 u32 in_flight, int flag)
21{
22 if (in_flight < tp->snd_cwnd)
23 return;
24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) {
26 tp->snd_cwnd++;
27 } else {
28 tp->snd_cwnd_cnt++;
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 tp->snd_cwnd++;
31 tp->snd_cwnd_cnt = 0;
32 }
33 }
34 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
35 tp->snd_cwnd_stamp = tcp_time_stamp;
36}
37
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
39{
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41}
42
43
44static struct tcp_congestion_ops tcp_scalable = {
45 .ssthresh = tcp_scalable_ssthresh,
46 .cong_avoid = tcp_scalable_cong_avoid,
47 .min_cwnd = tcp_reno_min_cwnd,
48
49 .owner = THIS_MODULE,
50 .name = "scalable",
51};
52
53static int __init tcp_scalable_register(void)
54{
55 return tcp_register_congestion_control(&tcp_scalable);
56}
57
58static void __exit tcp_scalable_unregister(void)
59{
60 tcp_unregister_congestion_control(&tcp_scalable);
61}
62
63module_init(tcp_scalable_register);
64module_exit(tcp_scalable_unregister);
65
66MODULE_AUTHOR("John Heffner");
67MODULE_LICENSE("GPL");
68MODULE_DESCRIPTION("Scalable TCP");
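
The two constants above encode the whole algorithm: in congestion avoidance cwnd grows by one segment per min(cwnd, 50) ACKs (roughly one per 50 once cwnd exceeds 50), and on loss ssthresh drops to cwnd - cwnd/8 instead of Reno's cwnd/2. A quick arithmetic sketch of both rules under assumed values:

    #include <stdio.h>

    #define AI_CNT   50U /* TCP_SCALABLE_AI_CNT */
    #define MD_SCALE 3   /* TCP_SCALABLE_MD_SCALE */

    int main(void)
    {
            unsigned int cwnd = 200, cnt = 0, acks;

            /* Additive increase: one extra segment per min(cwnd, 50) acks. */
            for (acks = 0; acks < 150; acks++) {
                    if (++cnt > (cwnd < AI_CNT ? cwnd : AI_CNT)) {
                            cwnd++;
                            cnt = 0;
                    }
            }
            printf("after 150 acks: cwnd = %u\n", cwnd); /* 200 -> 202 */

            /* Multiplicative decrease: keep 7/8 of cwnd, never below 2. */
            unsigned int ssthresh = cwnd - (cwnd >> MD_SCALE);
            if (ssthresh < 2)
                    ssthresh = 2;
            printf("after loss: ssthresh = %u\n", ssthresh); /* 177 */
            return 0;
    }

For comparison, Reno would halve the same window to 101 on loss, so Scalable TCP recovers its sending rate much faster on high bandwidth-delay paths.
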
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b127b4498565..0084227438c2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
231 } 231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER; 232 tp->ack.pending &= ~TCP_ACK_TIMER;
233 233
234 if (skb_queue_len(&tp->ucopy.prequeue)) { 234 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb; 235 struct sk_buff *skb;
236 236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 237 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
238 skb_queue_len(&tp->ucopy.prequeue));
239 238
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb); 240 sk->sk_backlog_rcv(sk, skb);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000000..9bd443db5193
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 */
33
34#include <linux/config.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h>
39
40#include <net/tcp.h>
41
42/* Default values of the Vegas variables, in fixed-point representation
43 * with V_PARAM_SHIFT bits to the right of the binary point.
44 */
45#define V_PARAM_SHIFT 1
46static int alpha = 1<<V_PARAM_SHIFT;
47static int beta = 3<<V_PARAM_SHIFT;
48static int gamma = 1<<V_PARAM_SHIFT;
49
50module_param(alpha, int, 0644);
51MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
52module_param(beta, int, 0644);
53MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
54module_param(gamma, int, 0644);
55MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
56
57
58/* Vegas variables */
59struct vegas {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now;/* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67};
68
69/* There are several situations when we must "re-start" Vegas:
70 *
71 * o when a connection is established
72 * o after an RTO
73 * o after fast recovery
74 * o when we send a packet and there is no outstanding
75 * unacknowledged data (restarting an idle connection)
76 *
77 * In these circumstances we cannot do a Vegas calculation at the
78 * end of the first RTT, because any calculation we do is using
79 * stale info -- both the saved cwnd and congestion feedback are
80 * stale.
81 *
82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs.
84 */
85static inline void vegas_enable(struct tcp_sock *tp)
86{
87 struct vegas *vegas = tcp_ca(tp);
88
89 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1;
91
92 /* Set the beginning of the next send window. */
93 vegas->beg_snd_nxt = tp->snd_nxt;
94
95 vegas->cntRTT = 0;
96 vegas->minRTT = 0x7fffffff;
97}
98
99/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp)
101{
102 struct vegas *vegas = tcp_ca(tp);
103
104 vegas->doing_vegas_now = 0;
105}
106
107static void tcp_vegas_init(struct tcp_sock *tp)
108{
109 struct vegas *vegas = tcp_ca(tp);
110
111 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp);
113}
114
115/* Do RTT sampling needed for Vegas.
116 * Basically we:
117 * o min-filter RTT samples from within an RTT to get the current
118 * propagation delay + queuing delay (we are min-filtering to try to
119 * avoid the effects of delayed ACKs)
120 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT)
122 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
124{
125 struct vegas *vegas = tcp_ca(tp);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127
128 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT)
130 vegas->baseRTT = vrtt;
131
132 /* Find the min RTT during the last RTT to find
133 * the current prop. delay + queuing delay:
134 */
135 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++;
137}
138
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
140{
141
142 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp);
144 else
145 vegas_disable(tp);
146}
147
148/*
149 * If the connection is idle and we are restarting,
150 * then we don't want to do any Vegas calculations
151 * until we get fresh RTT samples. So when we
152 * restart, we reset our Vegas state to a clean
153 * slate. After we get acks for this flight of
154 * packets, _then_ we can make Vegas calculations
155 * again.
156 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
158{
159 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp);
162}
163
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag)
166{
167 struct vegas *vegas = tcp_ca(tp);
168
169 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
171
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 *
174 * These are so named because they represent the approximate values
175 * of snd_una and snd_nxt at the beginning of the current RTT. More
176 * precisely, they represent the amount of data sent during the RTT.
177 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
178 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
179 * bytes of data have been ACKed during the course of the RTT, giving
180 * an "actual" rate of:
181 *
182 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
183 *
184 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
185 * because delayed ACKs can cover more than one segment, so they
186 * don't line up nicely with the boundaries of RTTs.
187 *
188 * Another unfortunate fact of life is that delayed ACKs delay the
189 * advance of the left edge of our send window, so that the number
190 * of bytes we send in an RTT is often less than our cwnd will allow.
191 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
192 */
193
194 if (after(ack, vegas->beg_snd_nxt)) {
195 /* Do the Vegas once-per-RTT cwnd adjustment. */
196 u32 old_wnd, old_snd_cwnd;
197
198
199 /* Here old_wnd is essentially the window of data that was
200 * sent during the previous RTT, and has all
201 * been acknowledged in the course of the RTT that ended
202 * with the ACK we just received. Likewise, old_snd_cwnd
203 * is the cwnd during the previous RTT.
204 */
205 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
206 tp->mss_cache;
207 old_snd_cwnd = vegas->beg_snd_cwnd;
208
209 /* Save the extent of the current window so we can use this
210 * at the end of the next RTT.
211 */
212 vegas->beg_snd_una = vegas->beg_snd_nxt;
213 vegas->beg_snd_nxt = tp->snd_nxt;
214 vegas->beg_snd_cwnd = tp->snd_cwnd;
215
216 /* Take into account the current RTT sample too, to
217 * decrease the impact of delayed acks. This double counts
218 * this sample since we count it for the next window as well,
219 * but that's not too awful, since we're taking the min,
220 * rather than averaging.
221 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000);
223
224 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got
226 * at least one RTT sample that wasn't from a delayed ACK.
227 * If we only had 2 samples total,
228 * then that means we're getting only 1 ACK per RTT, which
229 * means they're almost certainly delayed ACKs.
230 * If we have 3 samples, we should be OK.
231 */
232
233 if (vegas->cntRTT <= 2) {
234 /* We don't have enough RTT samples to do the Vegas
235 * calculation, so we'll behave like Reno.
236 */
237 if (tp->snd_cwnd > tp->snd_ssthresh)
238 tp->snd_cwnd++;
239 } else {
240 u32 rtt, target_cwnd, diff;
241
242 /* We have enough RTT samples, so, using the Vegas
243 * algorithm, we determine if we should increase or
244 * decrease cwnd, and by how much.
245 */
246
247 /* Pluck out the RTT we are using for the Vegas
248 * calculations. This is the min RTT seen during the
249 * last RTT. Taking the min filters out the effects
250 * of delayed ACKs, at the cost of noticing congestion
251 * a bit later.
252 */
253 rtt = vegas->minRTT;
254
255 /* Calculate the cwnd we should have, if we weren't
256 * going too fast.
257 *
258 * This is:
259 * (actual rate in segments) * baseRTT
260 * We keep it as a fixed point number with
261 * V_PARAM_SHIFT bits to the right of the binary point.
262 */
263 target_cwnd = ((old_wnd * vegas->baseRTT)
264 << V_PARAM_SHIFT) / rtt;
265
266 /* Calculate the difference between the window we had,
267 * and the window we would like to have. This quantity
268 * is the "Diff" from the Arizona Vegas papers.
269 *
270 * Again, this is a fixed point number with
271 * V_PARAM_SHIFT bits to the right of the binary
272 * point.
273 */
274 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
275
276 if (tp->snd_cwnd < tp->snd_ssthresh) {
277 /* Slow start. */
278 if (diff > gamma) {
279 /* Going too fast. Time to slow down
280 * and switch to congestion avoidance.
281 */
282 tp->snd_ssthresh = 2;
283
284 /* Set cwnd to match the actual rate
285 * exactly:
286 * cwnd = (actual rate) * baseRTT
287 * Then we add 1 because the integer
288 * truncation robs us of full link
289 * utilization.
290 */
291 tp->snd_cwnd = min(tp->snd_cwnd,
292 (target_cwnd >>
293 V_PARAM_SHIFT)+1);
294
295 }
296 } else {
297 /* Congestion avoidance. */
298 u32 next_snd_cwnd;
299
300 /* Figure out where we would like cwnd
301 * to be.
302 */
303 if (diff > beta) {
304 /* The old window was too fast, so
305 * we slow down.
306 */
307 next_snd_cwnd = old_snd_cwnd - 1;
308 } else if (diff < alpha) {
309 /* We don't have enough extra packets
310 * in the network, so speed up.
311 */
312 next_snd_cwnd = old_snd_cwnd + 1;
313 } else {
314 /* Sending just as fast as we
315 * should be.
316 */
317 next_snd_cwnd = old_snd_cwnd;
318 }
319
320 /* Adjust cwnd upward or downward, toward the
321 * desired value.
322 */
323 if (next_snd_cwnd > tp->snd_cwnd)
324 tp->snd_cwnd++;
325 else if (next_snd_cwnd < tp->snd_cwnd)
326 tp->snd_cwnd--;
327 }
328 }
329
330 /* Wipe the slate clean for the next RTT. */
331 vegas->cntRTT = 0;
332 vegas->minRTT = 0x7fffffff;
333 }
334
335 /* The following code is executed for every ack we receive,
336 * except for conditions checked in should_advance_cwnd()
337 * before the call to tcp_cong_avoid(). Mainly this means that
338 * we only execute this code if the ack actually acked some
339 * data.
340 */
341
342 /* If we are in slow start, increase our cwnd in response to this ACK.
343 * (If we are not in slow start then we are in congestion avoidance,
344 * and adjust our congestion window only once per RTT. See the code
345 * above.)
346 */
347 if (tp->snd_cwnd <= tp->snd_ssthresh)
348 tp->snd_cwnd++;
349
350 /* to keep cwnd from growing without bound */
351 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
352
353 /* Make sure that we are never so timid as to reduce our cwnd below
354 * 2 MSS.
355 *
356 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
357 */
358 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
359}
360
361/* Extract TCP socket info for reporting via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
363 struct sk_buff *skb)
364{
365 const struct vegas *ca = tcp_ca(tp);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
367 struct tcpvegas_info *info;
368
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
370 sizeof(*info)));
371
372 info->tcpv_enabled = ca->doing_vegas_now;
373 info->tcpv_rttcnt = ca->cntRTT;
374 info->tcpv_rtt = ca->baseRTT;
375 info->tcpv_minrtt = ca->minRTT;
376 rtattr_failure: ;
377 }
378}
379
380static struct tcp_congestion_ops tcp_vegas = {
381 .init = tcp_vegas_init,
382 .ssthresh = tcp_reno_ssthresh,
383 .cong_avoid = tcp_vegas_cong_avoid,
384 .min_cwnd = tcp_reno_min_cwnd,
385 .rtt_sample = tcp_vegas_rtt_calc,
386 .set_state = tcp_vegas_state,
387 .cwnd_event = tcp_vegas_cwnd_event,
388 .get_info = tcp_vegas_get_info,
389
390 .owner = THIS_MODULE,
391 .name = "vegas",
392};
393
394static int __init tcp_vegas_register(void)
395{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas);
398 return 0;
399}
400
401static void __exit tcp_vegas_unregister(void)
402{
403 tcp_unregister_congestion_control(&tcp_vegas);
404}
405
406module_init(tcp_vegas_register);
407module_exit(tcp_vegas_unregister);
408
409MODULE_AUTHOR("Stephen Hemminger");
410MODULE_LICENSE("GPL");
411MODULE_DESCRIPTION("TCP Vegas");
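
The once-per-RTT decision above is all fixed-point arithmetic with a single fractional bit (V_PARAM_SHIFT = 1): target_cwnd = old_wnd * baseRTT / rtt and diff = old_wnd - target_cwnd, both scaled by 2, are compared against the scaled module parameters alpha = 2, beta = 6 and gamma = 2. A worked standalone sketch with assumed sample values (20 segments per RTT, propagation delay 100 time units, 125 observed):

    #include <stdio.h>

    #define V_PARAM_SHIFT 1
    static const unsigned int alpha = 1 << V_PARAM_SHIFT; /* 1 seg, scaled */
    static const unsigned int beta  = 3 << V_PARAM_SHIFT; /* 3 segs, scaled */

    int main(void)
    {
            /* Assumed sample: 20 segments covered the last RTT, baseRTT
             * is 100 units but the min RTT seen last round was 125,
             * meaning queues are building along the path. */
            unsigned int old_wnd = 20, base_rtt = 100, rtt = 125;

            /* Window that would match the bottleneck rate exactly,
             * kept in fixed point as in tcp_vegas_cong_avoid(). */
            unsigned int target_cwnd =
                    (old_wnd * base_rtt << V_PARAM_SHIFT) / rtt;
            unsigned int diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

            printf("target_cwnd = %u/2, diff = %u/2 segments\n",
                   target_cwnd, diff);

            /* diff = 40 - 32 = 8, i.e. 4 segments sitting in queues;
             * 4 exceeds beta (3), so congestion avoidance backs off. */
            if (diff > beta)
                    printf("diff > beta: decrease cwnd\n");
            else if (diff < alpha)
                    printf("diff < alpha: increase cwnd\n");
            else
                    printf("in [alpha, beta]: hold cwnd\n");
            return 0;
    }
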
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 000000000000..ef827242c940
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
1/*
2 * TCP Westwood+
3 *
4 * Angelo Dell'Aera: TCP Westwood+ support
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h>
12#include <net/tcp.h>
13
14/* TCP Westwood structure */
15struct westwood {
16 u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
17 u32 bw_est; /* bandwidth estimate */
18 u32 rtt_win_sx; /* here starts a new evaluation... */
19 u32 bk;
20 u32 snd_una; /* used for evaluating the number of acked bytes */
21 u32 cumul_ack;
22 u32 accounted;
23 u32 rtt;
24 u32 rtt_min; /* minimum observed RTT */
25};
26
27
28/* TCP Westwood functions and constants */
29#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
30#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31
32/*
33 * @tcp_westwood_init
34 * This function initializes fields used in TCP Westwood+,
35 * it is called after the initial SYN, so the sequence numbers
36 * are correct, but for new passive connections we have no
37 * information about RTTmin at this time, so we simply set it to
38 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
39 * overly conservative, so we can be sure it will be updated in a
40 * consistent way as soon as possible, which will reasonably happen
41 * within the first RTT period of the connection lifetime.
42 */
43static void tcp_westwood_init(struct tcp_sock *tp)
44{
45 struct westwood *w = tcp_ca(tp);
46
47 w->bk = 0;
48 w->bw_ns_est = 0;
49 w->bw_est = 0;
50 w->accounted = 0;
51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una;
55}
56
57/*
58 * @westwood_do_filter
59 * Low-pass filter. Implemented using constant coefficients.
60 */
61static inline u32 westwood_do_filter(u32 a, u32 b)
62{
63 return (((7 * a) + b) >> 3);
64}
65
66static inline void westwood_filter(struct westwood *w, u32 delta)
67{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
70}
71
72/*
73 * @westwood_pkts_acked
74 * Called after processing a group of packets,
75 * but all Westwood needs is the last sample of srtt.
76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
78{
79 struct westwood *w = tcp_ca(tp);
80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3;
82}
83
84/*
85 * @westwood_update_window
86 * Updates the RTT evaluation window if it is the right moment to
87 * do so; if it is, it calls the filter to evaluate bandwidth.
88 */
89static void westwood_update_window(struct tcp_sock *tp)
90{
91 struct westwood *w = tcp_ca(tp);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93
94 /*
95 * See if an RTT-window has passed.
96 * Be careful: if the RTT is less than
97 * 50ms we don't filter but keep 'building the sample',
98 * since bandwidth estimation over such small
99 * time intervals is better avoided.
100 * Obviously on a LAN we reasonably will always have
101 * right_bound = left_bound + WESTWOOD_RTT_MIN
102 */
103 if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
104 westwood_filter(w, delta);
105
106 w->bk = 0;
107 w->rtt_win_sx = tcp_time_stamp;
108 }
109}
110
111/*
112 * @westwood_fast_bw
113 * Called when we are in the fast path, in particular when
114 * header prediction is successful. In that case the update is
115 * straightforward and doesn't need any particular care.
116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp)
118{
119 struct westwood *w = tcp_ca(tp);
120
121 westwood_update_window(tp);
122
123 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una;
125 w->rtt_min = min(w->rtt, w->rtt_min);
126}
127
128/*
129 * @westwood_acked_count
130 * This function computes cumul_ack, which is used to update bk
131 * in the case of delayed or partial acks.
132 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp)
134{
135 struct westwood *w = tcp_ca(tp);
136
137 w->cumul_ack = tp->snd_una - w->snd_una;
138
139 /* If cumul_ack is 0 this is a dupack since it's not moving
140 * tp->snd_una.
141 */
142 if (!w->cumul_ack) {
143 w->accounted += tp->mss_cache;
144 w->cumul_ack = tp->mss_cache;
145 }
146
147 if (w->cumul_ack > tp->mss_cache) {
148 /* Partial or delayed ack */
149 if (w->accounted >= w->cumul_ack) {
150 w->accounted -= w->cumul_ack;
151 w->cumul_ack = tp->mss_cache;
152 } else {
153 w->cumul_ack -= w->accounted;
154 w->accounted = 0;
155 }
156 }
157
158 w->snd_una = tp->snd_una;
159
160 return w->cumul_ack;
161}
162
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
164{
165 struct westwood *w = tcp_ca(tp);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167}
168
169/*
170 * TCP Westwood
171 * Here the limit is evaluated as Bw estimation * RTTmin (converted
172 * to packets using mss_cache). The result is clamped to >= 2,
173 * which avoids ever returning 0.
174 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
176{
177 return westwood_bw_rttmin(tp);
178}
179
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
181{
182 struct westwood *w = tcp_ca(tp);
183
184 switch(event) {
185 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp);
187 break;
188
189 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
191 break;
192
193 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp);
195 break;
196
197 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp);
199 w->bk += westwood_acked_count(tp);
200 w->rtt_min = min(w->rtt, w->rtt_min);
201 break;
202
203 default:
204 /* don't care */
205 break;
206 }
207}
208
209
210/* Extract TCP socket info for reporting via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
212 struct sk_buff *skb)
213{
214 const struct westwood *ca = tcp_ca(tp);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
216 struct rtattr *rta;
217 struct tcpvegas_info *info;
218
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0;
223 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
224 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
225 rtattr_failure: ;
226 }
227}
228
229
230static struct tcp_congestion_ops tcp_westwood = {
231 .init = tcp_westwood_init,
232 .ssthresh = tcp_reno_ssthresh,
233 .cong_avoid = tcp_reno_cong_avoid,
234 .min_cwnd = tcp_westwood_cwnd_min,
235 .cwnd_event = tcp_westwood_event,
236 .get_info = tcp_westwood_info,
237 .pkts_acked = tcp_westwood_pkts_acked,
238
239 .owner = THIS_MODULE,
240 .name = "westwood"
241};
242
243static int __init tcp_westwood_register(void)
244{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood);
247}
248
249static void __exit tcp_westwood_unregister(void)
250{
251 tcp_unregister_congestion_control(&tcp_westwood);
252}
253
254module_init(tcp_westwood_register);
255module_exit(tcp_westwood_unregister);
256
257MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
258MODULE_LICENSE("GPL");
259MODULE_DESCRIPTION("TCP Westwood+");
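
Westwood+'s bandwidth estimate is just the constant-coefficient low-pass filter above, new = (7*old + sample)/8, applied twice: once to the raw bytes-per-window sample bk/delta and once more to smooth the first stage. On CA_EVENT_COMPLETE_CWR the window is then reset to bw_est * rtt_min / mss. A small sketch showing the filter converging on a steady sample (assumed numbers throughout):

    #include <stdio.h>

    /* Same arithmetic as westwood_do_filter(): new = (7*old + b) / 8. */
    static unsigned int lpf(unsigned int a, unsigned int b)
    {
            return (7 * a + b) >> 3;
    }

    int main(void)
    {
            /* Assumed: 64 KB acked per 100 ms window, i.e. 655 bytes/ms. */
            unsigned int sample = 65536 / 100;
            unsigned int bw_ns_est = 0, bw_est = 0;
            int i;

            for (i = 1; i <= 40; i++) {
                    bw_ns_est = lpf(bw_ns_est, sample); /* first stage  */
                    bw_est = lpf(bw_est, bw_ns_est);    /* second stage */
                    if (i % 10 == 0)
                            printf("after %2d windows: bw_est = %u B/ms\n",
                                   i, bw_est);
            }

            /* westwood_bw_rttmin(): cwnd after CWR, assuming an rtt_min
             * of 50 ms and a 1448-byte MSS. */
            unsigned int rtt_min = 50, mss = 1448;
            printf("cwnd after CWR = %u segments\n",
                   bw_est * rtt_min / mss);
            return 0;
    }

Starting the estimate from zero understates bandwidth for the first few windows; that is the same deliberately conservative bias the init function applies to w->rtt via TCP_WESTWOOD_INIT_RTT.
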
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index e66ca9381cfd..95163cd52ae0 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,6 +1,26 @@
1# 1#
2# IPv6 configuration 2# IPv6 configuration
3# 3#
4
5# IPv6 as module will cause a CRASH if you try to unload it
6config IPV6
7 tristate "The IPv6 protocol"
8 default m
9 select CRYPTO if IPV6_PRIVACY
10 select CRYPTO_MD5 if IPV6_PRIVACY
11 ---help---
12 This is complementary support for IP version 6.
13 You will still be able to do traditional IPv4 networking as well.
14
15 For general information about IPv6, see
16 <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
17 For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
18 For specific information about IPv6 under Linux, read the HOWTO at
19 <http://www.bieringer.de/linux/IPv6/>.
20
21 To compile this protocol support as a module, choose M here: the
22 module will be called ipv6.
23
4config IPV6_PRIVACY 24config IPV6_PRIVACY
5 bool "IPv6: Privacy Extensions (RFC 3041) support" 25 bool "IPv6: Privacy Extensions (RFC 3041) support"
6 depends on IPV6 26 depends on IPV6
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a54d4ef3fd35..77004b9456c0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2777,7 +2777,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2777 read_lock_bh(&idev->lock); 2777 read_lock_bh(&idev->lock);
2778 switch (type) { 2778 switch (type) {
2779 case UNICAST_ADDR: 2779 case UNICAST_ADDR:
2780 /* unicast address */ 2780 /* unicast address incl. temp addr */
2781 for (ifa = idev->addr_list; ifa; 2781 for (ifa = idev->addr_list; ifa;
2782 ifa = ifa->if_next, ip_idx++) { 2782 ifa = ifa->if_next, ip_idx++) {
2783 if (ip_idx < s_ip_idx) 2783 if (ip_idx < s_ip_idx)
@@ -2788,19 +2788,6 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2788 NLM_F_MULTI)) <= 0) 2788 NLM_F_MULTI)) <= 0)
2789 goto done; 2789 goto done;
2790 } 2790 }
2791 /* temp addr */
2792#ifdef CONFIG_IPV6_PRIVACY
2793 for (ifa = idev->tempaddr_list; ifa;
2794 ifa = ifa->tmp_next, ip_idx++) {
2795 if (ip_idx < s_ip_idx)
2796 continue;
2797 if ((err = inet6_fill_ifaddr(skb, ifa,
2798 NETLINK_CB(cb->skb).pid,
2799 cb->nlh->nlmsg_seq, RTM_NEWADDR,
2800 NLM_F_MULTI)) <= 0)
2801 goto done;
2802 }
2803#endif
2804 break; 2791 break;
2805 case MULTICAST_ADDR: 2792 case MULTICAST_ADDR:
2806 /* multicast address */ 2793 /* multicast address */
@@ -2923,6 +2910,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
2923 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); 2910 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2924 r = NLMSG_DATA(nlh); 2911 r = NLMSG_DATA(nlh);
2925 r->ifi_family = AF_INET6; 2912 r->ifi_family = AF_INET6;
2913 r->__ifi_pad = 0;
2926 r->ifi_type = dev->type; 2914 r->ifi_type = dev->type;
2927 r->ifi_index = dev->ifindex; 2915 r->ifi_index = dev->ifindex;
2928 r->ifi_flags = dev_get_flags(dev); 2916 r->ifi_flags = dev_get_flags(dev);
@@ -3030,9 +3018,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
3030 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags); 3018 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
3031 pmsg = NLMSG_DATA(nlh); 3019 pmsg = NLMSG_DATA(nlh);
3032 pmsg->prefix_family = AF_INET6; 3020 pmsg->prefix_family = AF_INET6;
3021 pmsg->prefix_pad1 = 0;
3022 pmsg->prefix_pad2 = 0;
3033 pmsg->prefix_ifindex = idev->dev->ifindex; 3023 pmsg->prefix_ifindex = idev->dev->ifindex;
3034 pmsg->prefix_len = pinfo->prefix_len; 3024 pmsg->prefix_len = pinfo->prefix_len;
3035 pmsg->prefix_type = pinfo->type; 3025 pmsg->prefix_type = pinfo->type;
3026 pmsg->prefix_pad3 = 0;
3036 3027
3037 pmsg->prefix_flags = 0; 3028 pmsg->prefix_flags = 0;
3038 if (pinfo->onlink) 3029 if (pinfo->onlink)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df49a..28d9bcab0970 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
774 if (if6_proc_init()) 774 if (if6_proc_init())
775 goto proc_if6_fail; 775 goto proc_if6_fail;
776#endif 776#endif
777 ipv6_packet_init();
778 ip6_route_init(); 777 ip6_route_init();
779 ip6_flowlabel_init(); 778 ip6_flowlabel_init();
780 err = addrconf_init(); 779 err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
791 /* Init v6 transport protocols. */ 790 /* Init v6 transport protocols. */
792 udpv6_init(); 791 udpv6_init();
793 tcpv6_init(); 792 tcpv6_init();
793
794 ipv6_packet_init();
794 err = 0; 795 err = 0;
795out: 796out:
796 return err; 797 return err;
@@ -798,7 +799,6 @@ out:
798addrconf_fail: 799addrconf_fail:
799 ip6_flowlabel_cleanup(); 800 ip6_flowlabel_cleanup();
800 ip6_route_cleanup(); 801 ip6_route_cleanup();
801 ipv6_packet_cleanup();
802#ifdef CONFIG_PROC_FS 802#ifdef CONFIG_PROC_FS
803 if6_proc_exit(); 803 if6_proc_exit();
804proc_if6_fail: 804proc_if6_fail:
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499debb..b6c73da5ff35 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
244 opt_space->opt_nflen = 0; 244 opt_space->opt_nflen = 0;
245 } 245 }
246 opt_space->dst1opt = fopt->dst1opt; 246 opt_space->dst1opt = fopt->dst1opt;
247 opt_space->auth = fopt->auth;
248 opt_space->opt_flen = fopt->opt_flen; 247 opt_space->opt_flen = fopt->opt_flen;
249 return opt_space; 248 return opt_space;
250} 249}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeedc5..1f2c2f9e353f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
465 to->pkt_type = from->pkt_type; 465 to->pkt_type = from->pkt_type;
466 to->priority = from->priority; 466 to->priority = from->priority;
467 to->protocol = from->protocol; 467 to->protocol = from->protocol;
468 to->security = from->security;
469 dst_release(to->dst); 468 dst_release(to->dst);
470 to->dst = dst_clone(from->dst); 469 to->dst = dst_clone(from->dst);
471 to->dev = from->dev; 470 to->dev = from->dev;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 562fcd14fdea..29fed6e58d0a 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -281,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
281 } 281 }
282 write_unlock_bh(&ipv6_sk_mc_lock); 282 write_unlock_bh(&ipv6_sk_mc_lock);
283 283
284 return -ENOENT; 284 return -EADDRNOTAVAIL;
285} 285}
286 286
287static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) 287static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex)
@@ -386,12 +386,16 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
386 if (ipv6_addr_equal(&pmc->addr, group)) 386 if (ipv6_addr_equal(&pmc->addr, group))
387 break; 387 break;
388 } 388 }
389 if (!pmc) /* must have a prior join */ 389 if (!pmc) { /* must have a prior join */
390 err = -EINVAL;
390 goto done; 391 goto done;
392 }
391 /* if a source filter was set, must be the same mode as before */ 393 /* if a source filter was set, must be the same mode as before */
392 if (pmc->sflist) { 394 if (pmc->sflist) {
393 if (pmc->sfmode != omode) 395 if (pmc->sfmode != omode) {
396 err = -EINVAL;
394 goto done; 397 goto done;
398 }
395 } else if (pmc->sfmode != omode) { 399 } else if (pmc->sfmode != omode) {
396 /* allow mode switches for empty-set filters */ 400 /* allow mode switches for empty-set filters */
397 ip6_mc_add_src(idev, group, omode, 0, NULL, 0); 401 ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
@@ -402,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
402 psl = pmc->sflist; 406 psl = pmc->sflist;
403 if (!add) { 407 if (!add) {
404 if (!psl) 408 if (!psl)
405 goto done; 409 goto done; /* err = -EADDRNOTAVAIL */
406 rv = !0; 410 rv = !0;
407 for (i=0; i<psl->sl_count; i++) { 411 for (i=0; i<psl->sl_count; i++) {
408 rv = memcmp(&psl->sl_addr[i], source, 412 rv = memcmp(&psl->sl_addr[i], source,
@@ -411,7 +415,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
411 break; 415 break;
412 } 416 }
413 if (rv) /* source not found */ 417 if (rv) /* source not found */
414 goto done; 418 goto done; /* err = -EADDRNOTAVAIL */
415 419
416 /* special case - (INCLUDE, empty) == LEAVE_GROUP */ 420 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
417 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { 421 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
@@ -488,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
488 struct inet6_dev *idev; 492 struct inet6_dev *idev;
489 struct ipv6_pinfo *inet6 = inet6_sk(sk); 493 struct ipv6_pinfo *inet6 = inet6_sk(sk);
490 struct ip6_sf_socklist *newpsl, *psl; 494 struct ip6_sf_socklist *newpsl, *psl;
495 int leavegroup = 0;
491 int i, err; 496 int i, err;
492 497
493 group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; 498 group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
@@ -503,7 +508,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
503 if (!idev) 508 if (!idev)
504 return -ENODEV; 509 return -ENODEV;
505 dev = idev->dev; 510 dev = idev->dev;
506 err = -EADDRNOTAVAIL; 511
512 err = 0;
513 if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
514 leavegroup = 1;
515 goto done;
516 }
507 517
508 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { 518 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
509 if (pmc->ifindex != gsf->gf_interface) 519 if (pmc->ifindex != gsf->gf_interface)
@@ -511,8 +521,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
511 if (ipv6_addr_equal(&pmc->addr, group)) 521 if (ipv6_addr_equal(&pmc->addr, group))
512 break; 522 break;
513 } 523 }
514 if (!pmc) /* must have a prior join */ 524 if (!pmc) { /* must have a prior join */
525 err = -EINVAL;
515 goto done; 526 goto done;
527 }
516 if (gsf->gf_numsrc) { 528 if (gsf->gf_numsrc) {
517 newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, 529 newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk,
518 IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); 530 IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC);
@@ -544,10 +556,13 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
544 (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); 556 (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
545 pmc->sflist = newpsl; 557 pmc->sflist = newpsl;
546 pmc->sfmode = gsf->gf_fmode; 558 pmc->sfmode = gsf->gf_fmode;
559 err = 0;
547done: 560done:
548 read_unlock_bh(&idev->lock); 561 read_unlock_bh(&idev->lock);
549 in6_dev_put(idev); 562 in6_dev_put(idev);
550 dev_put(dev); 563 dev_put(dev);
564 if (leavegroup)
565 err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
551 return err; 566 return err;
552} 567}
553 568
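
With this change, a setsockopt(MCAST_MSFILTER) that installs an empty INCLUDE filter on a previously joined group now behaves as a leave, mirroring the IPv4 semantics. From user space that corresponds to something like the following sketch (the interface index and group address are caller-supplied placeholders):

    #define _GNU_SOURCE  /* struct group_filter on glibc */
    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    /* Illustrative: an INCLUDE filter with zero sources means "receive
     * from nobody", which the kernel now treats as leaving the group. */
    static int leave_via_empty_filter(int sock, unsigned int ifindex,
                                      const struct sockaddr_in6 *group)
    {
            struct group_filter gf;

            memset(&gf, 0, sizeof(gf));
            gf.gf_interface = ifindex;
            gf.gf_fmode = MCAST_INCLUDE; /* include...           */
            gf.gf_numsrc = 0;            /* ...no sources at all */
            memcpy(&gf.gf_group, group, sizeof(*group));

            return setsockopt(sock, IPPROTO_IPV6, MCAST_MSFILTER,
                              &gf, GROUP_FILTER_SIZE(0));
    }

The remaining mcast.c hunks tighten the error codes on the same paths: a filter change without a prior join now returns -EINVAL, while a missing source keeps returning -EADDRNOTAVAIL.
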
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2414937f2a83..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2018,14 +2018,14 @@ static int tcp_v6_init_sock(struct sock *sk)
2018 */ 2018 */
2019 tp->snd_ssthresh = 0x7fffffff; 2019 tp->snd_ssthresh = 0x7fffffff;
2020 tp->snd_cwnd_clamp = ~0; 2020 tp->snd_cwnd_clamp = ~0;
2021 tp->mss_cache_std = tp->mss_cache = 536; 2021 tp->mss_cache = 536;
2022 2022
2023 tp->reordering = sysctl_tcp_reordering; 2023 tp->reordering = sysctl_tcp_reordering;
2024 2024
2025 sk->sk_state = TCP_CLOSE; 2025 sk->sk_state = TCP_CLOSE;
2026 2026
2027 tp->af_specific = &ipv6_specific; 2027 tp->af_specific = &ipv6_specific;
2028 2028 tp->ca_ops = &tcp_init_congestion_ops;
2029 sk->sk_write_space = sk_stream_write_space; 2029 sk->sk_write_space = sk_stream_write_space;
2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2031 2031
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index a16237c0e783..980a826f5d02 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -1,6 +1,39 @@
1# 1#
2# IPX configuration 2# IPX configuration
3# 3#
4config IPX
5 tristate "The IPX protocol"
6 select LLC
7 ---help---
8 This is support for the Novell networking protocol, IPX, commonly
9 used for local networks of Windows machines. You need it if you
10 want to access Novell NetWare file or print servers using the Linux
11 Novell client ncpfs (available from
12 <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
13 within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>). In order
15 to do the former, you'll also have to say Y to "NCP file system
16 support", below.
17
18 IPX is similar in scope to IP, while SPX, which runs on top of IPX,
19 is similar to TCP. There is also experimental support for SPX in
20 Linux (see "SPX networking", below).
21
22 To turn your Linux box into a fully featured NetWare file server and
23 IPX router, say Y here and fetch either lwared from
24 <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
25 mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
26 information, read the IPX-HOWTO available from
27 <http://www.tldp.org/docs.html#howto>.
28
29 General information about how to connect Linux, Windows machines and
30 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
31
32 The IPX driver would enlarge your kernel by about 16 KB. To compile
33 this driver as a module, choose M here: the module will be called ipx.
34 Unless you want to integrate your Linux box with a local Novell
35 network, say N.
36
4config IPX_INTERN 37config IPX_INTERN
5 bool "IPX: Full internal IPX network" 38 bool "IPX: Full internal IPX network"
6 depends on IPX 39 depends on IPX
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
index 046ad0750e48..7029618f5719 100644
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self)
445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;); 445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
446 446
447 /* Don't disconnect until all data frames are successfully sent */ 447 /* Don't disconnect until all data frames are successfully sent */
448 if (skb_queue_len(&self->txq) > 0) { 448 if (!skb_queue_empty(&self->txq)) {
449 self->disconnect_pending = TRUE; 449 self->disconnect_pending = TRUE;
450
451 return; 450 return;
452 } 451 }
453 452
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
index 1cd89f5f3b75..a505b5457608 100644
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
191 * Send out the RR frames faster if our own transmit queue is empty, or 191 * Send out the RR frames faster if our own transmit queue is empty, or
192 * if the peer is busy. The effect is a much faster conversation 192 * if the peer is busy. The effect is a much faster conversation
193 */ 193 */
194 if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) { 194 if (skb_queue_empty(&self->txq) || self->remote_busy) {
195 if (self->fast_RR == TRUE) { 195 if (self->fast_RR == TRUE) {
196 /* 196 /*
197 * Assert that the fast poll timer has not reached the 197 * Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, 263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
264 skb_queue_len(&self->txq)); 264 skb_queue_len(&self->txq));
265 265
266 if (skb_queue_len(&self->txq)) { 266 if (!skb_queue_empty(&self->txq)) {
267 /* Prevent race conditions with irlap_data_request() */ 267 /* Prevent race conditions with irlap_data_request() */
268 self->local_busy = TRUE; 268 self->local_busy = TRUE;
269 269
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1075 /* Window has been adjusted for the max packet 1075 /* Window has been adjusted for the max packet
1076 * size, so much simpler... - Jean II */ 1076 * size, so much simpler... - Jean II */
1077 nextfit = (skb_queue_len(&self->txq) > 0); 1077 nextfit = !skb_queue_empty(&self->txq);
1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1079 /* 1079 /*
1080 * Send data with poll bit cleared only if window > 1 1080 * Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1815 /* Window has been adjusted for the max packet 1815 /* Window has been adjusted for the max packet
1816 * size, so much simpler... - Jean II */ 1816 * size, so much simpler... - Jean II */
1817 nextfit = (skb_queue_len(&self->txq) > 0); 1817 nextfit = !skb_queue_empty(&self->txq);
1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1819 /* 1819 /*
1820 * Send data with final bit cleared only if window > 1 1820 * Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
1937 irlap_data_indication(self, skb, FALSE); 1937 irlap_data_indication(self, skb, FALSE);
1938 1938
1939 /* Any pending data requests? */ 1939 /* Any pending data requests? */
1940 if ((skb_queue_len(&self->txq) > 0) && 1940 if (!skb_queue_empty(&self->txq) &&
1941 (self->window > 0)) 1941 (self->window > 0))
1942 { 1942 {
1943 self->ack_required = TRUE; 1943 self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2038 /* 2038 /*
2039 * Any pending data requests? 2039 * Any pending data requests?
2040 */ 2040 */
2041 if ((skb_queue_len(&self->txq) > 0) && 2041 if (!skb_queue_empty(&self->txq) &&
2042 (self->window > 0) && !self->remote_busy) 2042 (self->window > 0) && !self->remote_busy)
2043 { 2043 {
2044 irlap_data_indication(self, skb, TRUE); 2044 irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2069 */ 2069 */
2070 nr_status = irlap_validate_nr_received(self, info->nr); 2070 nr_status = irlap_validate_nr_received(self, info->nr);
2071 if (nr_status == NR_EXPECTED) { 2071 if (nr_status == NR_EXPECTED) {
2072 if ((skb_queue_len( &self->txq) > 0) && 2072 if (!skb_queue_empty(&self->txq) &&
2073 (self->window > 0)) { 2073 (self->window > 0)) {
2074 self->remote_busy = FALSE; 2074 self->remote_busy = FALSE;
2075 2075
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 040abe714aa3..6dafbb43b529 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1018 /* 1018 /*
1019 * We can now fill the window with additional data frames 1019 * We can now fill the window with additional data frames
1020 */ 1020 */
1021 while (skb_queue_len( &self->txq) > 0) { 1021 while (!skb_queue_empty(&self->txq)) {
1022 1022
1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__); 1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
1024 if ((skb_queue_len( &self->txq) > 0) && 1024 if (self->window > 0) {
1025 (self->window > 0)) {
1026 skb = skb_dequeue( &self->txq); 1025 skb = skb_dequeue( &self->txq);
1027 IRDA_ASSERT(skb != NULL, return;); 1026 IRDA_ASSERT(skb != NULL, return;);
1028 1027
@@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1031 * bit cleared 1030 * bit cleared
1032 */ 1031 */
1033 if ((self->window > 1) && 1032 if ((self->window > 1) &&
1034 skb_queue_len(&self->txq) > 0) 1033 !skb_queue_empty(&self->txq)) {
1035 {
1036 irlap_send_data_primary(self, skb); 1034 irlap_send_data_primary(self, skb);
1037 } else { 1035 } else {
1038 irlap_send_data_primary_poll(self, skb); 1036 irlap_send_data_primary_poll(self, skb);
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index d091ccf773b3..6602d901f8b1 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
1513 /* 1513 /*
1514 * Check if there is still data segments in the transmit queue 1514 * Check if there is still data segments in the transmit queue
1515 */ 1515 */
1516 if (skb_queue_len(&self->tx_queue) > 0) { 1516 if (!skb_queue_empty(&self->tx_queue)) {
1517 if (priority == P_HIGH) { 1517 if (priority == P_HIGH) {
1518 /* 1518 /*
1519 * No need to send the queued data, if we are 1519 * No need to send the queued data, if we are
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 000000000000..f0b5efb31a00
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,22 @@
1#
2# LAPB Data Link Driver
3#
4
5config LAPB
6 tristate "LAPB Data Link Driver (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
10 the lower) part of the X.25 protocol. It offers a reliable
11 connection service to exchange data frames with one other host, and
12 it is used to transport higher level protocols (mostly X.25 Packet
13 Layer, the higher part of X.25, but others are possible as well).
14 Usually, LAPB is used with specialized X.21 network cards, but Linux
15 currently supports LAPB only over Ethernet connections. If you want
16 to use LAPB connections over Ethernet, say Y here and to "LAPB over
17 Ethernet driver" below. Read
18 <file:Documentation/networking/lapb-module.txt> for technical
19 details.
20
21 To compile this driver as a module, choose M here: the
22 module will be called lapb. If unsure, say N.
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index cd130c3b72bc..d5bdb53a348f 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
84 if (llc->dev->flags & IFF_LOOPBACK) 84 if (llc->dev->flags & IFF_LOOPBACK)
85 goto out; 85 goto out;
86 rc = 1; 86 rc = 1;
87 if (!skb_queue_len(&llc->pdu_unack_q)) 87 if (skb_queue_empty(&llc->pdu_unack_q))
88 goto out; 88 goto out;
89 skb = skb_peek(&llc->pdu_unack_q); 89 skb = skb_peek(&llc->pdu_unack_q);
90 pdu = llc_pdu_sn_hdr(skb); 90 pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 70bcd4744d93..3405fdf41b93 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -315,8 +315,8 @@ err:
315static void netlink_remove(struct sock *sk) 315static void netlink_remove(struct sock *sk)
316{ 316{
317 netlink_table_grab(); 317 netlink_table_grab();
318 nl_table[sk->sk_protocol].hash.entries--; 318 if (sk_del_node_init(sk))
319 sk_del_node_init(sk); 319 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 320 if (nlk_sk(sk)->groups)
321 __sk_del_bind_node(sk); 321 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 322 netlink_table_ungrab();
@@ -429,7 +429,12 @@ retry:
429 err = netlink_insert(sk, pid); 429 err = netlink_insert(sk, pid);
430 if (err == -EADDRINUSE) 430 if (err == -EADDRINUSE)
431 goto retry; 431 goto retry;
432 return 0; 432
433 /* If 2 threads race to autobind, that is fine. */
434 if (err == -EBUSY)
435 err = 0;
436
437 return err;
433} 438}
434 439
435static inline int netlink_capable(struct socket *sock, unsigned int flag) 440static inline int netlink_capable(struct socket *sock, unsigned int flag)
@@ -853,7 +858,7 @@ static inline void netlink_rcv_wake(struct sock *sk)
853{ 858{
854 struct netlink_sock *nlk = nlk_sk(sk); 859 struct netlink_sock *nlk = nlk_sk(sk);
855 860
856 if (!skb_queue_len(&sk->sk_receive_queue)) 861 if (skb_queue_empty(&sk->sk_receive_queue))
857 clear_bit(0, &nlk->state); 862 clear_bit(0, &nlk->state);
858 if (!test_bit(0, &nlk->state)) 863 if (!test_bit(0, &nlk->state))
859 wake_up_interruptible(&nlk->wait); 864 wake_up_interruptible(&nlk->wait);
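
The autobind change above exists because two threads can race to autobind the same socket: the loser finds the pid slot already occupied by its own socket and gets -EBUSY back from netlink_insert(), which is a success as far as the caller is concerned. A condensed standalone sketch of that control flow, where insert() is a stand-in that fakes one outcome per attempt:

    #include <stdio.h>
    #include <errno.h>

    /* Stand-in for netlink_insert(): first pid is taken by another
     * socket, then we lose an autobind race, then we would succeed. */
    static int insert(int attempt)
    {
            switch (attempt) {
            case 0:  return -EADDRINUSE; /* pid in use: retry another */
            case 1:  return -EBUSY;      /* raced: already bound      */
            default: return 0;
            }
    }

    static int autobind(void)
    {
            int attempt = 0, err;
    retry:
            err = insert(attempt++);
            if (err == -EADDRINUSE)
                    goto retry;

            /* If 2 threads race to autobind, that is fine: either way
             * the socket ended up bound, so -EBUSY is not an error. */
            if (err == -EBUSY)
                    err = 0;
            return err;
    }

    int main(void)
    {
            printf("autobind -> %d\n", autobind());
            return 0;
    }
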
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 000000000000..34ff93ff894d
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,26 @@
1#
2# Packet configuration
3#
4
5config PACKET
6 tristate "Packet socket"
7 ---help---
8 The Packet protocol is used by applications which communicate
9 directly with network devices without an intermediate network
10 protocol implemented in the kernel, e.g. tcpdump. If you want them
11 to work, choose Y.
12
13 To compile this driver as a module, choose M here: the module will
14 be called af_packet.
15
16 If unsure, say Y.
17
18config PACKET_MMAP
19 bool "Packet socket: mmapped IO"
20 depends on PACKET
21 help
22 If you say Y here, the Packet protocol driver will use an IO
23 mechanism that results in faster communication.
24
25 If unsure, say N.
26
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0269616e75a1..c9d5980aa4de 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct
274 dst_release(skb->dst); 274 dst_release(skb->dst);
275 skb->dst = NULL; 275 skb->dst = NULL;
276 276
277 /* drop conntrack reference */
278 nf_reset(skb);
279
277 spkt = (struct sockaddr_pkt*)skb->cb; 280 spkt = (struct sockaddr_pkt*)skb->cb;
278 281
279 skb_push(skb, skb->data-skb->mac.raw); 282 skb_push(skb, skb->data-skb->mac.raw);
@@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
517 dst_release(skb->dst); 520 dst_release(skb->dst);
518 skb->dst = NULL; 521 skb->dst = NULL;
519 522
523 /* drop conntrack reference */
524 nf_reset(skb);
525
520 spin_lock(&sk->sk_receive_queue.lock); 526 spin_lock(&sk->sk_receive_queue.lock);
521 po->stats.tp_packets++; 527 po->stats.tp_packets++;
522 __skb_queue_tail(&sk->sk_receive_queue, skb); 528 __skb_queue_tail(&sk->sk_receive_queue, skb);
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a17..dada34a77b21 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
138 138
139 _debug("### End Work"); 139 _debug("### End Work");
140 140
141 try_to_freeze(PF_FREEZE); 141 try_to_freeze();
142 142
143 /* discard pending signals */ 143 /* discard pending signals */
144 rxrpc_discard_my_signals(); 144 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d9228..1aadd026d354 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
107 107
108 _debug("### End Inbound Calls"); 108 _debug("### End Inbound Calls");
109 109
110 try_to_freeze(PF_FREEZE); 110 try_to_freeze();
111 111
112 /* discard pending signals */ 112 /* discard pending signals */
113 rxrpc_discard_my_signals(); 113 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290bb..3ac81cdd1211 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
90 complete_and_exit(&krxtimod_dead, 0); 90 complete_and_exit(&krxtimod_dead, 0);
91 } 91 }
92 92
93 try_to_freeze(PF_FREEZE); 93 try_to_freeze();
94 94
95 /* discard pending signals */ 95 /* discard pending signals */
96 rxrpc_discard_my_signals(); 96 rxrpc_discard_my_signals();
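
All three rxrpc daemons receive the same mechanical change: try_to_freeze() lost its PF_FREEZE argument and now inspects the current task itself. The loop shape these threads share, sketched with illustrative names (the real daemons predate the kthread API, so this is only the general pattern):

        static int daemon_sketch(void *arg)
        {
                for (;;) {
                        /* ... perform one batch of queued work ... */

                        try_to_freeze();        /* park here across suspend */

                        /* ... discard signals, sleep until woken ... */
                        if (kthread_should_stop())
                                break;
                }
                return 0;
        }
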
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b22c9beb604d..59d3e71f8b85 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,43 @@
1# 1#
2# Traffic control configuration. 2# Traffic control configuration.
3# 3#
4
5menuconfig NET_SCHED
6 bool "QoS and/or fair queueing"
7 ---help---
8 When the kernel has several packets to send out over a network
9 device, it has to decide which ones to send first, which ones to
10 delay, and which ones to drop. This is the job of the packet
11 scheduler, and several different algorithms for how to do this
12 "fairly" have been proposed.
13
14 If you say N here, you will get the standard packet scheduler, which
15 is a FIFO (first come, first served). If you say Y here, you will be
16 able to choose from among several alternative algorithms which can
17 then be attached to different network devices. This is useful for
18 example if some of your network devices are real time devices that
19 need a certain minimum data flow rate, or if you need to limit the
20 maximum data flow rate for traffic which matches specified criteria.
21 This code is considered to be experimental.
22
23 To administer these schedulers, you'll need the user-level utilities
24 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
25 That package also contains some documentation; for more, check out
26 <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
27
28 This Quality of Service (QoS) support will enable you to use
29 Differentiated Services (diffserv) and Resource Reservation Protocol
30 (RSVP) on your Linux router if you also say Y to "QoS support",
31 "Packet classifier API" and to some classifiers below. Documentation
 32	  and software are at <http://diffserv.sourceforge.net/>.
33
34 If you say Y here and to "/proc file system" below, you will be able
35 to read status information about packet schedulers from the file
36 /proc/net/psched.
37
38 The available schedulers are listed in the following questions; you
39 can say Y to as many as you like. If unsure, say N now.
40
4choice 41choice
5 prompt "Packet scheduler clock source" 42 prompt "Packet scheduler clock source"
6 depends on NET_SCHED 43 depends on NET_SCHED
@@ -449,6 +486,19 @@ config NET_EMATCH_META
449 To compile this code as a module, choose M here: the 486 To compile this code as a module, choose M here: the
450 module will be called em_meta. 487 module will be called em_meta.
451 488
489config NET_EMATCH_TEXT
490 tristate "Textsearch"
491 depends on NET_EMATCH
492 select TEXTSEARCH
493 select TEXTSEARCH_KMP
494 select TEXTSEARCH_FSM
495 ---help---
 496	  Say Y here if you want to be able to classify packets based on
497 textsearch comparisons.
498
499 To compile this code as a module, choose M here: the
500 module will be called em_text.
501
452config NET_CLS_ACT 502config NET_CLS_ACT
453 bool "Packet ACTION" 503 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS 504 depends on EXPERIMENTAL && NET_CLS && NET_QOS
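
The help text above describes the enqueue/dequeue split in prose; in code, a scheduler is just a pair of callbacks. A toy FIFO sketch under the same Qdisc_ops contract used by sch_blackhole.c later in this patch (the 100-packet limit is arbitrary):

        static int toyfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
        {
                if (skb_queue_len(&sch->q) < 100) {     /* delay: queue it */
                        __skb_queue_tail(&sch->q, skb);
                        return NET_XMIT_SUCCESS;
                }
                qdisc_drop(skb, sch);                   /* or drop it */
                return NET_XMIT_DROP;
        }

        static struct sk_buff *toyfifo_dequeue(struct Qdisc *sch)
        {
                return __skb_dequeue(&sch->q);          /* send first in line */
        }
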
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba8..e48d0d456b3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y := sch_generic.o 5obj-y := sch_generic.o
6 6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o 7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
8obj-$(CONFIG_NET_CLS) += cls_api.o 8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o 9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o 10obj-$(CONFIG_NET_ACT_POLICE) += police.o
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o 40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o 41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o 42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
43obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9594206e6035..249c61936ea0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -439,6 +439,8 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
439 439
440 t = NLMSG_DATA(nlh); 440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC; 441 t->tca_family = AF_UNSPEC;
442 t->tca__pad1 = 0;
443 t->tca__pad2 = 0;
442 444
443 x = (struct rtattr*) skb->tail; 445 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 446 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -580,6 +582,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); 582 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh); 583 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC; 584 t->tca_family = AF_UNSPEC;
585 t->tca__pad1 = 0;
586 t->tca__pad2 = 0;
583 587
584 x = (struct rtattr *) skb->tail; 588 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 589 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -687,7 +691,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
687 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); 691 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
688 t = NLMSG_DATA(nlh); 692 t = NLMSG_DATA(nlh);
689 t->tca_family = AF_UNSPEC; 693 t->tca_family = AF_UNSPEC;
690 694 t->tca__pad1 = 0;
695 t->tca__pad2 = 0;
696
691 x = (struct rtattr*) skb->tail; 697 x = (struct rtattr*) skb->tail;
692 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 698 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
693 699
@@ -842,6 +848,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
842 cb->nlh->nlmsg_type, sizeof(*t)); 848 cb->nlh->nlmsg_type, sizeof(*t));
843 t = NLMSG_DATA(nlh); 849 t = NLMSG_DATA(nlh);
844 t->tca_family = AF_UNSPEC; 850 t->tca_family = AF_UNSPEC;
851 t->tca__pad1 = 0;
852 t->tca__pad2 = 0;
845 853
846 x = (struct rtattr *) skb->tail; 854 x = (struct rtattr *) skb->tail;
847 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 855 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1616bf5c9627..3b5714ef4d1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -331,6 +331,8 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
332 tcm = NLMSG_DATA(nlh); 332 tcm = NLMSG_DATA(nlh);
333 tcm->tcm_family = AF_UNSPEC; 333 tcm->tcm_family = AF_UNSPEC;
334 tcm->tcm__pad1 = 0;
 335 tcm->tcm__pad2 = 0;
334 tcm->tcm_ifindex = tp->q->dev->ifindex; 336 tcm->tcm_ifindex = tp->q->dev->ifindex;
335 tcm->tcm_parent = tp->classid; 337 tcm->tcm_parent = tp->classid;
336 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); 338 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb9196810..006168d69376 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
618 pinfo.protocol = s->protocol; 618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid; 619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr; 620 pinfo.tunnelhdr = f->tunnelhdr;
621 pinfo.pad = 0;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); 622 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid) 623 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); 624 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a35a..53d98f8d3d80 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
205 dst->value = skb->protocol; 205 dst->value = skb->protocol;
206} 206}
207 207
208META_COLLECTOR(int_security)
209{
210 dst->value = skb->security;
211}
212
213META_COLLECTOR(int_pkttype) 208META_COLLECTOR(int_pkttype)
214{ 209{
215 dst->value = skb->pkt_type; 210 dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
524 [META_ID(REALDEV)] = META_FUNC(int_realdev), 519 [META_ID(REALDEV)] = META_FUNC(int_realdev),
525 [META_ID(PRIORITY)] = META_FUNC(int_priority), 520 [META_ID(PRIORITY)] = META_FUNC(int_priority),
526 [META_ID(PROTOCOL)] = META_FUNC(int_protocol), 521 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
527 [META_ID(SECURITY)] = META_FUNC(int_security),
528 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), 522 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
529 [META_ID(PKTLEN)] = META_FUNC(int_pktlen), 523 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
530 [META_ID(DATALEN)] = META_FUNC(int_datalen), 524 [META_ID(DATALEN)] = META_FUNC(int_datalen),
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000000..873840d8d072
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
1/*
2 * net/sched/em_text.c Textsearch ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/textsearch.h>
20#include <linux/tc_ematch/tc_em_text.h>
21#include <net/pkt_cls.h>
22
23struct text_match
24{
25 u16 from_offset;
26 u16 to_offset;
27 u8 from_layer;
28 u8 to_layer;
29 struct ts_config *config;
30};
31
32#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
33
34static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
35 struct tcf_pkt_info *info)
36{
37 struct text_match *tm = EM_TEXT_PRIV(m);
38 int from, to;
39 struct ts_state state;
40
41 from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
42 from += tm->from_offset;
43
44 to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
45 to += tm->to_offset;
46
47 return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
48}
49
50static int em_text_change(struct tcf_proto *tp, void *data, int len,
51 struct tcf_ematch *m)
52{
53 struct text_match *tm;
54 struct tcf_em_text *conf = data;
55 struct ts_config *ts_conf;
56 int flags = 0;
57
 58	printk(KERN_DEBUG "Configuring text: %s from %d:%d to %d:%d len %d\n",
 59	       conf->algo, conf->from_offset, conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len);
60
61 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
62 return -EINVAL;
63
64 if (conf->from_layer > conf->to_layer)
65 return -EINVAL;
66
67 if (conf->from_layer == conf->to_layer &&
68 conf->from_offset > conf->to_offset)
69 return -EINVAL;
70
71retry:
72 ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
73 conf->pattern_len, GFP_KERNEL, flags);
74
75 if (flags & TS_AUTOLOAD)
76 rtnl_lock();
77
78 if (IS_ERR(ts_conf)) {
79 if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
80 rtnl_unlock();
81 flags |= TS_AUTOLOAD;
82 goto retry;
83 } else
84 return PTR_ERR(ts_conf);
85 } else if (flags & TS_AUTOLOAD) {
86 textsearch_destroy(ts_conf);
87 return -EAGAIN;
88 }
89
90 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
91 if (tm == NULL) {
92 textsearch_destroy(ts_conf);
93 return -ENOBUFS;
94 }
95
96 tm->from_offset = conf->from_offset;
97 tm->to_offset = conf->to_offset;
98 tm->from_layer = conf->from_layer;
99 tm->to_layer = conf->to_layer;
100 tm->config = ts_conf;
101
102 m->datalen = sizeof(*tm);
103 m->data = (unsigned long) tm;
104
105 return 0;
106}
107
108static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
109{
110 textsearch_destroy(EM_TEXT_PRIV(m)->config);
111}
112
113static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
114{
115 struct text_match *tm = EM_TEXT_PRIV(m);
116 struct tcf_em_text conf;
117
 118	strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo));
119 conf.from_offset = tm->from_offset;
120 conf.to_offset = tm->to_offset;
121 conf.from_layer = tm->from_layer;
122 conf.to_layer = tm->to_layer;
123 conf.pattern_len = textsearch_get_pattern_len(tm->config);
124 conf.pad = 0;
125
126 RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
127 RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
128 return 0;
129
130rtattr_failure:
131 return -1;
132}
133
134static struct tcf_ematch_ops em_text_ops = {
135 .kind = TCF_EM_TEXT,
136 .change = em_text_change,
137 .match = em_text_match,
138 .destroy = em_text_destroy,
139 .dump = em_text_dump,
140 .owner = THIS_MODULE,
141 .link = LIST_HEAD_INIT(em_text_ops.link)
142};
143
144static int __init init_em_text(void)
145{
146 return tcf_em_register(&em_text_ops);
147}
148
149static void __exit exit_em_text(void)
150{
151 tcf_em_unregister(&em_text_ops);
152}
153
154MODULE_LICENSE("GPL");
155
156module_init(init_em_text);
157module_exit(exit_em_text);
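
em_text builds on the generic lib/textsearch API that the Kconfig entry above selects. A rough usage sketch outside of skb context ("kmp" is one of the selectable algorithms; the function and buffer names are illustrative):

        static int contains_get(const void *buf, unsigned int buf_len)
        {
                struct ts_config *conf;
                struct ts_state st;
                unsigned int pos;

                conf = textsearch_prepare("kmp", "GET ", 4, GFP_KERNEL,
                                          TS_AUTOLOAD);
                if (IS_ERR(conf))
                        return PTR_ERR(conf);

                pos = textsearch_find_continuous(conf, &st, buf, buf_len);
                textsearch_destroy(conf);

                return pos != UINT_MAX;         /* found at offset pos */
        }
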
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 97c1c75d5c78..b9a069af4a02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{ 399{
400 int err; 400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1]; 401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch; 402 struct Qdisc *sch;
404 struct Qdisc_ops *ops; 403 struct Qdisc_ops *ops;
405 int size;
406 404
407 ops = qdisc_lookup_ops(kind); 405 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD 406#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
437 if (ops == NULL) 435 if (ops == NULL)
438 goto err_out; 436 goto err_out;
439 437
440 /* ensure that the Qdisc and the private data are 32-byte aligned */ 438 sch = qdisc_alloc(dev, ops);
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 439 if (IS_ERR(sch)) {
442 size += ops->priv_size + QDISC_ALIGN_CONST; 440 err = PTR_ERR(sch);
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2; 441 goto err_out2;
448 memset(p, 0, size); 442 }
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455 443
456 if (handle == TC_H_INGRESS) 444 if (handle == TC_H_INGRESS) {
457 sch->flags |= TCQ_F_INGRESS; 445 sch->flags |= TCQ_F_INGRESS;
458 446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
459 sch->ops = ops; 447 } else if (handle == 0) {
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev); 448 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM; 449 err = -ENOMEM;
469 if (handle == 0) 450 if (handle == 0)
470 goto err_out3; 451 goto err_out3;
471 } 452 }
472 453
473 if (handle == TC_H_INGRESS) 454 sch->handle = handle;
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477 455
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
479 qdisc_lock_tree(dev); 474 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list); 475 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev); 476 qdisc_unlock_tree(dev);
482 477
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch; 478 return sch;
489 } 479 }
490err_out3: 480err_out3:
491 dev_put(dev); 481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
492err_out2: 483err_out2:
493 module_put(ops->owner); 484 module_put(ops->owner);
494err_out: 485err_out:
495 *errp = err; 486 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL; 487 return NULL;
499} 488}
500 489
@@ -770,6 +759,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
770 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
771 tcm = NLMSG_DATA(nlh); 760 tcm = NLMSG_DATA(nlh);
772 tcm->tcm_family = AF_UNSPEC; 761 tcm->tcm_family = AF_UNSPEC;
762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
773 tcm->tcm_ifindex = q->dev->ifindex; 764 tcm->tcm_ifindex = q->dev->ifindex;
774 tcm->tcm_parent = clid; 765 tcm->tcm_parent = clid;
775 tcm->tcm_handle = q->handle; 766 tcm->tcm_handle = q->handle;
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000000..81f0b8346d17
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
1/*
2 * net/sched/sch_blackhole.c Black hole queue
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * Note: Quantum tunneling is not supported.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h>
20#include <net/pkt_sched.h>
21
22static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
23{
24 qdisc_drop(skb, sch);
25 return NET_XMIT_SUCCESS;
26}
27
28static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
29{
30 return NULL;
31}
32
33static struct Qdisc_ops blackhole_qdisc_ops = {
34 .id = "blackhole",
35 .priv_size = 0,
36 .enqueue = blackhole_enqueue,
37 .dequeue = blackhole_dequeue,
38 .owner = THIS_MODULE,
39};
40
41static int __init blackhole_module_init(void)
42{
43 return register_qdisc(&blackhole_qdisc_ops);
44}
45
46static void __exit blackhole_module_exit(void)
47{
48 unregister_qdisc(&blackhole_qdisc_ops);
49}
50
51module_init(blackhole_module_init)
52module_exit(blackhole_module_exit)
53
54MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d43e3b8cbf6a..09453f997d8c 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1528 1528
1529 opt.strategy = cl->ovl_strategy; 1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1; 1530 opt.priority2 = cl->priority2+1;
1531 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ; 1532 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1533 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len; 1534 return skb->len;
@@ -1563,6 +1564,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1563 1564
1564 if (cl->police) { 1565 if (cl->police) {
1565 opt.police = cl->police; 1566 opt.police = cl->police;
1567 opt.__res1 = 0;
1568 opt.__res2 = 0;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1569 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 } 1570 }
1568 return skb->len; 1571 return skb->len;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7683b34dc6a9..73e218e646ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
395 .owner = THIS_MODULE, 395 .owner = THIS_MODULE,
396}; 396};
397 397
398struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) 398struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
399{ 399{
400 void *p; 400 void *p;
401 struct Qdisc *sch; 401 struct Qdisc *sch;
402 int size; 402 unsigned int size;
403 int err = -ENOBUFS;
403 404
404 /* ensure that the Qdisc and the private data are 32-byte aligned */ 405 /* ensure that the Qdisc and the private data are 32-byte aligned */
405 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 406 size = QDISC_ALIGN(sizeof(*sch));
406 size += ops->priv_size + QDISC_ALIGN_CONST; 407 size += ops->priv_size + (QDISC_ALIGNTO - 1);
407 408
408 p = kmalloc(size, GFP_KERNEL); 409 p = kmalloc(size, GFP_KERNEL);
409 if (!p) 410 if (!p)
410 return NULL; 411 goto errout;
411 memset(p, 0, size); 412 memset(p, 0, size);
412 413 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
413 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 414 sch->padded = (char *) sch - (char *) p;
414 & ~QDISC_ALIGN_CONST);
415 sch->padded = (char *)sch - (char *)p;
416 415
417 INIT_LIST_HEAD(&sch->list); 416 INIT_LIST_HEAD(&sch->list);
418 skb_queue_head_init(&sch->q); 417 skb_queue_head_init(&sch->q);
@@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
423 dev_hold(dev); 422 dev_hold(dev);
424 sch->stats_lock = &dev->queue_lock; 423 sch->stats_lock = &dev->queue_lock;
425 atomic_set(&sch->refcnt, 1); 424 atomic_set(&sch->refcnt, 1);
425
426 return sch;
427errout:
 428 return ERR_PTR(err);	/* err is already negative (-ENOBUFS) */
429}
430
431struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
432{
433 struct Qdisc *sch;
434
435 sch = qdisc_alloc(dev, ops);
436 if (IS_ERR(sch))
437 goto errout;
438
426 if (!ops->init || ops->init(sch, NULL) == 0) 439 if (!ops->init || ops->init(sch, NULL) == 0)
427 return sch; 440 return sch;
428 441
429 dev_put(dev); 442errout:
430 kfree(p);
431 return NULL; 443 return NULL;
432} 444}
433 445
@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
591EXPORT_SYMBOL(noop_qdisc); 603EXPORT_SYMBOL(noop_qdisc);
592EXPORT_SYMBOL(noop_qdisc_ops); 604EXPORT_SYMBOL(noop_qdisc_ops);
593EXPORT_SYMBOL(qdisc_create_dflt); 605EXPORT_SYMBOL(qdisc_create_dflt);
606EXPORT_SYMBOL(qdisc_alloc);
594EXPORT_SYMBOL(qdisc_destroy); 607EXPORT_SYMBOL(qdisc_destroy);
595EXPORT_SYMBOL(qdisc_reset); 608EXPORT_SYMBOL(qdisc_reset);
596EXPORT_SYMBOL(qdisc_restart); 609EXPORT_SYMBOL(qdisc_restart);
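
qdisc_alloc() above leans on two macros (from include/net/pkt_sched.h, introduced alongside this change) that replace the open-coded QDISC_ALIGN_CONST arithmetic; reproduced here with a worked example, since the allocation size math is easy to misread:

        #define QDISC_ALIGNTO           32
        #define QDISC_ALIGN(len)        (((len) + QDISC_ALIGNTO - 1) & \
                                         ~(QDISC_ALIGNTO - 1))

        /* e.g. QDISC_ALIGN(140) == 160.  Allocating
         *   QDISC_ALIGN(sizeof(*sch)) + priv_size + (QDISC_ALIGNTO - 1)
         * guarantees that rounding the kmalloc() pointer up to the next
         * 32-byte boundary still leaves priv_size bytes for the private
         * area that follows the aligned struct Qdisc. */
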
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 664d0e47374f..7845d045eec4 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt)
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); 385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 386
387 q->qcount = -1; 387 q->qcount = -1;
388 if (skb_queue_len(&sch->q) == 0) 388 if (skb_queue_empty(&sch->q))
389 PSCHED_SET_PASTPERFECT(q->qidlestart); 389 PSCHED_SET_PASTPERFECT(q->qidlestart);
390 sch_tree_unlock(sch); 390 sch_tree_unlock(sch);
391 return 0; 391 return 0;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 7ae6aa772dab..5b24ae0650d3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
71 const struct sctp_endpoint *ep, 71 const struct sctp_endpoint *ep,
72 const struct sock *sk, 72 const struct sock *sk,
73 sctp_scope_t scope, 73 sctp_scope_t scope,
74 int gfp) 74 unsigned int __nocast gfp)
75{ 75{
76 struct sctp_sock *sp; 76 struct sctp_sock *sp;
77 int i; 77 int i;
@@ -203,7 +203,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
203 */ 203 */
204 asoc->addip_serial = asoc->c.initial_tsn; 204 asoc->addip_serial = asoc->c.initial_tsn;
205 205
206 skb_queue_head_init(&asoc->addip_chunks); 206 INIT_LIST_HEAD(&asoc->addip_chunk_list);
207 207
208 /* Make an empty list of remote transport addresses. */ 208 /* Make an empty list of remote transport addresses. */
209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list); 209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
@@ -272,7 +272,8 @@ fail_init:
272/* Allocate and initialize a new association */ 272/* Allocate and initialize a new association */
273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, 273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
274 const struct sock *sk, 274 const struct sock *sk,
275 sctp_scope_t scope, int gfp) 275 sctp_scope_t scope,
276 unsigned int __nocast gfp)
276{ 277{
277 struct sctp_association *asoc; 278 struct sctp_association *asoc;
278 279
@@ -478,7 +479,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
478/* Add a transport address to an association. */ 479/* Add a transport address to an association. */
479struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, 480struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
480 const union sctp_addr *addr, 481 const union sctp_addr *addr,
481 const int gfp, 482 const unsigned int __nocast gfp,
482 const int peer_state) 483 const int peer_state)
483{ 484{
484 struct sctp_transport *peer; 485 struct sctp_transport *peer;
@@ -1229,7 +1230,8 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
1229/* Build the bind address list for the association based on info from the 1230/* Build the bind address list for the association based on info from the
1230 * local endpoint and the remote peer. 1231 * local endpoint and the remote peer.
1231 */ 1232 */
1232int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) 1233int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
1234 unsigned int __nocast gfp)
1233{ 1235{
1234 sctp_scope_t scope; 1236 sctp_scope_t scope;
1235 int flags; 1237 int flags;
@@ -1251,7 +1253,8 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp)
1251 1253
1252/* Build the association's bind address list from the cookie. */ 1254/* Build the association's bind address list from the cookie. */
1253int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, 1255int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
1254 struct sctp_cookie *cookie, int gfp) 1256 struct sctp_cookie *cookie,
1257 unsigned int __nocast gfp)
1255{ 1258{
1256 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); 1259 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
1257 int var_size3 = cookie->raw_addr_list_len; 1260 int var_size3 = cookie->raw_addr_list_len;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f90eadfb60a2..f71549710f2e 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,8 @@
53 53
54/* Forward declarations for internal helpers. */ 54/* Forward declarations for internal helpers. */
55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, 55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
56 sctp_scope_t scope, int gfp, int flags); 56 sctp_scope_t scope, unsigned int __nocast gfp,
57 int flags);
57static void sctp_bind_addr_clean(struct sctp_bind_addr *); 58static void sctp_bind_addr_clean(struct sctp_bind_addr *);
58 59
59/* First Level Abstractions. */ 60/* First Level Abstractions. */
@@ -63,7 +64,8 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
63 */ 64 */
64int sctp_bind_addr_copy(struct sctp_bind_addr *dest, 65int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
65 const struct sctp_bind_addr *src, 66 const struct sctp_bind_addr *src,
66 sctp_scope_t scope, int gfp, int flags) 67 sctp_scope_t scope, unsigned int __nocast gfp,
68 int flags)
67{ 69{
68 struct sctp_sockaddr_entry *addr; 70 struct sctp_sockaddr_entry *addr;
69 struct list_head *pos; 71 struct list_head *pos;
@@ -144,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
144 146
145/* Add an address to the bind address list in the SCTP_bind_addr structure. */ 147/* Add an address to the bind address list in the SCTP_bind_addr structure. */
146int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, 148int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
147 int gfp) 149 unsigned int __nocast gfp)
148{ 150{
149 struct sctp_sockaddr_entry *addr; 151 struct sctp_sockaddr_entry *addr;
150 152
@@ -197,7 +199,8 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
197 * The second argument is the return value for the length. 199 * The second argument is the return value for the length.
198 */ 200 */
199union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, 201union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
200 int *addrs_len, int gfp) 202 int *addrs_len,
203 unsigned int __nocast gfp)
201{ 204{
202 union sctp_params addrparms; 205 union sctp_params addrparms;
203 union sctp_params retval; 206 union sctp_params retval;
@@ -249,7 +252,7 @@ end_raw:
249 * address parameters). 252 * address parameters).
250 */ 253 */
251int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, 254int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
252 int addrs_len, __u16 port, int gfp) 255 int addrs_len, __u16 port, unsigned int __nocast gfp)
253{ 256{
254 union sctp_addr_param *rawaddr; 257 union sctp_addr_param *rawaddr;
255 struct sctp_paramhdr *param; 258 struct sctp_paramhdr *param;
@@ -347,7 +350,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
347/* Copy out addresses from the global local address list. */ 350/* Copy out addresses from the global local address list. */
348static int sctp_copy_one_addr(struct sctp_bind_addr *dest, 351static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
349 union sctp_addr *addr, 352 union sctp_addr *addr,
350 sctp_scope_t scope, int gfp, int flags) 353 sctp_scope_t scope, unsigned int __nocast gfp,
354 int flags)
351{ 355{
352 int error = 0; 356 int error = 0;
353 357
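
The long run of SCTP prototype changes swaps plain `int gfp` for `unsigned int __nocast gfp`. The annotation is a sparse attribute, a no-op for gcc per linux/compiler.h, that lets the checker flag implicit conversions on allocation-flag parameters; it is the stepping stone to the later dedicated gfp_t type:

        #ifdef __CHECKER__
        # define __nocast       __attribute__((nocast))
        #else
        # define __nocast
        #endif
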
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0c2ab7885058..61da2937e641 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
62} 62}
63 63
64/* Allocate and initialize datamsg. */ 64/* Allocate and initialize datamsg. */
65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp) 65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
66{ 66{
67 struct sctp_datamsg *msg; 67 struct sctp_datamsg *msg;
68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp); 68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 2ec0320fac3b..e47ac0d1a6d6 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -67,7 +67,8 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
67 * Initialize the base fields of the endpoint structure. 67 * Initialize the base fields of the endpoint structure.
68 */ 68 */
69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, 69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
70 struct sock *sk, int gfp) 70 struct sock *sk,
71 unsigned int __nocast gfp)
71{ 72{
72 struct sctp_sock *sp = sctp_sk(sk); 73 struct sctp_sock *sp = sctp_sk(sk);
73 memset(ep, 0, sizeof(struct sctp_endpoint)); 74 memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -102,9 +103,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
102 /* Set up the base timeout information. */ 103 /* Set up the base timeout information. */
103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; 104 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = 105 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
105 SCTP_DEFAULT_TIMEOUT_T1_COOKIE; 106 msecs_to_jiffies(sp->rtoinfo.srto_initial);
106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = 107 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
107 SCTP_DEFAULT_TIMEOUT_T1_INIT; 108 msecs_to_jiffies(sp->rtoinfo.srto_initial);
108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = 109 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
109 msecs_to_jiffies(sp->rtoinfo.srto_initial); 110 msecs_to_jiffies(sp->rtoinfo.srto_initial);
110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; 111 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +118,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] 118 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); 119 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
119 120
120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 121 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
121 SCTP_DEFAULT_TIMEOUT_HEARTBEAT; 122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 123 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
123 SCTP_DEFAULT_TIMEOUT_SACK;
124 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
125 sp->autoclose * HZ;
126 124
127 /* Use SCTP specific send buffer space queues. */ 125 /* Use SCTP specific send buffer space queues. */
128 ep->sndbuf_policy = sctp_sndbuf_policy; 126 ep->sndbuf_policy = sctp_sndbuf_policy;
@@ -140,7 +138,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
140/* Create a sctp_endpoint with all that boring stuff initialized. 138/* Create a sctp_endpoint with all that boring stuff initialized.
141 * Returns NULL if there isn't enough memory. 139 * Returns NULL if there isn't enough memory.
142 */ 140 */
143struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp) 141struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
142 unsigned int __nocast gfp)
144{ 143{
145 struct sctp_endpoint *ep; 144 struct sctp_endpoint *ep;
146 145
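
The endpoint timeout table now derives from the socket's RTO configuration via msecs_to_jiffies() rather than from compile-time SCTP_DEFAULT_TIMEOUT_* constants, keeping the units explicit. The arithmetic, for reference (srto_initial is in milliseconds; 3000 ms is the protocol's usual RTO.Initial):

        /* with HZ == 250: 3000 ms * 250 / 1000 == 750 jiffies */
        unsigned long t1 = msecs_to_jiffies(3000);
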
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 339f7acfdb64..5e085e041a6e 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); 115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
116} 116}
117 117
118struct sctp_input_cb {
119 union {
120 struct inet_skb_parm h4;
121#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
122 struct inet6_skb_parm h6;
123#endif
124 } header;
125 struct sctp_chunk *chunk;
126};
127#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
128
118/* 129/*
119 * This is the routine which IP calls when receiving an SCTP packet. 130 * This is the routine which IP calls when receiving an SCTP packet.
120 */ 131 */
@@ -243,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb)
243 ret = -ENOMEM; 254 ret = -ENOMEM;
244 goto discard_release; 255 goto discard_release;
245 } 256 }
257 SCTP_INPUT_CB(skb)->chunk = chunk;
246 258
247 sctp_rcv_set_owner_r(skb,sk); 259 sctp_rcv_set_owner_r(skb,sk);
248 260
@@ -265,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb)
265 sctp_bh_lock_sock(sk); 277 sctp_bh_lock_sock(sk);
266 278
267 if (sock_owned_by_user(sk)) 279 if (sock_owned_by_user(sk))
268 sk_add_backlog(sk, (struct sk_buff *) chunk); 280 sk_add_backlog(sk, skb);
269 else 281 else
270 sctp_backlog_rcv(sk, (struct sk_buff *) chunk); 282 sctp_backlog_rcv(sk, skb);
271 283
272 /* Release the sock and any reference counts we took in the 284 /* Release the sock and any reference counts we took in the
273 * lookup calls. 285 * lookup calls.
@@ -302,14 +314,8 @@ discard_release:
302 */ 314 */
303int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) 315int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
304{ 316{
305 struct sctp_chunk *chunk; 317 struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
306 struct sctp_inq *inqueue; 318 struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
307
308 /* One day chunk will live inside the skb, but for
309 * now this works.
310 */
311 chunk = (struct sctp_chunk *) skb;
312 inqueue = &chunk->rcvr->inqueue;
313 319
314 sctp_inq_push(inqueue, chunk); 320 sctp_inq_push(inqueue, chunk);
315 return 0; 321 return 0;
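
The backlog fix above stops casting a struct sctp_chunk pointer to a struct sk_buff pointer (which only worked by accident of layout) and instead stashes the chunk in skb->cb, the 48-byte control buffer each layer may overlay while it owns the skb. The pattern, with hypothetical names:

        struct my_cb {
                struct my_state *state; /* must fit in skb->cb (48 bytes) */
        };
        #define MY_CB(skb) ((struct my_cb *)&((skb)->cb[0]))

        /* producer, while queueing: */
        MY_CB(skb)->state = st;
        /* consumer, e.g. in the backlog handler: */
        st = MY_CB(skb)->state;
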
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cedf4351556c..2d33922c044b 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -50,7 +50,7 @@
50/* Initialize an SCTP inqueue. */ 50/* Initialize an SCTP inqueue. */
51void sctp_inq_init(struct sctp_inq *queue) 51void sctp_inq_init(struct sctp_inq *queue)
52{ 52{
53 skb_queue_head_init(&queue->in); 53 INIT_LIST_HEAD(&queue->in_chunk_list);
54 queue->in_progress = NULL; 54 queue->in_progress = NULL;
55 55
56 /* Create a task for delivering data. */ 56 /* Create a task for delivering data. */
@@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue)
62/* Release the memory associated with an SCTP inqueue. */ 62/* Release the memory associated with an SCTP inqueue. */
63void sctp_inq_free(struct sctp_inq *queue) 63void sctp_inq_free(struct sctp_inq *queue)
64{ 64{
65 struct sctp_chunk *chunk; 65 struct sctp_chunk *chunk, *tmp;
66 66
67 /* Empty the queue. */ 67 /* Empty the queue. */
68 while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL) 68 list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
69 list_del_init(&chunk->list);
69 sctp_chunk_free(chunk); 70 sctp_chunk_free(chunk);
71 }
70 72
71 /* If there is a packet which is currently being worked on, 73 /* If there is a packet which is currently being worked on,
72 * free it as well. 74 * free it as well.
@@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
92 * Eventually, we should clean up inqueue to not rely 94 * Eventually, we should clean up inqueue to not rely
93 * on the BH related data structures. 95 * on the BH related data structures.
94 */ 96 */
95 skb_queue_tail(&(q->in), (struct sk_buff *) packet); 97 list_add_tail(&packet->list, &q->in_chunk_list);
96 q->immediate.func(q->immediate.data); 98 q->immediate.func(q->immediate.data);
97} 99}
98 100
@@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
131 133
132 /* Do we need to take the next packet out of the queue to process? */ 134 /* Do we need to take the next packet out of the queue to process? */
133 if (!chunk) { 135 if (!chunk) {
136 struct list_head *entry;
137
134 /* Is the queue empty? */ 138 /* Is the queue empty? */
135 if (skb_queue_empty(&queue->in)) 139 if (list_empty(&queue->in_chunk_list))
136 return NULL; 140 return NULL;
137 141
142 entry = queue->in_chunk_list.next;
138 chunk = queue->in_progress = 143 chunk = queue->in_progress =
139 (struct sctp_chunk *) skb_dequeue(&queue->in); 144 list_entry(entry, struct sctp_chunk, list);
145 list_del_init(entry);
140 146
141 /* This is the first chunk in the packet. */ 147 /* This is the first chunk in the packet. */
142 chunk->singleton = 1; 148 chunk->singleton = 1;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 84b5b370b09d..931371633464 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
108 packet->transport = transport; 108 packet->transport = transport;
109 packet->source_port = sport; 109 packet->source_port = sport;
110 packet->destination_port = dport; 110 packet->destination_port = dport;
111 skb_queue_head_init(&packet->chunks); 111 INIT_LIST_HEAD(&packet->chunk_list);
112 if (asoc) { 112 if (asoc) {
113 struct sctp_sock *sp = sctp_sk(asoc->base.sk); 113 struct sctp_sock *sp = sctp_sk(asoc->base.sk);
114 overhead = sp->pf->af->net_header_len; 114 overhead = sp->pf->af->net_header_len;
@@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
129/* Free a packet. */ 129/* Free a packet. */
130void sctp_packet_free(struct sctp_packet *packet) 130void sctp_packet_free(struct sctp_packet *packet)
131{ 131{
132 struct sctp_chunk *chunk; 132 struct sctp_chunk *chunk, *tmp;
133 133
134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
135 135
136 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) 136 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
137 list_del_init(&chunk->list);
137 sctp_chunk_free(chunk); 138 sctp_chunk_free(chunk);
139 }
138 140
139 if (packet->malloced) 141 if (packet->malloced)
140 kfree(packet); 142 kfree(packet);
@@ -276,7 +278,7 @@ append:
276 packet->has_sack = 1; 278 packet->has_sack = 1;
277 279
278 /* It is OK to send this chunk. */ 280 /* It is OK to send this chunk. */
279 __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk); 281 list_add_tail(&chunk->list, &packet->chunk_list);
280 packet->size += chunk_len; 282 packet->size += chunk_len;
281 chunk->transport = packet->transport; 283 chunk->transport = packet->transport;
282finish: 284finish:
@@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
295 struct sctphdr *sh; 297 struct sctphdr *sh;
296 __u32 crc32; 298 __u32 crc32;
297 struct sk_buff *nskb; 299 struct sk_buff *nskb;
298 struct sctp_chunk *chunk; 300 struct sctp_chunk *chunk, *tmp;
299 struct sock *sk; 301 struct sock *sk;
300 int err = 0; 302 int err = 0;
301 int padding; /* How much padding do we need? */ 303 int padding; /* How much padding do we need? */
@@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet)
305 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 307 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
306 308
307 /* Do NOT generate a chunkless packet. */ 309 /* Do NOT generate a chunkless packet. */
308 chunk = (struct sctp_chunk *)skb_peek(&packet->chunks); 310 if (list_empty(&packet->chunk_list))
309 if (unlikely(!chunk))
310 return err; 311 return err;
311 312
312 /* Set up convenience variables... */ 313 /* Set up convenience variables... */
314 chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
313 sk = chunk->skb->sk; 315 sk = chunk->skb->sk;
314 316
315 /* Allocate the new skb. */ 317 /* Allocate the new skb. */
@@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
370 * [This whole comment explains WORD_ROUND() below.] 372 * [This whole comment explains WORD_ROUND() below.]
371 */ 373 */
372 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); 374 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n");
373 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 375 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
376 list_del_init(&chunk->list);
374 if (sctp_chunk_is_data(chunk)) { 377 if (sctp_chunk_is_data(chunk)) {
375 378
376 if (!chunk->has_tsn) { 379 if (!chunk->has_tsn) {
@@ -511,7 +514,8 @@ err:
511 * will get resent or dropped later. 514 * will get resent or dropped later.
512 */ 515 */
513 516
514 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 517 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
518 list_del_init(&chunk->list);
515 if (!sctp_chunk_is_data(chunk)) 519 if (!sctp_chunk_is_data(chunk))
516 sctp_chunk_free(chunk); 520 sctp_chunk_free(chunk);
517 } 521 }
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 4eb81a1407b7..efb72faba20c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
75static inline void sctp_outq_head_data(struct sctp_outq *q, 75static inline void sctp_outq_head_data(struct sctp_outq *q,
76 struct sctp_chunk *ch) 76 struct sctp_chunk *ch)
77{ 77{
78 __skb_queue_head(&q->out, (struct sk_buff *)ch); 78 list_add(&ch->list, &q->out_chunk_list);
79 q->out_qlen += ch->skb->len; 79 q->out_qlen += ch->skb->len;
80 return; 80 return;
81} 81}
@@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
83/* Take data from the front of the queue. */ 83/* Take data from the front of the queue. */
84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) 84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
85{ 85{
86 struct sctp_chunk *ch; 86 struct sctp_chunk *ch = NULL;
87 ch = (struct sctp_chunk *)__skb_dequeue(&q->out); 87
88 if (ch) 88 if (!list_empty(&q->out_chunk_list)) {
89 struct list_head *entry = q->out_chunk_list.next;
90
91 ch = list_entry(entry, struct sctp_chunk, list);
92 list_del_init(entry);
89 q->out_qlen -= ch->skb->len; 93 q->out_qlen -= ch->skb->len;
94 }
90 return ch; 95 return ch;
91} 96}
92/* Add data chunk to the end of the queue. */ 97/* Add data chunk to the end of the queue. */
93static inline void sctp_outq_tail_data(struct sctp_outq *q, 98static inline void sctp_outq_tail_data(struct sctp_outq *q,
94 struct sctp_chunk *ch) 99 struct sctp_chunk *ch)
95{ 100{
96 __skb_queue_tail(&q->out, (struct sk_buff *)ch); 101 list_add_tail(&ch->list, &q->out_chunk_list);
97 q->out_qlen += ch->skb->len; 102 q->out_qlen += ch->skb->len;
98 return; 103 return;
99} 104}
@@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary,
197void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) 202void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
198{ 203{
199 q->asoc = asoc; 204 q->asoc = asoc;
200 skb_queue_head_init(&q->out); 205 INIT_LIST_HEAD(&q->out_chunk_list);
201 skb_queue_head_init(&q->control); 206 INIT_LIST_HEAD(&q->control_chunk_list);
202 INIT_LIST_HEAD(&q->retransmit); 207 INIT_LIST_HEAD(&q->retransmit);
203 INIT_LIST_HEAD(&q->sacked); 208 INIT_LIST_HEAD(&q->sacked);
204 INIT_LIST_HEAD(&q->abandoned); 209 INIT_LIST_HEAD(&q->abandoned);
@@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q)
217{ 222{
218 struct sctp_transport *transport; 223 struct sctp_transport *transport;
219 struct list_head *lchunk, *pos, *temp; 224 struct list_head *lchunk, *pos, *temp;
220 struct sctp_chunk *chunk; 225 struct sctp_chunk *chunk, *tmp;
221 226
222 /* Throw away unacknowledged chunks. */ 227 /* Throw away unacknowledged chunks. */
223 list_for_each(pos, &q->asoc->peer.transport_addr_list) { 228 list_for_each(pos, &q->asoc->peer.transport_addr_list) {
@@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q)
269 q->error = 0; 274 q->error = 0;
270 275
271 /* Throw away any leftover control chunks. */ 276 /* Throw away any leftover control chunks. */
272 while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL) 277 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
278 list_del_init(&chunk->list);
273 sctp_chunk_free(chunk); 279 sctp_chunk_free(chunk);
280 }
274} 281}
275 282
276/* Free the outqueue structure and any related pending chunks. */ 283/* Free the outqueue structure and any related pending chunks. */
@@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
333 break; 340 break;
334 }; 341 };
335 } else { 342 } else {
336 __skb_queue_tail(&q->control, (struct sk_buff *) chunk); 343 list_add_tail(&chunk->list, &q->control_chunk_list);
337 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 344 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
338 } 345 }
339 346
@@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
650 __u16 sport = asoc->base.bind_addr.port; 657 __u16 sport = asoc->base.bind_addr.port;
651 __u16 dport = asoc->peer.port; 658 __u16 dport = asoc->peer.port;
652 __u32 vtag = asoc->peer.i.init_tag; 659 __u32 vtag = asoc->peer.i.init_tag;
653 struct sk_buff_head *queue;
654 struct sctp_transport *transport = NULL; 660 struct sctp_transport *transport = NULL;
655 struct sctp_transport *new_transport; 661 struct sctp_transport *new_transport;
656 struct sctp_chunk *chunk; 662 struct sctp_chunk *chunk, *tmp;
657 sctp_xmit_t status; 663 sctp_xmit_t status;
658 int error = 0; 664 int error = 0;
659 int start_timer = 0; 665 int start_timer = 0;
@@ -675,8 +681,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
675 * ... 681 * ...
676 */ 682 */
677 683
678 queue = &q->control; 684 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
679 while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) { 685 list_del_init(&chunk->list);
686
680 /* Pick the right transport to use. */ 687 /* Pick the right transport to use. */
681 new_transport = chunk->transport; 688 new_transport = chunk->transport;
682 689
@@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
814 821
815 /* Finally, transmit new packets. */ 822 /* Finally, transmit new packets. */
816 start_timer = 0; 823 start_timer = 0;
817 queue = &q->out;
818
819 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { 824 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
820 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid 825 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
821 * stream identifier. 826 * stream identifier.
@@ -1149,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
1149 /* See if all chunks are acked. 1154 /* See if all chunks are acked.
1150 * Make sure the empty queue handler will get run later. 1155 * Make sure the empty queue handler will get run later.
1151 */ 1156 */
1152 q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) && 1157 q->empty = (list_empty(&q->out_chunk_list) &&
1153 list_empty(&q->retransmit); 1158 list_empty(&q->control_chunk_list) &&
1159 list_empty(&q->retransmit));
1154 if (!q->empty) 1160 if (!q->empty)
1155 goto finish; 1161 goto finish;
1156 1162
@@ -1679,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1679 if (TSN_lte(tsn, ctsn)) { 1685 if (TSN_lte(tsn, ctsn)) {
1680 list_del_init(lchunk); 1686 list_del_init(lchunk);
1681 if (!chunk->tsn_gap_acked) { 1687 if (!chunk->tsn_gap_acked) {
1682 chunk->transport->flight_size -= 1688 chunk->transport->flight_size -=
1683 sctp_data_size(chunk); 1689 sctp_data_size(chunk);
1684 q->outstanding_bytes -= sctp_data_size(chunk); 1690 q->outstanding_bytes -= sctp_data_size(chunk);
1685 } 1691 }
1686 sctp_chunk_free(chunk); 1692 sctp_chunk_free(chunk);
1687 } else { 1693 } else {
@@ -1729,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1729 nskips, &ftsn_skip_arr[0]); 1735 nskips, &ftsn_skip_arr[0]);
1730 1736
1731 if (ftsn_chunk) { 1737 if (ftsn_chunk) {
1732 __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk); 1738 list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
1733 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 1739 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
1734 } 1740 }
1735} 1741}
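
All of these sctp queue conversions replace sk_buff_head queues of casted chunk pointers with list_heads embedded in struct sctp_chunk. Recovering the chunk from its list node is plain container arithmetic; list_entry() is container_of(), roughly:

        #define list_entry(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        /* e.g. given struct list_head *entry taken off out_chunk_list: */
        chunk = list_entry(entry, struct sctp_chunk, list);
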
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5135e1a25d25..ce9245e71fca 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
219 219
220/* Copy the local addresses which are valid for 'scope' into 'bp'. */ 220/* Copy the local addresses which are valid for 'scope' into 'bp'. */
221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, 221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
222 int gfp, int copy_flags) 222 unsigned int __nocast gfp, int copy_flags)
223{ 223{
224 struct sctp_sockaddr_entry *addr; 224 struct sctp_sockaddr_entry *addr;
225 int error = 0; 225 int error = 0;
@@ -1050,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void)
1050 sctp_sndbuf_policy = 0; 1050 sctp_sndbuf_policy = 0;
1051 1051
1052 /* HB.interval - 30 seconds */ 1052 /* HB.interval - 30 seconds */
1053 sctp_hb_interval = 30 * HZ; 1053 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
1054
1055 /* delayed SACK timeout */
1056 sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
1054 1057
1055 /* Implementation specific variables. */ 1058 /* Implementation specific variables. */
1056 1059
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5baed9bb7de5..00d32b7c8266 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
78static int sctp_process_param(struct sctp_association *asoc, 78static int sctp_process_param(struct sctp_association *asoc,
79 union sctp_params param, 79 union sctp_params param,
80 const union sctp_addr *peer_addr, 80 const union sctp_addr *peer_addr,
81 int gfp); 81 unsigned int __nocast gfp);
82 82
83/* What was the inbound interface for this chunk? */ 83/* What was the inbound interface for this chunk? */
84int sctp_chunk_iif(const struct sctp_chunk *chunk) 84int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
174 */ 174 */
175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, 175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
176 const struct sctp_bind_addr *bp, 176 const struct sctp_bind_addr *bp,
177 int gfp, int vparam_len) 177 unsigned int __nocast gfp, int vparam_len)
178{ 178{
179 sctp_inithdr_t init; 179 sctp_inithdr_t init;
180 union sctp_params addrs; 180 union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
261 261
262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, 262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
263 const struct sctp_chunk *chunk, 263 const struct sctp_chunk *chunk,
264 int gfp, int unkparam_len) 264 unsigned int __nocast gfp, int unkparam_len)
265{ 265{
266 sctp_inithdr_t initack; 266 sctp_inithdr_t initack;
267 struct sctp_chunk *retval; 267 struct sctp_chunk *retval;
@@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); 1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb);
1004 } 1004 }
1005 1005
1006 INIT_LIST_HEAD(&retval->list);
1006 retval->skb = skb; 1007 retval->skb = skb;
1007 retval->asoc = (struct sctp_association *)asoc; 1008 retval->asoc = (struct sctp_association *)asoc;
1008 retval->resent = 0; 1009 retval->resent = 0;
@@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk)
1116/* Possibly, free the chunk. */ 1117/* Possibly, free the chunk. */
1117void sctp_chunk_free(struct sctp_chunk *chunk) 1118void sctp_chunk_free(struct sctp_chunk *chunk)
1118{ 1119{
1119 /* Make sure that we are not on any list. */ 1120 BUG_ON(!list_empty(&chunk->list));
1120 skb_unlink((struct sk_buff *) chunk);
1121 list_del_init(&chunk->transmitted_list); 1121 list_del_init(&chunk->transmitted_list);
1122 1122
1123 /* Release our reference on the message tracker. */ 1123 /* Release our reference on the message tracker. */
@@ -1233,7 +1233,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
1233 1233
1234/* Create a CLOSED association to use with an incoming packet. */ 1234/* Create a CLOSED association to use with an incoming packet. */
1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, 1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
1236 struct sctp_chunk *chunk, int gfp) 1236 struct sctp_chunk *chunk,
1237 unsigned int __nocast gfp)
1237{ 1238{
1238 struct sctp_association *asoc; 1239 struct sctp_association *asoc;
1239 struct sk_buff *skb; 1240 struct sk_buff *skb;
@@ -1348,7 +1349,7 @@ nodata:
1348struct sctp_association *sctp_unpack_cookie( 1349struct sctp_association *sctp_unpack_cookie(
1349 const struct sctp_endpoint *ep, 1350 const struct sctp_endpoint *ep,
1350 const struct sctp_association *asoc, 1351 const struct sctp_association *asoc,
1351 struct sctp_chunk *chunk, int gfp, 1352 struct sctp_chunk *chunk, unsigned int __nocast gfp,
1352 int *error, struct sctp_chunk **errp) 1353 int *error, struct sctp_chunk **errp)
1353{ 1354{
1354 struct sctp_association *retval = NULL; 1355 struct sctp_association *retval = NULL;
@@ -1812,7 +1813,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
1812 */ 1813 */
1813int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, 1814int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1814 const union sctp_addr *peer_addr, 1815 const union sctp_addr *peer_addr,
1815 sctp_init_chunk_t *peer_init, int gfp) 1816 sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
1816{ 1817{
1817 union sctp_params param; 1818 union sctp_params param;
1818 struct sctp_transport *transport; 1819 struct sctp_transport *transport;
@@ -1983,7 +1984,7 @@ nomem:
1983static int sctp_process_param(struct sctp_association *asoc, 1984static int sctp_process_param(struct sctp_association *asoc,
1984 union sctp_params param, 1985 union sctp_params param,
1985 const union sctp_addr *peer_addr, 1986 const union sctp_addr *peer_addr,
1986 int gfp) 1987 unsigned int __nocast gfp)
1987{ 1988{
1988 union sctp_addr addr; 1989 union sctp_addr addr;
1989 int i; 1990 int i;
@@ -2739,8 +2740,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
2739 asoc->addip_last_asconf = NULL; 2740 asoc->addip_last_asconf = NULL;
2740 2741
2741 /* Send the next asconf chunk from the addip chunk queue. */ 2742 /* Send the next asconf chunk from the addip chunk queue. */
2742 asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks); 2743 if (!list_empty(&asoc->addip_chunk_list)) {
2743 if (asconf) { 2744 struct list_head *entry = asoc->addip_chunk_list.next;
2745 asconf = list_entry(entry, struct sctp_chunk, list);
2746
2747 list_del_init(entry);
2748
2744 /* Hold the chunk until an ASCONF_ACK is received. */ 2749 /* Hold the chunk until an ASCONF_ACK is received. */
2745 sctp_chunk_hold(asconf); 2750 sctp_chunk_hold(asconf);
2746 if (sctp_primitive_ASCONF(asoc, asconf)) 2751 if (sctp_primitive_ASCONF(asoc, asconf))
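With the addip queue converted to a plain list, the head chunk is popped with list_entry() rather than by casting the chunk to a struct sk_buff for __skb_dequeue(). The idiom in isolation, assuming chunks linked through their new ->list member:

    static LIST_HEAD(q);            /* queue of pending chunks */
    struct sctp_chunk *c = NULL;

    if (!list_empty(&q)) {
            struct list_head *entry = q.next;       /* queue head */
            c = list_entry(entry, struct sctp_chunk, list);
            list_del_init(entry);                   /* unlink before use */
    }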
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 778639db125a..39c970b5b198 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
63 void *event_arg, 63 void *event_arg,
64 sctp_disposition_t status, 64 sctp_disposition_t status,
65 sctp_cmd_seq_t *commands, 65 sctp_cmd_seq_t *commands,
66 int gfp); 66 unsigned int __nocast gfp);
67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, 67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
68 sctp_state_t state, 68 sctp_state_t state,
69 struct sctp_endpoint *ep, 69 struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
71 void *event_arg, 71 void *event_arg,
72 sctp_disposition_t status, 72 sctp_disposition_t status,
73 sctp_cmd_seq_t *commands, 73 sctp_cmd_seq_t *commands,
74 int gfp); 74 unsigned int __nocast gfp);
75 75
76/******************************************************************** 76/********************************************************************
77 * Helper functions 77 * Helper functions
@@ -497,7 +497,8 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, 497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
498 struct sctp_association *asoc, 498 struct sctp_association *asoc,
499 struct sctp_chunk *chunk, 499 struct sctp_chunk *chunk,
500 sctp_init_chunk_t *peer_init, int gfp) 500 sctp_init_chunk_t *peer_init,
501 unsigned int __nocast gfp)
501{ 502{
502 int error; 503 int error;
503 504
@@ -852,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
852 struct sctp_endpoint *ep, 853 struct sctp_endpoint *ep,
853 struct sctp_association *asoc, 854 struct sctp_association *asoc,
854 void *event_arg, 855 void *event_arg,
855 int gfp) 856 unsigned int __nocast gfp)
856{ 857{
857 sctp_cmd_seq_t commands; 858 sctp_cmd_seq_t commands;
858 const sctp_sm_table_entry_t *state_fn; 859 const sctp_sm_table_entry_t *state_fn;
@@ -897,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
897 void *event_arg, 898 void *event_arg,
898 sctp_disposition_t status, 899 sctp_disposition_t status,
899 sctp_cmd_seq_t *commands, 900 sctp_cmd_seq_t *commands,
900 int gfp) 901 unsigned int __nocast gfp)
901{ 902{
902 int error; 903 int error;
903 904
@@ -985,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
985 void *event_arg, 986 void *event_arg,
986 sctp_disposition_t status, 987 sctp_disposition_t status,
987 sctp_cmd_seq_t *commands, 988 sctp_cmd_seq_t *commands,
988 int gfp) 989 unsigned int __nocast gfp)
989{ 990{
990 int error = 0; 991 int error = 0;
991 int force; 992 int force;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 058189684c7c..86073df418f5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
92 sctp_cmd_seq_t *commands); 92 sctp_cmd_seq_t *commands);
93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); 93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
94 94
95static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
96 __u16 error,
97 const struct sctp_association *asoc,
98 struct sctp_transport *transport);
99
100static sctp_disposition_t sctp_sf_violation_chunklen(
101 const struct sctp_endpoint *ep,
102 const struct sctp_association *asoc,
103 const sctp_subtype_t type,
104 void *arg,
105 sctp_cmd_seq_t *commands);
95 106
96/* Small helper function that checks if the chunk length 107/* Small helper function that checks if the chunk length
97 * is of the appropriate length. The 'required_length' argument 108 * is of the appropriate length. The 'required_length' argument
@@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
2328 * 2339 *
2329 * This is common code called by several sctp_sf_*_abort() functions above. 2340 * This is common code called by several sctp_sf_*_abort() functions above.
2330 */ 2341 */
2331sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, 2342static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
2332 __u16 error, 2343 __u16 error,
2333 const struct sctp_association *asoc, 2344 const struct sctp_association *asoc,
2334 struct sctp_transport *transport) 2345 struct sctp_transport *transport)
@@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
3687 * 3698 *
3688 * Generate an ABORT chunk and terminate the association. 3699 * Generate an ABORT chunk and terminate the association.
3689 */ 3700 */
3690sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, 3701static sctp_disposition_t sctp_sf_violation_chunklen(
3702 const struct sctp_endpoint *ep,
3691 const struct sctp_association *asoc, 3703 const struct sctp_association *asoc,
3692 const sctp_subtype_t type, 3704 const sctp_subtype_t type,
3693 void *arg, 3705 void *arg,
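sctp_stop_t1_and_abort() and sctp_sf_violation_chunklen() lose their global linkage here, so the file gains static forward declarations (the block added at the top of sm_statefuns.c) that must match the later definitions exactly. The pattern in miniature:

    static int demo_helper(int x);          /* forward declaration */

    int demo_caller(void)
    {
            return demo_helper(1);          /* used before its definition */
    }

    static int demo_helper(int x)           /* definition: same prototype */
    {
            return x + 1;
    }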
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aad55dc3792b..091a66f06a35 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -406,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc,
406 * transmission. 406 * transmission.
407 */ 407 */
408 if (asoc->addip_last_asconf) { 408 if (asoc->addip_last_asconf) {
409 __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk); 409 list_add_tail(&chunk->list, &asoc->addip_chunk_list);
410 goto out; 410 goto out;
411 } 411 }
412 412
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index e627d2b451b6..25037daf3fa0 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -57,7 +57,8 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
57/* Create a new sctp_ssnmap. 57/* Create a new sctp_ssnmap.
58 * Allocate room to store at least 'len' contiguous TSNs. 58 * Allocate room to store at least 'len' contiguous TSNs.
59 */ 59 */
60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp) 60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
61 unsigned int __nocast gfp)
61{ 62{
62 struct sctp_ssnmap *retval; 63 struct sctp_ssnmap *retval;
63 int size; 64 int size;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc31849312b..dc4893474f18 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -47,6 +47,8 @@
47static ctl_handler sctp_sysctl_jiffies_ms; 47static ctl_handler sctp_sysctl_jiffies_ms;
48static long rto_timer_min = 1; 48static long rto_timer_min = 1;
49static long rto_timer_max = 86400000; /* One day */ 49static long rto_timer_max = 86400000; /* One day */
50static long sack_timer_min = 1;
51static long sack_timer_max = 500;
50 52
51static ctl_table sctp_table[] = { 53static ctl_table sctp_table[] = {
52 { 54 {
@@ -187,6 +189,17 @@ static ctl_table sctp_table[] = {
187 .mode = 0644, 189 .mode = 0644,
188 .proc_handler = &proc_dointvec 190 .proc_handler = &proc_dointvec
189 }, 191 },
192 {
193 .ctl_name = NET_SCTP_SACK_TIMEOUT,
194 .procname = "sack_timeout",
195 .data = &sctp_sack_timeout,
196 .maxlen = sizeof(long),
197 .mode = 0644,
198 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
199 .strategy = &sctp_sysctl_jiffies_ms,
200 .extra1 = &sack_timer_min,
201 .extra2 = &sack_timer_max,
202 },
190 { .ctl_name = 0 } 203 { .ctl_name = 0 }
191}; 204};
192 205
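The new sysctl stores the timeout in jiffies but exposes it in milliseconds, clamped to the [sack_timer_min, sack_timer_max] window by proc_doulongvec_ms_jiffies_minmax. A stripped-down sketch of such an entry; the names below are illustrative, and the real entry additionally carries .ctl_name = NET_SCTP_SACK_TIMEOUT and the module's jiffies-ms binary strategy handler:

    static long demo_min = 1;       /* lower clamp, in ms */
    static long demo_max = 500;     /* upper clamp, in ms */
    static long demo_timeout;       /* stored in jiffies */

    static ctl_table demo_table[] = {
            {
                    .procname     = "demo_timeout",
                    .data         = &demo_timeout,
                    .maxlen       = sizeof(long),
                    .mode         = 0644,
                    /* proc I/O in ms; converts and clamps on write */
                    .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
                    .extra1       = &demo_min,
                    .extra2       = &demo_max,
            },
            { .ctl_name = 0 }
    };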
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 0ec0fde6e6c5..d2f04ebe5081 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
57/* Initialize a new transport from provided memory. */ 57/* Initialize a new transport from provided memory. */
58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, 58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
59 const union sctp_addr *addr, 59 const union sctp_addr *addr,
60 int gfp) 60 unsigned int __nocast gfp)
61{ 61{
62 /* Copy in the address. */ 62 /* Copy in the address. */
63 peer->ipaddr = *addr; 63 peer->ipaddr = *addr;
@@ -103,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
103 103
104 /* Set up the heartbeat timer. */ 104 /* Set up the heartbeat timer. */
105 init_timer(&peer->hb_timer); 105 init_timer(&peer->hb_timer);
106 peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
107 peer->hb_timer.function = sctp_generate_heartbeat_event; 106 peer->hb_timer.function = sctp_generate_heartbeat_event;
108 peer->hb_timer.data = (unsigned long)peer; 107 peer->hb_timer.data = (unsigned long)peer;
109 108
@@ -122,7 +121,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
122} 121}
123 122
124/* Allocate and initialize a new transport. */ 123/* Allocate and initialize a new transport. */
125struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp) 124struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
125 unsigned int __nocast gfp)
126{ 126{
127 struct sctp_transport *transport; 127 struct sctp_transport *transport;
128 128
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 17d0ff534735..0abd5101107c 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
74 74
75/* Create a new sctp_ulpevent. */ 75/* Create a new sctp_ulpevent. */
76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, 76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
77 int gfp) 77 unsigned int __nocast gfp)
78{ 78{
79 struct sctp_ulpevent *event; 79 struct sctp_ulpevent *event;
80 struct sk_buff *skb; 80 struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( 136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
137 const struct sctp_association *asoc, 137 const struct sctp_association *asoc,
138 __u16 flags, __u16 state, __u16 error, __u16 outbound, 138 __u16 flags, __u16 state, __u16 error, __u16 outbound,
139 __u16 inbound, int gfp) 139 __u16 inbound, unsigned int __nocast gfp)
140{ 140{
141 struct sctp_ulpevent *event; 141 struct sctp_ulpevent *event;
142 struct sctp_assoc_change *sac; 142 struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( 237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
238 const struct sctp_association *asoc, 238 const struct sctp_association *asoc,
239 const struct sockaddr_storage *aaddr, 239 const struct sockaddr_storage *aaddr,
240 int flags, int state, int error, int gfp) 240 int flags, int state, int error, unsigned int __nocast gfp)
241{ 241{
242 struct sctp_ulpevent *event; 242 struct sctp_ulpevent *event;
243 struct sctp_paddr_change *spc; 243 struct sctp_paddr_change *spc;
@@ -350,7 +350,7 @@ fail:
350 */ 350 */
351struct sctp_ulpevent *sctp_ulpevent_make_remote_error( 351struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
352 const struct sctp_association *asoc, struct sctp_chunk *chunk, 352 const struct sctp_association *asoc, struct sctp_chunk *chunk,
353 __u16 flags, int gfp) 353 __u16 flags, unsigned int __nocast gfp)
354{ 354{
355 struct sctp_ulpevent *event; 355 struct sctp_ulpevent *event;
356 struct sctp_remote_error *sre; 356 struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
448 */ 448 */
449struct sctp_ulpevent *sctp_ulpevent_make_send_failed( 449struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
450 const struct sctp_association *asoc, struct sctp_chunk *chunk, 450 const struct sctp_association *asoc, struct sctp_chunk *chunk,
451 __u16 flags, __u32 error, int gfp) 451 __u16 flags, __u32 error, unsigned int __nocast gfp)
452{ 452{
453 struct sctp_ulpevent *event; 453 struct sctp_ulpevent *event;
454 struct sctp_send_failed *ssf; 454 struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
557 */ 557 */
558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( 558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
559 const struct sctp_association *asoc, 559 const struct sctp_association *asoc,
560 __u16 flags, int gfp) 560 __u16 flags, unsigned int __nocast gfp)
561{ 561{
562 struct sctp_ulpevent *event; 562 struct sctp_ulpevent *event;
563 struct sctp_shutdown_event *sse; 563 struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
620 * 5.3.1.6 SCTP_ADAPTION_INDICATION 620 * 5.3.1.6 SCTP_ADAPTION_INDICATION
621 */ 621 */
622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication( 622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
623 const struct sctp_association *asoc, int gfp) 623 const struct sctp_association *asoc, unsigned int __nocast gfp)
624{ 624{
625 struct sctp_ulpevent *event; 625 struct sctp_ulpevent *event;
626 struct sctp_adaption_event *sai; 626 struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
657 */ 657 */
658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, 658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
659 struct sctp_chunk *chunk, 659 struct sctp_chunk *chunk,
660 int gfp) 660 unsigned int __nocast gfp)
661{ 661{
662 struct sctp_ulpevent *event = NULL; 662 struct sctp_ulpevent *event = NULL;
663 struct sk_buff *skb; 663 struct sk_buff *skb;
@@ -718,7 +718,8 @@ fail:
718 * various events. 718 * various events.
719 */ 719 */
720struct sctp_ulpevent *sctp_ulpevent_make_pdapi( 720struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
721 const struct sctp_association *asoc, __u32 indication, int gfp) 721 const struct sctp_association *asoc, __u32 indication,
722 unsigned int __nocast gfp)
722{ 723{
723 struct sctp_ulpevent *event; 724 struct sctp_ulpevent *event;
724 struct sctp_pdapi_event *pd; 725 struct sctp_pdapi_event *pd;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d5dd2cf7ac4a..8bbc279d6c99 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
100 100
101/* Process an incoming DATA chunk. */ 101/* Process an incoming DATA chunk. */
102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
103 int gfp) 103 unsigned int __nocast gfp)
104{ 104{
105 struct sk_buff_head temp; 105 struct sk_buff_head temp;
106 sctp_data_chunk_t *hdr; 106 sctp_data_chunk_t *hdr;
@@ -778,7 +778,8 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
778 778
779/* Partial deliver the first message as there is pressure on rwnd. */ 779/* Partial deliver the first message as there is pressure on rwnd. */
780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, 780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
781 struct sctp_chunk *chunk, int gfp) 781 struct sctp_chunk *chunk,
782 unsigned int __nocast gfp)
782{ 783{
783 struct sctp_ulpevent *event; 784 struct sctp_ulpevent *event;
784 struct sctp_association *asoc; 785 struct sctp_association *asoc;
@@ -802,7 +803,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
802 803
803/* Renege some packets to make room for an incoming chunk. */ 804/* Renege some packets to make room for an incoming chunk. */
804void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 805void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
805 int gfp) 806 unsigned int __nocast gfp)
806{ 807{
807 struct sctp_association *asoc; 808 struct sctp_association *asoc;
808 __u16 needed, freed; 809 __u16 needed, freed;
@@ -841,7 +842,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
841/* Notify the application if an association is aborted and in 842/* Notify the application if an association is aborted and in
842 * partial delivery mode. Send up any pending received messages. 843 * partial delivery mode. Send up any pending received messages.
843 */ 844 */
844void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp) 845void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
845{ 846{
846 struct sctp_ulpevent *ev = NULL; 847 struct sctp_ulpevent *ev = NULL;
847 struct sock *sk; 848 struct sock *sk;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 32e8acbc60fe..62a073495276 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,6 +41,7 @@ EXPORT_SYMBOL(rpc_release_task);
41 41
42/* RPC client functions */ 42/* RPC client functions */
43EXPORT_SYMBOL(rpc_create_client); 43EXPORT_SYMBOL(rpc_create_client);
44EXPORT_SYMBOL(rpc_new_client);
44EXPORT_SYMBOL(rpc_clone_client); 45EXPORT_SYMBOL(rpc_clone_client);
45EXPORT_SYMBOL(rpc_bind_new_program); 46EXPORT_SYMBOL(rpc_bind_new_program);
46EXPORT_SYMBOL(rpc_destroy_client); 47EXPORT_SYMBOL(rpc_destroy_client);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc96..56db8f13e6cb 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1185 arg->page_len = (pages-2)*PAGE_SIZE; 1185 arg->page_len = (pages-2)*PAGE_SIZE;
1186 arg->len = (pages-1)*PAGE_SIZE; 1186 arg->len = (pages-1)*PAGE_SIZE;
1187 arg->tail[0].iov_len = 0; 1187 arg->tail[0].iov_len = 0;
1188 1188
1189 try_to_freeze(PF_FREEZE); 1189 try_to_freeze();
1190 if (signalled()) 1190 if (signalled())
1191 return -EINTR; 1191 return -EINTR;
1192 1192
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1227 1227
1228 schedule_timeout(timeout); 1228 schedule_timeout(timeout);
1229 1229
1230 try_to_freeze(PF_FREEZE); 1230 try_to_freeze();
1231 1231
1232 spin_lock_bh(&serv->sv_lock); 1232 spin_lock_bh(&serv->sv_lock);
1233 remove_wait_queue(&rqstp->rq_wait, &wait); 1233 remove_wait_queue(&rqstp->rq_wait, &wait);
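try_to_freeze() dropped its flags argument in this window, so the PF_FREEZE call sites shrink to the bare call. The updated idiom, sketched in a generic service loop; kthread_should_stop() is framing only:

    while (!kthread_should_stop()) {
            try_to_freeze();        /* park here during suspend */
            /* ... wait for and service one request ... */
    }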
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index eca92405948f..3c654e06b084 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
145 if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { 145 if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) {
146 if (task == xprt->snd_task) 146 if (task == xprt->snd_task)
147 return 1; 147 return 1;
148 if (task == NULL)
149 return 0;
150 goto out_sleep; 148 goto out_sleep;
151 } 149 }
152 if (xprt->nocong || __xprt_get_cong(xprt, task)) { 150 if (xprt->nocong || __xprt_get_cong(xprt, task)) {
@@ -970,7 +968,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
970 goto out; 968 goto out;
971 } 969 }
972 970
973 dprintk("RPC: XID %08x read %u bytes\n", 971 dprintk("RPC: XID %08x read %Zd bytes\n",
974 ntohl(xprt->tcp_xid), r); 972 ntohl(xprt->tcp_xid), r);
975 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", 973 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
976 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); 974 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
@@ -1006,7 +1004,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
1006 desc->count -= len; 1004 desc->count -= len;
1007 desc->offset += len; 1005 desc->offset += len;
1008 xprt->tcp_offset += len; 1006 xprt->tcp_offset += len;
1009 dprintk("RPC: discarded %u bytes\n", len); 1007 dprintk("RPC: discarded %Zu bytes\n", len);
1010 tcp_check_recm(xprt); 1008 tcp_check_recm(xprt);
1011} 1009}
1012 1010
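The format-string fixes switch to %Z, the kernel vsprintf length modifier for size_t/ssize_t, keeping the dprintk()s warning-free on both 32- and 64-bit builds. In miniature:

    size_t copied = 128;
    ssize_t status = -1;

    printk(KERN_DEBUG "copied %Zu bytes\n", copied);  /* size_t  */
    printk(KERN_DEBUG "status %Zd\n", status);        /* ssize_t */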
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 000000000000..5a69733bcdad
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,21 @@
1#
2# Unix Domain Sockets
3#
4
5config UNIX
6 tristate "Unix domain sockets"
7 ---help---
8 If you say Y here, you will include support for Unix domain sockets;
9 sockets are the standard Unix mechanism for establishing and
10 accessing network connections. Many commonly used programs such as
11 the X Window system and syslog use these sockets even if your
12 machine is not connected to any network. Unless you are working on
13 an embedded system or something similar, you therefore definitely
14 want to say Y here.
15
16 To compile this driver as a module, choose M here: the module will be
17 called unix. Note that several important services won't work
18 correctly if you say M here and then neglect to load the module.
19
20 Say Y unless you know what you are doing.
21
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c420eba4876b..d403e34088ad 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk)
302 * may receive messages only from that peer. */ 302 * may receive messages only from that peer. */
303static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 303static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
304{ 304{
305 if (skb_queue_len(&sk->sk_receive_queue)) { 305 if (!skb_queue_empty(&sk->sk_receive_queue)) {
306 skb_queue_purge(&sk->sk_receive_queue); 306 skb_queue_purge(&sk->sk_receive_queue);
307 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 307 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
308 308
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo)
1619 for (;;) { 1619 for (;;) {
1620 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 1620 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1621 1621
1622 if (skb_queue_len(&sk->sk_receive_queue) || 1622 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1623 sk->sk_err || 1623 sk->sk_err ||
1624 (sk->sk_shutdown & RCV_SHUTDOWN) || 1624 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1625 signal_pending(current) || 1625 signal_pending(current) ||
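Both af_unix call sites only ask whether the receive queue holds anything, so the truthiness test on skb_queue_len() becomes an explicit skb_queue_empty() check, which inspects the head pointers rather than reading qlen. The two equivalent forms, with handle_data() a hypothetical consumer:

    /* before: rely on the queue length being non-zero */
    if (skb_queue_len(&sk->sk_receive_queue))
            handle_data(sk);

    /* after: intent-revealing emptiness test */
    if (!skb_queue_empty(&sk->sk_receive_queue))
            handle_data(sk);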
diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig
new file mode 100644
index 000000000000..1debe1cb054e
--- /dev/null
+++ b/net/wanrouter/Kconfig
@@ -0,0 +1,29 @@
1#
2# Configuration for WAN router
3#
4
5config WAN_ROUTER
6 tristate "WAN router"
7 depends on EXPERIMENTAL
8 ---help---
9 Wide Area Networks (WANs), such as X.25, frame relay and leased
10 lines, are used to interconnect Local Area Networks (LANs) over vast
11 distances with data transfer rates significantly higher than those
12 achievable with commonly used asynchronous modem connections.
13 Usually, a quite expensive external device called a `WAN router' is
14 needed to connect to a WAN.
15
16 As an alternative, WAN routing can be built into the Linux kernel.
17 With relatively inexpensive WAN interface cards available on the
18 market, a perfectly usable router can be built for less than half
19 the price of an external router. If you have one of those cards and
20 wish to use your Linux box as a WAN router, say Y here and also to
21 the WAN driver for your card, below. You will then need the
22 wan-tools package which is available from <ftp://ftp.sangoma.com/>.
23 Read <file:Documentation/networking/wan-router.txt> for more
24 information.
25
26 To compile WAN routing support as a module, choose M here: the
27 module will be called wanrouter.
28
29 If unsure, say N.
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index d6844ac226f5..13b650ad22e2 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
358 */ 358 */
359 359
360 360
361unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) 361__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
362{ 362{
363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */ 363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */
364 unsigned short ethertype; 364 __be16 ethertype;
365 365
366 switch (skb->data[cnt]) { 366 switch (skb->data[cnt]) {
367 case NLPID_IP: /* IP datagram */ 367 case NLPID_IP: /* IP datagram */
@@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
379 skb->data[cnt+3], dev->name); 379 skb->data[cnt+3], dev->name);
380 return 0; 380 return 0;
381 } 381 }
382 ethertype = *((unsigned short*)&skb->data[cnt+4]); 382 ethertype = *((__be16*)&skb->data[cnt+4]);
383 cnt += 6; 383 cnt += 6;
384 break; 384 break;
385 385
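wanrouter_type_trans() now returns __be16, the sparse annotation for big-endian on-wire values; once a value carries that type, conversion to host order must go through ntohs()/htons() explicitly, or sparse (run with -D__CHECK_ENDIAN__) complains. A sketch, with the buffer and offset standing in for skb->data:

    #include <linux/types.h>        /* __be16, u16 */
    #include <linux/in.h>           /* ntohs() */

    unsigned char buf[8] = { 0 };           /* stand-in for packet data */
    __be16 wire = *(__be16 *)&buf[4];       /* raw big-endian halfword */
    u16 host = ntohs(wire);                 /* explicit conversion */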
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 000000000000..e6759c9660bb
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,36 @@
1#
2# CCITT X.25 Packet Layer
3#
4
5config X25
6 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 X.25 is a set of standardized network protocols, similar in scope to
10 frame relay; the one physical line from your box to the X.25 network
11 entry point can carry several logical point-to-point connections
12 (called "virtual circuits") to other computers connected to the X.25
13 network. Governments, banks, and other organizations tend to use it
14 to connect to each other or to form Wide Area Networks (WANs). Many
15 countries have public X.25 networks. X.25 consists of two
16 protocols: the higher level Packet Layer Protocol (PLP) (say Y here
17 if you want that) and the lower level data link layer protocol LAPB
18 (say Y to "LAPB Data Link Driver" below if you want that).
19
20 You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
21 <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
22 Information about X.25 for Linux is contained in the files
23 <file:Documentation/networking/x25.txt> and
24 <file:Documentation/networking/x25-iface.txt>.
25
26 One connects to an X.25 network either with a dedicated network card
27 using the X.21 protocol (not yet supported by Linux) or one can do
28 X.25 over a standard telephone line using an ordinary modem (say Y
29 to "X.25 async driver" below) or over Ethernet using an ordinary
30 Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
31 Driver" and "LAPB over Ethernet driver" below).
32
33 To compile this driver as a module, choose M here: the module
34 will be called x25. If unsure, say N.
35
36
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 58ca6a972c48..0c1c04322baf 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -1,6 +1,10 @@
1# 1#
2# XFRM configuration 2# XFRM configuration
3# 3#
4config XFRM
5 bool
6 depends on NET
7
4config XFRM_USER 8config XFRM_USER
5 tristate "IPsec user configuration interface" 9 tristate "IPsec user configuration interface"
6 depends on INET && XFRM 10 depends on INET && XFRM
@@ -10,3 +14,14 @@ config XFRM_USER
10 14
11 If unsure, say Y. 15 If unsure, say Y.
12 16
17config NET_KEY
18 tristate "PF_KEY sockets"
19 select XFRM
20 ---help---
21 PF_KEYv2 socket family, compatible with the KAME ones.
22 They are required if you are going to use IPsec tools ported
23 from KAME.
24
25 Say Y unless you know what you are doing.
26
27