aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@g5.osdl.org>2006-06-19 21:55:56 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-06-19 21:55:56 -0400
commitd0b952a9837f81cd89e756b1b34293fa6e1cb59d (patch)
treefbe488bc5f407afa0e91cefb262d9e9ee69062ac /net
parentd90125bfe958ed0451c6b98f831c86aba08b43d5 (diff)
parent47552c4e555eefe381f3d45140b59a2ea4b16486 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
* master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6: (109 commits) [ETHTOOL]: Fix UFO typo [SCTP]: Fix persistent slowdown in sctp when a gap ack consumes rx buffer. [SCTP]: Send only 1 window update SACK per message. [SCTP]: Don't do CRC32C checksum over loopback. [SCTP] Reset rtt_in_progress for the chunk when processing its sack. [SCTP]: Reject sctp packets with broadcast addresses. [SCTP]: Limit association max_retrans setting in setsockopt. [PFKEYV2]: Fix inconsistent typing in struct sadb_x_kmprivate. [IPV6]: Sum real space for RTAs. [IRDA]: Use put_unaligned() in irlmp_do_discovery(). [BRIDGE]: Add support for NETIF_F_HW_CSUM devices [NET]: Add NETIF_F_GEN_CSUM and NETIF_F_ALL_CSUM [TG3]: Convert to non-LLTX [TG3]: Remove unnecessary tx_lock [TCP]: Add tcp_slow_start_after_idle sysctl. [BNX2]: Update version and reldate [BNX2]: Use CPU native page size [BNX2]: Use compressed firmware [BNX2]: Add firmware decompression [BNX2]: Allow WoL settings on new 5708 chips ... Manual fixup for conflict in drivers/net/tulip/winbond-840.c
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig22
-rw-r--r--net/atm/clip.c4
-rw-r--r--net/bridge/Makefile2
-rw-r--r--net/bridge/br.c28
-rw-r--r--net/bridge/br_device.c6
-rw-r--r--net/bridge/br_forward.c12
-rw-r--r--net/bridge/br_if.c13
-rw-r--r--net/bridge/br_netfilter.c14
-rw-r--r--net/bridge/br_netlink.c199
-rw-r--r--net/bridge/br_notify.c2
-rw-r--r--net/bridge/br_private.h12
-rw-r--r--net/bridge/br_stp_if.c4
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/dev.c185
-rw-r--r--net/core/dev_mcast.c28
-rw-r--r--net/core/ethtool.c9
-rw-r--r--net/core/netpoll.c9
-rw-r--r--net/core/pktgen.c4
-rw-r--r--net/core/skbuff.c10
-rw-r--r--net/core/sock.c6
-rw-r--r--net/core/user_dma.c131
-rw-r--r--net/dccp/proto.c4
-rw-r--r--net/decnet/dn_nsp_in.c3
-rw-r--r--net/decnet/dn_route.c3
-rw-r--r--net/ipv4/Kconfig50
-rw-r--r--net/ipv4/Makefile6
-rw-r--r--net/ipv4/ah4.c15
-rw-r--r--net/ipv4/esp4.c18
-rw-r--r--net/ipv4/icmp.c5
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/ipcomp.c34
-rw-r--r--net/ipv4/netfilter/Kconfig40
-rw-r--r--net/ipv4/netfilter/Makefile2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c143
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c9
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c77
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_h323.c111
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_h323_types.c6
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c85
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c6
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_sip.c471
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c18
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_h323.c77
-rw-r--r--net/ipv4/netfilter/ip_nat_sip.c249
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c20
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c73
-rw-r--r--net/ipv4/netfilter/ipt_recent.c1276
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c2
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c26
-rw-r--r--net/ipv4/tcp.c127
-rw-r--r--net/ipv4/tcp_bic.c7
-rw-r--r--net/ipv4/tcp_compound.c448
-rw-r--r--net/ipv4/tcp_cong.c6
-rw-r--r--net/ipv4/tcp_cubic.c6
-rw-r--r--net/ipv4/tcp_highspeed.c24
-rw-r--r--net/ipv4/tcp_htcp.c9
-rw-r--r--net/ipv4/tcp_input.c89
-rw-r--r--net/ipv4/tcp_ipv4.c18
-rw-r--r--net/ipv4/tcp_lp.c338
-rw-r--r--net/ipv4/tcp_output.c6
-rw-r--r--net/ipv4/tcp_probe.c181
-rw-r--r--net/ipv4/tcp_veno.c231
-rw-r--r--net/ipv4/tcp_westwood.c80
-rw-r--r--net/ipv4/xfrm4_input.c28
-rw-r--r--net/ipv4/xfrm4_mode_transport.c83
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c125
-rw-r--r--net/ipv4/xfrm4_output.c61
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_state.c1
-rw-r--r--net/ipv6/Kconfig20
-rw-r--r--net/ipv6/Makefile2
-rw-r--r--net/ipv6/addrconf.c28
-rw-r--r--net/ipv6/ah6.c10
-rw-r--r--net/ipv6/esp6.c20
-rw-r--r--net/ipv6/ip6_output.c3
-rw-r--r--net/ipv6/ipcomp6.c38
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c10
-rw-r--r--net/ipv6/tcp_ipv6.c12
-rw-r--r--net/ipv6/xfrm6_input.c29
-rw-r--r--net/ipv6/xfrm6_mode_transport.c88
-rw-r--r--net/ipv6/xfrm6_mode_tunnel.c121
-rw-r--r--net/ipv6/xfrm6_output.c63
-rw-r--r--net/ipv6/xfrm6_policy.c6
-rw-r--r--net/ipv6/xfrm6_state.c1
-rw-r--r--net/ipx/ipx_route.c2
-rw-r--r--net/irda/irlmp.c6
-rw-r--r--net/key/af_key.c17
-rw-r--r--net/llc/af_llc.c6
-rw-r--r--net/llc/llc_if.c2
-rw-r--r--net/llc/llc_input.c10
-rw-r--r--net/llc/llc_sap.c59
-rw-r--r--net/netfilter/Kconfig48
-rw-r--r--net/netfilter/Makefile4
-rw-r--r--net/netfilter/nf_conntrack_core.c9
-rw-r--r--net/netfilter/nf_conntrack_ftp.c77
-rw-r--r--net/netfilter/nf_conntrack_netlink.c85
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c5
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c3
-rw-r--r--net/netfilter/nf_conntrack_standalone.c16
-rw-r--r--net/netfilter/xt_CONNSECMARK.c155
-rw-r--r--net/netfilter/xt_SECMARK.c156
-rw-r--r--net/netfilter/xt_connmark.c2
-rw-r--r--net/netfilter/xt_dccp.c3
-rw-r--r--net/netfilter/xt_mark.c2
-rw-r--r--net/netfilter/xt_multiport.c7
-rw-r--r--net/netfilter/xt_quota.c96
-rw-r--r--net/netfilter/xt_sctp.c4
-rw-r--r--net/netfilter/xt_statistic.c112
-rw-r--r--net/netfilter/xt_string.c2
-rw-r--r--net/sched/sch_generic.c28
-rw-r--r--net/sched/sch_teql.c9
-rw-r--r--net/sctp/input.c6
-rw-r--r--net/sctp/ipv6.c6
-rw-r--r--net/sctp/output.c48
-rw-r--r--net/sctp/outqueue.c1
-rw-r--r--net/sctp/protocol.c8
-rw-r--r--net/sctp/sm_statefuns.c10
-rw-r--r--net/sctp/socket.c28
-rw-r--r--net/sctp/ulpevent.c30
-rw-r--r--net/xfrm/xfrm_policy.c141
-rw-r--r--net/xfrm/xfrm_state.c15
-rw-r--r--net/xfrm/xfrm_user.c19
132 files changed, 5268 insertions, 1846 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 4193cdcd3ae7..c6cec5aa5486 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -66,6 +66,13 @@ source "net/ipv6/Kconfig"
66 66
67endif # if INET 67endif # if INET
68 68
69config NETWORK_SECMARK
70 bool "Security Marking"
71 help
72 This enables security marking of network packets, similar
73 to nfmark, but designated for security purposes.
74 If you are unsure how to answer this question, answer N.
75
69menuconfig NETFILTER 76menuconfig NETFILTER
70 bool "Network packet filtering (replaces ipchains)" 77 bool "Network packet filtering (replaces ipchains)"
71 ---help--- 78 ---help---
@@ -215,6 +222,21 @@ config NET_PKTGEN
215 To compile this code as a module, choose M here: the 222 To compile this code as a module, choose M here: the
216 module will be called pktgen. 223 module will be called pktgen.
217 224
225config NET_TCPPROBE
226 tristate "TCP connection probing"
227 depends on INET && EXPERIMENTAL && PROC_FS && KPROBES
228 ---help---
229 This module allows for capturing the changes to TCP connection
230 state in response to incoming packets. It is used for debugging
231 TCP congestion avoidance modules. If you don't understand
232 what was just said, you don't need it: say N.
233
234 Documentation on how to use the packet generator can be found
235 at http://linux-net.osdl.org/index.php/TcpProbe
236
237 To compile this code as a module, choose M here: the
238 module will be called tcp_probe.
239
218endmenu 240endmenu
219 241
220endmenu 242endmenu
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 72d852982664..f92f9c94d2c7 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
98 printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); 98 printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
99 return; 99 return;
100 } 100 }
101 spin_lock_bh(&entry->neigh->dev->xmit_lock); /* block clip_start_xmit() */ 101 netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
102 entry->neigh->used = jiffies; 102 entry->neigh->used = jiffies;
103 for (walk = &entry->vccs; *walk; walk = &(*walk)->next) 103 for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
104 if (*walk == clip_vcc) { 104 if (*walk == clip_vcc) {
@@ -122,7 +122,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
122 printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc " 122 printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
123 "0x%p)\n", entry, clip_vcc); 123 "0x%p)\n", entry, clip_vcc);
124 out: 124 out:
125 spin_unlock_bh(&entry->neigh->dev->xmit_lock); 125 netif_tx_unlock_bh(entry->neigh->dev);
126} 126}
127 127
128/* The neighbour entry n->lock is held. */ 128/* The neighbour entry n->lock is held. */
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 59556e40e143..f444c12cde5a 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o
6 6
7bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ 7bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
8 br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ 8 br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \
9 br_stp_if.o br_stp_timer.o 9 br_stp_if.o br_stp_timer.o br_netlink.o
10 10
11bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o 11bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
12 12
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 12da21afb9ca..654401ceb2db 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -30,36 +30,46 @@ static struct llc_sap *br_stp_sap;
30 30
31static int __init br_init(void) 31static int __init br_init(void)
32{ 32{
33 int err;
34
33 br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv); 35 br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv);
34 if (!br_stp_sap) { 36 if (!br_stp_sap) {
35 printk(KERN_ERR "bridge: can't register sap for STP\n"); 37 printk(KERN_ERR "bridge: can't register sap for STP\n");
36 return -EBUSY; 38 return -EADDRINUSE;
37 } 39 }
38 40
39 br_fdb_init(); 41 br_fdb_init();
40 42
41#ifdef CONFIG_BRIDGE_NETFILTER 43 err = br_netfilter_init();
42 if (br_netfilter_init()) 44 if (err)
43 return 1; 45 goto err_out1;
44#endif 46
47 err = register_netdevice_notifier(&br_device_notifier);
48 if (err)
49 goto err_out2;
50
51 br_netlink_init();
45 brioctl_set(br_ioctl_deviceless_stub); 52 brioctl_set(br_ioctl_deviceless_stub);
46 br_handle_frame_hook = br_handle_frame; 53 br_handle_frame_hook = br_handle_frame;
47 54
48 br_fdb_get_hook = br_fdb_get; 55 br_fdb_get_hook = br_fdb_get;
49 br_fdb_put_hook = br_fdb_put; 56 br_fdb_put_hook = br_fdb_put;
50 57
51 register_netdevice_notifier(&br_device_notifier);
52
53 return 0; 58 return 0;
59
60err_out2:
61 br_netfilter_fini();
62err_out1:
63 llc_sap_put(br_stp_sap);
64 return err;
54} 65}
55 66
56static void __exit br_deinit(void) 67static void __exit br_deinit(void)
57{ 68{
58 rcu_assign_pointer(br_stp_sap->rcv_func, NULL); 69 rcu_assign_pointer(br_stp_sap->rcv_func, NULL);
59 70
60#ifdef CONFIG_BRIDGE_NETFILTER 71 br_netlink_fini();
61 br_netfilter_fini(); 72 br_netfilter_fini();
62#endif
63 unregister_netdevice_notifier(&br_device_notifier); 73 unregister_netdevice_notifier(&br_device_notifier);
64 brioctl_set(NULL); 74 brioctl_set(NULL);
65 75
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 0c88a2ac32c1..2afdc7c0736c 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -145,9 +145,9 @@ static int br_set_tx_csum(struct net_device *dev, u32 data)
145 struct net_bridge *br = netdev_priv(dev); 145 struct net_bridge *br = netdev_priv(dev);
146 146
147 if (data) 147 if (data)
148 br->feature_mask |= NETIF_F_IP_CSUM; 148 br->feature_mask |= NETIF_F_NO_CSUM;
149 else 149 else
150 br->feature_mask &= ~NETIF_F_IP_CSUM; 150 br->feature_mask &= ~NETIF_F_ALL_CSUM;
151 151
152 br_features_recompute(br); 152 br_features_recompute(br);
153 return 0; 153 return 0;
@@ -185,5 +185,5 @@ void br_dev_setup(struct net_device *dev)
185 dev->priv_flags = IFF_EBRIDGE; 185 dev->priv_flags = IFF_EBRIDGE;
186 186
187 dev->features = NETIF_F_SG | NETIF_F_FRAGLIST 187 dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
188 | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; 188 | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_NO_CSUM;
189} 189}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 56f3aa47e758..0dca027ceb80 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -20,14 +20,11 @@
20#include <linux/netfilter_bridge.h> 20#include <linux/netfilter_bridge.h>
21#include "br_private.h" 21#include "br_private.h"
22 22
23/* Don't forward packets to originating port or forwarding diasabled */
23static inline int should_deliver(const struct net_bridge_port *p, 24static inline int should_deliver(const struct net_bridge_port *p,
24 const struct sk_buff *skb) 25 const struct sk_buff *skb)
25{ 26{
26 if (skb->dev == p->dev || 27 return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING);
27 p->state != BR_STATE_FORWARDING)
28 return 0;
29
30 return 1;
31} 28}
32 29
33static inline unsigned packet_length(const struct sk_buff *skb) 30static inline unsigned packet_length(const struct sk_buff *skb)
@@ -55,10 +52,9 @@ int br_dev_queue_push_xmit(struct sk_buff *skb)
55 52
56int br_forward_finish(struct sk_buff *skb) 53int br_forward_finish(struct sk_buff *skb)
57{ 54{
58 NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, 55 return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
59 br_dev_queue_push_xmit); 56 br_dev_queue_push_xmit);
60 57
61 return 0;
62} 58}
63 59
64static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) 60static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f5d47bf4f967..fdec773f5b52 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -372,12 +372,17 @@ void br_features_recompute(struct net_bridge *br)
372 struct net_bridge_port *p; 372 struct net_bridge_port *p;
373 unsigned long features, checksum; 373 unsigned long features, checksum;
374 374
375 features = br->feature_mask &~ NETIF_F_IP_CSUM; 375 checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0;
376 checksum = br->feature_mask & NETIF_F_IP_CSUM; 376 features = br->feature_mask & ~NETIF_F_ALL_CSUM;
377 377
378 list_for_each_entry(p, &br->port_list, list) { 378 list_for_each_entry(p, &br->port_list, list) {
379 if (!(p->dev->features 379 if (checksum & NETIF_F_NO_CSUM &&
380 & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))) 380 !(p->dev->features & NETIF_F_NO_CSUM))
381 checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
382 if (checksum & NETIF_F_HW_CSUM &&
383 !(p->dev->features & NETIF_F_HW_CSUM))
384 checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
385 if (!(p->dev->features & NETIF_F_IP_CSUM))
381 checksum = 0; 386 checksum = 0;
382 features &= p->dev->features; 387 features &= p->dev->features;
383 } 388 }
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 3da9264449f7..3e41f9d6d51c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -407,12 +407,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
407 if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { 407 if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
408 if (pkt_len + sizeof(struct ipv6hdr) > skb->len) 408 if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
409 goto inhdr_error; 409 goto inhdr_error;
410 if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { 410 if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
411 if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr))) 411 goto inhdr_error;
412 goto inhdr_error;
413 if (skb->ip_summed == CHECKSUM_HW)
414 skb->ip_summed = CHECKSUM_NONE;
415 }
416 } 412 }
417 if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) 413 if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
418 goto inhdr_error; 414 goto inhdr_error;
@@ -495,11 +491,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
495 if (skb->len < len || len < 4 * iph->ihl) 491 if (skb->len < len || len < 4 * iph->ihl)
496 goto inhdr_error; 492 goto inhdr_error;
497 493
498 if (skb->len > len) { 494 pskb_trim_rcsum(skb, len);
499 __pskb_trim(skb, len);
500 if (skb->ip_summed == CHECKSUM_HW)
501 skb->ip_summed = CHECKSUM_NONE;
502 }
503 495
504 nf_bridge_put(skb->nf_bridge); 496 nf_bridge_put(skb->nf_bridge);
505 if (!nf_bridge_alloc(skb)) 497 if (!nf_bridge_alloc(skb))
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
new file mode 100644
index 000000000000..881d7d1a732a
--- /dev/null
+++ b/net/bridge/br_netlink.c
@@ -0,0 +1,199 @@
1/*
2 * Bridge netlink control interface
3 *
4 * Authors:
5 * Stephen Hemminger <shemminger@osdl.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/kernel.h>
14#include <linux/rtnetlink.h>
15#include "br_private.h"
16
17/*
18 * Create one netlink message for one interface
19 * Contains port and master info as well as carrier and bridge state.
20 */
21static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port,
22 u32 pid, u32 seq, int event, unsigned int flags)
23{
24 const struct net_bridge *br = port->br;
25 const struct net_device *dev = port->dev;
26 struct ifinfomsg *r;
27 struct nlmsghdr *nlh;
28 unsigned char *b = skb->tail;
29 u32 mtu = dev->mtu;
30 u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
31 u8 portstate = port->state;
32
33 pr_debug("br_fill_info event %d port %s master %s\n",
34 event, dev->name, br->dev->name);
35
36 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
37 r = NLMSG_DATA(nlh);
38 r->ifi_family = AF_BRIDGE;
39 r->__ifi_pad = 0;
40 r->ifi_type = dev->type;
41 r->ifi_index = dev->ifindex;
42 r->ifi_flags = dev_get_flags(dev);
43 r->ifi_change = 0;
44
45 RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name);
46
47 RTA_PUT(skb, IFLA_MASTER, sizeof(int), &br->dev->ifindex);
48
49 if (dev->addr_len)
50 RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
51
52 RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu);
53 if (dev->ifindex != dev->iflink)
54 RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink);
55
56
57 RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate);
58
59 if (event == RTM_NEWLINK)
60 RTA_PUT(skb, IFLA_PROTINFO, sizeof(portstate), &portstate);
61
62 nlh->nlmsg_len = skb->tail - b;
63
64 return skb->len;
65
66nlmsg_failure:
67rtattr_failure:
68
69 skb_trim(skb, b - skb->data);
70 return -EINVAL;
71}
72
73/*
74 * Notify listeners of a change in port information
75 */
76void br_ifinfo_notify(int event, struct net_bridge_port *port)
77{
78 struct sk_buff *skb;
79 int err = -ENOMEM;
80
81 pr_debug("bridge notify event=%d\n", event);
82 skb = alloc_skb(NLMSG_SPACE(sizeof(struct ifinfomsg) + 128),
83 GFP_ATOMIC);
84 if (!skb)
85 goto err_out;
86
87 err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0);
88 if (err)
89 goto err_kfree;
90
91 NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
92 netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
93 return;
94
95err_kfree:
96 kfree_skb(skb);
97err_out:
98 netlink_set_err(rtnl, 0, RTNLGRP_LINK, err);
99}
100
101/*
102 * Dump information about all ports, in response to GETLINK
103 */
104static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
105{
106 struct net_device *dev;
107 int idx;
108 int s_idx = cb->args[0];
109 int err = 0;
110
111 read_lock(&dev_base_lock);
112 for (dev = dev_base, idx = 0; dev; dev = dev->next) {
113 struct net_bridge_port *p = dev->br_port;
114
115 /* not a bridge port */
116 if (!p)
117 continue;
118
119 if (idx < s_idx)
120 continue;
121
122 err = br_fill_ifinfo(skb, p, NETLINK_CB(cb->skb).pid,
123 cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI);
124 if (err <= 0)
125 break;
126 ++idx;
127 }
128 read_unlock(&dev_base_lock);
129
130 cb->args[0] = idx;
131
132 return skb->len;
133}
134
135/*
136 * Change state of port (ie from forwarding to blocking etc)
137 * Used by spanning tree in user space.
138 */
139static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
140{
141 struct rtattr **rta = arg;
142 struct ifinfomsg *ifm = NLMSG_DATA(nlh);
143 struct net_device *dev;
144 struct net_bridge_port *p;
145 u8 new_state;
146
147 if (ifm->ifi_family != AF_BRIDGE)
148 return -EPFNOSUPPORT;
149
150 /* Must pass valid state as PROTINFO */
151 if (rta[IFLA_PROTINFO-1]) {
152 u8 *pstate = RTA_DATA(rta[IFLA_PROTINFO-1]);
153 new_state = *pstate;
154 } else
155 return -EINVAL;
156
157 if (new_state > BR_STATE_BLOCKING)
158 return -EINVAL;
159
160 /* Find bridge port */
161 dev = __dev_get_by_index(ifm->ifi_index);
162 if (!dev)
163 return -ENODEV;
164
165 p = dev->br_port;
166 if (!p)
167 return -EINVAL;
168
169 /* if kernel STP is running, don't allow changes */
170 if (p->br->stp_enabled)
171 return -EBUSY;
172
173 if (!netif_running(dev))
174 return -ENETDOWN;
175
176 if (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED)
177 return -ENETDOWN;
178
179 p->state = new_state;
180 br_log_state(p);
181 return 0;
182}
183
184
185static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = {
186 [RTM_GETLINK - RTM_BASE] = { .dumpit = br_dump_ifinfo, },
187 [RTM_SETLINK - RTM_BASE] = { .doit = br_rtm_setlink, },
188};
189
190void __init br_netlink_init(void)
191{
192 rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table;
193}
194
195void __exit br_netlink_fini(void)
196{
197 rtnetlink_links[PF_BRIDGE] = NULL;
198}
199
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index a43a9c1d50d7..20278494e4da 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/rtnetlink.h>
17 18
18#include "br_private.h" 19#include "br_private.h"
19 20
@@ -49,6 +50,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
49 50
50 case NETDEV_CHANGEADDR: 51 case NETDEV_CHANGEADDR:
51 br_fdb_changeaddr(p, dev->dev_addr); 52 br_fdb_changeaddr(p, dev->dev_addr);
53 br_ifinfo_notify(RTM_NEWLINK, p);
52 br_stp_recalculate_bridge_id(br); 54 br_stp_recalculate_bridge_id(br);
53 break; 55 break;
54 56
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 86ecea7ed372..c491fb2f280e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -29,7 +29,7 @@
29 29
30#define BR_PORT_DEBOUNCE (HZ/10) 30#define BR_PORT_DEBOUNCE (HZ/10)
31 31
32#define BR_VERSION "2.1" 32#define BR_VERSION "2.2"
33 33
34typedef struct bridge_id bridge_id; 34typedef struct bridge_id bridge_id;
35typedef struct mac_addr mac_addr; 35typedef struct mac_addr mac_addr;
@@ -192,8 +192,13 @@ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
192extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg); 192extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg);
193 193
194/* br_netfilter.c */ 194/* br_netfilter.c */
195#ifdef CONFIG_BRIDGE_NETFILTER
195extern int br_netfilter_init(void); 196extern int br_netfilter_init(void);
196extern void br_netfilter_fini(void); 197extern void br_netfilter_fini(void);
198#else
199#define br_netfilter_init() (0)
200#define br_netfilter_fini() do { } while(0)
201#endif
197 202
198/* br_stp.c */ 203/* br_stp.c */
199extern void br_log_state(const struct net_bridge_port *p); 204extern void br_log_state(const struct net_bridge_port *p);
@@ -232,6 +237,11 @@ extern struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
232extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); 237extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
233 238
234 239
240/* br_netlink.c */
241extern void br_netlink_init(void);
242extern void br_netlink_fini(void);
243extern void br_ifinfo_notify(int event, struct net_bridge_port *port);
244
235#ifdef CONFIG_SYSFS 245#ifdef CONFIG_SYSFS
236/* br_sysfs_if.c */ 246/* br_sysfs_if.c */
237extern struct sysfs_ops brport_sysfs_ops; 247extern struct sysfs_ops brport_sysfs_ops;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 23dea1422c9a..14cd025079af 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/smp_lock.h> 17#include <linux/smp_lock.h>
18#include <linux/etherdevice.h> 18#include <linux/etherdevice.h>
19#include <linux/rtnetlink.h>
19 20
20#include "br_private.h" 21#include "br_private.h"
21#include "br_private_stp.h" 22#include "br_private_stp.h"
@@ -86,6 +87,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
86void br_stp_enable_port(struct net_bridge_port *p) 87void br_stp_enable_port(struct net_bridge_port *p)
87{ 88{
88 br_init_port(p); 89 br_init_port(p);
90 br_ifinfo_notify(RTM_NEWLINK, p);
89 br_port_state_selection(p->br); 91 br_port_state_selection(p->br);
90} 92}
91 93
@@ -99,6 +101,8 @@ void br_stp_disable_port(struct net_bridge_port *p)
99 printk(KERN_INFO "%s: port %i(%s) entering %s state\n", 101 printk(KERN_INFO "%s: port %i(%s) entering %s state\n",
100 br->dev->name, p->port_no, p->dev->name, "disabled"); 102 br->dev->name, p->port_no, p->dev->name, "disabled");
101 103
104 br_ifinfo_notify(RTM_DELLINK, p);
105
102 wasroot = br_is_root_bridge(br); 106 wasroot = br_is_root_bridge(br);
103 br_become_designated_port(p); 107 br_become_designated_port(p);
104 p->state = BR_STATE_DISABLED; 108 p->state = BR_STATE_DISABLED;
diff --git a/net/core/Makefile b/net/core/Makefile
index 79fe12cced27..e9bd2467d5a9 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o
16obj-$(CONFIG_NET_PKTGEN) += pktgen.o 16obj-$(CONFIG_NET_PKTGEN) += pktgen.o
17obj-$(CONFIG_WIRELESS_EXT) += wireless.o 17obj-$(CONFIG_WIRELESS_EXT) += wireless.o
18obj-$(CONFIG_NETPOLL) += netpoll.o 18obj-$(CONFIG_NETPOLL) += netpoll.o
19obj-$(CONFIG_NET_DMA) += user_dma.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 4fba549caf29..ab39fe17cb58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
115#include <net/iw_handler.h> 115#include <net/iw_handler.h>
116#include <asm/current.h> 116#include <asm/current.h>
117#include <linux/audit.h> 117#include <linux/audit.h>
118#include <linux/dmaengine.h>
118 119
119/* 120/*
120 * The list of packet types we will receive (as opposed to discard) 121 * The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
148static struct list_head ptype_base[16]; /* 16 way hashed list */ 149static struct list_head ptype_base[16]; /* 16 way hashed list */
149static struct list_head ptype_all; /* Taps */ 150static struct list_head ptype_all; /* Taps */
150 151
152#ifdef CONFIG_NET_DMA
153static struct dma_client *net_dma_client;
154static unsigned int net_dma_count;
155static spinlock_t net_dma_event_lock;
156#endif
157
151/* 158/*
152 * The @dev_base list is protected by @dev_base_lock and the rtnl 159 * The @dev_base list is protected by @dev_base_lock and the rtnl
153 * semaphore. 160 * semaphore.
@@ -1215,75 +1222,15 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1215#define illegal_highdma(dev, skb) (0) 1222#define illegal_highdma(dev, skb) (0)
1216#endif 1223#endif
1217 1224
1218/* Keep head the same: replace data */
1219int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
1220{
1221 unsigned int size;
1222 u8 *data;
1223 long offset;
1224 struct skb_shared_info *ninfo;
1225 int headerlen = skb->data - skb->head;
1226 int expand = (skb->tail + skb->data_len) - skb->end;
1227
1228 if (skb_shared(skb))
1229 BUG();
1230
1231 if (expand <= 0)
1232 expand = 0;
1233
1234 size = skb->end - skb->head + expand;
1235 size = SKB_DATA_ALIGN(size);
1236 data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
1237 if (!data)
1238 return -ENOMEM;
1239
1240 /* Copy entire thing */
1241 if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
1242 BUG();
1243
1244 /* Set up shinfo */
1245 ninfo = (struct skb_shared_info*)(data + size);
1246 atomic_set(&ninfo->dataref, 1);
1247 ninfo->tso_size = skb_shinfo(skb)->tso_size;
1248 ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
1249 ninfo->nr_frags = 0;
1250 ninfo->frag_list = NULL;
1251
1252 /* Offset between the two in bytes */
1253 offset = data - skb->head;
1254
1255 /* Free old data. */
1256 skb_release_data(skb);
1257
1258 skb->head = data;
1259 skb->end = data + size;
1260
1261 /* Set up new pointers */
1262 skb->h.raw += offset;
1263 skb->nh.raw += offset;
1264 skb->mac.raw += offset;
1265 skb->tail += offset;
1266 skb->data += offset;
1267
1268 /* We are no longer a clone, even if we were. */
1269 skb->cloned = 0;
1270
1271 skb->tail += skb->data_len;
1272 skb->data_len = 0;
1273 return 0;
1274}
1275
1276#define HARD_TX_LOCK(dev, cpu) { \ 1225#define HARD_TX_LOCK(dev, cpu) { \
1277 if ((dev->features & NETIF_F_LLTX) == 0) { \ 1226 if ((dev->features & NETIF_F_LLTX) == 0) { \
1278 spin_lock(&dev->xmit_lock); \ 1227 netif_tx_lock(dev); \
1279 dev->xmit_lock_owner = cpu; \
1280 } \ 1228 } \
1281} 1229}
1282 1230
1283#define HARD_TX_UNLOCK(dev) { \ 1231#define HARD_TX_UNLOCK(dev) { \
1284 if ((dev->features & NETIF_F_LLTX) == 0) { \ 1232 if ((dev->features & NETIF_F_LLTX) == 0) { \
1285 dev->xmit_lock_owner = -1; \ 1233 netif_tx_unlock(dev); \
1286 spin_unlock(&dev->xmit_lock); \
1287 } \ 1234 } \
1288} 1235}
1289 1236
@@ -1321,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb)
1321 1268
1322 if (skb_shinfo(skb)->frag_list && 1269 if (skb_shinfo(skb)->frag_list &&
1323 !(dev->features & NETIF_F_FRAGLIST) && 1270 !(dev->features & NETIF_F_FRAGLIST) &&
1324 __skb_linearize(skb, GFP_ATOMIC)) 1271 __skb_linearize(skb))
1325 goto out_kfree_skb; 1272 goto out_kfree_skb;
1326 1273
1327 /* Fragmented skb is linearized if device does not support SG, 1274 /* Fragmented skb is linearized if device does not support SG,
@@ -1330,14 +1277,14 @@ int dev_queue_xmit(struct sk_buff *skb)
1330 */ 1277 */
1331 if (skb_shinfo(skb)->nr_frags && 1278 if (skb_shinfo(skb)->nr_frags &&
1332 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && 1279 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1333 __skb_linearize(skb, GFP_ATOMIC)) 1280 __skb_linearize(skb))
1334 goto out_kfree_skb; 1281 goto out_kfree_skb;
1335 1282
1336 /* If packet is not checksummed and device does not support 1283 /* If packet is not checksummed and device does not support
1337 * checksumming for this protocol, complete checksumming here. 1284 * checksumming for this protocol, complete checksumming here.
1338 */ 1285 */
1339 if (skb->ip_summed == CHECKSUM_HW && 1286 if (skb->ip_summed == CHECKSUM_HW &&
1340 (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && 1287 (!(dev->features & NETIF_F_GEN_CSUM) &&
1341 (!(dev->features & NETIF_F_IP_CSUM) || 1288 (!(dev->features & NETIF_F_IP_CSUM) ||
1342 skb->protocol != htons(ETH_P_IP)))) 1289 skb->protocol != htons(ETH_P_IP))))
1343 if (skb_checksum_help(skb, 0)) 1290 if (skb_checksum_help(skb, 0))
@@ -1382,8 +1329,8 @@ int dev_queue_xmit(struct sk_buff *skb)
1382 /* The device has no queue. Common case for software devices: 1329 /* The device has no queue. Common case for software devices:
1383 loopback, all the sorts of tunnels... 1330 loopback, all the sorts of tunnels...
1384 1331
1385 Really, it is unlikely that xmit_lock protection is necessary here. 1332 Really, it is unlikely that netif_tx_lock protection is necessary
1386 (f.e. loopback and IP tunnels are clean ignoring statistics 1333 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1387 counters.) 1334 counters.)
1388 However, it is possible, that they rely on protection 1335 However, it is possible, that they rely on protection
1389 made by us here. 1336 made by us here.
@@ -1846,6 +1793,19 @@ static void net_rx_action(struct softirq_action *h)
1846 } 1793 }
1847 } 1794 }
1848out: 1795out:
1796#ifdef CONFIG_NET_DMA
1797 /*
1798 * There may not be any more sk_buffs coming right now, so push
1799 * any pending DMA copies to hardware
1800 */
1801 if (net_dma_client) {
1802 struct dma_chan *chan;
1803 rcu_read_lock();
1804 list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
1805 dma_async_memcpy_issue_pending(chan);
1806 rcu_read_unlock();
1807 }
1808#endif
1849 local_irq_enable(); 1809 local_irq_enable();
1850 return; 1810 return;
1851 1811
@@ -2785,7 +2745,7 @@ int register_netdevice(struct net_device *dev)
2785 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 2745 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2786 2746
2787 spin_lock_init(&dev->queue_lock); 2747 spin_lock_init(&dev->queue_lock);
2788 spin_lock_init(&dev->xmit_lock); 2748 spin_lock_init(&dev->_xmit_lock);
2789 dev->xmit_lock_owner = -1; 2749 dev->xmit_lock_owner = -1;
2790#ifdef CONFIG_NET_CLS_ACT 2750#ifdef CONFIG_NET_CLS_ACT
2791 spin_lock_init(&dev->ingress_lock); 2751 spin_lock_init(&dev->ingress_lock);
@@ -2829,9 +2789,7 @@ int register_netdevice(struct net_device *dev)
2829 2789
2830 /* Fix illegal SG+CSUM combinations. */ 2790 /* Fix illegal SG+CSUM combinations. */
2831 if ((dev->features & NETIF_F_SG) && 2791 if ((dev->features & NETIF_F_SG) &&
2832 !(dev->features & (NETIF_F_IP_CSUM | 2792 !(dev->features & NETIF_F_ALL_CSUM)) {
2833 NETIF_F_NO_CSUM |
2834 NETIF_F_HW_CSUM))) {
2835 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", 2793 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2836 dev->name); 2794 dev->name);
2837 dev->features &= ~NETIF_F_SG; 2795 dev->features &= ~NETIF_F_SG;
@@ -3300,6 +3258,88 @@ static int dev_cpu_callback(struct notifier_block *nfb,
3300} 3258}
3301#endif /* CONFIG_HOTPLUG_CPU */ 3259#endif /* CONFIG_HOTPLUG_CPU */
3302 3260
3261#ifdef CONFIG_NET_DMA
3262/**
3263 * net_dma_rebalance -
3264 * This is called when the number of channels allocated to the net_dma_client
3265 * changes. The net_dma_client tries to have one DMA channel per CPU.
3266 */
3267static void net_dma_rebalance(void)
3268{
3269 unsigned int cpu, i, n;
3270 struct dma_chan *chan;
3271
3272 lock_cpu_hotplug();
3273
3274 if (net_dma_count == 0) {
3275 for_each_online_cpu(cpu)
3276 rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL);
3277 unlock_cpu_hotplug();
3278 return;
3279 }
3280
3281 i = 0;
3282 cpu = first_cpu(cpu_online_map);
3283
3284 rcu_read_lock();
3285 list_for_each_entry(chan, &net_dma_client->channels, client_node) {
3286 n = ((num_online_cpus() / net_dma_count)
3287 + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
3288
3289 while(n) {
3290 per_cpu(softnet_data.net_dma, cpu) = chan;
3291 cpu = next_cpu(cpu, cpu_online_map);
3292 n--;
3293 }
3294 i++;
3295 }
3296 rcu_read_unlock();
3297
3298 unlock_cpu_hotplug();
3299}
3300
3301/**
3302 * netdev_dma_event - event callback for the net_dma_client
3303 * @client: should always be net_dma_client
3304 * @chan:
3305 * @event:
3306 */
3307static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
3308 enum dma_event event)
3309{
3310 spin_lock(&net_dma_event_lock);
3311 switch (event) {
3312 case DMA_RESOURCE_ADDED:
3313 net_dma_count++;
3314 net_dma_rebalance();
3315 break;
3316 case DMA_RESOURCE_REMOVED:
3317 net_dma_count--;
3318 net_dma_rebalance();
3319 break;
3320 default:
3321 break;
3322 }
3323 spin_unlock(&net_dma_event_lock);
3324}
3325
3326/**
3327 * netdev_dma_regiser - register the networking subsystem as a DMA client
3328 */
3329static int __init netdev_dma_register(void)
3330{
3331 spin_lock_init(&net_dma_event_lock);
3332 net_dma_client = dma_async_client_register(netdev_dma_event);
3333 if (net_dma_client == NULL)
3334 return -ENOMEM;
3335
3336 dma_async_client_chan_request(net_dma_client, num_online_cpus());
3337 return 0;
3338}
3339
3340#else
3341static int __init netdev_dma_register(void) { return -ENODEV; }
3342#endif /* CONFIG_NET_DMA */
3303 3343
3304/* 3344/*
3305 * Initialize the DEV module. At boot time this walks the device list and 3345 * Initialize the DEV module. At boot time this walks the device list and
@@ -3353,6 +3393,8 @@ static int __init net_dev_init(void)
3353 atomic_set(&queue->backlog_dev.refcnt, 1); 3393 atomic_set(&queue->backlog_dev.refcnt, 1);
3354 } 3394 }
3355 3395
3396 netdev_dma_register();
3397
3356 dev_boot_phase = 0; 3398 dev_boot_phase = 0;
3357 3399
3358 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); 3400 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
@@ -3371,7 +3413,6 @@ subsys_initcall(net_dev_init);
3371EXPORT_SYMBOL(__dev_get_by_index); 3413EXPORT_SYMBOL(__dev_get_by_index);
3372EXPORT_SYMBOL(__dev_get_by_name); 3414EXPORT_SYMBOL(__dev_get_by_name);
3373EXPORT_SYMBOL(__dev_remove_pack); 3415EXPORT_SYMBOL(__dev_remove_pack);
3374EXPORT_SYMBOL(__skb_linearize);
3375EXPORT_SYMBOL(dev_valid_name); 3416EXPORT_SYMBOL(dev_valid_name);
3376EXPORT_SYMBOL(dev_add_pack); 3417EXPORT_SYMBOL(dev_add_pack);
3377EXPORT_SYMBOL(dev_alloc_name); 3418EXPORT_SYMBOL(dev_alloc_name);
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 05d60850840e..c57d887da2ef 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -62,7 +62,7 @@
62 * Device mc lists are changed by bh at least if IPv6 is enabled, 62 * Device mc lists are changed by bh at least if IPv6 is enabled,
63 * so that it must be bh protected. 63 * so that it must be bh protected.
64 * 64 *
65 * We block accesses to device mc filters with dev->xmit_lock. 65 * We block accesses to device mc filters with netif_tx_lock.
66 */ 66 */
67 67
68/* 68/*
@@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_device *dev)
93 93
94void dev_mc_upload(struct net_device *dev) 94void dev_mc_upload(struct net_device *dev)
95{ 95{
96 spin_lock_bh(&dev->xmit_lock); 96 netif_tx_lock_bh(dev);
97 __dev_mc_upload(dev); 97 __dev_mc_upload(dev);
98 spin_unlock_bh(&dev->xmit_lock); 98 netif_tx_unlock_bh(dev);
99} 99}
100 100
101/* 101/*
@@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
107 int err = 0; 107 int err = 0;
108 struct dev_mc_list *dmi, **dmip; 108 struct dev_mc_list *dmi, **dmip;
109 109
110 spin_lock_bh(&dev->xmit_lock); 110 netif_tx_lock_bh(dev);
111 111
112 for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { 112 for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
113 /* 113 /*
@@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
139 */ 139 */
140 __dev_mc_upload(dev); 140 __dev_mc_upload(dev);
141 141
142 spin_unlock_bh(&dev->xmit_lock); 142 netif_tx_unlock_bh(dev);
143 return 0; 143 return 0;
144 } 144 }
145 } 145 }
146 err = -ENOENT; 146 err = -ENOENT;
147done: 147done:
148 spin_unlock_bh(&dev->xmit_lock); 148 netif_tx_unlock_bh(dev);
149 return err; 149 return err;
150} 150}
151 151
@@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
160 160
161 dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC); 161 dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
162 162
163 spin_lock_bh(&dev->xmit_lock); 163 netif_tx_lock_bh(dev);
164 for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { 164 for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
165 if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && 165 if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
166 dmi->dmi_addrlen == alen) { 166 dmi->dmi_addrlen == alen) {
@@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
176 } 176 }
177 177
178 if ((dmi = dmi1) == NULL) { 178 if ((dmi = dmi1) == NULL) {
179 spin_unlock_bh(&dev->xmit_lock); 179 netif_tx_unlock_bh(dev);
180 return -ENOMEM; 180 return -ENOMEM;
181 } 181 }
182 memcpy(dmi->dmi_addr, addr, alen); 182 memcpy(dmi->dmi_addr, addr, alen);
@@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
189 189
190 __dev_mc_upload(dev); 190 __dev_mc_upload(dev);
191 191
192 spin_unlock_bh(&dev->xmit_lock); 192 netif_tx_unlock_bh(dev);
193 return 0; 193 return 0;
194 194
195done: 195done:
196 spin_unlock_bh(&dev->xmit_lock); 196 netif_tx_unlock_bh(dev);
197 kfree(dmi1); 197 kfree(dmi1);
198 return err; 198 return err;
199} 199}
@@ -204,7 +204,7 @@ done:
204 204
205void dev_mc_discard(struct net_device *dev) 205void dev_mc_discard(struct net_device *dev)
206{ 206{
207 spin_lock_bh(&dev->xmit_lock); 207 netif_tx_lock_bh(dev);
208 208
209 while (dev->mc_list != NULL) { 209 while (dev->mc_list != NULL) {
210 struct dev_mc_list *tmp = dev->mc_list; 210 struct dev_mc_list *tmp = dev->mc_list;
@@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *dev)
215 } 215 }
216 dev->mc_count = 0; 216 dev->mc_count = 0;
217 217
218 spin_unlock_bh(&dev->xmit_lock); 218 netif_tx_unlock_bh(dev);
219} 219}
220 220
221#ifdef CONFIG_PROC_FS 221#ifdef CONFIG_PROC_FS
@@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
250 struct dev_mc_list *m; 250 struct dev_mc_list *m;
251 struct net_device *dev = v; 251 struct net_device *dev = v;
252 252
253 spin_lock_bh(&dev->xmit_lock); 253 netif_tx_lock_bh(dev);
254 for (m = dev->mc_list; m; m = m->next) { 254 for (m = dev->mc_list; m; m = m->next) {
255 int i; 255 int i;
256 256
@@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
262 262
263 seq_putc(seq, '\n'); 263 seq_putc(seq, '\n');
264 } 264 }
265 spin_unlock_bh(&dev->xmit_lock); 265 netif_tx_unlock_bh(dev);
266 return 0; 266 return 0;
267} 267}
268 268
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e6f76106a99b..33ce7ed6afc6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev)
30 30
31u32 ethtool_op_get_tx_csum(struct net_device *dev) 31u32 ethtool_op_get_tx_csum(struct net_device *dev)
32{ 32{
33 return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0; 33 return (dev->features & NETIF_F_ALL_CSUM) != 0;
34} 34}
35 35
36int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) 36int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
@@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
551 return -EFAULT; 551 return -EFAULT;
552 552
553 if (edata.data && 553 if (edata.data &&
554 !(dev->features & (NETIF_F_IP_CSUM | 554 !(dev->features & NETIF_F_ALL_CSUM))
555 NETIF_F_NO_CSUM |
556 NETIF_F_HW_CSUM)))
557 return -EINVAL; 555 return -EINVAL;
558 556
559 return __ethtool_set_sg(dev, edata.data); 557 return __ethtool_set_sg(dev, edata.data);
@@ -591,7 +589,7 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
591 589
592static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) 590static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
593{ 591{
594 struct ethtool_value edata = { ETHTOOL_GTSO }; 592 struct ethtool_value edata = { ETHTOOL_GUFO };
595 593
596 if (!dev->ethtool_ops->get_ufo) 594 if (!dev->ethtool_ops->get_ufo)
597 return -EOPNOTSUPP; 595 return -EOPNOTSUPP;
@@ -600,6 +598,7 @@ static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
600 return -EFAULT; 598 return -EFAULT;
601 return 0; 599 return 0;
602} 600}
601
603static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) 602static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
604{ 603{
605 struct ethtool_value edata; 604 struct ethtool_value edata;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e8e05cebd95a..9cb781830380 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
273 273
274 do { 274 do {
275 npinfo->tries--; 275 npinfo->tries--;
276 spin_lock(&np->dev->xmit_lock); 276 netif_tx_lock(np->dev);
277 np->dev->xmit_lock_owner = smp_processor_id();
278 277
279 /* 278 /*
280 * network drivers do not expect to be called if the queue is 279 * network drivers do not expect to be called if the queue is
281 * stopped. 280 * stopped.
282 */ 281 */
283 if (netif_queue_stopped(np->dev)) { 282 if (netif_queue_stopped(np->dev)) {
284 np->dev->xmit_lock_owner = -1; 283 netif_tx_unlock(np->dev);
285 spin_unlock(&np->dev->xmit_lock);
286 netpoll_poll(np); 284 netpoll_poll(np);
287 udelay(50); 285 udelay(50);
288 continue; 286 continue;
289 } 287 }
290 288
291 status = np->dev->hard_start_xmit(skb, np->dev); 289 status = np->dev->hard_start_xmit(skb, np->dev);
292 np->dev->xmit_lock_owner = -1; 290 netif_tx_unlock(np->dev);
293 spin_unlock(&np->dev->xmit_lock);
294 291
295 /* success */ 292 /* success */
296 if(!status) { 293 if(!status) {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c23e9c06ee23..67ed14ddabd2 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2897,7 +2897,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
2897 } 2897 }
2898 } 2898 }
2899 2899
2900 spin_lock_bh(&odev->xmit_lock); 2900 netif_tx_lock_bh(odev);
2901 if (!netif_queue_stopped(odev)) { 2901 if (!netif_queue_stopped(odev)) {
2902 2902
2903 atomic_inc(&(pkt_dev->skb->users)); 2903 atomic_inc(&(pkt_dev->skb->users));
@@ -2942,7 +2942,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
2942 pkt_dev->next_tx_ns = 0; 2942 pkt_dev->next_tx_ns = 0;
2943 } 2943 }
2944 2944
2945 spin_unlock_bh(&odev->xmit_lock); 2945 netif_tx_unlock_bh(odev);
2946 2946
2947 /* If pkt_dev->count is zero, then run forever */ 2947 /* If pkt_dev->count is zero, then run forever */
2948 if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { 2948 if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fb3770f9c094..bb7210f4005e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
464 n->tc_verd = CLR_TC_MUNGED(n->tc_verd); 464 n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
465 C(input_dev); 465 C(input_dev);
466#endif 466#endif
467 467 skb_copy_secmark(n, skb);
468#endif 468#endif
469 C(truesize); 469 C(truesize);
470 atomic_set(&n->users, 1); 470 atomic_set(&n->users, 1);
@@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
526#endif 526#endif
527 new->tc_index = old->tc_index; 527 new->tc_index = old->tc_index;
528#endif 528#endif
529 skb_copy_secmark(new, old);
529 atomic_set(&new->users, 1); 530 atomic_set(&new->users, 1);
530 skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; 531 skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
531 skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; 532 skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
@@ -800,12 +801,10 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
800 return nskb; 801 return nskb;
801} 802}
802 803
803/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. 804/* Trims skb to length len. It can change skb pointers.
804 * If realloc==0 and trimming is impossible without change of data,
805 * it is BUG().
806 */ 805 */
807 806
808int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) 807int ___pskb_trim(struct sk_buff *skb, unsigned int len)
809{ 808{
810 int offset = skb_headlen(skb); 809 int offset = skb_headlen(skb);
811 int nfrags = skb_shinfo(skb)->nr_frags; 810 int nfrags = skb_shinfo(skb)->nr_frags;
@@ -815,7 +814,6 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
815 int end = offset + skb_shinfo(skb)->frags[i].size; 814 int end = offset + skb_shinfo(skb)->frags[i].size;
816 if (end > len) { 815 if (end > len) {
817 if (skb_cloned(skb)) { 816 if (skb_cloned(skb)) {
818 BUG_ON(!realloc);
819 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) 817 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
820 return -ENOMEM; 818 return -ENOMEM;
821 } 819 }
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb9ea2d..5d820c376653 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
832 atomic_set(&newsk->sk_omem_alloc, 0); 832 atomic_set(&newsk->sk_omem_alloc, 0);
833 skb_queue_head_init(&newsk->sk_receive_queue); 833 skb_queue_head_init(&newsk->sk_receive_queue);
834 skb_queue_head_init(&newsk->sk_write_queue); 834 skb_queue_head_init(&newsk->sk_write_queue);
835#ifdef CONFIG_NET_DMA
836 skb_queue_head_init(&newsk->sk_async_wait_queue);
837#endif
835 838
836 rwlock_init(&newsk->sk_dst_lock); 839 rwlock_init(&newsk->sk_dst_lock);
837 rwlock_init(&newsk->sk_callback_lock); 840 rwlock_init(&newsk->sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
1383 skb_queue_head_init(&sk->sk_receive_queue); 1386 skb_queue_head_init(&sk->sk_receive_queue);
1384 skb_queue_head_init(&sk->sk_write_queue); 1387 skb_queue_head_init(&sk->sk_write_queue);
1385 skb_queue_head_init(&sk->sk_error_queue); 1388 skb_queue_head_init(&sk->sk_error_queue);
1389#ifdef CONFIG_NET_DMA
1390 skb_queue_head_init(&sk->sk_async_wait_queue);
1391#endif
1386 1392
1387 sk->sk_send_head = NULL; 1393 sk->sk_send_head = NULL;
1388 1394
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 000000000000..b7c98dbcdb81
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,131 @@
1/*
2 * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
3 * Portions based on net/core/datagram.c and copyrighted by their authors.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * The full GNU General Public License is included in this distribution in the
20 * file called COPYING.
21 */
22
23/*
24 * This code allows the net stack to make use of a DMA engine for
25 * skb to iovec copies.
26 */
27
28#include <linux/dmaengine.h>
29#include <linux/socket.h>
30#include <linux/rtnetlink.h> /* for BUG_TRAP */
31#include <net/tcp.h>
32
33#define NET_DMA_DEFAULT_COPYBREAK 4096
34
35int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
36
37/**
38 * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
39 * @skb - buffer to copy
40 * @offset - offset in the buffer to start copying from
41 * @iovec - io vector to copy to
42 * @len - amount of data to copy from buffer to iovec
43 * @pinned_list - locked iovec buffer data
44 *
45 * Note: the iovec is modified during the copy.
46 */
47int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
48 struct sk_buff *skb, int offset, struct iovec *to,
49 size_t len, struct dma_pinned_list *pinned_list)
50{
51 int start = skb_headlen(skb);
52 int i, copy = start - offset;
53 dma_cookie_t cookie = 0;
54
55 /* Copy header. */
56 if (copy > 0) {
57 if (copy > len)
58 copy = len;
59 cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
60 skb->data + offset, copy);
61 if (cookie < 0)
62 goto fault;
63 len -= copy;
64 if (len == 0)
65 goto end;
66 offset += copy;
67 }
68
69 /* Copy paged appendix. Hmm... why does this look so complicated? */
70 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
71 int end;
72
73 BUG_TRAP(start <= offset + len);
74
75 end = start + skb_shinfo(skb)->frags[i].size;
76 copy = end - offset;
77 if ((copy = end - offset) > 0) {
78 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
79 struct page *page = frag->page;
80
81 if (copy > len)
82 copy = len;
83
84 cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
85 frag->page_offset + offset - start, copy);
86 if (cookie < 0)
87 goto fault;
88 len -= copy;
89 if (len == 0)
90 goto end;
91 offset += copy;
92 }
93 start = end;
94 }
95
96 if (skb_shinfo(skb)->frag_list) {
97 struct sk_buff *list = skb_shinfo(skb)->frag_list;
98
99 for (; list; list = list->next) {
100 int end;
101
102 BUG_TRAP(start <= offset + len);
103
104 end = start + list->len;
105 copy = end - offset;
106 if (copy > 0) {
107 if (copy > len)
108 copy = len;
109 cookie = dma_skb_copy_datagram_iovec(chan, list,
110 offset - start, to, copy,
111 pinned_list);
112 if (cookie < 0)
113 goto fault;
114 len -= copy;
115 if (len == 0)
116 goto end;
117 offset += copy;
118 }
119 start = end;
120 }
121 }
122
123end:
124 if (!len) {
125 skb->dma_cookie = cookie;
126 return cookie;
127 }
128
129fault:
130 return -EFAULT;
131}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2e0ee8355c41..5317fd3e6691 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
719 } 719 }
720 dccp_pr_debug("packet_type=%s\n", 720 dccp_pr_debug("packet_type=%s\n",
721 dccp_packet_name(dh->dccph_type)); 721 dccp_packet_name(dh->dccph_type));
722 sk_eat_skb(sk, skb); 722 sk_eat_skb(sk, skb, 0);
723verify_sock_status: 723verify_sock_status:
724 if (sock_flag(sk, SOCK_DONE)) { 724 if (sock_flag(sk, SOCK_DONE)) {
725 len = 0; 725 len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
773 } 773 }
774 found_fin_ok: 774 found_fin_ok:
775 if (!(flags & MSG_PEEK)) 775 if (!(flags & MSG_PEEK))
776 sk_eat_skb(sk, skb); 776 sk_eat_skb(sk, skb, 0);
777 break; 777 break;
778 } while (1); 778 } while (1);
779out: 779out:
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 547523b41c81..a2ba9db1c376 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -801,8 +801,7 @@ got_it:
801 * We linearize everything except data segments here. 801 * We linearize everything except data segments here.
802 */ 802 */
803 if (cb->nsp_flags & ~0x60) { 803 if (cb->nsp_flags & ~0x60) {
804 if (unlikely(skb_is_nonlinear(skb)) && 804 if (unlikely(skb_linearize(skb)))
805 skb_linearize(skb, GFP_ATOMIC) != 0)
806 goto free_out; 805 goto free_out;
807 } 806 }
808 807
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e172cf98d7fc..5abf7057af00 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
629 padlen); 629 padlen);
630 630
631 if (flags & DN_RT_PKT_CNTL) { 631 if (flags & DN_RT_PKT_CNTL) {
632 if (unlikely(skb_is_nonlinear(skb)) && 632 if (unlikely(skb_linearize(skb)))
633 skb_linearize(skb, GFP_ATOMIC) != 0)
634 goto dump_it; 633 goto dump_it;
635 634
636 switch(flags & DN_RT_CNTL_MSK) { 635 switch(flags & DN_RT_CNTL_MSK) {
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f75322377..da33393be45f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,24 @@ config INET_TUNNEL
414 tristate 414 tristate
415 default n 415 default n
416 416
417config INET_XFRM_MODE_TRANSPORT
418 tristate "IP: IPsec transport mode"
419 default y
420 select XFRM
421 ---help---
422 Support for IPsec transport mode.
423
424 If unsure, say Y.
425
426config INET_XFRM_MODE_TUNNEL
427 tristate "IP: IPsec tunnel mode"
428 default y
429 select XFRM
430 ---help---
431 Support for IPsec tunnel mode.
432
433 If unsure, say Y.
434
417config INET_DIAG 435config INET_DIAG
418 tristate "INET: socket monitoring interface" 436 tristate "INET: socket monitoring interface"
419 default y 437 default y
@@ -532,6 +550,38 @@ config TCP_CONG_SCALABLE
532 properties, though is known to have fairness issues. 550 properties, though is known to have fairness issues.
533 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ 551 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
534 552
553config TCP_CONG_LP
554 tristate "TCP Low Priority"
555 depends on EXPERIMENTAL
556 default n
557 ---help---
558 TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
559 to utiliza only the excess network bandwidth as compared to the
560 ``fair share`` of bandwidth as targeted by TCP.
561 See http://www-ece.rice.edu/networks/TCP-LP/
562
563config TCP_CONG_VENO
564 tristate "TCP Veno"
565 depends on EXPERIMENTAL
566 default n
567 ---help---
568 TCP Veno is a sender-side only enhancement of TCP to obtain better
569 throughput over wireless networks. TCP Veno makes use of state
570 distinguishing to circumvent the difficult judgment of the packet loss
571 type. TCP Veno cuts down less congestion window in response to random
572 loss packets.
573 See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
574
575config TCP_CONG_COMPOUND
576 tristate "TCP Compound"
577 depends on EXPERIMENTAL
578 default n
579 ---help---
580 TCP Compound is a sender-side only change to TCP that uses
581 a mixed Reno/Vegas approach to calculate the cwnd.
582 For further details look here:
583 ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
584
535endmenu 585endmenu
536 586
537config TCP_CONG_BIC 587config TCP_CONG_BIC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0b9d2c..38b8039bdd55 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o
24obj-$(CONFIG_INET_IPCOMP) += ipcomp.o 24obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
25obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o 25obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
26obj-$(CONFIG_INET_TUNNEL) += tunnel4.o 26obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
27obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
28obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
27obj-$(CONFIG_IP_PNP) += ipconfig.o 29obj-$(CONFIG_IP_PNP) += ipconfig.o
28obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o 30obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
29obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o 31obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/
34obj-$(CONFIG_INET_DIAG) += inet_diag.o 36obj-$(CONFIG_INET_DIAG) += inet_diag.o
35obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 37obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
36obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 38obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
39obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
37obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 40obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
38obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o 41obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
39obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 42obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
@@ -41,7 +44,10 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
41obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o 44obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
42obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o 45obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
43obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o 46obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
47obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
44obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o 48obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
50obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o
45 51
46obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 52obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
47 xfrm4_output.o 53 xfrm4_output.o
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e2e4771fa4c6..c7782230080d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -119,6 +119,7 @@ error:
119static int ah_input(struct xfrm_state *x, struct sk_buff *skb) 119static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
120{ 120{
121 int ah_hlen; 121 int ah_hlen;
122 int ihl;
122 struct iphdr *iph; 123 struct iphdr *iph;
123 struct ip_auth_hdr *ah; 124 struct ip_auth_hdr *ah;
124 struct ah_data *ahp; 125 struct ah_data *ahp;
@@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
149 ah = (struct ip_auth_hdr*)skb->data; 150 ah = (struct ip_auth_hdr*)skb->data;
150 iph = skb->nh.iph; 151 iph = skb->nh.iph;
151 152
152 memcpy(work_buf, iph, iph->ihl*4); 153 ihl = skb->data - skb->nh.raw;
154 memcpy(work_buf, iph, ihl);
153 155
154 iph->ttl = 0; 156 iph->ttl = 0;
155 iph->tos = 0; 157 iph->tos = 0;
156 iph->frag_off = 0; 158 iph->frag_off = 0;
157 iph->check = 0; 159 iph->check = 0;
158 if (iph->ihl != 5) { 160 if (ihl > sizeof(*iph)) {
159 u32 dummy; 161 u32 dummy;
160 if (ip_clear_mutable_options(iph, &dummy)) 162 if (ip_clear_mutable_options(iph, &dummy))
161 goto out; 163 goto out;
@@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
164 u8 auth_data[MAX_AH_AUTH_LEN]; 166 u8 auth_data[MAX_AH_AUTH_LEN];
165 167
166 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); 168 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
167 skb_push(skb, skb->data - skb->nh.raw); 169 skb_push(skb, ihl);
168 ahp->icv(ahp, skb, ah->auth_data); 170 ahp->icv(ahp, skb, ah->auth_data);
169 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { 171 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
170 x->stats.integrity_failed++; 172 x->stats.integrity_failed++;
@@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
172 } 174 }
173 } 175 }
174 ((struct iphdr*)work_buf)->protocol = ah->nexthdr; 176 ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
175 skb->nh.raw = skb_pull(skb, ah_hlen); 177 skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl);
176 memcpy(skb->nh.raw, work_buf, iph->ihl*4); 178 __skb_pull(skb, ah_hlen + ihl);
177 skb->nh.iph->tot_len = htons(skb->len);
178 skb_pull(skb, skb->nh.iph->ihl*4);
179 skb->h.raw = skb->data;
180 179
181 return 0; 180 return 0;
182 181
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9d1881c07a32..9bbdd4494551 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
143 int alen = esp->auth.icv_trunc_len; 143 int alen = esp->auth.icv_trunc_len;
144 int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; 144 int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
145 int nfrags; 145 int nfrags;
146 int encap_len = 0; 146 int ihl;
147 u8 nexthdr[2]; 147 u8 nexthdr[2];
148 struct scatterlist *sg; 148 struct scatterlist *sg;
149 u8 workbuf[60];
150 int padlen; 149 int padlen;
151 150
152 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) 151 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
@@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
177 skb->ip_summed = CHECKSUM_NONE; 176 skb->ip_summed = CHECKSUM_NONE;
178 177
179 esph = (struct ip_esp_hdr*)skb->data; 178 esph = (struct ip_esp_hdr*)skb->data;
180 iph = skb->nh.iph;
181 179
182 /* Get ivec. This can be wrong, check against another impls. */ 180 /* Get ivec. This can be wrong, check against another impls. */
183 if (esp->conf.ivlen) 181 if (esp->conf.ivlen)
@@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
204 202
205 /* ... check padding bits here. Silly. :-) */ 203 /* ... check padding bits here. Silly. :-) */
206 204
205 iph = skb->nh.iph;
206 ihl = iph->ihl * 4;
207
207 if (x->encap) { 208 if (x->encap) {
208 struct xfrm_encap_tmpl *encap = x->encap; 209 struct xfrm_encap_tmpl *encap = x->encap;
209 struct udphdr *uh; 210 struct udphdr *uh = (void *)(skb->nh.raw + ihl);
210
211 uh = (struct udphdr *)(iph + 1);
212 encap_len = (void*)esph - (void*)uh;
213 211
214 /* 212 /*
215 * 1) if the NAT-T peer's IP or port changed then 213 * 1) if the NAT-T peer's IP or port changed then
@@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
246 244
247 iph->protocol = nexthdr[1]; 245 iph->protocol = nexthdr[1];
248 pskb_trim(skb, skb->len - alen - padlen - 2); 246 pskb_trim(skb, skb->len - alen - padlen - 2);
249 memcpy(workbuf, skb->nh.raw, iph->ihl*4); 247 skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl;
250 skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
251 skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
252 memcpy(skb->nh.raw, workbuf, iph->ihl*4);
253 skb->nh.iph->tot_len = htons(skb->len);
254 248
255 return 0; 249 return 0;
256 250
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2a0455911ee0..017900172f7d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -730,7 +730,6 @@ out_err:
730static void icmp_redirect(struct sk_buff *skb) 730static void icmp_redirect(struct sk_buff *skb)
731{ 731{
732 struct iphdr *iph; 732 struct iphdr *iph;
733 unsigned long ip;
734 733
735 if (skb->len < sizeof(struct iphdr)) 734 if (skb->len < sizeof(struct iphdr))
736 goto out_err; 735 goto out_err;
@@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb)
742 goto out; 741 goto out;
743 742
744 iph = (struct iphdr *)skb->data; 743 iph = (struct iphdr *)skb->data;
745 ip = iph->daddr;
746 744
747 switch (skb->h.icmph->code & 7) { 745 switch (skb->h.icmph->code & 7) {
748 case ICMP_REDIR_NET: 746 case ICMP_REDIR_NET:
@@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb)
752 */ 750 */
753 case ICMP_REDIR_HOST: 751 case ICMP_REDIR_HOST:
754 case ICMP_REDIR_HOSTTOS: 752 case ICMP_REDIR_HOSTTOS:
755 ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, 753 ip_rt_redirect(skb->nh.iph->saddr, iph->daddr,
754 skb->h.icmph->un.gateway,
756 iph->saddr, skb->dev); 755 iph->saddr, skb->dev);
757 break; 756 break;
758 } 757 }
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d512239a1473..ab680c851aa2 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2361 } 2361 }
2362 2362
2363 seq_printf(seq, 2363 seq_printf(seq,
2364 "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", 2364 "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
2365 im->multiaddr, im->users, 2365 im->multiaddr, im->users,
2366 im->tm_running, im->tm_running ? 2366 im->tm_running, im->tm_running ?
2367 jiffies_to_clock_t(im->timer.expires-jiffies) : 0, 2367 jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cff9c3a72daf..8538aac3d148 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
410 nf_bridge_get(to->nf_bridge); 410 nf_bridge_get(to->nf_bridge);
411#endif 411#endif
412#endif 412#endif
413 skb_copy_secmark(to, from);
413} 414}
414 415
415/* 416/*
@@ -839,7 +840,7 @@ int ip_append_data(struct sock *sk,
839 */ 840 */
840 if (transhdrlen && 841 if (transhdrlen &&
841 length + fragheaderlen <= mtu && 842 length + fragheaderlen <= mtu &&
842 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && 843 rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
843 !exthdrlen) 844 !exthdrlen)
844 csummode = CHECKSUM_HW; 845 csummode = CHECKSUM_HW;
845 846
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 95278b22b669..3ed8b57a1002 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list);
45static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) 45static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
46{ 46{
47 int err, plen, dlen; 47 int err, plen, dlen;
48 struct iphdr *iph;
49 struct ipcomp_data *ipcd = x->data; 48 struct ipcomp_data *ipcd = x->data;
50 u8 *start, *scratch; 49 u8 *start, *scratch;
51 struct crypto_tfm *tfm; 50 struct crypto_tfm *tfm;
@@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
74 73
75 skb_put(skb, dlen - plen); 74 skb_put(skb, dlen - plen);
76 memcpy(skb->data, scratch, dlen); 75 memcpy(skb->data, scratch, dlen);
77 iph = skb->nh.iph;
78 iph->tot_len = htons(dlen + iph->ihl * 4);
79out: 76out:
80 put_cpu(); 77 put_cpu();
81 return err; 78 return err;
@@ -83,34 +80,21 @@ out:
83 80
84static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) 81static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
85{ 82{
86 u8 nexthdr; 83 int err = -ENOMEM;
87 int err = 0;
88 struct iphdr *iph; 84 struct iphdr *iph;
89 union { 85 struct ip_comp_hdr *ipch;
90 struct iphdr iph;
91 char buf[60];
92 } tmp_iph;
93
94 86
95 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && 87 if (skb_linearize_cow(skb))
96 skb_linearize(skb, GFP_ATOMIC) != 0) {
97 err = -ENOMEM;
98 goto out; 88 goto out;
99 }
100 89
101 skb->ip_summed = CHECKSUM_NONE; 90 skb->ip_summed = CHECKSUM_NONE;
102 91
103 /* Remove ipcomp header and decompress original payload */ 92 /* Remove ipcomp header and decompress original payload */
104 iph = skb->nh.iph; 93 iph = skb->nh.iph;
105 memcpy(&tmp_iph, iph, iph->ihl * 4); 94 ipch = (void *)skb->data;
106 nexthdr = *(u8 *)skb->data; 95 iph->protocol = ipch->nexthdr;
107 skb_pull(skb, sizeof(struct ip_comp_hdr)); 96 skb->h.raw = skb->nh.raw + sizeof(*ipch);
108 skb->nh.raw += sizeof(struct ip_comp_hdr); 97 __skb_pull(skb, sizeof(*ipch));
109 memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
110 iph = skb->nh.iph;
111 iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
112 iph->protocol = nexthdr;
113 skb->h.raw = skb->data;
114 err = ipcomp_decompress(x, skb); 98 err = ipcomp_decompress(x, skb);
115 99
116out: 100out:
@@ -171,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
171 goto out_ok; 155 goto out_ok;
172 } 156 }
173 157
174 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && 158 if (skb_linearize_cow(skb))
175 skb_linearize(skb, GFP_ATOMIC) != 0) {
176 goto out_ok; 159 goto out_ok;
177 }
178 160
179 err = ipcomp_compress(x, skb); 161 err = ipcomp_compress(x, skb);
180 iph = skb->nh.iph; 162 iph = skb->nh.iph;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d4072533da21..e1d7f5fbc526 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK
55 of packets, but this mark value is kept in the conntrack session 55 of packets, but this mark value is kept in the conntrack session
56 instead of the individual packets. 56 instead of the individual packets.
57 57
58config IP_NF_CONNTRACK_SECMARK
59 bool 'Connection tracking security mark support'
60 depends on IP_NF_CONNTRACK && NETWORK_SECMARK
61 help
62 This option enables security markings to be applied to
63 connections. Typically they are copied to connections from
64 packets using the CONNSECMARK target and copied back from
65 connections to packets with the same target, with the packets
66 being originally labeled via SECMARK.
67
68 If unsure, say 'N'.
69
58config IP_NF_CONNTRACK_EVENTS 70config IP_NF_CONNTRACK_EVENTS
59 bool "Connection tracking events (EXPERIMENTAL)" 71 bool "Connection tracking events (EXPERIMENTAL)"
60 depends on EXPERIMENTAL && IP_NF_CONNTRACK 72 depends on EXPERIMENTAL && IP_NF_CONNTRACK
@@ -142,6 +154,8 @@ config IP_NF_TFTP
142config IP_NF_AMANDA 154config IP_NF_AMANDA
143 tristate "Amanda backup protocol support" 155 tristate "Amanda backup protocol support"
144 depends on IP_NF_CONNTRACK 156 depends on IP_NF_CONNTRACK
157 select TEXTSEARCH
158 select TEXTSEARCH_KMP
145 help 159 help
146 If you are running the Amanda backup package <http://www.amanda.org/> 160 If you are running the Amanda backup package <http://www.amanda.org/>
147 on this machine or machines that will be MASQUERADED through this 161 on this machine or machines that will be MASQUERADED through this
@@ -181,14 +195,26 @@ config IP_NF_H323
181 With this module you can support H.323 on a connection tracking/NAT 195 With this module you can support H.323 on a connection tracking/NAT
182 firewall. 196 firewall.
183 197
184 This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP 198 This module supports RAS, Fast Start, H.245 Tunnelling, Call
185 and T.120 based data and applications including audio, video, FAX, 199 Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
186 chat, whiteboard, file transfer, etc. For more information, please 200 whiteboard, file transfer, etc. For more information, please
187 see http://nath323.sourceforge.net/. 201 visit http://nath323.sourceforge.net/.
188 202
189 If you want to compile it as a module, say 'M' here and read 203 If you want to compile it as a module, say 'M' here and read
190 Documentation/modules.txt. If unsure, say 'N'. 204 Documentation/modules.txt. If unsure, say 'N'.
191 205
206config IP_NF_SIP
207 tristate "SIP protocol support (EXPERIMENTAL)"
208 depends on IP_NF_CONNTRACK && EXPERIMENTAL
209 help
210 SIP is an application-layer control protocol that can establish,
211 modify, and terminate multimedia sessions (conferences) such as
212 Internet telephony calls. With the ip_conntrack_sip and
213 the ip_nat_sip modules you can support the protocol on a connection
214 tracking/NATing firewall.
215
216 To compile it as a module, choose M here. If unsure, say Y.
217
192config IP_NF_QUEUE 218config IP_NF_QUEUE
193 tristate "IP Userspace queueing via NETLINK (OBSOLETE)" 219 tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
194 help 220 help
@@ -501,6 +527,12 @@ config IP_NF_NAT_H323
501 default IP_NF_NAT if IP_NF_H323=y 527 default IP_NF_NAT if IP_NF_H323=y
502 default m if IP_NF_H323=m 528 default m if IP_NF_H323=m
503 529
530config IP_NF_NAT_SIP
531 tristate
532 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
533 default IP_NF_NAT if IP_NF_SIP=y
534 default m if IP_NF_SIP=m
535
504# mangle + specific targets 536# mangle + specific targets
505config IP_NF_MANGLE 537config IP_NF_MANGLE
506 tristate "Packet mangling" 538 tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 461cb1eb5de7..3ded4a3af59c 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
31obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o 31obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
32obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o 32obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
33obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o 33obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
34obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
34obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o 35obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
35 36
36# NAT helpers 37# NAT helpers
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
40obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o 41obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
41obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o 42obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
42obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o 43obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
44obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
43 45
44# generic IP tables 46# generic IP tables
45obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o 47obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a604b1ccfdaa..0a7bd7f04061 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -17,33 +17,29 @@
17 * this value. 17 * this value.
18 * 18 *
19 */ 19 */
20
21#include <linux/in.h>
22#include <linux/kernel.h> 20#include <linux/kernel.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/netfilter.h>
25#include <linux/ip.h>
26#include <linux/moduleparam.h> 22#include <linux/moduleparam.h>
23#include <linux/textsearch.h>
24#include <linux/skbuff.h>
25#include <linux/in.h>
26#include <linux/ip.h>
27#include <linux/udp.h> 27#include <linux/udp.h>
28#include <net/checksum.h>
29#include <net/udp.h>
30 28
29#include <linux/netfilter.h>
31#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 30#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
32#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> 31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
33 32
34static unsigned int master_timeout = 300; 33static unsigned int master_timeout = 300;
34static char *ts_algo = "kmp";
35 35
36MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); 36MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
37MODULE_DESCRIPTION("Amanda connection tracking module"); 37MODULE_DESCRIPTION("Amanda connection tracking module");
38MODULE_LICENSE("GPL"); 38MODULE_LICENSE("GPL");
39module_param(master_timeout, uint, 0600); 39module_param(master_timeout, uint, 0600);
40MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); 40MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
41 41module_param(ts_algo, charp, 0400);
42static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; 42MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
43
44/* This is slow, but it's simple. --RR */
45static char *amanda_buffer;
46static DEFINE_SPINLOCK(amanda_buffer_lock);
47 43
48unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, 44unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
49 enum ip_conntrack_info ctinfo, 45 enum ip_conntrack_info ctinfo,
@@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
52 struct ip_conntrack_expect *exp); 48 struct ip_conntrack_expect *exp);
53EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); 49EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
54 50
51enum amanda_strings {
52 SEARCH_CONNECT,
53 SEARCH_NEWLINE,
54 SEARCH_DATA,
55 SEARCH_MESG,
56 SEARCH_INDEX,
57};
58
59static struct {
60 char *string;
61 size_t len;
62 struct ts_config *ts;
63} search[] = {
64 [SEARCH_CONNECT] = {
65 .string = "CONNECT ",
66 .len = 8,
67 },
68 [SEARCH_NEWLINE] = {
69 .string = "\n",
70 .len = 1,
71 },
72 [SEARCH_DATA] = {
73 .string = "DATA ",
74 .len = 5,
75 },
76 [SEARCH_MESG] = {
77 .string = "MESG ",
78 .len = 5,
79 },
80 [SEARCH_INDEX] = {
81 .string = "INDEX ",
82 .len = 6,
83 },
84};
85
55static int help(struct sk_buff **pskb, 86static int help(struct sk_buff **pskb,
56 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) 87 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
57{ 88{
89 struct ts_state ts;
58 struct ip_conntrack_expect *exp; 90 struct ip_conntrack_expect *exp;
59 char *data, *data_limit, *tmp; 91 unsigned int dataoff, start, stop, off, i;
60 unsigned int dataoff, i; 92 char pbuf[sizeof("65535")], *tmp;
61 u_int16_t port, len; 93 u_int16_t port, len;
62 int ret = NF_ACCEPT; 94 int ret = NF_ACCEPT;
63 95
@@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb,
77 return NF_ACCEPT; 109 return NF_ACCEPT;
78 } 110 }
79 111
80 spin_lock_bh(&amanda_buffer_lock); 112 memset(&ts, 0, sizeof(ts));
81 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); 113 start = skb_find_text(*pskb, dataoff, (*pskb)->len,
82 data = amanda_buffer; 114 search[SEARCH_CONNECT].ts, &ts);
83 data_limit = amanda_buffer + (*pskb)->len - dataoff; 115 if (start == UINT_MAX)
84 *data_limit = '\0';
85
86 /* Search for the CONNECT string */
87 data = strstr(data, "CONNECT ");
88 if (!data)
89 goto out; 116 goto out;
90 data += strlen("CONNECT "); 117 start += dataoff + search[SEARCH_CONNECT].len;
91 118
92 /* Only search first line. */ 119 memset(&ts, 0, sizeof(ts));
93 if ((tmp = strchr(data, '\n'))) 120 stop = skb_find_text(*pskb, start, (*pskb)->len,
94 *tmp = '\0'; 121 search[SEARCH_NEWLINE].ts, &ts);
122 if (stop == UINT_MAX)
123 goto out;
124 stop += start;
95 125
96 for (i = 0; i < ARRAY_SIZE(conns); i++) { 126 for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
97 char *match = strstr(data, conns[i]); 127 memset(&ts, 0, sizeof(ts));
98 if (!match) 128 off = skb_find_text(*pskb, start, stop, search[i].ts, &ts);
129 if (off == UINT_MAX)
99 continue; 130 continue;
100 tmp = data = match + strlen(conns[i]); 131 off += start + search[i].len;
101 port = simple_strtoul(data, &data, 10); 132
102 len = data - tmp; 133 len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
134 if (skb_copy_bits(*pskb, off, pbuf, len))
135 break;
136 pbuf[len] = '\0';
137
138 port = simple_strtoul(pbuf, &tmp, 10);
139 len = tmp - pbuf;
103 if (port == 0 || len > 5) 140 if (port == 0 || len > 5)
104 break; 141 break;
105 142
@@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb,
125 exp->mask.dst.u.tcp.port = 0xFFFF; 162 exp->mask.dst.u.tcp.port = 0xFFFF;
126 163
127 if (ip_nat_amanda_hook) 164 if (ip_nat_amanda_hook)
128 ret = ip_nat_amanda_hook(pskb, ctinfo, 165 ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff,
129 tmp - amanda_buffer,
130 len, exp); 166 len, exp);
131 else if (ip_conntrack_expect_related(exp) != 0) 167 else if (ip_conntrack_expect_related(exp) != 0)
132 ret = NF_DROP; 168 ret = NF_DROP;
@@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb,
134 } 170 }
135 171
136out: 172out:
137 spin_unlock_bh(&amanda_buffer_lock);
138 return ret; 173 return ret;
139} 174}
140 175
141static struct ip_conntrack_helper amanda_helper = { 176static struct ip_conntrack_helper amanda_helper = {
142 .max_expected = ARRAY_SIZE(conns), 177 .max_expected = 3,
143 .timeout = 180, 178 .timeout = 180,
144 .me = THIS_MODULE, 179 .me = THIS_MODULE,
145 .help = help, 180 .help = help,
@@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = {
155 190
156static void __exit ip_conntrack_amanda_fini(void) 191static void __exit ip_conntrack_amanda_fini(void)
157{ 192{
193 int i;
194
158 ip_conntrack_helper_unregister(&amanda_helper); 195 ip_conntrack_helper_unregister(&amanda_helper);
159 kfree(amanda_buffer); 196 for (i = 0; i < ARRAY_SIZE(search); i++)
197 textsearch_destroy(search[i].ts);
160} 198}
161 199
162static int __init ip_conntrack_amanda_init(void) 200static int __init ip_conntrack_amanda_init(void)
163{ 201{
164 int ret; 202 int ret, i;
165 203
166 amanda_buffer = kmalloc(65536, GFP_KERNEL); 204 ret = -ENOMEM;
167 if (!amanda_buffer) 205 for (i = 0; i < ARRAY_SIZE(search); i++) {
168 return -ENOMEM; 206 search[i].ts = textsearch_prepare(ts_algo, search[i].string,
169 207 search[i].len,
170 ret = ip_conntrack_helper_register(&amanda_helper); 208 GFP_KERNEL, TS_AUTOLOAD);
171 if (ret < 0) { 209 if (search[i].ts == NULL)
172 kfree(amanda_buffer); 210 goto err;
173 return ret;
174 } 211 }
212 ret = ip_conntrack_helper_register(&amanda_helper);
213 if (ret < 0)
214 goto err;
175 return 0; 215 return 0;
176 216
177 217err:
218 for (; i >= 0; i--) {
219 if (search[i].ts)
220 textsearch_destroy(search[i].ts);
221 }
222 return ret;
178} 223}
179 224
180module_init(ip_conntrack_amanda_init); 225module_init(ip_conntrack_amanda_init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a297da7bbef5..7e4cf9a4d15f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
724 /* this is ugly, but there is no other place where to put it */ 724 /* this is ugly, but there is no other place where to put it */
725 conntrack->nat.masq_index = exp->master->nat.masq_index; 725 conntrack->nat.masq_index = exp->master->nat.masq_index;
726#endif 726#endif
727#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
728 conntrack->secmark = exp->master->secmark;
729#endif
727 nf_conntrack_get(&conntrack->master->ct_general); 730 nf_conntrack_get(&conntrack->master->ct_general);
728 CONNTRACK_STAT_INC(expect_new); 731 CONNTRACK_STAT_INC(expect_new);
729 } else { 732 } else {
@@ -1130,6 +1133,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1130 1133
1131 write_lock_bh(&ip_conntrack_lock); 1134 write_lock_bh(&ip_conntrack_lock);
1132 1135
1136 /* Only update if this is not a fixed timeout */
1137 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1138 write_unlock_bh(&ip_conntrack_lock);
1139 return;
1140 }
1141
1133 /* If not in hash table, timer will not be active yet */ 1142 /* If not in hash table, timer will not be active yet */
1134 if (!is_confirmed(ct)) { 1143 if (!is_confirmed(ct)) {
1135 ct->timeout.expires = extra_jiffies; 1144 ct->timeout.expires = extra_jiffies;
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 3e542bf28a9d..4dcf526c3944 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char);
56static int try_epsv_response(const char *, size_t, u_int32_t [], char); 56static int try_epsv_response(const char *, size_t, u_int32_t [], char);
57 57
58static const struct ftp_search { 58static const struct ftp_search {
59 enum ip_conntrack_dir dir;
60 const char *pattern; 59 const char *pattern;
61 size_t plen; 60 size_t plen;
62 char skip; 61 char skip;
63 char term; 62 char term;
64 enum ip_ct_ftp_type ftptype; 63 enum ip_ct_ftp_type ftptype;
65 int (*getnum)(const char *, size_t, u_int32_t[], char); 64 int (*getnum)(const char *, size_t, u_int32_t[], char);
66} search[] = { 65} search[IP_CT_DIR_MAX][2] = {
67 { 66 [IP_CT_DIR_ORIGINAL] = {
68 IP_CT_DIR_ORIGINAL, 67 {
69 "PORT", sizeof("PORT") - 1, ' ', '\r', 68 .pattern = "PORT",
70 IP_CT_FTP_PORT, 69 .plen = sizeof("PORT") - 1,
71 try_rfc959, 70 .skip = ' ',
71 .term = '\r',
72 .ftptype = IP_CT_FTP_PORT,
73 .getnum = try_rfc959,
74 },
75 {
76 .pattern = "EPRT",
77 .plen = sizeof("EPRT") - 1,
78 .skip = ' ',
79 .term = '\r',
80 .ftptype = IP_CT_FTP_EPRT,
81 .getnum = try_eprt,
82 },
72 }, 83 },
73 { 84 [IP_CT_DIR_REPLY] = {
74 IP_CT_DIR_REPLY, 85 {
75 "227 ", sizeof("227 ") - 1, '(', ')', 86 .pattern = "227 ",
76 IP_CT_FTP_PASV, 87 .plen = sizeof("227 ") - 1,
77 try_rfc959, 88 .skip = '(',
78 }, 89 .term = ')',
79 { 90 .ftptype = IP_CT_FTP_PASV,
80 IP_CT_DIR_ORIGINAL, 91 .getnum = try_rfc959,
81 "EPRT", sizeof("EPRT") - 1, ' ', '\r', 92 },
82 IP_CT_FTP_EPRT, 93 {
83 try_eprt, 94 .pattern = "229 ",
84 }, 95 .plen = sizeof("229 ") - 1,
85 { 96 .skip = '(',
86 IP_CT_DIR_REPLY, 97 .term = ')',
87 "229 ", sizeof("229 ") - 1, '(', ')', 98 .ftptype = IP_CT_FTP_EPSV,
88 IP_CT_FTP_EPSV, 99 .getnum = try_epsv_response,
89 try_epsv_response, 100 },
90 }, 101 },
91}; 102};
92 103
@@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb,
346 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; 357 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
347 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; 358 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
348 359
349 for (i = 0; i < ARRAY_SIZE(search); i++) { 360 for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
350 if (search[i].dir != dir) continue;
351
352 found = find_pattern(fb_ptr, (*pskb)->len - dataoff, 361 found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
353 search[i].pattern, 362 search[dir][i].pattern,
354 search[i].plen, 363 search[dir][i].plen,
355 search[i].skip, 364 search[dir][i].skip,
356 search[i].term, 365 search[dir][i].term,
357 &matchoff, &matchlen, 366 &matchoff, &matchlen,
358 array, 367 array,
359 search[i].getnum); 368 search[dir][i].getnum);
360 if (found) break; 369 if (found) break;
361 } 370 }
362 if (found == -1) { 371 if (found == -1) {
@@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb,
366 this case. */ 375 this case. */
367 if (net_ratelimit()) 376 if (net_ratelimit())
368 printk("conntrack_ftp: partial %s %u+%u\n", 377 printk("conntrack_ftp: partial %s %u+%u\n",
369 search[i].pattern, 378 search[dir][i].pattern,
370 ntohl(th->seq), datalen); 379 ntohl(th->seq), datalen);
371 ret = NF_DROP; 380 ret = NF_DROP;
372 goto out; 381 goto out;
@@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb,
426 /* Now, NAT might want to mangle the packet, and register the 435 /* Now, NAT might want to mangle the packet, and register the
427 * (possibly changed) expectation itself. */ 436 * (possibly changed) expectation itself. */
428 if (ip_nat_ftp_hook) 437 if (ip_nat_ftp_hook)
429 ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, 438 ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype,
430 matchoff, matchlen, exp, &seq); 439 matchoff, matchlen, exp, &seq);
431 else { 440 else {
432 /* Can't expect this? Best to drop packet now. */ 441 /* Can't expect this? Best to drop packet now. */
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
index 518f581d39ec..0665674218c6 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
@@ -22,6 +22,8 @@
22#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> 22#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
23#include <linux/netfilter_ipv4/ip_conntrack_h323.h> 23#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
24#include <linux/moduleparam.h> 24#include <linux/moduleparam.h>
25#include <linux/ctype.h>
26#include <linux/inet.h>
25 27
26#if 0 28#if 0
27#define DEBUGP printk 29#define DEBUGP printk
@@ -38,6 +40,12 @@ static int gkrouted_only = 1;
38module_param(gkrouted_only, int, 0600); 40module_param(gkrouted_only, int, 0600);
39MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); 41MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
40 42
43static int callforward_filter = 1;
44module_param(callforward_filter, bool, 0600);
45MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
46 "if both endpoints are on different sides "
47 "(determined by routing information)");
48
41/* Hooks for NAT */ 49/* Hooks for NAT */
42int (*set_h245_addr_hook) (struct sk_buff ** pskb, 50int (*set_h245_addr_hook) (struct sk_buff ** pskb,
43 unsigned char **data, int dataoff, 51 unsigned char **data, int dataoff,
@@ -77,6 +85,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb,
77 unsigned char **data, int dataoff, 85 unsigned char **data, int dataoff,
78 TransportAddress * addr, u_int16_t port, 86 TransportAddress * addr, u_int16_t port,
79 struct ip_conntrack_expect * exp); 87 struct ip_conntrack_expect * exp);
88int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
89 struct ip_conntrack * ct,
90 enum ip_conntrack_info ctinfo,
91 unsigned char **data, int dataoff,
92 TransportAddress * addr, u_int16_t port,
93 struct ip_conntrack_expect * exp);
80int (*nat_q931_hook) (struct sk_buff ** pskb, 94int (*nat_q931_hook) (struct sk_buff ** pskb,
81 struct ip_conntrack * ct, 95 struct ip_conntrack * ct,
82 enum ip_conntrack_info ctinfo, 96 enum ip_conntrack_info ctinfo,
@@ -683,6 +697,92 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
683 return ret; 697 return ret;
684} 698}
685 699
700/* Forwarding declaration */
701void ip_conntrack_q931_expect(struct ip_conntrack *new,
702 struct ip_conntrack_expect *this);
703
704/****************************************************************************/
705static int expect_callforwarding(struct sk_buff **pskb,
706 struct ip_conntrack *ct,
707 enum ip_conntrack_info ctinfo,
708 unsigned char **data, int dataoff,
709 TransportAddress * addr)
710{
711 int dir = CTINFO2DIR(ctinfo);
712 int ret = 0;
713 u_int32_t ip;
714 u_int16_t port;
715 struct ip_conntrack_expect *exp = NULL;
716
717 /* Read alternativeAddress */
718 if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
719 return 0;
720
721 /* If the calling party is on the same side of the forward-to party,
722 * we don't need to track the second call */
723 if (callforward_filter) {
724 struct rtable *rt1, *rt2;
725 struct flowi fl1 = {
726 .fl4_dst = ip,
727 };
728 struct flowi fl2 = {
729 .fl4_dst = ct->tuplehash[!dir].tuple.src.ip,
730 };
731
732 if (ip_route_output_key(&rt1, &fl1) == 0) {
733 if (ip_route_output_key(&rt2, &fl2) == 0) {
734 if (rt1->rt_gateway == rt2->rt_gateway &&
735 rt1->u.dst.dev == rt2->u.dst.dev)
736 ret = 1;
737 dst_release(&rt2->u.dst);
738 }
739 dst_release(&rt1->u.dst);
740 }
741 if (ret) {
742 DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
743 return 0;
744 }
745 }
746
747 /* Create expect for the second call leg */
748 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
749 return -1;
750 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
751 exp->tuple.src.u.tcp.port = 0;
752 exp->tuple.dst.ip = ip;
753 exp->tuple.dst.u.tcp.port = htons(port);
754 exp->tuple.dst.protonum = IPPROTO_TCP;
755 exp->mask.src.ip = 0xFFFFFFFF;
756 exp->mask.src.u.tcp.port = 0;
757 exp->mask.dst.ip = 0xFFFFFFFF;
758 exp->mask.dst.u.tcp.port = 0xFFFF;
759 exp->mask.dst.protonum = 0xFF;
760 exp->flags = 0;
761
762 if (ct->tuplehash[dir].tuple.src.ip !=
763 ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) {
764 /* Need NAT */
765 ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff,
766 addr, port, exp);
767 } else { /* Conntrack only */
768 exp->expectfn = ip_conntrack_q931_expect;
769
770 if (ip_conntrack_expect_related(exp) == 0) {
771 DEBUGP("ip_ct_q931: expect Call Forwarding "
772 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
773 NIPQUAD(exp->tuple.src.ip),
774 ntohs(exp->tuple.src.u.tcp.port),
775 NIPQUAD(exp->tuple.dst.ip),
776 ntohs(exp->tuple.dst.u.tcp.port));
777 } else
778 ret = -1;
779 }
780
781 ip_conntrack_expect_put(exp);
782
783 return ret;
784}
785
686/****************************************************************************/ 786/****************************************************************************/
687static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, 787static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
688 enum ip_conntrack_info ctinfo, 788 enum ip_conntrack_info ctinfo,
@@ -878,6 +978,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
878 978
879 DEBUGP("ip_ct_q931: Facility\n"); 979 DEBUGP("ip_ct_q931: Facility\n");
880 980
981 if (facility->reason.choice == eFacilityReason_callForwarded) {
982 if (facility->options & eFacility_UUIE_alternativeAddress)
983 return expect_callforwarding(pskb, ct, ctinfo, data,
984 dataoff,
985 &facility->
986 alternativeAddress);
987 return 0;
988 }
989
881 if (facility->options & eFacility_UUIE_h245Address) { 990 if (facility->options & eFacility_UUIE_h245Address) {
882 ret = expect_h245(pskb, ct, ctinfo, data, dataoff, 991 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
883 &facility->h245Address); 992 &facility->h245Address);
@@ -1677,7 +1786,6 @@ static int __init init(void)
1677 fini(); 1786 fini();
1678 return ret; 1787 return ret;
1679 } 1788 }
1680
1681 DEBUGP("ip_ct_h323: init success\n"); 1789 DEBUGP("ip_ct_h323: init success\n");
1682 return 0; 1790 return 0;
1683} 1791}
@@ -1696,6 +1804,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook);
1696EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); 1804EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
1697EXPORT_SYMBOL_GPL(nat_t120_hook); 1805EXPORT_SYMBOL_GPL(nat_t120_hook);
1698EXPORT_SYMBOL_GPL(nat_h245_hook); 1806EXPORT_SYMBOL_GPL(nat_h245_hook);
1807EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
1699EXPORT_SYMBOL_GPL(nat_q931_hook); 1808EXPORT_SYMBOL_GPL(nat_q931_hook);
1700 1809
1701MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); 1810MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
index 022c47b9f6c9..4b359618bedd 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
@@ -1,4 +1,4 @@
1/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 1/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
2 * 2 *
3 * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> 3 * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
4 * 4 *
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */
1069 1069
1070static field_t _Facility_UUIE[] = { /* SEQUENCE */ 1070static field_t _Facility_UUIE[] = { /* SEQUENCE */
1071 {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, 1071 {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
1072 {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, 1072 {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
1073 _TransportAddress}, 1073 offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
1074 {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, 1074 {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
1075 _Facility_UUIE_alternativeAliasAddress}, 1075 _Facility_UUIE_alternativeAliasAddress},
1076 {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, 1076 {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 01bd7cab9367..33891bb1fde4 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -399,38 +399,54 @@ nfattr_failure:
399static int ctnetlink_done(struct netlink_callback *cb) 399static int ctnetlink_done(struct netlink_callback *cb)
400{ 400{
401 DEBUGP("entered %s\n", __FUNCTION__); 401 DEBUGP("entered %s\n", __FUNCTION__);
402 if (cb->args[1])
403 ip_conntrack_put((struct ip_conntrack *)cb->args[1]);
402 return 0; 404 return 0;
403} 405}
404 406
405static int 407static int
406ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) 408ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
407{ 409{
408 struct ip_conntrack *ct = NULL; 410 struct ip_conntrack *ct, *last;
409 struct ip_conntrack_tuple_hash *h; 411 struct ip_conntrack_tuple_hash *h;
410 struct list_head *i; 412 struct list_head *i;
411 u_int32_t *id = (u_int32_t *) &cb->args[1];
412 413
413 DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, 414 DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
414 cb->args[0], *id); 415 cb->args[0], *id);
415 416
416 read_lock_bh(&ip_conntrack_lock); 417 read_lock_bh(&ip_conntrack_lock);
417 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { 418 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) {
419restart:
420 last = (struct ip_conntrack *)cb->args[1];
418 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { 421 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
419 h = (struct ip_conntrack_tuple_hash *) i; 422 h = (struct ip_conntrack_tuple_hash *) i;
420 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) 423 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
421 continue; 424 continue;
422 ct = tuplehash_to_ctrack(h); 425 ct = tuplehash_to_ctrack(h);
423 if (ct->id <= *id) 426 if (last != NULL) {
424 continue; 427 if (ct == last) {
428 ip_conntrack_put(last);
429 cb->args[1] = 0;
430 last = NULL;
431 } else
432 continue;
433 }
425 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, 434 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
426 cb->nlh->nlmsg_seq, 435 cb->nlh->nlmsg_seq,
427 IPCTNL_MSG_CT_NEW, 436 IPCTNL_MSG_CT_NEW,
428 1, ct) < 0) 437 1, ct) < 0) {
438 nf_conntrack_get(&ct->ct_general);
439 cb->args[1] = (unsigned long)ct;
429 goto out; 440 goto out;
430 *id = ct->id; 441 }
442 }
443 if (last != NULL) {
444 ip_conntrack_put(last);
445 cb->args[1] = 0;
446 goto restart;
431 } 447 }
432 } 448 }
433out: 449out:
434 read_unlock_bh(&ip_conntrack_lock); 450 read_unlock_bh(&ip_conntrack_lock);
435 451
436 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); 452 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
@@ -629,7 +645,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
629}; 645};
630 646
631static inline int 647static inline int
632ctnetlink_parse_nat(struct nfattr *cda[], 648ctnetlink_parse_nat(struct nfattr *nat,
633 const struct ip_conntrack *ct, struct ip_nat_range *range) 649 const struct ip_conntrack *ct, struct ip_nat_range *range)
634{ 650{
635 struct nfattr *tb[CTA_NAT_MAX]; 651 struct nfattr *tb[CTA_NAT_MAX];
@@ -639,7 +655,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
639 655
640 memset(range, 0, sizeof(*range)); 656 memset(range, 0, sizeof(*range));
641 657
642 nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); 658 nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
643 659
644 if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) 660 if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
645 return -EINVAL; 661 return -EINVAL;
@@ -854,39 +870,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
854 /* ASSURED bit can only be set */ 870 /* ASSURED bit can only be set */
855 return -EINVAL; 871 return -EINVAL;
856 872
857 if (cda[CTA_NAT-1]) { 873 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
858#ifndef CONFIG_IP_NF_NAT_NEEDED 874#ifndef CONFIG_IP_NF_NAT_NEEDED
859 return -EINVAL; 875 return -EINVAL;
860#else 876#else
861 unsigned int hooknum;
862 struct ip_nat_range range; 877 struct ip_nat_range range;
863 878
864 if (ctnetlink_parse_nat(cda, ct, &range) < 0) 879 if (cda[CTA_NAT_DST-1]) {
865 return -EINVAL; 880 if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
866 881 &range) < 0)
867 DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 882 return -EINVAL;
868 NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), 883 if (ip_nat_initialized(ct,
869 htons(range.min.all), htons(range.max.all)); 884 HOOK2MANIP(NF_IP_PRE_ROUTING)))
870 885 return -EEXIST;
871 /* This is tricky but it works. ip_nat_setup_info needs the 886 ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
872 * hook number as parameter, so let's do the correct 887 }
873 * conversion and run away */ 888 if (cda[CTA_NAT_SRC-1]) {
874 if (status & IPS_SRC_NAT_DONE) 889 if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
875 hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ 890 &range) < 0)
876 else if (status & IPS_DST_NAT_DONE) 891 return -EINVAL;
877 hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ 892 if (ip_nat_initialized(ct,
878 else 893 HOOK2MANIP(NF_IP_POST_ROUTING)))
879 return -EINVAL; /* Missing NAT flags */ 894 return -EEXIST;
880 895 ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
881 DEBUGP("NAT status: %lu\n", 896 }
882 status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
883
884 if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
885 return -EEXIST;
886 ip_nat_setup_info(ct, &range, hooknum);
887
888 DEBUGP("NAT status after setup_info: %lu\n",
889 ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
890#endif 897#endif
891 } 898 }
892 899
@@ -1106,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
1106 /* implicit 'else' */ 1113 /* implicit 'else' */
1107 1114
1108 /* we only allow nat config for new conntracks */ 1115 /* we only allow nat config for new conntracks */
1109 if (cda[CTA_NAT-1]) { 1116 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
1110 err = -EINVAL; 1117 err = -EINVAL;
1111 goto out_unlock; 1118 goto out_unlock;
1112 } 1119 }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 56794797d55b..21ee124c0463 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
77} 77}
78 78
79/* look up the source key for a given tuple */ 79/* look up the source key for a given tuple */
80static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) 80static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
81{ 81{
82 struct ip_ct_gre_keymap *km; 82 struct ip_ct_gre_keymap *km;
83 u_int32_t key = 0; 83 __be16 key = 0;
84 84
85 read_lock_bh(&ip_ct_gre_lock); 85 read_lock_bh(&ip_ct_gre_lock);
86 km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, 86 km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
@@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
190 struct ip_conntrack_tuple *tuple) 190 struct ip_conntrack_tuple *tuple)
191{ 191{
192 struct gre_hdr_pptp _pgrehdr, *pgrehdr; 192 struct gre_hdr_pptp _pgrehdr, *pgrehdr;
193 u_int32_t srckey; 193 __be16 srckey;
194 struct gre_hdr _grehdr, *grehdr; 194 struct gre_hdr _grehdr, *grehdr;
195 195
196 /* first only delinearize old RFC1701 GRE header */ 196 /* first only delinearize old RFC1701 GRE header */
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index d8b14a9010a6..23f1c504586d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
224 } 224 }
225 225
226 /* See ip_conntrack_proto_tcp.c */ 226 /* See ip_conntrack_proto_tcp.c */
227 if (hooknum == NF_IP_PRE_ROUTING && 227 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
228 nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { 228 nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
229 if (LOG_INVALID(IPPROTO_ICMP)) 229 if (LOG_INVALID(IPPROTO_ICMP))
230 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 230 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 062b252b58ad..c5c2ce5cdeb8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb,
870 * and moreover root might send raw packets. 870 * and moreover root might send raw packets.
871 */ 871 */
872 /* FIXME: Source route IP option packets --RR */ 872 /* FIXME: Source route IP option packets --RR */
873 if (hooknum == NF_IP_PRE_ROUTING && 873 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
874 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { 874 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
875 if (LOG_INVALID(IPPROTO_TCP)) 875 if (LOG_INVALID(IPPROTO_TCP))
876 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 876 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 70899868783b..9b2c16b4d2ff 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
120 * because the semantic of CHECKSUM_HW is different there 120 * because the semantic of CHECKSUM_HW is different there
121 * and moreover root might send raw packets. 121 * and moreover root might send raw packets.
122 * FIXME: Source route IP option packets --RR */ 122 * FIXME: Source route IP option packets --RR */
123 if (hooknum == NF_IP_PRE_ROUTING && 123 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
124 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { 124 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
125 if (LOG_INVALID(IPPROTO_UDP)) 125 if (LOG_INVALID(IPPROTO_UDP))
126 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 126 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
new file mode 100644
index 000000000000..fc87ce0da40d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_sip.c
@@ -0,0 +1,471 @@
1/* SIP extension for IP connection tracking.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_conntrack_ftp.c and other modules.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/config.h>
12#include <linux/module.h>
13#include <linux/ctype.h>
14#include <linux/skbuff.h>
15#include <linux/in.h>
16#include <linux/ip.h>
17#include <linux/udp.h>
18
19#include <linux/netfilter.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
22#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
23
24#if 0
25#define DEBUGP printk
26#else
27#define DEBUGP(format, args...)
28#endif
29
30MODULE_LICENSE("GPL");
31MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
32MODULE_DESCRIPTION("SIP connection tracking helper");
33
34#define MAX_PORTS 8
35static unsigned short ports[MAX_PORTS];
36static int ports_c;
37module_param_array(ports, ushort, &ports_c, 0400);
38MODULE_PARM_DESC(ports, "port numbers of sip servers");
39
40static unsigned int sip_timeout = SIP_TIMEOUT;
41module_param(sip_timeout, uint, 0600);
42MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
43
44unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
45 enum ip_conntrack_info ctinfo,
46 struct ip_conntrack *ct,
47 const char **dptr);
48EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
49
50unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
51 enum ip_conntrack_info ctinfo,
52 struct ip_conntrack_expect *exp,
53 const char *dptr);
54EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
55
56int ct_sip_get_info(const char *dptr, size_t dlen,
57 unsigned int *matchoff,
58 unsigned int *matchlen,
59 struct sip_header_nfo *hnfo);
60EXPORT_SYMBOL_GPL(ct_sip_get_info);
61
62
63static int digits_len(const char *dptr, const char *limit, int *shift);
64static int epaddr_len(const char *dptr, const char *limit, int *shift);
65static int skp_digits_len(const char *dptr, const char *limit, int *shift);
66static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
67
68struct sip_header_nfo ct_sip_hdrs[] = {
69 { /* Via header */
70 .lname = "Via:",
71 .lnlen = sizeof("Via:") - 1,
72 .sname = "\r\nv:",
73 .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
74 .ln_str = "UDP ",
75 .ln_strlen = sizeof("UDP ") - 1,
76 .match_len = epaddr_len,
77 },
78 { /* Contact header */
79 .lname = "Contact:",
80 .lnlen = sizeof("Contact:") - 1,
81 .sname = "\r\nm:",
82 .snlen = sizeof("\r\nm:") - 1,
83 .ln_str = "sip:",
84 .ln_strlen = sizeof("sip:") - 1,
85 .match_len = skp_epaddr_len
86 },
87 { /* Content length header */
88 .lname = "Content-Length:",
89 .lnlen = sizeof("Content-Length:") - 1,
90 .sname = "\r\nl:",
91 .snlen = sizeof("\r\nl:") - 1,
92 .ln_str = ":",
93 .ln_strlen = sizeof(":") - 1,
94 .match_len = skp_digits_len
95 },
96 { /* SDP media info */
97 .lname = "\nm=",
98 .lnlen = sizeof("\nm=") - 1,
99 .sname = "\rm=",
100 .snlen = sizeof("\rm=") - 1,
101 .ln_str = "audio ",
102 .ln_strlen = sizeof("audio ") - 1,
103 .match_len = digits_len
104 },
105 { /* SDP owner address*/
106 .lname = "\no=",
107 .lnlen = sizeof("\no=") - 1,
108 .sname = "\ro=",
109 .snlen = sizeof("\ro=") - 1,
110 .ln_str = "IN IP4 ",
111 .ln_strlen = sizeof("IN IP4 ") - 1,
112 .match_len = epaddr_len
113 },
114 { /* SDP connection info */
115 .lname = "\nc=",
116 .lnlen = sizeof("\nc=") - 1,
117 .sname = "\rc=",
118 .snlen = sizeof("\rc=") - 1,
119 .ln_str = "IN IP4 ",
120 .ln_strlen = sizeof("IN IP4 ") - 1,
121 .match_len = epaddr_len
122 },
123 { /* Requests headers */
124 .lname = "sip:",
125 .lnlen = sizeof("sip:") - 1,
126 .sname = "sip:",
127 .snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */
128 .ln_str = "@",
129 .ln_strlen = sizeof("@") - 1,
130 .match_len = epaddr_len
131 },
132 { /* SDP version header */
133 .lname = "\nv=",
134 .lnlen = sizeof("\nv=") - 1,
135 .sname = "\rv=",
136 .snlen = sizeof("\rv=") - 1,
137 .ln_str = "=",
138 .ln_strlen = sizeof("=") - 1,
139 .match_len = digits_len
140 }
141};
142EXPORT_SYMBOL_GPL(ct_sip_hdrs);
143
144/* get line lenght until first CR or LF seen. */
145int ct_sip_lnlen(const char *line, const char *limit)
146{
147 const char *k = line;
148
149 while ((line <= limit) && (*line == '\r' || *line == '\n'))
150 line++;
151
152 while (line <= limit) {
153 if (*line == '\r' || *line == '\n')
154 break;
155 line++;
156 }
157 return line - k;
158}
159EXPORT_SYMBOL_GPL(ct_sip_lnlen);
160
161/* Linear string search, case sensitive. */
162const char *ct_sip_search(const char *needle, const char *haystack,
163 size_t needle_len, size_t haystack_len)
164{
165 const char *limit = haystack + (haystack_len - needle_len);
166
167 while (haystack <= limit) {
168 if (memcmp(haystack, needle, needle_len) == 0)
169 return haystack;
170 haystack++;
171 }
172 return NULL;
173}
174EXPORT_SYMBOL_GPL(ct_sip_search);
175
176static int digits_len(const char *dptr, const char *limit, int *shift)
177{
178 int len = 0;
179 while (dptr <= limit && isdigit(*dptr)) {
180 dptr++;
181 len++;
182 }
183 return len;
184}
185
186/* get digits lenght, skiping blank spaces. */
187static int skp_digits_len(const char *dptr, const char *limit, int *shift)
188{
189 for (; dptr <= limit && *dptr == ' '; dptr++)
190 (*shift)++;
191
192 return digits_len(dptr, limit, shift);
193}
194
195/* Simple ipaddr parser.. */
196static int parse_ipaddr(const char *cp, const char **endp,
197 u_int32_t *ipaddr, const char *limit)
198{
199 unsigned long int val;
200 int i, digit = 0;
201
202 for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
203 digit = 0;
204 if (!isdigit(*cp))
205 break;
206
207 val = simple_strtoul(cp, (char **)&cp, 10);
208 if (val > 0xFF)
209 return -1;
210
211 ((u_int8_t *)ipaddr)[i] = val;
212 digit = 1;
213
214 if (*cp != '.')
215 break;
216 cp++;
217 }
218 if (!digit)
219 return -1;
220
221 if (endp)
222 *endp = cp;
223
224 return 0;
225}
226
227/* skip ip address. returns it lenght. */
228static int epaddr_len(const char *dptr, const char *limit, int *shift)
229{
230 const char *aux = dptr;
231 u_int32_t ip;
232
233 if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) {
234 DEBUGP("ip: %s parse failed.!\n", dptr);
235 return 0;
236 }
237
238 /* Port number */
239 if (*dptr == ':') {
240 dptr++;
241 dptr += digits_len(dptr, limit, shift);
242 }
243 return dptr - aux;
244}
245
246/* get address length, skiping user info. */
247static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
248{
249 int s = *shift;
250
251 for (; dptr <= limit && *dptr != '@'; dptr++)
252 (*shift)++;
253
254 if (*dptr == '@') {
255 dptr++;
256 (*shift)++;
257 } else
258 *shift = s;
259
260 return epaddr_len(dptr, limit, shift);
261}
262
263/* Returns 0 if not found, -1 error parsing. */
264int ct_sip_get_info(const char *dptr, size_t dlen,
265 unsigned int *matchoff,
266 unsigned int *matchlen,
267 struct sip_header_nfo *hnfo)
268{
269 const char *limit, *aux, *k = dptr;
270 int shift = 0;
271
272 limit = dptr + (dlen - hnfo->lnlen);
273
274 while (dptr <= limit) {
275 if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
276 (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
277 dptr++;
278 continue;
279 }
280 aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
281 ct_sip_lnlen(dptr, limit));
282 if (!aux) {
283 DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
284 hnfo->lname);
285 return -1;
286 }
287 aux += hnfo->ln_strlen;
288
289 *matchlen = hnfo->match_len(aux, limit, &shift);
290 if (!*matchlen)
291 return -1;
292
293 *matchoff = (aux - k) + shift;
294
295 DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
296 *matchlen);
297 return 1;
298 }
299 DEBUGP("%s header not found.\n", hnfo->lname);
300 return 0;
301}
302
303static int set_expected_rtp(struct sk_buff **pskb,
304 struct ip_conntrack *ct,
305 enum ip_conntrack_info ctinfo,
306 u_int32_t ipaddr, u_int16_t port,
307 const char *dptr)
308{
309 struct ip_conntrack_expect *exp;
310 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
311 int ret;
312
313 exp = ip_conntrack_expect_alloc(ct);
314 if (exp == NULL)
315 return NF_DROP;
316
317 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
318 exp->tuple.src.u.udp.port = 0;
319 exp->tuple.dst.ip = ipaddr;
320 exp->tuple.dst.u.udp.port = htons(port);
321 exp->tuple.dst.protonum = IPPROTO_UDP;
322
323 exp->mask.src.ip = 0xFFFFFFFF;
324 exp->mask.src.u.udp.port = 0;
325 exp->mask.dst.ip = 0xFFFFFFFF;
326 exp->mask.dst.u.udp.port = 0xFFFF;
327 exp->mask.dst.protonum = 0xFF;
328
329 exp->expectfn = NULL;
330 exp->flags = 0;
331
332 if (ip_nat_sdp_hook)
333 ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr);
334 else {
335 if (ip_conntrack_expect_related(exp) != 0)
336 ret = NF_DROP;
337 else
338 ret = NF_ACCEPT;
339 }
340 ip_conntrack_expect_put(exp);
341
342 return ret;
343}
344
345static int sip_help(struct sk_buff **pskb,
346 struct ip_conntrack *ct,
347 enum ip_conntrack_info ctinfo)
348{
349 unsigned int dataoff, datalen;
350 const char *dptr;
351 int ret = NF_ACCEPT;
352 int matchoff, matchlen;
353 u_int32_t ipaddr;
354 u_int16_t port;
355
356 /* No Data ? */
357 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
358 if (dataoff >= (*pskb)->len) {
359 DEBUGP("skb->len = %u\n", (*pskb)->len);
360 return NF_ACCEPT;
361 }
362
363 ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
364
365 if (!skb_is_nonlinear(*pskb))
366 dptr = (*pskb)->data + dataoff;
367 else {
368 DEBUGP("Copy of skbuff not supported yet.\n");
369 goto out;
370 }
371
372 if (ip_nat_sip_hook) {
373 if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) {
374 ret = NF_DROP;
375 goto out;
376 }
377 }
378
379 /* After this point NAT, could have mangled skb, so
380 we need to recalculate payload lenght. */
381 datalen = (*pskb)->len - dataoff;
382
383 if (datalen < (sizeof("SIP/2.0 200") - 1))
384 goto out;
385
386 /* RTP info only in some SDP pkts */
387 if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
388 memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
389 goto out;
390 }
391 /* Get ip and port address from SDP packet. */
392 if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
393 &ct_sip_hdrs[POS_CONNECTION]) > 0) {
394
395 /* We'll drop only if there are parse problems. */
396 if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
397 dptr + datalen) < 0) {
398 ret = NF_DROP;
399 goto out;
400 }
401 if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
402 &ct_sip_hdrs[POS_MEDIA]) > 0) {
403
404 port = simple_strtoul(dptr + matchoff, NULL, 10);
405 if (port < 1024) {
406 ret = NF_DROP;
407 goto out;
408 }
409 ret = set_expected_rtp(pskb, ct, ctinfo,
410 ipaddr, port, dptr);
411 }
412 }
413out:
414 return ret;
415}
416
417static struct ip_conntrack_helper sip[MAX_PORTS];
418static char sip_names[MAX_PORTS][10];
419
420static void fini(void)
421{
422 int i;
423 for (i = 0; i < ports_c; i++) {
424 DEBUGP("unregistering helper for port %d\n", ports[i]);
425 ip_conntrack_helper_unregister(&sip[i]);
426 }
427}
428
429static int __init init(void)
430{
431 int i, ret;
432 char *tmpname;
433
434 if (ports_c == 0)
435 ports[ports_c++] = SIP_PORT;
436
437 for (i = 0; i < ports_c; i++) {
438 /* Create helper structure */
439 memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
440
441 sip[i].tuple.dst.protonum = IPPROTO_UDP;
442 sip[i].tuple.src.u.udp.port = htons(ports[i]);
443 sip[i].mask.src.u.udp.port = 0xFFFF;
444 sip[i].mask.dst.protonum = 0xFF;
445 sip[i].max_expected = 1;
446 sip[i].timeout = 3 * 60; /* 3 minutes */
447 sip[i].me = THIS_MODULE;
448 sip[i].help = sip_help;
449
450 tmpname = &sip_names[i][0];
451 if (ports[i] == SIP_PORT)
452 sprintf(tmpname, "sip");
453 else
454 sprintf(tmpname, "sip-%d", i);
455 sip[i].name = tmpname;
456
457 DEBUGP("port #%d: %d\n", i, ports[i]);
458
459 ret = ip_conntrack_helper_register(&sip[i]);
460 if (ret) {
461 printk("ERROR registering helper for port %d\n",
462 ports[i]);
463 fini();
464 return ret;
465 }
466 }
467 return 0;
468}
469
470module_init(init);
471module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 929d61f7be91..88445aac3f28 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
189 return -ENOSPC; 189 return -ENOSPC;
190#endif 190#endif
191 191
192#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
193 if (seq_printf(s, "secmark=%u ", conntrack->secmark))
194 return -ENOSPC;
195#endif
196
192 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) 197 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
193 return -ENOSPC; 198 return -ENOSPC;
194 199
@@ -417,7 +422,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum,
417 422
418 /* This is where we call the helper: as the packet goes out. */ 423 /* This is where we call the helper: as the packet goes out. */
419 ct = ip_conntrack_get(*pskb, &ctinfo); 424 ct = ip_conntrack_get(*pskb, &ctinfo);
420 if (ct && ct->helper) { 425 if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) {
421 unsigned int ret; 426 unsigned int ret;
422 ret = ct->helper->help(pskb, ct, ctinfo); 427 ret = ct->helper->help(pskb, ct, ctinfo);
423 if (ret != NF_ACCEPT) 428 if (ret != NF_ACCEPT)
@@ -564,6 +569,8 @@ extern unsigned int ip_ct_generic_timeout;
564static int log_invalid_proto_min = 0; 569static int log_invalid_proto_min = 0;
565static int log_invalid_proto_max = 255; 570static int log_invalid_proto_max = 255;
566 571
572int ip_conntrack_checksum = 1;
573
567static struct ctl_table_header *ip_ct_sysctl_header; 574static struct ctl_table_header *ip_ct_sysctl_header;
568 575
569static ctl_table ip_ct_sysctl_table[] = { 576static ctl_table ip_ct_sysctl_table[] = {
@@ -592,6 +599,14 @@ static ctl_table ip_ct_sysctl_table[] = {
592 .proc_handler = &proc_dointvec, 599 .proc_handler = &proc_dointvec,
593 }, 600 },
594 { 601 {
602 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
603 .procname = "ip_conntrack_checksum",
604 .data = &ip_conntrack_checksum,
605 .maxlen = sizeof(int),
606 .mode = 0644,
607 .proc_handler = &proc_dointvec,
608 },
609 {
595 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, 610 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
596 .procname = "ip_conntrack_tcp_timeout_syn_sent", 611 .procname = "ip_conntrack_tcp_timeout_syn_sent",
597 .data = &ip_ct_tcp_timeout_syn_sent, 612 .data = &ip_ct_tcp_timeout_syn_sent,
@@ -946,6 +961,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
946EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); 961EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
947EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); 962EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
948EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); 963EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
964EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
949#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ 965#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
950 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) 966 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
951EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); 967EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
index d45663d137a7..419b878fb467 100644
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ b/net/ipv4/netfilter/ip_nat_helper_h323.c
@@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
487} 487}
488 488
489/****************************************************************************/ 489/****************************************************************************/
490static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
491 struct ip_conntrack_expect *this)
492{
493 struct ip_nat_range range;
494
495 /* This must be a fresh one. */
496 BUG_ON(new->status & IPS_NAT_DONE_MASK);
497
498 /* Change src to where master sends to */
499 range.flags = IP_NAT_RANGE_MAP_IPS;
500 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
501
502 /* hook doesn't matter, but it has to do source manip */
503 ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
504
505 /* For DST manip, map port here to where it's expected. */
506 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
507 range.min = range.max = this->saved_proto;
508 range.min_ip = range.max_ip = this->saved_ip;
509
510 /* hook doesn't matter, but it has to do destination manip */
511 ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
512
513 ip_conntrack_q931_expect(new, this);
514}
515
516/****************************************************************************/
517static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
518 enum ip_conntrack_info ctinfo,
519 unsigned char **data, int dataoff,
520 TransportAddress * addr, u_int16_t port,
521 struct ip_conntrack_expect *exp)
522{
523 int dir = CTINFO2DIR(ctinfo);
524 u_int16_t nated_port;
525
526 /* Set expectations for NAT */
527 exp->saved_ip = exp->tuple.dst.ip;
528 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
529 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
530 exp->expectfn = ip_nat_callforwarding_expect;
531 exp->dir = !dir;
532
533 /* Try to get same port: if not, try to change it. */
534 for (nated_port = port; nated_port != 0; nated_port++) {
535 exp->tuple.dst.u.tcp.port = htons(nated_port);
536 if (ip_conntrack_expect_related(exp) == 0)
537 break;
538 }
539
540 if (nated_port == 0) { /* No port available */
541 if (net_ratelimit())
542 printk("ip_nat_q931: out of TCP ports\n");
543 return 0;
544 }
545
546 /* Modify signal */
547 if (!set_h225_addr(pskb, data, dataoff, addr,
548 ct->tuplehash[!dir].tuple.dst.ip,
549 nated_port) == 0) {
550 ip_conntrack_unexpect_related(exp);
551 return -1;
552 }
553
554 /* Success */
555 DEBUGP("ip_nat_q931: expect Call Forwarding "
556 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
557 NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
558 NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
559
560 return 0;
561}
562
563/****************************************************************************/
490static int __init init(void) 564static int __init init(void)
491{ 565{
492 BUG_ON(set_h245_addr_hook != NULL); 566 BUG_ON(set_h245_addr_hook != NULL);
@@ -496,6 +570,7 @@ static int __init init(void)
496 BUG_ON(nat_rtp_rtcp_hook != NULL); 570 BUG_ON(nat_rtp_rtcp_hook != NULL);
497 BUG_ON(nat_t120_hook != NULL); 571 BUG_ON(nat_t120_hook != NULL);
498 BUG_ON(nat_h245_hook != NULL); 572 BUG_ON(nat_h245_hook != NULL);
573 BUG_ON(nat_callforwarding_hook != NULL);
499 BUG_ON(nat_q931_hook != NULL); 574 BUG_ON(nat_q931_hook != NULL);
500 575
501 set_h245_addr_hook = set_h245_addr; 576 set_h245_addr_hook = set_h245_addr;
@@ -505,6 +580,7 @@ static int __init init(void)
505 nat_rtp_rtcp_hook = nat_rtp_rtcp; 580 nat_rtp_rtcp_hook = nat_rtp_rtcp;
506 nat_t120_hook = nat_t120; 581 nat_t120_hook = nat_t120;
507 nat_h245_hook = nat_h245; 582 nat_h245_hook = nat_h245;
583 nat_callforwarding_hook = nat_callforwarding;
508 nat_q931_hook = nat_q931; 584 nat_q931_hook = nat_q931;
509 585
510 DEBUGP("ip_nat_h323: init success\n"); 586 DEBUGP("ip_nat_h323: init success\n");
@@ -521,6 +597,7 @@ static void __exit fini(void)
521 nat_rtp_rtcp_hook = NULL; 597 nat_rtp_rtcp_hook = NULL;
522 nat_t120_hook = NULL; 598 nat_t120_hook = NULL;
523 nat_h245_hook = NULL; 599 nat_h245_hook = NULL;
600 nat_callforwarding_hook = NULL;
524 nat_q931_hook = NULL; 601 nat_q931_hook = NULL;
525 synchronize_net(); 602 synchronize_net();
526} 603}
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
new file mode 100644
index 000000000000..6ffba63adca2
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_sip.c
@@ -0,0 +1,249 @@
1/* SIP extension for UDP NAT alteration.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/ip.h>
14#include <linux/udp.h>
15
16#include <linux/netfilter_ipv4.h>
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_helper.h>
19#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
20#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
24MODULE_DESCRIPTION("SIP NAT helper");
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32extern struct sip_header_nfo ct_sip_hdrs[];
33
34static unsigned int mangle_sip_packet(struct sk_buff **pskb,
35 enum ip_conntrack_info ctinfo,
36 struct ip_conntrack *ct,
37 const char **dptr, size_t dlen,
38 char *buffer, int bufflen,
39 struct sip_header_nfo *hnfo)
40{
41 unsigned int matchlen, matchoff;
42
43 if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0)
44 return 0;
45
46 if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
47 matchoff, matchlen, buffer, bufflen))
48 return 0;
49
50 /* We need to reload this. Thanks Patrick. */
51 *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
52 return 1;
53}
54
55static unsigned int ip_nat_sip(struct sk_buff **pskb,
56 enum ip_conntrack_info ctinfo,
57 struct ip_conntrack *ct,
58 const char **dptr)
59{
60 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
61 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
62 unsigned int bufflen, dataoff;
63 u_int32_t ip;
64 u_int16_t port;
65
66 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
67
68 ip = ct->tuplehash[!dir].tuple.dst.ip;
69 port = ct->tuplehash[!dir].tuple.dst.u.udp.port;
70 bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port));
71
72 /* short packet ? */
73 if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1))
74 return 0;
75
76 /* Basic rules: requests and responses. */
77 if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) {
78 const char *aux;
79
80 if ((ctinfo) < IP_CT_IS_REPLY) {
81 mangle_sip_packet(pskb, ctinfo, ct, dptr,
82 (*pskb)->len - dataoff,
83 buffer, bufflen,
84 &ct_sip_hdrs[POS_CONTACT]);
85 return 1;
86 }
87
88 if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
89 (*pskb)->len - dataoff,
90 buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
91 return 0;
92
93 /* This search should ignore case, but later.. */
94 aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1,
95 (*pskb)->len - dataoff);
96 if (!aux)
97 return 0;
98
99 if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"),
100 ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff)))
101 return 1;
102
103 return mangle_sip_packet(pskb, ctinfo, ct, dptr,
104 (*pskb)->len - dataoff,
105 buffer, bufflen,
106 &ct_sip_hdrs[POS_CONTACT]);
107 }
108 if ((ctinfo) < IP_CT_IS_REPLY) {
109 if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
110 (*pskb)->len - dataoff,
111 buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
112 return 0;
113
114 /* Mangle Contact if exists only. - watch udp_nat_mangle()! */
115 mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff,
116 buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]);
117 return 1;
118 }
119 /* This mangle requests headers. */
120 return mangle_sip_packet(pskb, ctinfo, ct, dptr,
121 ct_sip_lnlen(*dptr,
122 *dptr + (*pskb)->len - dataoff),
123 buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]);
124}
125
126static int mangle_content_len(struct sk_buff **pskb,
127 enum ip_conntrack_info ctinfo,
128 struct ip_conntrack *ct,
129 const char *dptr)
130{
131 unsigned int dataoff, matchoff, matchlen;
132 char buffer[sizeof("65536")];
133 int bufflen;
134
135 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
136
137 /* Get actual SDP lenght */
138 if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
139 &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) {
140
141 /* since ct_sip_get_info() give us a pointer passing 'v='
142 we need to add 2 bytes in this count. */
143 int c_len = (*pskb)->len - dataoff - matchoff + 2;
144
145 /* Now, update SDP lenght */
146 if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
147 &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) {
148
149 bufflen = sprintf(buffer, "%u", c_len);
150
151 return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
152 matchoff, matchlen,
153 buffer, bufflen);
154 }
155 }
156 return 0;
157}
158
159static unsigned int mangle_sdp(struct sk_buff **pskb,
160 enum ip_conntrack_info ctinfo,
161 struct ip_conntrack *ct,
162 u_int32_t newip, u_int16_t port,
163 const char *dptr)
164{
165 char buffer[sizeof("nnn.nnn.nnn.nnn")];
166 unsigned int dataoff, bufflen;
167
168 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
169
170 /* Mangle owner and contact info. */
171 bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
172 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
173 buffer, bufflen, &ct_sip_hdrs[POS_OWNER]))
174 return 0;
175
176 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
177 buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION]))
178 return 0;
179
180 /* Mangle media port. */
181 bufflen = sprintf(buffer, "%u", port);
182 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
183 buffer, bufflen, &ct_sip_hdrs[POS_MEDIA]))
184 return 0;
185
186 return mangle_content_len(pskb, ctinfo, ct, dptr);
187}
188
189/* So, this packet has hit the connection tracking matching code.
190 Mangle it, and change the expectation to match the new version. */
191static unsigned int ip_nat_sdp(struct sk_buff **pskb,
192 enum ip_conntrack_info ctinfo,
193 struct ip_conntrack_expect *exp,
194 const char *dptr)
195{
196 struct ip_conntrack *ct = exp->master;
197 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
198 u_int32_t newip;
199 u_int16_t port;
200
201 DEBUGP("ip_nat_sdp():\n");
202
203 /* Connection will come from reply */
204 newip = ct->tuplehash[!dir].tuple.dst.ip;
205
206 exp->tuple.dst.ip = newip;
207 exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
208 exp->dir = !dir;
209
210 /* When you see the packet, we need to NAT it the same as the
211 this one. */
212 exp->expectfn = ip_nat_follow_master;
213
214 /* Try to get same port: if not, try to change it. */
215 for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
216 exp->tuple.dst.u.udp.port = htons(port);
217 if (ip_conntrack_expect_related(exp) == 0)
218 break;
219 }
220
221 if (port == 0)
222 return NF_DROP;
223
224 if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
225 ip_conntrack_unexpect_related(exp);
226 return NF_DROP;
227 }
228 return NF_ACCEPT;
229}
230
231static void __exit fini(void)
232{
233 ip_nat_sip_hook = NULL;
234 ip_nat_sdp_hook = NULL;
235 /* Make sure noone calls it, meanwhile. */
236 synchronize_net();
237}
238
239static int __init init(void)
240{
241 BUG_ON(ip_nat_sip_hook);
242 BUG_ON(ip_nat_sdp_hook);
243 ip_nat_sip_hook = ip_nat_sip;
244 ip_nat_sdp_hook = ip_nat_sdp;
245 return 0;
246}
247
248module_init(init);
249module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index c33244263b90..d20d557f915a 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void)
1348module_init(ip_nat_snmp_basic_init); 1348module_init(ip_nat_snmp_basic_init);
1349module_exit(ip_nat_snmp_basic_fini); 1349module_exit(ip_nat_snmp_basic_fini);
1350 1350
1351module_param(debug, bool, 0600); 1351module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index aad9d28c8d71..dbc83c5d7aa6 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
241 struct iphdr *iph = skb->nh.iph; 241 struct iphdr *iph = skb->nh.iph;
242 unsigned long hashval; 242 unsigned long hashval;
243 u_int16_t sport, dport; 243 u_int16_t sport, dport;
244 struct tcphdr *th; 244 u_int16_t *ports;
245 struct udphdr *uh;
246 struct icmphdr *ih;
247 245
248 switch (iph->protocol) { 246 switch (iph->protocol) {
249 case IPPROTO_TCP: 247 case IPPROTO_TCP:
250 th = (void *)iph+iph->ihl*4;
251 sport = ntohs(th->source);
252 dport = ntohs(th->dest);
253 break;
254 case IPPROTO_UDP: 248 case IPPROTO_UDP:
255 uh = (void *)iph+iph->ihl*4; 249 case IPPROTO_SCTP:
256 sport = ntohs(uh->source); 250 case IPPROTO_DCCP:
257 dport = ntohs(uh->dest);
258 break;
259 case IPPROTO_ICMP: 251 case IPPROTO_ICMP:
260 ih = (void *)iph+iph->ihl*4; 252 ports = (void *)iph+iph->ihl*4;
261 sport = ntohs(ih->un.echo.id); 253 sport = ports[0];
262 dport = (ih->type<<8)|ih->code; 254 dport = ports[1];
263 break; 255 break;
264 default: 256 default:
265 if (net_ratelimit()) { 257 if (net_ratelimit()) {
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 0bba3c2bb786..431a3ce6f7b7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
147 /* This packet will not be the same as the other: clear nf fields */ 147 /* This packet will not be the same as the other: clear nf fields */
148 nf_reset(nskb); 148 nf_reset(nskb);
149 nskb->nfmark = 0; 149 nskb->nfmark = 0;
150 skb_init_secmark(nskb);
150 151
151 tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); 152 tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
152 153
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 7c6836c4646e..92980ab8ce48 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -28,9 +28,6 @@
28#include <linux/jhash.h> 28#include <linux/jhash.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/tcp.h>
32#include <linux/udp.h>
33#include <linux/sctp.h>
34#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
35#include <linux/seq_file.h> 32#include <linux/seq_file.h>
36#include <linux/list.h> 33#include <linux/list.h>
@@ -83,6 +80,7 @@ struct ipt_hashlimit_htable {
83 /* used internally */ 80 /* used internally */
84 spinlock_t lock; /* lock for list_head */ 81 spinlock_t lock; /* lock for list_head */
85 u_int32_t rnd; /* random seed for hash */ 82 u_int32_t rnd; /* random seed for hash */
83 int rnd_initialized;
86 struct timer_list timer; /* timer for gc */ 84 struct timer_list timer; /* timer for gc */
87 atomic_t count; /* number entries in table */ 85 atomic_t count; /* number entries in table */
88 86
@@ -137,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
137 135
138 /* initialize hash with random val at the time we allocate 136 /* initialize hash with random val at the time we allocate
139 * the first hashtable entry */ 137 * the first hashtable entry */
140 if (!ht->rnd) 138 if (!ht->rnd_initialized) {
141 get_random_bytes(&ht->rnd, 4); 139 get_random_bytes(&ht->rnd, 4);
140 ht->rnd_initialized = 1;
141 }
142 142
143 if (ht->cfg.max && 143 if (ht->cfg.max &&
144 atomic_read(&ht->count) >= ht->cfg.max) { 144 atomic_read(&ht->count) >= ht->cfg.max) {
@@ -217,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
217 217
218 atomic_set(&hinfo->count, 0); 218 atomic_set(&hinfo->count, 0);
219 atomic_set(&hinfo->use, 1); 219 atomic_set(&hinfo->use, 1);
220 hinfo->rnd = 0; 220 hinfo->rnd_initialized = 0;
221 spin_lock_init(&hinfo->lock); 221 spin_lock_init(&hinfo->lock);
222 hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); 222 hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
223 if (!hinfo->pde) { 223 if (!hinfo->pde) {
@@ -381,49 +381,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
381 dh->rateinfo.credit = dh->rateinfo.credit_cap; 381 dh->rateinfo.credit = dh->rateinfo.credit_cap;
382} 382}
383 383
384static inline int get_ports(const struct sk_buff *skb, int offset,
385 u16 ports[2])
386{
387 union {
388 struct tcphdr th;
389 struct udphdr uh;
390 sctp_sctphdr_t sctph;
391 } hdr_u, *ptr_u;
392
393 /* Must not be a fragment. */
394 if (offset)
395 return 1;
396
397 /* Must be big enough to read ports (both UDP and TCP have
398 them at the start). */
399 ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u);
400 if (!ptr_u)
401 return 1;
402
403 switch (skb->nh.iph->protocol) {
404 case IPPROTO_TCP:
405 ports[0] = ptr_u->th.source;
406 ports[1] = ptr_u->th.dest;
407 break;
408 case IPPROTO_UDP:
409 ports[0] = ptr_u->uh.source;
410 ports[1] = ptr_u->uh.dest;
411 break;
412 case IPPROTO_SCTP:
413 ports[0] = ptr_u->sctph.source;
414 ports[1] = ptr_u->sctph.dest;
415 break;
416 default:
417 /* all other protocols don't supprot per-port hash
418 * buckets */
419 ports[0] = ports[1] = 0;
420 break;
421 }
422
423 return 0;
424}
425
426
427static int 384static int
428hashlimit_match(const struct sk_buff *skb, 385hashlimit_match(const struct sk_buff *skb,
429 const struct net_device *in, 386 const struct net_device *in,
@@ -449,8 +406,22 @@ hashlimit_match(const struct sk_buff *skb,
449 dst.src_ip = skb->nh.iph->saddr; 406 dst.src_ip = skb->nh.iph->saddr;
450 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT 407 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
451 ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { 408 ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
452 u_int16_t ports[2]; 409 u_int16_t _ports[2], *ports;
453 if (get_ports(skb, offset, ports)) { 410
411 switch (skb->nh.iph->protocol) {
412 case IPPROTO_TCP:
413 case IPPROTO_UDP:
414 case IPPROTO_SCTP:
415 case IPPROTO_DCCP:
416 ports = skb_header_pointer(skb, skb->nh.iph->ihl*4,
417 sizeof(_ports), &_ports);
418 break;
419 default:
420 _ports[0] = _ports[1] = 0;
421 ports = _ports;
422 break;
423 }
424 if (!ports) {
454 /* We've been asked to examine this packet, and we 425 /* We've been asked to examine this packet, and we
455 can't. Hence, no choice but to drop. */ 426 can't. Hence, no choice but to drop. */
456 *hotdrop = 1; 427 *hotdrop = 1;
@@ -561,7 +532,7 @@ static void
561hashlimit_destroy(const struct xt_match *match, void *matchinfo, 532hashlimit_destroy(const struct xt_match *match, void *matchinfo,
562 unsigned int matchsize) 533 unsigned int matchsize)
563{ 534{
564 struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; 535 struct ipt_hashlimit_info *r = matchinfo;
565 536
566 htable_put(r->hinfo); 537 htable_put(r->hinfo);
567} 538}
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index b847ee409efb..61a2139f9cfd 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -1,1007 +1,499 @@
1/* Kernel module to check if the source address has been seen recently. */ 1/*
2/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ 2 * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
3/* Author: Stephen Frost <sfrost@snowman.net> */ 3 *
4/* Project Page: http://snowman.net/projects/ipt_recent/ */ 4 * This program is free software; you can redistribute it and/or modify
5/* This software is distributed under the terms of the GPL, Version 2 */ 5 * it under the terms of the GNU General Public License version 2 as
6/* This copyright does not cover user programs that use kernel services 6 * published by the Free Software Foundation.
7 * by normal system calls. */ 7 *
8 8 * This is a replacement of the old ipt_recent module, which carried the
9#include <linux/module.h> 9 * following copyright notice:
10#include <linux/skbuff.h> 10 *
11 * Author: Stephen Frost <sfrost@snowman.net>
12 * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
13 */
14#include <linux/init.h>
15#include <linux/moduleparam.h>
11#include <linux/proc_fs.h> 16#include <linux/proc_fs.h>
12#include <linux/spinlock.h> 17#include <linux/seq_file.h>
13#include <linux/interrupt.h> 18#include <linux/string.h>
14#include <asm/uaccess.h>
15#include <linux/ctype.h> 19#include <linux/ctype.h>
16#include <linux/ip.h> 20#include <linux/list.h>
17#include <linux/vmalloc.h> 21#include <linux/random.h>
18#include <linux/moduleparam.h> 22#include <linux/jhash.h>
23#include <linux/bitops.h>
24#include <linux/skbuff.h>
25#include <linux/inet.h>
19 26
20#include <linux/netfilter_ipv4/ip_tables.h> 27#include <linux/netfilter_ipv4/ip_tables.h>
21#include <linux/netfilter_ipv4/ipt_recent.h> 28#include <linux/netfilter_ipv4/ipt_recent.h>
22 29
23#undef DEBUG 30MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24#define HASH_LOG 9 31MODULE_DESCRIPTION("IP tables recently seen matching module");
32MODULE_LICENSE("GPL");
25 33
26/* Defaults, these can be overridden on the module command-line. */
27static unsigned int ip_list_tot = 100; 34static unsigned int ip_list_tot = 100;
28static unsigned int ip_pkt_list_tot = 20; 35static unsigned int ip_pkt_list_tot = 20;
29static unsigned int ip_list_hash_size = 0; 36static unsigned int ip_list_hash_size = 0;
30static unsigned int ip_list_perms = 0644; 37static unsigned int ip_list_perms = 0644;
31#ifdef DEBUG
32static int debug = 1;
33#endif
34
35static char version[] =
36KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n";
37
38MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
39MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
40MODULE_LICENSE("GPL");
41module_param(ip_list_tot, uint, 0400); 38module_param(ip_list_tot, uint, 0400);
42module_param(ip_pkt_list_tot, uint, 0400); 39module_param(ip_pkt_list_tot, uint, 0400);
43module_param(ip_list_hash_size, uint, 0400); 40module_param(ip_list_hash_size, uint, 0400);
44module_param(ip_list_perms, uint, 0400); 41module_param(ip_list_perms, uint, 0400);
45#ifdef DEBUG 42MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
46module_param(debug, bool, 0600); 43MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
47MODULE_PARM_DESC(debug,"enable debugging output"); 44MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
48#endif 45MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
49MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); 46
50MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); 47
51MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); 48struct recent_entry {
52MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); 49 struct list_head list;
53 50 struct list_head lru_list;
54/* Structure of our list of recently seen addresses. */ 51 u_int32_t addr;
55struct recent_ip_list { 52 u_int8_t ttl;
56 u_int32_t addr; 53 u_int8_t index;
57 u_int8_t ttl; 54 u_int16_t nstamps;
58 unsigned long last_seen; 55 unsigned long stamps[0];
59 unsigned long *last_pkts;
60 u_int32_t oldest_pkt;
61 u_int32_t hash_entry;
62 u_int32_t time_pos;
63};
64
65struct time_info_list {
66 u_int32_t position;
67 u_int32_t time;
68}; 56};
69 57
70/* Structure of our linked list of tables of recent lists. */ 58struct recent_table {
71struct recent_ip_tables { 59 struct list_head list;
72 char name[IPT_RECENT_NAME_LEN]; 60 char name[IPT_RECENT_NAME_LEN];
73 int count;
74 int time_pos;
75 struct recent_ip_list *table;
76 struct recent_ip_tables *next;
77 spinlock_t list_lock;
78 int *hash_table;
79 struct time_info_list *time_info;
80#ifdef CONFIG_PROC_FS 61#ifdef CONFIG_PROC_FS
81 struct proc_dir_entry *status_proc; 62 struct proc_dir_entry *proc;
82#endif /* CONFIG_PROC_FS */ 63#endif
64 unsigned int refcnt;
65 unsigned int entries;
66 struct list_head lru_list;
67 struct list_head iphash[0];
83}; 68};
84 69
85/* Our current list of addresses we have recently seen. 70static LIST_HEAD(tables);
86 * Only added to on a --set, and only updated on --set || --update
87 */
88static struct recent_ip_tables *r_tables = NULL;
89
90/* We protect r_list with this spinlock so two processors are not modifying
91 * the list at the same time.
92 */
93static DEFINE_SPINLOCK(recent_lock); 71static DEFINE_SPINLOCK(recent_lock);
72static DEFINE_MUTEX(recent_mutex);
94 73
95#ifdef CONFIG_PROC_FS 74#ifdef CONFIG_PROC_FS
96/* Our /proc/net/ipt_recent entry */ 75static struct proc_dir_entry *proc_dir;
97static struct proc_dir_entry *proc_net_ipt_recent = NULL; 76static struct file_operations recent_fops;
98#endif
99
100/* Function declaration for later. */
101static int
102match(const struct sk_buff *skb,
103 const struct net_device *in,
104 const struct net_device *out,
105 const struct xt_match *match,
106 const void *matchinfo,
107 int offset,
108 unsigned int protoff,
109 int *hotdrop);
110
111/* Function to hash a given address into the hash table of table_size size */
112static int hash_func(unsigned int addr, int table_size)
113{
114 int result = 0;
115 unsigned int value = addr;
116 do { result ^= value; } while((value >>= HASH_LOG));
117
118#ifdef DEBUG
119 if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
120 result & (table_size - 1),
121 addr,
122 table_size);
123#endif 77#endif
124 78
125 return(result & (table_size - 1)); 79static u_int32_t hash_rnd;
126} 80static int hash_rnd_initted;
127 81
128#ifdef CONFIG_PROC_FS 82static unsigned int recent_entry_hash(u_int32_t addr)
129/* This is the function which produces the output for our /proc output
130 * interface which lists each IP address, the last seen time and the
131 * other recent times the address was seen.
132 */
133
134static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
135{ 83{
136 int len = 0, count, last_len = 0, pkt_count; 84 if (!hash_rnd_initted) {
137 off_t pos = 0; 85 get_random_bytes(&hash_rnd, 4);
138 off_t begin = 0; 86 hash_rnd_initted = 1;
139 struct recent_ip_tables *curr_table;
140
141 curr_table = (struct recent_ip_tables*) data;
142
143 spin_lock_bh(&curr_table->list_lock);
144 for(count = 0; count < ip_list_tot; count++) {
145 if(!curr_table->table[count].addr) continue;
146 last_len = len;
147 len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
148 len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
149 len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
150 len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
151 len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
152 for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
153 if(!curr_table->table[count].last_pkts[pkt_count]) break;
154 len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
155 }
156 len += sprintf(buffer+len,"\n");
157 pos = begin + len;
158 if(pos < offset) { len = 0; begin = pos; }
159 if(pos > offset + length) { len = last_len; break; }
160 } 87 }
161 88 return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1);
162 *start = buffer + (offset - begin);
163 len -= (offset - begin);
164 if(len > length) len = length;
165
166 spin_unlock_bh(&curr_table->list_lock);
167 return len;
168} 89}
169 90
170/* ip_recent_ctrl provides an interface for users to modify the table 91static struct recent_entry *
171 * directly. This allows adding entries, removing entries, and 92recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl)
172 * flushing the entire table.
173 * This is done by opening up the appropriate table for writing and
174 * sending one of:
175 * xx.xx.xx.xx -- Add entry to table with current time
176 * +xx.xx.xx.xx -- Add entry to table with current time
177 * -xx.xx.xx.xx -- Remove entry from table
178 * clear -- Flush table, remove all entries
179 */
180
181static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
182{ 93{
183 static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; 94 struct recent_entry *e;
184 u_int32_t val; 95 unsigned int h;
185 int base, used = 0; 96
186 char c, *cp; 97 h = recent_entry_hash(addr);
187 union iaddr { 98 list_for_each_entry(e, &table->iphash[h], list)
188 uint8_t bytes[4]; 99 if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
189 uint32_t word; 100 return e;
190 } res; 101 return NULL;
191 uint8_t *pp = res.bytes; 102}
192 int digit;
193
194 char buffer[20];
195 int len, check_set = 0, count;
196 u_int32_t addr = 0;
197 struct sk_buff *skb;
198 struct ipt_recent_info *info;
199 struct recent_ip_tables *curr_table;
200
201 curr_table = (struct recent_ip_tables*) data;
202
203 if(size > 20) len = 20; else len = size;
204
205 if(copy_from_user(buffer,input,len)) return -EFAULT;
206
207 if(len < 20) buffer[len] = '\0';
208
209#ifdef DEBUG
210 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
211#endif
212 103
213 cp = buffer; 104static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
214 while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } 105{
106 list_del(&e->list);
107 list_del(&e->lru_list);
108 kfree(e);
109 t->entries--;
110}
215 111
216 /* Check if we are asked to flush the entire table */ 112static struct recent_entry *
217 if(!memcmp(cp,"clear",5)) { 113recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl)
218 used += 5; 114{
219 spin_lock_bh(&curr_table->list_lock); 115 struct recent_entry *e;
220 curr_table->time_pos = 0;
221 for(count = 0; count < ip_list_hash_size; count++) {
222 curr_table->hash_table[count] = -1;
223 }
224 for(count = 0; count < ip_list_tot; count++) {
225 curr_table->table[count].last_seen = 0;
226 curr_table->table[count].addr = 0;
227 curr_table->table[count].ttl = 0;
228 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
229 curr_table->table[count].oldest_pkt = 0;
230 curr_table->table[count].time_pos = 0;
231 curr_table->time_info[count].position = count;
232 curr_table->time_info[count].time = 0;
233 }
234 spin_unlock_bh(&curr_table->list_lock);
235 return used;
236 }
237 116
238 check_set = IPT_RECENT_SET; 117 if (t->entries >= ip_list_tot) {
239 switch(*cp) { 118 e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
240 case '+': check_set = IPT_RECENT_SET; cp++; used++; break; 119 recent_entry_remove(t, e);
241 case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
242 default: if(!isdigit(*cp)) return (used+1); break;
243 } 120 }
121 e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot,
122 GFP_ATOMIC);
123 if (e == NULL)
124 return NULL;
125 e->addr = addr;
126 e->ttl = ttl;
127 e->stamps[0] = jiffies;
128 e->nstamps = 1;
129 e->index = 1;
130 list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
131 list_add_tail(&e->lru_list, &t->lru_list);
132 t->entries++;
133 return e;
134}
244 135
245#ifdef DEBUG 136static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
246 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); 137{
247#endif 138 e->stamps[e->index++] = jiffies;
248 /* Get addr (effectively inet_aton()) */ 139 if (e->index > e->nstamps)
249 /* Shamelessly stolen from libc, a function in the kernel for doing 140 e->nstamps = e->index;
250 * this would, of course, be greatly preferred, but our options appear 141 e->index %= ip_pkt_list_tot;
251 * to be rather limited, so we will just do it ourselves here. 142 list_move_tail(&e->lru_list, &t->lru_list);
252 */ 143}
253 res.word = 0;
254
255 c = *cp;
256 for(;;) {
257 if(!isdigit(c)) return used;
258 val = 0; base = 10; digit = 0;
259 if(c == '0') {
260 c = *++cp;
261 if(c == 'x' || c == 'X') base = 16, c = *++cp;
262 else { base = 8; digit = 1; }
263 }
264 for(;;) {
265 if(isascii(c) && isdigit(c)) {
266 if(base == 8 && (c == '8' || c == '0')) return used;
267 val = (val * base) + (c - '0');
268 c = *++cp;
269 digit = 1;
270 } else if(base == 16 && isascii(c) && isxdigit(c)) {
271 val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
272 c = *++cp;
273 digit = 1;
274 } else break;
275 }
276 if(c == '.') {
277 if(pp > res.bytes + 2 || val > 0xff) return used;
278 *pp++ = val;
279 c = *++cp;
280 } else break;
281 }
282 used = cp - buffer;
283 if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
284 if(c == '\n') used++;
285 if(!digit) return used;
286 144
287 if(val > max[pp - res.bytes]) return used; 145static struct recent_table *recent_table_lookup(const char *name)
288 addr = res.word | htonl(val); 146{
147 struct recent_table *t;
289 148
290 if(!addr && check_set == IPT_RECENT_SET) return used; 149 list_for_each_entry(t, &tables, list)
150 if (!strcmp(t->name, name))
151 return t;
152 return NULL;
153}
291 154
292#ifdef DEBUG 155static void recent_table_flush(struct recent_table *t)
293 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); 156{
294#endif 157 struct recent_entry *e, *next;
158 unsigned int i;
295 159
296 /* Set up and just call match */ 160 for (i = 0; i < ip_list_hash_size; i++) {
297 info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); 161 list_for_each_entry_safe(e, next, &t->iphash[i], list)
298 if(!info) { return -ENOMEM; } 162 recent_entry_remove(t, e);
299 info->seconds = 0;
300 info->hit_count = 0;
301 info->check_set = check_set;
302 info->invert = 0;
303 info->side = IPT_RECENT_SOURCE;
304 strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
305 info->name[IPT_RECENT_NAME_LEN-1] = '\0';
306
307 skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
308 if (!skb) {
309 used = -ENOMEM;
310 goto out_free_info;
311 }
312 skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
313 if (!skb->nh.iph) {
314 used = -ENOMEM;
315 goto out_free_skb;
316 } 163 }
317
318 skb->nh.iph->saddr = addr;
319 skb->nh.iph->daddr = 0;
320 /* Clear ttl since we have no way of knowing it */
321 skb->nh.iph->ttl = 0;
322 match(skb,NULL,NULL,NULL,info,0,0,NULL);
323
324 kfree(skb->nh.iph);
325out_free_skb:
326 kfree(skb);
327out_free_info:
328 kfree(info);
329
330#ifdef DEBUG
331 if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
332#endif
333 return used;
334} 164}
335 165
336#endif /* CONFIG_PROC_FS */
337
338/* 'match' is our primary function, called by the kernel whenever a rule is
339 * hit with our module as an option to it.
340 * What this function does depends on what was specifically asked of it by
341 * the user:
342 * --set -- Add or update last seen time of the source address of the packet
343 * -- matchinfo->check_set == IPT_RECENT_SET
344 * --rcheck -- Just check if the source address is in the list
345 * -- matchinfo->check_set == IPT_RECENT_CHECK
346 * --update -- If the source address is in the list, update last_seen
347 * -- matchinfo->check_set == IPT_RECENT_UPDATE
348 * --remove -- If the source address is in the list, remove it
349 * -- matchinfo->check_set == IPT_RECENT_REMOVE
350 * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
351 * -- matchinfo->seconds
352 * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
353 * -- matchinfo->hit_count
354 * --seconds and --hitcount can be combined
355 */
356static int 166static int
357match(const struct sk_buff *skb, 167ipt_recent_match(const struct sk_buff *skb,
358 const struct net_device *in, 168 const struct net_device *in, const struct net_device *out,
359 const struct net_device *out, 169 const struct xt_match *match, const void *matchinfo,
360 const struct xt_match *match, 170 int offset, unsigned int protoff, int *hotdrop)
361 const void *matchinfo,
362 int offset,
363 unsigned int protoff,
364 int *hotdrop)
365{ 171{
366 int pkt_count, hits_found, ans;
367 unsigned long now;
368 const struct ipt_recent_info *info = matchinfo; 172 const struct ipt_recent_info *info = matchinfo;
369 u_int32_t addr = 0, time_temp; 173 struct recent_table *t;
370 u_int8_t ttl = skb->nh.iph->ttl; 174 struct recent_entry *e;
371 int *hash_table; 175 u_int32_t addr;
372 int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; 176 u_int8_t ttl;
373 struct time_info_list *time_info; 177 int ret = info->invert;
374 struct recent_ip_tables *curr_table;
375 struct recent_ip_tables *last_table;
376 struct recent_ip_list *r_list;
377
378#ifdef DEBUG
379 if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
380#endif
381
382 /* Default is false ^ info->invert */
383 ans = info->invert;
384 178
385#ifdef DEBUG 179 if (info->side == IPT_RECENT_DEST)
386 if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); 180 addr = skb->nh.iph->daddr;
387#endif 181 else
182 addr = skb->nh.iph->saddr;
388 183
389 /* if out != NULL then routing has been done and TTL changed. 184 ttl = skb->nh.iph->ttl;
390 * We change it back here internally for match what came in before routing. */ 185 /* use TTL as seen before forwarding */
391 if(out) ttl++; 186 if (out && !skb->sk)
187 ttl++;
392 188
393 /* Find the right table */
394 spin_lock_bh(&recent_lock); 189 spin_lock_bh(&recent_lock);
395 curr_table = r_tables; 190 t = recent_table_lookup(info->name);
396 while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); 191 e = recent_entry_lookup(t, addr,
397 192 info->check_set & IPT_RECENT_TTL ? ttl : 0);
398#ifdef DEBUG 193 if (e == NULL) {
399 if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); 194 if (!(info->check_set & IPT_RECENT_SET))
400#endif 195 goto out;
401 196 e = recent_entry_init(t, addr, ttl);
402 spin_unlock_bh(&recent_lock); 197 if (e == NULL)
403 198 *hotdrop = 1;
404 /* Table with this name not found, match impossible */ 199 ret ^= 1;
405 if(!curr_table) { return ans; } 200 goto out;
406
407 /* Make sure no one is changing the list while we work with it */
408 spin_lock_bh(&curr_table->list_lock);
409
410 r_list = curr_table->table;
411 if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
412
413 if(!addr) {
414#ifdef DEBUG
415 if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
416#endif
417 spin_unlock_bh(&curr_table->list_lock);
418 return ans;
419 }
420
421#ifdef DEBUG
422 if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
423#endif
424
425 /* Get jiffies now in case they changed while we were waiting for a lock */
426 now = jiffies;
427 hash_table = curr_table->hash_table;
428 time_info = curr_table->time_info;
429
430 orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
431 /* Hash entry at this result used */
432 /* Check for TTL match if requested. If TTL is zero then a match would never
433 * happen, so match regardless of existing TTL in that case. Zero means the
434 * entry was added via the /proc interface anyway, so we will just use the
435 * first TTL we get for that IP address. */
436 if(info->check_set & IPT_RECENT_TTL) {
437 while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
438 (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
439 /* Collision in hash table */
440 hash_result = (hash_result + 1) % ip_list_hash_size;
441 }
442 } else {
443 while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
444 /* Collision in hash table */
445 hash_result = (hash_result + 1) % ip_list_hash_size;
446 }
447 }
448
449 if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
450 /* IP not in list and not asked to SET */
451 spin_unlock_bh(&curr_table->list_lock);
452 return ans;
453 }
454
455 /* Check if we need to handle the collision, do not need to on REMOVE */
456 if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
457#ifdef DEBUG
458 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
459 orig_hash_result,
460 hash_result,
461 r_list[hash_table[orig_hash_result]].addr,
462 addr);
463#endif
464
465 /* We had a collision.
466 * orig_hash_result is where we started, hash_result is where we ended up.
467 * So, swap them because we are likely to see the same guy again sooner */
468#ifdef DEBUG
469 if(debug) {
470 printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
471 printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
472 r_list[hash_table[orig_hash_result]].hash_entry);
473 }
474#endif
475
476 r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
477
478
479 temp = hash_table[orig_hash_result];
480#ifdef DEBUG
481 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
482#endif
483 hash_table[orig_hash_result] = hash_table[hash_result];
484 hash_table[hash_result] = temp;
485 temp = hash_result;
486 hash_result = orig_hash_result;
487 orig_hash_result = temp;
488 time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
489 if(hash_table[hash_result] != -1) {
490 r_list[hash_table[hash_result]].hash_entry = hash_result;
491 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
492 }
493
494#ifdef DEBUG
495 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
496#endif
497 } 201 }
498 202
499 if(hash_table[hash_result] == -1) { 203 if (info->check_set & IPT_RECENT_SET)
500#ifdef DEBUG 204 ret ^= 1;
501 if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", 205 else if (info->check_set & IPT_RECENT_REMOVE) {
502 hash_result, addr); 206 recent_entry_remove(t, e);
503#endif 207 ret ^= 1;
504 208 } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
505 /* New item found and IPT_RECENT_SET, so we need to add it */ 209 unsigned long t = jiffies - info->seconds * HZ;
506 location = time_info[curr_table->time_pos].position; 210 unsigned int i, hits = 0;
507 hash_table[r_list[location].hash_entry] = -1; 211
508 hash_table[hash_result] = location; 212 for (i = 0; i < e->nstamps; i++) {
509 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); 213 if (info->seconds && time_after(t, e->stamps[i]))
510 r_list[location].time_pos = curr_table->time_pos; 214 continue;
511 r_list[location].addr = addr; 215 if (++hits >= info->hit_count) {
512 r_list[location].ttl = ttl; 216 ret ^= 1;
513 r_list[location].last_seen = now; 217 break;
514 r_list[location].oldest_pkt = 1;
515 r_list[location].last_pkts[0] = now;
516 r_list[location].hash_entry = hash_result;
517 time_info[curr_table->time_pos].time = r_list[location].last_seen;
518 curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
519
520 ans = !info->invert;
521 } else {
522#ifdef DEBUG
523 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
524 hash_result,
525 addr);
526#endif
527
528 /* Existing item found */
529 location = hash_table[hash_result];
530 /* We have a match on address, now to make sure it meets all requirements for a
531 * full match. */
532 if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
533 if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
534 if(info->seconds && !info->hit_count) {
535 if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
536 }
537 if(info->seconds && info->hit_count) {
538 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
539 if(r_list[location].last_pkts[pkt_count] == 0) break;
540 if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
541 }
542 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
543 }
544 if(info->hit_count && !info->seconds) {
545 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
546 if(r_list[location].last_pkts[pkt_count] == 0) break;
547 hits_found++;
548 }
549 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
550 } 218 }
551 } 219 }
552#ifdef DEBUG
553 if(debug) {
554 if(ans)
555 printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
556 else
557 printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
558 }
559#endif
560
561 /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
562 * current timestamp to the last_seen. */
563 if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
564#ifdef DEBUG
565 if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
566#endif
567 /* Have to update our time info */
568 time_loc = r_list[location].time_pos;
569 time_info[time_loc].time = now;
570 time_info[time_loc].position = location;
571 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
572 time_temp = time_info[time_loc].time;
573 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
574 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
575 time_temp = time_info[time_loc].position;
576 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
577 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
578 r_list[time_info[time_loc].position].time_pos = time_loc;
579 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
580 time_loc = (time_loc+1) % ip_list_tot;
581 }
582 r_list[location].time_pos = time_loc;
583 r_list[location].ttl = ttl;
584 r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
585 r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot;
586 r_list[location].last_seen = now;
587 }
588 /* If we have been asked to remove the entry from the list, just set it to 0 */
589 if(info->check_set & IPT_RECENT_REMOVE) {
590#ifdef DEBUG
591 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
592#endif
593 /* Check if this is part of a collision chain */
594 while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
595 orig_hash_result++;
596 if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
597 /* Found collision chain, how deep does this rabbit hole go? */
598#ifdef DEBUG
599 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
600#endif
601 end_collision_chain = orig_hash_result;
602 }
603 }
604 if(end_collision_chain != -1) {
605#ifdef DEBUG
606 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
607#endif
608 /* Part of a collision chain, swap it with the end of the chain
609 * before removing. */
610 r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
611 temp = hash_table[end_collision_chain];
612 hash_table[end_collision_chain] = hash_table[hash_result];
613 hash_table[hash_result] = temp;
614 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
615 hash_result = end_collision_chain;
616 r_list[hash_table[hash_result]].hash_entry = hash_result;
617 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
618 }
619 location = hash_table[hash_result];
620 hash_table[r_list[location].hash_entry] = -1;
621 time_loc = r_list[location].time_pos;
622 time_info[time_loc].time = 0;
623 time_info[time_loc].position = location;
624 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
625 time_temp = time_info[time_loc].time;
626 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
627 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
628 time_temp = time_info[time_loc].position;
629 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
630 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
631 r_list[time_info[time_loc].position].time_pos = time_loc;
632 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
633 time_loc = (time_loc+1) % ip_list_tot;
634 }
635 r_list[location].time_pos = time_loc;
636 r_list[location].last_seen = 0;
637 r_list[location].addr = 0;
638 r_list[location].ttl = 0;
639 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
640 r_list[location].oldest_pkt = 0;
641 ans = !info->invert;
642 }
643 spin_unlock_bh(&curr_table->list_lock);
644 return ans;
645 } 220 }
646 221
647 spin_unlock_bh(&curr_table->list_lock); 222 if (info->check_set & IPT_RECENT_SET ||
648#ifdef DEBUG 223 (info->check_set & IPT_RECENT_UPDATE && ret)) {
649 if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); 224 recent_entry_update(t, e);
650#endif 225 e->ttl = ttl;
651 return ans; 226 }
227out:
228 spin_unlock_bh(&recent_lock);
229 return ret;
652} 230}
653 231
654/* This function is to verify that the rule given during the userspace iptables
655 * command is correct.
656 * If the command is valid then we check if the table name referred to by the
657 * rule exists, if not it is created.
658 */
659static int 232static int
660checkentry(const char *tablename, 233ipt_recent_checkentry(const char *tablename, const void *ip,
661 const void *ip, 234 const struct xt_match *match, void *matchinfo,
662 const struct xt_match *match, 235 unsigned int matchsize, unsigned int hook_mask)
663 void *matchinfo,
664 unsigned int matchsize,
665 unsigned int hook_mask)
666{ 236{
667 int flag = 0, c;
668 unsigned long *hold;
669 const struct ipt_recent_info *info = matchinfo; 237 const struct ipt_recent_info *info = matchinfo;
670 struct recent_ip_tables *curr_table, *find_table, *last_table; 238 struct recent_table *t;
671 239 unsigned i;
672#ifdef DEBUG 240 int ret = 0;
673 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
674#endif
675
676 /* seconds and hit_count only valid for CHECK/UPDATE */
677 if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
678 if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
679 if(info->check_set & IPT_RECENT_CHECK) flag++;
680 if(info->check_set & IPT_RECENT_UPDATE) flag++;
681
682 /* One and only one of these should ever be set */
683 if(flag != 1) return 0;
684
685 /* Name must be set to something */
686 if(!info->name || !info->name[0]) return 0;
687 241
688 /* Things look good, create a list for this if it does not exist */ 242 if (hweight8(info->check_set &
689 /* Lock the linked list while we play with it */ 243 (IPT_RECENT_SET | IPT_RECENT_REMOVE |
690 spin_lock_bh(&recent_lock); 244 IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
691 245 return 0;
692 /* Look for an entry with this name already created */ 246 if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
693 /* Finds the end of the list and the entry before the end if current name does not exist */ 247 (info->seconds || info->hit_count))
694 find_table = r_tables; 248 return 0;
695 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); 249 if (info->name[0] == '\0' ||
250 strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
251 return 0;
696 252
697 /* If a table already exists just increment the count on that table and return */ 253 mutex_lock(&recent_mutex);
698 if(find_table) { 254 t = recent_table_lookup(info->name);
699#ifdef DEBUG 255 if (t != NULL) {
700 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); 256 t->refcnt++;
701#endif 257 ret = 1;
702 find_table->count++; 258 goto out;
703 spin_unlock_bh(&recent_lock);
704 return 1;
705 } 259 }
706 260
707 spin_unlock_bh(&recent_lock); 261 t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
708 262 GFP_KERNEL);
709 /* Table with this name not found */ 263 if (t == NULL)
710 /* Allocate memory for new linked list item */ 264 goto out;
711 265 t->refcnt = 1;
712#ifdef DEBUG 266 strcpy(t->name, info->name);
713 if(debug) { 267 INIT_LIST_HEAD(&t->lru_list);
714 printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); 268 for (i = 0; i < ip_list_hash_size; i++)
715 printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); 269 INIT_LIST_HEAD(&t->iphash[i]);
270#ifdef CONFIG_PROC_FS
271 t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir);
272 if (t->proc == NULL) {
273 kfree(t);
274 goto out;
716 } 275 }
276 t->proc->proc_fops = &recent_fops;
277 t->proc->data = t;
717#endif 278#endif
279 spin_lock_bh(&recent_lock);
280 list_add_tail(&t->list, &tables);
281 spin_unlock_bh(&recent_lock);
282 ret = 1;
283out:
284 mutex_unlock(&recent_mutex);
285 return ret;
286}
718 287
719 curr_table = vmalloc(sizeof(struct recent_ip_tables)); 288static void
720 if(curr_table == NULL) return 0; 289ipt_recent_destroy(const struct xt_match *match, void *matchinfo,
721 290 unsigned int matchsize)
722 spin_lock_init(&curr_table->list_lock); 291{
723 curr_table->next = NULL; 292 const struct ipt_recent_info *info = matchinfo;
724 curr_table->count = 1; 293 struct recent_table *t;
725 curr_table->time_pos = 0;
726 strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
727 curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
728
729 /* Allocate memory for this table and the list of packets in each entry. */
730#ifdef DEBUG
731 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
732 sizeof(struct recent_ip_list)*ip_list_tot,
733 info->name);
734#endif
735
736 curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
737 if(curr_table->table == NULL) { vfree(curr_table); return 0; }
738 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
739#ifdef DEBUG
740 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
741 sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
742#endif
743
744 hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
745#ifdef DEBUG
746 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
747#endif
748 if(hold == NULL) {
749 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
750 vfree(curr_table->table);
751 vfree(curr_table);
752 return 0;
753 }
754 for(c = 0; c < ip_list_tot; c++) {
755 curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
756 }
757 294
758 /* Allocate memory for the hash table */ 295 mutex_lock(&recent_mutex);
759#ifdef DEBUG 296 t = recent_table_lookup(info->name);
760 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", 297 if (--t->refcnt == 0) {
761 sizeof(int)*ip_list_hash_size); 298 spin_lock_bh(&recent_lock);
299 list_del(&t->list);
300 spin_unlock_bh(&recent_lock);
301 recent_table_flush(t);
302#ifdef CONFIG_PROC_FS
303 remove_proc_entry(t->name, proc_dir);
762#endif 304#endif
763 305 kfree(t);
764 curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
765 if(!curr_table->hash_table) {
766 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
767 vfree(hold);
768 vfree(curr_table->table);
769 vfree(curr_table);
770 return 0;
771 }
772
773 for(c = 0; c < ip_list_hash_size; c++) {
774 curr_table->hash_table[c] = -1;
775 } 306 }
307 mutex_unlock(&recent_mutex);
308}
776 309
777 /* Allocate memory for the time info */ 310#ifdef CONFIG_PROC_FS
778#ifdef DEBUG 311struct recent_iter_state {
779 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", 312 struct recent_table *table;
780 sizeof(struct time_info_list)*ip_list_tot); 313 unsigned int bucket;
781#endif 314};
782 315
783 curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); 316static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
784 if(!curr_table->time_info) { 317{
785 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); 318 struct recent_iter_state *st = seq->private;
786 vfree(curr_table->hash_table); 319 struct recent_table *t = st->table;
787 vfree(hold); 320 struct recent_entry *e;
788 vfree(curr_table->table); 321 loff_t p = *pos;
789 vfree(curr_table);
790 return 0;
791 }
792 for(c = 0; c < ip_list_tot; c++) {
793 curr_table->time_info[c].position = c;
794 curr_table->time_info[c].time = 0;
795 }
796 322
797 /* Put the new table in place */
798 spin_lock_bh(&recent_lock); 323 spin_lock_bh(&recent_lock);
799 find_table = r_tables;
800 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
801
802 /* If a table already exists just increment the count on that table and return */
803 if(find_table) {
804 find_table->count++;
805 spin_unlock_bh(&recent_lock);
806#ifdef DEBUG
807 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
808#endif
809 vfree(curr_table->time_info);
810 vfree(curr_table->hash_table);
811 vfree(hold);
812 vfree(curr_table->table);
813 vfree(curr_table);
814 return 1;
815 }
816 if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
817
818 spin_unlock_bh(&recent_lock);
819 324
820#ifdef CONFIG_PROC_FS 325 for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) {
821 /* Create our proc 'status' entry. */ 326 list_for_each_entry(e, &t->iphash[st->bucket], list) {
822 curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); 327 if (p-- == 0)
823 if (!curr_table->status_proc) { 328 return e;
824 vfree(hold);
825 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
826 /* Destroy the created table */
827 spin_lock_bh(&recent_lock);
828 last_table = NULL;
829 curr_table = r_tables;
830 if(!curr_table) {
831#ifdef DEBUG
832 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
833#endif
834 spin_unlock_bh(&recent_lock);
835 return 0;
836 }
837 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
838 if(!curr_table) {
839#ifdef DEBUG
840 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
841#endif
842 spin_unlock_bh(&recent_lock);
843 return 0;
844 } 329 }
845 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
846 spin_unlock_bh(&recent_lock);
847 vfree(curr_table->time_info);
848 vfree(curr_table->hash_table);
849 vfree(curr_table->table);
850 vfree(curr_table);
851 return 0;
852 } 330 }
853 331 return NULL;
854 curr_table->status_proc->owner = THIS_MODULE; 332}
855 curr_table->status_proc->data = curr_table;
856 wmb();
857 curr_table->status_proc->read_proc = ip_recent_get_info;
858 curr_table->status_proc->write_proc = ip_recent_ctrl;
859#endif /* CONFIG_PROC_FS */
860
861#ifdef DEBUG
862 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
863#endif
864 333
865 return 1; 334static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
335{
336 struct recent_iter_state *st = seq->private;
337 struct recent_table *t = st->table;
338 struct recent_entry *e = v;
339 struct list_head *head = e->list.next;
340
341 while (head == &t->iphash[st->bucket]) {
342 if (++st->bucket >= ip_list_hash_size)
343 return NULL;
344 head = t->iphash[st->bucket].next;
345 }
346 (*pos)++;
347 return list_entry(head, struct recent_entry, list);
866} 348}
867 349
868/* This function is called in the event that a rule matching this module is 350static void recent_seq_stop(struct seq_file *s, void *v)
869 * removed.
870 * When this happens we need to check if there are no other rules matching
871 * the table given. If that is the case then we remove the table and clean
872 * up its memory.
873 */
874static void
875destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
876{ 351{
877 const struct ipt_recent_info *info = matchinfo; 352 spin_unlock_bh(&recent_lock);
878 struct recent_ip_tables *curr_table, *last_table; 353}
879 354
880#ifdef DEBUG 355static int recent_seq_show(struct seq_file *seq, void *v)
881 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); 356{
882#endif 357 struct recent_entry *e = v;
358 unsigned int i;
359
360 i = (e->index - 1) % ip_pkt_list_tot;
361 seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
362 NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
363 for (i = 0; i < e->nstamps; i++)
364 seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
365 seq_printf(seq, "\n");
366 return 0;
367}
883 368
884 if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; 369static struct seq_operations recent_seq_ops = {
370 .start = recent_seq_start,
371 .next = recent_seq_next,
372 .stop = recent_seq_stop,
373 .show = recent_seq_show,
374};
885 375
886 /* Lock the linked list while we play with it */ 376static int recent_seq_open(struct inode *inode, struct file *file)
887 spin_lock_bh(&recent_lock); 377{
378 struct proc_dir_entry *pde = PDE(inode);
379 struct seq_file *seq;
380 struct recent_iter_state *st;
381 int ret;
382
383 st = kzalloc(sizeof(*st), GFP_KERNEL);
384 if (st == NULL)
385 return -ENOMEM;
386 ret = seq_open(file, &recent_seq_ops);
387 if (ret)
388 kfree(st);
389 st->table = pde->data;
390 seq = file->private_data;
391 seq->private = st;
392 return ret;
393}
888 394
889 /* Look for an entry with this name already created */ 395static ssize_t recent_proc_write(struct file *file, const char __user *input,
890 /* Finds the end of the list and the entry before the end if current name does not exist */ 396 size_t size, loff_t *loff)
891 last_table = NULL; 397{
892 curr_table = r_tables; 398 struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
893 if(!curr_table) { 399 struct recent_table *t = pde->data;
894#ifdef DEBUG 400 struct recent_entry *e;
895 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); 401 char buf[sizeof("+255.255.255.255")], *c = buf;
896#endif 402 u_int32_t addr;
403 int add;
404
405 if (size > sizeof(buf))
406 size = sizeof(buf);
407 if (copy_from_user(buf, input, size))
408 return -EFAULT;
409 while (isspace(*c))
410 c++;
411
412 if (size - (c - buf) < 5)
413 return c - buf;
414 if (!strncmp(c, "clear", 5)) {
415 c += 5;
416 spin_lock_bh(&recent_lock);
417 recent_table_flush(t);
897 spin_unlock_bh(&recent_lock); 418 spin_unlock_bh(&recent_lock);
898 return; 419 return c - buf;
899 } 420 }
900 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
901 421
902 /* If a table does not exist then do nothing and return */ 422 switch (*c) {
903 if(!curr_table) { 423 case '-':
904#ifdef DEBUG 424 add = 0;
905 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); 425 c++;
906#endif 426 break;
907 spin_unlock_bh(&recent_lock); 427 case '+':
908 return; 428 c++;
429 default:
430 add = 1;
431 break;
909 } 432 }
433 addr = in_aton(c);
910 434
911 curr_table->count--; 435 spin_lock_bh(&recent_lock);
912 436 e = recent_entry_lookup(t, addr, 0);
913 /* If count is still non-zero then there are still rules referenceing it so we do nothing */ 437 if (e == NULL) {
914 if(curr_table->count) { 438 if (add)
915#ifdef DEBUG 439 recent_entry_init(t, addr, 0);
916 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); 440 } else {
917#endif 441 if (add)
918 spin_unlock_bh(&recent_lock); 442 recent_entry_update(t, e);
919 return; 443 else
444 recent_entry_remove(t, e);
920 } 445 }
921
922#ifdef DEBUG
923 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
924#endif
925
926 /* Count must be zero so we remove this table from the list */
927 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
928
929 spin_unlock_bh(&recent_lock); 446 spin_unlock_bh(&recent_lock);
447 return size;
448}
930 449
931 /* lock to make sure any late-runners still using this after we removed it from 450static struct file_operations recent_fops = {
932 * the list finish up then remove everything */ 451 .open = recent_seq_open,
933 spin_lock_bh(&curr_table->list_lock); 452 .read = seq_read,
934 spin_unlock_bh(&curr_table->list_lock); 453 .write = recent_proc_write,
935 454 .release = seq_release_private,
936#ifdef CONFIG_PROC_FS 455 .owner = THIS_MODULE,
937 if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); 456};
938#endif /* CONFIG_PROC_FS */ 457#endif /* CONFIG_PROC_FS */
939 vfree(curr_table->table[0].last_pkts);
940 vfree(curr_table->table);
941 vfree(curr_table->hash_table);
942 vfree(curr_table->time_info);
943 vfree(curr_table);
944
945#ifdef DEBUG
946 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
947#endif
948 458
949 return;
950}
951
952/* This is the structure we pass to ipt_register to register our
953 * module with iptables.
954 */
955static struct ipt_match recent_match = { 459static struct ipt_match recent_match = {
956 .name = "recent", 460 .name = "recent",
957 .match = match, 461 .match = ipt_recent_match,
958 .matchsize = sizeof(struct ipt_recent_info), 462 .matchsize = sizeof(struct ipt_recent_info),
959 .checkentry = checkentry, 463 .checkentry = ipt_recent_checkentry,
960 .destroy = destroy, 464 .destroy = ipt_recent_destroy,
961 .me = THIS_MODULE 465 .me = THIS_MODULE,
962}; 466};
963 467
964/* Kernel module initialization. */
965static int __init ipt_recent_init(void) 468static int __init ipt_recent_init(void)
966{ 469{
967 int err, count; 470 int err;
968 471
969 printk(version); 472 if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
970#ifdef CONFIG_PROC_FS 473 return -EINVAL;
971 proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); 474 ip_list_hash_size = 1 << fls(ip_list_tot);
972 if(!proc_net_ipt_recent) return -ENOMEM;
973#endif
974
975 if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
976 printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
977 ip_list_hash_size = 0;
978 }
979
980 if(!ip_list_hash_size) {
981 ip_list_hash_size = ip_list_tot*3;
982 count = 2*2;
983 while(ip_list_hash_size > count) count = count*2;
984 ip_list_hash_size = count;
985 }
986
987#ifdef DEBUG
988 if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
989#endif
990 475
991 err = ipt_register_match(&recent_match); 476 err = ipt_register_match(&recent_match);
477#ifdef CONFIG_PROC_FS
992 if (err) 478 if (err)
993 remove_proc_entry("ipt_recent", proc_net); 479 return err;
480 proc_dir = proc_mkdir("ipt_recent", proc_net);
481 if (proc_dir == NULL) {
482 ipt_unregister_match(&recent_match);
483 err = -ENOMEM;
484 }
485#endif
994 return err; 486 return err;
995} 487}
996 488
997/* Kernel module destruction. */ 489static void __exit ipt_recent_exit(void)
998static void __exit ipt_recent_fini(void)
999{ 490{
491 BUG_ON(!list_empty(&tables));
1000 ipt_unregister_match(&recent_match); 492 ipt_unregister_match(&recent_match);
1001 493#ifdef CONFIG_PROC_FS
1002 remove_proc_entry("ipt_recent",proc_net); 494 remove_proc_entry("ipt_recent", proc_net);
495#endif
1003} 496}
1004 497
1005/* Register our module with the kernel. */
1006module_init(ipt_recent_init); 498module_init(ipt_recent_init);
1007module_exit(ipt_recent_fini); 499module_exit(ipt_recent_exit);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 77d974443c7b..8cc8e1b36778 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum,
145 145
146 /* This is where we call the helper: as the packet goes out. */ 146 /* This is where we call the helper: as the packet goes out. */
147 ct = nf_ct_get(*pskb, &ctinfo); 147 ct = nf_ct_get(*pskb, &ctinfo);
148 if (!ct) 148 if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
149 return NF_ACCEPT; 149 return NF_ACCEPT;
150 150
151 help = nfct_help(ct); 151 help = nfct_help(ct);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b0d361cc6e6..663a73ee3f2f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
235 } 235 }
236 236
237 /* See ip_conntrack_proto_tcp.c */ 237 /* See ip_conntrack_proto_tcp.c */
238 if (hooknum == NF_IP_PRE_ROUTING && 238 if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
239 nf_ip_checksum(skb, hooknum, dataoff, 0)) { 239 nf_ip_checksum(skb, hooknum, dataoff, 0)) {
240 if (LOG_INVALID(IPPROTO_ICMP)) 240 if (LOG_INVALID(IPPROTO_ICMP))
241 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 241 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fc2562415555..bd221ec3f81e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk)
103} 103}
104 104
105struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, 105struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
106 unsigned long raddr, unsigned long laddr, 106 __be32 raddr, __be32 laddr,
107 int dif) 107 int dif)
108{ 108{
109 struct hlist_node *node; 109 struct hlist_node *node;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3adfcf00..ce4cd5f35511 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -182,14 +182,6 @@ ctl_table ipv4_table[] = {
182 .strategy = &ipv4_doint_and_flush_strategy, 182 .strategy = &ipv4_doint_and_flush_strategy,
183 }, 183 },
184 { 184 {
185 .ctl_name = NET_IPV4_AUTOCONFIG,
186 .procname = "ip_autoconfig",
187 .data = &ipv4_config.autoconfig,
188 .maxlen = sizeof(int),
189 .mode = 0644,
190 .proc_handler = &proc_dointvec
191 },
192 {
193 .ctl_name = NET_IPV4_NO_PMTU_DISC, 185 .ctl_name = NET_IPV4_NO_PMTU_DISC,
194 .procname = "ip_no_pmtu_disc", 186 .procname = "ip_no_pmtu_disc",
195 .data = &ipv4_config.no_pmtu_disc, 187 .data = &ipv4_config.no_pmtu_disc,
@@ -688,6 +680,24 @@ ctl_table ipv4_table[] = {
688 .mode = 0644, 680 .mode = 0644,
689 .proc_handler = &proc_dointvec 681 .proc_handler = &proc_dointvec
690 }, 682 },
683#ifdef CONFIG_NET_DMA
684 {
685 .ctl_name = NET_TCP_DMA_COPYBREAK,
686 .procname = "tcp_dma_copybreak",
687 .data = &sysctl_tcp_dma_copybreak,
688 .maxlen = sizeof(int),
689 .mode = 0644,
690 .proc_handler = &proc_dointvec
691 },
692#endif
693 {
694 .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE,
695 .procname = "tcp_slow_start_after_idle",
696 .data = &sysctl_tcp_slow_start_after_idle,
697 .maxlen = sizeof(int),
698 .mode = 0644,
699 .proc_handler = &proc_dointvec
700 },
691 { .ctl_name = 0 } 701 { .ctl_name = 0 }
692}; 702};
693 703
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b8055037..74998f250071 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -263,7 +263,7 @@
263#include <net/tcp.h> 263#include <net/tcp.h>
264#include <net/xfrm.h> 264#include <net/xfrm.h>
265#include <net/ip.h> 265#include <net/ip.h>
266 266#include <net/netdma.h>
267 267
268#include <asm/uaccess.h> 268#include <asm/uaccess.h>
269#include <asm/ioctls.h> 269#include <asm/ioctls.h>
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
622 ssize_t res; 622 ssize_t res;
623 struct sock *sk = sock->sk; 623 struct sock *sk = sock->sk;
624 624
625#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
626
627 if (!(sk->sk_route_caps & NETIF_F_SG) || 625 if (!(sk->sk_route_caps & NETIF_F_SG) ||
628 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) 626 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
629 return sock_no_sendpage(sock, page, offset, size, flags); 627 return sock_no_sendpage(sock, page, offset, size, flags);
630 628
631#undef TCP_ZC_CSUM_FLAGS
632
633 lock_sock(sk); 629 lock_sock(sk);
634 TCP_CHECK_TIMER(sk); 630 TCP_CHECK_TIMER(sk);
635 res = do_tcp_sendpages(sk, &page, offset, size, flags); 631 res = do_tcp_sendpages(sk, &page, offset, size, flags);
@@ -726,9 +722,7 @@ new_segment:
726 /* 722 /*
727 * Check whether we can use HW checksum. 723 * Check whether we can use HW checksum.
728 */ 724 */
729 if (sk->sk_route_caps & 725 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
730 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
731 NETIF_F_HW_CSUM))
732 skb->ip_summed = CHECKSUM_HW; 726 skb->ip_summed = CHECKSUM_HW;
733 727
734 skb_entail(sk, tp, skb); 728 skb_entail(sk, tp, skb);
@@ -937,7 +931,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
937 * calculation of whether or not we must ACK for the sake of 931 * calculation of whether or not we must ACK for the sake of
938 * a window update. 932 * a window update.
939 */ 933 */
940static void cleanup_rbuf(struct sock *sk, int copied) 934void tcp_cleanup_rbuf(struct sock *sk, int copied)
941{ 935{
942 struct tcp_sock *tp = tcp_sk(sk); 936 struct tcp_sock *tp = tcp_sk(sk);
943 int time_to_ack = 0; 937 int time_to_ack = 0;
@@ -1072,11 +1066,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1072 break; 1066 break;
1073 } 1067 }
1074 if (skb->h.th->fin) { 1068 if (skb->h.th->fin) {
1075 sk_eat_skb(sk, skb); 1069 sk_eat_skb(sk, skb, 0);
1076 ++seq; 1070 ++seq;
1077 break; 1071 break;
1078 } 1072 }
1079 sk_eat_skb(sk, skb); 1073 sk_eat_skb(sk, skb, 0);
1080 if (!desc->count) 1074 if (!desc->count)
1081 break; 1075 break;
1082 } 1076 }
@@ -1086,7 +1080,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1086 1080
1087 /* Clean up data we have read: This will do ACK frames. */ 1081 /* Clean up data we have read: This will do ACK frames. */
1088 if (copied) 1082 if (copied)
1089 cleanup_rbuf(sk, copied); 1083 tcp_cleanup_rbuf(sk, copied);
1090 return copied; 1084 return copied;
1091} 1085}
1092 1086
@@ -1110,6 +1104,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1110 int target; /* Read at least this many bytes */ 1104 int target; /* Read at least this many bytes */
1111 long timeo; 1105 long timeo;
1112 struct task_struct *user_recv = NULL; 1106 struct task_struct *user_recv = NULL;
1107 int copied_early = 0;
1113 1108
1114 lock_sock(sk); 1109 lock_sock(sk);
1115 1110
@@ -1133,6 +1128,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1133 1128
1134 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1129 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1135 1130
1131#ifdef CONFIG_NET_DMA
1132 tp->ucopy.dma_chan = NULL;
1133 preempt_disable();
1134 if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1135 !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
1136 preempt_enable_no_resched();
1137 tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
1138 } else
1139 preempt_enable_no_resched();
1140#endif
1141
1136 do { 1142 do {
1137 struct sk_buff *skb; 1143 struct sk_buff *skb;
1138 u32 offset; 1144 u32 offset;
@@ -1220,7 +1226,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1220 } 1226 }
1221 } 1227 }
1222 1228
1223 cleanup_rbuf(sk, copied); 1229 tcp_cleanup_rbuf(sk, copied);
1224 1230
1225 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { 1231 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1226 /* Install new reader */ 1232 /* Install new reader */
@@ -1274,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1274 } else 1280 } else
1275 sk_wait_data(sk, &timeo); 1281 sk_wait_data(sk, &timeo);
1276 1282
1283#ifdef CONFIG_NET_DMA
1284 tp->ucopy.wakeup = 0;
1285#endif
1286
1277 if (user_recv) { 1287 if (user_recv) {
1278 int chunk; 1288 int chunk;
1279 1289
@@ -1329,13 +1339,39 @@ do_prequeue:
1329 } 1339 }
1330 1340
1331 if (!(flags & MSG_TRUNC)) { 1341 if (!(flags & MSG_TRUNC)) {
1332 err = skb_copy_datagram_iovec(skb, offset, 1342#ifdef CONFIG_NET_DMA
1333 msg->msg_iov, used); 1343 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1334 if (err) { 1344 tp->ucopy.dma_chan = get_softnet_dma();
1335 /* Exception. Bailout! */ 1345
1336 if (!copied) 1346 if (tp->ucopy.dma_chan) {
1337 copied = -EFAULT; 1347 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1338 break; 1348 tp->ucopy.dma_chan, skb, offset,
1349 msg->msg_iov, used,
1350 tp->ucopy.pinned_list);
1351
1352 if (tp->ucopy.dma_cookie < 0) {
1353
1354 printk(KERN_ALERT "dma_cookie < 0\n");
1355
1356 /* Exception. Bailout! */
1357 if (!copied)
1358 copied = -EFAULT;
1359 break;
1360 }
1361 if ((offset + used) == skb->len)
1362 copied_early = 1;
1363
1364 } else
1365#endif
1366 {
1367 err = skb_copy_datagram_iovec(skb, offset,
1368 msg->msg_iov, used);
1369 if (err) {
1370 /* Exception. Bailout! */
1371 if (!copied)
1372 copied = -EFAULT;
1373 break;
1374 }
1339 } 1375 }
1340 } 1376 }
1341 1377
@@ -1355,15 +1391,19 @@ skip_copy:
1355 1391
1356 if (skb->h.th->fin) 1392 if (skb->h.th->fin)
1357 goto found_fin_ok; 1393 goto found_fin_ok;
1358 if (!(flags & MSG_PEEK)) 1394 if (!(flags & MSG_PEEK)) {
1359 sk_eat_skb(sk, skb); 1395 sk_eat_skb(sk, skb, copied_early);
1396 copied_early = 0;
1397 }
1360 continue; 1398 continue;
1361 1399
1362 found_fin_ok: 1400 found_fin_ok:
1363 /* Process the FIN. */ 1401 /* Process the FIN. */
1364 ++*seq; 1402 ++*seq;
1365 if (!(flags & MSG_PEEK)) 1403 if (!(flags & MSG_PEEK)) {
1366 sk_eat_skb(sk, skb); 1404 sk_eat_skb(sk, skb, copied_early);
1405 copied_early = 0;
1406 }
1367 break; 1407 break;
1368 } while (len > 0); 1408 } while (len > 0);
1369 1409
@@ -1386,12 +1426,42 @@ skip_copy:
1386 tp->ucopy.len = 0; 1426 tp->ucopy.len = 0;
1387 } 1427 }
1388 1428
1429#ifdef CONFIG_NET_DMA
1430 if (tp->ucopy.dma_chan) {
1431 struct sk_buff *skb;
1432 dma_cookie_t done, used;
1433
1434 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1435
1436 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1437 tp->ucopy.dma_cookie, &done,
1438 &used) == DMA_IN_PROGRESS) {
1439 /* do partial cleanup of sk_async_wait_queue */
1440 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1441 (dma_async_is_complete(skb->dma_cookie, done,
1442 used) == DMA_SUCCESS)) {
1443 __skb_dequeue(&sk->sk_async_wait_queue);
1444 kfree_skb(skb);
1445 }
1446 }
1447
1448 /* Safe to free early-copied skbs now */
1449 __skb_queue_purge(&sk->sk_async_wait_queue);
1450 dma_chan_put(tp->ucopy.dma_chan);
1451 tp->ucopy.dma_chan = NULL;
1452 }
1453 if (tp->ucopy.pinned_list) {
1454 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1455 tp->ucopy.pinned_list = NULL;
1456 }
1457#endif
1458
1389 /* According to UNIX98, msg_name/msg_namelen are ignored 1459 /* According to UNIX98, msg_name/msg_namelen are ignored
1390 * on connected socket. I was just happy when found this 8) --ANK 1460 * on connected socket. I was just happy when found this 8) --ANK
1391 */ 1461 */
1392 1462
1393 /* Clean up data we have read: This will do ACK frames. */ 1463 /* Clean up data we have read: This will do ACK frames. */
1394 cleanup_rbuf(sk, copied); 1464 tcp_cleanup_rbuf(sk, copied);
1395 1465
1396 TCP_CHECK_TIMER(sk); 1466 TCP_CHECK_TIMER(sk);
1397 release_sock(sk); 1467 release_sock(sk);
@@ -1658,6 +1728,9 @@ int tcp_disconnect(struct sock *sk, int flags)
1658 __skb_queue_purge(&sk->sk_receive_queue); 1728 __skb_queue_purge(&sk->sk_receive_queue);
1659 sk_stream_writequeue_purge(sk); 1729 sk_stream_writequeue_purge(sk);
1660 __skb_queue_purge(&tp->out_of_order_queue); 1730 __skb_queue_purge(&tp->out_of_order_queue);
1731#ifdef CONFIG_NET_DMA
1732 __skb_queue_purge(&sk->sk_async_wait_queue);
1733#endif
1661 1734
1662 inet->dport = 0; 1735 inet->dport = 0;
1663 1736
@@ -1858,7 +1931,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
1858 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && 1931 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1859 inet_csk_ack_scheduled(sk)) { 1932 inet_csk_ack_scheduled(sk)) {
1860 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; 1933 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1861 cleanup_rbuf(sk, 1); 1934 tcp_cleanup_rbuf(sk, 1);
1862 if (!(val & 1)) 1935 if (!(val & 1))
1863 icsk->icsk_ack.pingpong = 1; 1936 icsk->icsk_ack.pingpong = 1;
1864 } 1937 }
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 035f2092d73a..b2d9021ad22b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
198 return max(tp->snd_cwnd, ca->last_max_cwnd); 198 return max(tp->snd_cwnd, ca->last_max_cwnd);
199} 199}
200 200
201static u32 bictcp_min_cwnd(struct sock *sk)
202{
203 const struct tcp_sock *tp = tcp_sk(sk);
204 return tp->snd_ssthresh;
205}
206
207static void bictcp_state(struct sock *sk, u8 new_state) 201static void bictcp_state(struct sock *sk, u8 new_state)
208{ 202{
209 if (new_state == TCP_CA_Loss) 203 if (new_state == TCP_CA_Loss)
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = {
231 .cong_avoid = bictcp_cong_avoid, 225 .cong_avoid = bictcp_cong_avoid,
232 .set_state = bictcp_state, 226 .set_state = bictcp_state,
233 .undo_cwnd = bictcp_undo_cwnd, 227 .undo_cwnd = bictcp_undo_cwnd,
234 .min_cwnd = bictcp_min_cwnd,
235 .pkts_acked = bictcp_acked, 228 .pkts_acked = bictcp_acked,
236 .owner = THIS_MODULE, 229 .owner = THIS_MODULE,
237 .name = "bic", 230 .name = "bic",
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c
new file mode 100644
index 000000000000..bc54f7e9aea9
--- /dev/null
+++ b/net/ipv4/tcp_compound.c
@@ -0,0 +1,448 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 *
33 *
34 * TCP Compound based on TCP Vegas
35 *
36 * further details can be found here:
37 * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
38 */
39
40#include <linux/config.h>
41#include <linux/mm.h>
42#include <linux/module.h>
43#include <linux/skbuff.h>
44#include <linux/inet_diag.h>
45
46#include <net/tcp.h>
47
48/* Default values of the Vegas variables, in fixed-point representation
49 * with V_PARAM_SHIFT bits to the right of the binary point.
50 */
51#define V_PARAM_SHIFT 1
52
53#define TCP_COMPOUND_ALPHA 3U
54#define TCP_COMPOUND_BETA 1U
55#define TCP_COMPOUND_GAMMA 30
56#define TCP_COMPOUND_ZETA 1
57
58/* TCP compound variables */
59struct compound {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now; /* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67
68 u32 cwnd;
69 u32 dwnd;
70};
71
72/* There are several situations when we must "re-start" Vegas:
73 *
74 * o when a connection is established
75 * o after an RTO
76 * o after fast recovery
77 * o when we send a packet and there is no outstanding
78 * unacknowledged data (restarting an idle connection)
79 *
80 * In these circumstances we cannot do a Vegas calculation at the
81 * end of the first RTT, because any calculation we do is using
82 * stale info -- both the saved cwnd and congestion feedback are
83 * stale.
84 *
85 * Instead we must wait until the completion of an RTT during
86 * which we actually receive ACKs.
87 */
88static inline void vegas_enable(struct sock *sk)
89{
90 const struct tcp_sock *tp = tcp_sk(sk);
91 struct compound *vegas = inet_csk_ca(sk);
92
93 /* Begin taking Vegas samples next time we send something. */
94 vegas->doing_vegas_now = 1;
95
96 /* Set the beginning of the next send window. */
97 vegas->beg_snd_nxt = tp->snd_nxt;
98
99 vegas->cntRTT = 0;
100 vegas->minRTT = 0x7fffffff;
101}
102
103/* Stop taking Vegas samples for now. */
104static inline void vegas_disable(struct sock *sk)
105{
106 struct compound *vegas = inet_csk_ca(sk);
107
108 vegas->doing_vegas_now = 0;
109}
110
111static void tcp_compound_init(struct sock *sk)
112{
113 struct compound *vegas = inet_csk_ca(sk);
114 const struct tcp_sock *tp = tcp_sk(sk);
115
116 vegas->baseRTT = 0x7fffffff;
117 vegas_enable(sk);
118
119 vegas->dwnd = 0;
120 vegas->cwnd = tp->snd_cwnd;
121}
122
123/* Do RTT sampling needed for Vegas.
124 * Basically we:
125 * o min-filter RTT samples from within an RTT to get the current
126 * propagation delay + queuing delay (we are min-filtering to try to
127 * avoid the effects of delayed ACKs)
128 * o min-filter RTT samples from a much longer window (forever for now)
129 * to find the propagation delay (baseRTT)
130 */
131static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt)
132{
133 struct compound *vegas = inet_csk_ca(sk);
134 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
135
136 /* Filter to find propagation delay: */
137 if (vrtt < vegas->baseRTT)
138 vegas->baseRTT = vrtt;
139
140 /* Find the min RTT during the last RTT to find
141 * the current prop. delay + queuing delay:
142 */
143
144 vegas->minRTT = min(vegas->minRTT, vrtt);
145 vegas->cntRTT++;
146}
147
148static void tcp_compound_state(struct sock *sk, u8 ca_state)
149{
150
151 if (ca_state == TCP_CA_Open)
152 vegas_enable(sk);
153 else
154 vegas_disable(sk);
155}
156
157
158/* 64bit divisor, dividend and result. dynamic precision */
159static inline u64 div64_64(u64 dividend, u64 divisor)
160{
161 u32 d = divisor;
162
163 if (divisor > 0xffffffffULL) {
164 unsigned int shift = fls(divisor >> 32);
165
166 d = divisor >> shift;
167 dividend >>= shift;
168 }
169
170 /* avoid 64 bit division if possible */
171 if (dividend >> 32)
172 do_div(dividend, d);
173 else
174 dividend = (u32) dividend / d;
175
176 return dividend;
177}
178
179/* calculate the quartic root of "a" using Newton-Raphson */
180static u32 qroot(u64 a)
181{
182 u32 x, x1;
183
184 /* Initial estimate is based on:
185 * qrt(x) = exp(log(x) / 4)
186 */
187 x = 1u << (fls64(a) >> 2);
188
189 /*
190 * Iteration based on:
191 * 3
192 * x = ( 3 * x + a / x ) / 4
193 * k+1 k k
194 */
195 do {
196 u64 x3 = x;
197
198 x1 = x;
199 x3 *= x;
200 x3 *= x;
201
202 x = (3 * x + (u32) div64_64(a, x3)) / 4;
203 } while (abs(x1 - x) > 1);
204
205 return x;
206}
207
208
209/*
210 * If the connection is idle and we are restarting,
211 * then we don't want to do any Vegas calculations
212 * until we get fresh RTT samples. So when we
213 * restart, we reset our Vegas state to a clean
214 * slate. After we get acks for this flight of
215 * packets, _then_ we can make Vegas calculations
216 * again.
217 */
218static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event)
219{
220 if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
221 tcp_compound_init(sk);
222}
223
224static void tcp_compound_cong_avoid(struct sock *sk, u32 ack,
225 u32 seq_rtt, u32 in_flight, int flag)
226{
227 struct tcp_sock *tp = tcp_sk(sk);
228 struct compound *vegas = inet_csk_ca(sk);
229 u8 inc = 0;
230
231 if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) {
232 if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) {
233 vegas->cwnd = tp->snd_cwnd;
234 vegas->dwnd = 0;
235 } else
236 vegas->cwnd = tp->snd_cwnd - vegas->dwnd;
237
238 }
239
240 if (!tcp_is_cwnd_limited(sk, in_flight))
241 return;
242
243 if (vegas->cwnd <= tp->snd_ssthresh)
244 inc = 1;
245 else if (tp->snd_cwnd_cnt < tp->snd_cwnd)
246 tp->snd_cwnd_cnt++;
247
248 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
249 inc = 1;
250 tp->snd_cwnd_cnt = 0;
251 }
252
253 if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp)
254 vegas->cwnd++;
255
256 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
257 *
258 * These are so named because they represent the approximate values
259 * of snd_una and snd_nxt at the beginning of the current RTT. More
260 * precisely, they represent the amount of data sent during the RTT.
261 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
262 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
263 * bytes of data have been ACKed during the course of the RTT, giving
264 * an "actual" rate of:
265 *
266 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
267 *
268 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
269 * because delayed ACKs can cover more than one segment, so they
270 * don't line up nicely with the boundaries of RTTs.
271 *
272 * Another unfortunate fact of life is that delayed ACKs delay the
273 * advance of the left edge of our send window, so that the number
274 * of bytes we send in an RTT is often less than our cwnd will allow.
275 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
276 */
277
278 if (after(ack, vegas->beg_snd_nxt)) {
279 /* Do the Vegas once-per-RTT cwnd adjustment. */
280 u32 old_wnd, old_snd_cwnd;
281
282 /* Here old_wnd is essentially the window of data that was
283 * sent during the previous RTT, and has all
284 * been acknowledged in the course of the RTT that ended
285 * with the ACK we just received. Likewise, old_snd_cwnd
286 * is the cwnd during the previous RTT.
287 */
288 if (!tp->mss_cache)
289 return;
290
291 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
292 tp->mss_cache;
293 old_snd_cwnd = vegas->beg_snd_cwnd;
294
295 /* Save the extent of the current window so we can use this
296 * at the end of the next RTT.
297 */
298 vegas->beg_snd_una = vegas->beg_snd_nxt;
299 vegas->beg_snd_nxt = tp->snd_nxt;
300 vegas->beg_snd_cwnd = tp->snd_cwnd;
301
302 /* We do the Vegas calculations only if we got enough RTT
303 * samples that we can be reasonably sure that we got
304 * at least one RTT sample that wasn't from a delayed ACK.
305 * If we only had 2 samples total,
306 * then that means we're getting only 1 ACK per RTT, which
307 * means they're almost certainly delayed ACKs.
308 * If we have 3 samples, we should be OK.
309 */
310
311 if (vegas->cntRTT > 2) {
312 u32 rtt, target_cwnd, diff;
313 u32 brtt, dwnd;
314
315 /* We have enough RTT samples, so, using the Vegas
316 * algorithm, we determine if we should increase or
317 * decrease cwnd, and by how much.
318 */
319
320 /* Pluck out the RTT we are using for the Vegas
321 * calculations. This is the min RTT seen during the
322 * last RTT. Taking the min filters out the effects
323 * of delayed ACKs, at the cost of noticing congestion
324 * a bit later.
325 */
326 rtt = vegas->minRTT;
327
328 /* Calculate the cwnd we should have, if we weren't
329 * going too fast.
330 *
331 * This is:
332 * (actual rate in segments) * baseRTT
333 * We keep it as a fixed point number with
334 * V_PARAM_SHIFT bits to the right of the binary point.
335 */
336 if (!rtt)
337 return;
338
339 brtt = vegas->baseRTT;
340 target_cwnd = ((old_wnd * brtt)
341 << V_PARAM_SHIFT) / rtt;
342
343 /* Calculate the difference between the window we had,
344 * and the window we would like to have. This quantity
345 * is the "Diff" from the Arizona Vegas papers.
346 *
347 * Again, this is a fixed point number with
348 * V_PARAM_SHIFT bits to the right of the binary
349 * point.
350 */
351
352 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
353
354 dwnd = vegas->dwnd;
355
356 if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) {
357 u64 v;
358 u32 x;
359
360 /*
361 * The TCP Compound paper describes the choice
362 * of "k" determines the agressiveness,
363 * ie. slope of the response function.
364 *
365 * For same value as HSTCP would be 0.8
366 * but for computaional reasons, both the
367 * original authors and this implementation
368 * use 0.75.
369 */
370 v = old_wnd;
371 x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA;
372 if (x > 1)
373 dwnd = x - 1;
374 else
375 dwnd = 0;
376
377 dwnd += vegas->dwnd;
378
379 } else if ((dwnd << V_PARAM_SHIFT) <
380 (diff * TCP_COMPOUND_BETA))
381 dwnd = 0;
382 else
383 dwnd =
384 ((dwnd << V_PARAM_SHIFT) -
385 (diff *
386 TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT;
387
388 vegas->dwnd = dwnd;
389
390 }
391
392 /* Wipe the slate clean for the next RTT. */
393 vegas->cntRTT = 0;
394 vegas->minRTT = 0x7fffffff;
395 }
396
397 tp->snd_cwnd = vegas->cwnd + vegas->dwnd;
398}
399
400/* Extract info for Tcp socket info provided via netlink. */
401static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
402{
403 const struct compound *ca = inet_csk_ca(sk);
404 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
405 struct tcpvegas_info *info;
406
407 info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
408 sizeof(*info)));
409
410 info->tcpv_enabled = ca->doing_vegas_now;
411 info->tcpv_rttcnt = ca->cntRTT;
412 info->tcpv_rtt = ca->baseRTT;
413 info->tcpv_minrtt = ca->minRTT;
414 rtattr_failure:;
415 }
416}
417
418static struct tcp_congestion_ops tcp_compound = {
419 .init = tcp_compound_init,
420 .ssthresh = tcp_reno_ssthresh,
421 .cong_avoid = tcp_compound_cong_avoid,
422 .rtt_sample = tcp_compound_rtt_calc,
423 .set_state = tcp_compound_state,
424 .cwnd_event = tcp_compound_cwnd_event,
425 .get_info = tcp_compound_get_info,
426
427 .owner = THIS_MODULE,
428 .name = "compound",
429};
430
431static int __init tcp_compound_register(void)
432{
433 BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE);
434 tcp_register_congestion_control(&tcp_compound);
435 return 0;
436}
437
438static void __exit tcp_compound_unregister(void)
439{
440 tcp_unregister_congestion_control(&tcp_compound);
441}
442
443module_init(tcp_compound_register);
444module_exit(tcp_compound_unregister);
445
446MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger");
447MODULE_LICENSE("GPL");
448MODULE_DESCRIPTION("TCP Compound");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 91c2f41c7f58..857eefc52aab 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
38 int ret = 0; 38 int ret = 0;
39 39
40 /* all algorithms must implement ssthresh and cong_avoid ops */ 40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { 41 if (!ca->ssthresh || !ca->cong_avoid) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n", 42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name); 43 ca->name);
44 return -EINVAL; 44 return -EINVAL;
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk)
251} 251}
252EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 252EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
253 253
254/* Lower bound on congestion window. */ 254/* Lower bound on congestion window with halving. */
255u32 tcp_reno_min_cwnd(struct sock *sk) 255u32 tcp_reno_min_cwnd(const struct sock *sk)
256{ 256{
257 const struct tcp_sock *tp = tcp_sk(sk); 257 const struct tcp_sock *tp = tcp_sk(sk);
258 return tp->snd_ssthresh/2; 258 return tp->snd_ssthresh/2;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 31a4986dfbf7..78b7a6b9e4de 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
325 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); 325 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
326} 326}
327 327
328static u32 bictcp_min_cwnd(struct sock *sk)
329{
330 return tcp_sk(sk)->snd_ssthresh;
331}
332
333static void bictcp_state(struct sock *sk, u8 new_state) 328static void bictcp_state(struct sock *sk, u8 new_state)
334{ 329{
335 if (new_state == TCP_CA_Loss) 330 if (new_state == TCP_CA_Loss)
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = {
357 .cong_avoid = bictcp_cong_avoid, 352 .cong_avoid = bictcp_cong_avoid,
358 .set_state = bictcp_state, 353 .set_state = bictcp_state,
359 .undo_cwnd = bictcp_undo_cwnd, 354 .undo_cwnd = bictcp_undo_cwnd,
360 .min_cwnd = bictcp_min_cwnd,
361 .pkts_acked = bictcp_acked, 355 .pkts_acked = bictcp_acked,
362 .owner = THIS_MODULE, 356 .owner = THIS_MODULE,
363 .name = "cubic", 357 .name = "cubic",
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index ba7c63ca5bb1..1120245b2373 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,6 +98,10 @@ struct hstcp {
98 u32 ai; 98 u32 ai;
99}; 99};
100 100
101static int max_ssthresh = 100;
102module_param(max_ssthresh, int, 0644);
103MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)");
104
101static void hstcp_init(struct sock *sk) 105static void hstcp_init(struct sock *sk)
102{ 106{
103 struct tcp_sock *tp = tcp_sk(sk); 107 struct tcp_sock *tp = tcp_sk(sk);
@@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
119 if (!tcp_is_cwnd_limited(sk, in_flight)) 123 if (!tcp_is_cwnd_limited(sk, in_flight))
120 return; 124 return;
121 125
122 if (tp->snd_cwnd <= tp->snd_ssthresh) 126 if (tp->snd_cwnd <= tp->snd_ssthresh) {
123 tcp_slow_start(tp); 127 /* RFC3742: limited slow start
124 else { 128 * the window is increased by 1/K MSS for each arriving ACK,
129 * for K = int(cwnd/(0.5 max_ssthresh))
130 */
131 if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) {
132 u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U);
133 if (++tp->snd_cwnd_cnt >= k) {
134 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
135 tp->snd_cwnd++;
136 tp->snd_cwnd_cnt = 0;
137 }
138 } else {
139 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
140 tp->snd_cwnd++;
141 }
142 } else {
125 /* Update AIMD parameters */ 143 /* Update AIMD parameters */
126 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { 144 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
127 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && 145 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 1b2ff53f98ed..3d92c1859267 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
246 } 246 }
247} 247}
248 248
249/* Lower bound on congestion window. */
250static u32 htcp_min_cwnd(struct sock *sk)
251{
252 const struct tcp_sock *tp = tcp_sk(sk);
253 return tp->snd_ssthresh;
254}
255
256
257static void htcp_init(struct sock *sk) 249static void htcp_init(struct sock *sk)
258{ 250{
259 struct htcp *ca = inet_csk_ca(sk); 251 struct htcp *ca = inet_csk_ca(sk);
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state)
285static struct tcp_congestion_ops htcp = { 277static struct tcp_congestion_ops htcp = {
286 .init = htcp_init, 278 .init = htcp_init,
287 .ssthresh = htcp_recalc_ssthresh, 279 .ssthresh = htcp_recalc_ssthresh,
288 .min_cwnd = htcp_min_cwnd,
289 .cong_avoid = htcp_cong_avoid, 280 .cong_avoid = htcp_cong_avoid,
290 .set_state = htcp_state, 281 .set_state = htcp_state,
291 .undo_cwnd = htcp_cwnd_undo, 282 .undo_cwnd = htcp_cwnd_undo,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b5521a9d3dc1..e08245bdda3a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -71,6 +71,7 @@
71#include <net/inet_common.h> 71#include <net/inet_common.h>
72#include <linux/ipsec.h> 72#include <linux/ipsec.h>
73#include <asm/unaligned.h> 73#include <asm/unaligned.h>
74#include <net/netdma.h>
74 75
75int sysctl_tcp_timestamps = 1; 76int sysctl_tcp_timestamps = 1;
76int sysctl_tcp_window_scaling = 1; 77int sysctl_tcp_window_scaling = 1;
@@ -1688,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1688 tp->snd_cwnd_stamp = tcp_time_stamp; 1689 tp->snd_cwnd_stamp = tcp_time_stamp;
1689} 1690}
1690 1691
1692/* Lower bound on congestion window is slow start threshold
1693 * unless congestion avoidance choice decides to overide it.
1694 */
1695static inline u32 tcp_cwnd_min(const struct sock *sk)
1696{
1697 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1698
1699 return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
1700}
1701
1691/* Decrease cwnd each second ack. */ 1702/* Decrease cwnd each second ack. */
1692static void tcp_cwnd_down(struct sock *sk) 1703static void tcp_cwnd_down(struct sock *sk)
1693{ 1704{
1694 const struct inet_connection_sock *icsk = inet_csk(sk);
1695 struct tcp_sock *tp = tcp_sk(sk); 1705 struct tcp_sock *tp = tcp_sk(sk);
1696 int decr = tp->snd_cwnd_cnt + 1; 1706 int decr = tp->snd_cwnd_cnt + 1;
1697 1707
1698 tp->snd_cwnd_cnt = decr&1; 1708 tp->snd_cwnd_cnt = decr&1;
1699 decr >>= 1; 1709 decr >>= 1;
1700 1710
1701 if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) 1711 if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
1702 tp->snd_cwnd -= decr; 1712 tp->snd_cwnd -= decr;
1703 1713
1704 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1714 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -3785,6 +3795,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk
3785 __tcp_checksum_complete_user(sk, skb); 3795 __tcp_checksum_complete_user(sk, skb);
3786} 3796}
3787 3797
3798#ifdef CONFIG_NET_DMA
3799static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
3800{
3801 struct tcp_sock *tp = tcp_sk(sk);
3802 int chunk = skb->len - hlen;
3803 int dma_cookie;
3804 int copied_early = 0;
3805
3806 if (tp->ucopy.wakeup)
3807 return 0;
3808
3809 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
3810 tp->ucopy.dma_chan = get_softnet_dma();
3811
3812 if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
3813
3814 dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
3815 skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
3816
3817 if (dma_cookie < 0)
3818 goto out;
3819
3820 tp->ucopy.dma_cookie = dma_cookie;
3821 copied_early = 1;
3822
3823 tp->ucopy.len -= chunk;
3824 tp->copied_seq += chunk;
3825 tcp_rcv_space_adjust(sk);
3826
3827 if ((tp->ucopy.len == 0) ||
3828 (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
3829 (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
3830 tp->ucopy.wakeup = 1;
3831 sk->sk_data_ready(sk, 0);
3832 }
3833 } else if (chunk > 0) {
3834 tp->ucopy.wakeup = 1;
3835 sk->sk_data_ready(sk, 0);
3836 }
3837out:
3838 return copied_early;
3839}
3840#endif /* CONFIG_NET_DMA */
3841
3788/* 3842/*
3789 * TCP receive function for the ESTABLISHED state. 3843 * TCP receive function for the ESTABLISHED state.
3790 * 3844 *
@@ -3886,8 +3940,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3886 tp->rcv_nxt == tp->rcv_wup) 3940 tp->rcv_nxt == tp->rcv_wup)
3887 tcp_store_ts_recent(tp); 3941 tcp_store_ts_recent(tp);
3888 3942
3889 tcp_rcv_rtt_measure_ts(sk, skb);
3890
3891 /* We know that such packets are checksummed 3943 /* We know that such packets are checksummed
3892 * on entry. 3944 * on entry.
3893 */ 3945 */
@@ -3901,14 +3953,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3901 } 3953 }
3902 } else { 3954 } else {
3903 int eaten = 0; 3955 int eaten = 0;
3956 int copied_early = 0;
3904 3957
3905 if (tp->ucopy.task == current && 3958 if (tp->copied_seq == tp->rcv_nxt &&
3906 tp->copied_seq == tp->rcv_nxt && 3959 len - tcp_header_len <= tp->ucopy.len) {
3907 len - tcp_header_len <= tp->ucopy.len && 3960#ifdef CONFIG_NET_DMA
3908 sock_owned_by_user(sk)) { 3961 if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
3909 __set_current_state(TASK_RUNNING); 3962 copied_early = 1;
3963 eaten = 1;
3964 }
3965#endif
3966 if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
3967 __set_current_state(TASK_RUNNING);
3910 3968
3911 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { 3969 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
3970 eaten = 1;
3971 }
3972 if (eaten) {
3912 /* Predicted packet is in window by definition. 3973 /* Predicted packet is in window by definition.
3913 * seq == rcv_nxt and rcv_wup <= rcv_nxt. 3974 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
3914 * Hence, check seq<=rcv_wup reduces to: 3975 * Hence, check seq<=rcv_wup reduces to:
@@ -3924,8 +3985,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3924 __skb_pull(skb, tcp_header_len); 3985 __skb_pull(skb, tcp_header_len);
3925 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 3986 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3926 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); 3987 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
3927 eaten = 1;
3928 } 3988 }
3989 if (copied_early)
3990 tcp_cleanup_rbuf(sk, skb->len);
3929 } 3991 }
3930 if (!eaten) { 3992 if (!eaten) {
3931 if (tcp_checksum_complete_user(sk, skb)) 3993 if (tcp_checksum_complete_user(sk, skb))
@@ -3966,6 +4028,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3966 4028
3967 __tcp_ack_snd_check(sk, 0); 4029 __tcp_ack_snd_check(sk, 0);
3968no_ack: 4030no_ack:
4031#ifdef CONFIG_NET_DMA
4032 if (copied_early)
4033 __skb_queue_tail(&sk->sk_async_wait_queue, skb);
4034 else
4035#endif
3969 if (eaten) 4036 if (eaten)
3970 __kfree_skb(skb); 4037 __kfree_skb(skb);
3971 else 4038 else
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 672950e54c49..25ecc6e2478b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -71,6 +71,7 @@
71#include <net/inet_common.h> 71#include <net/inet_common.h>
72#include <net/timewait_sock.h> 72#include <net/timewait_sock.h>
73#include <net/xfrm.h> 73#include <net/xfrm.h>
74#include <net/netdma.h>
74 75
75#include <linux/inet.h> 76#include <linux/inet.h>
76#include <linux/ipv6.h> 77#include <linux/ipv6.h>
@@ -1091,8 +1092,18 @@ process:
1091 bh_lock_sock(sk); 1092 bh_lock_sock(sk);
1092 ret = 0; 1093 ret = 0;
1093 if (!sock_owned_by_user(sk)) { 1094 if (!sock_owned_by_user(sk)) {
1094 if (!tcp_prequeue(sk, skb)) 1095#ifdef CONFIG_NET_DMA
1096 struct tcp_sock *tp = tcp_sk(sk);
1097 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1098 tp->ucopy.dma_chan = get_softnet_dma();
1099 if (tp->ucopy.dma_chan)
1095 ret = tcp_v4_do_rcv(sk, skb); 1100 ret = tcp_v4_do_rcv(sk, skb);
1101 else
1102#endif
1103 {
1104 if (!tcp_prequeue(sk, skb))
1105 ret = tcp_v4_do_rcv(sk, skb);
1106 }
1096 } else 1107 } else
1097 sk_add_backlog(sk, skb); 1108 sk_add_backlog(sk, skb);
1098 bh_unlock_sock(sk); 1109 bh_unlock_sock(sk);
@@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk)
1296 /* Cleans up our, hopefully empty, out_of_order_queue. */ 1307 /* Cleans up our, hopefully empty, out_of_order_queue. */
1297 __skb_queue_purge(&tp->out_of_order_queue); 1308 __skb_queue_purge(&tp->out_of_order_queue);
1298 1309
1310#ifdef CONFIG_NET_DMA
1311 /* Cleans up our sk_async_wait_queue */
1312 __skb_queue_purge(&sk->sk_async_wait_queue);
1313#endif
1314
1299 /* Clean prequeue, it must be empty really */ 1315 /* Clean prequeue, it must be empty really */
1300 __skb_queue_purge(&tp->ucopy.prequeue); 1316 __skb_queue_purge(&tp->ucopy.prequeue);
1301 1317
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 000000000000..1f977b6ee9a1
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,338 @@
1/*
2 * TCP Low Priority (TCP-LP)
3 *
4 * TCP Low Priority is a distributed algorithm whose goal is to utilize only
5 * the excess network bandwidth as compared to the ``fair share`` of
6 * bandwidth as targeted by TCP. Available from:
7 * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
8 *
9 * Original Author:
10 * Aleksandar Kuzmanovic <akuzma@northwestern.edu>
11 *
12 * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation.
13 * As of 2.6.13, Linux supports pluggable congestion control algorithms.
14 * Due to the limitation of the API, we take the following changes from
15 * the original TCP-LP implementation:
16 * o We use newReno in most core CA handling. Only add some checking
17 * within cong_avoid.
18 * o Error correcting in remote HZ, therefore remote HZ will be keeped
19 * on checking and updating.
20 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne
21 * OWD have a similar meaning as RTT. Also correct the buggy formular.
22 * o Handle reaction for Early Congestion Indication (ECI) within
23 * pkts_acked, as mentioned within pseudo code.
24 * o OWD is handled in relative format, where local time stamp will in
25 * tcp_time_stamp format.
26 *
27 * Port from 2.4.19 to 2.6.16 as module by:
28 * Wong Hoi Sing Edison <hswong3i@gmail.com>
29 * Hung Hing Lun <hlhung3i@gmail.com>
30 *
31 * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $
32 */
33
34#include <linux/config.h>
35#include <linux/module.h>
36#include <net/tcp.h>
37
38/* resolution of owd */
39#define LP_RESOL 1000
40
41/**
42 * enum tcp_lp_state
43 * @LP_VALID_RHZ: is remote HZ valid?
44 * @LP_VALID_OWD: is OWD valid?
45 * @LP_WITHIN_THR: are we within threshold?
46 * @LP_WITHIN_INF: are we within inference?
47 *
48 * TCP-LP's state flags.
49 * We create this set of state flag mainly for debugging.
50 */
51enum tcp_lp_state {
52 LP_VALID_RHZ = (1 << 0),
53 LP_VALID_OWD = (1 << 1),
54 LP_WITHIN_THR = (1 << 3),
55 LP_WITHIN_INF = (1 << 4),
56};
57
58/**
59 * struct lp
60 * @flag: TCP-LP state flag
61 * @sowd: smoothed OWD << 3
62 * @owd_min: min OWD
63 * @owd_max: max OWD
64 * @owd_max_rsv: resrved max owd
65 * @remote_hz: estimated remote HZ
66 * @remote_ref_time: remote reference time
67 * @local_ref_time: local reference time
68 * @last_drop: time for last active drop
69 * @inference: current inference
70 *
71 * TCP-LP's private struct.
72 * We get the idea from original TCP-LP implementation where only left those we
73 * found are really useful.
74 */
75struct lp {
76 u32 flag;
77 u32 sowd;
78 u32 owd_min;
79 u32 owd_max;
80 u32 owd_max_rsv;
81 u32 remote_hz;
82 u32 remote_ref_time;
83 u32 local_ref_time;
84 u32 last_drop;
85 u32 inference;
86};
87
88/**
89 * tcp_lp_init
90 *
91 * Init all required variables.
92 * Clone the handling from Vegas module implementation.
93 */
94static void tcp_lp_init(struct sock *sk)
95{
96 struct lp *lp = inet_csk_ca(sk);
97
98 lp->flag = 0;
99 lp->sowd = 0;
100 lp->owd_min = 0xffffffff;
101 lp->owd_max = 0;
102 lp->owd_max_rsv = 0;
103 lp->remote_hz = 0;
104 lp->remote_ref_time = 0;
105 lp->local_ref_time = 0;
106 lp->last_drop = 0;
107 lp->inference = 0;
108}
109
110/**
111 * tcp_lp_cong_avoid
112 *
113 * Implementation of cong_avoid.
114 * Will only call newReno CA when away from inference.
115 * From TCP-LP's paper, this will be handled in additive increasement.
116 */
117static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
118 int flag)
119{
120 struct lp *lp = inet_csk_ca(sk);
121
122 if (!(lp->flag & LP_WITHIN_INF))
123 tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
124}
125
126/**
127 * tcp_lp_remote_hz_estimator
128 *
129 * Estimate remote HZ.
130 * We keep on updating the estimated value, where original TCP-LP
131 * implementation only guest it for once and use forever.
132 */
133static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
134{
135 struct tcp_sock *tp = tcp_sk(sk);
136 struct lp *lp = inet_csk_ca(sk);
137 s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */
138 s64 m = 0;
139
140 /* not yet record reference time
141 * go away!! record it before come back!! */
142 if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
143 goto out;
144
145 /* we can't calc remote HZ with no different!! */
146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time
147 || tp->rx_opt.rcv_tsecr == lp->local_ref_time)
148 goto out;
149
150 m = HZ * (tp->rx_opt.rcv_tsval -
151 lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
152 lp->local_ref_time);
153 if (m < 0)
154 m = -m;
155
156 if (rhz != 0) {
157 m -= rhz >> 6; /* m is now error in remote HZ est */
158 rhz += m; /* 63/64 old + 1/64 new */
159 } else
160 rhz = m << 6;
161
162 /* record time for successful remote HZ calc */
163 lp->flag |= LP_VALID_RHZ;
164
165 out:
166 /* record reference time stamp */
167 lp->remote_ref_time = tp->rx_opt.rcv_tsval;
168 lp->local_ref_time = tp->rx_opt.rcv_tsecr;
169
170 return rhz >> 6;
171}
172
173/**
174 * tcp_lp_owd_calculator
175 *
176 * Calculate one way delay (in relative format).
177 * Original implement OWD as minus of remote time difference to local time
178 * difference directly. As this time difference just simply equal to RTT, when
179 * the network status is stable, remote RTT will equal to local RTT, and result
180 * OWD into zero.
181 * It seems to be a bug and so we fixed it.
182 */
183static u32 tcp_lp_owd_calculator(struct sock *sk)
184{
185 struct tcp_sock *tp = tcp_sk(sk);
186 struct lp *lp = inet_csk_ca(sk);
187 s64 owd = 0;
188
189 lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
190
191 if (lp->flag & LP_VALID_RHZ) {
192 owd =
193 tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
194 tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
195 if (owd < 0)
196 owd = -owd;
197 }
198
199 if (owd > 0)
200 lp->flag |= LP_VALID_OWD;
201 else
202 lp->flag &= ~LP_VALID_OWD;
203
204 return owd;
205}
206
207/**
208 * tcp_lp_rtt_sample
209 *
210 * Implementation or rtt_sample.
211 * Will take the following action,
212 * 1. calc OWD,
213 * 2. record the min/max OWD,
214 * 3. calc smoothed OWD (SOWD).
215 * Most ideas come from the original TCP-LP implementation.
216 */
217static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
218{
219 struct lp *lp = inet_csk_ca(sk);
220 s64 mowd = tcp_lp_owd_calculator(sk);
221
222 /* sorry that we don't have valid data */
223 if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
224 return;
225
226 /* record the next min owd */
227 if (mowd < lp->owd_min)
228 lp->owd_min = mowd;
229
230 /* always forget the max of the max
231 * we just set owd_max as one below it */
232 if (mowd > lp->owd_max) {
233 if (mowd > lp->owd_max_rsv) {
234 if (lp->owd_max_rsv == 0)
235 lp->owd_max = mowd;
236 else
237 lp->owd_max = lp->owd_max_rsv;
238 lp->owd_max_rsv = mowd;
239 } else
240 lp->owd_max = mowd;
241 }
242
243 /* calc for smoothed owd */
244 if (lp->sowd != 0) {
245 mowd -= lp->sowd >> 3; /* m is now error in owd est */
246 lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */
247 } else
248 lp->sowd = mowd << 3; /* take the measured time be owd */
249}
250
251/**
252 * tcp_lp_pkts_acked
253 *
254 * Implementation of pkts_acked.
255 * Deal with active drop under Early Congestion Indication.
256 * Only drop to half and 1 will be handle, because we hope to use back
257 * newReno in increase case.
258 * We work it out by following the idea from TCP-LP's paper directly
259 */
260static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
261{
262 struct tcp_sock *tp = tcp_sk(sk);
263 struct lp *lp = inet_csk_ca(sk);
264
265 /* calc inference */
266 if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
267 lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
268
269 /* test if within inference */
270 if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
271 lp->flag |= LP_WITHIN_INF;
272 else
273 lp->flag &= ~LP_WITHIN_INF;
274
275 /* test if within threshold */
276 if (lp->sowd >> 3 <
277 lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
278 lp->flag |= LP_WITHIN_THR;
279 else
280 lp->flag &= ~LP_WITHIN_THR;
281
282 pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
283 tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
284 lp->sowd >> 3);
285
286 if (lp->flag & LP_WITHIN_THR)
287 return;
288
289 /* FIXME: try to reset owd_min and owd_max here
290 * so decrease the chance the min/max is no longer suitable
291 * and will usually within threshold when whithin inference */
292 lp->owd_min = lp->sowd >> 3;
293 lp->owd_max = lp->sowd >> 2;
294 lp->owd_max_rsv = lp->sowd >> 2;
295
296 /* happened within inference
297 * drop snd_cwnd into 1 */
298 if (lp->flag & LP_WITHIN_INF)
299 tp->snd_cwnd = 1U;
300
301 /* happened after inference
302 * cut snd_cwnd into half */
303 else
304 tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
305
306 /* record this drop time */
307 lp->last_drop = tcp_time_stamp;
308}
309
310static struct tcp_congestion_ops tcp_lp = {
311 .init = tcp_lp_init,
312 .ssthresh = tcp_reno_ssthresh,
313 .cong_avoid = tcp_lp_cong_avoid,
314 .min_cwnd = tcp_reno_min_cwnd,
315 .rtt_sample = tcp_lp_rtt_sample,
316 .pkts_acked = tcp_lp_pkts_acked,
317
318 .owner = THIS_MODULE,
319 .name = "lp"
320};
321
322static int __init tcp_lp_register(void)
323{
324 BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
325 return tcp_register_congestion_control(&tcp_lp);
326}
327
328static void __exit tcp_lp_unregister(void)
329{
330 tcp_unregister_congestion_control(&tcp_lp);
331}
332
333module_init(tcp_lp_register);
334module_exit(tcp_lp_unregister);
335
336MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun");
337MODULE_LICENSE("GPL");
338MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f33c9dddaa12..07bb5a2b375e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3;
59int sysctl_tcp_mtu_probing = 0; 59int sysctl_tcp_mtu_probing = 0;
60int sysctl_tcp_base_mss = 512; 60int sysctl_tcp_base_mss = 512;
61 61
62/* By default, RFC2861 behavior. */
63int sysctl_tcp_slow_start_after_idle = 1;
64
62static void update_send_head(struct sock *sk, struct tcp_sock *tp, 65static void update_send_head(struct sock *sk, struct tcp_sock *tp,
63 struct sk_buff *skb) 66 struct sk_buff *skb)
64{ 67{
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
138 struct inet_connection_sock *icsk = inet_csk(sk); 141 struct inet_connection_sock *icsk = inet_csk(sk);
139 const u32 now = tcp_time_stamp; 142 const u32 now = tcp_time_stamp;
140 143
141 if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) 144 if (sysctl_tcp_slow_start_after_idle &&
145 (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
142 tcp_cwnd_restart(sk, __sk_dst_get(sk)); 146 tcp_cwnd_restart(sk, __sk_dst_get(sk));
143 147
144 tp->lsndtime = now; 148 tp->lsndtime = now;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
new file mode 100644
index 000000000000..d7d517a3a238
--- /dev/null
+++ b/net/ipv4/tcp_probe.c
@@ -0,0 +1,181 @@
1/*
2 * tcpprobe - Observe the TCP flow with kprobes.
3 *
4 * The idea for this came from Werner Almesberger's umlsim
5 * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22#include <linux/kernel.h>
23#include <linux/kprobes.h>
24#include <linux/socket.h>
25#include <linux/tcp.h>
26#include <linux/proc_fs.h>
27#include <linux/module.h>
28#include <linux/kfifo.h>
29#include <linux/vmalloc.h>
30
31#include <net/tcp.h>
32
33MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>");
34MODULE_DESCRIPTION("TCP cwnd snooper");
35MODULE_LICENSE("GPL");
36
37static int port = 0;
38MODULE_PARM_DESC(port, "Port to match (0=all)");
39module_param(port, int, 0);
40
41static int bufsize = 64*1024;
42MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
43module_param(bufsize, int, 0);
44
45static const char procname[] = "tcpprobe";
46
47struct {
48 struct kfifo *fifo;
49 spinlock_t lock;
50 wait_queue_head_t wait;
51 struct timeval tstart;
52} tcpw;
53
54static void printl(const char *fmt, ...)
55{
56 va_list args;
57 int len;
58 struct timeval now;
59 char tbuf[256];
60
61 va_start(args, fmt);
62 do_gettimeofday(&now);
63
64 now.tv_sec -= tcpw.tstart.tv_sec;
65 now.tv_usec -= tcpw.tstart.tv_usec;
66 if (now.tv_usec < 0) {
67 --now.tv_sec;
68 now.tv_usec += 1000000;
69 }
70
71 len = sprintf(tbuf, "%lu.%06lu ",
72 (unsigned long) now.tv_sec,
73 (unsigned long) now.tv_usec);
74 len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
75 va_end(args);
76
77 kfifo_put(tcpw.fifo, tbuf, len);
78 wake_up(&tcpw.wait);
79}
80
81static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk,
82 struct msghdr *msg, size_t size)
83{
84 const struct tcp_sock *tp = tcp_sk(sk);
85 const struct inet_sock *inet = inet_sk(sk);
86
87 if (port == 0 || ntohs(inet->dport) == port ||
88 ntohs(inet->sport) == port) {
89 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n",
90 NIPQUAD(inet->saddr), ntohs(inet->sport),
91 NIPQUAD(inet->daddr), ntohs(inet->dport),
92 size, tp->snd_nxt, tp->snd_una,
93 tp->snd_cwnd, tcp_current_ssthresh(sk),
94 tp->snd_wnd);
95 }
96
97 jprobe_return();
98 return 0;
99}
100
101static struct jprobe tcp_send_probe = {
102 .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, },
103 .entry = (kprobe_opcode_t *) &jtcp_sendmsg,
104};
105
106
107static int tcpprobe_open(struct inode * inode, struct file * file)
108{
109 kfifo_reset(tcpw.fifo);
110 do_gettimeofday(&tcpw.tstart);
111 return 0;
112}
113
114static ssize_t tcpprobe_read(struct file *file, char __user *buf,
115 size_t len, loff_t *ppos)
116{
117 int error = 0, cnt;
118 unsigned char *tbuf;
119
120 if (!buf || len < 0)
121 return -EINVAL;
122
123 if (len == 0)
124 return 0;
125
126 tbuf = vmalloc(len);
127 if (!tbuf)
128 return -ENOMEM;
129
130 error = wait_event_interruptible(tcpw.wait,
131 __kfifo_len(tcpw.fifo) != 0);
132 if (error)
133 return error;
134
135 cnt = kfifo_get(tcpw.fifo, tbuf, len);
136 error = copy_to_user(buf, tbuf, cnt);
137
138 vfree(tbuf);
139
140 return error ? error : cnt;
141}
142
143static struct file_operations tcpprobe_fops = {
144 .owner = THIS_MODULE,
145 .open = tcpprobe_open,
146 .read = tcpprobe_read,
147};
148
149static __init int tcpprobe_init(void)
150{
151 int ret = -ENOMEM;
152
153 init_waitqueue_head(&tcpw.wait);
154 spin_lock_init(&tcpw.lock);
155 tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock);
156
157 if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops))
158 goto err0;
159
160 ret = register_jprobe(&tcp_send_probe);
161 if (ret)
162 goto err1;
163
164 pr_info("TCP watch registered (port=%d)\n", port);
165 return 0;
166 err1:
167 proc_net_remove(procname);
168 err0:
169 kfifo_free(tcpw.fifo);
170 return ret;
171}
172module_init(tcpprobe_init);
173
174static __exit void tcpprobe_exit(void)
175{
176 kfifo_free(tcpw.fifo);
177 proc_net_remove(procname);
178 unregister_jprobe(&tcp_send_probe);
179
180}
181module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 000000000000..11b42a7135c1
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,231 @@
1/*
2 * TCP Veno congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * C. P. Fu, S. C. Liew.
6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
7 * IEEE Journal on Selected Areas in Communication,
8 * Feb. 2003.
9 * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
10 */
11
12#include <linux/config.h>
13#include <linux/mm.h>
14#include <linux/module.h>
15#include <linux/skbuff.h>
16#include <linux/inet_diag.h>
17
18#include <net/tcp.h>
19
20/* Default values of the Veno variables, in fixed-point representation
21 * with V_PARAM_SHIFT bits to the right of the binary point.
22 */
23#define V_PARAM_SHIFT 1
24static const int beta = 3 << V_PARAM_SHIFT;
25
26/* Veno variables */
27struct veno {
28 u8 doing_veno_now; /* if true, do veno for this rtt */
29 u16 cntrtt; /* # of rtts measured within last rtt */
30 u32 minrtt; /* min of rtts measured within last rtt (in usec) */
31 u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
32 u32 inc; /* decide whether to increase cwnd */
33 u32 diff; /* calculate the diff rate */
34};
35
36/* There are several situations when we must "re-start" Veno:
37 *
38 * o when a connection is established
39 * o after an RTO
40 * o after fast recovery
41 * o when we send a packet and there is no outstanding
42 * unacknowledged data (restarting an idle connection)
43 *
44 */
45static inline void veno_enable(struct sock *sk)
46{
47 struct veno *veno = inet_csk_ca(sk);
48
49 /* turn on Veno */
50 veno->doing_veno_now = 1;
51
52 veno->minrtt = 0x7fffffff;
53}
54
55static inline void veno_disable(struct sock *sk)
56{
57 struct veno *veno = inet_csk_ca(sk);
58
59 /* turn off Veno */
60 veno->doing_veno_now = 0;
61}
62
63static void tcp_veno_init(struct sock *sk)
64{
65 struct veno *veno = inet_csk_ca(sk);
66
67 veno->basertt = 0x7fffffff;
68 veno->inc = 1;
69 veno_enable(sk);
70}
71
72/* Do rtt sampling needed for Veno. */
73static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt)
74{
75 struct veno *veno = inet_csk_ca(sk);
76 u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */
77
78 /* Filter to find propagation delay: */
79 if (vrtt < veno->basertt)
80 veno->basertt = vrtt;
81
82 /* Find the min rtt during the last rtt to find
83 * the current prop. delay + queuing delay:
84 */
85 veno->minrtt = min(veno->minrtt, vrtt);
86 veno->cntrtt++;
87}
88
89static void tcp_veno_state(struct sock *sk, u8 ca_state)
90{
91 if (ca_state == TCP_CA_Open)
92 veno_enable(sk);
93 else
94 veno_disable(sk);
95}
96
97/*
98 * If the connection is idle and we are restarting,
99 * then we don't want to do any Veno calculations
100 * until we get fresh rtt samples. So when we
101 * restart, we reset our Veno state to a clean
102 * state. After we get acks for this flight of
103 * packets, _then_ we can make Veno calculations
104 * again.
105 */
106static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
107{
108 if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
109 tcp_veno_init(sk);
110}
111
112static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
113 u32 seq_rtt, u32 in_flight, int flag)
114{
115 struct tcp_sock *tp = tcp_sk(sk);
116 struct veno *veno = inet_csk_ca(sk);
117
118 if (!veno->doing_veno_now)
119 return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
120
121 /* limited by applications */
122 if (!tcp_is_cwnd_limited(sk, in_flight))
123 return;
124
125 /* We do the Veno calculations only if we got enough rtt samples */
126 if (veno->cntrtt <= 2) {
127 /* We don't have enough rtt samples to do the Veno
128 * calculation, so we'll behave like Reno.
129 */
130 tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
131 } else {
132 u32 rtt, target_cwnd;
133
134 /* We have enough rtt samples, so, using the Veno
135 * algorithm, we determine the state of the network.
136 */
137
138 rtt = veno->minrtt;
139
140 target_cwnd = ((tp->snd_cwnd * veno->basertt)
141 << V_PARAM_SHIFT) / rtt;
142
143 veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
144
145 if (tp->snd_cwnd <= tp->snd_ssthresh) {
146 /* Slow start. */
147 tcp_slow_start(tp);
148 } else {
149 /* Congestion avoidance. */
150 if (veno->diff < beta) {
151 /* In the "non-congestive state", increase cwnd
152 * every rtt.
153 */
154 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
155 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
156 tp->snd_cwnd++;
157 tp->snd_cwnd_cnt = 0;
158 } else
159 tp->snd_cwnd_cnt++;
160 } else {
161 /* In the "congestive state", increase cwnd
162 * every other rtt.
163 */
164 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
165 if (veno->inc
166 && tp->snd_cwnd <
167 tp->snd_cwnd_clamp) {
168 tp->snd_cwnd++;
169 veno->inc = 0;
170 } else
171 veno->inc = 1;
172 tp->snd_cwnd_cnt = 0;
173 } else
174 tp->snd_cwnd_cnt++;
175 }
176
177 }
178 if (tp->snd_cwnd < 2)
179 tp->snd_cwnd = 2;
180 else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
181 tp->snd_cwnd = tp->snd_cwnd_clamp;
182 }
183 /* Wipe the slate clean for the next rtt. */
184 /* veno->cntrtt = 0; */
185 veno->minrtt = 0x7fffffff;
186}
187
188/* Veno MD phase */
189static u32 tcp_veno_ssthresh(struct sock *sk)
190{
191 const struct tcp_sock *tp = tcp_sk(sk);
192 struct veno *veno = inet_csk_ca(sk);
193
194 if (veno->diff < beta)
195 /* in "non-congestive state", cut cwnd by 1/5 */
196 return max(tp->snd_cwnd * 4 / 5, 2U);
197 else
198 /* in "congestive state", cut cwnd by 1/2 */
199 return max(tp->snd_cwnd >> 1U, 2U);
200}
201
202static struct tcp_congestion_ops tcp_veno = {
203 .init = tcp_veno_init,
204 .ssthresh = tcp_veno_ssthresh,
205 .cong_avoid = tcp_veno_cong_avoid,
206 .rtt_sample = tcp_veno_rtt_calc,
207 .set_state = tcp_veno_state,
208 .cwnd_event = tcp_veno_cwnd_event,
209
210 .owner = THIS_MODULE,
211 .name = "veno",
212};
213
214static int __init tcp_veno_register(void)
215{
216 BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
217 tcp_register_congestion_control(&tcp_veno);
218 return 0;
219}
220
221static void __exit tcp_veno_unregister(void)
222{
223 tcp_unregister_congestion_control(&tcp_veno);
224}
225
226module_init(tcp_veno_register);
227module_exit(tcp_veno_unregister);
228
229MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
230MODULE_LICENSE("GPL");
231MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 0c340c3756c2..4247da1384bf 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -1,7 +1,24 @@
1/* 1/*
2 * TCP Westwood+ 2 * TCP Westwood+: end-to-end bandwidth estimation for TCP
3 * 3 *
4 * Angelo Dell'Aera: TCP Westwood+ support 4 * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
5 *
6 * Support at http://c3lab.poliba.it/index.php/Westwood
7 * Main references in literature:
8 *
9 * - Mascolo S, Casetti, M. Gerla et al.
10 * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
11 *
12 * - A. Grieco, s. Mascolo
13 * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
14 * Comm. Review, 2004
15 *
16 * - A. Dell'Aera, L. Grieco, S. Mascolo.
17 * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
18 * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
19 *
20 * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
21 * ssthresh after packet loss. The probing phase is as the original Reno.
5 */ 22 */
6 23
7#include <linux/config.h> 24#include <linux/config.h>
@@ -22,6 +39,8 @@ struct westwood {
22 u32 accounted; 39 u32 accounted;
23 u32 rtt; 40 u32 rtt;
24 u32 rtt_min; /* minimum observed RTT */ 41 u32 rtt_min; /* minimum observed RTT */
42 u8 first_ack; /* flag which infers that this is the first ack */
43 u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/
25}; 44};
26 45
27 46
@@ -49,9 +68,11 @@ static void tcp_westwood_init(struct sock *sk)
49 w->bw_est = 0; 68 w->bw_est = 0;
50 w->accounted = 0; 69 w->accounted = 0;
51 w->cumul_ack = 0; 70 w->cumul_ack = 0;
71 w->reset_rtt_min = 1;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; 72 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp; 73 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tcp_sk(sk)->snd_una; 74 w->snd_una = tcp_sk(sk)->snd_una;
75 w->first_ack = 1;
55} 76}
56 77
57/* 78/*
@@ -63,10 +84,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b)
63 return (((7 * a) + b) >> 3); 84 return (((7 * a) + b) >> 3);
64} 85}
65 86
66static inline void westwood_filter(struct westwood *w, u32 delta) 87static void westwood_filter(struct westwood *w, u32 delta)
67{ 88{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); 89 /* If the filter is empty fill it with the first sample of bandwidth */
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); 90 if (w->bw_ns_est == 0 && w->bw_est == 0) {
91 w->bw_ns_est = w->bk / delta;
92 w->bw_est = w->bw_ns_est;
93 } else {
94 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
95 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
96 }
70} 97}
71 98
72/* 99/*
@@ -91,6 +118,15 @@ static void westwood_update_window(struct sock *sk)
91 struct westwood *w = inet_csk_ca(sk); 118 struct westwood *w = inet_csk_ca(sk);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx; 119 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93 120
121 /* Initialize w->snd_una with the first acked sequence number in order
122 * to fix mismatch between tp->snd_una and w->snd_una for the first
123 * bandwidth sample
124 */
125 if (w->first_ack) {
126 w->snd_una = tcp_sk(sk)->snd_una;
127 w->first_ack = 0;
128 }
129
94 /* 130 /*
95 * See if a RTT-window has passed. 131 * See if a RTT-window has passed.
96 * Be careful since if RTT is less than 132 * Be careful since if RTT is less than
@@ -108,6 +144,16 @@ static void westwood_update_window(struct sock *sk)
108 } 144 }
109} 145}
110 146
147static inline void update_rtt_min(struct westwood *w)
148{
149 if (w->reset_rtt_min) {
150 w->rtt_min = w->rtt;
151 w->reset_rtt_min = 0;
152 } else
153 w->rtt_min = min(w->rtt, w->rtt_min);
154}
155
156
111/* 157/*
112 * @westwood_fast_bw 158 * @westwood_fast_bw
113 * It is called when we are in fast path. In particular it is called when 159 * It is called when we are in fast path. In particular it is called when
@@ -123,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk)
123 169
124 w->bk += tp->snd_una - w->snd_una; 170 w->bk += tp->snd_una - w->snd_una;
125 w->snd_una = tp->snd_una; 171 w->snd_una = tp->snd_una;
126 w->rtt_min = min(w->rtt, w->rtt_min); 172 update_rtt_min(w);
127} 173}
128 174
129/* 175/*
@@ -162,12 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
162 return w->cumul_ack; 208 return w->cumul_ack;
163} 209}
164 210
165static inline u32 westwood_bw_rttmin(const struct sock *sk)
166{
167 const struct tcp_sock *tp = tcp_sk(sk);
168 const struct westwood *w = inet_csk_ca(sk);
169 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
170}
171 211
172/* 212/*
173 * TCP Westwood 213 * TCP Westwood
@@ -175,9 +215,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk)
175 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 215 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
176 * so avoids ever returning 0. 216 * so avoids ever returning 0.
177 */ 217 */
178static u32 tcp_westwood_cwnd_min(struct sock *sk) 218static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
179{ 219{
180 return westwood_bw_rttmin(sk); 220 const struct tcp_sock *tp = tcp_sk(sk);
221 const struct westwood *w = inet_csk_ca(sk);
222 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
181} 223}
182 224
183static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) 225static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
@@ -191,17 +233,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
191 break; 233 break;
192 234
193 case CA_EVENT_COMPLETE_CWR: 235 case CA_EVENT_COMPLETE_CWR:
194 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); 236 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
195 break; 237 break;
196 238
197 case CA_EVENT_FRTO: 239 case CA_EVENT_FRTO:
198 tp->snd_ssthresh = westwood_bw_rttmin(sk); 240 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
241 /* Update RTT_min when next ack arrives */
242 w->reset_rtt_min = 1;
199 break; 243 break;
200 244
201 case CA_EVENT_SLOW_ACK: 245 case CA_EVENT_SLOW_ACK:
202 westwood_update_window(sk); 246 westwood_update_window(sk);
203 w->bk += westwood_acked_count(sk); 247 w->bk += westwood_acked_count(sk);
204 w->rtt_min = min(w->rtt, w->rtt_min); 248 update_rtt_min(w);
205 break; 249 break;
206 250
207 default: 251 default:
@@ -235,7 +279,7 @@ static struct tcp_congestion_ops tcp_westwood = {
235 .init = tcp_westwood_init, 279 .init = tcp_westwood_init,
236 .ssthresh = tcp_reno_ssthresh, 280 .ssthresh = tcp_reno_ssthresh,
237 .cong_avoid = tcp_reno_cong_avoid, 281 .cong_avoid = tcp_reno_cong_avoid,
238 .min_cwnd = tcp_westwood_cwnd_min, 282 .min_cwnd = tcp_westwood_bw_rttmin,
239 .cwnd_event = tcp_westwood_event, 283 .cwnd_event = tcp_westwood_event,
240 .get_info = tcp_westwood_info, 284 .get_info = tcp_westwood_info,
241 .pkts_acked = tcp_westwood_pkts_acked, 285 .pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 3e174c83bfe7..817ed84511a6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -13,7 +13,6 @@
13#include <linux/string.h> 13#include <linux/string.h>
14#include <linux/netfilter.h> 14#include <linux/netfilter.h>
15#include <linux/netfilter_ipv4.h> 15#include <linux/netfilter_ipv4.h>
16#include <net/inet_ecn.h>
17#include <net/ip.h> 16#include <net/ip.h>
18#include <net/xfrm.h> 17#include <net/xfrm.h>
19 18
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb)
24 23
25EXPORT_SYMBOL(xfrm4_rcv); 24EXPORT_SYMBOL(xfrm4_rcv);
26 25
27static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
28{
29 struct iphdr *outer_iph = skb->nh.iph;
30 struct iphdr *inner_iph = skb->h.ipiph;
31
32 if (INET_ECN_is_ce(outer_iph->tos))
33 IP_ECN_set_ce(inner_iph);
34}
35
36static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) 26static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
37{ 27{
38 switch (nexthdr) { 28 switch (nexthdr) {
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
113 103
114 xfrm_vec[xfrm_nr++] = x; 104 xfrm_vec[xfrm_nr++] = x;
115 105
116 iph = skb->nh.iph; 106 if (x->mode->input(x, skb))
107 goto drop;
117 108
118 if (x->props.mode) { 109 if (x->props.mode) {
119 if (iph->protocol != IPPROTO_IPIP)
120 goto drop;
121 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
122 goto drop;
123 if (skb_cloned(skb) &&
124 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
125 goto drop;
126 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
127 ipv4_copy_dscp(iph, skb->h.ipiph);
128 if (!(x->props.flags & XFRM_STATE_NOECN))
129 ipip_ecn_decapsulate(skb);
130 skb->mac.raw = memmove(skb->data - skb->mac_len,
131 skb->mac.raw, skb->mac_len);
132 skb->nh.raw = skb->data;
133 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
134 decaps = 1; 110 decaps = 1;
135 break; 111 break;
136 } 112 }
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 000000000000..a9e6b3dd19c9
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,83 @@
1/*
2 * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
3 *
4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
5 */
6
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/stringify.h>
12#include <net/dst.h>
13#include <net/ip.h>
14#include <net/xfrm.h>
15
16/* Add encapsulation header.
17 *
18 * The IP header will be moved forward to make space for the encapsulation
19 * header.
20 *
21 * On exit, skb->h will be set to the start of the payload to be processed
22 * by x->type->output and skb->nh will be set to the top IP header.
23 */
24static int xfrm4_transport_output(struct sk_buff *skb)
25{
26 struct xfrm_state *x;
27 struct iphdr *iph;
28 int ihl;
29
30 iph = skb->nh.iph;
31 skb->h.ipiph = iph;
32
33 ihl = iph->ihl * 4;
34 skb->h.raw += ihl;
35
36 x = skb->dst->xfrm;
37 skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
38 return 0;
39}
40
41/* Remove encapsulation header.
42 *
43 * The IP header will be moved over the top of the encapsulation header.
44 *
45 * On entry, skb->h shall point to where the IP header should be and skb->nh
46 * shall be set to where the IP header currently is. skb->data shall point
47 * to the start of the payload.
48 */
49static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
50{
51 int ihl = skb->data - skb->h.raw;
52
53 if (skb->h.raw != skb->nh.raw)
54 skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl);
55 skb->nh.iph->tot_len = htons(skb->len + ihl);
56 skb->h.raw = skb->data;
57 return 0;
58}
59
60static struct xfrm_mode xfrm4_transport_mode = {
61 .input = xfrm4_transport_input,
62 .output = xfrm4_transport_output,
63 .owner = THIS_MODULE,
64 .encap = XFRM_MODE_TRANSPORT,
65};
66
67static int __init xfrm4_transport_init(void)
68{
69 return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
70}
71
72static void __exit xfrm4_transport_exit(void)
73{
74 int err;
75
76 err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
77 BUG_ON(err);
78}
79
80module_init(xfrm4_transport_init);
81module_exit(xfrm4_transport_exit);
82MODULE_LICENSE("GPL");
83MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 000000000000..f8d880beb12f
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
1/*
2 * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
3 *
4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
5 */
6
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/stringify.h>
12#include <net/dst.h>
13#include <net/inet_ecn.h>
14#include <net/ip.h>
15#include <net/xfrm.h>
16
17static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
18{
19 struct iphdr *outer_iph = skb->nh.iph;
20 struct iphdr *inner_iph = skb->h.ipiph;
21
22 if (INET_ECN_is_ce(outer_iph->tos))
23 IP_ECN_set_ce(inner_iph);
24}
25
26/* Add encapsulation header.
27 *
28 * The top IP header will be constructed per RFC 2401. The following fields
29 * in it shall be filled in by x->type->output:
30 * tot_len
31 * check
32 *
33 * On exit, skb->h will be set to the start of the payload to be processed
34 * by x->type->output and skb->nh will be set to the top IP header.
35 */
36static int xfrm4_tunnel_output(struct sk_buff *skb)
37{
38 struct dst_entry *dst = skb->dst;
39 struct xfrm_state *x = dst->xfrm;
40 struct iphdr *iph, *top_iph;
41 int flags;
42
43 iph = skb->nh.iph;
44 skb->h.ipiph = iph;
45
46 skb->nh.raw = skb_push(skb, x->props.header_len);
47 top_iph = skb->nh.iph;
48
49 top_iph->ihl = 5;
50 top_iph->version = 4;
51
52 /* DS disclosed */
53 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
54
55 flags = x->props.flags;
56 if (flags & XFRM_STATE_NOECN)
57 IP_ECN_clear(top_iph);
58
59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
60 0 : (iph->frag_off & htons(IP_DF));
61 if (!top_iph->frag_off)
62 __ip_select_ident(top_iph, dst->child, 0);
63
64 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
65
66 top_iph->saddr = x->props.saddr.a4;
67 top_iph->daddr = x->id.daddr.a4;
68 top_iph->protocol = IPPROTO_IPIP;
69
70 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
71 return 0;
72}
73
74static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
75{
76 struct iphdr *iph = skb->nh.iph;
77 int err = -EINVAL;
78
79 if (iph->protocol != IPPROTO_IPIP)
80 goto out;
81 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
82 goto out;
83
84 if (skb_cloned(skb) &&
85 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
86 goto out;
87
88 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
89 ipv4_copy_dscp(iph, skb->h.ipiph);
90 if (!(x->props.flags & XFRM_STATE_NOECN))
91 ipip_ecn_decapsulate(skb);
92 skb->mac.raw = memmove(skb->data - skb->mac_len,
93 skb->mac.raw, skb->mac_len);
94 skb->nh.raw = skb->data;
95 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
96 err = 0;
97
98out:
99 return err;
100}
101
102static struct xfrm_mode xfrm4_tunnel_mode = {
103 .input = xfrm4_tunnel_input,
104 .output = xfrm4_tunnel_output,
105 .owner = THIS_MODULE,
106 .encap = XFRM_MODE_TUNNEL,
107};
108
109static int __init xfrm4_tunnel_init(void)
110{
111 return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
112}
113
114static void __exit xfrm4_tunnel_exit(void)
115{
116 int err;
117
118 err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
119 BUG_ON(err);
120}
121
122module_init(xfrm4_tunnel_init);
123module_exit(xfrm4_tunnel_exit);
124MODULE_LICENSE("GPL");
125MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4ef8efaf6a67..ac9d91d4bb05 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,67 +12,10 @@
12#include <linux/skbuff.h> 12#include <linux/skbuff.h>
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/netfilter_ipv4.h> 14#include <linux/netfilter_ipv4.h>
15#include <net/inet_ecn.h>
16#include <net/ip.h> 15#include <net/ip.h>
17#include <net/xfrm.h> 16#include <net/xfrm.h>
18#include <net/icmp.h> 17#include <net/icmp.h>
19 18
20/* Add encapsulation header.
21 *
22 * In transport mode, the IP header will be moved forward to make space
23 * for the encapsulation header.
24 *
25 * In tunnel mode, the top IP header will be constructed per RFC 2401.
26 * The following fields in it shall be filled in by x->type->output:
27 * tot_len
28 * check
29 *
30 * On exit, skb->h will be set to the start of the payload to be processed
31 * by x->type->output and skb->nh will be set to the top IP header.
32 */
33static void xfrm4_encap(struct sk_buff *skb)
34{
35 struct dst_entry *dst = skb->dst;
36 struct xfrm_state *x = dst->xfrm;
37 struct iphdr *iph, *top_iph;
38 int flags;
39
40 iph = skb->nh.iph;
41 skb->h.ipiph = iph;
42
43 skb->nh.raw = skb_push(skb, x->props.header_len);
44 top_iph = skb->nh.iph;
45
46 if (!x->props.mode) {
47 skb->h.raw += iph->ihl*4;
48 memmove(top_iph, iph, iph->ihl*4);
49 return;
50 }
51
52 top_iph->ihl = 5;
53 top_iph->version = 4;
54
55 /* DS disclosed */
56 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
57
58 flags = x->props.flags;
59 if (flags & XFRM_STATE_NOECN)
60 IP_ECN_clear(top_iph);
61
62 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
63 0 : (iph->frag_off & htons(IP_DF));
64 if (!top_iph->frag_off)
65 __ip_select_ident(top_iph, dst->child, 0);
66
67 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
68
69 top_iph->saddr = x->props.saddr.a4;
70 top_iph->daddr = x->id.daddr.a4;
71 top_iph->protocol = IPPROTO_IPIP;
72
73 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
74}
75
76static int xfrm4_tunnel_check_size(struct sk_buff *skb) 19static int xfrm4_tunnel_check_size(struct sk_buff *skb)
77{ 20{
78 int mtu, ret = 0; 21 int mtu, ret = 0;
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb)
121 if (err) 64 if (err)
122 goto error; 65 goto error;
123 66
124 xfrm4_encap(skb); 67 err = x->mode->output(skb);
68 if (err)
69 goto error;
125 70
126 err = x->type->output(x, skb); 71 err = x->type->output(x, skb);
127 if (err) 72 if (err)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8604c747bca5..c0465284dfac 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
17static struct dst_ops xfrm4_dst_ops; 17static struct dst_ops xfrm4_dst_ops;
18static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 18static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
19 19
20static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
21
22static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) 20static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
23{ 21{
24 return __ip_route_output_key((struct rtable**)dst, fl); 22 return __ip_route_output_key((struct rtable**)dst, fl);
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
237 235
238static inline int xfrm4_garbage_collect(void) 236static inline int xfrm4_garbage_collect(void)
239{ 237{
240 read_lock(&xfrm4_policy_afinfo.lock);
241 xfrm4_policy_afinfo.garbage_collect(); 238 xfrm4_policy_afinfo.garbage_collect();
242 read_unlock(&xfrm4_policy_afinfo.lock);
243 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); 239 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
244} 240}
245 241
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = {
299 295
300static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 296static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
301 .family = AF_INET, 297 .family = AF_INET,
302 .lock = RW_LOCK_UNLOCKED,
303 .type_map = &xfrm4_type_map,
304 .dst_ops = &xfrm4_dst_ops, 298 .dst_ops = &xfrm4_dst_ops,
305 .dst_lookup = xfrm4_dst_lookup, 299 .dst_lookup = xfrm4_dst_lookup,
306 .find_bundle = __xfrm4_find_bundle, 300 .find_bundle = __xfrm4_find_bundle,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index dbabf81a9b7b..81e1751c966e 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
131 131
132static struct xfrm_state_afinfo xfrm4_state_afinfo = { 132static struct xfrm_state_afinfo xfrm4_state_afinfo = {
133 .family = AF_INET, 133 .family = AF_INET,
134 .lock = RW_LOCK_UNLOCKED,
135 .init_flags = xfrm4_init_flags, 134 .init_flags = xfrm4_init_flags,
136 .init_tempsel = __xfrm4_init_tempsel, 135 .init_tempsel = __xfrm4_init_tempsel,
137 .state_lookup = __xfrm4_state_lookup, 136 .state_lookup = __xfrm4_state_lookup,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index f8a107ab5592..e923d4dea418 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -106,6 +106,26 @@ config INET6_TUNNEL
106 tristate 106 tristate
107 default n 107 default n
108 108
109config INET6_XFRM_MODE_TRANSPORT
110 tristate "IPv6: IPsec transport mode"
111 depends on IPV6
112 default IPV6
113 select XFRM
114 ---help---
115 Support for IPsec transport mode.
116
117 If unsure, say Y.
118
119config INET6_XFRM_MODE_TUNNEL
120 tristate "IPv6: IPsec tunnel mode"
121 depends on IPV6
122 default IPV6
123 select XFRM
124 ---help---
125 Support for IPsec tunnel mode.
126
127 If unsure, say Y.
128
109config IPV6_TUNNEL 129config IPV6_TUNNEL
110 tristate "IPv6: IPv6-in-IPv6 tunnel" 130 tristate "IPv6: IPv6-in-IPv6 tunnel"
111 select INET6_TUNNEL 131 select INET6_TUNNEL
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a760b0988fbb..386e0a626948 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o
20obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o 20obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
21obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o 21obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
22obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o 22obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
23obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
24obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
23obj-$(CONFIG_NETFILTER) += netfilter/ 25obj-$(CONFIG_NETFILTER) += netfilter/
24 26
25obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o 27obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 445006ee4522..c2c26fa0943d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2860,6 +2860,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2860 return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); 2860 return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen);
2861} 2861}
2862 2862
2863/* Maximum length of ifa_cacheinfo attributes */
2864#define INET6_IFADDR_RTA_SPACE \
2865 RTA_SPACE(16) /* IFA_ADDRESS */ + \
2866 RTA_SPACE(sizeof(struct ifa_cacheinfo)) /* CACHEINFO */
2867
2863static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, 2868static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
2864 u32 pid, u32 seq, int event, unsigned int flags) 2869 u32 pid, u32 seq, int event, unsigned int flags)
2865{ 2870{
@@ -3092,7 +3097,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
3092static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) 3097static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
3093{ 3098{
3094 struct sk_buff *skb; 3099 struct sk_buff *skb;
3095 int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); 3100 int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + INET6_IFADDR_RTA_SPACE);
3096 3101
3097 skb = alloc_skb(size, GFP_ATOMIC); 3102 skb = alloc_skb(size, GFP_ATOMIC);
3098 if (!skb) { 3103 if (!skb) {
@@ -3142,6 +3147,17 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
3142#endif 3147#endif
3143} 3148}
3144 3149
3150/* Maximum length of ifinfomsg attributes */
3151#define INET6_IFINFO_RTA_SPACE \
3152 RTA_SPACE(IFNAMSIZ) /* IFNAME */ + \
3153 RTA_SPACE(MAX_ADDR_LEN) /* ADDRESS */ + \
3154 RTA_SPACE(sizeof(u32)) /* MTU */ + \
3155 RTA_SPACE(sizeof(int)) /* LINK */ + \
3156 RTA_SPACE(0) /* PROTINFO */ + \
3157 RTA_SPACE(sizeof(u32)) /* FLAGS */ + \
3158 RTA_SPACE(sizeof(struct ifla_cacheinfo)) /* CACHEINFO */ + \
3159 RTA_SPACE(sizeof(__s32[DEVCONF_MAX])) /* CONF */
3160
3145static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 3161static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
3146 u32 pid, u32 seq, int event, unsigned int flags) 3162 u32 pid, u32 seq, int event, unsigned int flags)
3147{ 3163{
@@ -3235,8 +3251,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
3235void inet6_ifinfo_notify(int event, struct inet6_dev *idev) 3251void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
3236{ 3252{
3237 struct sk_buff *skb; 3253 struct sk_buff *skb;
3238 /* 128 bytes ?? */ 3254 int size = NLMSG_SPACE(sizeof(struct ifinfomsg) + INET6_IFINFO_RTA_SPACE);
3239 int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+128);
3240 3255
3241 skb = alloc_skb(size, GFP_ATOMIC); 3256 skb = alloc_skb(size, GFP_ATOMIC);
3242 if (!skb) { 3257 if (!skb) {
@@ -3252,6 +3267,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
3252 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); 3267 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC);
3253} 3268}
3254 3269
3270/* Maximum length of prefix_cacheinfo attributes */
3271#define INET6_PREFIX_RTA_SPACE \
3272 RTA_SPACE(sizeof(((struct prefix_info *)NULL)->prefix)) /* ADDRESS */ + \
3273 RTA_SPACE(sizeof(struct prefix_cacheinfo)) /* CACHEINFO */
3274
3255static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, 3275static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
3256 struct prefix_info *pinfo, u32 pid, u32 seq, 3276 struct prefix_info *pinfo, u32 pid, u32 seq,
3257 int event, unsigned int flags) 3277 int event, unsigned int flags)
@@ -3296,7 +3316,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
3296 struct prefix_info *pinfo) 3316 struct prefix_info *pinfo)
3297{ 3317{
3298 struct sk_buff *skb; 3318 struct sk_buff *skb;
3299 int size = NLMSG_SPACE(sizeof(struct prefixmsg)+128); 3319 int size = NLMSG_SPACE(sizeof(struct prefixmsg) + INET6_PREFIX_RTA_SPACE);
3300 3320
3301 skb = alloc_skb(size, GFP_ATOMIC); 3321 skb = alloc_skb(size, GFP_ATOMIC);
3302 if (!skb) { 3322 if (!skb) {
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 6778173a3dda..d31c0d6c0448 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -292,7 +292,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
292 292
293 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); 293 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
294 memset(ah->auth_data, 0, ahp->icv_trunc_len); 294 memset(ah->auth_data, 0, ahp->icv_trunc_len);
295 skb_push(skb, skb->data - skb->nh.raw); 295 skb_push(skb, hdr_len);
296 ahp->icv(ahp, skb, ah->auth_data); 296 ahp->icv(ahp, skb, ah->auth_data);
297 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { 297 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
298 LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); 298 LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n");
@@ -301,12 +301,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
301 } 301 }
302 } 302 }
303 303
304 skb->nh.raw = skb_pull(skb, ah_hlen); 304 skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len);
305 memcpy(skb->nh.raw, tmp_hdr, hdr_len); 305 __skb_pull(skb, ah_hlen + hdr_len);
306 skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
307 skb_pull(skb, hdr_len);
308 skb->h.raw = skb->data;
309
310 306
311 kfree(tmp_hdr); 307 kfree(tmp_hdr);
312 308
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 22f046079037..a15a6f320f70 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -142,25 +142,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
142 142
143 int hdr_len = skb->h.raw - skb->nh.raw; 143 int hdr_len = skb->h.raw - skb->nh.raw;
144 int nfrags; 144 int nfrags;
145 unsigned char *tmp_hdr = NULL;
146 int ret = 0; 145 int ret = 0;
147 146
148 if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { 147 if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) {
149 ret = -EINVAL; 148 ret = -EINVAL;
150 goto out_nofree; 149 goto out;
151 } 150 }
152 151
153 if (elen <= 0 || (elen & (blksize-1))) { 152 if (elen <= 0 || (elen & (blksize-1))) {
154 ret = -EINVAL; 153 ret = -EINVAL;
155 goto out_nofree; 154 goto out;
156 }
157
158 tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC);
159 if (!tmp_hdr) {
160 ret = -ENOMEM;
161 goto out_nofree;
162 } 155 }
163 memcpy(tmp_hdr, skb->nh.raw, hdr_len);
164 156
165 /* If integrity check is required, do this. */ 157 /* If integrity check is required, do this. */
166 if (esp->auth.icv_full_len) { 158 if (esp->auth.icv_full_len) {
@@ -222,16 +214,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
222 /* ... check padding bits here. Silly. :-) */ 214 /* ... check padding bits here. Silly. :-) */
223 215
224 pskb_trim(skb, skb->len - alen - padlen - 2); 216 pskb_trim(skb, skb->len - alen - padlen - 2);
225 skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen);
226 skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen;
227 memcpy(skb->nh.raw, tmp_hdr, hdr_len);
228 skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
229 ret = nexthdr[1]; 217 ret = nexthdr[1];
230 } 218 }
231 219
220 skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len;
221
232out: 222out:
233 kfree(tmp_hdr);
234out_nofree:
235 return ret; 223 return ret;
236} 224}
237 225
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e46048974f37..d29620f4910e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
39#include <linux/in6.h> 39#include <linux/in6.h>
40#include <linux/tcp.h> 40#include <linux/tcp.h>
41#include <linux/route.h> 41#include <linux/route.h>
42#include <linux/module.h>
42 43
43#include <linux/netfilter.h> 44#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h> 45#include <linux/netfilter_ipv6.h>
@@ -458,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
458 nf_bridge_get(to->nf_bridge); 459 nf_bridge_get(to->nf_bridge);
459#endif 460#endif
460#endif 461#endif
462 skb_copy_secmark(to, from);
461} 463}
462 464
463int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) 465int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
@@ -488,6 +490,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
488 490
489 return offset; 491 return offset;
490} 492}
493EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
491 494
492static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 495static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
493{ 496{
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 48636436028a..f28cd37feed3 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -65,38 +65,25 @@ static LIST_HEAD(ipcomp6_tfms_list);
65 65
66static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) 66static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
67{ 67{
68 int err = 0; 68 int err = -ENOMEM;
69 u8 nexthdr = 0;
70 int hdr_len = skb->h.raw - skb->nh.raw;
71 unsigned char *tmp_hdr = NULL;
72 struct ipv6hdr *iph; 69 struct ipv6hdr *iph;
70 struct ipv6_comp_hdr *ipch;
73 int plen, dlen; 71 int plen, dlen;
74 struct ipcomp_data *ipcd = x->data; 72 struct ipcomp_data *ipcd = x->data;
75 u8 *start, *scratch; 73 u8 *start, *scratch;
76 struct crypto_tfm *tfm; 74 struct crypto_tfm *tfm;
77 int cpu; 75 int cpu;
78 76
79 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && 77 if (skb_linearize_cow(skb))
80 skb_linearize(skb, GFP_ATOMIC) != 0) {
81 err = -ENOMEM;
82 goto out; 78 goto out;
83 }
84 79
85 skb->ip_summed = CHECKSUM_NONE; 80 skb->ip_summed = CHECKSUM_NONE;
86 81
87 /* Remove ipcomp header and decompress original payload */ 82 /* Remove ipcomp header and decompress original payload */
88 iph = skb->nh.ipv6h; 83 iph = skb->nh.ipv6h;
89 tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); 84 ipch = (void *)skb->data;
90 if (!tmp_hdr) 85 skb->h.raw = skb->nh.raw + sizeof(*ipch);
91 goto out; 86 __skb_pull(skb, sizeof(*ipch));
92 memcpy(tmp_hdr, iph, hdr_len);
93 nexthdr = *(u8 *)skb->data;
94 skb_pull(skb, sizeof(struct ipv6_comp_hdr));
95 skb->nh.raw += sizeof(struct ipv6_comp_hdr);
96 memcpy(skb->nh.raw, tmp_hdr, hdr_len);
97 iph = skb->nh.ipv6h;
98 iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr));
99 skb->h.raw = skb->data;
100 87
101 /* decompression */ 88 /* decompression */
102 plen = skb->len; 89 plen = skb->len;
@@ -125,18 +112,11 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
125 112
126 skb_put(skb, dlen - plen); 113 skb_put(skb, dlen - plen);
127 memcpy(skb->data, scratch, dlen); 114 memcpy(skb->data, scratch, dlen);
115 err = ipch->nexthdr;
128 116
129 iph = skb->nh.ipv6h;
130 iph->payload_len = htons(skb->len);
131
132out_put_cpu: 117out_put_cpu:
133 put_cpu(); 118 put_cpu();
134out: 119out:
135 kfree(tmp_hdr);
136 if (err)
137 goto error_out;
138 return nexthdr;
139error_out:
140 return err; 120 return err;
141} 121}
142 122
@@ -159,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb)
159 goto out_ok; 139 goto out_ok;
160 } 140 }
161 141
162 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && 142 if (skb_linearize_cow(skb))
163 skb_linearize(skb, GFP_ATOMIC) != 0) {
164 goto out_ok; 143 goto out_ok;
165 }
166 144
167 /* compression */ 145 /* compression */
168 plen = skb->len - hdr_len; 146 plen = skb->len - hdr_len;
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 93bae36f2663..2a71c3b669f1 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -189,7 +189,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
189 189
190 /* This is where we call the helper: as the packet goes out. */ 190 /* This is where we call the helper: as the packet goes out. */
191 ct = nf_ct_get(*pskb, &ctinfo); 191 ct = nf_ct_get(*pskb, &ctinfo);
192 if (!ct) 192 if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
193 goto out; 193 goto out;
194 194
195 help = nfct_help(ct); 195 help = nfct_help(ct);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 86c6703265d0..ef18a7b7014b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
233 return -NF_ACCEPT; 233 return -NF_ACCEPT;
234 } 234 }
235 235
236 if (hooknum == NF_IP6_PRE_ROUTING && 236 if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING &&
237 nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { 237 nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
238 nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, 238 nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
239 "nf_ct_icmpv6: ICMPv6 checksum failed\n"); 239 "nf_ct_icmpv6: ICMPv6 checksum failed\n");
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3e319035f82d..c32a029e43f0 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -456,13 +456,9 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
456 DEBUGP("queue: message is too short.\n"); 456 DEBUGP("queue: message is too short.\n");
457 goto err; 457 goto err;
458 } 458 }
459 if (end-offset < skb->len) { 459 if (pskb_trim_rcsum(skb, end - offset)) {
460 if (pskb_trim(skb, end - offset)) { 460 DEBUGP("Can't trim\n");
461 DEBUGP("Can't trim\n"); 461 goto err;
462 goto err;
463 }
464 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
465 skb->ip_summed = CHECKSUM_NONE;
466 } 462 }
467 463
468 /* Find out which fragments are in front and at the back of us 464 /* Find out which fragments are in front and at the back of us
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 301eee726b0f..a50eb306e9e2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1218,8 +1218,16 @@ process:
1218 bh_lock_sock(sk); 1218 bh_lock_sock(sk);
1219 ret = 0; 1219 ret = 0;
1220 if (!sock_owned_by_user(sk)) { 1220 if (!sock_owned_by_user(sk)) {
1221 if (!tcp_prequeue(sk, skb)) 1221#ifdef CONFIG_NET_DMA
1222 ret = tcp_v6_do_rcv(sk, skb); 1222 struct tcp_sock *tp = tcp_sk(sk);
1223 if (tp->ucopy.dma_chan)
1224 ret = tcp_v6_do_rcv(sk, skb);
1225 else
1226#endif
1227 {
1228 if (!tcp_prequeue(sk, skb))
1229 ret = tcp_v6_do_rcv(sk, skb);
1230 }
1223 } else 1231 } else
1224 sk_add_backlog(sk, skb); 1232 sk_add_backlog(sk, skb);
1225 bh_unlock_sock(sk); 1233 bh_unlock_sock(sk);
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00cfdee18dca..0405d74ff910 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -13,21 +13,9 @@
13#include <linux/string.h> 13#include <linux/string.h>
14#include <linux/netfilter.h> 14#include <linux/netfilter.h>
15#include <linux/netfilter_ipv6.h> 15#include <linux/netfilter_ipv6.h>
16#include <net/dsfield.h>
17#include <net/inet_ecn.h>
18#include <net/ip.h>
19#include <net/ipv6.h> 16#include <net/ipv6.h>
20#include <net/xfrm.h> 17#include <net/xfrm.h>
21 18
22static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
23{
24 struct ipv6hdr *outer_iph = skb->nh.ipv6h;
25 struct ipv6hdr *inner_iph = skb->h.ipv6h;
26
27 if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
28 IP6_ECN_set_ce(inner_iph);
29}
30
31int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) 19int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
32{ 20{
33 int err; 21 int err;
@@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
81 69
82 xfrm_vec[xfrm_nr++] = x; 70 xfrm_vec[xfrm_nr++] = x;
83 71
72 if (x->mode->input(x, skb))
73 goto drop;
74
84 if (x->props.mode) { /* XXX */ 75 if (x->props.mode) { /* XXX */
85 if (nexthdr != IPPROTO_IPV6)
86 goto drop;
87 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
88 goto drop;
89 if (skb_cloned(skb) &&
90 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
91 goto drop;
92 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
93 ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
94 if (!(x->props.flags & XFRM_STATE_NOECN))
95 ipip6_ecn_decapsulate(skb);
96 skb->mac.raw = memmove(skb->data - skb->mac_len,
97 skb->mac.raw, skb->mac_len);
98 skb->nh.raw = skb->data;
99 decaps = 1; 76 decaps = 1;
100 break; 77 break;
101 } 78 }
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
new file mode 100644
index 000000000000..711d713e36d8
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -0,0 +1,88 @@
1/*
2 * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
3 *
4 * Copyright (C) 2002 USAGI/WIDE Project
5 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
6 */
7
8#include <linux/init.h>
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/stringify.h>
13#include <net/dst.h>
14#include <net/ipv6.h>
15#include <net/xfrm.h>
16
17/* Add encapsulation header.
18 *
19 * The IP header and mutable extension headers will be moved forward to make
20 * space for the encapsulation header.
21 *
22 * On exit, skb->h will be set to the start of the encapsulation header to be
23 * filled in by x->type->output and skb->nh will be set to the nextheader field
24 * of the extension header directly preceding the encapsulation header, or in
25 * its absence, that of the top IP header. The value of skb->data will always
26 * point to the top IP header.
27 */
28static int xfrm6_transport_output(struct sk_buff *skb)
29{
30 struct xfrm_state *x = skb->dst->xfrm;
31 struct ipv6hdr *iph;
32 u8 *prevhdr;
33 int hdr_len;
34
35 skb_push(skb, x->props.header_len);
36 iph = skb->nh.ipv6h;
37
38 hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
39 skb->nh.raw = prevhdr - x->props.header_len;
40 skb->h.raw = skb->data + hdr_len;
41 memmove(skb->data, iph, hdr_len);
42 return 0;
43}
44
45/* Remove encapsulation header.
46 *
47 * The IP header will be moved over the top of the encapsulation header.
48 *
49 * On entry, skb->h shall point to where the IP header should be and skb->nh
50 * shall be set to where the IP header currently is. skb->data shall point
51 * to the start of the payload.
52 */
53static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
54{
55 int ihl = skb->data - skb->h.raw;
56
57 if (skb->h.raw != skb->nh.raw)
58 skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl);
59 skb->nh.ipv6h->payload_len = htons(skb->len + ihl -
60 sizeof(struct ipv6hdr));
61 skb->h.raw = skb->data;
62 return 0;
63}
64
65static struct xfrm_mode xfrm6_transport_mode = {
66 .input = xfrm6_transport_input,
67 .output = xfrm6_transport_output,
68 .owner = THIS_MODULE,
69 .encap = XFRM_MODE_TRANSPORT,
70};
71
72static int __init xfrm6_transport_init(void)
73{
74 return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6);
75}
76
77static void __exit xfrm6_transport_exit(void)
78{
79 int err;
80
81 err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
82 BUG_ON(err);
83}
84
85module_init(xfrm6_transport_init);
86module_exit(xfrm6_transport_exit);
87MODULE_LICENSE("GPL");
88MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
new file mode 100644
index 000000000000..8af79be2edca
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -0,0 +1,121 @@
1/*
2 * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
3 *
4 * Copyright (C) 2002 USAGI/WIDE Project
5 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
6 */
7
8#include <linux/init.h>
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/stringify.h>
13#include <net/dsfield.h>
14#include <net/dst.h>
15#include <net/inet_ecn.h>
16#include <net/ipv6.h>
17#include <net/xfrm.h>
18
19static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
20{
21 struct ipv6hdr *outer_iph = skb->nh.ipv6h;
22 struct ipv6hdr *inner_iph = skb->h.ipv6h;
23
24 if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
25 IP6_ECN_set_ce(inner_iph);
26}
27
28/* Add encapsulation header.
29 *
30 * The top IP header will be constructed per RFC 2401. The following fields
31 * in it shall be filled in by x->type->output:
32 * payload_len
33 *
34 * On exit, skb->h will be set to the start of the encapsulation header to be
35 * filled in by x->type->output and skb->nh will be set to the nextheader field
36 * of the extension header directly preceding the encapsulation header, or in
37 * its absence, that of the top IP header. The value of skb->data will always
38 * point to the top IP header.
39 */
40static int xfrm6_tunnel_output(struct sk_buff *skb)
41{
42 struct dst_entry *dst = skb->dst;
43 struct xfrm_state *x = dst->xfrm;
44 struct ipv6hdr *iph, *top_iph;
45 int dsfield;
46
47 skb_push(skb, x->props.header_len);
48 iph = skb->nh.ipv6h;
49
50 skb->nh.raw = skb->data;
51 top_iph = skb->nh.ipv6h;
52 skb->nh.raw = &top_iph->nexthdr;
53 skb->h.ipv6h = top_iph + 1;
54
55 top_iph->version = 6;
56 top_iph->priority = iph->priority;
57 top_iph->flow_lbl[0] = iph->flow_lbl[0];
58 top_iph->flow_lbl[1] = iph->flow_lbl[1];
59 top_iph->flow_lbl[2] = iph->flow_lbl[2];
60 dsfield = ipv6_get_dsfield(top_iph);
61 dsfield = INET_ECN_encapsulate(dsfield, dsfield);
62 if (x->props.flags & XFRM_STATE_NOECN)
63 dsfield &= ~INET_ECN_MASK;
64 ipv6_change_dsfield(top_iph, 0, dsfield);
65 top_iph->nexthdr = IPPROTO_IPV6;
66 top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
67 ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
68 ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
69 return 0;
70}
71
72static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
73{
74 int err = -EINVAL;
75
76 if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6)
77 goto out;
78 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
79 goto out;
80
81 if (skb_cloned(skb) &&
82 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
83 goto out;
84
85 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
86 ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
87 if (!(x->props.flags & XFRM_STATE_NOECN))
88 ipip6_ecn_decapsulate(skb);
89 skb->mac.raw = memmove(skb->data - skb->mac_len,
90 skb->mac.raw, skb->mac_len);
91 skb->nh.raw = skb->data;
92 err = 0;
93
94out:
95 return err;
96}
97
98static struct xfrm_mode xfrm6_tunnel_mode = {
99 .input = xfrm6_tunnel_input,
100 .output = xfrm6_tunnel_output,
101 .owner = THIS_MODULE,
102 .encap = XFRM_MODE_TUNNEL,
103};
104
105static int __init xfrm6_tunnel_init(void)
106{
107 return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6);
108}
109
110static void __exit xfrm6_tunnel_exit(void)
111{
112 int err;
113
114 err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
115 BUG_ON(err);
116}
117
118module_init(xfrm6_tunnel_init);
119module_exit(xfrm6_tunnel_exit);
120MODULE_LICENSE("GPL");
121MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 80242172a5df..16e84254a252 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -14,68 +14,9 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/icmpv6.h> 15#include <linux/icmpv6.h>
16#include <linux/netfilter_ipv6.h> 16#include <linux/netfilter_ipv6.h>
17#include <net/dsfield.h>
18#include <net/inet_ecn.h>
19#include <net/ipv6.h> 17#include <net/ipv6.h>
20#include <net/xfrm.h> 18#include <net/xfrm.h>
21 19
22/* Add encapsulation header.
23 *
24 * In transport mode, the IP header and mutable extension headers will be moved
25 * forward to make space for the encapsulation header.
26 *
27 * In tunnel mode, the top IP header will be constructed per RFC 2401.
28 * The following fields in it shall be filled in by x->type->output:
29 * payload_len
30 *
31 * On exit, skb->h will be set to the start of the encapsulation header to be
32 * filled in by x->type->output and skb->nh will be set to the nextheader field
33 * of the extension header directly preceding the encapsulation header, or in
34 * its absence, that of the top IP header. The value of skb->data will always
35 * point to the top IP header.
36 */
37static void xfrm6_encap(struct sk_buff *skb)
38{
39 struct dst_entry *dst = skb->dst;
40 struct xfrm_state *x = dst->xfrm;
41 struct ipv6hdr *iph, *top_iph;
42 int dsfield;
43
44 skb_push(skb, x->props.header_len);
45 iph = skb->nh.ipv6h;
46
47 if (!x->props.mode) {
48 u8 *prevhdr;
49 int hdr_len;
50
51 hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
52 skb->nh.raw = prevhdr - x->props.header_len;
53 skb->h.raw = skb->data + hdr_len;
54 memmove(skb->data, iph, hdr_len);
55 return;
56 }
57
58 skb->nh.raw = skb->data;
59 top_iph = skb->nh.ipv6h;
60 skb->nh.raw = &top_iph->nexthdr;
61 skb->h.ipv6h = top_iph + 1;
62
63 top_iph->version = 6;
64 top_iph->priority = iph->priority;
65 top_iph->flow_lbl[0] = iph->flow_lbl[0];
66 top_iph->flow_lbl[1] = iph->flow_lbl[1];
67 top_iph->flow_lbl[2] = iph->flow_lbl[2];
68 dsfield = ipv6_get_dsfield(top_iph);
69 dsfield = INET_ECN_encapsulate(dsfield, dsfield);
70 if (x->props.flags & XFRM_STATE_NOECN)
71 dsfield &= ~INET_ECN_MASK;
72 ipv6_change_dsfield(top_iph, 0, dsfield);
73 top_iph->nexthdr = IPPROTO_IPV6;
74 top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
75 ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
76 ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
77}
78
79static int xfrm6_tunnel_check_size(struct sk_buff *skb) 20static int xfrm6_tunnel_check_size(struct sk_buff *skb)
80{ 21{
81 int mtu, ret = 0; 22 int mtu, ret = 0;
@@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb)
118 if (err) 59 if (err)
119 goto error; 60 goto error;
120 61
121 xfrm6_encap(skb); 62 err = x->mode->output(skb);
63 if (err)
64 goto error;
122 65
123 err = x->type->output(x, skb); 66 err = x->type->output(x, skb);
124 if (err) 67 if (err)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 88c840f1beb6..ee715f2691e9 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -23,8 +23,6 @@
23static struct dst_ops xfrm6_dst_ops; 23static struct dst_ops xfrm6_dst_ops;
24static struct xfrm_policy_afinfo xfrm6_policy_afinfo; 24static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
25 25
26static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED };
27
28static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) 26static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
29{ 27{
30 int err = 0; 28 int err = 0;
@@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl)
249 247
250static inline int xfrm6_garbage_collect(void) 248static inline int xfrm6_garbage_collect(void)
251{ 249{
252 read_lock(&xfrm6_policy_afinfo.lock);
253 xfrm6_policy_afinfo.garbage_collect(); 250 xfrm6_policy_afinfo.garbage_collect();
254 read_unlock(&xfrm6_policy_afinfo.lock);
255 return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); 251 return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2);
256} 252}
257 253
@@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = {
311 307
312static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { 308static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
313 .family = AF_INET6, 309 .family = AF_INET6,
314 .lock = RW_LOCK_UNLOCKED,
315 .type_map = &xfrm6_type_map,
316 .dst_ops = &xfrm6_dst_ops, 310 .dst_ops = &xfrm6_dst_ops,
317 .dst_lookup = xfrm6_dst_lookup, 311 .dst_lookup = xfrm6_dst_lookup,
318 .find_bundle = __xfrm6_find_bundle, 312 .find_bundle = __xfrm6_find_bundle,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index a5723024d3b3..b33296b3f6de 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto,
135 135
136static struct xfrm_state_afinfo xfrm6_state_afinfo = { 136static struct xfrm_state_afinfo xfrm6_state_afinfo = {
137 .family = AF_INET6, 137 .family = AF_INET6,
138 .lock = RW_LOCK_UNLOCKED,
139 .init_tempsel = __xfrm6_init_tempsel, 138 .init_tempsel = __xfrm6_init_tempsel,
140 .state_lookup = __xfrm6_state_lookup, 139 .state_lookup = __xfrm6_state_lookup,
141 .find_acq = __xfrm6_find_acq, 140 .find_acq = __xfrm6_find_acq,
diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c
index a394c6fe19a2..bba3431cd9a5 100644
--- a/net/ipx/ipx_route.c
+++ b/net/ipx/ipx_route.c
@@ -238,7 +238,7 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx,
238 } 238 }
239 239
240 /* Apply checksum. Not allowed on 802.3 links. */ 240 /* Apply checksum. Not allowed on 802.3 links. */
241 if (sk->sk_no_check || intrfc->if_dlink_type == IPX_FRAME_8023) 241 if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023))
242 ipx->ipx_checksum = 0xFFFF; 242 ipx->ipx_checksum = 0xFFFF;
243 else 243 else
244 ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); 244 ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr));
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index c19e9ce05a3a..57ea160f470b 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -44,6 +44,8 @@
44#include <net/irda/irlmp.h> 44#include <net/irda/irlmp.h>
45#include <net/irda/irlmp_frame.h> 45#include <net/irda/irlmp_frame.h>
46 46
47#include <asm/unaligned.h>
48
47static __u8 irlmp_find_free_slsap(void); 49static __u8 irlmp_find_free_slsap(void);
48static int irlmp_slsap_inuse(__u8 slsap_sel); 50static int irlmp_slsap_inuse(__u8 slsap_sel);
49 51
@@ -840,6 +842,7 @@ void irlmp_do_expiry(void)
840void irlmp_do_discovery(int nslots) 842void irlmp_do_discovery(int nslots)
841{ 843{
842 struct lap_cb *lap; 844 struct lap_cb *lap;
845 __u16 *data_hintsp;
843 846
844 /* Make sure the value is sane */ 847 /* Make sure the value is sane */
845 if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){ 848 if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){
@@ -849,7 +852,8 @@ void irlmp_do_discovery(int nslots)
849 } 852 }
850 853
851 /* Construct new discovery info to be used by IrLAP, */ 854 /* Construct new discovery info to be used by IrLAP, */
852 u16ho(irlmp->discovery_cmd.data.hints) = irlmp->hints.word; 855 data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints;
856 put_unaligned(irlmp->hints.word, data_hintsp);
853 857
854 /* 858 /*
855 * Set character set for device name (we use ASCII), and 859 * Set character set for device name (we use ASCII), and
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 859582275cab..d5e2121ea207 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1454,21 +1454,23 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
1454 if (x == NULL) 1454 if (x == NULL)
1455 return -ESRCH; 1455 return -ESRCH;
1456 1456
1457 if ((err = security_xfrm_state_delete(x)))
1458 goto out;
1459
1457 if (xfrm_state_kern(x)) { 1460 if (xfrm_state_kern(x)) {
1458 xfrm_state_put(x); 1461 err = -EPERM;
1459 return -EPERM; 1462 goto out;
1460 } 1463 }
1461 1464
1462 err = xfrm_state_delete(x); 1465 err = xfrm_state_delete(x);
1463 if (err < 0) { 1466 if (err < 0)
1464 xfrm_state_put(x); 1467 goto out;
1465 return err;
1466 }
1467 1468
1468 c.seq = hdr->sadb_msg_seq; 1469 c.seq = hdr->sadb_msg_seq;
1469 c.pid = hdr->sadb_msg_pid; 1470 c.pid = hdr->sadb_msg_pid;
1470 c.event = XFRM_MSG_DELSA; 1471 c.event = XFRM_MSG_DELSA;
1471 km_state_notify(x, &c); 1472 km_state_notify(x, &c);
1473out:
1472 xfrm_state_put(x); 1474 xfrm_state_put(x);
1473 1475
1474 return err; 1476 return err;
@@ -2274,11 +2276,14 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
2274 2276
2275 err = 0; 2277 err = 0;
2276 2278
2279 if ((err = security_xfrm_policy_delete(xp)))
2280 goto out;
2277 c.seq = hdr->sadb_msg_seq; 2281 c.seq = hdr->sadb_msg_seq;
2278 c.pid = hdr->sadb_msg_pid; 2282 c.pid = hdr->sadb_msg_pid;
2279 c.event = XFRM_MSG_DELPOLICY; 2283 c.event = XFRM_MSG_DELPOLICY;
2280 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); 2284 km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
2281 2285
2286out:
2282 xfrm_pol_put(xp); 2287 xfrm_pol_put(xp);
2283 return err; 2288 return err;
2284} 2289}
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db745c8d..75c9b1480801 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -674,7 +674,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
674 674
675 lock_sock(sk); 675 lock_sock(sk);
676 copied = -ENOTCONN; 676 copied = -ENOTCONN;
677 if (sk->sk_state == TCP_LISTEN) 677 if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN))
678 goto out; 678 goto out;
679 679
680 timeo = sock_rcvtimeo(sk, nonblock); 680 timeo = sock_rcvtimeo(sk, nonblock);
@@ -733,7 +733,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
733 if (sk->sk_shutdown & RCV_SHUTDOWN) 733 if (sk->sk_shutdown & RCV_SHUTDOWN)
734 break; 734 break;
735 735
736 if (sk->sk_state == TCP_CLOSE) { 736 if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) {
737 if (!sock_flag(sk, SOCK_DONE)) { 737 if (!sock_flag(sk, SOCK_DONE)) {
738 /* 738 /*
739 * This occurs when user tries to read 739 * This occurs when user tries to read
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
789 continue; 789 continue;
790 790
791 if (!(flags & MSG_PEEK)) { 791 if (!(flags & MSG_PEEK)) {
792 sk_eat_skb(sk, skb); 792 sk_eat_skb(sk, skb, 0);
793 *seq = 0; 793 *seq = 0;
794 } 794 }
795 } while (len > 0); 795 } while (len > 0);
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index ba90f7f0801a..5ae47be7dde0 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -26,8 +26,6 @@
26#include <net/llc_c_st.h> 26#include <net/llc_c_st.h>
27#include <net/tcp_states.h> 27#include <net/tcp_states.h>
28 28
29u8 llc_mac_null_var[IFHWADDRLEN];
30
31/** 29/**
32 * llc_build_and_send_pkt - Connection data sending for upper layers. 30 * llc_build_and_send_pkt - Connection data sending for upper layers.
33 * @sk: connection 31 * @sk: connection
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index d62e0f9b9da3..94d2368ade92 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -142,6 +142,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
142 struct llc_sap *sap; 142 struct llc_sap *sap;
143 struct llc_pdu_sn *pdu; 143 struct llc_pdu_sn *pdu;
144 int dest; 144 int dest;
145 int (*rcv)(struct sk_buff *, struct net_device *,
146 struct packet_type *, struct net_device *);
145 147
146 /* 148 /*
147 * When the interface is in promisc. mode, drop all the crap that it 149 * When the interface is in promisc. mode, drop all the crap that it
@@ -169,9 +171,11 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
169 * First the upper layer protocols that don't need the full 171 * First the upper layer protocols that don't need the full
170 * LLC functionality 172 * LLC functionality
171 */ 173 */
172 if (sap->rcv_func) { 174 rcv = rcu_dereference(sap->rcv_func);
173 sap->rcv_func(skb, dev, pt, orig_dev); 175 if (rcv) {
174 goto out_put; 176 struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
177 if (cskb)
178 rcv(cskb, dev, pt, orig_dev);
175 } 179 }
176 dest = llc_pdu_type(skb); 180 dest = llc_pdu_type(skb);
177 if (unlikely(!dest || !llc_type_handlers[dest - 1])) 181 if (unlikely(!dest || !llc_type_handlers[dest - 1]))
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 4029ceee9b91..20c4eb5c1ac6 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb)
282 * mac, and local sap. Returns pointer for socket found, %NULL otherwise. 282 * mac, and local sap. Returns pointer for socket found, %NULL otherwise.
283 */ 283 */
284static struct sock *llc_lookup_dgram(struct llc_sap *sap, 284static struct sock *llc_lookup_dgram(struct llc_sap *sap,
285 struct llc_addr *laddr) 285 const struct llc_addr *laddr)
286{ 286{
287 struct sock *rc; 287 struct sock *rc;
288 struct hlist_node *node; 288 struct hlist_node *node;
@@ -304,19 +304,62 @@ found:
304 return rc; 304 return rc;
305} 305}
306 306
307/**
308 * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets.
309 * @sap: SAP
310 * @laddr: address of local LLC (MAC + SAP)
311 *
312 * Search socket list of the SAP and finds connections with same sap.
313 * Deliver clone to each.
314 */
315static void llc_sap_mcast(struct llc_sap *sap,
316 const struct llc_addr *laddr,
317 struct sk_buff *skb)
318{
319 struct sock *sk;
320 struct hlist_node *node;
321
322 read_lock_bh(&sap->sk_list.lock);
323 sk_for_each(sk, node, &sap->sk_list.list) {
324 struct llc_sock *llc = llc_sk(sk);
325 struct sk_buff *skb1;
326
327 if (sk->sk_type != SOCK_DGRAM)
328 continue;
329
330 if (llc->laddr.lsap != laddr->lsap)
331 continue;
332
333 skb1 = skb_clone(skb, GFP_ATOMIC);
334 if (!skb1)
335 break;
336
337 sock_hold(sk);
338 skb_set_owner_r(skb1, sk);
339 llc_sap_rcv(sap, skb1);
340 sock_put(sk);
341 }
342 read_unlock_bh(&sap->sk_list.lock);
343}
344
345
307void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) 346void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
308{ 347{
309 struct llc_addr laddr; 348 struct llc_addr laddr;
310 struct sock *sk;
311 349
312 llc_pdu_decode_da(skb, laddr.mac); 350 llc_pdu_decode_da(skb, laddr.mac);
313 llc_pdu_decode_dsap(skb, &laddr.lsap); 351 llc_pdu_decode_dsap(skb, &laddr.lsap);
314 352
315 sk = llc_lookup_dgram(sap, &laddr); 353 if (llc_mac_multicast(laddr.mac)) {
316 if (sk) { 354 llc_sap_mcast(sap, &laddr, skb);
317 skb_set_owner_r(skb, sk);
318 llc_sap_rcv(sap, skb);
319 sock_put(sk);
320 } else
321 kfree_skb(skb); 355 kfree_skb(skb);
356 } else {
357 struct sock *sk = llc_lookup_dgram(sap, &laddr);
358 if (sk) {
359 skb_set_owner_r(skb, sk);
360 llc_sap_rcv(sap, skb);
361 sock_put(sk);
362 } else
363 kfree_skb(skb);
364 }
322} 365}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e2893effdfaa..b1622b7de1cf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK
60 of packets, but this mark value is kept in the conntrack session 60 of packets, but this mark value is kept in the conntrack session
61 instead of the individual packets. 61 instead of the individual packets.
62 62
63config NF_CONNTRACK_SECMARK
64 bool 'Connection tracking security mark support'
65 depends on NF_CONNTRACK && NETWORK_SECMARK
66 help
67 This option enables security markings to be applied to
68 connections. Typically they are copied to connections from
69 packets using the CONNSECMARK target and copied back from
70 connections to packets with the same target, with the packets
71 being originally labeled via SECMARK.
72
73 If unsure, say 'N'.
74
63config NF_CONNTRACK_EVENTS 75config NF_CONNTRACK_EVENTS
64 bool "Connection tracking events (EXPERIMENTAL)" 76 bool "Connection tracking events (EXPERIMENTAL)"
65 depends on EXPERIMENTAL && NF_CONNTRACK 77 depends on EXPERIMENTAL && NF_CONNTRACK
@@ -174,6 +186,26 @@ config NETFILTER_XT_TARGET_NOTRACK
174 If you want to compile it as a module, say M here and read 186 If you want to compile it as a module, say M here and read
175 <file:Documentation/modules.txt>. If unsure, say `N'. 187 <file:Documentation/modules.txt>. If unsure, say `N'.
176 188
189config NETFILTER_XT_TARGET_SECMARK
190 tristate '"SECMARK" target support'
191 depends on NETFILTER_XTABLES && NETWORK_SECMARK
192 help
193 The SECMARK target allows security marking of network
194 packets, for use with security subsystems.
195
196 To compile it as a module, choose M here. If unsure, say N.
197
198config NETFILTER_XT_TARGET_CONNSECMARK
199 tristate '"CONNSECMARK" target support'
200 depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK)
201 help
202 The CONNSECMARK target copies security markings from packets
203 to connections, and restores security markings from connections
204 to packets (if the packets are not already marked). This would
205 normally be used in conjunction with the SECMARK target.
206
207 To compile it as a module, choose M here. If unsure, say N.
208
177config NETFILTER_XT_MATCH_COMMENT 209config NETFILTER_XT_MATCH_COMMENT
178 tristate '"comment" match support' 210 tristate '"comment" match support'
179 depends on NETFILTER_XTABLES 211 depends on NETFILTER_XTABLES
@@ -329,6 +361,16 @@ config NETFILTER_XT_MATCH_PKTTYPE
329 361
330 To compile it as a module, choose M here. If unsure, say N. 362 To compile it as a module, choose M here. If unsure, say N.
331 363
364config NETFILTER_XT_MATCH_QUOTA
365 tristate '"quota" match support'
366 depends on NETFILTER_XTABLES
367 help
368 This option adds a `quota' match, which allows to match on a
369 byte counter.
370
371 If you want to compile it as a module, say M here and read
372 <file:Documentation/modules.txt>. If unsure, say `N'.
373
332config NETFILTER_XT_MATCH_REALM 374config NETFILTER_XT_MATCH_REALM
333 tristate '"realm" match support' 375 tristate '"realm" match support'
334 depends on NETFILTER_XTABLES 376 depends on NETFILTER_XTABLES
@@ -365,6 +407,12 @@ config NETFILTER_XT_MATCH_STATE
365 407
366 To compile it as a module, choose M here. If unsure, say N. 408 To compile it as a module, choose M here. If unsure, say N.
367 409
410config NETFILTER_XT_MATCH_STATISTIC
411 tristate '"statistic" match support'
412 depends on NETFILTER_XTABLES
413 help
414 statistic module
415
368config NETFILTER_XT_MATCH_STRING 416config NETFILTER_XT_MATCH_STRING
369 tristate '"string" match support' 417 tristate '"string" match support'
370 depends on NETFILTER_XTABLES 418 depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 95b7e416512d..6fa4b7580458 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -28,6 +28,8 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
28obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o 28obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
29obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o 29obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
30obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o 30obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
31obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
32obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
31 33
32# matches 34# matches
33obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o 35obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
@@ -44,9 +46,11 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o
44obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o 46obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
45obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o 47obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
46obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o 48obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
49obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
47obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o 50obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
48obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o 51obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
49obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o 52obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
53obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
50obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o 54obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
51obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o 55obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
52obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o 56obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f9b83f91371a..cd299f4b7db1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -990,6 +990,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
990#ifdef CONFIG_NF_CONNTRACK_MARK 990#ifdef CONFIG_NF_CONNTRACK_MARK
991 conntrack->mark = exp->master->mark; 991 conntrack->mark = exp->master->mark;
992#endif 992#endif
993#ifdef CONFIG_NF_CONNTRACK_SECMARK
994 conntrack->secmark = exp->master->secmark;
995#endif
993 nf_conntrack_get(&conntrack->master->ct_general); 996 nf_conntrack_get(&conntrack->master->ct_general);
994 NF_CT_STAT_INC(expect_new); 997 NF_CT_STAT_INC(expect_new);
995 } else 998 } else
@@ -1396,6 +1399,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
1396 1399
1397 write_lock_bh(&nf_conntrack_lock); 1400 write_lock_bh(&nf_conntrack_lock);
1398 1401
1402 /* Only update if this is not a fixed timeout */
1403 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1404 write_unlock_bh(&nf_conntrack_lock);
1405 return;
1406 }
1407
1399 /* If not in hash table, timer will not be active yet */ 1408 /* If not in hash table, timer will not be active yet */
1400 if (!nf_ct_is_confirmed(ct)) { 1409 if (!nf_ct_is_confirmed(ct)) {
1401 ct->timeout.expires = extra_jiffies; 1410 ct->timeout.expires = extra_jiffies;
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index e38a4b5a3089..11d3be243536 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -67,37 +67,48 @@ static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
67 char); 67 char);
68 68
69static struct ftp_search { 69static struct ftp_search {
70 enum ip_conntrack_dir dir;
71 const char *pattern; 70 const char *pattern;
72 size_t plen; 71 size_t plen;
73 char skip; 72 char skip;
74 char term; 73 char term;
75 enum ip_ct_ftp_type ftptype; 74 enum ip_ct_ftp_type ftptype;
76 int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); 75 int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
77} search[] = { 76} search[IP_CT_DIR_MAX][2] = {
78 { 77 [IP_CT_DIR_ORIGINAL] = {
79 IP_CT_DIR_ORIGINAL, 78 {
80 "PORT", sizeof("PORT") - 1, ' ', '\r', 79 .pattern = "PORT",
81 IP_CT_FTP_PORT, 80 .plen = sizeof("PORT") - 1,
82 try_rfc959, 81 .skip = ' ',
82 .term = '\r',
83 .ftptype = IP_CT_FTP_PORT,
84 .getnum = try_rfc959,
85 },
86 {
87 .pattern = "EPRT",
88 .plen = sizeof("EPRT") - 1,
89 .skip = ' ',
90 .term = '\r',
91 .ftptype = IP_CT_FTP_EPRT,
92 .getnum = try_eprt,
93 },
83 }, 94 },
84 { 95 [IP_CT_DIR_REPLY] = {
85 IP_CT_DIR_REPLY, 96 {
86 "227 ", sizeof("227 ") - 1, '(', ')', 97 .pattern = "227 ",
87 IP_CT_FTP_PASV, 98 .plen = sizeof("227 ") - 1,
88 try_rfc959, 99 .skip = '(',
89 }, 100 .term = ')',
90 { 101 .ftptype = IP_CT_FTP_PASV,
91 IP_CT_DIR_ORIGINAL, 102 .getnum = try_rfc959,
92 "EPRT", sizeof("EPRT") - 1, ' ', '\r', 103 },
93 IP_CT_FTP_EPRT, 104 {
94 try_eprt, 105 .pattern = "229 ",
95 }, 106 .plen = sizeof("229 ") - 1,
96 { 107 .skip = '(',
97 IP_CT_DIR_REPLY, 108 .term = ')',
98 "229 ", sizeof("229 ") - 1, '(', ')', 109 .ftptype = IP_CT_FTP_EPSV,
99 IP_CT_FTP_EPSV, 110 .getnum = try_epsv_response,
100 try_epsv_response, 111 },
101 }, 112 },
102}; 113};
103 114
@@ -492,17 +503,15 @@ static int help(struct sk_buff **pskb,
492 memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, 503 memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
493 sizeof(cmd.u3.all)); 504 sizeof(cmd.u3.all));
494 505
495 for (i = 0; i < ARRAY_SIZE(search); i++) { 506 for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
496 if (search[i].dir != dir) continue;
497
498 found = find_pattern(fb_ptr, datalen, 507 found = find_pattern(fb_ptr, datalen,
499 search[i].pattern, 508 search[dir][i].pattern,
500 search[i].plen, 509 search[dir][i].plen,
501 search[i].skip, 510 search[dir][i].skip,
502 search[i].term, 511 search[dir][i].term,
503 &matchoff, &matchlen, 512 &matchoff, &matchlen,
504 &cmd, 513 &cmd,
505 search[i].getnum); 514 search[dir][i].getnum);
506 if (found) break; 515 if (found) break;
507 } 516 }
508 if (found == -1) { 517 if (found == -1) {
@@ -512,7 +521,7 @@ static int help(struct sk_buff **pskb,
512 this case. */ 521 this case. */
513 if (net_ratelimit()) 522 if (net_ratelimit())
514 printk("conntrack_ftp: partial %s %u+%u\n", 523 printk("conntrack_ftp: partial %s %u+%u\n",
515 search[i].pattern, 524 search[dir][i].pattern,
516 ntohl(th->seq), datalen); 525 ntohl(th->seq), datalen);
517 ret = NF_DROP; 526 ret = NF_DROP;
518 goto out; 527 goto out;
@@ -597,7 +606,7 @@ static int help(struct sk_buff **pskb,
597 /* Now, NAT might want to mangle the packet, and register the 606 /* Now, NAT might want to mangle the packet, and register the
598 * (possibly changed) expectation itself. */ 607 * (possibly changed) expectation itself. */
599 if (nf_nat_ftp_hook) 608 if (nf_nat_ftp_hook)
600 ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, 609 ret = nf_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype,
601 matchoff, matchlen, exp, &seq); 610 matchoff, matchlen, exp, &seq);
602 else { 611 else {
603 /* Can't expect this? Best to drop packet now. */ 612 /* Can't expect this? Best to drop packet now. */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index bd10eb944b65..b8c7c567c9df 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -407,6 +407,8 @@ nfattr_failure:
407 407
408static int ctnetlink_done(struct netlink_callback *cb) 408static int ctnetlink_done(struct netlink_callback *cb)
409{ 409{
410 if (cb->args[1])
411 nf_ct_put((struct nf_conn *)cb->args[1]);
410 DEBUGP("entered %s\n", __FUNCTION__); 412 DEBUGP("entered %s\n", __FUNCTION__);
411 return 0; 413 return 0;
412} 414}
@@ -416,10 +418,9 @@ static int ctnetlink_done(struct netlink_callback *cb)
416static int 418static int
417ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) 419ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
418{ 420{
419 struct nf_conn *ct = NULL; 421 struct nf_conn *ct, *last;
420 struct nf_conntrack_tuple_hash *h; 422 struct nf_conntrack_tuple_hash *h;
421 struct list_head *i; 423 struct list_head *i;
422 u_int32_t *id = (u_int32_t *) &cb->args[1];
423 struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); 424 struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh);
424 u_int8_t l3proto = nfmsg->nfgen_family; 425 u_int8_t l3proto = nfmsg->nfgen_family;
425 426
@@ -427,7 +428,9 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
427 cb->args[0], *id); 428 cb->args[0], *id);
428 429
429 read_lock_bh(&nf_conntrack_lock); 430 read_lock_bh(&nf_conntrack_lock);
430 for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { 431 for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
432restart:
433 last = (struct nf_conn *)cb->args[1];
431 list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { 434 list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) {
432 h = (struct nf_conntrack_tuple_hash *) i; 435 h = (struct nf_conntrack_tuple_hash *) i;
433 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) 436 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -438,17 +441,30 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
438 * then dump everything. */ 441 * then dump everything. */
439 if (l3proto && L3PROTO(ct) != l3proto) 442 if (l3proto && L3PROTO(ct) != l3proto)
440 continue; 443 continue;
441 if (ct->id <= *id) 444 if (last != NULL) {
442 continue; 445 if (ct == last) {
446 nf_ct_put(last);
447 cb->args[1] = 0;
448 last = NULL;
449 } else
450 continue;
451 }
443 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, 452 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
444 cb->nlh->nlmsg_seq, 453 cb->nlh->nlmsg_seq,
445 IPCTNL_MSG_CT_NEW, 454 IPCTNL_MSG_CT_NEW,
446 1, ct) < 0) 455 1, ct) < 0) {
456 nf_conntrack_get(&ct->ct_general);
457 cb->args[1] = (unsigned long)ct;
447 goto out; 458 goto out;
448 *id = ct->id; 459 }
460 }
461 if (last != NULL) {
462 nf_ct_put(last);
463 cb->args[1] = 0;
464 goto restart;
449 } 465 }
450 } 466 }
451out: 467out:
452 read_unlock_bh(&nf_conntrack_lock); 468 read_unlock_bh(&nf_conntrack_lock);
453 469
454 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); 470 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
@@ -641,7 +657,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
641}; 657};
642 658
643static inline int 659static inline int
644ctnetlink_parse_nat(struct nfattr *cda[], 660ctnetlink_parse_nat(struct nfattr *nat,
645 const struct nf_conn *ct, struct ip_nat_range *range) 661 const struct nf_conn *ct, struct ip_nat_range *range)
646{ 662{
647 struct nfattr *tb[CTA_NAT_MAX]; 663 struct nfattr *tb[CTA_NAT_MAX];
@@ -651,7 +667,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
651 667
652 memset(range, 0, sizeof(*range)); 668 memset(range, 0, sizeof(*range));
653 669
654 nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); 670 nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
655 671
656 if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) 672 if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
657 return -EINVAL; 673 return -EINVAL;
@@ -866,39 +882,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[])
866 /* ASSURED bit can only be set */ 882 /* ASSURED bit can only be set */
867 return -EINVAL; 883 return -EINVAL;
868 884
869 if (cda[CTA_NAT-1]) { 885 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
870#ifndef CONFIG_IP_NF_NAT_NEEDED 886#ifndef CONFIG_IP_NF_NAT_NEEDED
871 return -EINVAL; 887 return -EINVAL;
872#else 888#else
873 unsigned int hooknum;
874 struct ip_nat_range range; 889 struct ip_nat_range range;
875 890
876 if (ctnetlink_parse_nat(cda, ct, &range) < 0) 891 if (cda[CTA_NAT_DST-1]) {
877 return -EINVAL; 892 if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
878 893 &range) < 0)
879 DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 894 return -EINVAL;
880 NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), 895 if (ip_nat_initialized(ct,
881 htons(range.min.all), htons(range.max.all)); 896 HOOK2MANIP(NF_IP_PRE_ROUTING)))
882 897 return -EEXIST;
883 /* This is tricky but it works. ip_nat_setup_info needs the 898 ip_nat_setup_info(ct, &range, hooknum);
884 * hook number as parameter, so let's do the correct 899 }
885 * conversion and run away */ 900 if (cda[CTA_NAT_SRC-1]) {
886 if (status & IPS_SRC_NAT_DONE) 901 if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
887 hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ 902 &range) < 0)
888 else if (status & IPS_DST_NAT_DONE) 903 return -EINVAL;
889 hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ 904 if (ip_nat_initialized(ct,
890 else 905 HOOK2MANIP(NF_IP_POST_ROUTING)))
891 return -EINVAL; /* Missing NAT flags */ 906 return -EEXIST;
892 907 ip_nat_setup_info(ct, &range, hooknum);
893 DEBUGP("NAT status: %lu\n", 908 }
894 status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
895
896 if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
897 return -EEXIST;
898 ip_nat_setup_info(ct, &range, hooknum);
899
900 DEBUGP("NAT status after setup_info: %lu\n",
901 ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
902#endif 909#endif
903 } 910 }
904 911
@@ -1122,7 +1129,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
1122 /* implicit 'else' */ 1129 /* implicit 'else' */
1123 1130
1124 /* we only allow nat config for new conntracks */ 1131 /* we only allow nat config for new conntracks */
1125 if (cda[CTA_NAT-1]) { 1132 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
1126 err = -EINVAL; 1133 err = -EINVAL;
1127 goto out_unlock; 1134 goto out_unlock;
1128 } 1135 }
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69899f27d26a..12fb7c0a1509 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb,
828 * and moreover root might send raw packets. 828 * and moreover root might send raw packets.
829 */ 829 */
830 /* FIXME: Source route IP option packets --RR */ 830 /* FIXME: Source route IP option packets --RR */
831 if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || 831 if (nf_conntrack_checksum &&
832 (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && 832 ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
833 (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
833 nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { 834 nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
834 if (LOG_INVALID(IPPROTO_TCP)) 835 if (LOG_INVALID(IPPROTO_TCP))
835 nf_log_packet(pf, 0, skb, NULL, NULL, NULL, 836 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index d93edbfde9e3..ae07ebe3ab37 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
134 * because the semantic of CHECKSUM_HW is different there 134 * because the semantic of CHECKSUM_HW is different there
135 * and moreover root might send raw packets. 135 * and moreover root might send raw packets.
136 * FIXME: Source route IP option packets --RR */ 136 * FIXME: Source route IP option packets --RR */
137 if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || 137 if (nf_conntrack_checksum &&
138 ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
138 (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && 139 (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
139 nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { 140 nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
140 if (LOG_INVALID(IPPROTO_UDP)) 141 if (LOG_INVALID(IPPROTO_UDP))
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 408960c6a544..e34c574f0351 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
213 return -ENOSPC; 213 return -ENOSPC;
214#endif 214#endif
215 215
216#ifdef CONFIG_NF_CONNTRACK_SECMARK
217 if (seq_printf(s, "secmark=%u ", conntrack->secmark))
218 return -ENOSPC;
219#endif
220
216 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) 221 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
217 return -ENOSPC; 222 return -ENOSPC;
218 223
@@ -455,6 +460,8 @@ extern unsigned int nf_ct_generic_timeout;
455static int log_invalid_proto_min = 0; 460static int log_invalid_proto_min = 0;
456static int log_invalid_proto_max = 255; 461static int log_invalid_proto_max = 255;
457 462
463int nf_conntrack_checksum = 1;
464
458static struct ctl_table_header *nf_ct_sysctl_header; 465static struct ctl_table_header *nf_ct_sysctl_header;
459 466
460static ctl_table nf_ct_sysctl_table[] = { 467static ctl_table nf_ct_sysctl_table[] = {
@@ -483,6 +490,14 @@ static ctl_table nf_ct_sysctl_table[] = {
483 .proc_handler = &proc_dointvec, 490 .proc_handler = &proc_dointvec,
484 }, 491 },
485 { 492 {
493 .ctl_name = NET_NF_CONNTRACK_CHECKSUM,
494 .procname = "nf_conntrack_checksum",
495 .data = &nf_conntrack_checksum,
496 .maxlen = sizeof(unsigned int),
497 .mode = 0644,
498 .proc_handler = &proc_dointvec,
499 },
500 {
486 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, 501 .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
487 .procname = "nf_conntrack_tcp_timeout_syn_sent", 502 .procname = "nf_conntrack_tcp_timeout_syn_sent",
488 .data = &nf_ct_tcp_timeout_syn_sent, 503 .data = &nf_ct_tcp_timeout_syn_sent,
@@ -851,6 +866,7 @@ EXPORT_SYMBOL(nf_ct_proto_put);
851EXPORT_SYMBOL(nf_ct_l3proto_find_get); 866EXPORT_SYMBOL(nf_ct_l3proto_find_get);
852EXPORT_SYMBOL(nf_ct_l3proto_put); 867EXPORT_SYMBOL(nf_ct_l3proto_put);
853EXPORT_SYMBOL(nf_ct_l3protos); 868EXPORT_SYMBOL(nf_ct_l3protos);
869EXPORT_SYMBOL_GPL(nf_conntrack_checksum);
854EXPORT_SYMBOL(nf_conntrack_expect_alloc); 870EXPORT_SYMBOL(nf_conntrack_expect_alloc);
855EXPORT_SYMBOL(nf_conntrack_expect_put); 871EXPORT_SYMBOL(nf_conntrack_expect_put);
856EXPORT_SYMBOL(nf_conntrack_expect_related); 872EXPORT_SYMBOL(nf_conntrack_expect_related);
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
new file mode 100644
index 000000000000..8c011e020769
--- /dev/null
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -0,0 +1,155 @@
1/*
2 * This module is used to copy security markings from packets
3 * to connections, and restore security markings from connections
4 * back to packets. This would normally be performed in conjunction
5 * with the SECMARK target and state match.
6 *
7 * Based somewhat on CONNMARK:
8 * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
9 * by Henrik Nordstrom <hno@marasystems.com>
10 *
11 * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License version 2 as
15 * published by the Free Software Foundation.
16 *
17 */
18#include <linux/module.h>
19#include <linux/skbuff.h>
20#include <linux/netfilter/x_tables.h>
21#include <linux/netfilter/xt_CONNSECMARK.h>
22#include <net/netfilter/nf_conntrack_compat.h>
23
24#define PFX "CONNSECMARK: "
25
26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
28MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module");
29MODULE_ALIAS("ipt_CONNSECMARK");
30MODULE_ALIAS("ip6t_CONNSECMARK");
31
32/*
33 * If the packet has a security mark and the connection does not, copy
34 * the security mark from the packet to the connection.
35 */
36static void secmark_save(struct sk_buff *skb)
37{
38 if (skb->secmark) {
39 u32 *connsecmark;
40 enum ip_conntrack_info ctinfo;
41
42 connsecmark = nf_ct_get_secmark(skb, &ctinfo);
43 if (connsecmark && !*connsecmark)
44 if (*connsecmark != skb->secmark)
45 *connsecmark = skb->secmark;
46 }
47}
48
49/*
50 * If packet has no security mark, and the connection does, restore the
51 * security mark from the connection to the packet.
52 */
53static void secmark_restore(struct sk_buff *skb)
54{
55 if (!skb->secmark) {
56 u32 *connsecmark;
57 enum ip_conntrack_info ctinfo;
58
59 connsecmark = nf_ct_get_secmark(skb, &ctinfo);
60 if (connsecmark && *connsecmark)
61 if (skb->secmark != *connsecmark)
62 skb->secmark = *connsecmark;
63 }
64}
65
66static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
67 const struct net_device *out, unsigned int hooknum,
68 const struct xt_target *target,
69 const void *targinfo, void *userinfo)
70{
71 struct sk_buff *skb = *pskb;
72 const struct xt_connsecmark_target_info *info = targinfo;
73
74 switch (info->mode) {
75 case CONNSECMARK_SAVE:
76 secmark_save(skb);
77 break;
78
79 case CONNSECMARK_RESTORE:
80 secmark_restore(skb);
81 break;
82
83 default:
84 BUG();
85 }
86
87 return XT_CONTINUE;
88}
89
90static int checkentry(const char *tablename, const void *entry,
91 const struct xt_target *target, void *targinfo,
92 unsigned int targinfosize, unsigned int hook_mask)
93{
94 struct xt_connsecmark_target_info *info = targinfo;
95
96 switch (info->mode) {
97 case CONNSECMARK_SAVE:
98 case CONNSECMARK_RESTORE:
99 break;
100
101 default:
102 printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
103 return 0;
104 }
105
106 return 1;
107}
108
109static struct xt_target ipt_connsecmark_reg = {
110 .name = "CONNSECMARK",
111 .target = target,
112 .targetsize = sizeof(struct xt_connsecmark_target_info),
113 .table = "mangle",
114 .checkentry = checkentry,
115 .me = THIS_MODULE,
116 .family = AF_INET,
117 .revision = 0,
118};
119
120static struct xt_target ip6t_connsecmark_reg = {
121 .name = "CONNSECMARK",
122 .target = target,
123 .targetsize = sizeof(struct xt_connsecmark_target_info),
124 .table = "mangle",
125 .checkentry = checkentry,
126 .me = THIS_MODULE,
127 .family = AF_INET6,
128 .revision = 0,
129};
130
131static int __init xt_connsecmark_init(void)
132{
133 int err;
134
135 need_conntrack();
136
137 err = xt_register_target(&ipt_connsecmark_reg);
138 if (err)
139 return err;
140
141 err = xt_register_target(&ip6t_connsecmark_reg);
142 if (err)
143 xt_unregister_target(&ipt_connsecmark_reg);
144
145 return err;
146}
147
148static void __exit xt_connsecmark_fini(void)
149{
150 xt_unregister_target(&ip6t_connsecmark_reg);
151 xt_unregister_target(&ipt_connsecmark_reg);
152}
153
154module_init(xt_connsecmark_init);
155module_exit(xt_connsecmark_fini);
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
new file mode 100644
index 000000000000..c2ce9c4011cc
--- /dev/null
+++ b/net/netfilter/xt_SECMARK.c
@@ -0,0 +1,156 @@
1/*
2 * Module for modifying the secmark field of the skb, for use by
3 * security subsystems.
4 *
5 * Based on the nfmark match by:
6 * (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
7 *
8 * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 */
15#include <linux/module.h>
16#include <linux/skbuff.h>
17#include <linux/selinux.h>
18#include <linux/netfilter/x_tables.h>
19#include <linux/netfilter/xt_SECMARK.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
23MODULE_DESCRIPTION("ip[6]tables SECMARK modification module");
24MODULE_ALIAS("ipt_SECMARK");
25MODULE_ALIAS("ip6t_SECMARK");
26
27#define PFX "SECMARK: "
28
29static u8 mode;
30
31static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
32 const struct net_device *out, unsigned int hooknum,
33 const struct xt_target *target,
34 const void *targinfo, void *userinfo)
35{
36 u32 secmark = 0;
37 const struct xt_secmark_target_info *info = targinfo;
38
39 BUG_ON(info->mode != mode);
40
41 switch (mode) {
42 case SECMARK_MODE_SEL:
43 secmark = info->u.sel.selsid;
44 break;
45
46 default:
47 BUG();
48 }
49
50 if ((*pskb)->secmark != secmark)
51 (*pskb)->secmark = secmark;
52
53 return XT_CONTINUE;
54}
55
56static int checkentry_selinux(struct xt_secmark_target_info *info)
57{
58 int err;
59 struct xt_secmark_target_selinux_info *sel = &info->u.sel;
60
61 err = selinux_string_to_sid(sel->selctx, &sel->selsid);
62 if (err) {
63 if (err == -EINVAL)
64 printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n",
65 sel->selctx);
66 return 0;
67 }
68
69 if (!sel->selsid) {
70 printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n",
71 sel->selctx);
72 return 0;
73 }
74
75 err = selinux_relabel_packet_permission(sel->selsid);
76 if (err) {
77 printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
78 return 0;
79 }
80
81 return 1;
82}
83
84static int checkentry(const char *tablename, const void *entry,
85 const struct xt_target *target, void *targinfo,
86 unsigned int targinfosize, unsigned int hook_mask)
87{
88 struct xt_secmark_target_info *info = targinfo;
89
90 if (mode && mode != info->mode) {
91 printk(KERN_INFO PFX "mode already set to %hu cannot mix with "
92 "rules for mode %hu\n", mode, info->mode);
93 return 0;
94 }
95
96 switch (info->mode) {
97 case SECMARK_MODE_SEL:
98 if (!checkentry_selinux(info))
99 return 0;
100 break;
101
102 default:
103 printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
104 return 0;
105 }
106
107 if (!mode)
108 mode = info->mode;
109 return 1;
110}
111
112static struct xt_target ipt_secmark_reg = {
113 .name = "SECMARK",
114 .target = target,
115 .targetsize = sizeof(struct xt_secmark_target_info),
116 .table = "mangle",
117 .checkentry = checkentry,
118 .me = THIS_MODULE,
119 .family = AF_INET,
120 .revision = 0,
121};
122
123static struct xt_target ip6t_secmark_reg = {
124 .name = "SECMARK",
125 .target = target,
126 .targetsize = sizeof(struct xt_secmark_target_info),
127 .table = "mangle",
128 .checkentry = checkentry,
129 .me = THIS_MODULE,
130 .family = AF_INET6,
131 .revision = 0,
132};
133
134static int __init xt_secmark_init(void)
135{
136 int err;
137
138 err = xt_register_target(&ipt_secmark_reg);
139 if (err)
140 return err;
141
142 err = xt_register_target(&ip6t_secmark_reg);
143 if (err)
144 xt_unregister_target(&ipt_secmark_reg);
145
146 return err;
147}
148
149static void __exit xt_secmark_fini(void)
150{
151 xt_unregister_target(&ip6t_secmark_reg);
152 xt_unregister_target(&ipt_secmark_reg);
153}
154
155module_init(xt_secmark_init);
156module_exit(xt_secmark_fini);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index dc26a27cbcaf..56324c8aff0a 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -58,7 +58,7 @@ checkentry(const char *tablename,
58 unsigned int matchsize, 58 unsigned int matchsize,
59 unsigned int hook_mask) 59 unsigned int hook_mask)
60{ 60{
61 struct xt_connmark_info *cm = (struct xt_connmark_info *)matchinfo; 61 struct xt_connmark_info *cm = matchinfo;
62 62
63 if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { 63 if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
64 printk(KERN_WARNING "connmark: only support 32bit mark\n"); 64 printk(KERN_WARNING "connmark: only support 32bit mark\n");
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index dfb10b648e57..2e2f825dad4c 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -101,8 +101,7 @@ match(const struct sk_buff *skb,
101 unsigned int protoff, 101 unsigned int protoff,
102 int *hotdrop) 102 int *hotdrop)
103{ 103{
104 const struct xt_dccp_info *info = 104 const struct xt_dccp_info *info = matchinfo;
105 (const struct xt_dccp_info *)matchinfo;
106 struct dccp_hdr _dh, *dh; 105 struct dccp_hdr _dh, *dh;
107 106
108 if (offset) 107 if (offset)
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 8b385a34886d..876bc5797738 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -42,7 +42,7 @@ checkentry(const char *tablename,
42 unsigned int matchsize, 42 unsigned int matchsize,
43 unsigned int hook_mask) 43 unsigned int hook_mask)
44{ 44{
45 struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo; 45 const struct xt_mark_info *minfo = matchinfo;
46 46
47 if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { 47 if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
48 printk(KERN_WARNING "mark: only supports 32bit mark\n"); 48 printk(KERN_WARNING "mark: only supports 32bit mark\n");
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index b56cd2baaac2..1ff0a25396e7 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -1,4 +1,4 @@
1/* Kernel module to match one of a list of TCP/UDP ports: ports are in 1/* Kernel module to match one of a list of TCP/UDP/SCTP/DCCP ports: ports are in
2 the same place so we can treat them as equal. */ 2 the same place so we can treat them as equal. */
3 3
4/* (C) 1999-2001 Paul `Rusty' Russell 4/* (C) 1999-2001 Paul `Rusty' Russell
@@ -160,8 +160,9 @@ check(u_int16_t proto,
160 u_int8_t match_flags, 160 u_int8_t match_flags,
161 u_int8_t count) 161 u_int8_t count)
162{ 162{
163 /* Must specify proto == TCP/UDP, no unknown flags or bad count */ 163 /* Must specify supported protocol, no unknown flags or bad count */
164 return (proto == IPPROTO_TCP || proto == IPPROTO_UDP) 164 return (proto == IPPROTO_TCP || proto == IPPROTO_UDP
165 || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP)
165 && !(ip_invflags & XT_INV_PROTO) 166 && !(ip_invflags & XT_INV_PROTO)
166 && (match_flags == XT_MULTIPORT_SOURCE 167 && (match_flags == XT_MULTIPORT_SOURCE
167 || match_flags == XT_MULTIPORT_DESTINATION 168 || match_flags == XT_MULTIPORT_DESTINATION
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
new file mode 100644
index 000000000000..4cdba7469dc4
--- /dev/null
+++ b/net/netfilter/xt_quota.c
@@ -0,0 +1,96 @@
1/*
2 * netfilter module to enforce network quotas
3 *
4 * Sam Johnston <samj@samj.net>
5 */
6#include <linux/skbuff.h>
7#include <linux/spinlock.h>
8
9#include <linux/netfilter/x_tables.h>
10#include <linux/netfilter/xt_quota.h>
11
12MODULE_LICENSE("GPL");
13MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
14
15static DEFINE_SPINLOCK(quota_lock);
16
17static int
18match(const struct sk_buff *skb,
19 const struct net_device *in, const struct net_device *out,
20 const struct xt_match *match, const void *matchinfo,
21 int offset, unsigned int protoff, int *hotdrop)
22{
23 struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master;
24 int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0;
25
26 spin_lock_bh(&quota_lock);
27 if (q->quota >= skb->len) {
28 q->quota -= skb->len;
29 ret ^= 1;
30 } else {
31 /* we do not allow even small packets from now on */
32 q->quota = 0;
33 }
34 spin_unlock_bh(&quota_lock);
35
36 return ret;
37}
38
39static int
40checkentry(const char *tablename, const void *entry,
41 const struct xt_match *match, void *matchinfo,
42 unsigned int matchsize, unsigned int hook_mask)
43{
44 struct xt_quota_info *q = (struct xt_quota_info *)matchinfo;
45
46 if (q->flags & ~XT_QUOTA_MASK)
47 return 0;
48 /* For SMP, we only want to use one set of counters. */
49 q->master = q;
50 return 1;
51}
52
53static struct xt_match quota_match = {
54 .name = "quota",
55 .family = AF_INET,
56 .match = match,
57 .matchsize = sizeof(struct xt_quota_info),
58 .checkentry = checkentry,
59 .me = THIS_MODULE
60};
61
62static struct xt_match quota_match6 = {
63 .name = "quota",
64 .family = AF_INET6,
65 .match = match,
66 .matchsize = sizeof(struct xt_quota_info),
67 .checkentry = checkentry,
68 .me = THIS_MODULE
69};
70
71static int __init xt_quota_init(void)
72{
73 int ret;
74
75 ret = xt_register_match(&quota_match);
76 if (ret)
77 goto err1;
78 ret = xt_register_match(&quota_match6);
79 if (ret)
80 goto err2;
81 return ret;
82
83err2:
84 xt_unregister_match(&quota_match);
85err1:
86 return ret;
87}
88
89static void __exit xt_quota_fini(void)
90{
91 xt_unregister_match(&quota_match6);
92 xt_unregister_match(&quota_match);
93}
94
95module_init(xt_quota_init);
96module_exit(xt_quota_fini);
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index 34bd87259a09..b5110e5b54b0 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -129,11 +129,9 @@ match(const struct sk_buff *skb,
129 unsigned int protoff, 129 unsigned int protoff,
130 int *hotdrop) 130 int *hotdrop)
131{ 131{
132 const struct xt_sctp_info *info; 132 const struct xt_sctp_info *info = matchinfo;
133 sctp_sctphdr_t _sh, *sh; 133 sctp_sctphdr_t _sh, *sh;
134 134
135 info = (const struct xt_sctp_info *)matchinfo;
136
137 if (offset) { 135 if (offset) {
138 duprintf("Dropping non-first fragment.. FIXME\n"); 136 duprintf("Dropping non-first fragment.. FIXME\n");
139 return 0; 137 return 0;
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
new file mode 100644
index 000000000000..de1037f58596
--- /dev/null
+++ b/net/netfilter/xt_statistic.c
@@ -0,0 +1,112 @@
1/*
2 * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>.
9 */
10
11#include <linux/init.h>
12#include <linux/spinlock.h>
13#include <linux/skbuff.h>
14#include <linux/net.h>
15
16#include <linux/netfilter/xt_statistic.h>
17#include <linux/netfilter/x_tables.h>
18
19MODULE_LICENSE("GPL");
20MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
21MODULE_DESCRIPTION("xtables statistical match module");
22MODULE_ALIAS("ipt_statistic");
23MODULE_ALIAS("ip6t_statistic");
24
25static DEFINE_SPINLOCK(nth_lock);
26
27static int
28match(const struct sk_buff *skb,
29 const struct net_device *in, const struct net_device *out,
30 const struct xt_match *match, const void *matchinfo,
31 int offset, unsigned int protoff, int *hotdrop)
32{
33 struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
34 int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0;
35
36 switch (info->mode) {
37 case XT_STATISTIC_MODE_RANDOM:
38 if ((net_random() & 0x7FFFFFFF) < info->u.random.probability)
39 ret ^= 1;
40 break;
41 case XT_STATISTIC_MODE_NTH:
42 info = info->master;
43 spin_lock_bh(&nth_lock);
44 if (info->u.nth.count++ == info->u.nth.every) {
45 info->u.nth.count = 0;
46 ret ^= 1;
47 }
48 spin_unlock_bh(&nth_lock);
49 break;
50 }
51
52 return ret;
53}
54
55static int
56checkentry(const char *tablename, const void *entry,
57 const struct xt_match *match, void *matchinfo,
58 unsigned int matchsize, unsigned int hook_mask)
59{
60 struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
61
62 if (info->mode > XT_STATISTIC_MODE_MAX ||
63 info->flags & ~XT_STATISTIC_MASK)
64 return 0;
65 info->master = info;
66 return 1;
67}
68
69static struct xt_match statistic_match = {
70 .name = "statistic",
71 .match = match,
72 .matchsize = sizeof(struct xt_statistic_info),
73 .checkentry = checkentry,
74 .family = AF_INET,
75 .me = THIS_MODULE,
76};
77
78static struct xt_match statistic_match6 = {
79 .name = "statistic",
80 .match = match,
81 .matchsize = sizeof(struct xt_statistic_info),
82 .checkentry = checkentry,
83 .family = AF_INET6,
84 .me = THIS_MODULE,
85};
86
87static int __init xt_statistic_init(void)
88{
89 int ret;
90
91 ret = xt_register_match(&statistic_match);
92 if (ret)
93 goto err1;
94
95 ret = xt_register_match(&statistic_match6);
96 if (ret)
97 goto err2;
98 return ret;
99err2:
100 xt_unregister_match(&statistic_match);
101err1:
102 return ret;
103}
104
105static void __exit xt_statistic_fini(void)
106{
107 xt_unregister_match(&statistic_match6);
108 xt_unregister_match(&statistic_match);
109}
110
111module_init(xt_statistic_init);
112module_exit(xt_statistic_fini);
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 79d9ea6964ba..0ebb6ac2c8c7 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -30,8 +30,8 @@ static int match(const struct sk_buff *skb,
30 unsigned int protoff, 30 unsigned int protoff,
31 int *hotdrop) 31 int *hotdrop)
32{ 32{
33 const struct xt_string_info *conf = matchinfo;
33 struct ts_state state; 34 struct ts_state state;
34 struct xt_string_info *conf = (struct xt_string_info *) matchinfo;
35 35
36 memset(&state, 0, sizeof(struct ts_state)); 36 memset(&state, 0, sizeof(struct ts_state));
37 37
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 138ea92ed268..b1e4c5e20ac7 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -72,9 +72,9 @@ void qdisc_unlock_tree(struct net_device *dev)
72 dev->queue_lock serializes queue accesses for this device 72 dev->queue_lock serializes queue accesses for this device
73 AND dev->qdisc pointer itself. 73 AND dev->qdisc pointer itself.
74 74
75 dev->xmit_lock serializes accesses to device driver. 75 netif_tx_lock serializes accesses to device driver.
76 76
77 dev->queue_lock and dev->xmit_lock are mutually exclusive, 77 dev->queue_lock and netif_tx_lock are mutually exclusive,
78 if one is grabbed, another must be free. 78 if one is grabbed, another must be free.
79 */ 79 */
80 80
@@ -108,7 +108,7 @@ int qdisc_restart(struct net_device *dev)
108 * will be requeued. 108 * will be requeued.
109 */ 109 */
110 if (!nolock) { 110 if (!nolock) {
111 if (!spin_trylock(&dev->xmit_lock)) { 111 if (!netif_tx_trylock(dev)) {
112 collision: 112 collision:
113 /* So, someone grabbed the driver. */ 113 /* So, someone grabbed the driver. */
114 114
@@ -126,8 +126,6 @@ int qdisc_restart(struct net_device *dev)
126 __get_cpu_var(netdev_rx_stat).cpu_collision++; 126 __get_cpu_var(netdev_rx_stat).cpu_collision++;
127 goto requeue; 127 goto requeue;
128 } 128 }
129 /* Remember that the driver is grabbed by us. */
130 dev->xmit_lock_owner = smp_processor_id();
131 } 129 }
132 130
133 { 131 {
@@ -142,8 +140,7 @@ int qdisc_restart(struct net_device *dev)
142 ret = dev->hard_start_xmit(skb, dev); 140 ret = dev->hard_start_xmit(skb, dev);
143 if (ret == NETDEV_TX_OK) { 141 if (ret == NETDEV_TX_OK) {
144 if (!nolock) { 142 if (!nolock) {
145 dev->xmit_lock_owner = -1; 143 netif_tx_unlock(dev);
146 spin_unlock(&dev->xmit_lock);
147 } 144 }
148 spin_lock(&dev->queue_lock); 145 spin_lock(&dev->queue_lock);
149 return -1; 146 return -1;
@@ -157,8 +154,7 @@ int qdisc_restart(struct net_device *dev)
157 /* NETDEV_TX_BUSY - we need to requeue */ 154 /* NETDEV_TX_BUSY - we need to requeue */
158 /* Release the driver */ 155 /* Release the driver */
159 if (!nolock) { 156 if (!nolock) {
160 dev->xmit_lock_owner = -1; 157 netif_tx_unlock(dev);
161 spin_unlock(&dev->xmit_lock);
162 } 158 }
163 spin_lock(&dev->queue_lock); 159 spin_lock(&dev->queue_lock);
164 q = dev->qdisc; 160 q = dev->qdisc;
@@ -187,7 +183,7 @@ static void dev_watchdog(unsigned long arg)
187{ 183{
188 struct net_device *dev = (struct net_device *)arg; 184 struct net_device *dev = (struct net_device *)arg;
189 185
190 spin_lock(&dev->xmit_lock); 186 netif_tx_lock(dev);
191 if (dev->qdisc != &noop_qdisc) { 187 if (dev->qdisc != &noop_qdisc) {
192 if (netif_device_present(dev) && 188 if (netif_device_present(dev) &&
193 netif_running(dev) && 189 netif_running(dev) &&
@@ -203,7 +199,7 @@ static void dev_watchdog(unsigned long arg)
203 dev_hold(dev); 199 dev_hold(dev);
204 } 200 }
205 } 201 }
206 spin_unlock(&dev->xmit_lock); 202 netif_tx_unlock(dev);
207 203
208 dev_put(dev); 204 dev_put(dev);
209} 205}
@@ -227,17 +223,17 @@ void __netdev_watchdog_up(struct net_device *dev)
227 223
228static void dev_watchdog_up(struct net_device *dev) 224static void dev_watchdog_up(struct net_device *dev)
229{ 225{
230 spin_lock_bh(&dev->xmit_lock); 226 netif_tx_lock_bh(dev);
231 __netdev_watchdog_up(dev); 227 __netdev_watchdog_up(dev);
232 spin_unlock_bh(&dev->xmit_lock); 228 netif_tx_unlock_bh(dev);
233} 229}
234 230
235static void dev_watchdog_down(struct net_device *dev) 231static void dev_watchdog_down(struct net_device *dev)
236{ 232{
237 spin_lock_bh(&dev->xmit_lock); 233 netif_tx_lock_bh(dev);
238 if (del_timer(&dev->watchdog_timer)) 234 if (del_timer(&dev->watchdog_timer))
239 dev_put(dev); 235 dev_put(dev);
240 spin_unlock_bh(&dev->xmit_lock); 236 netif_tx_unlock_bh(dev);
241} 237}
242 238
243void netif_carrier_on(struct net_device *dev) 239void netif_carrier_on(struct net_device *dev)
@@ -582,7 +578,7 @@ void dev_deactivate(struct net_device *dev)
582 while (test_bit(__LINK_STATE_SCHED, &dev->state)) 578 while (test_bit(__LINK_STATE_SCHED, &dev->state))
583 yield(); 579 yield();
584 580
585 spin_unlock_wait(&dev->xmit_lock); 581 spin_unlock_wait(&dev->_xmit_lock);
586} 582}
587 583
588void dev_init_scheduler(struct net_device *dev) 584void dev_init_scheduler(struct net_device *dev)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 79b8ef34c6e4..4c16ad57a3e4 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -302,20 +302,17 @@ restart:
302 302
303 switch (teql_resolve(skb, skb_res, slave)) { 303 switch (teql_resolve(skb, skb_res, slave)) {
304 case 0: 304 case 0:
305 if (spin_trylock(&slave->xmit_lock)) { 305 if (netif_tx_trylock(slave)) {
306 slave->xmit_lock_owner = smp_processor_id();
307 if (!netif_queue_stopped(slave) && 306 if (!netif_queue_stopped(slave) &&
308 slave->hard_start_xmit(skb, slave) == 0) { 307 slave->hard_start_xmit(skb, slave) == 0) {
309 slave->xmit_lock_owner = -1; 308 netif_tx_unlock(slave);
310 spin_unlock(&slave->xmit_lock);
311 master->slaves = NEXT_SLAVE(q); 309 master->slaves = NEXT_SLAVE(q);
312 netif_wake_queue(dev); 310 netif_wake_queue(dev);
313 master->stats.tx_packets++; 311 master->stats.tx_packets++;
314 master->stats.tx_bytes += len; 312 master->stats.tx_bytes += len;
315 return 0; 313 return 0;
316 } 314 }
317 slave->xmit_lock_owner = -1; 315 netif_tx_unlock(slave);
318 spin_unlock(&slave->xmit_lock);
319 } 316 }
320 if (netif_queue_stopped(dev)) 317 if (netif_queue_stopped(dev))
321 busy = 1; 318 busy = 1;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 1662f9cc869e..42b66e74bbb5 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -141,7 +141,8 @@ int sctp_rcv(struct sk_buff *skb)
141 __skb_pull(skb, skb->h.raw - skb->data); 141 __skb_pull(skb, skb->h.raw - skb->data);
142 if (skb->len < sizeof(struct sctphdr)) 142 if (skb->len < sizeof(struct sctphdr))
143 goto discard_it; 143 goto discard_it;
144 if (sctp_rcv_checksum(skb) < 0) 144 if ((skb->ip_summed != CHECKSUM_UNNECESSARY) &&
145 (sctp_rcv_checksum(skb) < 0))
145 goto discard_it; 146 goto discard_it;
146 147
147 skb_pull(skb, sizeof(struct sctphdr)); 148 skb_pull(skb, sizeof(struct sctphdr));
@@ -170,7 +171,8 @@ int sctp_rcv(struct sk_buff *skb)
170 * IP broadcast addresses cannot be used in an SCTP transport 171 * IP broadcast addresses cannot be used in an SCTP transport
171 * address." 172 * address."
172 */ 173 */
173 if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL)) 174 if (!af->addr_valid(&src, NULL, skb) ||
175 !af->addr_valid(&dest, NULL, skb))
174 goto discard_it; 176 goto discard_it;
175 177
176 asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); 178 asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c20d282fac06..8ef08070c8b6 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -523,7 +523,9 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
523 * Return 0 - If the address is a non-unicast or an illegal address. 523 * Return 0 - If the address is a non-unicast or an illegal address.
524 * Return 1 - If the address is a unicast. 524 * Return 1 - If the address is a unicast.
525 */ 525 */
526static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) 526static int sctp_v6_addr_valid(union sctp_addr *addr,
527 struct sctp_sock *sp,
528 const struct sk_buff *skb)
527{ 529{
528 int ret = ipv6_addr_type(&addr->v6.sin6_addr); 530 int ret = ipv6_addr_type(&addr->v6.sin6_addr);
529 531
@@ -537,7 +539,7 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
537 if (sp && ipv6_only_sock(sctp_opt2sk(sp))) 539 if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
538 return 0; 540 return 0;
539 sctp_v6_map_v4(addr); 541 sctp_v6_map_v4(addr);
540 return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp); 542 return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb);
541 } 543 }
542 544
543 /* Is this a non-unicast address */ 545 /* Is this a non-unicast address */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 437cba7260a4..cdc5a3936766 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -295,14 +295,14 @@ int sctp_packet_transmit(struct sctp_packet *packet)
295 struct sctp_transport *tp = packet->transport; 295 struct sctp_transport *tp = packet->transport;
296 struct sctp_association *asoc = tp->asoc; 296 struct sctp_association *asoc = tp->asoc;
297 struct sctphdr *sh; 297 struct sctphdr *sh;
298 __u32 crc32; 298 __u32 crc32 = 0;
299 struct sk_buff *nskb; 299 struct sk_buff *nskb;
300 struct sctp_chunk *chunk, *tmp; 300 struct sctp_chunk *chunk, *tmp;
301 struct sock *sk; 301 struct sock *sk;
302 int err = 0; 302 int err = 0;
303 int padding; /* How much padding do we need? */ 303 int padding; /* How much padding do we need? */
304 __u8 has_data = 0; 304 __u8 has_data = 0;
305 struct dst_entry *dst; 305 struct dst_entry *dst = tp->dst;
306 306
307 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 307 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
308 308
@@ -327,6 +327,19 @@ int sctp_packet_transmit(struct sctp_packet *packet)
327 */ 327 */
328 skb_set_owner_w(nskb, sk); 328 skb_set_owner_w(nskb, sk);
329 329
330 /* The 'obsolete' field of dst is set to 2 when a dst is freed. */
331 if (!dst || (dst->obsolete > 1)) {
332 dst_release(dst);
333 sctp_transport_route(tp, NULL, sctp_sk(sk));
334 if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
335 sctp_assoc_sync_pmtu(asoc);
336 }
337 }
338 nskb->dst = dst_clone(tp->dst);
339 if (!nskb->dst)
340 goto no_route;
341 dst = nskb->dst;
342
330 /* Build the SCTP header. */ 343 /* Build the SCTP header. */
331 sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); 344 sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr));
332 sh->source = htons(packet->source_port); 345 sh->source = htons(packet->source_port);
@@ -350,7 +363,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
350 * Note: Adler-32 is no longer applicable, as has been replaced 363 * Note: Adler-32 is no longer applicable, as has been replaced
351 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>. 364 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
352 */ 365 */
353 crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); 366 if (!(dst->dev->features & NETIF_F_NO_CSUM))
367 crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr));
354 368
355 /** 369 /**
356 * 6.10 Bundling 370 * 6.10 Bundling
@@ -402,9 +416,14 @@ int sctp_packet_transmit(struct sctp_packet *packet)
402 if (padding) 416 if (padding)
403 memset(skb_put(chunk->skb, padding), 0, padding); 417 memset(skb_put(chunk->skb, padding), 0, padding);
404 418
405 crc32 = sctp_update_copy_cksum(skb_put(nskb, chunk->skb->len), 419 if (dst->dev->features & NETIF_F_NO_CSUM)
406 chunk->skb->data, 420 memcpy(skb_put(nskb, chunk->skb->len),
407 chunk->skb->len, crc32); 421 chunk->skb->data, chunk->skb->len);
422 else
423 crc32 = sctp_update_copy_cksum(skb_put(nskb,
424 chunk->skb->len),
425 chunk->skb->data,
426 chunk->skb->len, crc32);
408 427
409 SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n", 428 SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n",
410 "*** Chunk", chunk, 429 "*** Chunk", chunk,
@@ -427,7 +446,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
427 } 446 }
428 447
429 /* Perform final transformation on checksum. */ 448 /* Perform final transformation on checksum. */
430 crc32 = sctp_end_cksum(crc32); 449 if (!(dst->dev->features & NETIF_F_NO_CSUM))
450 crc32 = sctp_end_cksum(crc32);
431 451
432 /* 3) Put the resultant value into the checksum field in the 452 /* 3) Put the resultant value into the checksum field in the
433 * common header, and leave the rest of the bits unchanged. 453 * common header, and leave the rest of the bits unchanged.
@@ -477,20 +497,6 @@ int sctp_packet_transmit(struct sctp_packet *packet)
477 } 497 }
478 } 498 }
479 499
480 dst = tp->dst;
481 /* The 'obsolete' field of dst is set to 2 when a dst is freed. */
482 if (!dst || (dst->obsolete > 1)) {
483 dst_release(dst);
484 sctp_transport_route(tp, NULL, sctp_sk(sk));
485 if (asoc->param_flags & SPP_PMTUD_ENABLE) {
486 sctp_assoc_sync_pmtu(asoc);
487 }
488 }
489
490 nskb->dst = dst_clone(tp->dst);
491 if (!nskb->dst)
492 goto no_route;
493
494 SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", 500 SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
495 nskb->len); 501 nskb->len);
496 502
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f148f9576dd2..e5faa351aaad 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1262,6 +1262,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1262 if (!tchunk->tsn_gap_acked && 1262 if (!tchunk->tsn_gap_acked &&
1263 !tchunk->resent && 1263 !tchunk->resent &&
1264 tchunk->rtt_in_progress) { 1264 tchunk->rtt_in_progress) {
1265 tchunk->rtt_in_progress = 0;
1265 rtt = jiffies - tchunk->sent_at; 1266 rtt = jiffies - tchunk->sent_at;
1266 sctp_transport_update_rto(transport, 1267 sctp_transport_update_rto(transport,
1267 rtt); 1268 rtt);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2088aa992b7a..816c033d7886 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -365,12 +365,18 @@ static int sctp_v4_is_any(const union sctp_addr *addr)
365 * Return 0 - If the address is a non-unicast or an illegal address. 365 * Return 0 - If the address is a non-unicast or an illegal address.
366 * Return 1 - If the address is a unicast. 366 * Return 1 - If the address is a unicast.
367 */ 367 */
368static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) 368static int sctp_v4_addr_valid(union sctp_addr *addr,
369 struct sctp_sock *sp,
370 const struct sk_buff *skb)
369{ 371{
370 /* Is this a non-unicast address or a unusable SCTP address? */ 372 /* Is this a non-unicast address or a unusable SCTP address? */
371 if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) 373 if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr))
372 return 0; 374 return 0;
373 375
376 /* Is this a broadcast address? */
377 if (skb && ((struct rtable *)skb->dst)->rt_flags & RTCF_BROADCAST)
378 return 0;
379
374 return 1; 380 return 1;
375} 381}
376 382
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8bc279219a72..9e58144f4851 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5293,10 +5293,18 @@ static int sctp_eat_data(const struct sctp_association *asoc,
5293 * seems a bit troublesome in that frag_point varies based on 5293 * seems a bit troublesome in that frag_point varies based on
5294 * PMTU. In cases, such as loopback, this might be a rather 5294 * PMTU. In cases, such as loopback, this might be a rather
5295 * large spill over. 5295 * large spill over.
5296 * NOTE: If we have a full receive buffer here, we only renege if
5297 * our receiver can still make progress without the tsn being
5298 * received. We do this because in the event that the associations
5299 * receive queue is empty we are filling a leading gap, and since
5300 * reneging moves the gap to the end of the tsn stream, we are likely
5301 * to stall again very shortly. Avoiding the renege when we fill a
5302 * leading gap is a good heuristic for avoiding such steady state
5303 * stalls.
5296 */ 5304 */
5297 if (!asoc->rwnd || asoc->rwnd_over || 5305 if (!asoc->rwnd || asoc->rwnd_over ||
5298 (datalen > asoc->rwnd + asoc->frag_point) || 5306 (datalen > asoc->rwnd + asoc->frag_point) ||
5299 rcvbuf_over) { 5307 (rcvbuf_over && (!skb_queue_len(&sk->sk_receive_queue)))) {
5300 5308
5301 /* If this is the next TSN, consider reneging to make 5309 /* If this is the next TSN, consider reneging to make
5302 * room. Note: Playing nice with a confused sender. A 5310 * room. Note: Playing nice with a confused sender. A
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 174d4d35e951..b811691c35bf 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -172,7 +172,7 @@ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
172 return -EINVAL; 172 return -EINVAL;
173 173
174 /* Is this a valid SCTP address? */ 174 /* Is this a valid SCTP address? */
175 if (!af->addr_valid(addr, sctp_sk(sk))) 175 if (!af->addr_valid(addr, sctp_sk(sk), NULL))
176 return -EINVAL; 176 return -EINVAL;
177 177
178 if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) 178 if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr)))
@@ -2530,8 +2530,32 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, int o
2530 2530
2531 /* Set the values to the specific association */ 2531 /* Set the values to the specific association */
2532 if (asoc) { 2532 if (asoc) {
2533 if (assocparams.sasoc_asocmaxrxt != 0) 2533 if (assocparams.sasoc_asocmaxrxt != 0) {
2534 __u32 path_sum = 0;
2535 int paths = 0;
2536 struct list_head *pos;
2537 struct sctp_transport *peer_addr;
2538
2539 list_for_each(pos, &asoc->peer.transport_addr_list) {
2540 peer_addr = list_entry(pos,
2541 struct sctp_transport,
2542 transports);
2543 path_sum += peer_addr->pathmaxrxt;
2544 paths++;
2545 }
2546
2547 /* Only validate asocmaxrxt if we have more then
2548 * one path/transport. We do this because path
2549 * retransmissions are only counted when we have more
2550 * then one path.
2551 */
2552 if (paths > 1 &&
2553 assocparams.sasoc_asocmaxrxt > path_sum)
2554 return -EINVAL;
2555
2534 asoc->max_retrans = assocparams.sasoc_asocmaxrxt; 2556 asoc->max_retrans = assocparams.sasoc_asocmaxrxt;
2557 }
2558
2535 if (assocparams.sasoc_cookie_life != 0) { 2559 if (assocparams.sasoc_cookie_life != 0) {
2536 asoc->cookie_life.tv_sec = 2560 asoc->cookie_life.tv_sec =
2537 assocparams.sasoc_cookie_life / 1000; 2561 assocparams.sasoc_cookie_life / 1000;
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index ba97f974f57c..ee236784a6bb 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -51,6 +51,8 @@
51static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, 51static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
52 struct sctp_association *asoc); 52 struct sctp_association *asoc);
53static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); 53static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
54static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event);
55
54 56
55/* Initialize an ULP event from an given skb. */ 57/* Initialize an ULP event from an given skb. */
56SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) 58SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
@@ -883,6 +885,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
883static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) 885static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
884{ 886{
885 struct sk_buff *skb, *frag; 887 struct sk_buff *skb, *frag;
888 unsigned int len;
886 889
887 /* Current stack structures assume that the rcv buffer is 890 /* Current stack structures assume that the rcv buffer is
888 * per socket. For UDP style sockets this is not true as 891 * per socket. For UDP style sockets this is not true as
@@ -892,7 +895,30 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
892 */ 895 */
893 896
894 skb = sctp_event2skb(event); 897 skb = sctp_event2skb(event);
895 sctp_assoc_rwnd_increase(event->asoc, skb_headlen(skb)); 898 len = skb->len;
899
900 if (!skb->data_len)
901 goto done;
902
903 /* Don't forget the fragments. */
904 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
905 /* NOTE: skb_shinfos are recursive. Although IP returns
906 * skb's with only 1 level of fragments, SCTP reassembly can
907 * increase the levels.
908 */
909 sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
910 }
911
912done:
913 sctp_assoc_rwnd_increase(event->asoc, len);
914 sctp_ulpevent_release_owner(event);
915}
916
917static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
918{
919 struct sk_buff *skb, *frag;
920
921 skb = sctp_event2skb(event);
896 922
897 if (!skb->data_len) 923 if (!skb->data_len)
898 goto done; 924 goto done;
@@ -903,7 +929,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
903 * skb's with only 1 level of fragments, SCTP reassembly can 929 * skb's with only 1 level of fragments, SCTP reassembly can
904 * increase the levels. 930 * increase the levels.
905 */ 931 */
906 sctp_ulpevent_release_data(sctp_skb2event(frag)); 932 sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
907 } 933 }
908 934
909done: 935done:
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b469c8b54613..b8936926c24b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
46 46
47static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); 47static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
48static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); 48static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
49static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family);
50static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo);
49 51
50int xfrm_register_type(struct xfrm_type *type, unsigned short family) 52int xfrm_register_type(struct xfrm_type *type, unsigned short family)
51{ 53{
52 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 54 struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
53 struct xfrm_type_map *typemap; 55 struct xfrm_type **typemap;
54 int err = 0; 56 int err = 0;
55 57
56 if (unlikely(afinfo == NULL)) 58 if (unlikely(afinfo == NULL))
57 return -EAFNOSUPPORT; 59 return -EAFNOSUPPORT;
58 typemap = afinfo->type_map; 60 typemap = afinfo->type_map;
59 61
60 write_lock_bh(&typemap->lock); 62 if (likely(typemap[type->proto] == NULL))
61 if (likely(typemap->map[type->proto] == NULL)) 63 typemap[type->proto] = type;
62 typemap->map[type->proto] = type;
63 else 64 else
64 err = -EEXIST; 65 err = -EEXIST;
65 write_unlock_bh(&typemap->lock); 66 xfrm_policy_unlock_afinfo(afinfo);
66 xfrm_policy_put_afinfo(afinfo);
67 return err; 67 return err;
68} 68}
69EXPORT_SYMBOL(xfrm_register_type); 69EXPORT_SYMBOL(xfrm_register_type);
70 70
71int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) 71int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
72{ 72{
73 struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); 73 struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
74 struct xfrm_type_map *typemap; 74 struct xfrm_type **typemap;
75 int err = 0; 75 int err = 0;
76 76
77 if (unlikely(afinfo == NULL)) 77 if (unlikely(afinfo == NULL))
78 return -EAFNOSUPPORT; 78 return -EAFNOSUPPORT;
79 typemap = afinfo->type_map; 79 typemap = afinfo->type_map;
80 80
81 write_lock_bh(&typemap->lock); 81 if (unlikely(typemap[type->proto] != type))
82 if (unlikely(typemap->map[type->proto] != type))
83 err = -ENOENT; 82 err = -ENOENT;
84 else 83 else
85 typemap->map[type->proto] = NULL; 84 typemap[type->proto] = NULL;
86 write_unlock_bh(&typemap->lock); 85 xfrm_policy_unlock_afinfo(afinfo);
87 xfrm_policy_put_afinfo(afinfo);
88 return err; 86 return err;
89} 87}
90EXPORT_SYMBOL(xfrm_unregister_type); 88EXPORT_SYMBOL(xfrm_unregister_type);
@@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type);
92struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) 90struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
93{ 91{
94 struct xfrm_policy_afinfo *afinfo; 92 struct xfrm_policy_afinfo *afinfo;
95 struct xfrm_type_map *typemap; 93 struct xfrm_type **typemap;
96 struct xfrm_type *type; 94 struct xfrm_type *type;
97 int modload_attempted = 0; 95 int modload_attempted = 0;
98 96
@@ -102,11 +100,9 @@ retry:
102 return NULL; 100 return NULL;
103 typemap = afinfo->type_map; 101 typemap = afinfo->type_map;
104 102
105 read_lock(&typemap->lock); 103 type = typemap[proto];
106 type = typemap->map[proto];
107 if (unlikely(type && !try_module_get(type->owner))) 104 if (unlikely(type && !try_module_get(type->owner)))
108 type = NULL; 105 type = NULL;
109 read_unlock(&typemap->lock);
110 if (!type && !modload_attempted) { 106 if (!type && !modload_attempted) {
111 xfrm_policy_put_afinfo(afinfo); 107 xfrm_policy_put_afinfo(afinfo);
112 request_module("xfrm-type-%d-%d", 108 request_module("xfrm-type-%d-%d",
@@ -142,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type)
142 module_put(type->owner); 138 module_put(type->owner);
143} 139}
144 140
141int xfrm_register_mode(struct xfrm_mode *mode, int family)
142{
143 struct xfrm_policy_afinfo *afinfo;
144 struct xfrm_mode **modemap;
145 int err;
146
147 if (unlikely(mode->encap >= XFRM_MODE_MAX))
148 return -EINVAL;
149
150 afinfo = xfrm_policy_lock_afinfo(family);
151 if (unlikely(afinfo == NULL))
152 return -EAFNOSUPPORT;
153
154 err = -EEXIST;
155 modemap = afinfo->mode_map;
156 if (likely(modemap[mode->encap] == NULL)) {
157 modemap[mode->encap] = mode;
158 err = 0;
159 }
160
161 xfrm_policy_unlock_afinfo(afinfo);
162 return err;
163}
164EXPORT_SYMBOL(xfrm_register_mode);
165
166int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
167{
168 struct xfrm_policy_afinfo *afinfo;
169 struct xfrm_mode **modemap;
170 int err;
171
172 if (unlikely(mode->encap >= XFRM_MODE_MAX))
173 return -EINVAL;
174
175 afinfo = xfrm_policy_lock_afinfo(family);
176 if (unlikely(afinfo == NULL))
177 return -EAFNOSUPPORT;
178
179 err = -ENOENT;
180 modemap = afinfo->mode_map;
181 if (likely(modemap[mode->encap] == mode)) {
182 modemap[mode->encap] = NULL;
183 err = 0;
184 }
185
186 xfrm_policy_unlock_afinfo(afinfo);
187 return err;
188}
189EXPORT_SYMBOL(xfrm_unregister_mode);
190
191struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
192{
193 struct xfrm_policy_afinfo *afinfo;
194 struct xfrm_mode *mode;
195 int modload_attempted = 0;
196
197 if (unlikely(encap >= XFRM_MODE_MAX))
198 return NULL;
199
200retry:
201 afinfo = xfrm_policy_get_afinfo(family);
202 if (unlikely(afinfo == NULL))
203 return NULL;
204
205 mode = afinfo->mode_map[encap];
206 if (unlikely(mode && !try_module_get(mode->owner)))
207 mode = NULL;
208 if (!mode && !modload_attempted) {
209 xfrm_policy_put_afinfo(afinfo);
210 request_module("xfrm-mode-%d-%d", family, encap);
211 modload_attempted = 1;
212 goto retry;
213 }
214
215 xfrm_policy_put_afinfo(afinfo);
216 return mode;
217}
218
219void xfrm_put_mode(struct xfrm_mode *mode)
220{
221 module_put(mode->owner);
222}
223
145static inline unsigned long make_jiffies(long secs) 224static inline unsigned long make_jiffies(long secs)
146{ 225{
147 if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) 226 if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
@@ -1306,17 +1385,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
1306 return NULL; 1385 return NULL;
1307 read_lock(&xfrm_policy_afinfo_lock); 1386 read_lock(&xfrm_policy_afinfo_lock);
1308 afinfo = xfrm_policy_afinfo[family]; 1387 afinfo = xfrm_policy_afinfo[family];
1309 if (likely(afinfo != NULL)) 1388 if (unlikely(!afinfo))
1310 read_lock(&afinfo->lock); 1389 read_unlock(&xfrm_policy_afinfo_lock);
1311 read_unlock(&xfrm_policy_afinfo_lock);
1312 return afinfo; 1390 return afinfo;
1313} 1391}
1314 1392
1315static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) 1393static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
1316{ 1394{
1317 if (unlikely(afinfo == NULL)) 1395 read_unlock(&xfrm_policy_afinfo_lock);
1318 return; 1396}
1319 read_unlock(&afinfo->lock); 1397
1398static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family)
1399{
1400 struct xfrm_policy_afinfo *afinfo;
1401 if (unlikely(family >= NPROTO))
1402 return NULL;
1403 write_lock_bh(&xfrm_policy_afinfo_lock);
1404 afinfo = xfrm_policy_afinfo[family];
1405 if (unlikely(!afinfo))
1406 write_unlock_bh(&xfrm_policy_afinfo_lock);
1407 return afinfo;
1408}
1409
1410static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo)
1411{
1412 write_unlock_bh(&xfrm_policy_afinfo_lock);
1320} 1413}
1321 1414
1322static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) 1415static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 93a2f36ad3db..17b29ec3c417 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
77 kfree(x->ealg); 77 kfree(x->ealg);
78 kfree(x->calg); 78 kfree(x->calg);
79 kfree(x->encap); 79 kfree(x->encap);
80 if (x->mode)
81 xfrm_put_mode(x->mode);
80 if (x->type) { 82 if (x->type) {
81 x->type->destructor(x); 83 x->type->destructor(x);
82 xfrm_put_type(x->type); 84 xfrm_put_type(x->type);
@@ -1103,17 +1105,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
1103 return NULL; 1105 return NULL;
1104 read_lock(&xfrm_state_afinfo_lock); 1106 read_lock(&xfrm_state_afinfo_lock);
1105 afinfo = xfrm_state_afinfo[family]; 1107 afinfo = xfrm_state_afinfo[family];
1106 if (likely(afinfo != NULL)) 1108 if (unlikely(!afinfo))
1107 read_lock(&afinfo->lock); 1109 read_unlock(&xfrm_state_afinfo_lock);
1108 read_unlock(&xfrm_state_afinfo_lock);
1109 return afinfo; 1110 return afinfo;
1110} 1111}
1111 1112
1112static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) 1113static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
1113{ 1114{
1114 if (unlikely(afinfo == NULL)) 1115 read_unlock(&xfrm_state_afinfo_lock);
1115 return;
1116 read_unlock(&afinfo->lock);
1117} 1116}
1118 1117
1119/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ 1118/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
@@ -1196,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x)
1196 if (err) 1195 if (err)
1197 goto error; 1196 goto error;
1198 1197
1198 x->mode = xfrm_get_mode(x->props.mode, family);
1199 if (x->mode == NULL)
1200 goto error;
1201
1199 x->km.state = XFRM_STATE_VALID; 1202 x->km.state = XFRM_STATE_VALID;
1200 1203
1201error: 1204error:
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 81d1005830f4..c21dc26141ea 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -427,23 +427,25 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
427 if (x == NULL) 427 if (x == NULL)
428 return -ESRCH; 428 return -ESRCH;
429 429
430 if ((err = security_xfrm_state_delete(x)) != 0)
431 goto out;
432
430 if (xfrm_state_kern(x)) { 433 if (xfrm_state_kern(x)) {
431 xfrm_state_put(x); 434 err = -EPERM;
432 return -EPERM; 435 goto out;
433 } 436 }
434 437
435 err = xfrm_state_delete(x); 438 err = xfrm_state_delete(x);
436 if (err < 0) { 439 if (err < 0)
437 xfrm_state_put(x); 440 goto out;
438 return err;
439 }
440 441
441 c.seq = nlh->nlmsg_seq; 442 c.seq = nlh->nlmsg_seq;
442 c.pid = nlh->nlmsg_pid; 443 c.pid = nlh->nlmsg_pid;
443 c.event = nlh->nlmsg_type; 444 c.event = nlh->nlmsg_type;
444 km_state_notify(x, &c); 445 km_state_notify(x, &c);
445 xfrm_state_put(x);
446 446
447out:
448 xfrm_state_put(x);
447 return err; 449 return err;
448} 450}
449 451
@@ -1055,6 +1057,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
1055 MSG_DONTWAIT); 1057 MSG_DONTWAIT);
1056 } 1058 }
1057 } else { 1059 } else {
1060 if ((err = security_xfrm_policy_delete(xp)) != 0)
1061 goto out;
1058 c.data.byid = p->index; 1062 c.data.byid = p->index;
1059 c.event = nlh->nlmsg_type; 1063 c.event = nlh->nlmsg_type;
1060 c.seq = nlh->nlmsg_seq; 1064 c.seq = nlh->nlmsg_seq;
@@ -1064,6 +1068,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
1064 1068
1065 xfrm_pol_put(xp); 1069 xfrm_pol_put(xp);
1066 1070
1071out:
1067 return err; 1072 return err;
1068} 1073}
1069 1074