aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@g5.osdl.org>2006-06-19 21:55:56 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-06-19 21:55:56 -0400
commitd0b952a9837f81cd89e756b1b34293fa6e1cb59d (patch)
treefbe488bc5f407afa0e91cefb262d9e9ee69062ac /net/ipv4
parentd90125bfe958ed0451c6b98f831c86aba08b43d5 (diff)
parent47552c4e555eefe381f3d45140b59a2ea4b16486 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
* master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6: (109 commits) [ETHTOOL]: Fix UFO typo [SCTP]: Fix persistent slowdown in sctp when a gap ack consumes rx buffer. [SCTP]: Send only 1 window update SACK per message. [SCTP]: Don't do CRC32C checksum over loopback. [SCTP] Reset rtt_in_progress for the chunk when processing its sack. [SCTP]: Reject sctp packets with broadcast addresses. [SCTP]: Limit association max_retrans setting in setsockopt. [PFKEYV2]: Fix inconsistent typing in struct sadb_x_kmprivate. [IPV6]: Sum real space for RTAs. [IRDA]: Use put_unaligned() in irlmp_do_discovery(). [BRIDGE]: Add support for NETIF_F_HW_CSUM devices [NET]: Add NETIF_F_GEN_CSUM and NETIF_F_ALL_CSUM [TG3]: Convert to non-LLTX [TG3]: Remove unnecessary tx_lock [TCP]: Add tcp_slow_start_after_idle sysctl. [BNX2]: Update version and reldate [BNX2]: Use CPU native page size [BNX2]: Use compressed firmware [BNX2]: Add firmware decompression [BNX2]: Allow WoL settings on new 5708 chips ... Manual fixup for conflict in drivers/net/tulip/winbond-840.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig50
-rw-r--r--net/ipv4/Makefile6
-rw-r--r--net/ipv4/ah4.c15
-rw-r--r--net/ipv4/esp4.c18
-rw-r--r--net/ipv4/icmp.c5
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/ipcomp.c34
-rw-r--r--net/ipv4/netfilter/Kconfig40
-rw-r--r--net/ipv4/netfilter/Makefile2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c143
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c9
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c77
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_h323.c111
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_h323_types.c6
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c85
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c6
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_sip.c471
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c18
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_h323.c77
-rw-r--r--net/ipv4/netfilter/ip_nat_sip.c249
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c20
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c73
-rw-r--r--net/ipv4/netfilter/ipt_recent.c1276
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c2
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c26
-rw-r--r--net/ipv4/tcp.c127
-rw-r--r--net/ipv4/tcp_bic.c7
-rw-r--r--net/ipv4/tcp_compound.c448
-rw-r--r--net/ipv4/tcp_cong.c6
-rw-r--r--net/ipv4/tcp_cubic.c6
-rw-r--r--net/ipv4/tcp_highspeed.c24
-rw-r--r--net/ipv4/tcp_htcp.c9
-rw-r--r--net/ipv4/tcp_input.c89
-rw-r--r--net/ipv4/tcp_ipv4.c18
-rw-r--r--net/ipv4/tcp_lp.c338
-rw-r--r--net/ipv4/tcp_output.c6
-rw-r--r--net/ipv4/tcp_probe.c181
-rw-r--r--net/ipv4/tcp_veno.c231
-rw-r--r--net/ipv4/tcp_westwood.c80
-rw-r--r--net/ipv4/xfrm4_input.c28
-rw-r--r--net/ipv4/xfrm4_mode_transport.c83
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c125
-rw-r--r--net/ipv4/xfrm4_output.c61
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_state.c1
53 files changed, 3377 insertions, 1334 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f75322377..da33393be45f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,24 @@ config INET_TUNNEL
414 tristate 414 tristate
415 default n 415 default n
416 416
417config INET_XFRM_MODE_TRANSPORT
418 tristate "IP: IPsec transport mode"
419 default y
420 select XFRM
421 ---help---
422 Support for IPsec transport mode.
423
424 If unsure, say Y.
425
426config INET_XFRM_MODE_TUNNEL
427 tristate "IP: IPsec tunnel mode"
428 default y
429 select XFRM
430 ---help---
431 Support for IPsec tunnel mode.
432
433 If unsure, say Y.
434
417config INET_DIAG 435config INET_DIAG
418 tristate "INET: socket monitoring interface" 436 tristate "INET: socket monitoring interface"
419 default y 437 default y
@@ -532,6 +550,38 @@ config TCP_CONG_SCALABLE
532 properties, though is known to have fairness issues. 550 properties, though is known to have fairness issues.
533 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ 551 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
534 552
553config TCP_CONG_LP
554 tristate "TCP Low Priority"
555 depends on EXPERIMENTAL
556 default n
557 ---help---
558 TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
559 to utilize only the excess network bandwidth as compared to the
560 ``fair share`` of bandwidth as targeted by TCP.
561 See http://www-ece.rice.edu/networks/TCP-LP/
562
563config TCP_CONG_VENO
564 tristate "TCP Veno"
565 depends on EXPERIMENTAL
566 default n
567 ---help---
568 TCP Veno is a sender-side only enhancement of TCP to obtain better
569 throughput over wireless networks. TCP Veno makes use of state
570 distinguishing to circumvent the difficult judgment of the packet loss
571 type. TCP Veno reduces the congestion window less aggressively in
572 response to random packet loss.
573 See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
574
575config TCP_CONG_COMPOUND
576 tristate "TCP Compound"
577 depends on EXPERIMENTAL
578 default n
579 ---help---
580 TCP Compound is a sender-side only change to TCP that uses
581 a mixed Reno/Vegas approach to calculate the cwnd.
582 For further details look here:
583 ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
584
535endmenu 585endmenu
536 586
537config TCP_CONG_BIC 587config TCP_CONG_BIC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0b9d2c..38b8039bdd55 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o
24obj-$(CONFIG_INET_IPCOMP) += ipcomp.o 24obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
25obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o 25obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
26obj-$(CONFIG_INET_TUNNEL) += tunnel4.o 26obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
27obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
28obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
27obj-$(CONFIG_IP_PNP) += ipconfig.o 29obj-$(CONFIG_IP_PNP) += ipconfig.o
28obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o 30obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
29obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o 31obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/
34obj-$(CONFIG_INET_DIAG) += inet_diag.o 36obj-$(CONFIG_INET_DIAG) += inet_diag.o
35obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 37obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
36obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 38obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
39obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
37obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 40obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
38obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o 41obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
39obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 42obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
@@ -41,7 +44,10 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
41obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o 44obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
42obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o 45obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
43obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o 46obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
47obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
44obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o 48obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
50obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o
45 51
46obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 52obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
47 xfrm4_output.o 53 xfrm4_output.o
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e2e4771fa4c6..c7782230080d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -119,6 +119,7 @@ error:
119static int ah_input(struct xfrm_state *x, struct sk_buff *skb) 119static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
120{ 120{
121 int ah_hlen; 121 int ah_hlen;
122 int ihl;
122 struct iphdr *iph; 123 struct iphdr *iph;
123 struct ip_auth_hdr *ah; 124 struct ip_auth_hdr *ah;
124 struct ah_data *ahp; 125 struct ah_data *ahp;
@@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
149 ah = (struct ip_auth_hdr*)skb->data; 150 ah = (struct ip_auth_hdr*)skb->data;
150 iph = skb->nh.iph; 151 iph = skb->nh.iph;
151 152
152 memcpy(work_buf, iph, iph->ihl*4); 153 ihl = skb->data - skb->nh.raw;
154 memcpy(work_buf, iph, ihl);
153 155
154 iph->ttl = 0; 156 iph->ttl = 0;
155 iph->tos = 0; 157 iph->tos = 0;
156 iph->frag_off = 0; 158 iph->frag_off = 0;
157 iph->check = 0; 159 iph->check = 0;
158 if (iph->ihl != 5) { 160 if (ihl > sizeof(*iph)) {
159 u32 dummy; 161 u32 dummy;
160 if (ip_clear_mutable_options(iph, &dummy)) 162 if (ip_clear_mutable_options(iph, &dummy))
161 goto out; 163 goto out;
@@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
164 u8 auth_data[MAX_AH_AUTH_LEN]; 166 u8 auth_data[MAX_AH_AUTH_LEN];
165 167
166 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); 168 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
167 skb_push(skb, skb->data - skb->nh.raw); 169 skb_push(skb, ihl);
168 ahp->icv(ahp, skb, ah->auth_data); 170 ahp->icv(ahp, skb, ah->auth_data);
169 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { 171 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
170 x->stats.integrity_failed++; 172 x->stats.integrity_failed++;
@@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
172 } 174 }
173 } 175 }
174 ((struct iphdr*)work_buf)->protocol = ah->nexthdr; 176 ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
175 skb->nh.raw = skb_pull(skb, ah_hlen); 177 skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl);
176 memcpy(skb->nh.raw, work_buf, iph->ihl*4); 178 __skb_pull(skb, ah_hlen + ihl);
177 skb->nh.iph->tot_len = htons(skb->len);
178 skb_pull(skb, skb->nh.iph->ihl*4);
179 skb->h.raw = skb->data;
180 179
181 return 0; 180 return 0;
182 181
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9d1881c07a32..9bbdd4494551 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
143 int alen = esp->auth.icv_trunc_len; 143 int alen = esp->auth.icv_trunc_len;
144 int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; 144 int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
145 int nfrags; 145 int nfrags;
146 int encap_len = 0; 146 int ihl;
147 u8 nexthdr[2]; 147 u8 nexthdr[2];
148 struct scatterlist *sg; 148 struct scatterlist *sg;
149 u8 workbuf[60];
150 int padlen; 149 int padlen;
151 150
152 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) 151 if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
@@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
177 skb->ip_summed = CHECKSUM_NONE; 176 skb->ip_summed = CHECKSUM_NONE;
178 177
179 esph = (struct ip_esp_hdr*)skb->data; 178 esph = (struct ip_esp_hdr*)skb->data;
180 iph = skb->nh.iph;
181 179
182 /* Get ivec. This can be wrong, check against another impls. */ 180 /* Get ivec. This can be wrong, check against another impls. */
183 if (esp->conf.ivlen) 181 if (esp->conf.ivlen)
@@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
204 202
205 /* ... check padding bits here. Silly. :-) */ 203 /* ... check padding bits here. Silly. :-) */
206 204
205 iph = skb->nh.iph;
206 ihl = iph->ihl * 4;
207
207 if (x->encap) { 208 if (x->encap) {
208 struct xfrm_encap_tmpl *encap = x->encap; 209 struct xfrm_encap_tmpl *encap = x->encap;
209 struct udphdr *uh; 210 struct udphdr *uh = (void *)(skb->nh.raw + ihl);
210
211 uh = (struct udphdr *)(iph + 1);
212 encap_len = (void*)esph - (void*)uh;
213 211
214 /* 212 /*
215 * 1) if the NAT-T peer's IP or port changed then 213 * 1) if the NAT-T peer's IP or port changed then
@@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
246 244
247 iph->protocol = nexthdr[1]; 245 iph->protocol = nexthdr[1];
248 pskb_trim(skb, skb->len - alen - padlen - 2); 246 pskb_trim(skb, skb->len - alen - padlen - 2);
249 memcpy(workbuf, skb->nh.raw, iph->ihl*4); 247 skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl;
250 skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
251 skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
252 memcpy(skb->nh.raw, workbuf, iph->ihl*4);
253 skb->nh.iph->tot_len = htons(skb->len);
254 248
255 return 0; 249 return 0;
256 250
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2a0455911ee0..017900172f7d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -730,7 +730,6 @@ out_err:
730static void icmp_redirect(struct sk_buff *skb) 730static void icmp_redirect(struct sk_buff *skb)
731{ 731{
732 struct iphdr *iph; 732 struct iphdr *iph;
733 unsigned long ip;
734 733
735 if (skb->len < sizeof(struct iphdr)) 734 if (skb->len < sizeof(struct iphdr))
736 goto out_err; 735 goto out_err;
@@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb)
742 goto out; 741 goto out;
743 742
744 iph = (struct iphdr *)skb->data; 743 iph = (struct iphdr *)skb->data;
745 ip = iph->daddr;
746 744
747 switch (skb->h.icmph->code & 7) { 745 switch (skb->h.icmph->code & 7) {
748 case ICMP_REDIR_NET: 746 case ICMP_REDIR_NET:
@@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb)
752 */ 750 */
753 case ICMP_REDIR_HOST: 751 case ICMP_REDIR_HOST:
754 case ICMP_REDIR_HOSTTOS: 752 case ICMP_REDIR_HOSTTOS:
755 ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, 753 ip_rt_redirect(skb->nh.iph->saddr, iph->daddr,
754 skb->h.icmph->un.gateway,
756 iph->saddr, skb->dev); 755 iph->saddr, skb->dev);
757 break; 756 break;
758 } 757 }
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d512239a1473..ab680c851aa2 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2361 } 2361 }
2362 2362
2363 seq_printf(seq, 2363 seq_printf(seq,
2364 "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", 2364 "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
2365 im->multiaddr, im->users, 2365 im->multiaddr, im->users,
2366 im->tm_running, im->tm_running ? 2366 im->tm_running, im->tm_running ?
2367 jiffies_to_clock_t(im->timer.expires-jiffies) : 0, 2367 jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cff9c3a72daf..8538aac3d148 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
410 nf_bridge_get(to->nf_bridge); 410 nf_bridge_get(to->nf_bridge);
411#endif 411#endif
412#endif 412#endif
413 skb_copy_secmark(to, from);
413} 414}
414 415
415/* 416/*
@@ -839,7 +840,7 @@ int ip_append_data(struct sock *sk,
839 */ 840 */
840 if (transhdrlen && 841 if (transhdrlen &&
841 length + fragheaderlen <= mtu && 842 length + fragheaderlen <= mtu &&
842 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && 843 rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
843 !exthdrlen) 844 !exthdrlen)
844 csummode = CHECKSUM_HW; 845 csummode = CHECKSUM_HW;
845 846
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 95278b22b669..3ed8b57a1002 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list);
45static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) 45static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
46{ 46{
47 int err, plen, dlen; 47 int err, plen, dlen;
48 struct iphdr *iph;
49 struct ipcomp_data *ipcd = x->data; 48 struct ipcomp_data *ipcd = x->data;
50 u8 *start, *scratch; 49 u8 *start, *scratch;
51 struct crypto_tfm *tfm; 50 struct crypto_tfm *tfm;
@@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
74 73
75 skb_put(skb, dlen - plen); 74 skb_put(skb, dlen - plen);
76 memcpy(skb->data, scratch, dlen); 75 memcpy(skb->data, scratch, dlen);
77 iph = skb->nh.iph;
78 iph->tot_len = htons(dlen + iph->ihl * 4);
79out: 76out:
80 put_cpu(); 77 put_cpu();
81 return err; 78 return err;
@@ -83,34 +80,21 @@ out:
83 80
84static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) 81static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
85{ 82{
86 u8 nexthdr; 83 int err = -ENOMEM;
87 int err = 0;
88 struct iphdr *iph; 84 struct iphdr *iph;
89 union { 85 struct ip_comp_hdr *ipch;
90 struct iphdr iph;
91 char buf[60];
92 } tmp_iph;
93
94 86
95 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && 87 if (skb_linearize_cow(skb))
96 skb_linearize(skb, GFP_ATOMIC) != 0) {
97 err = -ENOMEM;
98 goto out; 88 goto out;
99 }
100 89
101 skb->ip_summed = CHECKSUM_NONE; 90 skb->ip_summed = CHECKSUM_NONE;
102 91
103 /* Remove ipcomp header and decompress original payload */ 92 /* Remove ipcomp header and decompress original payload */
104 iph = skb->nh.iph; 93 iph = skb->nh.iph;
105 memcpy(&tmp_iph, iph, iph->ihl * 4); 94 ipch = (void *)skb->data;
106 nexthdr = *(u8 *)skb->data; 95 iph->protocol = ipch->nexthdr;
107 skb_pull(skb, sizeof(struct ip_comp_hdr)); 96 skb->h.raw = skb->nh.raw + sizeof(*ipch);
108 skb->nh.raw += sizeof(struct ip_comp_hdr); 97 __skb_pull(skb, sizeof(*ipch));
109 memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
110 iph = skb->nh.iph;
111 iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
112 iph->protocol = nexthdr;
113 skb->h.raw = skb->data;
114 err = ipcomp_decompress(x, skb); 98 err = ipcomp_decompress(x, skb);
115 99
116out: 100out:
@@ -171,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
171 goto out_ok; 155 goto out_ok;
172 } 156 }
173 157
174 if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && 158 if (skb_linearize_cow(skb))
175 skb_linearize(skb, GFP_ATOMIC) != 0) {
176 goto out_ok; 159 goto out_ok;
177 }
178 160
179 err = ipcomp_compress(x, skb); 161 err = ipcomp_compress(x, skb);
180 iph = skb->nh.iph; 162 iph = skb->nh.iph;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d4072533da21..e1d7f5fbc526 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK
55 of packets, but this mark value is kept in the conntrack session 55 of packets, but this mark value is kept in the conntrack session
56 instead of the individual packets. 56 instead of the individual packets.
57 57
58config IP_NF_CONNTRACK_SECMARK
59 bool 'Connection tracking security mark support'
60 depends on IP_NF_CONNTRACK && NETWORK_SECMARK
61 help
62 This option enables security markings to be applied to
63 connections. Typically they are copied to connections from
64 packets using the CONNSECMARK target and copied back from
65 connections to packets with the same target, with the packets
66 being originally labeled via SECMARK.
67
68 If unsure, say 'N'.
69
58config IP_NF_CONNTRACK_EVENTS 70config IP_NF_CONNTRACK_EVENTS
59 bool "Connection tracking events (EXPERIMENTAL)" 71 bool "Connection tracking events (EXPERIMENTAL)"
60 depends on EXPERIMENTAL && IP_NF_CONNTRACK 72 depends on EXPERIMENTAL && IP_NF_CONNTRACK
@@ -142,6 +154,8 @@ config IP_NF_TFTP
142config IP_NF_AMANDA 154config IP_NF_AMANDA
143 tristate "Amanda backup protocol support" 155 tristate "Amanda backup protocol support"
144 depends on IP_NF_CONNTRACK 156 depends on IP_NF_CONNTRACK
157 select TEXTSEARCH
158 select TEXTSEARCH_KMP
145 help 159 help
146 If you are running the Amanda backup package <http://www.amanda.org/> 160 If you are running the Amanda backup package <http://www.amanda.org/>
147 on this machine or machines that will be MASQUERADED through this 161 on this machine or machines that will be MASQUERADED through this
@@ -181,14 +195,26 @@ config IP_NF_H323
181 With this module you can support H.323 on a connection tracking/NAT 195 With this module you can support H.323 on a connection tracking/NAT
182 firewall. 196 firewall.
183 197
184 This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP 198 This module supports RAS, Fast Start, H.245 Tunnelling, Call
185 and T.120 based data and applications including audio, video, FAX, 199 Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
186 chat, whiteboard, file transfer, etc. For more information, please 200 whiteboard, file transfer, etc. For more information, please
187 see http://nath323.sourceforge.net/. 201 visit http://nath323.sourceforge.net/.
188 202
189 If you want to compile it as a module, say 'M' here and read 203 If you want to compile it as a module, say 'M' here and read
190 Documentation/modules.txt. If unsure, say 'N'. 204 Documentation/modules.txt. If unsure, say 'N'.
191 205
206config IP_NF_SIP
207 tristate "SIP protocol support (EXPERIMENTAL)"
208 depends on IP_NF_CONNTRACK && EXPERIMENTAL
209 help
210 SIP is an application-layer control protocol that can establish,
211 modify, and terminate multimedia sessions (conferences) such as
212 Internet telephony calls. With the ip_conntrack_sip and
213 the ip_nat_sip modules you can support the protocol on a connection
214 tracking/NATing firewall.
215
216 To compile it as a module, choose M here. If unsure, say Y.
217
192config IP_NF_QUEUE 218config IP_NF_QUEUE
193 tristate "IP Userspace queueing via NETLINK (OBSOLETE)" 219 tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
194 help 220 help
@@ -501,6 +527,12 @@ config IP_NF_NAT_H323
501 default IP_NF_NAT if IP_NF_H323=y 527 default IP_NF_NAT if IP_NF_H323=y
502 default m if IP_NF_H323=m 528 default m if IP_NF_H323=m
503 529
530config IP_NF_NAT_SIP
531 tristate
532 depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
533 default IP_NF_NAT if IP_NF_SIP=y
534 default m if IP_NF_SIP=m
535
504# mangle + specific targets 536# mangle + specific targets
505config IP_NF_MANGLE 537config IP_NF_MANGLE
506 tristate "Packet mangling" 538 tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 461cb1eb5de7..3ded4a3af59c 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
31obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o 31obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
32obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o 32obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
33obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o 33obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
34obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
34obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o 35obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
35 36
36# NAT helpers 37# NAT helpers
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
40obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o 41obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
41obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o 42obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
42obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o 43obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
44obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
43 45
44# generic IP tables 46# generic IP tables
45obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o 47obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a604b1ccfdaa..0a7bd7f04061 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -17,33 +17,29 @@
17 * this value. 17 * this value.
18 * 18 *
19 */ 19 */
20
21#include <linux/in.h>
22#include <linux/kernel.h> 20#include <linux/kernel.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/netfilter.h>
25#include <linux/ip.h>
26#include <linux/moduleparam.h> 22#include <linux/moduleparam.h>
23#include <linux/textsearch.h>
24#include <linux/skbuff.h>
25#include <linux/in.h>
26#include <linux/ip.h>
27#include <linux/udp.h> 27#include <linux/udp.h>
28#include <net/checksum.h>
29#include <net/udp.h>
30 28
29#include <linux/netfilter.h>
31#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 30#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
32#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> 31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
33 32
34static unsigned int master_timeout = 300; 33static unsigned int master_timeout = 300;
34static char *ts_algo = "kmp";
35 35
36MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); 36MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
37MODULE_DESCRIPTION("Amanda connection tracking module"); 37MODULE_DESCRIPTION("Amanda connection tracking module");
38MODULE_LICENSE("GPL"); 38MODULE_LICENSE("GPL");
39module_param(master_timeout, uint, 0600); 39module_param(master_timeout, uint, 0600);
40MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); 40MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
41 41module_param(ts_algo, charp, 0400);
42static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; 42MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
43
44/* This is slow, but it's simple. --RR */
45static char *amanda_buffer;
46static DEFINE_SPINLOCK(amanda_buffer_lock);
47 43
48unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, 44unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
49 enum ip_conntrack_info ctinfo, 45 enum ip_conntrack_info ctinfo,
@@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
52 struct ip_conntrack_expect *exp); 48 struct ip_conntrack_expect *exp);
53EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); 49EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
54 50
51enum amanda_strings {
52 SEARCH_CONNECT,
53 SEARCH_NEWLINE,
54 SEARCH_DATA,
55 SEARCH_MESG,
56 SEARCH_INDEX,
57};
58
59static struct {
60 char *string;
61 size_t len;
62 struct ts_config *ts;
63} search[] = {
64 [SEARCH_CONNECT] = {
65 .string = "CONNECT ",
66 .len = 8,
67 },
68 [SEARCH_NEWLINE] = {
69 .string = "\n",
70 .len = 1,
71 },
72 [SEARCH_DATA] = {
73 .string = "DATA ",
74 .len = 5,
75 },
76 [SEARCH_MESG] = {
77 .string = "MESG ",
78 .len = 5,
79 },
80 [SEARCH_INDEX] = {
81 .string = "INDEX ",
82 .len = 6,
83 },
84};
85
55static int help(struct sk_buff **pskb, 86static int help(struct sk_buff **pskb,
56 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) 87 struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
57{ 88{
89 struct ts_state ts;
58 struct ip_conntrack_expect *exp; 90 struct ip_conntrack_expect *exp;
59 char *data, *data_limit, *tmp; 91 unsigned int dataoff, start, stop, off, i;
60 unsigned int dataoff, i; 92 char pbuf[sizeof("65535")], *tmp;
61 u_int16_t port, len; 93 u_int16_t port, len;
62 int ret = NF_ACCEPT; 94 int ret = NF_ACCEPT;
63 95
@@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb,
77 return NF_ACCEPT; 109 return NF_ACCEPT;
78 } 110 }
79 111
80 spin_lock_bh(&amanda_buffer_lock); 112 memset(&ts, 0, sizeof(ts));
81 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); 113 start = skb_find_text(*pskb, dataoff, (*pskb)->len,
82 data = amanda_buffer; 114 search[SEARCH_CONNECT].ts, &ts);
83 data_limit = amanda_buffer + (*pskb)->len - dataoff; 115 if (start == UINT_MAX)
84 *data_limit = '\0';
85
86 /* Search for the CONNECT string */
87 data = strstr(data, "CONNECT ");
88 if (!data)
89 goto out; 116 goto out;
90 data += strlen("CONNECT "); 117 start += dataoff + search[SEARCH_CONNECT].len;
91 118
92 /* Only search first line. */ 119 memset(&ts, 0, sizeof(ts));
93 if ((tmp = strchr(data, '\n'))) 120 stop = skb_find_text(*pskb, start, (*pskb)->len,
94 *tmp = '\0'; 121 search[SEARCH_NEWLINE].ts, &ts);
122 if (stop == UINT_MAX)
123 goto out;
124 stop += start;
95 125
96 for (i = 0; i < ARRAY_SIZE(conns); i++) { 126 for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
97 char *match = strstr(data, conns[i]); 127 memset(&ts, 0, sizeof(ts));
98 if (!match) 128 off = skb_find_text(*pskb, start, stop, search[i].ts, &ts);
129 if (off == UINT_MAX)
99 continue; 130 continue;
100 tmp = data = match + strlen(conns[i]); 131 off += start + search[i].len;
101 port = simple_strtoul(data, &data, 10); 132
102 len = data - tmp; 133 len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
134 if (skb_copy_bits(*pskb, off, pbuf, len))
135 break;
136 pbuf[len] = '\0';
137
138 port = simple_strtoul(pbuf, &tmp, 10);
139 len = tmp - pbuf;
103 if (port == 0 || len > 5) 140 if (port == 0 || len > 5)
104 break; 141 break;
105 142
@@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb,
125 exp->mask.dst.u.tcp.port = 0xFFFF; 162 exp->mask.dst.u.tcp.port = 0xFFFF;
126 163
127 if (ip_nat_amanda_hook) 164 if (ip_nat_amanda_hook)
128 ret = ip_nat_amanda_hook(pskb, ctinfo, 165 ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff,
129 tmp - amanda_buffer,
130 len, exp); 166 len, exp);
131 else if (ip_conntrack_expect_related(exp) != 0) 167 else if (ip_conntrack_expect_related(exp) != 0)
132 ret = NF_DROP; 168 ret = NF_DROP;
@@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb,
134 } 170 }
135 171
136out: 172out:
137 spin_unlock_bh(&amanda_buffer_lock);
138 return ret; 173 return ret;
139} 174}
140 175
141static struct ip_conntrack_helper amanda_helper = { 176static struct ip_conntrack_helper amanda_helper = {
142 .max_expected = ARRAY_SIZE(conns), 177 .max_expected = 3,
143 .timeout = 180, 178 .timeout = 180,
144 .me = THIS_MODULE, 179 .me = THIS_MODULE,
145 .help = help, 180 .help = help,
@@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = {
155 190
156static void __exit ip_conntrack_amanda_fini(void) 191static void __exit ip_conntrack_amanda_fini(void)
157{ 192{
193 int i;
194
158 ip_conntrack_helper_unregister(&amanda_helper); 195 ip_conntrack_helper_unregister(&amanda_helper);
159 kfree(amanda_buffer); 196 for (i = 0; i < ARRAY_SIZE(search); i++)
197 textsearch_destroy(search[i].ts);
160} 198}
161 199
162static int __init ip_conntrack_amanda_init(void) 200static int __init ip_conntrack_amanda_init(void)
163{ 201{
164 int ret; 202 int ret, i;
165 203
166 amanda_buffer = kmalloc(65536, GFP_KERNEL); 204 ret = -ENOMEM;
167 if (!amanda_buffer) 205 for (i = 0; i < ARRAY_SIZE(search); i++) {
168 return -ENOMEM; 206 search[i].ts = textsearch_prepare(ts_algo, search[i].string,
169 207 search[i].len,
170 ret = ip_conntrack_helper_register(&amanda_helper); 208 GFP_KERNEL, TS_AUTOLOAD);
171 if (ret < 0) { 209 if (search[i].ts == NULL)
172 kfree(amanda_buffer); 210 goto err;
173 return ret;
174 } 211 }
212 ret = ip_conntrack_helper_register(&amanda_helper);
213 if (ret < 0)
214 goto err;
175 return 0; 215 return 0;
176 216
177 217err:
218 for (; i >= 0; i--) {
219 if (search[i].ts)
220 textsearch_destroy(search[i].ts);
221 }
222 return ret;
178} 223}
179 224
180module_init(ip_conntrack_amanda_init); 225module_init(ip_conntrack_amanda_init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a297da7bbef5..7e4cf9a4d15f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
724 /* this is ugly, but there is no other place where to put it */ 724 /* this is ugly, but there is no other place where to put it */
725 conntrack->nat.masq_index = exp->master->nat.masq_index; 725 conntrack->nat.masq_index = exp->master->nat.masq_index;
726#endif 726#endif
727#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
728 conntrack->secmark = exp->master->secmark;
729#endif
727 nf_conntrack_get(&conntrack->master->ct_general); 730 nf_conntrack_get(&conntrack->master->ct_general);
728 CONNTRACK_STAT_INC(expect_new); 731 CONNTRACK_STAT_INC(expect_new);
729 } else { 732 } else {
@@ -1130,6 +1133,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1130 1133
1131 write_lock_bh(&ip_conntrack_lock); 1134 write_lock_bh(&ip_conntrack_lock);
1132 1135
1136 /* Only update if this is not a fixed timeout */
1137 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1138 write_unlock_bh(&ip_conntrack_lock);
1139 return;
1140 }
1141
1133 /* If not in hash table, timer will not be active yet */ 1142 /* If not in hash table, timer will not be active yet */
1134 if (!is_confirmed(ct)) { 1143 if (!is_confirmed(ct)) {
1135 ct->timeout.expires = extra_jiffies; 1144 ct->timeout.expires = extra_jiffies;
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 3e542bf28a9d..4dcf526c3944 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char);
56static int try_epsv_response(const char *, size_t, u_int32_t [], char); 56static int try_epsv_response(const char *, size_t, u_int32_t [], char);
57 57
58static const struct ftp_search { 58static const struct ftp_search {
59 enum ip_conntrack_dir dir;
60 const char *pattern; 59 const char *pattern;
61 size_t plen; 60 size_t plen;
62 char skip; 61 char skip;
63 char term; 62 char term;
64 enum ip_ct_ftp_type ftptype; 63 enum ip_ct_ftp_type ftptype;
65 int (*getnum)(const char *, size_t, u_int32_t[], char); 64 int (*getnum)(const char *, size_t, u_int32_t[], char);
66} search[] = { 65} search[IP_CT_DIR_MAX][2] = {
67 { 66 [IP_CT_DIR_ORIGINAL] = {
68 IP_CT_DIR_ORIGINAL, 67 {
69 "PORT", sizeof("PORT") - 1, ' ', '\r', 68 .pattern = "PORT",
70 IP_CT_FTP_PORT, 69 .plen = sizeof("PORT") - 1,
71 try_rfc959, 70 .skip = ' ',
71 .term = '\r',
72 .ftptype = IP_CT_FTP_PORT,
73 .getnum = try_rfc959,
74 },
75 {
76 .pattern = "EPRT",
77 .plen = sizeof("EPRT") - 1,
78 .skip = ' ',
79 .term = '\r',
80 .ftptype = IP_CT_FTP_EPRT,
81 .getnum = try_eprt,
82 },
72 }, 83 },
73 { 84 [IP_CT_DIR_REPLY] = {
74 IP_CT_DIR_REPLY, 85 {
75 "227 ", sizeof("227 ") - 1, '(', ')', 86 .pattern = "227 ",
76 IP_CT_FTP_PASV, 87 .plen = sizeof("227 ") - 1,
77 try_rfc959, 88 .skip = '(',
78 }, 89 .term = ')',
79 { 90 .ftptype = IP_CT_FTP_PASV,
80 IP_CT_DIR_ORIGINAL, 91 .getnum = try_rfc959,
81 "EPRT", sizeof("EPRT") - 1, ' ', '\r', 92 },
82 IP_CT_FTP_EPRT, 93 {
83 try_eprt, 94 .pattern = "229 ",
84 }, 95 .plen = sizeof("229 ") - 1,
85 { 96 .skip = '(',
86 IP_CT_DIR_REPLY, 97 .term = ')',
87 "229 ", sizeof("229 ") - 1, '(', ')', 98 .ftptype = IP_CT_FTP_EPSV,
88 IP_CT_FTP_EPSV, 99 .getnum = try_epsv_response,
89 try_epsv_response, 100 },
90 }, 101 },
91}; 102};
92 103
@@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb,
346 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; 357 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
347 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; 358 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
348 359
349 for (i = 0; i < ARRAY_SIZE(search); i++) { 360 for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
350 if (search[i].dir != dir) continue;
351
352 found = find_pattern(fb_ptr, (*pskb)->len - dataoff, 361 found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
353 search[i].pattern, 362 search[dir][i].pattern,
354 search[i].plen, 363 search[dir][i].plen,
355 search[i].skip, 364 search[dir][i].skip,
356 search[i].term, 365 search[dir][i].term,
357 &matchoff, &matchlen, 366 &matchoff, &matchlen,
358 array, 367 array,
359 search[i].getnum); 368 search[dir][i].getnum);
360 if (found) break; 369 if (found) break;
361 } 370 }
362 if (found == -1) { 371 if (found == -1) {
@@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb,
366 this case. */ 375 this case. */
367 if (net_ratelimit()) 376 if (net_ratelimit())
368 printk("conntrack_ftp: partial %s %u+%u\n", 377 printk("conntrack_ftp: partial %s %u+%u\n",
369 search[i].pattern, 378 search[dir][i].pattern,
370 ntohl(th->seq), datalen); 379 ntohl(th->seq), datalen);
371 ret = NF_DROP; 380 ret = NF_DROP;
372 goto out; 381 goto out;
@@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb,
426 /* Now, NAT might want to mangle the packet, and register the 435 /* Now, NAT might want to mangle the packet, and register the
427 * (possibly changed) expectation itself. */ 436 * (possibly changed) expectation itself. */
428 if (ip_nat_ftp_hook) 437 if (ip_nat_ftp_hook)
429 ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, 438 ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype,
430 matchoff, matchlen, exp, &seq); 439 matchoff, matchlen, exp, &seq);
431 else { 440 else {
432 /* Can't expect this? Best to drop packet now. */ 441 /* Can't expect this? Best to drop packet now. */
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
index 518f581d39ec..0665674218c6 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
@@ -22,6 +22,8 @@
22#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> 22#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
23#include <linux/netfilter_ipv4/ip_conntrack_h323.h> 23#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
24#include <linux/moduleparam.h> 24#include <linux/moduleparam.h>
25#include <linux/ctype.h>
26#include <linux/inet.h>
25 27
26#if 0 28#if 0
27#define DEBUGP printk 29#define DEBUGP printk
@@ -38,6 +40,12 @@ static int gkrouted_only = 1;
38module_param(gkrouted_only, int, 0600); 40module_param(gkrouted_only, int, 0600);
39MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); 41MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
40 42
43static int callforward_filter = 1;
44module_param(callforward_filter, bool, 0600);
45MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
46 "if both endpoints are on different sides "
47 "(determined by routing information)");
48
41/* Hooks for NAT */ 49/* Hooks for NAT */
42int (*set_h245_addr_hook) (struct sk_buff ** pskb, 50int (*set_h245_addr_hook) (struct sk_buff ** pskb,
43 unsigned char **data, int dataoff, 51 unsigned char **data, int dataoff,
@@ -77,6 +85,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb,
77 unsigned char **data, int dataoff, 85 unsigned char **data, int dataoff,
78 TransportAddress * addr, u_int16_t port, 86 TransportAddress * addr, u_int16_t port,
79 struct ip_conntrack_expect * exp); 87 struct ip_conntrack_expect * exp);
88int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
89 struct ip_conntrack * ct,
90 enum ip_conntrack_info ctinfo,
91 unsigned char **data, int dataoff,
92 TransportAddress * addr, u_int16_t port,
93 struct ip_conntrack_expect * exp);
80int (*nat_q931_hook) (struct sk_buff ** pskb, 94int (*nat_q931_hook) (struct sk_buff ** pskb,
81 struct ip_conntrack * ct, 95 struct ip_conntrack * ct,
82 enum ip_conntrack_info ctinfo, 96 enum ip_conntrack_info ctinfo,
@@ -683,6 +697,92 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
683 return ret; 697 return ret;
684} 698}
685 699
700/* Forwarding declaration */
701void ip_conntrack_q931_expect(struct ip_conntrack *new,
702 struct ip_conntrack_expect *this);
703
704/****************************************************************************/
705static int expect_callforwarding(struct sk_buff **pskb,
706 struct ip_conntrack *ct,
707 enum ip_conntrack_info ctinfo,
708 unsigned char **data, int dataoff,
709 TransportAddress * addr)
710{
711 int dir = CTINFO2DIR(ctinfo);
712 int ret = 0;
713 u_int32_t ip;
714 u_int16_t port;
715 struct ip_conntrack_expect *exp = NULL;
716
717 /* Read alternativeAddress */
718 if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
719 return 0;
720
721 /* If the calling party is on the same side of the forward-to party,
722 * we don't need to track the second call */
723 if (callforward_filter) {
724 struct rtable *rt1, *rt2;
725 struct flowi fl1 = {
726 .fl4_dst = ip,
727 };
728 struct flowi fl2 = {
729 .fl4_dst = ct->tuplehash[!dir].tuple.src.ip,
730 };
731
732 if (ip_route_output_key(&rt1, &fl1) == 0) {
733 if (ip_route_output_key(&rt2, &fl2) == 0) {
734 if (rt1->rt_gateway == rt2->rt_gateway &&
735 rt1->u.dst.dev == rt2->u.dst.dev)
736 ret = 1;
737 dst_release(&rt2->u.dst);
738 }
739 dst_release(&rt1->u.dst);
740 }
741 if (ret) {
742 DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
743 return 0;
744 }
745 }
746
747 /* Create expect for the second call leg */
748 if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
749 return -1;
750 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
751 exp->tuple.src.u.tcp.port = 0;
752 exp->tuple.dst.ip = ip;
753 exp->tuple.dst.u.tcp.port = htons(port);
754 exp->tuple.dst.protonum = IPPROTO_TCP;
755 exp->mask.src.ip = 0xFFFFFFFF;
756 exp->mask.src.u.tcp.port = 0;
757 exp->mask.dst.ip = 0xFFFFFFFF;
758 exp->mask.dst.u.tcp.port = 0xFFFF;
759 exp->mask.dst.protonum = 0xFF;
760 exp->flags = 0;
761
762 if (ct->tuplehash[dir].tuple.src.ip !=
763 ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) {
764 /* Need NAT */
765 ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff,
766 addr, port, exp);
767 } else { /* Conntrack only */
768 exp->expectfn = ip_conntrack_q931_expect;
769
770 if (ip_conntrack_expect_related(exp) == 0) {
771 DEBUGP("ip_ct_q931: expect Call Forwarding "
772 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
773 NIPQUAD(exp->tuple.src.ip),
774 ntohs(exp->tuple.src.u.tcp.port),
775 NIPQUAD(exp->tuple.dst.ip),
776 ntohs(exp->tuple.dst.u.tcp.port));
777 } else
778 ret = -1;
779 }
780
781 ip_conntrack_expect_put(exp);
782
783 return ret;
784}
785
686/****************************************************************************/ 786/****************************************************************************/
687static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, 787static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
688 enum ip_conntrack_info ctinfo, 788 enum ip_conntrack_info ctinfo,
@@ -878,6 +978,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
878 978
879 DEBUGP("ip_ct_q931: Facility\n"); 979 DEBUGP("ip_ct_q931: Facility\n");
880 980
981 if (facility->reason.choice == eFacilityReason_callForwarded) {
982 if (facility->options & eFacility_UUIE_alternativeAddress)
983 return expect_callforwarding(pskb, ct, ctinfo, data,
984 dataoff,
985 &facility->
986 alternativeAddress);
987 return 0;
988 }
989
881 if (facility->options & eFacility_UUIE_h245Address) { 990 if (facility->options & eFacility_UUIE_h245Address) {
882 ret = expect_h245(pskb, ct, ctinfo, data, dataoff, 991 ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
883 &facility->h245Address); 992 &facility->h245Address);
@@ -1677,7 +1786,6 @@ static int __init init(void)
1677 fini(); 1786 fini();
1678 return ret; 1787 return ret;
1679 } 1788 }
1680
1681 DEBUGP("ip_ct_h323: init success\n"); 1789 DEBUGP("ip_ct_h323: init success\n");
1682 return 0; 1790 return 0;
1683} 1791}
@@ -1696,6 +1804,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook);
1696EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); 1804EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
1697EXPORT_SYMBOL_GPL(nat_t120_hook); 1805EXPORT_SYMBOL_GPL(nat_t120_hook);
1698EXPORT_SYMBOL_GPL(nat_h245_hook); 1806EXPORT_SYMBOL_GPL(nat_h245_hook);
1807EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
1699EXPORT_SYMBOL_GPL(nat_q931_hook); 1808EXPORT_SYMBOL_GPL(nat_q931_hook);
1700 1809
1701MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); 1810MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
index 022c47b9f6c9..4b359618bedd 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
@@ -1,4 +1,4 @@
1/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 1/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
2 * 2 *
3 * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> 3 * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
4 * 4 *
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */
1069 1069
1070static field_t _Facility_UUIE[] = { /* SEQUENCE */ 1070static field_t _Facility_UUIE[] = { /* SEQUENCE */
1071 {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, 1071 {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
1072 {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, 1072 {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
1073 _TransportAddress}, 1073 offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
1074 {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, 1074 {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
1075 _Facility_UUIE_alternativeAliasAddress}, 1075 _Facility_UUIE_alternativeAliasAddress},
1076 {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, 1076 {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 01bd7cab9367..33891bb1fde4 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -399,38 +399,54 @@ nfattr_failure:
399static int ctnetlink_done(struct netlink_callback *cb) 399static int ctnetlink_done(struct netlink_callback *cb)
400{ 400{
401 DEBUGP("entered %s\n", __FUNCTION__); 401 DEBUGP("entered %s\n", __FUNCTION__);
402 if (cb->args[1])
403 ip_conntrack_put((struct ip_conntrack *)cb->args[1]);
402 return 0; 404 return 0;
403} 405}
404 406
405static int 407static int
406ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) 408ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
407{ 409{
408 struct ip_conntrack *ct = NULL; 410 struct ip_conntrack *ct, *last;
409 struct ip_conntrack_tuple_hash *h; 411 struct ip_conntrack_tuple_hash *h;
410 struct list_head *i; 412 struct list_head *i;
411 u_int32_t *id = (u_int32_t *) &cb->args[1];
412 413
413 DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, 414 DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
414 cb->args[0], *id); 415 cb->args[0], *id);
415 416
416 read_lock_bh(&ip_conntrack_lock); 417 read_lock_bh(&ip_conntrack_lock);
417 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { 418 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) {
419restart:
420 last = (struct ip_conntrack *)cb->args[1];
418 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { 421 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
419 h = (struct ip_conntrack_tuple_hash *) i; 422 h = (struct ip_conntrack_tuple_hash *) i;
420 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) 423 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
421 continue; 424 continue;
422 ct = tuplehash_to_ctrack(h); 425 ct = tuplehash_to_ctrack(h);
423 if (ct->id <= *id) 426 if (last != NULL) {
424 continue; 427 if (ct == last) {
428 ip_conntrack_put(last);
429 cb->args[1] = 0;
430 last = NULL;
431 } else
432 continue;
433 }
425 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, 434 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
426 cb->nlh->nlmsg_seq, 435 cb->nlh->nlmsg_seq,
427 IPCTNL_MSG_CT_NEW, 436 IPCTNL_MSG_CT_NEW,
428 1, ct) < 0) 437 1, ct) < 0) {
438 nf_conntrack_get(&ct->ct_general);
439 cb->args[1] = (unsigned long)ct;
429 goto out; 440 goto out;
430 *id = ct->id; 441 }
442 }
443 if (last != NULL) {
444 ip_conntrack_put(last);
445 cb->args[1] = 0;
446 goto restart;
431 } 447 }
432 } 448 }
433out: 449out:
434 read_unlock_bh(&ip_conntrack_lock); 450 read_unlock_bh(&ip_conntrack_lock);
435 451
436 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); 452 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
@@ -629,7 +645,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
629}; 645};
630 646
631static inline int 647static inline int
632ctnetlink_parse_nat(struct nfattr *cda[], 648ctnetlink_parse_nat(struct nfattr *nat,
633 const struct ip_conntrack *ct, struct ip_nat_range *range) 649 const struct ip_conntrack *ct, struct ip_nat_range *range)
634{ 650{
635 struct nfattr *tb[CTA_NAT_MAX]; 651 struct nfattr *tb[CTA_NAT_MAX];
@@ -639,7 +655,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
639 655
640 memset(range, 0, sizeof(*range)); 656 memset(range, 0, sizeof(*range));
641 657
642 nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); 658 nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
643 659
644 if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) 660 if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
645 return -EINVAL; 661 return -EINVAL;
@@ -854,39 +870,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
854 /* ASSURED bit can only be set */ 870 /* ASSURED bit can only be set */
855 return -EINVAL; 871 return -EINVAL;
856 872
857 if (cda[CTA_NAT-1]) { 873 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
858#ifndef CONFIG_IP_NF_NAT_NEEDED 874#ifndef CONFIG_IP_NF_NAT_NEEDED
859 return -EINVAL; 875 return -EINVAL;
860#else 876#else
861 unsigned int hooknum;
862 struct ip_nat_range range; 877 struct ip_nat_range range;
863 878
864 if (ctnetlink_parse_nat(cda, ct, &range) < 0) 879 if (cda[CTA_NAT_DST-1]) {
865 return -EINVAL; 880 if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
866 881 &range) < 0)
867 DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 882 return -EINVAL;
868 NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), 883 if (ip_nat_initialized(ct,
869 htons(range.min.all), htons(range.max.all)); 884 HOOK2MANIP(NF_IP_PRE_ROUTING)))
870 885 return -EEXIST;
871 /* This is tricky but it works. ip_nat_setup_info needs the 886 ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
872 * hook number as parameter, so let's do the correct 887 }
873 * conversion and run away */ 888 if (cda[CTA_NAT_SRC-1]) {
874 if (status & IPS_SRC_NAT_DONE) 889 if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
875 hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ 890 &range) < 0)
876 else if (status & IPS_DST_NAT_DONE) 891 return -EINVAL;
877 hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ 892 if (ip_nat_initialized(ct,
878 else 893 HOOK2MANIP(NF_IP_POST_ROUTING)))
879 return -EINVAL; /* Missing NAT flags */ 894 return -EEXIST;
880 895 ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
881 DEBUGP("NAT status: %lu\n", 896 }
882 status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
883
884 if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
885 return -EEXIST;
886 ip_nat_setup_info(ct, &range, hooknum);
887
888 DEBUGP("NAT status after setup_info: %lu\n",
889 ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
890#endif 897#endif
891 } 898 }
892 899
@@ -1106,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
1106 /* implicit 'else' */ 1113 /* implicit 'else' */
1107 1114
1108 /* we only allow nat config for new conntracks */ 1115 /* we only allow nat config for new conntracks */
1109 if (cda[CTA_NAT-1]) { 1116 if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
1110 err = -EINVAL; 1117 err = -EINVAL;
1111 goto out_unlock; 1118 goto out_unlock;
1112 } 1119 }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 56794797d55b..21ee124c0463 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
77} 77}
78 78
79/* look up the source key for a given tuple */ 79/* look up the source key for a given tuple */
80static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) 80static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
81{ 81{
82 struct ip_ct_gre_keymap *km; 82 struct ip_ct_gre_keymap *km;
83 u_int32_t key = 0; 83 __be16 key = 0;
84 84
85 read_lock_bh(&ip_ct_gre_lock); 85 read_lock_bh(&ip_ct_gre_lock);
86 km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, 86 km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
@@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
190 struct ip_conntrack_tuple *tuple) 190 struct ip_conntrack_tuple *tuple)
191{ 191{
192 struct gre_hdr_pptp _pgrehdr, *pgrehdr; 192 struct gre_hdr_pptp _pgrehdr, *pgrehdr;
193 u_int32_t srckey; 193 __be16 srckey;
194 struct gre_hdr _grehdr, *grehdr; 194 struct gre_hdr _grehdr, *grehdr;
195 195
196 /* first only delinearize old RFC1701 GRE header */ 196 /* first only delinearize old RFC1701 GRE header */
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index d8b14a9010a6..23f1c504586d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
224 } 224 }
225 225
226 /* See ip_conntrack_proto_tcp.c */ 226 /* See ip_conntrack_proto_tcp.c */
227 if (hooknum == NF_IP_PRE_ROUTING && 227 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
228 nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { 228 nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
229 if (LOG_INVALID(IPPROTO_ICMP)) 229 if (LOG_INVALID(IPPROTO_ICMP))
230 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 230 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 062b252b58ad..c5c2ce5cdeb8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb,
870 * and moreover root might send raw packets. 870 * and moreover root might send raw packets.
871 */ 871 */
872 /* FIXME: Source route IP option packets --RR */ 872 /* FIXME: Source route IP option packets --RR */
873 if (hooknum == NF_IP_PRE_ROUTING && 873 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
874 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { 874 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
875 if (LOG_INVALID(IPPROTO_TCP)) 875 if (LOG_INVALID(IPPROTO_TCP))
876 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 876 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 70899868783b..9b2c16b4d2ff 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
120 * because the semantic of CHECKSUM_HW is different there 120 * because the semantic of CHECKSUM_HW is different there
121 * and moreover root might send raw packets. 121 * and moreover root might send raw packets.
122 * FIXME: Source route IP option packets --RR */ 122 * FIXME: Source route IP option packets --RR */
123 if (hooknum == NF_IP_PRE_ROUTING && 123 if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
124 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { 124 nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
125 if (LOG_INVALID(IPPROTO_UDP)) 125 if (LOG_INVALID(IPPROTO_UDP))
126 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 126 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
new file mode 100644
index 000000000000..fc87ce0da40d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_sip.c
@@ -0,0 +1,471 @@
1/* SIP extension for IP connection tracking.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_conntrack_ftp.c and other modules.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/config.h>
12#include <linux/module.h>
13#include <linux/ctype.h>
14#include <linux/skbuff.h>
15#include <linux/in.h>
16#include <linux/ip.h>
17#include <linux/udp.h>
18
19#include <linux/netfilter.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
22#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
23
24#if 0
25#define DEBUGP printk
26#else
27#define DEBUGP(format, args...)
28#endif
29
30MODULE_LICENSE("GPL");
31MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
32MODULE_DESCRIPTION("SIP connection tracking helper");
33
34#define MAX_PORTS 8
35static unsigned short ports[MAX_PORTS];
36static int ports_c;
37module_param_array(ports, ushort, &ports_c, 0400);
38MODULE_PARM_DESC(ports, "port numbers of sip servers");
39
40static unsigned int sip_timeout = SIP_TIMEOUT;
41module_param(sip_timeout, uint, 0600);
42MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
43
44unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
45 enum ip_conntrack_info ctinfo,
46 struct ip_conntrack *ct,
47 const char **dptr);
48EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
49
50unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
51 enum ip_conntrack_info ctinfo,
52 struct ip_conntrack_expect *exp,
53 const char *dptr);
54EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
55
56int ct_sip_get_info(const char *dptr, size_t dlen,
57 unsigned int *matchoff,
58 unsigned int *matchlen,
59 struct sip_header_nfo *hnfo);
60EXPORT_SYMBOL_GPL(ct_sip_get_info);
61
62
63static int digits_len(const char *dptr, const char *limit, int *shift);
64static int epaddr_len(const char *dptr, const char *limit, int *shift);
65static int skp_digits_len(const char *dptr, const char *limit, int *shift);
66static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
67
/*
 * Table of the SIP/SDP fields the helper knows how to locate.
 *
 * As used by ct_sip_get_info(): a position in the payload is a candidate
 * when it starts with either .lname (compared over .lnlen bytes) or .sname
 * (compared over .snlen bytes).  Within that line .ln_str (.ln_strlen
 * bytes) is then searched for, and .match_len is called on the text that
 * immediately follows it to compute the length of the value.
 */
struct sip_header_nfo ct_sip_hdrs[] = {
	{ /* Via header */
		.lname = "Via:",
		.lnlen = sizeof("Via:") - 1,
		.sname = "\r\nv:",
		.snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
		.ln_str = "UDP ",
		.ln_strlen = sizeof("UDP ") - 1,
		.match_len = epaddr_len,
	},
	{ /* Contact header */
		.lname = "Contact:",
		.lnlen = sizeof("Contact:") - 1,
		.sname = "\r\nm:",
		.snlen = sizeof("\r\nm:") - 1,
		.ln_str = "sip:",
		.ln_strlen = sizeof("sip:") - 1,
		.match_len = skp_epaddr_len
	},
	{ /* Content length header */
		.lname = "Content-Length:",
		.lnlen = sizeof("Content-Length:") - 1,
		.sname = "\r\nl:",
		.snlen = sizeof("\r\nl:") - 1,
		.ln_str = ":",
		.ln_strlen = sizeof(":") - 1,
		.match_len = skp_digits_len
	},
	{ /* SDP media info */
		.lname = "\nm=",
		.lnlen = sizeof("\nm=") - 1,
		.sname = "\rm=",
		.snlen = sizeof("\rm=") - 1,
		.ln_str = "audio ",
		.ln_strlen = sizeof("audio ") - 1,
		.match_len = digits_len
	},
	{ /* SDP owner address */
		.lname = "\no=",
		.lnlen = sizeof("\no=") - 1,
		.sname = "\ro=",
		.snlen = sizeof("\ro=") - 1,
		.ln_str = "IN IP4 ",
		.ln_strlen = sizeof("IN IP4 ") - 1,
		.match_len = epaddr_len
	},
	{ /* SDP connection info */
		.lname = "\nc=",
		.lnlen = sizeof("\nc=") - 1,
		.sname = "\rc=",
		.snlen = sizeof("\rc=") - 1,
		.ln_str = "IN IP4 ",
		.ln_strlen = sizeof("IN IP4 ") - 1,
		.match_len = epaddr_len
	},
	{ /* Requests headers */
		.lname = "sip:",
		.lnlen = sizeof("sip:") - 1,
		.sname = "sip:",
		.snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */
		.ln_str = "@",
		.ln_strlen = sizeof("@") - 1,
		.match_len = epaddr_len
	},
	{ /* SDP version header */
		.lname = "\nv=",
		.lnlen = sizeof("\nv=") - 1,
		.sname = "\rv=",
		.snlen = sizeof("\rv=") - 1,
		.ln_str = "=",
		.ln_strlen = sizeof("=") - 1,
		.match_len = digits_len
	}
};
142EXPORT_SYMBOL_GPL(ct_sip_hdrs);
143
/*
 * Length of the line starting at @line: any leading CR/LF characters are
 * skipped (and counted), then everything up to, but not including, the
 * next CR or LF.  @limit is the last byte that may be examined.
 */
int ct_sip_lnlen(const char *line, const char *limit)
{
	const char *start = line;

	/* step over the line terminators we may be sitting on */
	for (; line <= limit; line++) {
		if (*line != '\r' && *line != '\n')
			break;
	}

	/* run to the end of this line or of the data */
	for (; line <= limit; line++) {
		if (*line == '\r' || *line == '\n')
			break;
	}
	return line - start;
}
159EXPORT_SYMBOL_GPL(ct_sip_lnlen);
160
/*
 * Linear string search, case sensitive.
 *
 * Looks for the @needle_len bytes at @needle inside the @haystack_len bytes
 * at @haystack.  Returns a pointer to the first occurrence, or NULL when
 * there is none.  An empty needle matches at @haystack.
 */
const char *ct_sip_search(const char *needle, const char *haystack,
			  size_t needle_len, size_t haystack_len)
{
	const char *limit;

	/*
	 * A needle longer than the haystack cannot match.  Bail out first:
	 * the unsigned subtraction below would otherwise wrap around and
	 * produce a limit pointer outside the buffer.
	 */
	if (needle_len > haystack_len)
		return NULL;

	/* last position at which a full needle still fits */
	limit = haystack + (haystack_len - needle_len);

	while (haystack <= limit) {
		if (memcmp(haystack, needle, needle_len) == 0)
			return haystack;
		haystack++;
	}
	return NULL;
}
174EXPORT_SYMBOL_GPL(ct_sip_search);
175
/*
 * Count the decimal digits at @dptr, never looking beyond @limit.
 * @shift is not used; it is only there so the function fits the
 * match_len callback signature.
 */
static int digits_len(const char *dptr, const char *limit, int *shift)
{
	const char *p;

	for (p = dptr; p <= limit && isdigit(*p); p++)
		;
	return p - dptr;
}
185
/*
 * Get the length of a run of digits, skipping blank spaces in front of it.
 * Every skipped space is added to *shift.
 */
static int skp_digits_len(const char *dptr, const char *limit, int *shift)
{
	while (dptr <= limit && *dptr == ' ') {
		dptr++;
		(*shift)++;
	}

	return digits_len(dptr, limit, shift);
}
194
195/* Simple ipaddr parser.. */
196static int parse_ipaddr(const char *cp, const char **endp,
197 u_int32_t *ipaddr, const char *limit)
198{
199 unsigned long int val;
200 int i, digit = 0;
201
202 for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
203 digit = 0;
204 if (!isdigit(*cp))
205 break;
206
207 val = simple_strtoul(cp, (char **)&cp, 10);
208 if (val > 0xFF)
209 return -1;
210
211 ((u_int8_t *)ipaddr)[i] = val;
212 digit = 1;
213
214 if (*cp != '.')
215 break;
216 cp++;
217 }
218 if (!digit)
219 return -1;
220
221 if (endp)
222 *endp = cp;
223
224 return 0;
225}
226
227/* skip ip address. returns it lenght. */
228static int epaddr_len(const char *dptr, const char *limit, int *shift)
229{
230 const char *aux = dptr;
231 u_int32_t ip;
232
233 if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) {
234 DEBUGP("ip: %s parse failed.!\n", dptr);
235 return 0;
236 }
237
238 /* Port number */
239 if (*dptr == ':') {
240 dptr++;
241 dptr += digits_len(dptr, limit, shift);
242 }
243 return dptr - aux;
244}
245
/*
 * Get the address length, skipping the user info ("user@") in front of it.
 *
 * The characters skipped, including the '@', are added to *shift.  When the
 * current line holds no '@' there is no user part: *shift is left as it was
 * and the address is parsed right at @dptr.
 */
static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
{
	const char *start = dptr;
	int s = *shift;

	/*
	 * Look for '@', but stop at the end of the line so that an '@' in a
	 * later header cannot be mistaken for this URI's user separator.
	 */
	while (dptr <= limit &&
	       *dptr != '@' && *dptr != '\r' && *dptr != '\n') {
		dptr++;
		(*shift)++;
	}

	/* only look at *dptr while it is still inside the data */
	if (dptr <= limit && *dptr == '@') {
		dptr++;
		(*shift)++;
	} else {
		/*
		 * No user part: undo the scan completely.  Restoring only
		 * *shift would leave dptr at the end of the scan and the
		 * address parse below could never succeed.
		 */
		dptr = start;
		*shift = s;
	}

	return epaddr_len(dptr, limit, shift);
}
262
263/* Returns 0 if not found, -1 error parsing. */
264int ct_sip_get_info(const char *dptr, size_t dlen,
265 unsigned int *matchoff,
266 unsigned int *matchlen,
267 struct sip_header_nfo *hnfo)
268{
269 const char *limit, *aux, *k = dptr;
270 int shift = 0;
271
272 limit = dptr + (dlen - hnfo->lnlen);
273
274 while (dptr <= limit) {
275 if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
276 (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
277 dptr++;
278 continue;
279 }
280 aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
281 ct_sip_lnlen(dptr, limit));
282 if (!aux) {
283 DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
284 hnfo->lname);
285 return -1;
286 }
287 aux += hnfo->ln_strlen;
288
289 *matchlen = hnfo->match_len(aux, limit, &shift);
290 if (!*matchlen)
291 return -1;
292
293 *matchoff = (aux - k) + shift;
294
295 DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
296 *matchlen);
297 return 1;
298 }
299 DEBUGP("%s header not found.\n", hnfo->lname);
300 return 0;
301}
302
303static int set_expected_rtp(struct sk_buff **pskb,
304 struct ip_conntrack *ct,
305 enum ip_conntrack_info ctinfo,
306 u_int32_t ipaddr, u_int16_t port,
307 const char *dptr)
308{
309 struct ip_conntrack_expect *exp;
310 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
311 int ret;
312
313 exp = ip_conntrack_expect_alloc(ct);
314 if (exp == NULL)
315 return NF_DROP;
316
317 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
318 exp->tuple.src.u.udp.port = 0;
319 exp->tuple.dst.ip = ipaddr;
320 exp->tuple.dst.u.udp.port = htons(port);
321 exp->tuple.dst.protonum = IPPROTO_UDP;
322
323 exp->mask.src.ip = 0xFFFFFFFF;
324 exp->mask.src.u.udp.port = 0;
325 exp->mask.dst.ip = 0xFFFFFFFF;
326 exp->mask.dst.u.udp.port = 0xFFFF;
327 exp->mask.dst.protonum = 0xFF;
328
329 exp->expectfn = NULL;
330 exp->flags = 0;
331
332 if (ip_nat_sdp_hook)
333 ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr);
334 else {
335 if (ip_conntrack_expect_related(exp) != 0)
336 ret = NF_DROP;
337 else
338 ret = NF_ACCEPT;
339 }
340 ip_conntrack_expect_put(exp);
341
342 return ret;
343}
344
345static int sip_help(struct sk_buff **pskb,
346 struct ip_conntrack *ct,
347 enum ip_conntrack_info ctinfo)
348{
349 unsigned int dataoff, datalen;
350 const char *dptr;
351 int ret = NF_ACCEPT;
352 int matchoff, matchlen;
353 u_int32_t ipaddr;
354 u_int16_t port;
355
356 /* No Data ? */
357 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
358 if (dataoff >= (*pskb)->len) {
359 DEBUGP("skb->len = %u\n", (*pskb)->len);
360 return NF_ACCEPT;
361 }
362
363 ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
364
365 if (!skb_is_nonlinear(*pskb))
366 dptr = (*pskb)->data + dataoff;
367 else {
368 DEBUGP("Copy of skbuff not supported yet.\n");
369 goto out;
370 }
371
372 if (ip_nat_sip_hook) {
373 if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) {
374 ret = NF_DROP;
375 goto out;
376 }
377 }
378
379 /* After this point NAT, could have mangled skb, so
380 we need to recalculate payload lenght. */
381 datalen = (*pskb)->len - dataoff;
382
383 if (datalen < (sizeof("SIP/2.0 200") - 1))
384 goto out;
385
386 /* RTP info only in some SDP pkts */
387 if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
388 memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
389 goto out;
390 }
391 /* Get ip and port address from SDP packet. */
392 if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
393 &ct_sip_hdrs[POS_CONNECTION]) > 0) {
394
395 /* We'll drop only if there are parse problems. */
396 if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
397 dptr + datalen) < 0) {
398 ret = NF_DROP;
399 goto out;
400 }
401 if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
402 &ct_sip_hdrs[POS_MEDIA]) > 0) {
403
404 port = simple_strtoul(dptr + matchoff, NULL, 10);
405 if (port < 1024) {
406 ret = NF_DROP;
407 goto out;
408 }
409 ret = set_expected_rtp(pskb, ct, ctinfo,
410 ipaddr, port, dptr);
411 }
412 }
413out:
414 return ret;
415}
416
417static struct ip_conntrack_helper sip[MAX_PORTS];
418static char sip_names[MAX_PORTS][10];
419
420static void fini(void)
421{
422 int i;
423 for (i = 0; i < ports_c; i++) {
424 DEBUGP("unregistering helper for port %d\n", ports[i]);
425 ip_conntrack_helper_unregister(&sip[i]);
426 }
427}
428
429static int __init init(void)
430{
431 int i, ret;
432 char *tmpname;
433
434 if (ports_c == 0)
435 ports[ports_c++] = SIP_PORT;
436
437 for (i = 0; i < ports_c; i++) {
438 /* Create helper structure */
439 memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
440
441 sip[i].tuple.dst.protonum = IPPROTO_UDP;
442 sip[i].tuple.src.u.udp.port = htons(ports[i]);
443 sip[i].mask.src.u.udp.port = 0xFFFF;
444 sip[i].mask.dst.protonum = 0xFF;
445 sip[i].max_expected = 1;
446 sip[i].timeout = 3 * 60; /* 3 minutes */
447 sip[i].me = THIS_MODULE;
448 sip[i].help = sip_help;
449
450 tmpname = &sip_names[i][0];
451 if (ports[i] == SIP_PORT)
452 sprintf(tmpname, "sip");
453 else
454 sprintf(tmpname, "sip-%d", i);
455 sip[i].name = tmpname;
456
457 DEBUGP("port #%d: %d\n", i, ports[i]);
458
459 ret = ip_conntrack_helper_register(&sip[i]);
460 if (ret) {
461 printk("ERROR registering helper for port %d\n",
462 ports[i]);
463 fini();
464 return ret;
465 }
466 }
467 return 0;
468}
469
470module_init(init);
471module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 929d61f7be91..88445aac3f28 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
189 return -ENOSPC; 189 return -ENOSPC;
190#endif 190#endif
191 191
192#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
193 if (seq_printf(s, "secmark=%u ", conntrack->secmark))
194 return -ENOSPC;
195#endif
196
192 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) 197 if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
193 return -ENOSPC; 198 return -ENOSPC;
194 199
@@ -417,7 +422,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum,
417 422
418 /* This is where we call the helper: as the packet goes out. */ 423 /* This is where we call the helper: as the packet goes out. */
419 ct = ip_conntrack_get(*pskb, &ctinfo); 424 ct = ip_conntrack_get(*pskb, &ctinfo);
420 if (ct && ct->helper) { 425 if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) {
421 unsigned int ret; 426 unsigned int ret;
422 ret = ct->helper->help(pskb, ct, ctinfo); 427 ret = ct->helper->help(pskb, ct, ctinfo);
423 if (ret != NF_ACCEPT) 428 if (ret != NF_ACCEPT)
@@ -564,6 +569,8 @@ extern unsigned int ip_ct_generic_timeout;
564static int log_invalid_proto_min = 0; 569static int log_invalid_proto_min = 0;
565static int log_invalid_proto_max = 255; 570static int log_invalid_proto_max = 255;
566 571
572int ip_conntrack_checksum = 1;
573
567static struct ctl_table_header *ip_ct_sysctl_header; 574static struct ctl_table_header *ip_ct_sysctl_header;
568 575
569static ctl_table ip_ct_sysctl_table[] = { 576static ctl_table ip_ct_sysctl_table[] = {
@@ -592,6 +599,14 @@ static ctl_table ip_ct_sysctl_table[] = {
592 .proc_handler = &proc_dointvec, 599 .proc_handler = &proc_dointvec,
593 }, 600 },
594 { 601 {
602 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
603 .procname = "ip_conntrack_checksum",
604 .data = &ip_conntrack_checksum,
605 .maxlen = sizeof(int),
606 .mode = 0644,
607 .proc_handler = &proc_dointvec,
608 },
609 {
595 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, 610 .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
596 .procname = "ip_conntrack_tcp_timeout_syn_sent", 611 .procname = "ip_conntrack_tcp_timeout_syn_sent",
597 .data = &ip_ct_tcp_timeout_syn_sent, 612 .data = &ip_ct_tcp_timeout_syn_sent,
@@ -946,6 +961,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
946EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); 961EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
947EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); 962EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
948EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); 963EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
964EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
949#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ 965#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
950 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) 966 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
951EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); 967EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
index d45663d137a7..419b878fb467 100644
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ b/net/ipv4/netfilter/ip_nat_helper_h323.c
@@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
487} 487}
488 488
489/****************************************************************************/ 489/****************************************************************************/
490static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
491 struct ip_conntrack_expect *this)
492{
493 struct ip_nat_range range;
494
495 /* This must be a fresh one. */
496 BUG_ON(new->status & IPS_NAT_DONE_MASK);
497
498 /* Change src to where master sends to */
499 range.flags = IP_NAT_RANGE_MAP_IPS;
500 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
501
502 /* hook doesn't matter, but it has to do source manip */
503 ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
504
505 /* For DST manip, map port here to where it's expected. */
506 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
507 range.min = range.max = this->saved_proto;
508 range.min_ip = range.max_ip = this->saved_ip;
509
510 /* hook doesn't matter, but it has to do destination manip */
511 ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
512
513 ip_conntrack_q931_expect(new, this);
514}
515
516/****************************************************************************/
517static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
518 enum ip_conntrack_info ctinfo,
519 unsigned char **data, int dataoff,
520 TransportAddress * addr, u_int16_t port,
521 struct ip_conntrack_expect *exp)
522{
523 int dir = CTINFO2DIR(ctinfo);
524 u_int16_t nated_port;
525
526 /* Set expectations for NAT */
527 exp->saved_ip = exp->tuple.dst.ip;
528 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
529 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
530 exp->expectfn = ip_nat_callforwarding_expect;
531 exp->dir = !dir;
532
533 /* Try to get same port: if not, try to change it. */
534 for (nated_port = port; nated_port != 0; nated_port++) {
535 exp->tuple.dst.u.tcp.port = htons(nated_port);
536 if (ip_conntrack_expect_related(exp) == 0)
537 break;
538 }
539
540 if (nated_port == 0) { /* No port available */
541 if (net_ratelimit())
542 printk("ip_nat_q931: out of TCP ports\n");
543 return 0;
544 }
545
546 /* Modify signal */
547 if (!set_h225_addr(pskb, data, dataoff, addr,
548 ct->tuplehash[!dir].tuple.dst.ip,
549 nated_port) == 0) {
550 ip_conntrack_unexpect_related(exp);
551 return -1;
552 }
553
554 /* Success */
555 DEBUGP("ip_nat_q931: expect Call Forwarding "
556 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
557 NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
558 NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
559
560 return 0;
561}
562
563/****************************************************************************/
490static int __init init(void) 564static int __init init(void)
491{ 565{
492 BUG_ON(set_h245_addr_hook != NULL); 566 BUG_ON(set_h245_addr_hook != NULL);
@@ -496,6 +570,7 @@ static int __init init(void)
496 BUG_ON(nat_rtp_rtcp_hook != NULL); 570 BUG_ON(nat_rtp_rtcp_hook != NULL);
497 BUG_ON(nat_t120_hook != NULL); 571 BUG_ON(nat_t120_hook != NULL);
498 BUG_ON(nat_h245_hook != NULL); 572 BUG_ON(nat_h245_hook != NULL);
573 BUG_ON(nat_callforwarding_hook != NULL);
499 BUG_ON(nat_q931_hook != NULL); 574 BUG_ON(nat_q931_hook != NULL);
500 575
501 set_h245_addr_hook = set_h245_addr; 576 set_h245_addr_hook = set_h245_addr;
@@ -505,6 +580,7 @@ static int __init init(void)
505 nat_rtp_rtcp_hook = nat_rtp_rtcp; 580 nat_rtp_rtcp_hook = nat_rtp_rtcp;
506 nat_t120_hook = nat_t120; 581 nat_t120_hook = nat_t120;
507 nat_h245_hook = nat_h245; 582 nat_h245_hook = nat_h245;
583 nat_callforwarding_hook = nat_callforwarding;
508 nat_q931_hook = nat_q931; 584 nat_q931_hook = nat_q931;
509 585
510 DEBUGP("ip_nat_h323: init success\n"); 586 DEBUGP("ip_nat_h323: init success\n");
@@ -521,6 +597,7 @@ static void __exit fini(void)
521 nat_rtp_rtcp_hook = NULL; 597 nat_rtp_rtcp_hook = NULL;
522 nat_t120_hook = NULL; 598 nat_t120_hook = NULL;
523 nat_h245_hook = NULL; 599 nat_h245_hook = NULL;
600 nat_callforwarding_hook = NULL;
524 nat_q931_hook = NULL; 601 nat_q931_hook = NULL;
525 synchronize_net(); 602 synchronize_net();
526} 603}
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
new file mode 100644
index 000000000000..6ffba63adca2
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_sip.c
@@ -0,0 +1,249 @@
1/* SIP extension for UDP NAT alteration.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/ip.h>
14#include <linux/udp.h>
15
16#include <linux/netfilter_ipv4.h>
17#include <linux/netfilter_ipv4/ip_nat.h>
18#include <linux/netfilter_ipv4/ip_nat_helper.h>
19#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
20#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
24MODULE_DESCRIPTION("SIP NAT helper");
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(format, args...)
30#endif
31
32extern struct sip_header_nfo ct_sip_hdrs[];
33
34static unsigned int mangle_sip_packet(struct sk_buff **pskb,
35 enum ip_conntrack_info ctinfo,
36 struct ip_conntrack *ct,
37 const char **dptr, size_t dlen,
38 char *buffer, int bufflen,
39 struct sip_header_nfo *hnfo)
40{
41 unsigned int matchlen, matchoff;
42
43 if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0)
44 return 0;
45
46 if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
47 matchoff, matchlen, buffer, bufflen))
48 return 0;
49
50 /* We need to reload this. Thanks Patrick. */
51 *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
52 return 1;
53}
54
55static unsigned int ip_nat_sip(struct sk_buff **pskb,
56 enum ip_conntrack_info ctinfo,
57 struct ip_conntrack *ct,
58 const char **dptr)
59{
60 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
61 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
62 unsigned int bufflen, dataoff;
63 u_int32_t ip;
64 u_int16_t port;
65
66 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
67
68 ip = ct->tuplehash[!dir].tuple.dst.ip;
69 port = ct->tuplehash[!dir].tuple.dst.u.udp.port;
70 bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port));
71
72 /* short packet ? */
73 if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1))
74 return 0;
75
76 /* Basic rules: requests and responses. */
77 if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) {
78 const char *aux;
79
80 if ((ctinfo) < IP_CT_IS_REPLY) {
81 mangle_sip_packet(pskb, ctinfo, ct, dptr,
82 (*pskb)->len - dataoff,
83 buffer, bufflen,
84 &ct_sip_hdrs[POS_CONTACT]);
85 return 1;
86 }
87
88 if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
89 (*pskb)->len - dataoff,
90 buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
91 return 0;
92
93 /* This search should ignore case, but later.. */
94 aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1,
95 (*pskb)->len - dataoff);
96 if (!aux)
97 return 0;
98
99 if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"),
100 ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff)))
101 return 1;
102
103 return mangle_sip_packet(pskb, ctinfo, ct, dptr,
104 (*pskb)->len - dataoff,
105 buffer, bufflen,
106 &ct_sip_hdrs[POS_CONTACT]);
107 }
108 if ((ctinfo) < IP_CT_IS_REPLY) {
109 if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
110 (*pskb)->len - dataoff,
111 buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
112 return 0;
113
114 /* Mangle Contact if exists only. - watch udp_nat_mangle()! */
115 mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff,
116 buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]);
117 return 1;
118 }
119 /* This mangle requests headers. */
120 return mangle_sip_packet(pskb, ctinfo, ct, dptr,
121 ct_sip_lnlen(*dptr,
122 *dptr + (*pskb)->len - dataoff),
123 buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]);
124}
125
126static int mangle_content_len(struct sk_buff **pskb,
127 enum ip_conntrack_info ctinfo,
128 struct ip_conntrack *ct,
129 const char *dptr)
130{
131 unsigned int dataoff, matchoff, matchlen;
132 char buffer[sizeof("65536")];
133 int bufflen;
134
135 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
136
137 /* Get actual SDP lenght */
138 if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
139 &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) {
140
141 /* since ct_sip_get_info() give us a pointer passing 'v='
142 we need to add 2 bytes in this count. */
143 int c_len = (*pskb)->len - dataoff - matchoff + 2;
144
145 /* Now, update SDP lenght */
146 if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
147 &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) {
148
149 bufflen = sprintf(buffer, "%u", c_len);
150
151 return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
152 matchoff, matchlen,
153 buffer, bufflen);
154 }
155 }
156 return 0;
157}
158
159static unsigned int mangle_sdp(struct sk_buff **pskb,
160 enum ip_conntrack_info ctinfo,
161 struct ip_conntrack *ct,
162 u_int32_t newip, u_int16_t port,
163 const char *dptr)
164{
165 char buffer[sizeof("nnn.nnn.nnn.nnn")];
166 unsigned int dataoff, bufflen;
167
168 dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
169
170 /* Mangle owner and contact info. */
171 bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
172 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
173 buffer, bufflen, &ct_sip_hdrs[POS_OWNER]))
174 return 0;
175
176 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
177 buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION]))
178 return 0;
179
180 /* Mangle media port. */
181 bufflen = sprintf(buffer, "%u", port);
182 if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
183 buffer, bufflen, &ct_sip_hdrs[POS_MEDIA]))
184 return 0;
185
186 return mangle_content_len(pskb, ctinfo, ct, dptr);
187}
188
189/* So, this packet has hit the connection tracking matching code.
190 Mangle it, and change the expectation to match the new version. */
191static unsigned int ip_nat_sdp(struct sk_buff **pskb,
192 enum ip_conntrack_info ctinfo,
193 struct ip_conntrack_expect *exp,
194 const char *dptr)
195{
196 struct ip_conntrack *ct = exp->master;
197 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
198 u_int32_t newip;
199 u_int16_t port;
200
201 DEBUGP("ip_nat_sdp():\n");
202
203 /* Connection will come from reply */
204 newip = ct->tuplehash[!dir].tuple.dst.ip;
205
206 exp->tuple.dst.ip = newip;
207 exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
208 exp->dir = !dir;
209
210 /* When you see the packet, we need to NAT it the same as the
211 this one. */
212 exp->expectfn = ip_nat_follow_master;
213
214 /* Try to get same port: if not, try to change it. */
215 for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
216 exp->tuple.dst.u.udp.port = htons(port);
217 if (ip_conntrack_expect_related(exp) == 0)
218 break;
219 }
220
221 if (port == 0)
222 return NF_DROP;
223
224 if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
225 ip_conntrack_unexpect_related(exp);
226 return NF_DROP;
227 }
228 return NF_ACCEPT;
229}
230
231static void __exit fini(void)
232{
233 ip_nat_sip_hook = NULL;
234 ip_nat_sdp_hook = NULL;
235 /* Make sure noone calls it, meanwhile. */
236 synchronize_net();
237}
238
239static int __init init(void)
240{
241 BUG_ON(ip_nat_sip_hook);
242 BUG_ON(ip_nat_sdp_hook);
243 ip_nat_sip_hook = ip_nat_sip;
244 ip_nat_sdp_hook = ip_nat_sdp;
245 return 0;
246}
247
248module_init(init);
249module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index c33244263b90..d20d557f915a 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void)
1348module_init(ip_nat_snmp_basic_init); 1348module_init(ip_nat_snmp_basic_init);
1349module_exit(ip_nat_snmp_basic_fini); 1349module_exit(ip_nat_snmp_basic_fini);
1350 1350
1351module_param(debug, bool, 0600); 1351module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index aad9d28c8d71..dbc83c5d7aa6 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
241 struct iphdr *iph = skb->nh.iph; 241 struct iphdr *iph = skb->nh.iph;
242 unsigned long hashval; 242 unsigned long hashval;
243 u_int16_t sport, dport; 243 u_int16_t sport, dport;
244 struct tcphdr *th; 244 u_int16_t *ports;
245 struct udphdr *uh;
246 struct icmphdr *ih;
247 245
248 switch (iph->protocol) { 246 switch (iph->protocol) {
249 case IPPROTO_TCP: 247 case IPPROTO_TCP:
250 th = (void *)iph+iph->ihl*4;
251 sport = ntohs(th->source);
252 dport = ntohs(th->dest);
253 break;
254 case IPPROTO_UDP: 248 case IPPROTO_UDP:
255 uh = (void *)iph+iph->ihl*4; 249 case IPPROTO_SCTP:
256 sport = ntohs(uh->source); 250 case IPPROTO_DCCP:
257 dport = ntohs(uh->dest);
258 break;
259 case IPPROTO_ICMP: 251 case IPPROTO_ICMP:
260 ih = (void *)iph+iph->ihl*4; 252 ports = (void *)iph+iph->ihl*4;
261 sport = ntohs(ih->un.echo.id); 253 sport = ports[0];
262 dport = (ih->type<<8)|ih->code; 254 dport = ports[1];
263 break; 255 break;
264 default: 256 default:
265 if (net_ratelimit()) { 257 if (net_ratelimit()) {
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 0bba3c2bb786..431a3ce6f7b7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
147 /* This packet will not be the same as the other: clear nf fields */ 147 /* This packet will not be the same as the other: clear nf fields */
148 nf_reset(nskb); 148 nf_reset(nskb);
149 nskb->nfmark = 0; 149 nskb->nfmark = 0;
150 skb_init_secmark(nskb);
150 151
151 tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); 152 tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
152 153
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 7c6836c4646e..92980ab8ce48 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -28,9 +28,6 @@
28#include <linux/jhash.h> 28#include <linux/jhash.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/tcp.h>
32#include <linux/udp.h>
33#include <linux/sctp.h>
34#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
35#include <linux/seq_file.h> 32#include <linux/seq_file.h>
36#include <linux/list.h> 33#include <linux/list.h>
@@ -83,6 +80,7 @@ struct ipt_hashlimit_htable {
83 /* used internally */ 80 /* used internally */
84 spinlock_t lock; /* lock for list_head */ 81 spinlock_t lock; /* lock for list_head */
85 u_int32_t rnd; /* random seed for hash */ 82 u_int32_t rnd; /* random seed for hash */
83 int rnd_initialized;
86 struct timer_list timer; /* timer for gc */ 84 struct timer_list timer; /* timer for gc */
87 atomic_t count; /* number entries in table */ 85 atomic_t count; /* number entries in table */
88 86
@@ -137,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
137 135
138 /* initialize hash with random val at the time we allocate 136 /* initialize hash with random val at the time we allocate
139 * the first hashtable entry */ 137 * the first hashtable entry */
140 if (!ht->rnd) 138 if (!ht->rnd_initialized) {
141 get_random_bytes(&ht->rnd, 4); 139 get_random_bytes(&ht->rnd, 4);
140 ht->rnd_initialized = 1;
141 }
142 142
143 if (ht->cfg.max && 143 if (ht->cfg.max &&
144 atomic_read(&ht->count) >= ht->cfg.max) { 144 atomic_read(&ht->count) >= ht->cfg.max) {
@@ -217,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
217 217
218 atomic_set(&hinfo->count, 0); 218 atomic_set(&hinfo->count, 0);
219 atomic_set(&hinfo->use, 1); 219 atomic_set(&hinfo->use, 1);
220 hinfo->rnd = 0; 220 hinfo->rnd_initialized = 0;
221 spin_lock_init(&hinfo->lock); 221 spin_lock_init(&hinfo->lock);
222 hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); 222 hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
223 if (!hinfo->pde) { 223 if (!hinfo->pde) {
@@ -381,49 +381,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
381 dh->rateinfo.credit = dh->rateinfo.credit_cap; 381 dh->rateinfo.credit = dh->rateinfo.credit_cap;
382} 382}
383 383
384static inline int get_ports(const struct sk_buff *skb, int offset,
385 u16 ports[2])
386{
387 union {
388 struct tcphdr th;
389 struct udphdr uh;
390 sctp_sctphdr_t sctph;
391 } hdr_u, *ptr_u;
392
393 /* Must not be a fragment. */
394 if (offset)
395 return 1;
396
397 /* Must be big enough to read ports (both UDP and TCP have
398 them at the start). */
399 ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u);
400 if (!ptr_u)
401 return 1;
402
403 switch (skb->nh.iph->protocol) {
404 case IPPROTO_TCP:
405 ports[0] = ptr_u->th.source;
406 ports[1] = ptr_u->th.dest;
407 break;
408 case IPPROTO_UDP:
409 ports[0] = ptr_u->uh.source;
410 ports[1] = ptr_u->uh.dest;
411 break;
412 case IPPROTO_SCTP:
413 ports[0] = ptr_u->sctph.source;
414 ports[1] = ptr_u->sctph.dest;
415 break;
416 default:
417 /* all other protocols don't supprot per-port hash
418 * buckets */
419 ports[0] = ports[1] = 0;
420 break;
421 }
422
423 return 0;
424}
425
426
427static int 384static int
428hashlimit_match(const struct sk_buff *skb, 385hashlimit_match(const struct sk_buff *skb,
429 const struct net_device *in, 386 const struct net_device *in,
@@ -449,8 +406,22 @@ hashlimit_match(const struct sk_buff *skb,
449 dst.src_ip = skb->nh.iph->saddr; 406 dst.src_ip = skb->nh.iph->saddr;
450 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT 407 if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
451 ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { 408 ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
452 u_int16_t ports[2]; 409 u_int16_t _ports[2], *ports;
453 if (get_ports(skb, offset, ports)) { 410
411 switch (skb->nh.iph->protocol) {
412 case IPPROTO_TCP:
413 case IPPROTO_UDP:
414 case IPPROTO_SCTP:
415 case IPPROTO_DCCP:
416 ports = skb_header_pointer(skb, skb->nh.iph->ihl*4,
417 sizeof(_ports), &_ports);
418 break;
419 default:
420 _ports[0] = _ports[1] = 0;
421 ports = _ports;
422 break;
423 }
424 if (!ports) {
454 /* We've been asked to examine this packet, and we 425 /* We've been asked to examine this packet, and we
455 can't. Hence, no choice but to drop. */ 426 can't. Hence, no choice but to drop. */
456 *hotdrop = 1; 427 *hotdrop = 1;
@@ -561,7 +532,7 @@ static void
561hashlimit_destroy(const struct xt_match *match, void *matchinfo, 532hashlimit_destroy(const struct xt_match *match, void *matchinfo,
562 unsigned int matchsize) 533 unsigned int matchsize)
563{ 534{
564 struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; 535 struct ipt_hashlimit_info *r = matchinfo;
565 536
566 htable_put(r->hinfo); 537 htable_put(r->hinfo);
567} 538}
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index b847ee409efb..61a2139f9cfd 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -1,1007 +1,499 @@
1/* Kernel module to check if the source address has been seen recently. */ 1/*
2/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ 2 * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
3/* Author: Stephen Frost <sfrost@snowman.net> */ 3 *
4/* Project Page: http://snowman.net/projects/ipt_recent/ */ 4 * This program is free software; you can redistribute it and/or modify
5/* This software is distributed under the terms of the GPL, Version 2 */ 5 * it under the terms of the GNU General Public License version 2 as
6/* This copyright does not cover user programs that use kernel services 6 * published by the Free Software Foundation.
7 * by normal system calls. */ 7 *
8 8 * This is a replacement of the old ipt_recent module, which carried the
9#include <linux/module.h> 9 * following copyright notice:
10#include <linux/skbuff.h> 10 *
11 * Author: Stephen Frost <sfrost@snowman.net>
12 * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
13 */
14#include <linux/init.h>
15#include <linux/moduleparam.h>
11#include <linux/proc_fs.h> 16#include <linux/proc_fs.h>
12#include <linux/spinlock.h> 17#include <linux/seq_file.h>
13#include <linux/interrupt.h> 18#include <linux/string.h>
14#include <asm/uaccess.h>
15#include <linux/ctype.h> 19#include <linux/ctype.h>
16#include <linux/ip.h> 20#include <linux/list.h>
17#include <linux/vmalloc.h> 21#include <linux/random.h>
18#include <linux/moduleparam.h> 22#include <linux/jhash.h>
23#include <linux/bitops.h>
24#include <linux/skbuff.h>
25#include <linux/inet.h>
19 26
20#include <linux/netfilter_ipv4/ip_tables.h> 27#include <linux/netfilter_ipv4/ip_tables.h>
21#include <linux/netfilter_ipv4/ipt_recent.h> 28#include <linux/netfilter_ipv4/ipt_recent.h>
22 29
23#undef DEBUG 30MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24#define HASH_LOG 9 31MODULE_DESCRIPTION("IP tables recently seen matching module");
32MODULE_LICENSE("GPL");
25 33
26/* Defaults, these can be overridden on the module command-line. */
27static unsigned int ip_list_tot = 100; 34static unsigned int ip_list_tot = 100;
28static unsigned int ip_pkt_list_tot = 20; 35static unsigned int ip_pkt_list_tot = 20;
29static unsigned int ip_list_hash_size = 0; 36static unsigned int ip_list_hash_size = 0;
30static unsigned int ip_list_perms = 0644; 37static unsigned int ip_list_perms = 0644;
31#ifdef DEBUG
32static int debug = 1;
33#endif
34
35static char version[] =
36KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n";
37
38MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
39MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
40MODULE_LICENSE("GPL");
41module_param(ip_list_tot, uint, 0400); 38module_param(ip_list_tot, uint, 0400);
42module_param(ip_pkt_list_tot, uint, 0400); 39module_param(ip_pkt_list_tot, uint, 0400);
43module_param(ip_list_hash_size, uint, 0400); 40module_param(ip_list_hash_size, uint, 0400);
44module_param(ip_list_perms, uint, 0400); 41module_param(ip_list_perms, uint, 0400);
45#ifdef DEBUG 42MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
46module_param(debug, bool, 0600); 43MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
47MODULE_PARM_DESC(debug,"enable debugging output"); 44MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
48#endif 45MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
49MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); 46
50MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); 47
51MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); 48struct recent_entry {
52MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); 49 struct list_head list;
53 50 struct list_head lru_list;
54/* Structure of our list of recently seen addresses. */ 51 u_int32_t addr;
55struct recent_ip_list { 52 u_int8_t ttl;
56 u_int32_t addr; 53 u_int8_t index;
57 u_int8_t ttl; 54 u_int16_t nstamps;
58 unsigned long last_seen; 55 unsigned long stamps[0];
59 unsigned long *last_pkts;
60 u_int32_t oldest_pkt;
61 u_int32_t hash_entry;
62 u_int32_t time_pos;
63};
64
65struct time_info_list {
66 u_int32_t position;
67 u_int32_t time;
68}; 56};
69 57
70/* Structure of our linked list of tables of recent lists. */ 58struct recent_table {
71struct recent_ip_tables { 59 struct list_head list;
72 char name[IPT_RECENT_NAME_LEN]; 60 char name[IPT_RECENT_NAME_LEN];
73 int count;
74 int time_pos;
75 struct recent_ip_list *table;
76 struct recent_ip_tables *next;
77 spinlock_t list_lock;
78 int *hash_table;
79 struct time_info_list *time_info;
80#ifdef CONFIG_PROC_FS 61#ifdef CONFIG_PROC_FS
81 struct proc_dir_entry *status_proc; 62 struct proc_dir_entry *proc;
82#endif /* CONFIG_PROC_FS */ 63#endif
64 unsigned int refcnt;
65 unsigned int entries;
66 struct list_head lru_list;
67 struct list_head iphash[0];
83}; 68};
84 69
85/* Our current list of addresses we have recently seen. 70static LIST_HEAD(tables);
86 * Only added to on a --set, and only updated on --set || --update
87 */
88static struct recent_ip_tables *r_tables = NULL;
89
90/* We protect r_list with this spinlock so two processors are not modifying
91 * the list at the same time.
92 */
93static DEFINE_SPINLOCK(recent_lock); 71static DEFINE_SPINLOCK(recent_lock);
72static DEFINE_MUTEX(recent_mutex);
94 73
95#ifdef CONFIG_PROC_FS 74#ifdef CONFIG_PROC_FS
96/* Our /proc/net/ipt_recent entry */ 75static struct proc_dir_entry *proc_dir;
97static struct proc_dir_entry *proc_net_ipt_recent = NULL; 76static struct file_operations recent_fops;
98#endif
99
100/* Function declaration for later. */
101static int
102match(const struct sk_buff *skb,
103 const struct net_device *in,
104 const struct net_device *out,
105 const struct xt_match *match,
106 const void *matchinfo,
107 int offset,
108 unsigned int protoff,
109 int *hotdrop);
110
111/* Function to hash a given address into the hash table of table_size size */
112static int hash_func(unsigned int addr, int table_size)
113{
114 int result = 0;
115 unsigned int value = addr;
116 do { result ^= value; } while((value >>= HASH_LOG));
117
118#ifdef DEBUG
119 if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
120 result & (table_size - 1),
121 addr,
122 table_size);
123#endif 77#endif
124 78
125 return(result & (table_size - 1)); 79static u_int32_t hash_rnd;
126} 80static int hash_rnd_initted;
127 81
128#ifdef CONFIG_PROC_FS 82static unsigned int recent_entry_hash(u_int32_t addr)
129/* This is the function which produces the output for our /proc output
130 * interface which lists each IP address, the last seen time and the
131 * other recent times the address was seen.
132 */
133
134static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
135{ 83{
136 int len = 0, count, last_len = 0, pkt_count; 84 if (!hash_rnd_initted) {
137 off_t pos = 0; 85 get_random_bytes(&hash_rnd, 4);
138 off_t begin = 0; 86 hash_rnd_initted = 1;
139 struct recent_ip_tables *curr_table;
140
141 curr_table = (struct recent_ip_tables*) data;
142
143 spin_lock_bh(&curr_table->list_lock);
144 for(count = 0; count < ip_list_tot; count++) {
145 if(!curr_table->table[count].addr) continue;
146 last_len = len;
147 len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
148 len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
149 len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
150 len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
151 len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
152 for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
153 if(!curr_table->table[count].last_pkts[pkt_count]) break;
154 len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
155 }
156 len += sprintf(buffer+len,"\n");
157 pos = begin + len;
158 if(pos < offset) { len = 0; begin = pos; }
159 if(pos > offset + length) { len = last_len; break; }
160 } 87 }
161 88 return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1);
162 *start = buffer + (offset - begin);
163 len -= (offset - begin);
164 if(len > length) len = length;
165
166 spin_unlock_bh(&curr_table->list_lock);
167 return len;
168} 89}
169 90
170/* ip_recent_ctrl provides an interface for users to modify the table 91static struct recent_entry *
171 * directly. This allows adding entries, removing entries, and 92recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl)
172 * flushing the entire table.
173 * This is done by opening up the appropriate table for writing and
174 * sending one of:
175 * xx.xx.xx.xx -- Add entry to table with current time
176 * +xx.xx.xx.xx -- Add entry to table with current time
177 * -xx.xx.xx.xx -- Remove entry from table
178 * clear -- Flush table, remove all entries
179 */
180
181static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
182{ 93{
183 static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; 94 struct recent_entry *e;
184 u_int32_t val; 95 unsigned int h;
185 int base, used = 0; 96
186 char c, *cp; 97 h = recent_entry_hash(addr);
187 union iaddr { 98 list_for_each_entry(e, &table->iphash[h], list)
188 uint8_t bytes[4]; 99 if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
189 uint32_t word; 100 return e;
190 } res; 101 return NULL;
191 uint8_t *pp = res.bytes; 102}
192 int digit;
193
194 char buffer[20];
195 int len, check_set = 0, count;
196 u_int32_t addr = 0;
197 struct sk_buff *skb;
198 struct ipt_recent_info *info;
199 struct recent_ip_tables *curr_table;
200
201 curr_table = (struct recent_ip_tables*) data;
202
203 if(size > 20) len = 20; else len = size;
204
205 if(copy_from_user(buffer,input,len)) return -EFAULT;
206
207 if(len < 20) buffer[len] = '\0';
208
209#ifdef DEBUG
210 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
211#endif
212 103
213 cp = buffer; 104static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
214 while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } 105{
106 list_del(&e->list);
107 list_del(&e->lru_list);
108 kfree(e);
109 t->entries--;
110}
215 111
216 /* Check if we are asked to flush the entire table */ 112static struct recent_entry *
217 if(!memcmp(cp,"clear",5)) { 113recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl)
218 used += 5; 114{
219 spin_lock_bh(&curr_table->list_lock); 115 struct recent_entry *e;
220 curr_table->time_pos = 0;
221 for(count = 0; count < ip_list_hash_size; count++) {
222 curr_table->hash_table[count] = -1;
223 }
224 for(count = 0; count < ip_list_tot; count++) {
225 curr_table->table[count].last_seen = 0;
226 curr_table->table[count].addr = 0;
227 curr_table->table[count].ttl = 0;
228 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
229 curr_table->table[count].oldest_pkt = 0;
230 curr_table->table[count].time_pos = 0;
231 curr_table->time_info[count].position = count;
232 curr_table->time_info[count].time = 0;
233 }
234 spin_unlock_bh(&curr_table->list_lock);
235 return used;
236 }
237 116
238 check_set = IPT_RECENT_SET; 117 if (t->entries >= ip_list_tot) {
239 switch(*cp) { 118 e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
240 case '+': check_set = IPT_RECENT_SET; cp++; used++; break; 119 recent_entry_remove(t, e);
241 case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
242 default: if(!isdigit(*cp)) return (used+1); break;
243 } 120 }
121 e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot,
122 GFP_ATOMIC);
123 if (e == NULL)
124 return NULL;
125 e->addr = addr;
126 e->ttl = ttl;
127 e->stamps[0] = jiffies;
128 e->nstamps = 1;
129 e->index = 1;
130 list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
131 list_add_tail(&e->lru_list, &t->lru_list);
132 t->entries++;
133 return e;
134}
244 135
245#ifdef DEBUG 136static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
246 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); 137{
247#endif 138 e->stamps[e->index++] = jiffies;
248 /* Get addr (effectively inet_aton()) */ 139 if (e->index > e->nstamps)
249 /* Shamelessly stolen from libc, a function in the kernel for doing 140 e->nstamps = e->index;
250 * this would, of course, be greatly preferred, but our options appear 141 e->index %= ip_pkt_list_tot;
251 * to be rather limited, so we will just do it ourselves here. 142 list_move_tail(&e->lru_list, &t->lru_list);
252 */ 143}
253 res.word = 0;
254
255 c = *cp;
256 for(;;) {
257 if(!isdigit(c)) return used;
258 val = 0; base = 10; digit = 0;
259 if(c == '0') {
260 c = *++cp;
261 if(c == 'x' || c == 'X') base = 16, c = *++cp;
262 else { base = 8; digit = 1; }
263 }
264 for(;;) {
265 if(isascii(c) && isdigit(c)) {
266 if(base == 8 && (c == '8' || c == '0')) return used;
267 val = (val * base) + (c - '0');
268 c = *++cp;
269 digit = 1;
270 } else if(base == 16 && isascii(c) && isxdigit(c)) {
271 val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
272 c = *++cp;
273 digit = 1;
274 } else break;
275 }
276 if(c == '.') {
277 if(pp > res.bytes + 2 || val > 0xff) return used;
278 *pp++ = val;
279 c = *++cp;
280 } else break;
281 }
282 used = cp - buffer;
283 if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
284 if(c == '\n') used++;
285 if(!digit) return used;
286 144
287 if(val > max[pp - res.bytes]) return used; 145static struct recent_table *recent_table_lookup(const char *name)
288 addr = res.word | htonl(val); 146{
147 struct recent_table *t;
289 148
290 if(!addr && check_set == IPT_RECENT_SET) return used; 149 list_for_each_entry(t, &tables, list)
150 if (!strcmp(t->name, name))
151 return t;
152 return NULL;
153}
291 154
292#ifdef DEBUG 155static void recent_table_flush(struct recent_table *t)
293 if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); 156{
294#endif 157 struct recent_entry *e, *next;
158 unsigned int i;
295 159
296 /* Set up and just call match */ 160 for (i = 0; i < ip_list_hash_size; i++) {
297 info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); 161 list_for_each_entry_safe(e, next, &t->iphash[i], list)
298 if(!info) { return -ENOMEM; } 162 recent_entry_remove(t, e);
299 info->seconds = 0;
300 info->hit_count = 0;
301 info->check_set = check_set;
302 info->invert = 0;
303 info->side = IPT_RECENT_SOURCE;
304 strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
305 info->name[IPT_RECENT_NAME_LEN-1] = '\0';
306
307 skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
308 if (!skb) {
309 used = -ENOMEM;
310 goto out_free_info;
311 }
312 skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
313 if (!skb->nh.iph) {
314 used = -ENOMEM;
315 goto out_free_skb;
316 } 163 }
317
318 skb->nh.iph->saddr = addr;
319 skb->nh.iph->daddr = 0;
320 /* Clear ttl since we have no way of knowing it */
321 skb->nh.iph->ttl = 0;
322 match(skb,NULL,NULL,NULL,info,0,0,NULL);
323
324 kfree(skb->nh.iph);
325out_free_skb:
326 kfree(skb);
327out_free_info:
328 kfree(info);
329
330#ifdef DEBUG
331 if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
332#endif
333 return used;
334} 164}
335 165
336#endif /* CONFIG_PROC_FS */
337
338/* 'match' is our primary function, called by the kernel whenever a rule is
339 * hit with our module as an option to it.
340 * What this function does depends on what was specifically asked of it by
341 * the user:
342 * --set -- Add or update last seen time of the source address of the packet
343 * -- matchinfo->check_set == IPT_RECENT_SET
344 * --rcheck -- Just check if the source address is in the list
345 * -- matchinfo->check_set == IPT_RECENT_CHECK
346 * --update -- If the source address is in the list, update last_seen
347 * -- matchinfo->check_set == IPT_RECENT_UPDATE
348 * --remove -- If the source address is in the list, remove it
349 * -- matchinfo->check_set == IPT_RECENT_REMOVE
350 * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
351 * -- matchinfo->seconds
352 * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
353 * -- matchinfo->hit_count
354 * --seconds and --hitcount can be combined
355 */
356static int 166static int
357match(const struct sk_buff *skb, 167ipt_recent_match(const struct sk_buff *skb,
358 const struct net_device *in, 168 const struct net_device *in, const struct net_device *out,
359 const struct net_device *out, 169 const struct xt_match *match, const void *matchinfo,
360 const struct xt_match *match, 170 int offset, unsigned int protoff, int *hotdrop)
361 const void *matchinfo,
362 int offset,
363 unsigned int protoff,
364 int *hotdrop)
365{ 171{
366 int pkt_count, hits_found, ans;
367 unsigned long now;
368 const struct ipt_recent_info *info = matchinfo; 172 const struct ipt_recent_info *info = matchinfo;
369 u_int32_t addr = 0, time_temp; 173 struct recent_table *t;
370 u_int8_t ttl = skb->nh.iph->ttl; 174 struct recent_entry *e;
371 int *hash_table; 175 u_int32_t addr;
372 int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; 176 u_int8_t ttl;
373 struct time_info_list *time_info; 177 int ret = info->invert;
374 struct recent_ip_tables *curr_table;
375 struct recent_ip_tables *last_table;
376 struct recent_ip_list *r_list;
377
378#ifdef DEBUG
379 if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
380#endif
381
382 /* Default is false ^ info->invert */
383 ans = info->invert;
384 178
385#ifdef DEBUG 179 if (info->side == IPT_RECENT_DEST)
386 if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); 180 addr = skb->nh.iph->daddr;
387#endif 181 else
182 addr = skb->nh.iph->saddr;
388 183
389 /* if out != NULL then routing has been done and TTL changed. 184 ttl = skb->nh.iph->ttl;
390 * We change it back here internally for match what came in before routing. */ 185 /* use TTL as seen before forwarding */
391 if(out) ttl++; 186 if (out && !skb->sk)
187 ttl++;
392 188
393 /* Find the right table */
394 spin_lock_bh(&recent_lock); 189 spin_lock_bh(&recent_lock);
395 curr_table = r_tables; 190 t = recent_table_lookup(info->name);
396 while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); 191 e = recent_entry_lookup(t, addr,
397 192 info->check_set & IPT_RECENT_TTL ? ttl : 0);
398#ifdef DEBUG 193 if (e == NULL) {
399 if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); 194 if (!(info->check_set & IPT_RECENT_SET))
400#endif 195 goto out;
401 196 e = recent_entry_init(t, addr, ttl);
402 spin_unlock_bh(&recent_lock); 197 if (e == NULL)
403 198 *hotdrop = 1;
404 /* Table with this name not found, match impossible */ 199 ret ^= 1;
405 if(!curr_table) { return ans; } 200 goto out;
406
407 /* Make sure no one is changing the list while we work with it */
408 spin_lock_bh(&curr_table->list_lock);
409
410 r_list = curr_table->table;
411 if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
412
413 if(!addr) {
414#ifdef DEBUG
415 if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
416#endif
417 spin_unlock_bh(&curr_table->list_lock);
418 return ans;
419 }
420
421#ifdef DEBUG
422 if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
423#endif
424
425 /* Get jiffies now in case they changed while we were waiting for a lock */
426 now = jiffies;
427 hash_table = curr_table->hash_table;
428 time_info = curr_table->time_info;
429
430 orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
431 /* Hash entry at this result used */
432 /* Check for TTL match if requested. If TTL is zero then a match would never
433 * happen, so match regardless of existing TTL in that case. Zero means the
434 * entry was added via the /proc interface anyway, so we will just use the
435 * first TTL we get for that IP address. */
436 if(info->check_set & IPT_RECENT_TTL) {
437 while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
438 (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
439 /* Collision in hash table */
440 hash_result = (hash_result + 1) % ip_list_hash_size;
441 }
442 } else {
443 while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
444 /* Collision in hash table */
445 hash_result = (hash_result + 1) % ip_list_hash_size;
446 }
447 }
448
449 if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
450 /* IP not in list and not asked to SET */
451 spin_unlock_bh(&curr_table->list_lock);
452 return ans;
453 }
454
455 /* Check if we need to handle the collision, do not need to on REMOVE */
456 if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
457#ifdef DEBUG
458 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
459 orig_hash_result,
460 hash_result,
461 r_list[hash_table[orig_hash_result]].addr,
462 addr);
463#endif
464
465 /* We had a collision.
466 * orig_hash_result is where we started, hash_result is where we ended up.
467 * So, swap them because we are likely to see the same guy again sooner */
468#ifdef DEBUG
469 if(debug) {
470 printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
471 printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
472 r_list[hash_table[orig_hash_result]].hash_entry);
473 }
474#endif
475
476 r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
477
478
479 temp = hash_table[orig_hash_result];
480#ifdef DEBUG
481 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
482#endif
483 hash_table[orig_hash_result] = hash_table[hash_result];
484 hash_table[hash_result] = temp;
485 temp = hash_result;
486 hash_result = orig_hash_result;
487 orig_hash_result = temp;
488 time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
489 if(hash_table[hash_result] != -1) {
490 r_list[hash_table[hash_result]].hash_entry = hash_result;
491 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
492 }
493
494#ifdef DEBUG
495 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
496#endif
497 } 201 }
498 202
499 if(hash_table[hash_result] == -1) { 203 if (info->check_set & IPT_RECENT_SET)
500#ifdef DEBUG 204 ret ^= 1;
501 if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", 205 else if (info->check_set & IPT_RECENT_REMOVE) {
502 hash_result, addr); 206 recent_entry_remove(t, e);
503#endif 207 ret ^= 1;
504 208 } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
505 /* New item found and IPT_RECENT_SET, so we need to add it */ 209 unsigned long t = jiffies - info->seconds * HZ;
506 location = time_info[curr_table->time_pos].position; 210 unsigned int i, hits = 0;
507 hash_table[r_list[location].hash_entry] = -1; 211
508 hash_table[hash_result] = location; 212 for (i = 0; i < e->nstamps; i++) {
509 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); 213 if (info->seconds && time_after(t, e->stamps[i]))
510 r_list[location].time_pos = curr_table->time_pos; 214 continue;
511 r_list[location].addr = addr; 215 if (++hits >= info->hit_count) {
512 r_list[location].ttl = ttl; 216 ret ^= 1;
513 r_list[location].last_seen = now; 217 break;
514 r_list[location].oldest_pkt = 1;
515 r_list[location].last_pkts[0] = now;
516 r_list[location].hash_entry = hash_result;
517 time_info[curr_table->time_pos].time = r_list[location].last_seen;
518 curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
519
520 ans = !info->invert;
521 } else {
522#ifdef DEBUG
523 if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
524 hash_result,
525 addr);
526#endif
527
528 /* Existing item found */
529 location = hash_table[hash_result];
530 /* We have a match on address, now to make sure it meets all requirements for a
531 * full match. */
532 if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
533 if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
534 if(info->seconds && !info->hit_count) {
535 if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
536 }
537 if(info->seconds && info->hit_count) {
538 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
539 if(r_list[location].last_pkts[pkt_count] == 0) break;
540 if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
541 }
542 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
543 }
544 if(info->hit_count && !info->seconds) {
545 for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
546 if(r_list[location].last_pkts[pkt_count] == 0) break;
547 hits_found++;
548 }
549 if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
550 } 218 }
551 } 219 }
552#ifdef DEBUG
553 if(debug) {
554 if(ans)
555 printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
556 else
557 printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
558 }
559#endif
560
561 /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
562 * current timestamp to the last_seen. */
563 if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
564#ifdef DEBUG
565 if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
566#endif
567 /* Have to update our time info */
568 time_loc = r_list[location].time_pos;
569 time_info[time_loc].time = now;
570 time_info[time_loc].position = location;
571 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
572 time_temp = time_info[time_loc].time;
573 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
574 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
575 time_temp = time_info[time_loc].position;
576 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
577 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
578 r_list[time_info[time_loc].position].time_pos = time_loc;
579 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
580 time_loc = (time_loc+1) % ip_list_tot;
581 }
582 r_list[location].time_pos = time_loc;
583 r_list[location].ttl = ttl;
584 r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
585 r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot;
586 r_list[location].last_seen = now;
587 }
588 /* If we have been asked to remove the entry from the list, just set it to 0 */
589 if(info->check_set & IPT_RECENT_REMOVE) {
590#ifdef DEBUG
591 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
592#endif
593 /* Check if this is part of a collision chain */
594 while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
595 orig_hash_result++;
596 if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
597 /* Found collision chain, how deep does this rabbit hole go? */
598#ifdef DEBUG
599 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
600#endif
601 end_collision_chain = orig_hash_result;
602 }
603 }
604 if(end_collision_chain != -1) {
605#ifdef DEBUG
606 if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
607#endif
608 /* Part of a collision chain, swap it with the end of the chain
609 * before removing. */
610 r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
611 temp = hash_table[end_collision_chain];
612 hash_table[end_collision_chain] = hash_table[hash_result];
613 hash_table[hash_result] = temp;
614 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
615 hash_result = end_collision_chain;
616 r_list[hash_table[hash_result]].hash_entry = hash_result;
617 time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
618 }
619 location = hash_table[hash_result];
620 hash_table[r_list[location].hash_entry] = -1;
621 time_loc = r_list[location].time_pos;
622 time_info[time_loc].time = 0;
623 time_info[time_loc].position = location;
624 while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
625 time_temp = time_info[time_loc].time;
626 time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
627 time_info[(time_loc+1)%ip_list_tot].time = time_temp;
628 time_temp = time_info[time_loc].position;
629 time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
630 time_info[(time_loc+1)%ip_list_tot].position = time_temp;
631 r_list[time_info[time_loc].position].time_pos = time_loc;
632 r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
633 time_loc = (time_loc+1) % ip_list_tot;
634 }
635 r_list[location].time_pos = time_loc;
636 r_list[location].last_seen = 0;
637 r_list[location].addr = 0;
638 r_list[location].ttl = 0;
639 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
640 r_list[location].oldest_pkt = 0;
641 ans = !info->invert;
642 }
643 spin_unlock_bh(&curr_table->list_lock);
644 return ans;
645 } 220 }
646 221
647 spin_unlock_bh(&curr_table->list_lock); 222 if (info->check_set & IPT_RECENT_SET ||
648#ifdef DEBUG 223 (info->check_set & IPT_RECENT_UPDATE && ret)) {
649 if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); 224 recent_entry_update(t, e);
650#endif 225 e->ttl = ttl;
651 return ans; 226 }
227out:
228 spin_unlock_bh(&recent_lock);
229 return ret;
652} 230}
653 231
654/* This function is to verify that the rule given during the userspace iptables
655 * command is correct.
656 * If the command is valid then we check if the table name referred to by the
657 * rule exists, if not it is created.
658 */
659static int 232static int
660checkentry(const char *tablename, 233ipt_recent_checkentry(const char *tablename, const void *ip,
661 const void *ip, 234 const struct xt_match *match, void *matchinfo,
662 const struct xt_match *match, 235 unsigned int matchsize, unsigned int hook_mask)
663 void *matchinfo,
664 unsigned int matchsize,
665 unsigned int hook_mask)
666{ 236{
667 int flag = 0, c;
668 unsigned long *hold;
669 const struct ipt_recent_info *info = matchinfo; 237 const struct ipt_recent_info *info = matchinfo;
670 struct recent_ip_tables *curr_table, *find_table, *last_table; 238 struct recent_table *t;
671 239 unsigned i;
672#ifdef DEBUG 240 int ret = 0;
673 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
674#endif
675
676 /* seconds and hit_count only valid for CHECK/UPDATE */
677 if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
678 if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
679 if(info->check_set & IPT_RECENT_CHECK) flag++;
680 if(info->check_set & IPT_RECENT_UPDATE) flag++;
681
682 /* One and only one of these should ever be set */
683 if(flag != 1) return 0;
684
685 /* Name must be set to something */
686 if(!info->name || !info->name[0]) return 0;
687 241
688 /* Things look good, create a list for this if it does not exist */ 242 if (hweight8(info->check_set &
689 /* Lock the linked list while we play with it */ 243 (IPT_RECENT_SET | IPT_RECENT_REMOVE |
690 spin_lock_bh(&recent_lock); 244 IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
691 245 return 0;
692 /* Look for an entry with this name already created */ 246 if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
693 /* Finds the end of the list and the entry before the end if current name does not exist */ 247 (info->seconds || info->hit_count))
694 find_table = r_tables; 248 return 0;
695 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); 249 if (info->name[0] == '\0' ||
250 strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
251 return 0;
696 252
697 /* If a table already exists just increment the count on that table and return */ 253 mutex_lock(&recent_mutex);
698 if(find_table) { 254 t = recent_table_lookup(info->name);
699#ifdef DEBUG 255 if (t != NULL) {
700 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); 256 t->refcnt++;
701#endif 257 ret = 1;
702 find_table->count++; 258 goto out;
703 spin_unlock_bh(&recent_lock);
704 return 1;
705 } 259 }
706 260
707 spin_unlock_bh(&recent_lock); 261 t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
708 262 GFP_KERNEL);
709 /* Table with this name not found */ 263 if (t == NULL)
710 /* Allocate memory for new linked list item */ 264 goto out;
711 265 t->refcnt = 1;
712#ifdef DEBUG 266 strcpy(t->name, info->name);
713 if(debug) { 267 INIT_LIST_HEAD(&t->lru_list);
714 printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); 268 for (i = 0; i < ip_list_hash_size; i++)
715 printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); 269 INIT_LIST_HEAD(&t->iphash[i]);
270#ifdef CONFIG_PROC_FS
271 t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir);
272 if (t->proc == NULL) {
273 kfree(t);
274 goto out;
716 } 275 }
276 t->proc->proc_fops = &recent_fops;
277 t->proc->data = t;
717#endif 278#endif
279 spin_lock_bh(&recent_lock);
280 list_add_tail(&t->list, &tables);
281 spin_unlock_bh(&recent_lock);
282 ret = 1;
283out:
284 mutex_unlock(&recent_mutex);
285 return ret;
286}
718 287
719 curr_table = vmalloc(sizeof(struct recent_ip_tables)); 288static void
720 if(curr_table == NULL) return 0; 289ipt_recent_destroy(const struct xt_match *match, void *matchinfo,
721 290 unsigned int matchsize)
722 spin_lock_init(&curr_table->list_lock); 291{
723 curr_table->next = NULL; 292 const struct ipt_recent_info *info = matchinfo;
724 curr_table->count = 1; 293 struct recent_table *t;
725 curr_table->time_pos = 0;
726 strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
727 curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
728
729 /* Allocate memory for this table and the list of packets in each entry. */
730#ifdef DEBUG
731 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
732 sizeof(struct recent_ip_list)*ip_list_tot,
733 info->name);
734#endif
735
736 curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
737 if(curr_table->table == NULL) { vfree(curr_table); return 0; }
738 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
739#ifdef DEBUG
740 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
741 sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
742#endif
743
744 hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
745#ifdef DEBUG
746 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
747#endif
748 if(hold == NULL) {
749 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
750 vfree(curr_table->table);
751 vfree(curr_table);
752 return 0;
753 }
754 for(c = 0; c < ip_list_tot; c++) {
755 curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
756 }
757 294
758 /* Allocate memory for the hash table */ 295 mutex_lock(&recent_mutex);
759#ifdef DEBUG 296 t = recent_table_lookup(info->name);
760 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", 297 if (--t->refcnt == 0) {
761 sizeof(int)*ip_list_hash_size); 298 spin_lock_bh(&recent_lock);
299 list_del(&t->list);
300 spin_unlock_bh(&recent_lock);
301 recent_table_flush(t);
302#ifdef CONFIG_PROC_FS
303 remove_proc_entry(t->name, proc_dir);
762#endif 304#endif
763 305 kfree(t);
764 curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
765 if(!curr_table->hash_table) {
766 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
767 vfree(hold);
768 vfree(curr_table->table);
769 vfree(curr_table);
770 return 0;
771 }
772
773 for(c = 0; c < ip_list_hash_size; c++) {
774 curr_table->hash_table[c] = -1;
775 } 306 }
307 mutex_unlock(&recent_mutex);
308}
776 309
777 /* Allocate memory for the time info */ 310#ifdef CONFIG_PROC_FS
778#ifdef DEBUG 311struct recent_iter_state {
779 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", 312 struct recent_table *table;
780 sizeof(struct time_info_list)*ip_list_tot); 313 unsigned int bucket;
781#endif 314};
782 315
783 curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); 316static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
784 if(!curr_table->time_info) { 317{
785 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); 318 struct recent_iter_state *st = seq->private;
786 vfree(curr_table->hash_table); 319 struct recent_table *t = st->table;
787 vfree(hold); 320 struct recent_entry *e;
788 vfree(curr_table->table); 321 loff_t p = *pos;
789 vfree(curr_table);
790 return 0;
791 }
792 for(c = 0; c < ip_list_tot; c++) {
793 curr_table->time_info[c].position = c;
794 curr_table->time_info[c].time = 0;
795 }
796 322
797 /* Put the new table in place */
798 spin_lock_bh(&recent_lock); 323 spin_lock_bh(&recent_lock);
799 find_table = r_tables;
800 while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
801
802 /* If a table already exists just increment the count on that table and return */
803 if(find_table) {
804 find_table->count++;
805 spin_unlock_bh(&recent_lock);
806#ifdef DEBUG
807 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
808#endif
809 vfree(curr_table->time_info);
810 vfree(curr_table->hash_table);
811 vfree(hold);
812 vfree(curr_table->table);
813 vfree(curr_table);
814 return 1;
815 }
816 if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
817
818 spin_unlock_bh(&recent_lock);
819 324
820#ifdef CONFIG_PROC_FS 325 for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) {
821 /* Create our proc 'status' entry. */ 326 list_for_each_entry(e, &t->iphash[st->bucket], list) {
822 curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); 327 if (p-- == 0)
823 if (!curr_table->status_proc) { 328 return e;
824 vfree(hold);
825 printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
826 /* Destroy the created table */
827 spin_lock_bh(&recent_lock);
828 last_table = NULL;
829 curr_table = r_tables;
830 if(!curr_table) {
831#ifdef DEBUG
832 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
833#endif
834 spin_unlock_bh(&recent_lock);
835 return 0;
836 }
837 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
838 if(!curr_table) {
839#ifdef DEBUG
840 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
841#endif
842 spin_unlock_bh(&recent_lock);
843 return 0;
844 } 329 }
845 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
846 spin_unlock_bh(&recent_lock);
847 vfree(curr_table->time_info);
848 vfree(curr_table->hash_table);
849 vfree(curr_table->table);
850 vfree(curr_table);
851 return 0;
852 } 330 }
853 331 return NULL;
854 curr_table->status_proc->owner = THIS_MODULE; 332}
855 curr_table->status_proc->data = curr_table;
856 wmb();
857 curr_table->status_proc->read_proc = ip_recent_get_info;
858 curr_table->status_proc->write_proc = ip_recent_ctrl;
859#endif /* CONFIG_PROC_FS */
860
861#ifdef DEBUG
862 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
863#endif
864 333
865 return 1; 334static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
335{
336 struct recent_iter_state *st = seq->private;
337 struct recent_table *t = st->table;
338 struct recent_entry *e = v;
339 struct list_head *head = e->list.next;
340
341 while (head == &t->iphash[st->bucket]) {
342 if (++st->bucket >= ip_list_hash_size)
343 return NULL;
344 head = t->iphash[st->bucket].next;
345 }
346 (*pos)++;
347 return list_entry(head, struct recent_entry, list);
866} 348}
867 349
868/* This function is called in the event that a rule matching this module is 350static void recent_seq_stop(struct seq_file *s, void *v)
869 * removed.
870 * When this happens we need to check if there are no other rules matching
871 * the table given. If that is the case then we remove the table and clean
872 * up its memory.
873 */
874static void
875destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
876{ 351{
877 const struct ipt_recent_info *info = matchinfo; 352 spin_unlock_bh(&recent_lock);
878 struct recent_ip_tables *curr_table, *last_table; 353}
879 354
880#ifdef DEBUG 355static int recent_seq_show(struct seq_file *seq, void *v)
881 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); 356{
882#endif 357 struct recent_entry *e = v;
358 unsigned int i;
359
360 i = (e->index - 1) % ip_pkt_list_tot;
361 seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
362 NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
363 for (i = 0; i < e->nstamps; i++)
364 seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
365 seq_printf(seq, "\n");
366 return 0;
367}
883 368
884 if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; 369static struct seq_operations recent_seq_ops = {
370 .start = recent_seq_start,
371 .next = recent_seq_next,
372 .stop = recent_seq_stop,
373 .show = recent_seq_show,
374};
885 375
886 /* Lock the linked list while we play with it */ 376static int recent_seq_open(struct inode *inode, struct file *file)
887 spin_lock_bh(&recent_lock); 377{
378 struct proc_dir_entry *pde = PDE(inode);
379 struct seq_file *seq;
380 struct recent_iter_state *st;
381 int ret;
382
383 st = kzalloc(sizeof(*st), GFP_KERNEL);
384 if (st == NULL)
385 return -ENOMEM;
386 ret = seq_open(file, &recent_seq_ops);
387 if (ret)
388 kfree(st);
389 st->table = pde->data;
390 seq = file->private_data;
391 seq->private = st;
392 return ret;
393}
888 394
889 /* Look for an entry with this name already created */ 395static ssize_t recent_proc_write(struct file *file, const char __user *input,
890 /* Finds the end of the list and the entry before the end if current name does not exist */ 396 size_t size, loff_t *loff)
891 last_table = NULL; 397{
892 curr_table = r_tables; 398 struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
893 if(!curr_table) { 399 struct recent_table *t = pde->data;
894#ifdef DEBUG 400 struct recent_entry *e;
895 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); 401 char buf[sizeof("+255.255.255.255")], *c = buf;
896#endif 402 u_int32_t addr;
403 int add;
404
405 if (size > sizeof(buf))
406 size = sizeof(buf);
407 if (copy_from_user(buf, input, size))
408 return -EFAULT;
409 while (isspace(*c))
410 c++;
411
412 if (size - (c - buf) < 5)
413 return c - buf;
414 if (!strncmp(c, "clear", 5)) {
415 c += 5;
416 spin_lock_bh(&recent_lock);
417 recent_table_flush(t);
897 spin_unlock_bh(&recent_lock); 418 spin_unlock_bh(&recent_lock);
898 return; 419 return c - buf;
899 } 420 }
900 while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
901 421
902 /* If a table does not exist then do nothing and return */ 422 switch (*c) {
903 if(!curr_table) { 423 case '-':
904#ifdef DEBUG 424 add = 0;
905 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); 425 c++;
906#endif 426 break;
907 spin_unlock_bh(&recent_lock); 427 case '+':
908 return; 428 c++;
429 default:
430 add = 1;
431 break;
909 } 432 }
433 addr = in_aton(c);
910 434
911 curr_table->count--; 435 spin_lock_bh(&recent_lock);
912 436 e = recent_entry_lookup(t, addr, 0);
913 /* If count is still non-zero then there are still rules referenceing it so we do nothing */ 437 if (e == NULL) {
914 if(curr_table->count) { 438 if (add)
915#ifdef DEBUG 439 recent_entry_init(t, addr, 0);
916 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); 440 } else {
917#endif 441 if (add)
918 spin_unlock_bh(&recent_lock); 442 recent_entry_update(t, e);
919 return; 443 else
444 recent_entry_remove(t, e);
920 } 445 }
921
922#ifdef DEBUG
923 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
924#endif
925
926 /* Count must be zero so we remove this table from the list */
927 if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
928
929 spin_unlock_bh(&recent_lock); 446 spin_unlock_bh(&recent_lock);
447 return size;
448}
930 449
931 /* lock to make sure any late-runners still using this after we removed it from 450static struct file_operations recent_fops = {
932 * the list finish up then remove everything */ 451 .open = recent_seq_open,
933 spin_lock_bh(&curr_table->list_lock); 452 .read = seq_read,
934 spin_unlock_bh(&curr_table->list_lock); 453 .write = recent_proc_write,
935 454 .release = seq_release_private,
936#ifdef CONFIG_PROC_FS 455 .owner = THIS_MODULE,
937 if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); 456};
938#endif /* CONFIG_PROC_FS */ 457#endif /* CONFIG_PROC_FS */
939 vfree(curr_table->table[0].last_pkts);
940 vfree(curr_table->table);
941 vfree(curr_table->hash_table);
942 vfree(curr_table->time_info);
943 vfree(curr_table);
944
945#ifdef DEBUG
946 if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
947#endif
948 458
949 return;
950}
951
952/* This is the structure we pass to ipt_register to register our
953 * module with iptables.
954 */
955static struct ipt_match recent_match = { 459static struct ipt_match recent_match = {
956 .name = "recent", 460 .name = "recent",
957 .match = match, 461 .match = ipt_recent_match,
958 .matchsize = sizeof(struct ipt_recent_info), 462 .matchsize = sizeof(struct ipt_recent_info),
959 .checkentry = checkentry, 463 .checkentry = ipt_recent_checkentry,
960 .destroy = destroy, 464 .destroy = ipt_recent_destroy,
961 .me = THIS_MODULE 465 .me = THIS_MODULE,
962}; 466};
963 467
964/* Kernel module initialization. */
965static int __init ipt_recent_init(void) 468static int __init ipt_recent_init(void)
966{ 469{
967 int err, count; 470 int err;
968 471
969 printk(version); 472 if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
970#ifdef CONFIG_PROC_FS 473 return -EINVAL;
971 proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); 474 ip_list_hash_size = 1 << fls(ip_list_tot);
972 if(!proc_net_ipt_recent) return -ENOMEM;
973#endif
974
975 if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
976 printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
977 ip_list_hash_size = 0;
978 }
979
980 if(!ip_list_hash_size) {
981 ip_list_hash_size = ip_list_tot*3;
982 count = 2*2;
983 while(ip_list_hash_size > count) count = count*2;
984 ip_list_hash_size = count;
985 }
986
987#ifdef DEBUG
988 if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
989#endif
990 475
991 err = ipt_register_match(&recent_match); 476 err = ipt_register_match(&recent_match);
477#ifdef CONFIG_PROC_FS
992 if (err) 478 if (err)
993 remove_proc_entry("ipt_recent", proc_net); 479 return err;
480 proc_dir = proc_mkdir("ipt_recent", proc_net);
481 if (proc_dir == NULL) {
482 ipt_unregister_match(&recent_match);
483 err = -ENOMEM;
484 }
485#endif
994 return err; 486 return err;
995} 487}
996 488
997/* Kernel module destruction. */ 489static void __exit ipt_recent_exit(void)
998static void __exit ipt_recent_fini(void)
999{ 490{
491 BUG_ON(!list_empty(&tables));
1000 ipt_unregister_match(&recent_match); 492 ipt_unregister_match(&recent_match);
1001 493#ifdef CONFIG_PROC_FS
1002 remove_proc_entry("ipt_recent",proc_net); 494 remove_proc_entry("ipt_recent", proc_net);
495#endif
1003} 496}
1004 497
1005/* Register our module with the kernel. */
1006module_init(ipt_recent_init); 498module_init(ipt_recent_init);
1007module_exit(ipt_recent_fini); 499module_exit(ipt_recent_exit);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 77d974443c7b..8cc8e1b36778 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum,
145 145
146 /* This is where we call the helper: as the packet goes out. */ 146 /* This is where we call the helper: as the packet goes out. */
147 ct = nf_ct_get(*pskb, &ctinfo); 147 ct = nf_ct_get(*pskb, &ctinfo);
148 if (!ct) 148 if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
149 return NF_ACCEPT; 149 return NF_ACCEPT;
150 150
151 help = nfct_help(ct); 151 help = nfct_help(ct);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b0d361cc6e6..663a73ee3f2f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
235 } 235 }
236 236
237 /* See ip_conntrack_proto_tcp.c */ 237 /* See ip_conntrack_proto_tcp.c */
238 if (hooknum == NF_IP_PRE_ROUTING && 238 if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
239 nf_ip_checksum(skb, hooknum, dataoff, 0)) { 239 nf_ip_checksum(skb, hooknum, dataoff, 0)) {
240 if (LOG_INVALID(IPPROTO_ICMP)) 240 if (LOG_INVALID(IPPROTO_ICMP))
241 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 241 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fc2562415555..bd221ec3f81e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk)
103} 103}
104 104
105struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, 105struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
106 unsigned long raddr, unsigned long laddr, 106 __be32 raddr, __be32 laddr,
107 int dif) 107 int dif)
108{ 108{
109 struct hlist_node *node; 109 struct hlist_node *node;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3adfcf00..ce4cd5f35511 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -182,14 +182,6 @@ ctl_table ipv4_table[] = {
182 .strategy = &ipv4_doint_and_flush_strategy, 182 .strategy = &ipv4_doint_and_flush_strategy,
183 }, 183 },
184 { 184 {
185 .ctl_name = NET_IPV4_AUTOCONFIG,
186 .procname = "ip_autoconfig",
187 .data = &ipv4_config.autoconfig,
188 .maxlen = sizeof(int),
189 .mode = 0644,
190 .proc_handler = &proc_dointvec
191 },
192 {
193 .ctl_name = NET_IPV4_NO_PMTU_DISC, 185 .ctl_name = NET_IPV4_NO_PMTU_DISC,
194 .procname = "ip_no_pmtu_disc", 186 .procname = "ip_no_pmtu_disc",
195 .data = &ipv4_config.no_pmtu_disc, 187 .data = &ipv4_config.no_pmtu_disc,
@@ -688,6 +680,24 @@ ctl_table ipv4_table[] = {
688 .mode = 0644, 680 .mode = 0644,
689 .proc_handler = &proc_dointvec 681 .proc_handler = &proc_dointvec
690 }, 682 },
683#ifdef CONFIG_NET_DMA
684 {
685 .ctl_name = NET_TCP_DMA_COPYBREAK,
686 .procname = "tcp_dma_copybreak",
687 .data = &sysctl_tcp_dma_copybreak,
688 .maxlen = sizeof(int),
689 .mode = 0644,
690 .proc_handler = &proc_dointvec
691 },
692#endif
693 {
694 .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE,
695 .procname = "tcp_slow_start_after_idle",
696 .data = &sysctl_tcp_slow_start_after_idle,
697 .maxlen = sizeof(int),
698 .mode = 0644,
699 .proc_handler = &proc_dointvec
700 },
691 { .ctl_name = 0 } 701 { .ctl_name = 0 }
692}; 702};
693 703
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b8055037..74998f250071 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -263,7 +263,7 @@
263#include <net/tcp.h> 263#include <net/tcp.h>
264#include <net/xfrm.h> 264#include <net/xfrm.h>
265#include <net/ip.h> 265#include <net/ip.h>
266 266#include <net/netdma.h>
267 267
268#include <asm/uaccess.h> 268#include <asm/uaccess.h>
269#include <asm/ioctls.h> 269#include <asm/ioctls.h>
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
622 ssize_t res; 622 ssize_t res;
623 struct sock *sk = sock->sk; 623 struct sock *sk = sock->sk;
624 624
625#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
626
627 if (!(sk->sk_route_caps & NETIF_F_SG) || 625 if (!(sk->sk_route_caps & NETIF_F_SG) ||
628 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) 626 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
629 return sock_no_sendpage(sock, page, offset, size, flags); 627 return sock_no_sendpage(sock, page, offset, size, flags);
630 628
631#undef TCP_ZC_CSUM_FLAGS
632
633 lock_sock(sk); 629 lock_sock(sk);
634 TCP_CHECK_TIMER(sk); 630 TCP_CHECK_TIMER(sk);
635 res = do_tcp_sendpages(sk, &page, offset, size, flags); 631 res = do_tcp_sendpages(sk, &page, offset, size, flags);
@@ -726,9 +722,7 @@ new_segment:
726 /* 722 /*
727 * Check whether we can use HW checksum. 723 * Check whether we can use HW checksum.
728 */ 724 */
729 if (sk->sk_route_caps & 725 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
730 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
731 NETIF_F_HW_CSUM))
732 skb->ip_summed = CHECKSUM_HW; 726 skb->ip_summed = CHECKSUM_HW;
733 727
734 skb_entail(sk, tp, skb); 728 skb_entail(sk, tp, skb);
@@ -937,7 +931,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
937 * calculation of whether or not we must ACK for the sake of 931 * calculation of whether or not we must ACK for the sake of
938 * a window update. 932 * a window update.
939 */ 933 */
940static void cleanup_rbuf(struct sock *sk, int copied) 934void tcp_cleanup_rbuf(struct sock *sk, int copied)
941{ 935{
942 struct tcp_sock *tp = tcp_sk(sk); 936 struct tcp_sock *tp = tcp_sk(sk);
943 int time_to_ack = 0; 937 int time_to_ack = 0;
@@ -1072,11 +1066,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1072 break; 1066 break;
1073 } 1067 }
1074 if (skb->h.th->fin) { 1068 if (skb->h.th->fin) {
1075 sk_eat_skb(sk, skb); 1069 sk_eat_skb(sk, skb, 0);
1076 ++seq; 1070 ++seq;
1077 break; 1071 break;
1078 } 1072 }
1079 sk_eat_skb(sk, skb); 1073 sk_eat_skb(sk, skb, 0);
1080 if (!desc->count) 1074 if (!desc->count)
1081 break; 1075 break;
1082 } 1076 }
@@ -1086,7 +1080,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1086 1080
1087 /* Clean up data we have read: This will do ACK frames. */ 1081 /* Clean up data we have read: This will do ACK frames. */
1088 if (copied) 1082 if (copied)
1089 cleanup_rbuf(sk, copied); 1083 tcp_cleanup_rbuf(sk, copied);
1090 return copied; 1084 return copied;
1091} 1085}
1092 1086
@@ -1110,6 +1104,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1110 int target; /* Read at least this many bytes */ 1104 int target; /* Read at least this many bytes */
1111 long timeo; 1105 long timeo;
1112 struct task_struct *user_recv = NULL; 1106 struct task_struct *user_recv = NULL;
1107 int copied_early = 0;
1113 1108
1114 lock_sock(sk); 1109 lock_sock(sk);
1115 1110
@@ -1133,6 +1128,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1133 1128
1134 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1129 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1135 1130
1131#ifdef CONFIG_NET_DMA
1132 tp->ucopy.dma_chan = NULL;
1133 preempt_disable();
1134 if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1135 !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
1136 preempt_enable_no_resched();
1137 tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
1138 } else
1139 preempt_enable_no_resched();
1140#endif
1141
1136 do { 1142 do {
1137 struct sk_buff *skb; 1143 struct sk_buff *skb;
1138 u32 offset; 1144 u32 offset;
@@ -1220,7 +1226,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1220 } 1226 }
1221 } 1227 }
1222 1228
1223 cleanup_rbuf(sk, copied); 1229 tcp_cleanup_rbuf(sk, copied);
1224 1230
1225 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { 1231 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1226 /* Install new reader */ 1232 /* Install new reader */
@@ -1274,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1274 } else 1280 } else
1275 sk_wait_data(sk, &timeo); 1281 sk_wait_data(sk, &timeo);
1276 1282
1283#ifdef CONFIG_NET_DMA
1284 tp->ucopy.wakeup = 0;
1285#endif
1286
1277 if (user_recv) { 1287 if (user_recv) {
1278 int chunk; 1288 int chunk;
1279 1289
@@ -1329,13 +1339,39 @@ do_prequeue:
1329 } 1339 }
1330 1340
1331 if (!(flags & MSG_TRUNC)) { 1341 if (!(flags & MSG_TRUNC)) {
1332 err = skb_copy_datagram_iovec(skb, offset, 1342#ifdef CONFIG_NET_DMA
1333 msg->msg_iov, used); 1343 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1334 if (err) { 1344 tp->ucopy.dma_chan = get_softnet_dma();
1335 /* Exception. Bailout! */ 1345
1336 if (!copied) 1346 if (tp->ucopy.dma_chan) {
1337 copied = -EFAULT; 1347 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1338 break; 1348 tp->ucopy.dma_chan, skb, offset,
1349 msg->msg_iov, used,
1350 tp->ucopy.pinned_list);
1351
1352 if (tp->ucopy.dma_cookie < 0) {
1353
1354 printk(KERN_ALERT "dma_cookie < 0\n");
1355
1356 /* Exception. Bailout! */
1357 if (!copied)
1358 copied = -EFAULT;
1359 break;
1360 }
1361 if ((offset + used) == skb->len)
1362 copied_early = 1;
1363
1364 } else
1365#endif
1366 {
1367 err = skb_copy_datagram_iovec(skb, offset,
1368 msg->msg_iov, used);
1369 if (err) {
1370 /* Exception. Bailout! */
1371 if (!copied)
1372 copied = -EFAULT;
1373 break;
1374 }
1339 } 1375 }
1340 } 1376 }
1341 1377
@@ -1355,15 +1391,19 @@ skip_copy:
1355 1391
1356 if (skb->h.th->fin) 1392 if (skb->h.th->fin)
1357 goto found_fin_ok; 1393 goto found_fin_ok;
1358 if (!(flags & MSG_PEEK)) 1394 if (!(flags & MSG_PEEK)) {
1359 sk_eat_skb(sk, skb); 1395 sk_eat_skb(sk, skb, copied_early);
1396 copied_early = 0;
1397 }
1360 continue; 1398 continue;
1361 1399
1362 found_fin_ok: 1400 found_fin_ok:
1363 /* Process the FIN. */ 1401 /* Process the FIN. */
1364 ++*seq; 1402 ++*seq;
1365 if (!(flags & MSG_PEEK)) 1403 if (!(flags & MSG_PEEK)) {
1366 sk_eat_skb(sk, skb); 1404 sk_eat_skb(sk, skb, copied_early);
1405 copied_early = 0;
1406 }
1367 break; 1407 break;
1368 } while (len > 0); 1408 } while (len > 0);
1369 1409
@@ -1386,12 +1426,42 @@ skip_copy:
1386 tp->ucopy.len = 0; 1426 tp->ucopy.len = 0;
1387 } 1427 }
1388 1428
1429#ifdef CONFIG_NET_DMA
1430 if (tp->ucopy.dma_chan) {
1431 struct sk_buff *skb;
1432 dma_cookie_t done, used;
1433
1434 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1435
1436 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1437 tp->ucopy.dma_cookie, &done,
1438 &used) == DMA_IN_PROGRESS) {
1439 /* do partial cleanup of sk_async_wait_queue */
1440 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1441 (dma_async_is_complete(skb->dma_cookie, done,
1442 used) == DMA_SUCCESS)) {
1443 __skb_dequeue(&sk->sk_async_wait_queue);
1444 kfree_skb(skb);
1445 }
1446 }
1447
1448 /* Safe to free early-copied skbs now */
1449 __skb_queue_purge(&sk->sk_async_wait_queue);
1450 dma_chan_put(tp->ucopy.dma_chan);
1451 tp->ucopy.dma_chan = NULL;
1452 }
1453 if (tp->ucopy.pinned_list) {
1454 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1455 tp->ucopy.pinned_list = NULL;
1456 }
1457#endif
1458
1389 /* According to UNIX98, msg_name/msg_namelen are ignored 1459 /* According to UNIX98, msg_name/msg_namelen are ignored
1390 * on connected socket. I was just happy when found this 8) --ANK 1460 * on connected socket. I was just happy when found this 8) --ANK
1391 */ 1461 */
1392 1462
1393 /* Clean up data we have read: This will do ACK frames. */ 1463 /* Clean up data we have read: This will do ACK frames. */
1394 cleanup_rbuf(sk, copied); 1464 tcp_cleanup_rbuf(sk, copied);
1395 1465
1396 TCP_CHECK_TIMER(sk); 1466 TCP_CHECK_TIMER(sk);
1397 release_sock(sk); 1467 release_sock(sk);
@@ -1658,6 +1728,9 @@ int tcp_disconnect(struct sock *sk, int flags)
1658 __skb_queue_purge(&sk->sk_receive_queue); 1728 __skb_queue_purge(&sk->sk_receive_queue);
1659 sk_stream_writequeue_purge(sk); 1729 sk_stream_writequeue_purge(sk);
1660 __skb_queue_purge(&tp->out_of_order_queue); 1730 __skb_queue_purge(&tp->out_of_order_queue);
1731#ifdef CONFIG_NET_DMA
1732 __skb_queue_purge(&sk->sk_async_wait_queue);
1733#endif
1661 1734
1662 inet->dport = 0; 1735 inet->dport = 0;
1663 1736
@@ -1858,7 +1931,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
1858 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && 1931 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1859 inet_csk_ack_scheduled(sk)) { 1932 inet_csk_ack_scheduled(sk)) {
1860 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; 1933 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1861 cleanup_rbuf(sk, 1); 1934 tcp_cleanup_rbuf(sk, 1);
1862 if (!(val & 1)) 1935 if (!(val & 1))
1863 icsk->icsk_ack.pingpong = 1; 1936 icsk->icsk_ack.pingpong = 1;
1864 } 1937 }
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 035f2092d73a..b2d9021ad22b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
198 return max(tp->snd_cwnd, ca->last_max_cwnd); 198 return max(tp->snd_cwnd, ca->last_max_cwnd);
199} 199}
200 200
201static u32 bictcp_min_cwnd(struct sock *sk)
202{
203 const struct tcp_sock *tp = tcp_sk(sk);
204 return tp->snd_ssthresh;
205}
206
207static void bictcp_state(struct sock *sk, u8 new_state) 201static void bictcp_state(struct sock *sk, u8 new_state)
208{ 202{
209 if (new_state == TCP_CA_Loss) 203 if (new_state == TCP_CA_Loss)
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = {
231 .cong_avoid = bictcp_cong_avoid, 225 .cong_avoid = bictcp_cong_avoid,
232 .set_state = bictcp_state, 226 .set_state = bictcp_state,
233 .undo_cwnd = bictcp_undo_cwnd, 227 .undo_cwnd = bictcp_undo_cwnd,
234 .min_cwnd = bictcp_min_cwnd,
235 .pkts_acked = bictcp_acked, 228 .pkts_acked = bictcp_acked,
236 .owner = THIS_MODULE, 229 .owner = THIS_MODULE,
237 .name = "bic", 230 .name = "bic",
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c
new file mode 100644
index 000000000000..bc54f7e9aea9
--- /dev/null
+++ b/net/ipv4/tcp_compound.c
@@ -0,0 +1,448 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 *
33 *
34 * TCP Compound based on TCP Vegas
35 *
36 * further details can be found here:
37 * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf
38 */
39
40#include <linux/config.h>
41#include <linux/mm.h>
42#include <linux/module.h>
43#include <linux/skbuff.h>
44#include <linux/inet_diag.h>
45
46#include <net/tcp.h>
47
48/* Default values of the Vegas variables, in fixed-point representation
49 * with V_PARAM_SHIFT bits to the right of the binary point.
50 */
51#define V_PARAM_SHIFT 1
52
53#define TCP_COMPOUND_ALPHA 3U
54#define TCP_COMPOUND_BETA 1U
55#define TCP_COMPOUND_GAMMA 30
56#define TCP_COMPOUND_ZETA 1
57
58/* TCP compound variables */
59struct compound {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now; /* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67
68 u32 cwnd;
69 u32 dwnd;
70};
71
72/* There are several situations when we must "re-start" Vegas:
73 *
74 * o when a connection is established
75 * o after an RTO
76 * o after fast recovery
77 * o when we send a packet and there is no outstanding
78 * unacknowledged data (restarting an idle connection)
79 *
80 * In these circumstances we cannot do a Vegas calculation at the
81 * end of the first RTT, because any calculation we do is using
82 * stale info -- both the saved cwnd and congestion feedback are
83 * stale.
84 *
85 * Instead we must wait until the completion of an RTT during
86 * which we actually receive ACKs.
87 */
88static inline void vegas_enable(struct sock *sk)
89{
90 const struct tcp_sock *tp = tcp_sk(sk);
91 struct compound *vegas = inet_csk_ca(sk);
92
93 /* Begin taking Vegas samples next time we send something. */
94 vegas->doing_vegas_now = 1;
95
96 /* Set the beginning of the next send window. */
97 vegas->beg_snd_nxt = tp->snd_nxt;
98
99 vegas->cntRTT = 0;
100 vegas->minRTT = 0x7fffffff;
101}
102
103/* Stop taking Vegas samples for now. */
104static inline void vegas_disable(struct sock *sk)
105{
106 struct compound *vegas = inet_csk_ca(sk);
107
108 vegas->doing_vegas_now = 0;
109}
110
111static void tcp_compound_init(struct sock *sk)
112{
113 struct compound *vegas = inet_csk_ca(sk);
114 const struct tcp_sock *tp = tcp_sk(sk);
115
116 vegas->baseRTT = 0x7fffffff;
117 vegas_enable(sk);
118
119 vegas->dwnd = 0;
120 vegas->cwnd = tp->snd_cwnd;
121}
122
123/* Do RTT sampling needed for Vegas.
124 * Basically we:
125 * o min-filter RTT samples from within an RTT to get the current
126 * propagation delay + queuing delay (we are min-filtering to try to
127 * avoid the effects of delayed ACKs)
128 * o min-filter RTT samples from a much longer window (forever for now)
129 * to find the propagation delay (baseRTT)
130 */
131static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt)
132{
133 struct compound *vegas = inet_csk_ca(sk);
134 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
135
136 /* Filter to find propagation delay: */
137 if (vrtt < vegas->baseRTT)
138 vegas->baseRTT = vrtt;
139
140 /* Find the min RTT during the last RTT to find
141 * the current prop. delay + queuing delay:
142 */
143
144 vegas->minRTT = min(vegas->minRTT, vrtt);
145 vegas->cntRTT++;
146}
147
148static void tcp_compound_state(struct sock *sk, u8 ca_state)
149{
150
151 if (ca_state == TCP_CA_Open)
152 vegas_enable(sk);
153 else
154 vegas_disable(sk);
155}
156
157
158/* 64bit divisor, dividend and result. dynamic precision */
159static inline u64 div64_64(u64 dividend, u64 divisor)
160{
161 u32 d = divisor;
162
163 if (divisor > 0xffffffffULL) {
164 unsigned int shift = fls(divisor >> 32);
165
166 d = divisor >> shift;
167 dividend >>= shift;
168 }
169
170 /* avoid 64 bit division if possible */
171 if (dividend >> 32)
172 do_div(dividend, d);
173 else
174 dividend = (u32) dividend / d;
175
176 return dividend;
177}
178
179/* calculate the quartic root of "a" using Newton-Raphson */
180static u32 qroot(u64 a)
181{
182 u32 x, x1;
183
184 /* Initial estimate is based on:
185 * qrt(x) = exp(log(x) / 4)
186 */
187 x = 1u << (fls64(a) >> 2);
188
189 /*
190 * Iteration based on:
191 * 3
192 * x = ( 3 * x + a / x ) / 4
193 * k+1 k k
194 */
195 do {
196 u64 x3 = x;
197
198 x1 = x;
199 x3 *= x;
200 x3 *= x;
201
202 x = (3 * x + (u32) div64_64(a, x3)) / 4;
203 } while (abs(x1 - x) > 1);
204
205 return x;
206}
207
208
209/*
210 * If the connection is idle and we are restarting,
211 * then we don't want to do any Vegas calculations
212 * until we get fresh RTT samples. So when we
213 * restart, we reset our Vegas state to a clean
214 * slate. After we get acks for this flight of
215 * packets, _then_ we can make Vegas calculations
216 * again.
217 */
218static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event)
219{
220 if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
221 tcp_compound_init(sk);
222}
223
224static void tcp_compound_cong_avoid(struct sock *sk, u32 ack,
225 u32 seq_rtt, u32 in_flight, int flag)
226{
227 struct tcp_sock *tp = tcp_sk(sk);
228 struct compound *vegas = inet_csk_ca(sk);
229 u8 inc = 0;
230
231 if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) {
232 if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) {
233 vegas->cwnd = tp->snd_cwnd;
234 vegas->dwnd = 0;
235 } else
236 vegas->cwnd = tp->snd_cwnd - vegas->dwnd;
237
238 }
239
240 if (!tcp_is_cwnd_limited(sk, in_flight))
241 return;
242
243 if (vegas->cwnd <= tp->snd_ssthresh)
244 inc = 1;
245 else if (tp->snd_cwnd_cnt < tp->snd_cwnd)
246 tp->snd_cwnd_cnt++;
247
248 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
249 inc = 1;
250 tp->snd_cwnd_cnt = 0;
251 }
252
253 if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp)
254 vegas->cwnd++;
255
256 /* The key players are beg_snd_una and beg_snd_nxt.
257 *
258 * These are so named because they represent the approximate values
259 * of snd_una and snd_nxt at the beginning of the current RTT. More
260 * precisely, they represent the amount of data sent during the RTT.
261 * At the end of the RTT, when we receive an ACK for beg_snd_nxt,
262 * we will calculate that (beg_snd_nxt - beg_snd_una) outstanding
263 * bytes of data have been ACKed during the course of the RTT, giving
264 * an "actual" rate of:
265 *
266 * (beg_snd_nxt - beg_snd_una) / (rtt duration)
267 *
268 * Unfortunately, beg_snd_una is not exactly equal to snd_una,
269 * because delayed ACKs can cover more than one segment, so they
270 * don't line up nicely with the boundaries of RTTs.
271 *
272 * Another unfortunate fact of life is that delayed ACKs delay the
273 * advance of the left edge of our send window, so that the number
274 * of bytes we send in an RTT is often less than our cwnd will allow.
275 * So we keep track of our cwnd separately, in beg_snd_cwnd.
276 */
277
278 if (after(ack, vegas->beg_snd_nxt)) {
279 /* Do the Vegas once-per-RTT cwnd adjustment. */
280 u32 old_wnd, old_snd_cwnd;
281
282 /* Here old_wnd is essentially the window of data that was
283 * sent during the previous RTT, and has all
284 * been acknowledged in the course of the RTT that ended
285 * with the ACK we just received. Likewise, old_snd_cwnd
286 * is the cwnd during the previous RTT.
287 */
288 if (!tp->mss_cache)
289 return;
290
291 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
292 tp->mss_cache;
293 old_snd_cwnd = vegas->beg_snd_cwnd;
294
295 /* Save the extent of the current window so we can use this
296 * at the end of the next RTT.
297 */
298 vegas->beg_snd_una = vegas->beg_snd_nxt;
299 vegas->beg_snd_nxt = tp->snd_nxt;
300 vegas->beg_snd_cwnd = tp->snd_cwnd;
301
302 /* We do the Vegas calculations only if we got enough RTT
303 * samples that we can be reasonably sure that we got
304 * at least one RTT sample that wasn't from a delayed ACK.
305 * If we only had 2 samples total,
306 * then that means we're getting only 1 ACK per RTT, which
307 * means they're almost certainly delayed ACKs.
308 * If we have 3 samples, we should be OK.
309 */
310
311 if (vegas->cntRTT > 2) {
312 u32 rtt, target_cwnd, diff;
313 u32 brtt, dwnd;
314
315 /* We have enough RTT samples, so, using the Vegas
316 * algorithm, we determine if we should increase or
317 * decrease cwnd, and by how much.
318 */
319
320 /* Pluck out the RTT we are using for the Vegas
321 * calculations. This is the min RTT seen during the
322 * last RTT. Taking the min filters out the effects
323 * of delayed ACKs, at the cost of noticing congestion
324 * a bit later.
325 */
326 rtt = vegas->minRTT;
327
328 /* Calculate the cwnd we should have, if we weren't
329 * going too fast.
330 *
331 * This is:
332 * (actual rate in segments) * baseRTT
333 * We keep it as a fixed point number with
334 * V_PARAM_SHIFT bits to the right of the binary point.
335 */
336 if (!rtt)
337 return;
338
339 brtt = vegas->baseRTT;
340 target_cwnd = ((old_wnd * brtt)
341 << V_PARAM_SHIFT) / rtt;
342
343 /* Calculate the difference between the window we had,
344 * and the window we would like to have. This quantity
345 * is the "Diff" from the Arizona Vegas papers.
346 *
347 * Again, this is a fixed point number with
348 * V_PARAM_SHIFT bits to the right of the binary
349 * point.
350 */
351
352 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
353
354 dwnd = vegas->dwnd;
355
356 if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) {
357 u64 v;
358 u32 x;
359
360 /*
361 * The TCP Compound paper describes how the choice
362 * of "k" determines the aggressiveness,
363 * i.e. the slope of the response function.
364 *
365 * To match HSTCP the value would be 0.8,
366 * but for computational reasons, both the
367 * original authors and this implementation
368 * use 0.75.
369 */
370 v = old_wnd;
371 x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA;
372 if (x > 1)
373 dwnd = x - 1;
374 else
375 dwnd = 0;
376
377 dwnd += vegas->dwnd;
378
379 } else if ((dwnd << V_PARAM_SHIFT) <
380 (diff * TCP_COMPOUND_BETA))
381 dwnd = 0;
382 else
383 dwnd =
384 ((dwnd << V_PARAM_SHIFT) -
385 (diff *
386 TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT;
387
388 vegas->dwnd = dwnd;
389
390 }
391
392 /* Wipe the slate clean for the next RTT. */
393 vegas->cntRTT = 0;
394 vegas->minRTT = 0x7fffffff;
395 }
396
397 tp->snd_cwnd = vegas->cwnd + vegas->dwnd;
398}
399
400/* Extract info for Tcp socket info provided via netlink. */
401static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
402{
403 const struct compound *ca = inet_csk_ca(sk);
404 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
405 struct tcpvegas_info *info;
406
407 info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
408 sizeof(*info)));
409
410 info->tcpv_enabled = ca->doing_vegas_now;
411 info->tcpv_rttcnt = ca->cntRTT;
412 info->tcpv_rtt = ca->baseRTT;
413 info->tcpv_minrtt = ca->minRTT;
414 rtattr_failure:;
415 }
416}
417
418static struct tcp_congestion_ops tcp_compound = {
419 .init = tcp_compound_init,
420 .ssthresh = tcp_reno_ssthresh,
421 .cong_avoid = tcp_compound_cong_avoid,
422 .rtt_sample = tcp_compound_rtt_calc,
423 .set_state = tcp_compound_state,
424 .cwnd_event = tcp_compound_cwnd_event,
425 .get_info = tcp_compound_get_info,
426
427 .owner = THIS_MODULE,
428 .name = "compound",
429};
430
431static int __init tcp_compound_register(void)
432{
433 BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE);
434 tcp_register_congestion_control(&tcp_compound);
435 return 0;
436}
437
438static void __exit tcp_compound_unregister(void)
439{
440 tcp_unregister_congestion_control(&tcp_compound);
441}
442
443module_init(tcp_compound_register);
444module_exit(tcp_compound_unregister);
445
446MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger");
447MODULE_LICENSE("GPL");
448MODULE_DESCRIPTION("TCP Compound");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 91c2f41c7f58..857eefc52aab 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
38 int ret = 0; 38 int ret = 0;
39 39
40 /* all algorithms must implement ssthresh and cong_avoid ops */ 40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { 41 if (!ca->ssthresh || !ca->cong_avoid) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n", 42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name); 43 ca->name);
44 return -EINVAL; 44 return -EINVAL;
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk)
251} 251}
252EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 252EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
253 253
254/* Lower bound on congestion window. */ 254/* Lower bound on congestion window with halving. */
255u32 tcp_reno_min_cwnd(struct sock *sk) 255u32 tcp_reno_min_cwnd(const struct sock *sk)
256{ 256{
257 const struct tcp_sock *tp = tcp_sk(sk); 257 const struct tcp_sock *tp = tcp_sk(sk);
258 return tp->snd_ssthresh/2; 258 return tp->snd_ssthresh/2;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 31a4986dfbf7..78b7a6b9e4de 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
325 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); 325 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
326} 326}
327 327
328static u32 bictcp_min_cwnd(struct sock *sk)
329{
330 return tcp_sk(sk)->snd_ssthresh;
331}
332
333static void bictcp_state(struct sock *sk, u8 new_state) 328static void bictcp_state(struct sock *sk, u8 new_state)
334{ 329{
335 if (new_state == TCP_CA_Loss) 330 if (new_state == TCP_CA_Loss)
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = {
357 .cong_avoid = bictcp_cong_avoid, 352 .cong_avoid = bictcp_cong_avoid,
358 .set_state = bictcp_state, 353 .set_state = bictcp_state,
359 .undo_cwnd = bictcp_undo_cwnd, 354 .undo_cwnd = bictcp_undo_cwnd,
360 .min_cwnd = bictcp_min_cwnd,
361 .pkts_acked = bictcp_acked, 355 .pkts_acked = bictcp_acked,
362 .owner = THIS_MODULE, 356 .owner = THIS_MODULE,
363 .name = "cubic", 357 .name = "cubic",
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index ba7c63ca5bb1..1120245b2373 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,6 +98,10 @@ struct hstcp {
98 u32 ai; 98 u32 ai;
99}; 99};
100 100
101static int max_ssthresh = 100;
102module_param(max_ssthresh, int, 0644);
103MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)");
104
101static void hstcp_init(struct sock *sk) 105static void hstcp_init(struct sock *sk)
102{ 106{
103 struct tcp_sock *tp = tcp_sk(sk); 107 struct tcp_sock *tp = tcp_sk(sk);
@@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
119 if (!tcp_is_cwnd_limited(sk, in_flight)) 123 if (!tcp_is_cwnd_limited(sk, in_flight))
120 return; 124 return;
121 125
122 if (tp->snd_cwnd <= tp->snd_ssthresh) 126 if (tp->snd_cwnd <= tp->snd_ssthresh) {
123 tcp_slow_start(tp); 127 /* RFC3742: limited slow start
124 else { 128 * the window is increased by 1/K MSS for each arriving ACK,
129 * for K = int(cwnd/(0.5 max_ssthresh))
130 */
131 if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) {
132 u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U);
133 if (++tp->snd_cwnd_cnt >= k) {
134 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
135 tp->snd_cwnd++;
136 tp->snd_cwnd_cnt = 0;
137 }
138 } else {
139 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
140 tp->snd_cwnd++;
141 }
142 } else {
125 /* Update AIMD parameters */ 143 /* Update AIMD parameters */
126 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { 144 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
127 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && 145 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 1b2ff53f98ed..3d92c1859267 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
246 } 246 }
247} 247}
248 248
249/* Lower bound on congestion window. */
250static u32 htcp_min_cwnd(struct sock *sk)
251{
252 const struct tcp_sock *tp = tcp_sk(sk);
253 return tp->snd_ssthresh;
254}
255
256
257static void htcp_init(struct sock *sk) 249static void htcp_init(struct sock *sk)
258{ 250{
259 struct htcp *ca = inet_csk_ca(sk); 251 struct htcp *ca = inet_csk_ca(sk);
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state)
285static struct tcp_congestion_ops htcp = { 277static struct tcp_congestion_ops htcp = {
286 .init = htcp_init, 278 .init = htcp_init,
287 .ssthresh = htcp_recalc_ssthresh, 279 .ssthresh = htcp_recalc_ssthresh,
288 .min_cwnd = htcp_min_cwnd,
289 .cong_avoid = htcp_cong_avoid, 280 .cong_avoid = htcp_cong_avoid,
290 .set_state = htcp_state, 281 .set_state = htcp_state,
291 .undo_cwnd = htcp_cwnd_undo, 282 .undo_cwnd = htcp_cwnd_undo,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b5521a9d3dc1..e08245bdda3a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -71,6 +71,7 @@
71#include <net/inet_common.h> 71#include <net/inet_common.h>
72#include <linux/ipsec.h> 72#include <linux/ipsec.h>
73#include <asm/unaligned.h> 73#include <asm/unaligned.h>
74#include <net/netdma.h>
74 75
75int sysctl_tcp_timestamps = 1; 76int sysctl_tcp_timestamps = 1;
76int sysctl_tcp_window_scaling = 1; 77int sysctl_tcp_window_scaling = 1;
@@ -1688,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1688 tp->snd_cwnd_stamp = tcp_time_stamp; 1689 tp->snd_cwnd_stamp = tcp_time_stamp;
1689} 1690}
1690 1691
1692/* Lower bound on congestion window is slow start threshold
1693 * unless congestion avoidance choice decides to override it.
1694 */
1695static inline u32 tcp_cwnd_min(const struct sock *sk)
1696{
1697 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1698
1699 return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
1700}
1701
1691/* Decrease cwnd each second ack. */ 1702/* Decrease cwnd each second ack. */
1692static void tcp_cwnd_down(struct sock *sk) 1703static void tcp_cwnd_down(struct sock *sk)
1693{ 1704{
1694 const struct inet_connection_sock *icsk = inet_csk(sk);
1695 struct tcp_sock *tp = tcp_sk(sk); 1705 struct tcp_sock *tp = tcp_sk(sk);
1696 int decr = tp->snd_cwnd_cnt + 1; 1706 int decr = tp->snd_cwnd_cnt + 1;
1697 1707
1698 tp->snd_cwnd_cnt = decr&1; 1708 tp->snd_cwnd_cnt = decr&1;
1699 decr >>= 1; 1709 decr >>= 1;
1700 1710
1701 if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) 1711 if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
1702 tp->snd_cwnd -= decr; 1712 tp->snd_cwnd -= decr;
1703 1713
1704 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1714 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -3785,6 +3795,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk
3785 __tcp_checksum_complete_user(sk, skb); 3795 __tcp_checksum_complete_user(sk, skb);
3786} 3796}
3787 3797
3798#ifdef CONFIG_NET_DMA
3799static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
3800{
3801 struct tcp_sock *tp = tcp_sk(sk);
3802 int chunk = skb->len - hlen;
3803 int dma_cookie;
3804 int copied_early = 0;
3805
3806 if (tp->ucopy.wakeup)
3807 return 0;
3808
3809 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
3810 tp->ucopy.dma_chan = get_softnet_dma();
3811
3812 if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
3813
3814 dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
3815 skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
3816
3817 if (dma_cookie < 0)
3818 goto out;
3819
3820 tp->ucopy.dma_cookie = dma_cookie;
3821 copied_early = 1;
3822
3823 tp->ucopy.len -= chunk;
3824 tp->copied_seq += chunk;
3825 tcp_rcv_space_adjust(sk);
3826
3827 if ((tp->ucopy.len == 0) ||
3828 (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
3829 (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
3830 tp->ucopy.wakeup = 1;
3831 sk->sk_data_ready(sk, 0);
3832 }
3833 } else if (chunk > 0) {
3834 tp->ucopy.wakeup = 1;
3835 sk->sk_data_ready(sk, 0);
3836 }
3837out:
3838 return copied_early;
3839}
3840#endif /* CONFIG_NET_DMA */
3841
3788/* 3842/*
3789 * TCP receive function for the ESTABLISHED state. 3843 * TCP receive function for the ESTABLISHED state.
3790 * 3844 *
@@ -3886,8 +3940,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3886 tp->rcv_nxt == tp->rcv_wup) 3940 tp->rcv_nxt == tp->rcv_wup)
3887 tcp_store_ts_recent(tp); 3941 tcp_store_ts_recent(tp);
3888 3942
3889 tcp_rcv_rtt_measure_ts(sk, skb);
3890
3891 /* We know that such packets are checksummed 3943 /* We know that such packets are checksummed
3892 * on entry. 3944 * on entry.
3893 */ 3945 */
@@ -3901,14 +3953,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3901 } 3953 }
3902 } else { 3954 } else {
3903 int eaten = 0; 3955 int eaten = 0;
3956 int copied_early = 0;
3904 3957
3905 if (tp->ucopy.task == current && 3958 if (tp->copied_seq == tp->rcv_nxt &&
3906 tp->copied_seq == tp->rcv_nxt && 3959 len - tcp_header_len <= tp->ucopy.len) {
3907 len - tcp_header_len <= tp->ucopy.len && 3960#ifdef CONFIG_NET_DMA
3908 sock_owned_by_user(sk)) { 3961 if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
3909 __set_current_state(TASK_RUNNING); 3962 copied_early = 1;
3963 eaten = 1;
3964 }
3965#endif
3966 if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
3967 __set_current_state(TASK_RUNNING);
3910 3968
3911 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { 3969 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
3970 eaten = 1;
3971 }
3972 if (eaten) {
3912 /* Predicted packet is in window by definition. 3973 /* Predicted packet is in window by definition.
3913 * seq == rcv_nxt and rcv_wup <= rcv_nxt. 3974 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
3914 * Hence, check seq<=rcv_wup reduces to: 3975 * Hence, check seq<=rcv_wup reduces to:
@@ -3924,8 +3985,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3924 __skb_pull(skb, tcp_header_len); 3985 __skb_pull(skb, tcp_header_len);
3925 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 3986 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3926 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); 3987 NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
3927 eaten = 1;
3928 } 3988 }
3989 if (copied_early)
3990 tcp_cleanup_rbuf(sk, skb->len);
3929 } 3991 }
3930 if (!eaten) { 3992 if (!eaten) {
3931 if (tcp_checksum_complete_user(sk, skb)) 3993 if (tcp_checksum_complete_user(sk, skb))
@@ -3966,6 +4028,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3966 4028
3967 __tcp_ack_snd_check(sk, 0); 4029 __tcp_ack_snd_check(sk, 0);
3968no_ack: 4030no_ack:
4031#ifdef CONFIG_NET_DMA
4032 if (copied_early)
4033 __skb_queue_tail(&sk->sk_async_wait_queue, skb);
4034 else
4035#endif
3969 if (eaten) 4036 if (eaten)
3970 __kfree_skb(skb); 4037 __kfree_skb(skb);
3971 else 4038 else
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 672950e54c49..25ecc6e2478b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -71,6 +71,7 @@
71#include <net/inet_common.h> 71#include <net/inet_common.h>
72#include <net/timewait_sock.h> 72#include <net/timewait_sock.h>
73#include <net/xfrm.h> 73#include <net/xfrm.h>
74#include <net/netdma.h>
74 75
75#include <linux/inet.h> 76#include <linux/inet.h>
76#include <linux/ipv6.h> 77#include <linux/ipv6.h>
@@ -1091,8 +1092,18 @@ process:
1091 bh_lock_sock(sk); 1092 bh_lock_sock(sk);
1092 ret = 0; 1093 ret = 0;
1093 if (!sock_owned_by_user(sk)) { 1094 if (!sock_owned_by_user(sk)) {
1094 if (!tcp_prequeue(sk, skb)) 1095#ifdef CONFIG_NET_DMA
1096 struct tcp_sock *tp = tcp_sk(sk);
1097 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1098 tp->ucopy.dma_chan = get_softnet_dma();
1099 if (tp->ucopy.dma_chan)
1095 ret = tcp_v4_do_rcv(sk, skb); 1100 ret = tcp_v4_do_rcv(sk, skb);
1101 else
1102#endif
1103 {
1104 if (!tcp_prequeue(sk, skb))
1105 ret = tcp_v4_do_rcv(sk, skb);
1106 }
1096 } else 1107 } else
1097 sk_add_backlog(sk, skb); 1108 sk_add_backlog(sk, skb);
1098 bh_unlock_sock(sk); 1109 bh_unlock_sock(sk);
@@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk)
1296 /* Cleans up our, hopefully empty, out_of_order_queue. */ 1307 /* Cleans up our, hopefully empty, out_of_order_queue. */
1297 __skb_queue_purge(&tp->out_of_order_queue); 1308 __skb_queue_purge(&tp->out_of_order_queue);
1298 1309
1310#ifdef CONFIG_NET_DMA
1311 /* Cleans up our sk_async_wait_queue */
1312 __skb_queue_purge(&sk->sk_async_wait_queue);
1313#endif
1314
1299 /* Clean prequeue, it must be empty really */ 1315 /* Clean prequeue, it must be empty really */
1300 __skb_queue_purge(&tp->ucopy.prequeue); 1316 __skb_queue_purge(&tp->ucopy.prequeue);
1301 1317
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 000000000000..1f977b6ee9a1
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,338 @@
1/*
2 * TCP Low Priority (TCP-LP)
3 *
4 * TCP Low Priority is a distributed algorithm whose goal is to utilize only
5 * the excess network bandwidth as compared to the ``fair share`` of
6 * bandwidth as targeted by TCP. Available from:
7 * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
8 *
9 * Original Author:
10 * Aleksandar Kuzmanovic <akuzma@northwestern.edu>
11 *
12 * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation.
13 * As of 2.6.13, Linux supports pluggable congestion control algorithms.
14 * Due to the limitation of the API, we take the following changes from
15 * the original TCP-LP implementation:
16 * o We use newReno in most core CA handling. Only add some checking
17 * within cong_avoid.
18 * o Error correcting in remote HZ, therefore remote HZ will be kept
19 * on checking and updating.
20 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
21 * OWD has a similar meaning to RTT. Also correct the buggy formula.
22 * o Handle reaction for Early Congestion Indication (ECI) within
23 * pkts_acked, as mentioned within pseudo code.
24 * o OWD is handled in relative format, where local time stamp will in
25 * tcp_time_stamp format.
26 *
27 * Port from 2.4.19 to 2.6.16 as module by:
28 * Wong Hoi Sing Edison <hswong3i@gmail.com>
29 * Hung Hing Lun <hlhung3i@gmail.com>
30 *
31 * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $
32 */
33
34#include <linux/config.h>
35#include <linux/module.h>
36#include <net/tcp.h>
37
38/* resolution of owd */
39#define LP_RESOL 1000
40
41/**
42 * enum tcp_lp_state
43 * @LP_VALID_RHZ: is remote HZ valid?
44 * @LP_VALID_OWD: is OWD valid?
45 * @LP_WITHIN_THR: are we within threshold?
46 * @LP_WITHIN_INF: are we within inference?
47 *
48 * TCP-LP's state flags.
49 * We create this set of state flag mainly for debugging.
50 */
51enum tcp_lp_state {
52 LP_VALID_RHZ = (1 << 0),
53 LP_VALID_OWD = (1 << 1),
54 LP_WITHIN_THR = (1 << 3),
55 LP_WITHIN_INF = (1 << 4),
56};
57
58/**
59 * struct lp
60 * @flag: TCP-LP state flag
61 * @sowd: smoothed OWD << 3
62 * @owd_min: min OWD
63 * @owd_max: max OWD
64 * @owd_max_rsv: reserved max owd
65 * @remote_hz: estimated remote HZ
66 * @remote_ref_time: remote reference time
67 * @local_ref_time: local reference time
68 * @last_drop: time for last active drop
69 * @inference: current inference
70 *
71 * TCP-LP's private struct.
72 * We get the idea from original TCP-LP implementation where only left those we
73 * found are really useful.
74 */
75struct lp {
76 u32 flag;
77 u32 sowd;
78 u32 owd_min;
79 u32 owd_max;
80 u32 owd_max_rsv;
81 u32 remote_hz;
82 u32 remote_ref_time;
83 u32 local_ref_time;
84 u32 last_drop;
85 u32 inference;
86};
87
88/**
89 * tcp_lp_init
90 *
91 * Init all required variables.
92 * Clone the handling from Vegas module implementation.
93 */
94static void tcp_lp_init(struct sock *sk)
95{
96 struct lp *lp = inet_csk_ca(sk);
97
98 lp->flag = 0;
99 lp->sowd = 0;
100 lp->owd_min = 0xffffffff;
101 lp->owd_max = 0;
102 lp->owd_max_rsv = 0;
103 lp->remote_hz = 0;
104 lp->remote_ref_time = 0;
105 lp->local_ref_time = 0;
106 lp->last_drop = 0;
107 lp->inference = 0;
108}
109
110/**
111 * tcp_lp_cong_avoid
112 *
113 * Implementation of cong_avoid.
114 * Will only call newReno CA when away from inference.
115 * From TCP-LP's paper, this will be handled in additive increase.
116 */
117static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
118 int flag)
119{
120 struct lp *lp = inet_csk_ca(sk);
121
122 if (!(lp->flag & LP_WITHIN_INF))
123 tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
124}
125
126/**
127 * tcp_lp_remote_hz_estimator
128 *
129 * Estimate remote HZ.
130 * We keep on updating the estimated value, whereas the original TCP-LP
131 * implementation only guesses it once and uses that forever.
132 */
133static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
134{
135 struct tcp_sock *tp = tcp_sk(sk);
136 struct lp *lp = inet_csk_ca(sk);
137 s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */
138 s64 m = 0;
139
140 /* not yet record reference time
141 * go away!! record it before come back!! */
142 if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
143 goto out;
144
145 /* we can't calc remote HZ with no difference!! */
146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time
147 || tp->rx_opt.rcv_tsecr == lp->local_ref_time)
148 goto out;
149
150 m = HZ * (tp->rx_opt.rcv_tsval -
151 lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
152 lp->local_ref_time);
153 if (m < 0)
154 m = -m;
155
156 if (rhz != 0) {
157 m -= rhz >> 6; /* m is now error in remote HZ est */
158 rhz += m; /* 63/64 old + 1/64 new */
159 } else
160 rhz = m << 6;
161
162 /* record time for successful remote HZ calc */
163 lp->flag |= LP_VALID_RHZ;
164
165 out:
166 /* record reference time stamp */
167 lp->remote_ref_time = tp->rx_opt.rcv_tsval;
168 lp->local_ref_time = tp->rx_opt.rcv_tsecr;
169
170 return rhz >> 6;
171}
172
173/**
174 * tcp_lp_owd_calculator
175 *
176 * Calculate one way delay (in relative format).
177 * The original implements OWD directly as the difference between the remote
178 * time difference and the local time difference. As each of these is simply
179 * equal to RTT, when the network is stable the remote RTT equals the local
180 * RTT, and the resulting OWD is zero.
181 * It seems to be a bug and so we fixed it.
182 */
183static u32 tcp_lp_owd_calculator(struct sock *sk)
184{
185 struct tcp_sock *tp = tcp_sk(sk);
186 struct lp *lp = inet_csk_ca(sk);
187 s64 owd = 0;
188
189 lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
190
191 if (lp->flag & LP_VALID_RHZ) {
192 owd =
193 tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
194 tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
195 if (owd < 0)
196 owd = -owd;
197 }
198
199 if (owd > 0)
200 lp->flag |= LP_VALID_OWD;
201 else
202 lp->flag &= ~LP_VALID_OWD;
203
204 return owd;
205}
206
207/**
208 * tcp_lp_rtt_sample
209 *
210 * Implementation or rtt_sample.
211 * Will take the following action,
212 * 1. calc OWD,
213 * 2. record the min/max OWD,
214 * 3. calc smoothed OWD (SOWD).
215 * Most ideas come from the original TCP-LP implementation.
216 */
217static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt)
218{
219 struct lp *lp = inet_csk_ca(sk);
220 s64 mowd = tcp_lp_owd_calculator(sk);
221
222 /* sorry that we don't have valid data */
223 if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
224 return;
225
226 /* record the next min owd */
227 if (mowd < lp->owd_min)
228 lp->owd_min = mowd;
229
230 /* always forget the max of the max
231 * we just set owd_max as one below it */
232 if (mowd > lp->owd_max) {
233 if (mowd > lp->owd_max_rsv) {
234 if (lp->owd_max_rsv == 0)
235 lp->owd_max = mowd;
236 else
237 lp->owd_max = lp->owd_max_rsv;
238 lp->owd_max_rsv = mowd;
239 } else
240 lp->owd_max = mowd;
241 }
242
243 /* calc for smoothed owd */
244 if (lp->sowd != 0) {
245 mowd -= lp->sowd >> 3; /* m is now error in owd est */
246 lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */
247 } else
248 lp->sowd = mowd << 3; /* take the measured time be owd */
249}
250
251/**
252 * tcp_lp_pkts_acked
253 *
254 * Implementation of pkts_acked.
255 * Deal with active drop under Early Congestion Indication.
256 * Only drop to half and 1 will be handle, because we hope to use back
257 * newReno in increase case.
258 * We work it out by following the idea from TCP-LP's paper directly
259 */
260static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked)
261{
262 struct tcp_sock *tp = tcp_sk(sk);
263 struct lp *lp = inet_csk_ca(sk);
264
265 /* calc inference */
266 if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
267 lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
268
269 /* test if within inference */
270 if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
271 lp->flag |= LP_WITHIN_INF;
272 else
273 lp->flag &= ~LP_WITHIN_INF;
274
275 /* test if within threshold */
276 if (lp->sowd >> 3 <
277 lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
278 lp->flag |= LP_WITHIN_THR;
279 else
280 lp->flag &= ~LP_WITHIN_THR;
281
282 pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
283 tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
284 lp->sowd >> 3);
285
286 if (lp->flag & LP_WITHIN_THR)
287 return;
288
289 /* FIXME: try to reset owd_min and owd_max here
290 * so decrease the chance the min/max is no longer suitable
291 * and will usually within threshold when whithin inference */
292 lp->owd_min = lp->sowd >> 3;
293 lp->owd_max = lp->sowd >> 2;
294 lp->owd_max_rsv = lp->sowd >> 2;
295
296 /* happened within inference
297 * drop snd_cwnd into 1 */
298 if (lp->flag & LP_WITHIN_INF)
299 tp->snd_cwnd = 1U;
300
301 /* happened after inference
302 * cut snd_cwnd into half */
303 else
304 tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
305
306 /* record this drop time */
307 lp->last_drop = tcp_time_stamp;
308}
309
310static struct tcp_congestion_ops tcp_lp = {
311 .init = tcp_lp_init,
312 .ssthresh = tcp_reno_ssthresh,
313 .cong_avoid = tcp_lp_cong_avoid,
314 .min_cwnd = tcp_reno_min_cwnd,
315 .rtt_sample = tcp_lp_rtt_sample,
316 .pkts_acked = tcp_lp_pkts_acked,
317
318 .owner = THIS_MODULE,
319 .name = "lp"
320};
321
322static int __init tcp_lp_register(void)
323{
324 BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
325 return tcp_register_congestion_control(&tcp_lp);
326}
327
328static void __exit tcp_lp_unregister(void)
329{
330 tcp_unregister_congestion_control(&tcp_lp);
331}
332
333module_init(tcp_lp_register);
334module_exit(tcp_lp_unregister);
335
336MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun");
337MODULE_LICENSE("GPL");
338MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f33c9dddaa12..07bb5a2b375e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3;
59int sysctl_tcp_mtu_probing = 0; 59int sysctl_tcp_mtu_probing = 0;
60int sysctl_tcp_base_mss = 512; 60int sysctl_tcp_base_mss = 512;
61 61
62/* By default, RFC2861 behavior. */
63int sysctl_tcp_slow_start_after_idle = 1;
64
62static void update_send_head(struct sock *sk, struct tcp_sock *tp, 65static void update_send_head(struct sock *sk, struct tcp_sock *tp,
63 struct sk_buff *skb) 66 struct sk_buff *skb)
64{ 67{
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
138 struct inet_connection_sock *icsk = inet_csk(sk); 141 struct inet_connection_sock *icsk = inet_csk(sk);
139 const u32 now = tcp_time_stamp; 142 const u32 now = tcp_time_stamp;
140 143
141 if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) 144 if (sysctl_tcp_slow_start_after_idle &&
145 (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
142 tcp_cwnd_restart(sk, __sk_dst_get(sk)); 146 tcp_cwnd_restart(sk, __sk_dst_get(sk));
143 147
144 tp->lsndtime = now; 148 tp->lsndtime = now;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
new file mode 100644
index 000000000000..d7d517a3a238
--- /dev/null
+++ b/net/ipv4/tcp_probe.c
@@ -0,0 +1,181 @@
1/*
2 * tcpprobe - Observe the TCP flow with kprobes.
3 *
4 * The idea for this came from Werner Almesberger's umlsim
5 * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/kprobes.h>
#include <linux/socket.h>
#include <linux/tcp.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/kfifo.h>
#include <linux/vmalloc.h>

#include <net/tcp.h>
32
33MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>");
34MODULE_DESCRIPTION("TCP cwnd snooper");
35MODULE_LICENSE("GPL");
36
37static int port = 0;
38MODULE_PARM_DESC(port, "Port to match (0=all)");
39module_param(port, int, 0);
40
41static int bufsize = 64*1024;
42MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
43module_param(bufsize, int, 0);
44
45static const char procname[] = "tcpprobe";
46
47struct {
48 struct kfifo *fifo;
49 spinlock_t lock;
50 wait_queue_head_t wait;
51 struct timeval tstart;
52} tcpw;
53
54static void printl(const char *fmt, ...)
55{
56 va_list args;
57 int len;
58 struct timeval now;
59 char tbuf[256];
60
61 va_start(args, fmt);
62 do_gettimeofday(&now);
63
64 now.tv_sec -= tcpw.tstart.tv_sec;
65 now.tv_usec -= tcpw.tstart.tv_usec;
66 if (now.tv_usec < 0) {
67 --now.tv_sec;
68 now.tv_usec += 1000000;
69 }
70
71 len = sprintf(tbuf, "%lu.%06lu ",
72 (unsigned long) now.tv_sec,
73 (unsigned long) now.tv_usec);
74 len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
75 va_end(args);
76
77 kfifo_put(tcpw.fifo, tbuf, len);
78 wake_up(&tcpw.wait);
79}
80
81static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk,
82 struct msghdr *msg, size_t size)
83{
84 const struct tcp_sock *tp = tcp_sk(sk);
85 const struct inet_sock *inet = inet_sk(sk);
86
87 if (port == 0 || ntohs(inet->dport) == port ||
88 ntohs(inet->sport) == port) {
89 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n",
90 NIPQUAD(inet->saddr), ntohs(inet->sport),
91 NIPQUAD(inet->daddr), ntohs(inet->dport),
92 size, tp->snd_nxt, tp->snd_una,
93 tp->snd_cwnd, tcp_current_ssthresh(sk),
94 tp->snd_wnd);
95 }
96
97 jprobe_return();
98 return 0;
99}
100
101static struct jprobe tcp_send_probe = {
102 .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, },
103 .entry = (kprobe_opcode_t *) &jtcp_sendmsg,
104};
105
106
107static int tcpprobe_open(struct inode * inode, struct file * file)
108{
109 kfifo_reset(tcpw.fifo);
110 do_gettimeofday(&tcpw.tstart);
111 return 0;
112}
113
114static ssize_t tcpprobe_read(struct file *file, char __user *buf,
115 size_t len, loff_t *ppos)
116{
117 int error = 0, cnt;
118 unsigned char *tbuf;
119
120 if (!buf || len < 0)
121 return -EINVAL;
122
123 if (len == 0)
124 return 0;
125
126 tbuf = vmalloc(len);
127 if (!tbuf)
128 return -ENOMEM;
129
130 error = wait_event_interruptible(tcpw.wait,
131 __kfifo_len(tcpw.fifo) != 0);
132 if (error)
133 return error;
134
135 cnt = kfifo_get(tcpw.fifo, tbuf, len);
136 error = copy_to_user(buf, tbuf, cnt);
137
138 vfree(tbuf);
139
140 return error ? error : cnt;
141}
142
143static struct file_operations tcpprobe_fops = {
144 .owner = THIS_MODULE,
145 .open = tcpprobe_open,
146 .read = tcpprobe_read,
147};
148
149static __init int tcpprobe_init(void)
150{
151 int ret = -ENOMEM;
152
153 init_waitqueue_head(&tcpw.wait);
154 spin_lock_init(&tcpw.lock);
155 tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock);
156
157 if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops))
158 goto err0;
159
160 ret = register_jprobe(&tcp_send_probe);
161 if (ret)
162 goto err1;
163
164 pr_info("TCP watch registered (port=%d)\n", port);
165 return 0;
166 err1:
167 proc_net_remove(procname);
168 err0:
169 kfifo_free(tcpw.fifo);
170 return ret;
171}
172module_init(tcpprobe_init);
173
174static __exit void tcpprobe_exit(void)
175{
176 kfifo_free(tcpw.fifo);
177 proc_net_remove(procname);
178 unregister_jprobe(&tcp_send_probe);
179
180}
181module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 000000000000..11b42a7135c1
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,231 @@
1/*
2 * TCP Veno congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * C. P. Fu, S. C. Liew.
6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
7 * IEEE Journal on Selected Areas in Communication,
8 * Feb. 2003.
9 * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
10 */
11
12#include <linux/config.h>
13#include <linux/mm.h>
14#include <linux/module.h>
15#include <linux/skbuff.h>
16#include <linux/inet_diag.h>
17
18#include <net/tcp.h>
19
20/* Default values of the Veno variables, in fixed-point representation
21 * with V_PARAM_SHIFT bits to the right of the binary point.
22 */
23#define V_PARAM_SHIFT 1
24static const int beta = 3 << V_PARAM_SHIFT;
25
26/* Veno variables */
27struct veno {
28 u8 doing_veno_now; /* if true, do veno for this rtt */
29 u16 cntrtt; /* # of rtts measured within last rtt */
30 u32 minrtt; /* min of rtts measured within last rtt (in usec) */
31 u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
32 u32 inc; /* decide whether to increase cwnd */
33 u32 diff; /* calculate the diff rate */
34};
35
36/* There are several situations when we must "re-start" Veno:
37 *
38 * o when a connection is established
39 * o after an RTO
40 * o after fast recovery
41 * o when we send a packet and there is no outstanding
42 * unacknowledged data (restarting an idle connection)
43 *
44 */
45static inline void veno_enable(struct sock *sk)
46{
47 struct veno *veno = inet_csk_ca(sk);
48
49 /* turn on Veno */
50 veno->doing_veno_now = 1;
51
52 veno->minrtt = 0x7fffffff;
53}
54
55static inline void veno_disable(struct sock *sk)
56{
57 struct veno *veno = inet_csk_ca(sk);
58
59 /* turn off Veno */
60 veno->doing_veno_now = 0;
61}
62
63static void tcp_veno_init(struct sock *sk)
64{
65 struct veno *veno = inet_csk_ca(sk);
66
67 veno->basertt = 0x7fffffff;
68 veno->inc = 1;
69 veno_enable(sk);
70}
71
72/* Do rtt sampling needed for Veno. */
73static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt)
74{
75 struct veno *veno = inet_csk_ca(sk);
76 u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */
77
78 /* Filter to find propagation delay: */
79 if (vrtt < veno->basertt)
80 veno->basertt = vrtt;
81
82 /* Find the min rtt during the last rtt to find
83 * the current prop. delay + queuing delay:
84 */
85 veno->minrtt = min(veno->minrtt, vrtt);
86 veno->cntrtt++;
87}
88
89static void tcp_veno_state(struct sock *sk, u8 ca_state)
90{
91 if (ca_state == TCP_CA_Open)
92 veno_enable(sk);
93 else
94 veno_disable(sk);
95}
96
97/*
98 * If the connection is idle and we are restarting,
99 * then we don't want to do any Veno calculations
100 * until we get fresh rtt samples. So when we
101 * restart, we reset our Veno state to a clean
102 * state. After we get acks for this flight of
103 * packets, _then_ we can make Veno calculations
104 * again.
105 */
106static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
107{
108 if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
109 tcp_veno_init(sk);
110}
111
112static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
113 u32 seq_rtt, u32 in_flight, int flag)
114{
115 struct tcp_sock *tp = tcp_sk(sk);
116 struct veno *veno = inet_csk_ca(sk);
117
118 if (!veno->doing_veno_now)
119 return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
120
121 /* limited by applications */
122 if (!tcp_is_cwnd_limited(sk, in_flight))
123 return;
124
125 /* We do the Veno calculations only if we got enough rtt samples */
126 if (veno->cntrtt <= 2) {
127 /* We don't have enough rtt samples to do the Veno
128 * calculation, so we'll behave like Reno.
129 */
130 tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
131 } else {
132 u32 rtt, target_cwnd;
133
134 /* We have enough rtt samples, so, using the Veno
135 * algorithm, we determine the state of the network.
136 */
137
138 rtt = veno->minrtt;
139
140 target_cwnd = ((tp->snd_cwnd * veno->basertt)
141 << V_PARAM_SHIFT) / rtt;
142
143 veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
144
145 if (tp->snd_cwnd <= tp->snd_ssthresh) {
146 /* Slow start. */
147 tcp_slow_start(tp);
148 } else {
149 /* Congestion avoidance. */
150 if (veno->diff < beta) {
151 /* In the "non-congestive state", increase cwnd
152 * every rtt.
153 */
154 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
155 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
156 tp->snd_cwnd++;
157 tp->snd_cwnd_cnt = 0;
158 } else
159 tp->snd_cwnd_cnt++;
160 } else {
161 /* In the "congestive state", increase cwnd
162 * every other rtt.
163 */
164 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
165 if (veno->inc
166 && tp->snd_cwnd <
167 tp->snd_cwnd_clamp) {
168 tp->snd_cwnd++;
169 veno->inc = 0;
170 } else
171 veno->inc = 1;
172 tp->snd_cwnd_cnt = 0;
173 } else
174 tp->snd_cwnd_cnt++;
175 }
176
177 }
178 if (tp->snd_cwnd < 2)
179 tp->snd_cwnd = 2;
180 else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
181 tp->snd_cwnd = tp->snd_cwnd_clamp;
182 }
183 /* Wipe the slate clean for the next rtt. */
184 /* veno->cntrtt = 0; */
185 veno->minrtt = 0x7fffffff;
186}
187
188/* Veno MD phase */
189static u32 tcp_veno_ssthresh(struct sock *sk)
190{
191 const struct tcp_sock *tp = tcp_sk(sk);
192 struct veno *veno = inet_csk_ca(sk);
193
194 if (veno->diff < beta)
195 /* in "non-congestive state", cut cwnd by 1/5 */
196 return max(tp->snd_cwnd * 4 / 5, 2U);
197 else
198 /* in "congestive state", cut cwnd by 1/2 */
199 return max(tp->snd_cwnd >> 1U, 2U);
200}
201
202static struct tcp_congestion_ops tcp_veno = {
203 .init = tcp_veno_init,
204 .ssthresh = tcp_veno_ssthresh,
205 .cong_avoid = tcp_veno_cong_avoid,
206 .rtt_sample = tcp_veno_rtt_calc,
207 .set_state = tcp_veno_state,
208 .cwnd_event = tcp_veno_cwnd_event,
209
210 .owner = THIS_MODULE,
211 .name = "veno",
212};
213
214static int __init tcp_veno_register(void)
215{
216 BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
217 tcp_register_congestion_control(&tcp_veno);
218 return 0;
219}
220
221static void __exit tcp_veno_unregister(void)
222{
223 tcp_unregister_congestion_control(&tcp_veno);
224}
225
226module_init(tcp_veno_register);
227module_exit(tcp_veno_unregister);
228
229MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
230MODULE_LICENSE("GPL");
231MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 0c340c3756c2..4247da1384bf 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -1,7 +1,24 @@
1/* 1/*
2 * TCP Westwood+ 2 * TCP Westwood+: end-to-end bandwidth estimation for TCP
3 * 3 *
4 * Angelo Dell'Aera: TCP Westwood+ support 4 * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
5 *
6 * Support at http://c3lab.poliba.it/index.php/Westwood
7 * Main references in literature:
8 *
9 * - Mascolo S, Casetti, M. Gerla et al.
10 * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
11 *
12 * - A. Grieco, s. Mascolo
13 * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
14 * Comm. Review, 2004
15 *
16 * - A. Dell'Aera, L. Grieco, S. Mascolo.
17 * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
18 * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
19 *
20 * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
21 * ssthresh after packet loss. The probing phase is as the original Reno.
5 */ 22 */
6 23
7#include <linux/config.h> 24#include <linux/config.h>
@@ -22,6 +39,8 @@ struct westwood {
22 u32 accounted; 39 u32 accounted;
23 u32 rtt; 40 u32 rtt;
24 u32 rtt_min; /* minimum observed RTT */ 41 u32 rtt_min; /* minimum observed RTT */
42 u8 first_ack; /* flag which infers that this is the first ack */
43 u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/
25}; 44};
26 45
27 46
@@ -49,9 +68,11 @@ static void tcp_westwood_init(struct sock *sk)
49 w->bw_est = 0; 68 w->bw_est = 0;
50 w->accounted = 0; 69 w->accounted = 0;
51 w->cumul_ack = 0; 70 w->cumul_ack = 0;
71 w->reset_rtt_min = 1;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; 72 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp; 73 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tcp_sk(sk)->snd_una; 74 w->snd_una = tcp_sk(sk)->snd_una;
75 w->first_ack = 1;
55} 76}
56 77
57/* 78/*
@@ -63,10 +84,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b)
63 return (((7 * a) + b) >> 3); 84 return (((7 * a) + b) >> 3);
64} 85}
65 86
66static inline void westwood_filter(struct westwood *w, u32 delta) 87static void westwood_filter(struct westwood *w, u32 delta)
67{ 88{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); 89 /* If the filter is empty fill it with the first sample of bandwidth */
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); 90 if (w->bw_ns_est == 0 && w->bw_est == 0) {
91 w->bw_ns_est = w->bk / delta;
92 w->bw_est = w->bw_ns_est;
93 } else {
94 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
95 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
96 }
70} 97}
71 98
72/* 99/*
@@ -91,6 +118,15 @@ static void westwood_update_window(struct sock *sk)
91 struct westwood *w = inet_csk_ca(sk); 118 struct westwood *w = inet_csk_ca(sk);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx; 119 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93 120
121 /* Initialize w->snd_una with the first acked sequence number in order
122 * to fix mismatch between tp->snd_una and w->snd_una for the first
123 * bandwidth sample
124 */
125 if (w->first_ack) {
126 w->snd_una = tcp_sk(sk)->snd_una;
127 w->first_ack = 0;
128 }
129
94 /* 130 /*
95 * See if a RTT-window has passed. 131 * See if a RTT-window has passed.
96 * Be careful since if RTT is less than 132 * Be careful since if RTT is less than
@@ -108,6 +144,16 @@ static void westwood_update_window(struct sock *sk)
108 } 144 }
109} 145}
110 146
147static inline void update_rtt_min(struct westwood *w)
148{
149 if (w->reset_rtt_min) {
150 w->rtt_min = w->rtt;
151 w->reset_rtt_min = 0;
152 } else
153 w->rtt_min = min(w->rtt, w->rtt_min);
154}
155
156
111/* 157/*
112 * @westwood_fast_bw 158 * @westwood_fast_bw
113 * It is called when we are in fast path. In particular it is called when 159 * It is called when we are in fast path. In particular it is called when
@@ -123,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk)
123 169
124 w->bk += tp->snd_una - w->snd_una; 170 w->bk += tp->snd_una - w->snd_una;
125 w->snd_una = tp->snd_una; 171 w->snd_una = tp->snd_una;
126 w->rtt_min = min(w->rtt, w->rtt_min); 172 update_rtt_min(w);
127} 173}
128 174
129/* 175/*
@@ -162,12 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
162 return w->cumul_ack; 208 return w->cumul_ack;
163} 209}
164 210
165static inline u32 westwood_bw_rttmin(const struct sock *sk)
166{
167 const struct tcp_sock *tp = tcp_sk(sk);
168 const struct westwood *w = inet_csk_ca(sk);
169 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
170}
171 211
172/* 212/*
173 * TCP Westwood 213 * TCP Westwood
@@ -175,9 +215,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk)
175 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 215 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
176 * so avoids ever returning 0. 216 * so avoids ever returning 0.
177 */ 217 */
178static u32 tcp_westwood_cwnd_min(struct sock *sk) 218static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
179{ 219{
180 return westwood_bw_rttmin(sk); 220 const struct tcp_sock *tp = tcp_sk(sk);
221 const struct westwood *w = inet_csk_ca(sk);
222 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
181} 223}
182 224
183static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) 225static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
@@ -191,17 +233,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
191 break; 233 break;
192 234
193 case CA_EVENT_COMPLETE_CWR: 235 case CA_EVENT_COMPLETE_CWR:
194 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); 236 tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
195 break; 237 break;
196 238
197 case CA_EVENT_FRTO: 239 case CA_EVENT_FRTO:
198 tp->snd_ssthresh = westwood_bw_rttmin(sk); 240 tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
241 /* Update RTT_min when next ack arrives */
242 w->reset_rtt_min = 1;
199 break; 243 break;
200 244
201 case CA_EVENT_SLOW_ACK: 245 case CA_EVENT_SLOW_ACK:
202 westwood_update_window(sk); 246 westwood_update_window(sk);
203 w->bk += westwood_acked_count(sk); 247 w->bk += westwood_acked_count(sk);
204 w->rtt_min = min(w->rtt, w->rtt_min); 248 update_rtt_min(w);
205 break; 249 break;
206 250
207 default: 251 default:
@@ -235,7 +279,7 @@ static struct tcp_congestion_ops tcp_westwood = {
235 .init = tcp_westwood_init, 279 .init = tcp_westwood_init,
236 .ssthresh = tcp_reno_ssthresh, 280 .ssthresh = tcp_reno_ssthresh,
237 .cong_avoid = tcp_reno_cong_avoid, 281 .cong_avoid = tcp_reno_cong_avoid,
238 .min_cwnd = tcp_westwood_cwnd_min, 282 .min_cwnd = tcp_westwood_bw_rttmin,
239 .cwnd_event = tcp_westwood_event, 283 .cwnd_event = tcp_westwood_event,
240 .get_info = tcp_westwood_info, 284 .get_info = tcp_westwood_info,
241 .pkts_acked = tcp_westwood_pkts_acked, 285 .pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 3e174c83bfe7..817ed84511a6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -13,7 +13,6 @@
13#include <linux/string.h> 13#include <linux/string.h>
14#include <linux/netfilter.h> 14#include <linux/netfilter.h>
15#include <linux/netfilter_ipv4.h> 15#include <linux/netfilter_ipv4.h>
16#include <net/inet_ecn.h>
17#include <net/ip.h> 16#include <net/ip.h>
18#include <net/xfrm.h> 17#include <net/xfrm.h>
19 18
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb)
24 23
25EXPORT_SYMBOL(xfrm4_rcv); 24EXPORT_SYMBOL(xfrm4_rcv);
26 25
27static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
28{
29 struct iphdr *outer_iph = skb->nh.iph;
30 struct iphdr *inner_iph = skb->h.ipiph;
31
32 if (INET_ECN_is_ce(outer_iph->tos))
33 IP_ECN_set_ce(inner_iph);
34}
35
36static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) 26static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
37{ 27{
38 switch (nexthdr) { 28 switch (nexthdr) {
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
113 103
114 xfrm_vec[xfrm_nr++] = x; 104 xfrm_vec[xfrm_nr++] = x;
115 105
116 iph = skb->nh.iph; 106 if (x->mode->input(x, skb))
107 goto drop;
117 108
118 if (x->props.mode) { 109 if (x->props.mode) {
119 if (iph->protocol != IPPROTO_IPIP)
120 goto drop;
121 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
122 goto drop;
123 if (skb_cloned(skb) &&
124 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
125 goto drop;
126 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
127 ipv4_copy_dscp(iph, skb->h.ipiph);
128 if (!(x->props.flags & XFRM_STATE_NOECN))
129 ipip_ecn_decapsulate(skb);
130 skb->mac.raw = memmove(skb->data - skb->mac_len,
131 skb->mac.raw, skb->mac_len);
132 skb->nh.raw = skb->data;
133 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
134 decaps = 1; 110 decaps = 1;
135 break; 111 break;
136 } 112 }
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 000000000000..a9e6b3dd19c9
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,83 @@
1/*
2 * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
3 *
4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
5 */
6
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/stringify.h>
12#include <net/dst.h>
13#include <net/ip.h>
14#include <net/xfrm.h>
15
16/* Add encapsulation header.
17 *
18 * The IP header will be moved forward to make space for the encapsulation
19 * header.
20 *
21 * On exit, skb->h will be set to the start of the payload to be processed
22 * by x->type->output and skb->nh will be set to the top IP header.
23 */
24static int xfrm4_transport_output(struct sk_buff *skb)
25{
26 struct xfrm_state *x;
27 struct iphdr *iph;
28 int ihl;
29
30 iph = skb->nh.iph;
31 skb->h.ipiph = iph;
32
33 ihl = iph->ihl * 4;
34 skb->h.raw += ihl;
35
36 x = skb->dst->xfrm;
37 skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
38 return 0;
39}
40
41/* Remove encapsulation header.
42 *
43 * The IP header will be moved over the top of the encapsulation header.
44 *
45 * On entry, skb->h shall point to where the IP header should be and skb->nh
46 * shall be set to where the IP header currently is. skb->data shall point
47 * to the start of the payload.
48 */
49static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
50{
51 int ihl = skb->data - skb->h.raw;
52
53 if (skb->h.raw != skb->nh.raw)
54 skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl);
55 skb->nh.iph->tot_len = htons(skb->len + ihl);
56 skb->h.raw = skb->data;
57 return 0;
58}
59
60static struct xfrm_mode xfrm4_transport_mode = {
61 .input = xfrm4_transport_input,
62 .output = xfrm4_transport_output,
63 .owner = THIS_MODULE,
64 .encap = XFRM_MODE_TRANSPORT,
65};
66
67static int __init xfrm4_transport_init(void)
68{
69 return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
70}
71
72static void __exit xfrm4_transport_exit(void)
73{
74 int err;
75
76 err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
77 BUG_ON(err);
78}
79
80module_init(xfrm4_transport_init);
81module_exit(xfrm4_transport_exit);
82MODULE_LICENSE("GPL");
83MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 000000000000..f8d880beb12f
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
1/*
2 * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
3 *
4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
5 */
6
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/stringify.h>
12#include <net/dst.h>
13#include <net/inet_ecn.h>
14#include <net/ip.h>
15#include <net/xfrm.h>
16
17static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
18{
19 struct iphdr *outer_iph = skb->nh.iph;
20 struct iphdr *inner_iph = skb->h.ipiph;
21
22 if (INET_ECN_is_ce(outer_iph->tos))
23 IP_ECN_set_ce(inner_iph);
24}
25
26/* Add encapsulation header.
27 *
28 * The top IP header will be constructed per RFC 2401. The following fields
29 * in it shall be filled in by x->type->output:
30 * tot_len
31 * check
32 *
33 * On exit, skb->h will be set to the start of the payload to be processed
34 * by x->type->output and skb->nh will be set to the top IP header.
35 */
36static int xfrm4_tunnel_output(struct sk_buff *skb)
37{
38 struct dst_entry *dst = skb->dst;
39 struct xfrm_state *x = dst->xfrm;
40 struct iphdr *iph, *top_iph;
41 int flags;
42
43 iph = skb->nh.iph;
44 skb->h.ipiph = iph;
45
46 skb->nh.raw = skb_push(skb, x->props.header_len);
47 top_iph = skb->nh.iph;
48
49 top_iph->ihl = 5;
50 top_iph->version = 4;
51
52 /* DS disclosed */
53 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
54
55 flags = x->props.flags;
56 if (flags & XFRM_STATE_NOECN)
57 IP_ECN_clear(top_iph);
58
59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
60 0 : (iph->frag_off & htons(IP_DF));
61 if (!top_iph->frag_off)
62 __ip_select_ident(top_iph, dst->child, 0);
63
64 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
65
66 top_iph->saddr = x->props.saddr.a4;
67 top_iph->daddr = x->id.daddr.a4;
68 top_iph->protocol = IPPROTO_IPIP;
69
70 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
71 return 0;
72}
73
74static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
75{
76 struct iphdr *iph = skb->nh.iph;
77 int err = -EINVAL;
78
79 if (iph->protocol != IPPROTO_IPIP)
80 goto out;
81 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
82 goto out;
83
84 if (skb_cloned(skb) &&
85 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
86 goto out;
87
88 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
89 ipv4_copy_dscp(iph, skb->h.ipiph);
90 if (!(x->props.flags & XFRM_STATE_NOECN))
91 ipip_ecn_decapsulate(skb);
92 skb->mac.raw = memmove(skb->data - skb->mac_len,
93 skb->mac.raw, skb->mac_len);
94 skb->nh.raw = skb->data;
95 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
96 err = 0;
97
98out:
99 return err;
100}
101
102static struct xfrm_mode xfrm4_tunnel_mode = {
103 .input = xfrm4_tunnel_input,
104 .output = xfrm4_tunnel_output,
105 .owner = THIS_MODULE,
106 .encap = XFRM_MODE_TUNNEL,
107};
108
109static int __init xfrm4_tunnel_init(void)
110{
111 return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
112}
113
114static void __exit xfrm4_tunnel_exit(void)
115{
116 int err;
117
118 err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
119 BUG_ON(err);
120}
121
122module_init(xfrm4_tunnel_init);
123module_exit(xfrm4_tunnel_exit);
124MODULE_LICENSE("GPL");
125MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4ef8efaf6a67..ac9d91d4bb05 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,67 +12,10 @@
12#include <linux/skbuff.h> 12#include <linux/skbuff.h>
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/netfilter_ipv4.h> 14#include <linux/netfilter_ipv4.h>
15#include <net/inet_ecn.h>
16#include <net/ip.h> 15#include <net/ip.h>
17#include <net/xfrm.h> 16#include <net/xfrm.h>
18#include <net/icmp.h> 17#include <net/icmp.h>
19 18
20/* Add encapsulation header.
21 *
22 * In transport mode, the IP header will be moved forward to make space
23 * for the encapsulation header.
24 *
25 * In tunnel mode, the top IP header will be constructed per RFC 2401.
26 * The following fields in it shall be filled in by x->type->output:
27 * tot_len
28 * check
29 *
30 * On exit, skb->h will be set to the start of the payload to be processed
31 * by x->type->output and skb->nh will be set to the top IP header.
32 */
33static void xfrm4_encap(struct sk_buff *skb)
34{
35 struct dst_entry *dst = skb->dst;
36 struct xfrm_state *x = dst->xfrm;
37 struct iphdr *iph, *top_iph;
38 int flags;
39
40 iph = skb->nh.iph;
41 skb->h.ipiph = iph;
42
43 skb->nh.raw = skb_push(skb, x->props.header_len);
44 top_iph = skb->nh.iph;
45
46 if (!x->props.mode) {
47 skb->h.raw += iph->ihl*4;
48 memmove(top_iph, iph, iph->ihl*4);
49 return;
50 }
51
52 top_iph->ihl = 5;
53 top_iph->version = 4;
54
55 /* DS disclosed */
56 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
57
58 flags = x->props.flags;
59 if (flags & XFRM_STATE_NOECN)
60 IP_ECN_clear(top_iph);
61
62 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
63 0 : (iph->frag_off & htons(IP_DF));
64 if (!top_iph->frag_off)
65 __ip_select_ident(top_iph, dst->child, 0);
66
67 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
68
69 top_iph->saddr = x->props.saddr.a4;
70 top_iph->daddr = x->id.daddr.a4;
71 top_iph->protocol = IPPROTO_IPIP;
72
73 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
74}
75
76static int xfrm4_tunnel_check_size(struct sk_buff *skb) 19static int xfrm4_tunnel_check_size(struct sk_buff *skb)
77{ 20{
78 int mtu, ret = 0; 21 int mtu, ret = 0;
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb)
121 if (err) 64 if (err)
122 goto error; 65 goto error;
123 66
124 xfrm4_encap(skb); 67 err = x->mode->output(skb);
68 if (err)
69 goto error;
125 70
126 err = x->type->output(x, skb); 71 err = x->type->output(x, skb);
127 if (err) 72 if (err)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8604c747bca5..c0465284dfac 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
17static struct dst_ops xfrm4_dst_ops; 17static struct dst_ops xfrm4_dst_ops;
18static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 18static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
19 19
20static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
21
22static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) 20static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
23{ 21{
24 return __ip_route_output_key((struct rtable**)dst, fl); 22 return __ip_route_output_key((struct rtable**)dst, fl);
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
237 235
238static inline int xfrm4_garbage_collect(void) 236static inline int xfrm4_garbage_collect(void)
239{ 237{
240 read_lock(&xfrm4_policy_afinfo.lock);
241 xfrm4_policy_afinfo.garbage_collect(); 238 xfrm4_policy_afinfo.garbage_collect();
242 read_unlock(&xfrm4_policy_afinfo.lock);
243 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); 239 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
244} 240}
245 241
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = {
299 295
300static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 296static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
301 .family = AF_INET, 297 .family = AF_INET,
302 .lock = RW_LOCK_UNLOCKED,
303 .type_map = &xfrm4_type_map,
304 .dst_ops = &xfrm4_dst_ops, 298 .dst_ops = &xfrm4_dst_ops,
305 .dst_lookup = xfrm4_dst_lookup, 299 .dst_lookup = xfrm4_dst_lookup,
306 .find_bundle = __xfrm4_find_bundle, 300 .find_bundle = __xfrm4_find_bundle,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index dbabf81a9b7b..81e1751c966e 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
131 131
132static struct xfrm_state_afinfo xfrm4_state_afinfo = { 132static struct xfrm_state_afinfo xfrm4_state_afinfo = {
133 .family = AF_INET, 133 .family = AF_INET,
134 .lock = RW_LOCK_UNLOCKED,
135 .init_flags = xfrm4_init_flags, 134 .init_flags = xfrm4_init_flags,
136 .init_tempsel = __xfrm4_init_tempsel, 135 .init_tempsel = __xfrm4_init_tempsel,
137 .state_lookup = __xfrm4_state_lookup, 136 .state_lookup = __xfrm4_state_lookup,