diff options
author | Trond Myklebust <Trond.Myklebust@netapp.com> | 2006-06-20 08:59:45 -0400 |
---|---|---|
committer | Trond Myklebust <Trond.Myklebust@netapp.com> | 2006-06-20 08:59:45 -0400 |
commit | d59bf96cdde5b874a57bfd1425faa45da915d0b7 (patch) | |
tree | 351a40b72514d620e5bebea2de38c26f23277ffc /net/ipv4 | |
parent | 28df955a2ad484d602314b30183ea8496a9aa34a (diff) | |
parent | 25f42b6af09e34c3f92107b36b5aa6edc2fdba2f (diff) |
Merge branch 'master' of /home/trondmy/kernel/linux-2.6/
Diffstat (limited to 'net/ipv4')
55 files changed, 3380 insertions, 1337 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e40f75322377..da33393be45f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -414,6 +414,24 @@ config INET_TUNNEL | |||
414 | tristate | 414 | tristate |
415 | default n | 415 | default n |
416 | 416 | ||
417 | config INET_XFRM_MODE_TRANSPORT | ||
418 | tristate "IP: IPsec transport mode" | ||
419 | default y | ||
420 | select XFRM | ||
421 | ---help--- | ||
422 | Support for IPsec transport mode. | ||
423 | |||
424 | If unsure, say Y. | ||
425 | |||
426 | config INET_XFRM_MODE_TUNNEL | ||
427 | tristate "IP: IPsec tunnel mode" | ||
428 | default y | ||
429 | select XFRM | ||
430 | ---help--- | ||
431 | Support for IPsec tunnel mode. | ||
432 | |||
433 | If unsure, say Y. | ||
434 | |||
417 | config INET_DIAG | 435 | config INET_DIAG |
418 | tristate "INET: socket monitoring interface" | 436 | tristate "INET: socket monitoring interface" |
419 | default y | 437 | default y |
@@ -532,6 +550,38 @@ config TCP_CONG_SCALABLE | |||
532 | properties, though is known to have fairness issues. | 550 | properties, though is known to have fairness issues. |
533 | See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ | 551 | See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ |
534 | 552 | ||
553 | config TCP_CONG_LP | ||
554 | tristate "TCP Low Priority" | ||
555 | depends on EXPERIMENTAL | ||
556 | default n | ||
557 | ---help--- | ||
558 | TCP Low Priority (TCP-LP), a distributed algorithm whose goal is | ||
559 | to utilize only the excess network bandwidth as compared to the | ||
560 | ``fair share`` of bandwidth as targeted by TCP. | ||
561 | See http://www-ece.rice.edu/networks/TCP-LP/ | ||
562 | |||
563 | config TCP_CONG_VENO | ||
564 | tristate "TCP Veno" | ||
565 | depends on EXPERIMENTAL | ||
566 | default n | ||
567 | ---help--- | ||
568 | TCP Veno is a sender-side only enhancement of TCP to obtain better | ||
569 | throughput over wireless networks. TCP Veno makes use of state | ||
570 | distinguishing to circumvent the difficult judgment of the packet loss | ||
571 | type. TCP Veno cuts down less congestion window in response to random | ||
572 | loss packets. | ||
573 | See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf | ||
574 | |||
575 | config TCP_CONG_COMPOUND | ||
576 | tristate "TCP Compound" | ||
577 | depends on EXPERIMENTAL | ||
578 | default n | ||
579 | ---help--- | ||
580 | TCP Compound is a sender-side only change to TCP that uses | ||
581 | a mixed Reno/Vegas approach to calculate the cwnd. | ||
582 | For further details look here: | ||
583 | ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf | ||
584 | |||
535 | endmenu | 585 | endmenu |
536 | 586 | ||
537 | config TCP_CONG_BIC | 587 | config TCP_CONG_BIC |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9ef50a0b9d2c..38b8039bdd55 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o | |||
24 | obj-$(CONFIG_INET_IPCOMP) += ipcomp.o | 24 | obj-$(CONFIG_INET_IPCOMP) += ipcomp.o |
25 | obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o | 25 | obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o |
26 | obj-$(CONFIG_INET_TUNNEL) += tunnel4.o | 26 | obj-$(CONFIG_INET_TUNNEL) += tunnel4.o |
27 | obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o | ||
28 | obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o | ||
27 | obj-$(CONFIG_IP_PNP) += ipconfig.o | 29 | obj-$(CONFIG_IP_PNP) += ipconfig.o |
28 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o | 30 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o |
29 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o | 31 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o |
@@ -34,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ | |||
34 | obj-$(CONFIG_INET_DIAG) += inet_diag.o | 36 | obj-$(CONFIG_INET_DIAG) += inet_diag.o |
35 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o | 37 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o |
36 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o | 38 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o |
39 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o | ||
37 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 40 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
38 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o | 41 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o |
39 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | 42 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o |
@@ -41,7 +44,10 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | |||
41 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | 44 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o |
42 | obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o | 45 | obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o |
43 | obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o | 46 | obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o |
47 | obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o | ||
44 | obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o | 48 | obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o |
49 | obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o | ||
50 | obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o | ||
45 | 51 | ||
46 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 52 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
47 | xfrm4_output.o | 53 | xfrm4_output.o |
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e2e4771fa4c6..c7782230080d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c | |||
@@ -119,6 +119,7 @@ error: | |||
119 | static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | 119 | static int ah_input(struct xfrm_state *x, struct sk_buff *skb) |
120 | { | 120 | { |
121 | int ah_hlen; | 121 | int ah_hlen; |
122 | int ihl; | ||
122 | struct iphdr *iph; | 123 | struct iphdr *iph; |
123 | struct ip_auth_hdr *ah; | 124 | struct ip_auth_hdr *ah; |
124 | struct ah_data *ahp; | 125 | struct ah_data *ahp; |
@@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
149 | ah = (struct ip_auth_hdr*)skb->data; | 150 | ah = (struct ip_auth_hdr*)skb->data; |
150 | iph = skb->nh.iph; | 151 | iph = skb->nh.iph; |
151 | 152 | ||
152 | memcpy(work_buf, iph, iph->ihl*4); | 153 | ihl = skb->data - skb->nh.raw; |
154 | memcpy(work_buf, iph, ihl); | ||
153 | 155 | ||
154 | iph->ttl = 0; | 156 | iph->ttl = 0; |
155 | iph->tos = 0; | 157 | iph->tos = 0; |
156 | iph->frag_off = 0; | 158 | iph->frag_off = 0; |
157 | iph->check = 0; | 159 | iph->check = 0; |
158 | if (iph->ihl != 5) { | 160 | if (ihl > sizeof(*iph)) { |
159 | u32 dummy; | 161 | u32 dummy; |
160 | if (ip_clear_mutable_options(iph, &dummy)) | 162 | if (ip_clear_mutable_options(iph, &dummy)) |
161 | goto out; | 163 | goto out; |
@@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
164 | u8 auth_data[MAX_AH_AUTH_LEN]; | 166 | u8 auth_data[MAX_AH_AUTH_LEN]; |
165 | 167 | ||
166 | memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); | 168 | memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); |
167 | skb_push(skb, skb->data - skb->nh.raw); | 169 | skb_push(skb, ihl); |
168 | ahp->icv(ahp, skb, ah->auth_data); | 170 | ahp->icv(ahp, skb, ah->auth_data); |
169 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { | 171 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { |
170 | x->stats.integrity_failed++; | 172 | x->stats.integrity_failed++; |
@@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
172 | } | 174 | } |
173 | } | 175 | } |
174 | ((struct iphdr*)work_buf)->protocol = ah->nexthdr; | 176 | ((struct iphdr*)work_buf)->protocol = ah->nexthdr; |
175 | skb->nh.raw = skb_pull(skb, ah_hlen); | 177 | skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); |
176 | memcpy(skb->nh.raw, work_buf, iph->ihl*4); | 178 | __skb_pull(skb, ah_hlen + ihl); |
177 | skb->nh.iph->tot_len = htons(skb->len); | ||
178 | skb_pull(skb, skb->nh.iph->ihl*4); | ||
179 | skb->h.raw = skb->data; | ||
180 | 179 | ||
181 | return 0; | 180 | return 0; |
182 | 181 | ||
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9d1881c07a32..9bbdd4494551 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c | |||
@@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
143 | int alen = esp->auth.icv_trunc_len; | 143 | int alen = esp->auth.icv_trunc_len; |
144 | int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; | 144 | int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; |
145 | int nfrags; | 145 | int nfrags; |
146 | int encap_len = 0; | 146 | int ihl; |
147 | u8 nexthdr[2]; | 147 | u8 nexthdr[2]; |
148 | struct scatterlist *sg; | 148 | struct scatterlist *sg; |
149 | u8 workbuf[60]; | ||
150 | int padlen; | 149 | int padlen; |
151 | 150 | ||
152 | if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) | 151 | if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) |
@@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
177 | skb->ip_summed = CHECKSUM_NONE; | 176 | skb->ip_summed = CHECKSUM_NONE; |
178 | 177 | ||
179 | esph = (struct ip_esp_hdr*)skb->data; | 178 | esph = (struct ip_esp_hdr*)skb->data; |
180 | iph = skb->nh.iph; | ||
181 | 179 | ||
182 | /* Get ivec. This can be wrong, check against another impls. */ | 180 | /* Get ivec. This can be wrong, check against another impls. */ |
183 | if (esp->conf.ivlen) | 181 | if (esp->conf.ivlen) |
@@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
204 | 202 | ||
205 | /* ... check padding bits here. Silly. :-) */ | 203 | /* ... check padding bits here. Silly. :-) */ |
206 | 204 | ||
205 | iph = skb->nh.iph; | ||
206 | ihl = iph->ihl * 4; | ||
207 | |||
207 | if (x->encap) { | 208 | if (x->encap) { |
208 | struct xfrm_encap_tmpl *encap = x->encap; | 209 | struct xfrm_encap_tmpl *encap = x->encap; |
209 | struct udphdr *uh; | 210 | struct udphdr *uh = (void *)(skb->nh.raw + ihl); |
210 | |||
211 | uh = (struct udphdr *)(iph + 1); | ||
212 | encap_len = (void*)esph - (void*)uh; | ||
213 | 211 | ||
214 | /* | 212 | /* |
215 | * 1) if the NAT-T peer's IP or port changed then | 213 | * 1) if the NAT-T peer's IP or port changed then |
@@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
246 | 244 | ||
247 | iph->protocol = nexthdr[1]; | 245 | iph->protocol = nexthdr[1]; |
248 | pskb_trim(skb, skb->len - alen - padlen - 2); | 246 | pskb_trim(skb, skb->len - alen - padlen - 2); |
249 | memcpy(workbuf, skb->nh.raw, iph->ihl*4); | 247 | skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; |
250 | skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); | ||
251 | skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; | ||
252 | memcpy(skb->nh.raw, workbuf, iph->ihl*4); | ||
253 | skb->nh.iph->tot_len = htons(skb->len); | ||
254 | 248 | ||
255 | return 0; | 249 | return 0; |
256 | 250 | ||
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cdde96390960..31387abf53a2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c | |||
@@ -666,3 +666,4 @@ void __init ip_fib_init(void) | |||
666 | } | 666 | } |
667 | 667 | ||
668 | EXPORT_SYMBOL(inet_addr_type); | 668 | EXPORT_SYMBOL(inet_addr_type); |
669 | EXPORT_SYMBOL(ip_dev_find); | ||
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2a0455911ee0..017900172f7d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -730,7 +730,6 @@ out_err: | |||
730 | static void icmp_redirect(struct sk_buff *skb) | 730 | static void icmp_redirect(struct sk_buff *skb) |
731 | { | 731 | { |
732 | struct iphdr *iph; | 732 | struct iphdr *iph; |
733 | unsigned long ip; | ||
734 | 733 | ||
735 | if (skb->len < sizeof(struct iphdr)) | 734 | if (skb->len < sizeof(struct iphdr)) |
736 | goto out_err; | 735 | goto out_err; |
@@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb) | |||
742 | goto out; | 741 | goto out; |
743 | 742 | ||
744 | iph = (struct iphdr *)skb->data; | 743 | iph = (struct iphdr *)skb->data; |
745 | ip = iph->daddr; | ||
746 | 744 | ||
747 | switch (skb->h.icmph->code & 7) { | 745 | switch (skb->h.icmph->code & 7) { |
748 | case ICMP_REDIR_NET: | 746 | case ICMP_REDIR_NET: |
@@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb) | |||
752 | */ | 750 | */ |
753 | case ICMP_REDIR_HOST: | 751 | case ICMP_REDIR_HOST: |
754 | case ICMP_REDIR_HOSTTOS: | 752 | case ICMP_REDIR_HOSTTOS: |
755 | ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, | 753 | ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, |
754 | skb->h.icmph->un.gateway, | ||
756 | iph->saddr, skb->dev); | 755 | iph->saddr, skb->dev); |
757 | break; | 756 | break; |
758 | } | 757 | } |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d512239a1473..ab680c851aa2 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) | |||
2361 | } | 2361 | } |
2362 | 2362 | ||
2363 | seq_printf(seq, | 2363 | seq_printf(seq, |
2364 | "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", | 2364 | "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", |
2365 | im->multiaddr, im->users, | 2365 | im->multiaddr, im->users, |
2366 | im->tm_running, im->tm_running ? | 2366 | im->tm_running, im->tm_running ? |
2367 | jiffies_to_clock_t(im->timer.expires-jiffies) : 0, | 2367 | jiffies_to_clock_t(im->timer.expires-jiffies) : 0, |
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0923add122b4..9f0bb529ab70 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c | |||
@@ -116,6 +116,7 @@ sr_failed: | |||
116 | 116 | ||
117 | too_many_hops: | 117 | too_many_hops: |
118 | /* Tell the sender its packet died... */ | 118 | /* Tell the sender its packet died... */ |
119 | IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); | ||
119 | icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); | 120 | icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); |
120 | drop: | 121 | drop: |
121 | kfree_skb(skb); | 122 | kfree_skb(skb); |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cff9c3a72daf..8538aac3d148 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) | |||
410 | nf_bridge_get(to->nf_bridge); | 410 | nf_bridge_get(to->nf_bridge); |
411 | #endif | 411 | #endif |
412 | #endif | 412 | #endif |
413 | skb_copy_secmark(to, from); | ||
413 | } | 414 | } |
414 | 415 | ||
415 | /* | 416 | /* |
@@ -839,7 +840,7 @@ int ip_append_data(struct sock *sk, | |||
839 | */ | 840 | */ |
840 | if (transhdrlen && | 841 | if (transhdrlen && |
841 | length + fragheaderlen <= mtu && | 842 | length + fragheaderlen <= mtu && |
842 | rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && | 843 | rt->u.dst.dev->features & NETIF_F_ALL_CSUM && |
843 | !exthdrlen) | 844 | !exthdrlen) |
844 | csummode = CHECKSUM_HW; | 845 | csummode = CHECKSUM_HW; |
845 | 846 | ||
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 95278b22b669..3ed8b57a1002 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list); | |||
45 | static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) | 45 | static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) |
46 | { | 46 | { |
47 | int err, plen, dlen; | 47 | int err, plen, dlen; |
48 | struct iphdr *iph; | ||
49 | struct ipcomp_data *ipcd = x->data; | 48 | struct ipcomp_data *ipcd = x->data; |
50 | u8 *start, *scratch; | 49 | u8 *start, *scratch; |
51 | struct crypto_tfm *tfm; | 50 | struct crypto_tfm *tfm; |
@@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) | |||
74 | 73 | ||
75 | skb_put(skb, dlen - plen); | 74 | skb_put(skb, dlen - plen); |
76 | memcpy(skb->data, scratch, dlen); | 75 | memcpy(skb->data, scratch, dlen); |
77 | iph = skb->nh.iph; | ||
78 | iph->tot_len = htons(dlen + iph->ihl * 4); | ||
79 | out: | 76 | out: |
80 | put_cpu(); | 77 | put_cpu(); |
81 | return err; | 78 | return err; |
@@ -83,34 +80,21 @@ out: | |||
83 | 80 | ||
84 | static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) | 81 | static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) |
85 | { | 82 | { |
86 | u8 nexthdr; | 83 | int err = -ENOMEM; |
87 | int err = 0; | ||
88 | struct iphdr *iph; | 84 | struct iphdr *iph; |
89 | union { | 85 | struct ip_comp_hdr *ipch; |
90 | struct iphdr iph; | ||
91 | char buf[60]; | ||
92 | } tmp_iph; | ||
93 | |||
94 | 86 | ||
95 | if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && | 87 | if (skb_linearize_cow(skb)) |
96 | skb_linearize(skb, GFP_ATOMIC) != 0) { | ||
97 | err = -ENOMEM; | ||
98 | goto out; | 88 | goto out; |
99 | } | ||
100 | 89 | ||
101 | skb->ip_summed = CHECKSUM_NONE; | 90 | skb->ip_summed = CHECKSUM_NONE; |
102 | 91 | ||
103 | /* Remove ipcomp header and decompress original payload */ | 92 | /* Remove ipcomp header and decompress original payload */ |
104 | iph = skb->nh.iph; | 93 | iph = skb->nh.iph; |
105 | memcpy(&tmp_iph, iph, iph->ihl * 4); | 94 | ipch = (void *)skb->data; |
106 | nexthdr = *(u8 *)skb->data; | 95 | iph->protocol = ipch->nexthdr; |
107 | skb_pull(skb, sizeof(struct ip_comp_hdr)); | 96 | skb->h.raw = skb->nh.raw + sizeof(*ipch); |
108 | skb->nh.raw += sizeof(struct ip_comp_hdr); | 97 | __skb_pull(skb, sizeof(*ipch)); |
109 | memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); | ||
110 | iph = skb->nh.iph; | ||
111 | iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); | ||
112 | iph->protocol = nexthdr; | ||
113 | skb->h.raw = skb->data; | ||
114 | err = ipcomp_decompress(x, skb); | 98 | err = ipcomp_decompress(x, skb); |
115 | 99 | ||
116 | out: | 100 | out: |
@@ -171,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) | |||
171 | goto out_ok; | 155 | goto out_ok; |
172 | } | 156 | } |
173 | 157 | ||
174 | if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && | 158 | if (skb_linearize_cow(skb)) |
175 | skb_linearize(skb, GFP_ATOMIC) != 0) { | ||
176 | goto out_ok; | 159 | goto out_ok; |
177 | } | ||
178 | 160 | ||
179 | err = ipcomp_compress(x, skb); | 161 | err = ipcomp_compress(x, skb); |
180 | iph = skb->nh.iph; | 162 | iph = skb->nh.iph; |
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d4072533da21..e1d7f5fbc526 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK | |||
55 | of packets, but this mark value is kept in the conntrack session | 55 | of packets, but this mark value is kept in the conntrack session |
56 | instead of the individual packets. | 56 | instead of the individual packets. |
57 | 57 | ||
58 | config IP_NF_CONNTRACK_SECMARK | ||
59 | bool 'Connection tracking security mark support' | ||
60 | depends on IP_NF_CONNTRACK && NETWORK_SECMARK | ||
61 | help | ||
62 | This option enables security markings to be applied to | ||
63 | connections. Typically they are copied to connections from | ||
64 | packets using the CONNSECMARK target and copied back from | ||
65 | connections to packets with the same target, with the packets | ||
66 | being originally labeled via SECMARK. | ||
67 | |||
68 | If unsure, say 'N'. | ||
69 | |||
58 | config IP_NF_CONNTRACK_EVENTS | 70 | config IP_NF_CONNTRACK_EVENTS |
59 | bool "Connection tracking events (EXPERIMENTAL)" | 71 | bool "Connection tracking events (EXPERIMENTAL)" |
60 | depends on EXPERIMENTAL && IP_NF_CONNTRACK | 72 | depends on EXPERIMENTAL && IP_NF_CONNTRACK |
@@ -142,6 +154,8 @@ config IP_NF_TFTP | |||
142 | config IP_NF_AMANDA | 154 | config IP_NF_AMANDA |
143 | tristate "Amanda backup protocol support" | 155 | tristate "Amanda backup protocol support" |
144 | depends on IP_NF_CONNTRACK | 156 | depends on IP_NF_CONNTRACK |
157 | select TEXTSEARCH | ||
158 | select TEXTSEARCH_KMP | ||
145 | help | 159 | help |
146 | If you are running the Amanda backup package <http://www.amanda.org/> | 160 | If you are running the Amanda backup package <http://www.amanda.org/> |
147 | on this machine or machines that will be MASQUERADED through this | 161 | on this machine or machines that will be MASQUERADED through this |
@@ -181,14 +195,26 @@ config IP_NF_H323 | |||
181 | With this module you can support H.323 on a connection tracking/NAT | 195 | With this module you can support H.323 on a connection tracking/NAT |
182 | firewall. | 196 | firewall. |
183 | 197 | ||
184 | This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP | 198 | This module supports RAS, Fast Start, H.245 Tunnelling, Call |
185 | and T.120 based data and applications including audio, video, FAX, | 199 | Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, |
186 | chat, whiteboard, file transfer, etc. For more information, please | 200 | whiteboard, file transfer, etc. For more information, please |
187 | see http://nath323.sourceforge.net/. | 201 | visit http://nath323.sourceforge.net/. |
188 | 202 | ||
189 | If you want to compile it as a module, say 'M' here and read | 203 | If you want to compile it as a module, say 'M' here and read |
190 | Documentation/modules.txt. If unsure, say 'N'. | 204 | Documentation/modules.txt. If unsure, say 'N'. |
191 | 205 | ||
206 | config IP_NF_SIP | ||
207 | tristate "SIP protocol support (EXPERIMENTAL)" | ||
208 | depends on IP_NF_CONNTRACK && EXPERIMENTAL | ||
209 | help | ||
210 | SIP is an application-layer control protocol that can establish, | ||
211 | modify, and terminate multimedia sessions (conferences) such as | ||
212 | Internet telephony calls. With the ip_conntrack_sip and | ||
213 | the ip_nat_sip modules you can support the protocol on a connection | ||
214 | tracking/NATing firewall. | ||
215 | |||
216 | To compile it as a module, choose M here. If unsure, say Y. | ||
217 | |||
192 | config IP_NF_QUEUE | 218 | config IP_NF_QUEUE |
193 | tristate "IP Userspace queueing via NETLINK (OBSOLETE)" | 219 | tristate "IP Userspace queueing via NETLINK (OBSOLETE)" |
194 | help | 220 | help |
@@ -501,6 +527,12 @@ config IP_NF_NAT_H323 | |||
501 | default IP_NF_NAT if IP_NF_H323=y | 527 | default IP_NF_NAT if IP_NF_H323=y |
502 | default m if IP_NF_H323=m | 528 | default m if IP_NF_H323=m |
503 | 529 | ||
530 | config IP_NF_NAT_SIP | ||
531 | tristate | ||
532 | depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n | ||
533 | default IP_NF_NAT if IP_NF_SIP=y | ||
534 | default m if IP_NF_SIP=m | ||
535 | |||
504 | # mangle + specific targets | 536 | # mangle + specific targets |
505 | config IP_NF_MANGLE | 537 | config IP_NF_MANGLE |
506 | tristate "Packet mangling" | 538 | tristate "Packet mangling" |
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 461cb1eb5de7..3ded4a3af59c 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o | |||
31 | obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o | 31 | obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o |
32 | obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o | 32 | obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o |
33 | obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o | 33 | obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o |
34 | obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o | ||
34 | obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o | 35 | obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o |
35 | 36 | ||
36 | # NAT helpers | 37 | # NAT helpers |
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o | |||
40 | obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o | 41 | obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o |
41 | obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o | 42 | obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o |
42 | obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o | 43 | obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o |
44 | obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o | ||
43 | 45 | ||
44 | # generic IP tables | 46 | # generic IP tables |
45 | obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o | 47 | obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o |
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index a604b1ccfdaa..0a7bd7f04061 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c | |||
@@ -17,33 +17,29 @@ | |||
17 | * this value. | 17 | * this value. |
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | |||
21 | #include <linux/in.h> | ||
22 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
23 | #include <linux/module.h> | 21 | #include <linux/module.h> |
24 | #include <linux/netfilter.h> | ||
25 | #include <linux/ip.h> | ||
26 | #include <linux/moduleparam.h> | 22 | #include <linux/moduleparam.h> |
23 | #include <linux/textsearch.h> | ||
24 | #include <linux/skbuff.h> | ||
25 | #include <linux/in.h> | ||
26 | #include <linux/ip.h> | ||
27 | #include <linux/udp.h> | 27 | #include <linux/udp.h> |
28 | #include <net/checksum.h> | ||
29 | #include <net/udp.h> | ||
30 | 28 | ||
29 | #include <linux/netfilter.h> | ||
31 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | 30 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> |
32 | #include <linux/netfilter_ipv4/ip_conntrack_amanda.h> | 31 | #include <linux/netfilter_ipv4/ip_conntrack_amanda.h> |
33 | 32 | ||
34 | static unsigned int master_timeout = 300; | 33 | static unsigned int master_timeout = 300; |
34 | static char *ts_algo = "kmp"; | ||
35 | 35 | ||
36 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); | 36 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); |
37 | MODULE_DESCRIPTION("Amanda connection tracking module"); | 37 | MODULE_DESCRIPTION("Amanda connection tracking module"); |
38 | MODULE_LICENSE("GPL"); | 38 | MODULE_LICENSE("GPL"); |
39 | module_param(master_timeout, uint, 0600); | 39 | module_param(master_timeout, uint, 0600); |
40 | MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); | 40 | MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); |
41 | 41 | module_param(ts_algo, charp, 0400); | |
42 | static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; | 42 | MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); |
43 | |||
44 | /* This is slow, but it's simple. --RR */ | ||
45 | static char *amanda_buffer; | ||
46 | static DEFINE_SPINLOCK(amanda_buffer_lock); | ||
47 | 43 | ||
48 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, | 44 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, |
49 | enum ip_conntrack_info ctinfo, | 45 | enum ip_conntrack_info ctinfo, |
@@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, | |||
52 | struct ip_conntrack_expect *exp); | 48 | struct ip_conntrack_expect *exp); |
53 | EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); | 49 | EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); |
54 | 50 | ||
51 | enum amanda_strings { | ||
52 | SEARCH_CONNECT, | ||
53 | SEARCH_NEWLINE, | ||
54 | SEARCH_DATA, | ||
55 | SEARCH_MESG, | ||
56 | SEARCH_INDEX, | ||
57 | }; | ||
58 | |||
59 | static struct { | ||
60 | char *string; | ||
61 | size_t len; | ||
62 | struct ts_config *ts; | ||
63 | } search[] = { | ||
64 | [SEARCH_CONNECT] = { | ||
65 | .string = "CONNECT ", | ||
66 | .len = 8, | ||
67 | }, | ||
68 | [SEARCH_NEWLINE] = { | ||
69 | .string = "\n", | ||
70 | .len = 1, | ||
71 | }, | ||
72 | [SEARCH_DATA] = { | ||
73 | .string = "DATA ", | ||
74 | .len = 5, | ||
75 | }, | ||
76 | [SEARCH_MESG] = { | ||
77 | .string = "MESG ", | ||
78 | .len = 5, | ||
79 | }, | ||
80 | [SEARCH_INDEX] = { | ||
81 | .string = "INDEX ", | ||
82 | .len = 6, | ||
83 | }, | ||
84 | }; | ||
85 | |||
55 | static int help(struct sk_buff **pskb, | 86 | static int help(struct sk_buff **pskb, |
56 | struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) | 87 | struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) |
57 | { | 88 | { |
89 | struct ts_state ts; | ||
58 | struct ip_conntrack_expect *exp; | 90 | struct ip_conntrack_expect *exp; |
59 | char *data, *data_limit, *tmp; | 91 | unsigned int dataoff, start, stop, off, i; |
60 | unsigned int dataoff, i; | 92 | char pbuf[sizeof("65535")], *tmp; |
61 | u_int16_t port, len; | 93 | u_int16_t port, len; |
62 | int ret = NF_ACCEPT; | 94 | int ret = NF_ACCEPT; |
63 | 95 | ||
@@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb, | |||
77 | return NF_ACCEPT; | 109 | return NF_ACCEPT; |
78 | } | 110 | } |
79 | 111 | ||
80 | spin_lock_bh(&amanda_buffer_lock); | 112 | memset(&ts, 0, sizeof(ts)); |
81 | skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); | 113 | start = skb_find_text(*pskb, dataoff, (*pskb)->len, |
82 | data = amanda_buffer; | 114 | search[SEARCH_CONNECT].ts, &ts); |
83 | data_limit = amanda_buffer + (*pskb)->len - dataoff; | 115 | if (start == UINT_MAX) |
84 | *data_limit = '\0'; | ||
85 | |||
86 | /* Search for the CONNECT string */ | ||
87 | data = strstr(data, "CONNECT "); | ||
88 | if (!data) | ||
89 | goto out; | 116 | goto out; |
90 | data += strlen("CONNECT "); | 117 | start += dataoff + search[SEARCH_CONNECT].len; |
91 | 118 | ||
92 | /* Only search first line. */ | 119 | memset(&ts, 0, sizeof(ts)); |
93 | if ((tmp = strchr(data, '\n'))) | 120 | stop = skb_find_text(*pskb, start, (*pskb)->len, |
94 | *tmp = '\0'; | 121 | search[SEARCH_NEWLINE].ts, &ts); |
122 | if (stop == UINT_MAX) | ||
123 | goto out; | ||
124 | stop += start; | ||
95 | 125 | ||
96 | for (i = 0; i < ARRAY_SIZE(conns); i++) { | 126 | for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { |
97 | char *match = strstr(data, conns[i]); | 127 | memset(&ts, 0, sizeof(ts)); |
98 | if (!match) | 128 | off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); |
129 | if (off == UINT_MAX) | ||
99 | continue; | 130 | continue; |
100 | tmp = data = match + strlen(conns[i]); | 131 | off += start + search[i].len; |
101 | port = simple_strtoul(data, &data, 10); | 132 | |
102 | len = data - tmp; | 133 | len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); |
134 | if (skb_copy_bits(*pskb, off, pbuf, len)) | ||
135 | break; | ||
136 | pbuf[len] = '\0'; | ||
137 | |||
138 | port = simple_strtoul(pbuf, &tmp, 10); | ||
139 | len = tmp - pbuf; | ||
103 | if (port == 0 || len > 5) | 140 | if (port == 0 || len > 5) |
104 | break; | 141 | break; |
105 | 142 | ||
@@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb, | |||
125 | exp->mask.dst.u.tcp.port = 0xFFFF; | 162 | exp->mask.dst.u.tcp.port = 0xFFFF; |
126 | 163 | ||
127 | if (ip_nat_amanda_hook) | 164 | if (ip_nat_amanda_hook) |
128 | ret = ip_nat_amanda_hook(pskb, ctinfo, | 165 | ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff, |
129 | tmp - amanda_buffer, | ||
130 | len, exp); | 166 | len, exp); |
131 | else if (ip_conntrack_expect_related(exp) != 0) | 167 | else if (ip_conntrack_expect_related(exp) != 0) |
132 | ret = NF_DROP; | 168 | ret = NF_DROP; |
@@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb, | |||
134 | } | 170 | } |
135 | 171 | ||
136 | out: | 172 | out: |
137 | spin_unlock_bh(&amanda_buffer_lock); | ||
138 | return ret; | 173 | return ret; |
139 | } | 174 | } |
140 | 175 | ||
141 | static struct ip_conntrack_helper amanda_helper = { | 176 | static struct ip_conntrack_helper amanda_helper = { |
142 | .max_expected = ARRAY_SIZE(conns), | 177 | .max_expected = 3, |
143 | .timeout = 180, | 178 | .timeout = 180, |
144 | .me = THIS_MODULE, | 179 | .me = THIS_MODULE, |
145 | .help = help, | 180 | .help = help, |
@@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = { | |||
155 | 190 | ||
156 | static void __exit ip_conntrack_amanda_fini(void) | 191 | static void __exit ip_conntrack_amanda_fini(void) |
157 | { | 192 | { |
193 | int i; | ||
194 | |||
158 | ip_conntrack_helper_unregister(&amanda_helper); | 195 | ip_conntrack_helper_unregister(&amanda_helper); |
159 | kfree(amanda_buffer); | 196 | for (i = 0; i < ARRAY_SIZE(search); i++) |
197 | textsearch_destroy(search[i].ts); | ||
160 | } | 198 | } |
161 | 199 | ||
162 | static int __init ip_conntrack_amanda_init(void) | 200 | static int __init ip_conntrack_amanda_init(void) |
163 | { | 201 | { |
164 | int ret; | 202 | int ret, i; |
165 | 203 | ||
166 | amanda_buffer = kmalloc(65536, GFP_KERNEL); | 204 | ret = -ENOMEM; |
167 | if (!amanda_buffer) | 205 | for (i = 0; i < ARRAY_SIZE(search); i++) { |
168 | return -ENOMEM; | 206 | search[i].ts = textsearch_prepare(ts_algo, search[i].string, |
169 | 207 | search[i].len, | |
170 | ret = ip_conntrack_helper_register(&amanda_helper); | 208 | GFP_KERNEL, TS_AUTOLOAD); |
171 | if (ret < 0) { | 209 | if (search[i].ts == NULL) |
172 | kfree(amanda_buffer); | 210 | goto err; |
173 | return ret; | ||
174 | } | 211 | } |
212 | ret = ip_conntrack_helper_register(&amanda_helper); | ||
213 | if (ret < 0) | ||
214 | goto err; | ||
175 | return 0; | 215 | return 0; |
176 | 216 | ||
177 | 217 | err: | |
218 | for (; i >= 0; i--) { | ||
219 | if (search[i].ts) | ||
220 | textsearch_destroy(search[i].ts); | ||
221 | } | ||
222 | return ret; | ||
178 | } | 223 | } |
179 | 224 | ||
180 | module_init(ip_conntrack_amanda_init); | 225 | module_init(ip_conntrack_amanda_init); |
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a297da7bbef5..7e4cf9a4d15f 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c | |||
@@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, | |||
724 | /* this is ugly, but there is no other place where to put it */ | 724 | /* this is ugly, but there is no other place where to put it */ |
725 | conntrack->nat.masq_index = exp->master->nat.masq_index; | 725 | conntrack->nat.masq_index = exp->master->nat.masq_index; |
726 | #endif | 726 | #endif |
727 | #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK | ||
728 | conntrack->secmark = exp->master->secmark; | ||
729 | #endif | ||
727 | nf_conntrack_get(&conntrack->master->ct_general); | 730 | nf_conntrack_get(&conntrack->master->ct_general); |
728 | CONNTRACK_STAT_INC(expect_new); | 731 | CONNTRACK_STAT_INC(expect_new); |
729 | } else { | 732 | } else { |
@@ -1130,6 +1133,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct, | |||
1130 | 1133 | ||
1131 | write_lock_bh(&ip_conntrack_lock); | 1134 | write_lock_bh(&ip_conntrack_lock); |
1132 | 1135 | ||
1136 | /* Only update if this is not a fixed timeout */ | ||
1137 | if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { | ||
1138 | write_unlock_bh(&ip_conntrack_lock); | ||
1139 | return; | ||
1140 | } | ||
1141 | |||
1133 | /* If not in hash table, timer will not be active yet */ | 1142 | /* If not in hash table, timer will not be active yet */ |
1134 | if (!is_confirmed(ct)) { | 1143 | if (!is_confirmed(ct)) { |
1135 | ct->timeout.expires = extra_jiffies; | 1144 | ct->timeout.expires = extra_jiffies; |
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 3e542bf28a9d..4dcf526c3944 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c | |||
@@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char); | |||
56 | static int try_epsv_response(const char *, size_t, u_int32_t [], char); | 56 | static int try_epsv_response(const char *, size_t, u_int32_t [], char); |
57 | 57 | ||
58 | static const struct ftp_search { | 58 | static const struct ftp_search { |
59 | enum ip_conntrack_dir dir; | ||
60 | const char *pattern; | 59 | const char *pattern; |
61 | size_t plen; | 60 | size_t plen; |
62 | char skip; | 61 | char skip; |
63 | char term; | 62 | char term; |
64 | enum ip_ct_ftp_type ftptype; | 63 | enum ip_ct_ftp_type ftptype; |
65 | int (*getnum)(const char *, size_t, u_int32_t[], char); | 64 | int (*getnum)(const char *, size_t, u_int32_t[], char); |
66 | } search[] = { | 65 | } search[IP_CT_DIR_MAX][2] = { |
67 | { | 66 | [IP_CT_DIR_ORIGINAL] = { |
68 | IP_CT_DIR_ORIGINAL, | 67 | { |
69 | "PORT", sizeof("PORT") - 1, ' ', '\r', | 68 | .pattern = "PORT", |
70 | IP_CT_FTP_PORT, | 69 | .plen = sizeof("PORT") - 1, |
71 | try_rfc959, | 70 | .skip = ' ', |
71 | .term = '\r', | ||
72 | .ftptype = IP_CT_FTP_PORT, | ||
73 | .getnum = try_rfc959, | ||
74 | }, | ||
75 | { | ||
76 | .pattern = "EPRT", | ||
77 | .plen = sizeof("EPRT") - 1, | ||
78 | .skip = ' ', | ||
79 | .term = '\r', | ||
80 | .ftptype = IP_CT_FTP_EPRT, | ||
81 | .getnum = try_eprt, | ||
82 | }, | ||
72 | }, | 83 | }, |
73 | { | 84 | [IP_CT_DIR_REPLY] = { |
74 | IP_CT_DIR_REPLY, | 85 | { |
75 | "227 ", sizeof("227 ") - 1, '(', ')', | 86 | .pattern = "227 ", |
76 | IP_CT_FTP_PASV, | 87 | .plen = sizeof("227 ") - 1, |
77 | try_rfc959, | 88 | .skip = '(', |
78 | }, | 89 | .term = ')', |
79 | { | 90 | .ftptype = IP_CT_FTP_PASV, |
80 | IP_CT_DIR_ORIGINAL, | 91 | .getnum = try_rfc959, |
81 | "EPRT", sizeof("EPRT") - 1, ' ', '\r', | 92 | }, |
82 | IP_CT_FTP_EPRT, | 93 | { |
83 | try_eprt, | 94 | .pattern = "229 ", |
84 | }, | 95 | .plen = sizeof("229 ") - 1, |
85 | { | 96 | .skip = '(', |
86 | IP_CT_DIR_REPLY, | 97 | .term = ')', |
87 | "229 ", sizeof("229 ") - 1, '(', ')', | 98 | .ftptype = IP_CT_FTP_EPSV, |
88 | IP_CT_FTP_EPSV, | 99 | .getnum = try_epsv_response, |
89 | try_epsv_response, | 100 | }, |
90 | }, | 101 | }, |
91 | }; | 102 | }; |
92 | 103 | ||
@@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb, | |||
346 | array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; | 357 | array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; |
347 | array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; | 358 | array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; |
348 | 359 | ||
349 | for (i = 0; i < ARRAY_SIZE(search); i++) { | 360 | for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { |
350 | if (search[i].dir != dir) continue; | ||
351 | |||
352 | found = find_pattern(fb_ptr, (*pskb)->len - dataoff, | 361 | found = find_pattern(fb_ptr, (*pskb)->len - dataoff, |
353 | search[i].pattern, | 362 | search[dir][i].pattern, |
354 | search[i].plen, | 363 | search[dir][i].plen, |
355 | search[i].skip, | 364 | search[dir][i].skip, |
356 | search[i].term, | 365 | search[dir][i].term, |
357 | &matchoff, &matchlen, | 366 | &matchoff, &matchlen, |
358 | array, | 367 | array, |
359 | search[i].getnum); | 368 | search[dir][i].getnum); |
360 | if (found) break; | 369 | if (found) break; |
361 | } | 370 | } |
362 | if (found == -1) { | 371 | if (found == -1) { |
@@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb, | |||
366 | this case. */ | 375 | this case. */ |
367 | if (net_ratelimit()) | 376 | if (net_ratelimit()) |
368 | printk("conntrack_ftp: partial %s %u+%u\n", | 377 | printk("conntrack_ftp: partial %s %u+%u\n", |
369 | search[i].pattern, | 378 | search[dir][i].pattern, |
370 | ntohl(th->seq), datalen); | 379 | ntohl(th->seq), datalen); |
371 | ret = NF_DROP; | 380 | ret = NF_DROP; |
372 | goto out; | 381 | goto out; |
@@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb, | |||
426 | /* Now, NAT might want to mangle the packet, and register the | 435 | /* Now, NAT might want to mangle the packet, and register the |
427 | * (possibly changed) expectation itself. */ | 436 | * (possibly changed) expectation itself. */ |
428 | if (ip_nat_ftp_hook) | 437 | if (ip_nat_ftp_hook) |
429 | ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, | 438 | ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, |
430 | matchoff, matchlen, exp, &seq); | 439 | matchoff, matchlen, exp, &seq); |
431 | else { | 440 | else { |
432 | /* Can't expect this? Best to drop packet now. */ | 441 | /* Can't expect this? Best to drop packet now. */ |
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 518f581d39ec..0665674218c6 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> | 22 | #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> |
23 | #include <linux/netfilter_ipv4/ip_conntrack_h323.h> | 23 | #include <linux/netfilter_ipv4/ip_conntrack_h323.h> |
24 | #include <linux/moduleparam.h> | 24 | #include <linux/moduleparam.h> |
25 | #include <linux/ctype.h> | ||
26 | #include <linux/inet.h> | ||
25 | 27 | ||
26 | #if 0 | 28 | #if 0 |
27 | #define DEBUGP printk | 29 | #define DEBUGP printk |
@@ -38,6 +40,12 @@ static int gkrouted_only = 1; | |||
38 | module_param(gkrouted_only, int, 0600); | 40 | module_param(gkrouted_only, int, 0600); |
39 | MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); | 41 | MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); |
40 | 42 | ||
43 | static int callforward_filter = 1; | ||
44 | module_param(callforward_filter, bool, 0600); | ||
45 | MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " | ||
46 | "if both endpoints are on different sides " | ||
47 | "(determined by routing information)"); | ||
48 | |||
41 | /* Hooks for NAT */ | 49 | /* Hooks for NAT */ |
42 | int (*set_h245_addr_hook) (struct sk_buff ** pskb, | 50 | int (*set_h245_addr_hook) (struct sk_buff ** pskb, |
43 | unsigned char **data, int dataoff, | 51 | unsigned char **data, int dataoff, |
@@ -77,6 +85,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb, | |||
77 | unsigned char **data, int dataoff, | 85 | unsigned char **data, int dataoff, |
78 | TransportAddress * addr, u_int16_t port, | 86 | TransportAddress * addr, u_int16_t port, |
79 | struct ip_conntrack_expect * exp); | 87 | struct ip_conntrack_expect * exp); |
88 | int (*nat_callforwarding_hook) (struct sk_buff ** pskb, | ||
89 | struct ip_conntrack * ct, | ||
90 | enum ip_conntrack_info ctinfo, | ||
91 | unsigned char **data, int dataoff, | ||
92 | TransportAddress * addr, u_int16_t port, | ||
93 | struct ip_conntrack_expect * exp); | ||
80 | int (*nat_q931_hook) (struct sk_buff ** pskb, | 94 | int (*nat_q931_hook) (struct sk_buff ** pskb, |
81 | struct ip_conntrack * ct, | 95 | struct ip_conntrack * ct, |
82 | enum ip_conntrack_info ctinfo, | 96 | enum ip_conntrack_info ctinfo, |
@@ -683,6 +697,92 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, | |||
683 | return ret; | 697 | return ret; |
684 | } | 698 | } |
685 | 699 | ||
700 | /* Forwarding declaration */ | ||
701 | void ip_conntrack_q931_expect(struct ip_conntrack *new, | ||
702 | struct ip_conntrack_expect *this); | ||
703 | |||
704 | /****************************************************************************/ | ||
705 | static int expect_callforwarding(struct sk_buff **pskb, | ||
706 | struct ip_conntrack *ct, | ||
707 | enum ip_conntrack_info ctinfo, | ||
708 | unsigned char **data, int dataoff, | ||
709 | TransportAddress * addr) | ||
710 | { | ||
711 | int dir = CTINFO2DIR(ctinfo); | ||
712 | int ret = 0; | ||
713 | u_int32_t ip; | ||
714 | u_int16_t port; | ||
715 | struct ip_conntrack_expect *exp = NULL; | ||
716 | |||
717 | /* Read alternativeAddress */ | ||
718 | if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) | ||
719 | return 0; | ||
720 | |||
721 | /* If the calling party is on the same side of the forward-to party, | ||
722 | * we don't need to track the second call */ | ||
723 | if (callforward_filter) { | ||
724 | struct rtable *rt1, *rt2; | ||
725 | struct flowi fl1 = { | ||
726 | .fl4_dst = ip, | ||
727 | }; | ||
728 | struct flowi fl2 = { | ||
729 | .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, | ||
730 | }; | ||
731 | |||
732 | if (ip_route_output_key(&rt1, &fl1) == 0) { | ||
733 | if (ip_route_output_key(&rt2, &fl2) == 0) { | ||
734 | if (rt1->rt_gateway == rt2->rt_gateway && | ||
735 | rt1->u.dst.dev == rt2->u.dst.dev) | ||
736 | ret = 1; | ||
737 | dst_release(&rt2->u.dst); | ||
738 | } | ||
739 | dst_release(&rt1->u.dst); | ||
740 | } | ||
741 | if (ret) { | ||
742 | DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); | ||
743 | return 0; | ||
744 | } | ||
745 | } | ||
746 | |||
747 | /* Create expect for the second call leg */ | ||
748 | if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) | ||
749 | return -1; | ||
750 | exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; | ||
751 | exp->tuple.src.u.tcp.port = 0; | ||
752 | exp->tuple.dst.ip = ip; | ||
753 | exp->tuple.dst.u.tcp.port = htons(port); | ||
754 | exp->tuple.dst.protonum = IPPROTO_TCP; | ||
755 | exp->mask.src.ip = 0xFFFFFFFF; | ||
756 | exp->mask.src.u.tcp.port = 0; | ||
757 | exp->mask.dst.ip = 0xFFFFFFFF; | ||
758 | exp->mask.dst.u.tcp.port = 0xFFFF; | ||
759 | exp->mask.dst.protonum = 0xFF; | ||
760 | exp->flags = 0; | ||
761 | |||
762 | if (ct->tuplehash[dir].tuple.src.ip != | ||
763 | ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) { | ||
764 | /* Need NAT */ | ||
765 | ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff, | ||
766 | addr, port, exp); | ||
767 | } else { /* Conntrack only */ | ||
768 | exp->expectfn = ip_conntrack_q931_expect; | ||
769 | |||
770 | if (ip_conntrack_expect_related(exp) == 0) { | ||
771 | DEBUGP("ip_ct_q931: expect Call Forwarding " | ||
772 | "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", | ||
773 | NIPQUAD(exp->tuple.src.ip), | ||
774 | ntohs(exp->tuple.src.u.tcp.port), | ||
775 | NIPQUAD(exp->tuple.dst.ip), | ||
776 | ntohs(exp->tuple.dst.u.tcp.port)); | ||
777 | } else | ||
778 | ret = -1; | ||
779 | } | ||
780 | |||
781 | ip_conntrack_expect_put(exp); | ||
782 | |||
783 | return ret; | ||
784 | } | ||
785 | |||
686 | /****************************************************************************/ | 786 | /****************************************************************************/ |
687 | static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, | 787 | static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, |
688 | enum ip_conntrack_info ctinfo, | 788 | enum ip_conntrack_info ctinfo, |
@@ -878,6 +978,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, | |||
878 | 978 | ||
879 | DEBUGP("ip_ct_q931: Facility\n"); | 979 | DEBUGP("ip_ct_q931: Facility\n"); |
880 | 980 | ||
981 | if (facility->reason.choice == eFacilityReason_callForwarded) { | ||
982 | if (facility->options & eFacility_UUIE_alternativeAddress) | ||
983 | return expect_callforwarding(pskb, ct, ctinfo, data, | ||
984 | dataoff, | ||
985 | &facility-> | ||
986 | alternativeAddress); | ||
987 | return 0; | ||
988 | } | ||
989 | |||
881 | if (facility->options & eFacility_UUIE_h245Address) { | 990 | if (facility->options & eFacility_UUIE_h245Address) { |
882 | ret = expect_h245(pskb, ct, ctinfo, data, dataoff, | 991 | ret = expect_h245(pskb, ct, ctinfo, data, dataoff, |
883 | &facility->h245Address); | 992 | &facility->h245Address); |
@@ -1677,7 +1786,6 @@ static int __init init(void) | |||
1677 | fini(); | 1786 | fini(); |
1678 | return ret; | 1787 | return ret; |
1679 | } | 1788 | } |
1680 | |||
1681 | DEBUGP("ip_ct_h323: init success\n"); | 1789 | DEBUGP("ip_ct_h323: init success\n"); |
1682 | return 0; | 1790 | return 0; |
1683 | } | 1791 | } |
@@ -1696,6 +1804,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook); | |||
1696 | EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); | 1804 | EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); |
1697 | EXPORT_SYMBOL_GPL(nat_t120_hook); | 1805 | EXPORT_SYMBOL_GPL(nat_t120_hook); |
1698 | EXPORT_SYMBOL_GPL(nat_h245_hook); | 1806 | EXPORT_SYMBOL_GPL(nat_h245_hook); |
1807 | EXPORT_SYMBOL_GPL(nat_callforwarding_hook); | ||
1699 | EXPORT_SYMBOL_GPL(nat_q931_hook); | 1808 | EXPORT_SYMBOL_GPL(nat_q931_hook); |
1700 | 1809 | ||
1701 | MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); | 1810 | MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); |
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c index 022c47b9f6c9..4b359618bedd 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 | 1 | /* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 |
2 | * | 2 | * |
3 | * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> | 3 | * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> |
4 | * | 4 | * |
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ | |||
1069 | 1069 | ||
1070 | static field_t _Facility_UUIE[] = { /* SEQUENCE */ | 1070 | static field_t _Facility_UUIE[] = { /* SEQUENCE */ |
1071 | {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, | 1071 | {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, |
1072 | {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, | 1072 | {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, |
1073 | _TransportAddress}, | 1073 | offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, |
1074 | {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, | 1074 | {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, |
1075 | _Facility_UUIE_alternativeAliasAddress}, | 1075 | _Facility_UUIE_alternativeAliasAddress}, |
1076 | {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, | 1076 | {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, |
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 01bd7cab9367..33891bb1fde4 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c | |||
@@ -399,38 +399,54 @@ nfattr_failure: | |||
399 | static int ctnetlink_done(struct netlink_callback *cb) | 399 | static int ctnetlink_done(struct netlink_callback *cb) |
400 | { | 400 | { |
401 | DEBUGP("entered %s\n", __FUNCTION__); | 401 | DEBUGP("entered %s\n", __FUNCTION__); |
402 | if (cb->args[1]) | ||
403 | ip_conntrack_put((struct ip_conntrack *)cb->args[1]); | ||
402 | return 0; | 404 | return 0; |
403 | } | 405 | } |
404 | 406 | ||
405 | static int | 407 | static int |
406 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | 408 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) |
407 | { | 409 | { |
408 | struct ip_conntrack *ct = NULL; | 410 | struct ip_conntrack *ct, *last; |
409 | struct ip_conntrack_tuple_hash *h; | 411 | struct ip_conntrack_tuple_hash *h; |
410 | struct list_head *i; | 412 | struct list_head *i; |
411 | u_int32_t *id = (u_int32_t *) &cb->args[1]; | ||
412 | 413 | ||
413 | DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, | 414 | DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, |
414 | cb->args[0], *id); | 415 | cb->args[0], *id); |
415 | 416 | ||
416 | read_lock_bh(&ip_conntrack_lock); | 417 | read_lock_bh(&ip_conntrack_lock); |
417 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { | 418 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { |
419 | restart: | ||
420 | last = (struct ip_conntrack *)cb->args[1]; | ||
418 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { | 421 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { |
419 | h = (struct ip_conntrack_tuple_hash *) i; | 422 | h = (struct ip_conntrack_tuple_hash *) i; |
420 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) | 423 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) |
421 | continue; | 424 | continue; |
422 | ct = tuplehash_to_ctrack(h); | 425 | ct = tuplehash_to_ctrack(h); |
423 | if (ct->id <= *id) | 426 | if (last != NULL) { |
424 | continue; | 427 | if (ct == last) { |
428 | ip_conntrack_put(last); | ||
429 | cb->args[1] = 0; | ||
430 | last = NULL; | ||
431 | } else | ||
432 | continue; | ||
433 | } | ||
425 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, | 434 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, |
426 | cb->nlh->nlmsg_seq, | 435 | cb->nlh->nlmsg_seq, |
427 | IPCTNL_MSG_CT_NEW, | 436 | IPCTNL_MSG_CT_NEW, |
428 | 1, ct) < 0) | 437 | 1, ct) < 0) { |
438 | nf_conntrack_get(&ct->ct_general); | ||
439 | cb->args[1] = (unsigned long)ct; | ||
429 | goto out; | 440 | goto out; |
430 | *id = ct->id; | 441 | } |
442 | } | ||
443 | if (last != NULL) { | ||
444 | ip_conntrack_put(last); | ||
445 | cb->args[1] = 0; | ||
446 | goto restart; | ||
431 | } | 447 | } |
432 | } | 448 | } |
433 | out: | 449 | out: |
434 | read_unlock_bh(&ip_conntrack_lock); | 450 | read_unlock_bh(&ip_conntrack_lock); |
435 | 451 | ||
436 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); | 452 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); |
@@ -629,7 +645,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { | |||
629 | }; | 645 | }; |
630 | 646 | ||
631 | static inline int | 647 | static inline int |
632 | ctnetlink_parse_nat(struct nfattr *cda[], | 648 | ctnetlink_parse_nat(struct nfattr *nat, |
633 | const struct ip_conntrack *ct, struct ip_nat_range *range) | 649 | const struct ip_conntrack *ct, struct ip_nat_range *range) |
634 | { | 650 | { |
635 | struct nfattr *tb[CTA_NAT_MAX]; | 651 | struct nfattr *tb[CTA_NAT_MAX]; |
@@ -639,7 +655,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], | |||
639 | 655 | ||
640 | memset(range, 0, sizeof(*range)); | 656 | memset(range, 0, sizeof(*range)); |
641 | 657 | ||
642 | nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); | 658 | nfattr_parse_nested(tb, CTA_NAT_MAX, nat); |
643 | 659 | ||
644 | if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) | 660 | if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) |
645 | return -EINVAL; | 661 | return -EINVAL; |
@@ -854,39 +870,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) | |||
854 | /* ASSURED bit can only be set */ | 870 | /* ASSURED bit can only be set */ |
855 | return -EINVAL; | 871 | return -EINVAL; |
856 | 872 | ||
857 | if (cda[CTA_NAT-1]) { | 873 | if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { |
858 | #ifndef CONFIG_IP_NF_NAT_NEEDED | 874 | #ifndef CONFIG_IP_NF_NAT_NEEDED |
859 | return -EINVAL; | 875 | return -EINVAL; |
860 | #else | 876 | #else |
861 | unsigned int hooknum; | ||
862 | struct ip_nat_range range; | 877 | struct ip_nat_range range; |
863 | 878 | ||
864 | if (ctnetlink_parse_nat(cda, ct, &range) < 0) | 879 | if (cda[CTA_NAT_DST-1]) { |
865 | return -EINVAL; | 880 | if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, |
866 | 881 | &range) < 0) | |
867 | DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", | 882 | return -EINVAL; |
868 | NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), | 883 | if (ip_nat_initialized(ct, |
869 | htons(range.min.all), htons(range.max.all)); | 884 | HOOK2MANIP(NF_IP_PRE_ROUTING))) |
870 | 885 | return -EEXIST; | |
871 | /* This is tricky but it works. ip_nat_setup_info needs the | 886 | ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); |
872 | * hook number as parameter, so let's do the correct | 887 | } |
873 | * conversion and run away */ | 888 | if (cda[CTA_NAT_SRC-1]) { |
874 | if (status & IPS_SRC_NAT_DONE) | 889 | if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, |
875 | hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ | 890 | &range) < 0) |
876 | else if (status & IPS_DST_NAT_DONE) | 891 | return -EINVAL; |
877 | hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ | 892 | if (ip_nat_initialized(ct, |
878 | else | 893 | HOOK2MANIP(NF_IP_POST_ROUTING))) |
879 | return -EINVAL; /* Missing NAT flags */ | 894 | return -EEXIST; |
880 | 895 | ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); | |
881 | DEBUGP("NAT status: %lu\n", | 896 | } |
882 | status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
883 | |||
884 | if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) | ||
885 | return -EEXIST; | ||
886 | ip_nat_setup_info(ct, &range, hooknum); | ||
887 | |||
888 | DEBUGP("NAT status after setup_info: %lu\n", | ||
889 | ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
890 | #endif | 897 | #endif |
891 | } | 898 | } |
892 | 899 | ||
@@ -1106,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, | |||
1106 | /* implicit 'else' */ | 1113 | /* implicit 'else' */ |
1107 | 1114 | ||
1108 | /* we only allow nat config for new conntracks */ | 1115 | /* we only allow nat config for new conntracks */ |
1109 | if (cda[CTA_NAT-1]) { | 1116 | if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { |
1110 | err = -EINVAL; | 1117 | err = -EINVAL; |
1111 | goto out_unlock; | 1118 | goto out_unlock; |
1112 | } | 1119 | } |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 56794797d55b..21ee124c0463 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c | |||
@@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, | |||
77 | } | 77 | } |
78 | 78 | ||
79 | /* look up the source key for a given tuple */ | 79 | /* look up the source key for a given tuple */ |
80 | static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) | 80 | static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) |
81 | { | 81 | { |
82 | struct ip_ct_gre_keymap *km; | 82 | struct ip_ct_gre_keymap *km; |
83 | u_int32_t key = 0; | 83 | __be16 key = 0; |
84 | 84 | ||
85 | read_lock_bh(&ip_ct_gre_lock); | 85 | read_lock_bh(&ip_ct_gre_lock); |
86 | km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, | 86 | km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, |
@@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, | |||
190 | struct ip_conntrack_tuple *tuple) | 190 | struct ip_conntrack_tuple *tuple) |
191 | { | 191 | { |
192 | struct gre_hdr_pptp _pgrehdr, *pgrehdr; | 192 | struct gre_hdr_pptp _pgrehdr, *pgrehdr; |
193 | u_int32_t srckey; | 193 | __be16 srckey; |
194 | struct gre_hdr _grehdr, *grehdr; | 194 | struct gre_hdr _grehdr, *grehdr; |
195 | 195 | ||
196 | /* first only delinearize old RFC1701 GRE header */ | 196 | /* first only delinearize old RFC1701 GRE header */ |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index d8b14a9010a6..23f1c504586d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c | |||
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
224 | } | 224 | } |
225 | 225 | ||
226 | /* See ip_conntrack_proto_tcp.c */ | 226 | /* See ip_conntrack_proto_tcp.c */ |
227 | if (hooknum == NF_IP_PRE_ROUTING && | 227 | if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
228 | nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { | 228 | nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { |
229 | if (LOG_INVALID(IPPROTO_ICMP)) | 229 | if (LOG_INVALID(IPPROTO_ICMP)) |
230 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 230 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 062b252b58ad..c5c2ce5cdeb8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c | |||
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb, | |||
870 | * and moreover root might send raw packets. | 870 | * and moreover root might send raw packets. |
871 | */ | 871 | */ |
872 | /* FIXME: Source route IP option packets --RR */ | 872 | /* FIXME: Source route IP option packets --RR */ |
873 | if (hooknum == NF_IP_PRE_ROUTING && | 873 | if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
874 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { | 874 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { |
875 | if (LOG_INVALID(IPPROTO_TCP)) | 875 | if (LOG_INVALID(IPPROTO_TCP)) |
876 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 876 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 70899868783b..9b2c16b4d2ff 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c | |||
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
120 | * because the semantic of CHECKSUM_HW is different there | 120 | * because the semantic of CHECKSUM_HW is different there |
121 | * and moreover root might send raw packets. | 121 | * and moreover root might send raw packets. |
122 | * FIXME: Source route IP option packets --RR */ | 122 | * FIXME: Source route IP option packets --RR */ |
123 | if (hooknum == NF_IP_PRE_ROUTING && | 123 | if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
124 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { | 124 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { |
125 | if (LOG_INVALID(IPPROTO_UDP)) | 125 | if (LOG_INVALID(IPPROTO_UDP)) |
126 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 126 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c new file mode 100644 index 000000000000..fc87ce0da40d --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_sip.c | |||
@@ -0,0 +1,471 @@ | |||
1 | /* SIP extension for IP connection tracking. | ||
2 | * | ||
3 | * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> | ||
4 | * based on RR's ip_conntrack_ftp.c and other modules. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/config.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/ctype.h> | ||
14 | #include <linux/skbuff.h> | ||
15 | #include <linux/in.h> | ||
16 | #include <linux/ip.h> | ||
17 | #include <linux/udp.h> | ||
18 | |||
19 | #include <linux/netfilter.h> | ||
20 | #include <linux/netfilter_ipv4.h> | ||
21 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | ||
22 | #include <linux/netfilter_ipv4/ip_conntrack_sip.h> | ||
23 | |||
24 | #if 0 | ||
25 | #define DEBUGP printk | ||
26 | #else | ||
27 | #define DEBUGP(format, args...) | ||
28 | #endif | ||
29 | |||
30 | MODULE_LICENSE("GPL"); | ||
31 | MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); | ||
32 | MODULE_DESCRIPTION("SIP connection tracking helper"); | ||
33 | |||
34 | #define MAX_PORTS 8 | ||
35 | static unsigned short ports[MAX_PORTS]; | ||
36 | static int ports_c; | ||
37 | module_param_array(ports, ushort, &ports_c, 0400); | ||
38 | MODULE_PARM_DESC(ports, "port numbers of sip servers"); | ||
39 | |||
40 | static unsigned int sip_timeout = SIP_TIMEOUT; | ||
41 | module_param(sip_timeout, uint, 0600); | ||
42 | MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); | ||
43 | |||
44 | unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, | ||
45 | enum ip_conntrack_info ctinfo, | ||
46 | struct ip_conntrack *ct, | ||
47 | const char **dptr); | ||
48 | EXPORT_SYMBOL_GPL(ip_nat_sip_hook); | ||
49 | |||
50 | unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, | ||
51 | enum ip_conntrack_info ctinfo, | ||
52 | struct ip_conntrack_expect *exp, | ||
53 | const char *dptr); | ||
54 | EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); | ||
55 | |||
56 | int ct_sip_get_info(const char *dptr, size_t dlen, | ||
57 | unsigned int *matchoff, | ||
58 | unsigned int *matchlen, | ||
59 | struct sip_header_nfo *hnfo); | ||
60 | EXPORT_SYMBOL_GPL(ct_sip_get_info); | ||
61 | |||
62 | |||
63 | static int digits_len(const char *dptr, const char *limit, int *shift); | ||
64 | static int epaddr_len(const char *dptr, const char *limit, int *shift); | ||
65 | static int skp_digits_len(const char *dptr, const char *limit, int *shift); | ||
66 | static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); | ||
67 | |||
68 | struct sip_header_nfo ct_sip_hdrs[] = { | ||
69 | { /* Via header */ | ||
70 | .lname = "Via:", | ||
71 | .lnlen = sizeof("Via:") - 1, | ||
72 | .sname = "\r\nv:", | ||
73 | .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ | ||
74 | .ln_str = "UDP ", | ||
75 | .ln_strlen = sizeof("UDP ") - 1, | ||
76 | .match_len = epaddr_len, | ||
77 | }, | ||
78 | { /* Contact header */ | ||
79 | .lname = "Contact:", | ||
80 | .lnlen = sizeof("Contact:") - 1, | ||
81 | .sname = "\r\nm:", | ||
82 | .snlen = sizeof("\r\nm:") - 1, | ||
83 | .ln_str = "sip:", | ||
84 | .ln_strlen = sizeof("sip:") - 1, | ||
85 | .match_len = skp_epaddr_len | ||
86 | }, | ||
87 | { /* Content length header */ | ||
88 | .lname = "Content-Length:", | ||
89 | .lnlen = sizeof("Content-Length:") - 1, | ||
90 | .sname = "\r\nl:", | ||
91 | .snlen = sizeof("\r\nl:") - 1, | ||
92 | .ln_str = ":", | ||
93 | .ln_strlen = sizeof(":") - 1, | ||
94 | .match_len = skp_digits_len | ||
95 | }, | ||
96 | { /* SDP media info */ | ||
97 | .lname = "\nm=", | ||
98 | .lnlen = sizeof("\nm=") - 1, | ||
99 | .sname = "\rm=", | ||
100 | .snlen = sizeof("\rm=") - 1, | ||
101 | .ln_str = "audio ", | ||
102 | .ln_strlen = sizeof("audio ") - 1, | ||
103 | .match_len = digits_len | ||
104 | }, | ||
105 | { /* SDP owner address*/ | ||
106 | .lname = "\no=", | ||
107 | .lnlen = sizeof("\no=") - 1, | ||
108 | .sname = "\ro=", | ||
109 | .snlen = sizeof("\ro=") - 1, | ||
110 | .ln_str = "IN IP4 ", | ||
111 | .ln_strlen = sizeof("IN IP4 ") - 1, | ||
112 | .match_len = epaddr_len | ||
113 | }, | ||
114 | { /* SDP connection info */ | ||
115 | .lname = "\nc=", | ||
116 | .lnlen = sizeof("\nc=") - 1, | ||
117 | .sname = "\rc=", | ||
118 | .snlen = sizeof("\rc=") - 1, | ||
119 | .ln_str = "IN IP4 ", | ||
120 | .ln_strlen = sizeof("IN IP4 ") - 1, | ||
121 | .match_len = epaddr_len | ||
122 | }, | ||
123 | { /* Requests headers */ | ||
124 | .lname = "sip:", | ||
125 | .lnlen = sizeof("sip:") - 1, | ||
126 | .sname = "sip:", | ||
127 | .snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */ | ||
128 | .ln_str = "@", | ||
129 | .ln_strlen = sizeof("@") - 1, | ||
130 | .match_len = epaddr_len | ||
131 | }, | ||
132 | { /* SDP version header */ | ||
133 | .lname = "\nv=", | ||
134 | .lnlen = sizeof("\nv=") - 1, | ||
135 | .sname = "\rv=", | ||
136 | .snlen = sizeof("\rv=") - 1, | ||
137 | .ln_str = "=", | ||
138 | .ln_strlen = sizeof("=") - 1, | ||
139 | .match_len = digits_len | ||
140 | } | ||
141 | }; | ||
142 | EXPORT_SYMBOL_GPL(ct_sip_hdrs); | ||
143 | |||
/*
 * Length of the current line, measured from @line.
 *
 * Any CR/LF characters found at the very start are skipped first (and are
 * included in the returned count); scanning then continues up to, but not
 * including, the next CR or LF.  @limit is inclusive: the byte at @limit
 * itself is examined.
 */
int ct_sip_lnlen(const char *line, const char *limit)
{
	const char *p;

	/* Step over line terminators left in front of the line. */
	for (p = line; p <= limit; p++)
		if (*p != '\r' && *p != '\n')
			break;

	/* Walk the line body until the next terminator or the limit. */
	for (; p <= limit; p++)
		if (*p == '\r' || *p == '\n')
			break;

	return p - line;
}
159 | EXPORT_SYMBOL_GPL(ct_sip_lnlen); | ||
160 | |||
/*
 * Linear string search, case sensitive.
 *
 * Looks for the @needle_len bytes at @needle inside the @haystack_len bytes
 * at @haystack.  Returns a pointer to the first match, or NULL when there is
 * none.  A needle longer than the haystack can never match; that case is
 * rejected up front because "haystack_len - needle_len" would otherwise wrap
 * around (both are size_t) and yield an out-of-range limit pointer.
 */
const char *ct_sip_search(const char *needle, const char *haystack,
			  size_t needle_len, size_t haystack_len)
{
	const char *limit;

	if (needle_len > haystack_len)
		return NULL;

	/* Last position at which a full needle still fits. */
	limit = haystack + (haystack_len - needle_len);

	for (; haystack <= limit; haystack++) {
		if (memcmp(haystack, needle, needle_len) == 0)
			return haystack;
	}
	return NULL;
}
174 | EXPORT_SYMBOL_GPL(ct_sip_search); | ||
175 | |||
/*
 * Count the decimal digits at the start of @dptr, never looking beyond
 * @limit (inclusive).  @shift is not used here; the parameter exists so the
 * function fits the match_len callback signature.
 */
static int digits_len(const char *dptr, const char *limit, int *shift)
{
	const char *p;

	for (p = dptr; p <= limit; p++)
		if (!isdigit(*p))
			break;

	return p - dptr;
}
185 | |||
/*
 * Get the length of a run of digits, skipping blank spaces in front of it.
 * Every skipped space is added to *shift so the caller can locate the first
 * digit; the returned length covers the digits only.
 */
static int skp_digits_len(const char *dptr, const char *limit, int *shift)
{
	while (dptr <= limit && *dptr == ' ') {
		dptr++;
		++*shift;
	}

	return digits_len(dptr, limit, shift);
}
194 | |||
/*
 * Simple dotted-decimal address parser.
 *
 * Parses up to four '.'-separated decimal fields starting at @cp and stores
 * them, in the order read, into the successive bytes of *ipaddr (bytes that
 * are not reached stay zero).  Returns 0 on success and -1 when a field is
 * larger than 255 or when parsing stops without a field having been read
 * (no leading digit, or a '.' that is not followed by a digit).  On success,
 * and if @endp is non-NULL, *endp is set to the first unconsumed character.
 *
 * NOTE(review): @limit is only tested before each field; simple_strtoul()
 * is not given the limit.
 */
static int parse_ipaddr(const char *cp, const char **endp,
			u_int32_t *ipaddr, const char *limit)
{
	u_int8_t *octet = (u_int8_t *)ipaddr;
	unsigned long int val;
	int have_digit = 0;
	int i;

	*ipaddr = 0;

	for (i = 0; cp <= limit && i < 4; i++) {
		/* Cleared per field so a trailing '.' is reported as an error. */
		have_digit = 0;
		if (!isdigit(*cp))
			break;

		val = simple_strtoul(cp, (char **)&cp, 10);
		if (val > 0xFF)
			return -1;

		octet[i] = val;
		have_digit = 1;

		if (*cp != '.')
			break;
		cp++;
	}

	if (!have_digit)
		return -1;

	if (endp)
		*endp = cp;

	return 0;
}
226 | |||
/*
 * Measure an endpoint address: a dotted-decimal IP address optionally
 * followed by ":" and a run of digits (the port).  Returns the number of
 * characters covered, or 0 if no valid IP address starts at @dptr.
 */
static int epaddr_len(const char *dptr, const char *limit, int *shift)
{
	const char *end = dptr;
	u_int32_t ip;

	if (parse_ipaddr(dptr, &end, &ip, limit) < 0) {
		DEBUGP("ip: %s parse failed.!\n", dptr);
		return 0;
	}

	/* Port number */
	if (*end == ':')
		end += 1 + digits_len(end + 1, limit, shift);

	return end - dptr;
}
245 | |||
/*
 * Get the address length, skipping an optional "user@" part in front of it.
 *
 * The characters skipped (user info plus the '@') are added to *shift so the
 * caller can compute where the address itself starts.  If the current line
 * holds no '@', nothing is skipped: both the scan position and *shift are
 * restored and the address is parsed from the original @dptr.
 *
 * Returns whatever epaddr_len() returns for the address (0 when no valid
 * address is found).
 */
static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
{
	const char *start = dptr;
	int s = *shift;

	/*
	 * Search for '@', but stop at the end of the line so that an '@'
	 * belonging to a later header is never picked up.
	 */
	while (dptr <= limit &&
	       *dptr != '@' && *dptr != '\r' && *dptr != '\n') {
		(*shift)++;
		dptr++;
	}

	if (dptr <= limit && *dptr == '@') {
		/* Step over the '@' itself. */
		dptr++;
		(*shift)++;
	} else {
		/* No user info: rewind, the address starts right away. */
		dptr = start;
		*shift = s;
	}

	return epaddr_len(dptr, limit, shift);
}
262 | |||
263 | /* Returns 0 if not found, -1 error parsing. */ | ||
264 | int ct_sip_get_info(const char *dptr, size_t dlen, | ||
265 | unsigned int *matchoff, | ||
266 | unsigned int *matchlen, | ||
267 | struct sip_header_nfo *hnfo) | ||
268 | { | ||
269 | const char *limit, *aux, *k = dptr; | ||
270 | int shift = 0; | ||
271 | |||
272 | limit = dptr + (dlen - hnfo->lnlen); | ||
273 | |||
274 | while (dptr <= limit) { | ||
275 | if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && | ||
276 | (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { | ||
277 | dptr++; | ||
278 | continue; | ||
279 | } | ||
280 | aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, | ||
281 | ct_sip_lnlen(dptr, limit)); | ||
282 | if (!aux) { | ||
283 | DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, | ||
284 | hnfo->lname); | ||
285 | return -1; | ||
286 | } | ||
287 | aux += hnfo->ln_strlen; | ||
288 | |||
289 | *matchlen = hnfo->match_len(aux, limit, &shift); | ||
290 | if (!*matchlen) | ||
291 | return -1; | ||
292 | |||
293 | *matchoff = (aux - k) + shift; | ||
294 | |||
295 | DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, | ||
296 | *matchlen); | ||
297 | return 1; | ||
298 | } | ||
299 | DEBUGP("%s header not found.\n", hnfo->lname); | ||
300 | return 0; | ||
301 | } | ||
302 | |||
303 | static int set_expected_rtp(struct sk_buff **pskb, | ||
304 | struct ip_conntrack *ct, | ||
305 | enum ip_conntrack_info ctinfo, | ||
306 | u_int32_t ipaddr, u_int16_t port, | ||
307 | const char *dptr) | ||
308 | { | ||
309 | struct ip_conntrack_expect *exp; | ||
310 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
311 | int ret; | ||
312 | |||
313 | exp = ip_conntrack_expect_alloc(ct); | ||
314 | if (exp == NULL) | ||
315 | return NF_DROP; | ||
316 | |||
317 | exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; | ||
318 | exp->tuple.src.u.udp.port = 0; | ||
319 | exp->tuple.dst.ip = ipaddr; | ||
320 | exp->tuple.dst.u.udp.port = htons(port); | ||
321 | exp->tuple.dst.protonum = IPPROTO_UDP; | ||
322 | |||
323 | exp->mask.src.ip = 0xFFFFFFFF; | ||
324 | exp->mask.src.u.udp.port = 0; | ||
325 | exp->mask.dst.ip = 0xFFFFFFFF; | ||
326 | exp->mask.dst.u.udp.port = 0xFFFF; | ||
327 | exp->mask.dst.protonum = 0xFF; | ||
328 | |||
329 | exp->expectfn = NULL; | ||
330 | exp->flags = 0; | ||
331 | |||
332 | if (ip_nat_sdp_hook) | ||
333 | ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr); | ||
334 | else { | ||
335 | if (ip_conntrack_expect_related(exp) != 0) | ||
336 | ret = NF_DROP; | ||
337 | else | ||
338 | ret = NF_ACCEPT; | ||
339 | } | ||
340 | ip_conntrack_expect_put(exp); | ||
341 | |||
342 | return ret; | ||
343 | } | ||
344 | |||
345 | static int sip_help(struct sk_buff **pskb, | ||
346 | struct ip_conntrack *ct, | ||
347 | enum ip_conntrack_info ctinfo) | ||
348 | { | ||
349 | unsigned int dataoff, datalen; | ||
350 | const char *dptr; | ||
351 | int ret = NF_ACCEPT; | ||
352 | int matchoff, matchlen; | ||
353 | u_int32_t ipaddr; | ||
354 | u_int16_t port; | ||
355 | |||
356 | /* No Data ? */ | ||
357 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
358 | if (dataoff >= (*pskb)->len) { | ||
359 | DEBUGP("skb->len = %u\n", (*pskb)->len); | ||
360 | return NF_ACCEPT; | ||
361 | } | ||
362 | |||
363 | ip_ct_refresh(ct, *pskb, sip_timeout * HZ); | ||
364 | |||
365 | if (!skb_is_nonlinear(*pskb)) | ||
366 | dptr = (*pskb)->data + dataoff; | ||
367 | else { | ||
368 | DEBUGP("Copy of skbuff not supported yet.\n"); | ||
369 | goto out; | ||
370 | } | ||
371 | |||
372 | if (ip_nat_sip_hook) { | ||
373 | if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) { | ||
374 | ret = NF_DROP; | ||
375 | goto out; | ||
376 | } | ||
377 | } | ||
378 | |||
379 | /* After this point NAT, could have mangled skb, so | ||
380 | we need to recalculate payload lenght. */ | ||
381 | datalen = (*pskb)->len - dataoff; | ||
382 | |||
383 | if (datalen < (sizeof("SIP/2.0 200") - 1)) | ||
384 | goto out; | ||
385 | |||
386 | /* RTP info only in some SDP pkts */ | ||
387 | if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && | ||
388 | memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { | ||
389 | goto out; | ||
390 | } | ||
391 | /* Get ip and port address from SDP packet. */ | ||
392 | if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, | ||
393 | &ct_sip_hdrs[POS_CONNECTION]) > 0) { | ||
394 | |||
395 | /* We'll drop only if there are parse problems. */ | ||
396 | if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, | ||
397 | dptr + datalen) < 0) { | ||
398 | ret = NF_DROP; | ||
399 | goto out; | ||
400 | } | ||
401 | if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, | ||
402 | &ct_sip_hdrs[POS_MEDIA]) > 0) { | ||
403 | |||
404 | port = simple_strtoul(dptr + matchoff, NULL, 10); | ||
405 | if (port < 1024) { | ||
406 | ret = NF_DROP; | ||
407 | goto out; | ||
408 | } | ||
409 | ret = set_expected_rtp(pskb, ct, ctinfo, | ||
410 | ipaddr, port, dptr); | ||
411 | } | ||
412 | } | ||
413 | out: | ||
414 | return ret; | ||
415 | } | ||
416 | |||
417 | static struct ip_conntrack_helper sip[MAX_PORTS]; | ||
418 | static char sip_names[MAX_PORTS][10]; | ||
419 | |||
420 | static void fini(void) | ||
421 | { | ||
422 | int i; | ||
423 | for (i = 0; i < ports_c; i++) { | ||
424 | DEBUGP("unregistering helper for port %d\n", ports[i]); | ||
425 | ip_conntrack_helper_unregister(&sip[i]); | ||
426 | } | ||
427 | } | ||
428 | |||
429 | static int __init init(void) | ||
430 | { | ||
431 | int i, ret; | ||
432 | char *tmpname; | ||
433 | |||
434 | if (ports_c == 0) | ||
435 | ports[ports_c++] = SIP_PORT; | ||
436 | |||
437 | for (i = 0; i < ports_c; i++) { | ||
438 | /* Create helper structure */ | ||
439 | memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); | ||
440 | |||
441 | sip[i].tuple.dst.protonum = IPPROTO_UDP; | ||
442 | sip[i].tuple.src.u.udp.port = htons(ports[i]); | ||
443 | sip[i].mask.src.u.udp.port = 0xFFFF; | ||
444 | sip[i].mask.dst.protonum = 0xFF; | ||
445 | sip[i].max_expected = 1; | ||
446 | sip[i].timeout = 3 * 60; /* 3 minutes */ | ||
447 | sip[i].me = THIS_MODULE; | ||
448 | sip[i].help = sip_help; | ||
449 | |||
450 | tmpname = &sip_names[i][0]; | ||
451 | if (ports[i] == SIP_PORT) | ||
452 | sprintf(tmpname, "sip"); | ||
453 | else | ||
454 | sprintf(tmpname, "sip-%d", i); | ||
455 | sip[i].name = tmpname; | ||
456 | |||
457 | DEBUGP("port #%d: %d\n", i, ports[i]); | ||
458 | |||
459 | ret = ip_conntrack_helper_register(&sip[i]); | ||
460 | if (ret) { | ||
461 | printk("ERROR registering helper for port %d\n", | ||
462 | ports[i]); | ||
463 | fini(); | ||
464 | return ret; | ||
465 | } | ||
466 | } | ||
467 | return 0; | ||
468 | } | ||
469 | |||
470 | module_init(init); | ||
471 | module_exit(fini); | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 929d61f7be91..88445aac3f28 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c | |||
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
189 | return -ENOSPC; | 189 | return -ENOSPC; |
190 | #endif | 190 | #endif |
191 | 191 | ||
192 | #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK | ||
193 | if (seq_printf(s, "secmark=%u ", conntrack->secmark)) | ||
194 | return -ENOSPC; | ||
195 | #endif | ||
196 | |||
192 | if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) | 197 | if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) |
193 | return -ENOSPC; | 198 | return -ENOSPC; |
194 | 199 | ||
@@ -417,7 +422,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, | |||
417 | 422 | ||
418 | /* This is where we call the helper: as the packet goes out. */ | 423 | /* This is where we call the helper: as the packet goes out. */ |
419 | ct = ip_conntrack_get(*pskb, &ctinfo); | 424 | ct = ip_conntrack_get(*pskb, &ctinfo); |
420 | if (ct && ct->helper) { | 425 | if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { |
421 | unsigned int ret; | 426 | unsigned int ret; |
422 | ret = ct->helper->help(pskb, ct, ctinfo); | 427 | ret = ct->helper->help(pskb, ct, ctinfo); |
423 | if (ret != NF_ACCEPT) | 428 | if (ret != NF_ACCEPT) |
@@ -564,6 +569,8 @@ extern unsigned int ip_ct_generic_timeout; | |||
564 | static int log_invalid_proto_min = 0; | 569 | static int log_invalid_proto_min = 0; |
565 | static int log_invalid_proto_max = 255; | 570 | static int log_invalid_proto_max = 255; |
566 | 571 | ||
572 | int ip_conntrack_checksum = 1; | ||
573 | |||
567 | static struct ctl_table_header *ip_ct_sysctl_header; | 574 | static struct ctl_table_header *ip_ct_sysctl_header; |
568 | 575 | ||
569 | static ctl_table ip_ct_sysctl_table[] = { | 576 | static ctl_table ip_ct_sysctl_table[] = { |
@@ -592,6 +599,14 @@ static ctl_table ip_ct_sysctl_table[] = { | |||
592 | .proc_handler = &proc_dointvec, | 599 | .proc_handler = &proc_dointvec, |
593 | }, | 600 | }, |
594 | { | 601 | { |
602 | .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, | ||
603 | .procname = "ip_conntrack_checksum", | ||
604 | .data = &ip_conntrack_checksum, | ||
605 | .maxlen = sizeof(int), | ||
606 | .mode = 0644, | ||
607 | .proc_handler = &proc_dointvec, | ||
608 | }, | ||
609 | { | ||
595 | .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, | 610 | .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, |
596 | .procname = "ip_conntrack_tcp_timeout_syn_sent", | 611 | .procname = "ip_conntrack_tcp_timeout_syn_sent", |
597 | .data = &ip_ct_tcp_timeout_syn_sent, | 612 | .data = &ip_ct_tcp_timeout_syn_sent, |
@@ -946,6 +961,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); | |||
946 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); | 961 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); |
947 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); | 962 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); |
948 | EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); | 963 | EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); |
964 | EXPORT_SYMBOL_GPL(ip_conntrack_checksum); | ||
949 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | 965 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ |
950 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | 966 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) |
951 | EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); | 967 | EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); |
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c index d45663d137a7..419b878fb467 100644 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ b/net/ipv4/netfilter/ip_nat_helper_h323.c | |||
@@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, | |||
487 | } | 487 | } |
488 | 488 | ||
489 | /****************************************************************************/ | 489 | /****************************************************************************/ |
490 | static void ip_nat_callforwarding_expect(struct ip_conntrack *new, | ||
491 | struct ip_conntrack_expect *this) | ||
492 | { | ||
493 | struct ip_nat_range range; | ||
494 | |||
495 | /* This must be a fresh one. */ | ||
496 | BUG_ON(new->status & IPS_NAT_DONE_MASK); | ||
497 | |||
498 | /* Change src to where master sends to */ | ||
499 | range.flags = IP_NAT_RANGE_MAP_IPS; | ||
500 | range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; | ||
501 | |||
502 | /* hook doesn't matter, but it has to do source manip */ | ||
503 | ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); | ||
504 | |||
505 | /* For DST manip, map port here to where it's expected. */ | ||
506 | range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); | ||
507 | range.min = range.max = this->saved_proto; | ||
508 | range.min_ip = range.max_ip = this->saved_ip; | ||
509 | |||
510 | /* hook doesn't matter, but it has to do destination manip */ | ||
511 | ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); | ||
512 | |||
513 | ip_conntrack_q931_expect(new, this); | ||
514 | } | ||
515 | |||
516 | /****************************************************************************/ | ||
517 | static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, | ||
518 | enum ip_conntrack_info ctinfo, | ||
519 | unsigned char **data, int dataoff, | ||
520 | TransportAddress * addr, u_int16_t port, | ||
521 | struct ip_conntrack_expect *exp) | ||
522 | { | ||
523 | int dir = CTINFO2DIR(ctinfo); | ||
524 | u_int16_t nated_port; | ||
525 | |||
526 | /* Set expectations for NAT */ | ||
527 | exp->saved_ip = exp->tuple.dst.ip; | ||
528 | exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; | ||
529 | exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; | ||
530 | exp->expectfn = ip_nat_callforwarding_expect; | ||
531 | exp->dir = !dir; | ||
532 | |||
533 | /* Try to get same port: if not, try to change it. */ | ||
534 | for (nated_port = port; nated_port != 0; nated_port++) { | ||
535 | exp->tuple.dst.u.tcp.port = htons(nated_port); | ||
536 | if (ip_conntrack_expect_related(exp) == 0) | ||
537 | break; | ||
538 | } | ||
539 | |||
540 | if (nated_port == 0) { /* No port available */ | ||
541 | if (net_ratelimit()) | ||
542 | printk("ip_nat_q931: out of TCP ports\n"); | ||
543 | return 0; | ||
544 | } | ||
545 | |||
546 | /* Modify signal */ | ||
547 | if (!set_h225_addr(pskb, data, dataoff, addr, | ||
548 | ct->tuplehash[!dir].tuple.dst.ip, | ||
549 | nated_port) == 0) { | ||
550 | ip_conntrack_unexpect_related(exp); | ||
551 | return -1; | ||
552 | } | ||
553 | |||
554 | /* Success */ | ||
555 | DEBUGP("ip_nat_q931: expect Call Forwarding " | ||
556 | "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", | ||
557 | NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), | ||
558 | NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); | ||
559 | |||
560 | return 0; | ||
561 | } | ||
562 | |||
563 | /****************************************************************************/ | ||
490 | static int __init init(void) | 564 | static int __init init(void) |
491 | { | 565 | { |
492 | BUG_ON(set_h245_addr_hook != NULL); | 566 | BUG_ON(set_h245_addr_hook != NULL); |
@@ -496,6 +570,7 @@ static int __init init(void) | |||
496 | BUG_ON(nat_rtp_rtcp_hook != NULL); | 570 | BUG_ON(nat_rtp_rtcp_hook != NULL); |
497 | BUG_ON(nat_t120_hook != NULL); | 571 | BUG_ON(nat_t120_hook != NULL); |
498 | BUG_ON(nat_h245_hook != NULL); | 572 | BUG_ON(nat_h245_hook != NULL); |
573 | BUG_ON(nat_callforwarding_hook != NULL); | ||
499 | BUG_ON(nat_q931_hook != NULL); | 574 | BUG_ON(nat_q931_hook != NULL); |
500 | 575 | ||
501 | set_h245_addr_hook = set_h245_addr; | 576 | set_h245_addr_hook = set_h245_addr; |
@@ -505,6 +580,7 @@ static int __init init(void) | |||
505 | nat_rtp_rtcp_hook = nat_rtp_rtcp; | 580 | nat_rtp_rtcp_hook = nat_rtp_rtcp; |
506 | nat_t120_hook = nat_t120; | 581 | nat_t120_hook = nat_t120; |
507 | nat_h245_hook = nat_h245; | 582 | nat_h245_hook = nat_h245; |
583 | nat_callforwarding_hook = nat_callforwarding; | ||
508 | nat_q931_hook = nat_q931; | 584 | nat_q931_hook = nat_q931; |
509 | 585 | ||
510 | DEBUGP("ip_nat_h323: init success\n"); | 586 | DEBUGP("ip_nat_h323: init success\n"); |
@@ -521,6 +597,7 @@ static void __exit fini(void) | |||
521 | nat_rtp_rtcp_hook = NULL; | 597 | nat_rtp_rtcp_hook = NULL; |
522 | nat_t120_hook = NULL; | 598 | nat_t120_hook = NULL; |
523 | nat_h245_hook = NULL; | 599 | nat_h245_hook = NULL; |
600 | nat_callforwarding_hook = NULL; | ||
524 | nat_q931_hook = NULL; | 601 | nat_q931_hook = NULL; |
525 | synchronize_net(); | 602 | synchronize_net(); |
526 | } | 603 | } |
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c new file mode 100644 index 000000000000..6ffba63adca2 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_sip.c | |||
@@ -0,0 +1,249 @@ | |||
1 | /* SIP extension for UDP NAT alteration. | ||
2 | * | ||
3 | * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> | ||
4 | * based on RR's ip_nat_ftp.c and other modules. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | #include <linux/ip.h> | ||
14 | #include <linux/udp.h> | ||
15 | |||
16 | #include <linux/netfilter_ipv4.h> | ||
17 | #include <linux/netfilter_ipv4/ip_nat.h> | ||
18 | #include <linux/netfilter_ipv4/ip_nat_helper.h> | ||
19 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | ||
20 | #include <linux/netfilter_ipv4/ip_conntrack_sip.h> | ||
21 | |||
22 | MODULE_LICENSE("GPL"); | ||
23 | MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); | ||
24 | MODULE_DESCRIPTION("SIP NAT helper"); | ||
25 | |||
26 | #if 0 | ||
27 | #define DEBUGP printk | ||
28 | #else | ||
29 | #define DEBUGP(format, args...) | ||
30 | #endif | ||
31 | |||
32 | extern struct sip_header_nfo ct_sip_hdrs[]; | ||
33 | |||
34 | static unsigned int mangle_sip_packet(struct sk_buff **pskb, | ||
35 | enum ip_conntrack_info ctinfo, | ||
36 | struct ip_conntrack *ct, | ||
37 | const char **dptr, size_t dlen, | ||
38 | char *buffer, int bufflen, | ||
39 | struct sip_header_nfo *hnfo) | ||
40 | { | ||
41 | unsigned int matchlen, matchoff; | ||
42 | |||
43 | if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0) | ||
44 | return 0; | ||
45 | |||
46 | if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, | ||
47 | matchoff, matchlen, buffer, bufflen)) | ||
48 | return 0; | ||
49 | |||
50 | /* We need to reload this. Thanks Patrick. */ | ||
51 | *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
52 | return 1; | ||
53 | } | ||
54 | |||
55 | static unsigned int ip_nat_sip(struct sk_buff **pskb, | ||
56 | enum ip_conntrack_info ctinfo, | ||
57 | struct ip_conntrack *ct, | ||
58 | const char **dptr) | ||
59 | { | ||
60 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
61 | char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; | ||
62 | unsigned int bufflen, dataoff; | ||
63 | u_int32_t ip; | ||
64 | u_int16_t port; | ||
65 | |||
66 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
67 | |||
68 | ip = ct->tuplehash[!dir].tuple.dst.ip; | ||
69 | port = ct->tuplehash[!dir].tuple.dst.u.udp.port; | ||
70 | bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port)); | ||
71 | |||
72 | /* short packet ? */ | ||
73 | if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1)) | ||
74 | return 0; | ||
75 | |||
76 | /* Basic rules: requests and responses. */ | ||
77 | if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) { | ||
78 | const char *aux; | ||
79 | |||
80 | if ((ctinfo) < IP_CT_IS_REPLY) { | ||
81 | mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
82 | (*pskb)->len - dataoff, | ||
83 | buffer, bufflen, | ||
84 | &ct_sip_hdrs[POS_CONTACT]); | ||
85 | return 1; | ||
86 | } | ||
87 | |||
88 | if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
89 | (*pskb)->len - dataoff, | ||
90 | buffer, bufflen, &ct_sip_hdrs[POS_VIA])) | ||
91 | return 0; | ||
92 | |||
93 | /* This search should ignore case, but later.. */ | ||
94 | aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1, | ||
95 | (*pskb)->len - dataoff); | ||
96 | if (!aux) | ||
97 | return 0; | ||
98 | |||
99 | if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"), | ||
100 | ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff))) | ||
101 | return 1; | ||
102 | |||
103 | return mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
104 | (*pskb)->len - dataoff, | ||
105 | buffer, bufflen, | ||
106 | &ct_sip_hdrs[POS_CONTACT]); | ||
107 | } | ||
108 | if ((ctinfo) < IP_CT_IS_REPLY) { | ||
109 | if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
110 | (*pskb)->len - dataoff, | ||
111 | buffer, bufflen, &ct_sip_hdrs[POS_VIA])) | ||
112 | return 0; | ||
113 | |||
114 | /* Mangle Contact if exists only. - watch udp_nat_mangle()! */ | ||
115 | mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff, | ||
116 | buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]); | ||
117 | return 1; | ||
118 | } | ||
119 | /* This mangle requests headers. */ | ||
120 | return mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
121 | ct_sip_lnlen(*dptr, | ||
122 | *dptr + (*pskb)->len - dataoff), | ||
123 | buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]); | ||
124 | } | ||
125 | |||
126 | static int mangle_content_len(struct sk_buff **pskb, | ||
127 | enum ip_conntrack_info ctinfo, | ||
128 | struct ip_conntrack *ct, | ||
129 | const char *dptr) | ||
130 | { | ||
131 | unsigned int dataoff, matchoff, matchlen; | ||
132 | char buffer[sizeof("65536")]; | ||
133 | int bufflen; | ||
134 | |||
135 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
136 | |||
137 | /* Get actual SDP lenght */ | ||
138 | if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, | ||
139 | &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) { | ||
140 | |||
141 | /* since ct_sip_get_info() give us a pointer passing 'v=' | ||
142 | we need to add 2 bytes in this count. */ | ||
143 | int c_len = (*pskb)->len - dataoff - matchoff + 2; | ||
144 | |||
145 | /* Now, update SDP lenght */ | ||
146 | if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, | ||
147 | &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) { | ||
148 | |||
149 | bufflen = sprintf(buffer, "%u", c_len); | ||
150 | |||
151 | return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, | ||
152 | matchoff, matchlen, | ||
153 | buffer, bufflen); | ||
154 | } | ||
155 | } | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static unsigned int mangle_sdp(struct sk_buff **pskb, | ||
160 | enum ip_conntrack_info ctinfo, | ||
161 | struct ip_conntrack *ct, | ||
162 | u_int32_t newip, u_int16_t port, | ||
163 | const char *dptr) | ||
164 | { | ||
165 | char buffer[sizeof("nnn.nnn.nnn.nnn")]; | ||
166 | unsigned int dataoff, bufflen; | ||
167 | |||
168 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
169 | |||
170 | /* Mangle owner and contact info. */ | ||
171 | bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); | ||
172 | if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, | ||
173 | buffer, bufflen, &ct_sip_hdrs[POS_OWNER])) | ||
174 | return 0; | ||
175 | |||
176 | if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, | ||
177 | buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION])) | ||
178 | return 0; | ||
179 | |||
180 | /* Mangle media port. */ | ||
181 | bufflen = sprintf(buffer, "%u", port); | ||
182 | if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, | ||
183 | buffer, bufflen, &ct_sip_hdrs[POS_MEDIA])) | ||
184 | return 0; | ||
185 | |||
186 | return mangle_content_len(pskb, ctinfo, ct, dptr); | ||
187 | } | ||
188 | |||
189 | /* So, this packet has hit the connection tracking matching code. | ||
190 | Mangle it, and change the expectation to match the new version. */ | ||
191 | static unsigned int ip_nat_sdp(struct sk_buff **pskb, | ||
192 | enum ip_conntrack_info ctinfo, | ||
193 | struct ip_conntrack_expect *exp, | ||
194 | const char *dptr) | ||
195 | { | ||
196 | struct ip_conntrack *ct = exp->master; | ||
197 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
198 | u_int32_t newip; | ||
199 | u_int16_t port; | ||
200 | |||
201 | DEBUGP("ip_nat_sdp():\n"); | ||
202 | |||
203 | /* Connection will come from reply */ | ||
204 | newip = ct->tuplehash[!dir].tuple.dst.ip; | ||
205 | |||
206 | exp->tuple.dst.ip = newip; | ||
207 | exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; | ||
208 | exp->dir = !dir; | ||
209 | |||
210 | /* When you see the packet, we need to NAT it the same as the | ||
211 | this one. */ | ||
212 | exp->expectfn = ip_nat_follow_master; | ||
213 | |||
214 | /* Try to get same port: if not, try to change it. */ | ||
215 | for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { | ||
216 | exp->tuple.dst.u.udp.port = htons(port); | ||
217 | if (ip_conntrack_expect_related(exp) == 0) | ||
218 | break; | ||
219 | } | ||
220 | |||
221 | if (port == 0) | ||
222 | return NF_DROP; | ||
223 | |||
224 | if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { | ||
225 | ip_conntrack_unexpect_related(exp); | ||
226 | return NF_DROP; | ||
227 | } | ||
228 | return NF_ACCEPT; | ||
229 | } | ||
230 | |||
231 | static void __exit fini(void) | ||
232 | { | ||
233 | ip_nat_sip_hook = NULL; | ||
234 | ip_nat_sdp_hook = NULL; | ||
235 | /* Make sure noone calls it, meanwhile. */ | ||
236 | synchronize_net(); | ||
237 | } | ||
238 | |||
239 | static int __init init(void) | ||
240 | { | ||
241 | BUG_ON(ip_nat_sip_hook); | ||
242 | BUG_ON(ip_nat_sdp_hook); | ||
243 | ip_nat_sip_hook = ip_nat_sip; | ||
244 | ip_nat_sdp_hook = ip_nat_sdp; | ||
245 | return 0; | ||
246 | } | ||
247 | |||
248 | module_init(init); | ||
249 | module_exit(fini); | ||
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index c33244263b90..d20d557f915a 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c | |||
@@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void) | |||
1348 | module_init(ip_nat_snmp_basic_init); | 1348 | module_init(ip_nat_snmp_basic_init); |
1349 | module_exit(ip_nat_snmp_basic_fini); | 1349 | module_exit(ip_nat_snmp_basic_fini); |
1350 | 1350 | ||
1351 | module_param(debug, bool, 0600); | 1351 | module_param(debug, int, 0600); |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index aad9d28c8d71..dbc83c5d7aa6 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
@@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) | |||
241 | struct iphdr *iph = skb->nh.iph; | 241 | struct iphdr *iph = skb->nh.iph; |
242 | unsigned long hashval; | 242 | unsigned long hashval; |
243 | u_int16_t sport, dport; | 243 | u_int16_t sport, dport; |
244 | struct tcphdr *th; | 244 | u_int16_t *ports; |
245 | struct udphdr *uh; | ||
246 | struct icmphdr *ih; | ||
247 | 245 | ||
248 | switch (iph->protocol) { | 246 | switch (iph->protocol) { |
249 | case IPPROTO_TCP: | 247 | case IPPROTO_TCP: |
250 | th = (void *)iph+iph->ihl*4; | ||
251 | sport = ntohs(th->source); | ||
252 | dport = ntohs(th->dest); | ||
253 | break; | ||
254 | case IPPROTO_UDP: | 248 | case IPPROTO_UDP: |
255 | uh = (void *)iph+iph->ihl*4; | 249 | case IPPROTO_SCTP: |
256 | sport = ntohs(uh->source); | 250 | case IPPROTO_DCCP: |
257 | dport = ntohs(uh->dest); | ||
258 | break; | ||
259 | case IPPROTO_ICMP: | 251 | case IPPROTO_ICMP: |
260 | ih = (void *)iph+iph->ihl*4; | 252 | ports = (void *)iph+iph->ihl*4; |
261 | sport = ntohs(ih->un.echo.id); | 253 | sport = ports[0]; |
262 | dport = (ih->type<<8)|ih->code; | 254 | dport = ports[1]; |
263 | break; | 255 | break; |
264 | default: | 256 | default: |
265 | if (net_ratelimit()) { | 257 | if (net_ratelimit()) { |
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0bba3c2bb786..431a3ce6f7b7 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
147 | /* This packet will not be the same as the other: clear nf fields */ | 147 | /* This packet will not be the same as the other: clear nf fields */ |
148 | nf_reset(nskb); | 148 | nf_reset(nskb); |
149 | nskb->nfmark = 0; | 149 | nskb->nfmark = 0; |
150 | skb_init_secmark(nskb); | ||
150 | 151 | ||
151 | tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); | 152 | tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); |
152 | 153 | ||
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 7c6836c4646e..92980ab8ce48 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c | |||
@@ -28,9 +28,6 @@ | |||
28 | #include <linux/jhash.h> | 28 | #include <linux/jhash.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/vmalloc.h> | 30 | #include <linux/vmalloc.h> |
31 | #include <linux/tcp.h> | ||
32 | #include <linux/udp.h> | ||
33 | #include <linux/sctp.h> | ||
34 | #include <linux/proc_fs.h> | 31 | #include <linux/proc_fs.h> |
35 | #include <linux/seq_file.h> | 32 | #include <linux/seq_file.h> |
36 | #include <linux/list.h> | 33 | #include <linux/list.h> |
@@ -83,6 +80,7 @@ struct ipt_hashlimit_htable { | |||
83 | /* used internally */ | 80 | /* used internally */ |
84 | spinlock_t lock; /* lock for list_head */ | 81 | spinlock_t lock; /* lock for list_head */ |
85 | u_int32_t rnd; /* random seed for hash */ | 82 | u_int32_t rnd; /* random seed for hash */ |
83 | int rnd_initialized; | ||
86 | struct timer_list timer; /* timer for gc */ | 84 | struct timer_list timer; /* timer for gc */ |
87 | atomic_t count; /* number entries in table */ | 85 | atomic_t count; /* number entries in table */ |
88 | 86 | ||
@@ -137,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) | |||
137 | 135 | ||
138 | /* initialize hash with random val at the time we allocate | 136 | /* initialize hash with random val at the time we allocate |
139 | * the first hashtable entry */ | 137 | * the first hashtable entry */ |
140 | if (!ht->rnd) | 138 | if (!ht->rnd_initialized) { |
141 | get_random_bytes(&ht->rnd, 4); | 139 | get_random_bytes(&ht->rnd, 4); |
140 | ht->rnd_initialized = 1; | ||
141 | } | ||
142 | 142 | ||
143 | if (ht->cfg.max && | 143 | if (ht->cfg.max && |
144 | atomic_read(&ht->count) >= ht->cfg.max) { | 144 | atomic_read(&ht->count) >= ht->cfg.max) { |
@@ -217,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo) | |||
217 | 217 | ||
218 | atomic_set(&hinfo->count, 0); | 218 | atomic_set(&hinfo->count, 0); |
219 | atomic_set(&hinfo->use, 1); | 219 | atomic_set(&hinfo->use, 1); |
220 | hinfo->rnd = 0; | 220 | hinfo->rnd_initialized = 0; |
221 | spin_lock_init(&hinfo->lock); | 221 | spin_lock_init(&hinfo->lock); |
222 | hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); | 222 | hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); |
223 | if (!hinfo->pde) { | 223 | if (!hinfo->pde) { |
@@ -381,49 +381,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) | |||
381 | dh->rateinfo.credit = dh->rateinfo.credit_cap; | 381 | dh->rateinfo.credit = dh->rateinfo.credit_cap; |
382 | } | 382 | } |
383 | 383 | ||
384 | static inline int get_ports(const struct sk_buff *skb, int offset, | ||
385 | u16 ports[2]) | ||
386 | { | ||
387 | union { | ||
388 | struct tcphdr th; | ||
389 | struct udphdr uh; | ||
390 | sctp_sctphdr_t sctph; | ||
391 | } hdr_u, *ptr_u; | ||
392 | |||
393 | /* Must not be a fragment. */ | ||
394 | if (offset) | ||
395 | return 1; | ||
396 | |||
397 | /* Must be big enough to read ports (both UDP and TCP have | ||
398 | them at the start). */ | ||
399 | ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); | ||
400 | if (!ptr_u) | ||
401 | return 1; | ||
402 | |||
403 | switch (skb->nh.iph->protocol) { | ||
404 | case IPPROTO_TCP: | ||
405 | ports[0] = ptr_u->th.source; | ||
406 | ports[1] = ptr_u->th.dest; | ||
407 | break; | ||
408 | case IPPROTO_UDP: | ||
409 | ports[0] = ptr_u->uh.source; | ||
410 | ports[1] = ptr_u->uh.dest; | ||
411 | break; | ||
412 | case IPPROTO_SCTP: | ||
413 | ports[0] = ptr_u->sctph.source; | ||
414 | ports[1] = ptr_u->sctph.dest; | ||
415 | break; | ||
416 | default: | ||
417 | /* all other protocols don't supprot per-port hash | ||
418 | * buckets */ | ||
419 | ports[0] = ports[1] = 0; | ||
420 | break; | ||
421 | } | ||
422 | |||
423 | return 0; | ||
424 | } | ||
425 | |||
426 | |||
427 | static int | 384 | static int |
428 | hashlimit_match(const struct sk_buff *skb, | 385 | hashlimit_match(const struct sk_buff *skb, |
429 | const struct net_device *in, | 386 | const struct net_device *in, |
@@ -449,8 +406,22 @@ hashlimit_match(const struct sk_buff *skb, | |||
449 | dst.src_ip = skb->nh.iph->saddr; | 406 | dst.src_ip = skb->nh.iph->saddr; |
450 | if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT | 407 | if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT |
451 | ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { | 408 | ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { |
452 | u_int16_t ports[2]; | 409 | u_int16_t _ports[2], *ports; |
453 | if (get_ports(skb, offset, ports)) { | 410 | |
411 | switch (skb->nh.iph->protocol) { | ||
412 | case IPPROTO_TCP: | ||
413 | case IPPROTO_UDP: | ||
414 | case IPPROTO_SCTP: | ||
415 | case IPPROTO_DCCP: | ||
416 | ports = skb_header_pointer(skb, skb->nh.iph->ihl*4, | ||
417 | sizeof(_ports), &_ports); | ||
418 | break; | ||
419 | default: | ||
420 | _ports[0] = _ports[1] = 0; | ||
421 | ports = _ports; | ||
422 | break; | ||
423 | } | ||
424 | if (!ports) { | ||
454 | /* We've been asked to examine this packet, and we | 425 | /* We've been asked to examine this packet, and we |
455 | can't. Hence, no choice but to drop. */ | 426 | can't. Hence, no choice but to drop. */ |
456 | *hotdrop = 1; | 427 | *hotdrop = 1; |
@@ -561,7 +532,7 @@ static void | |||
561 | hashlimit_destroy(const struct xt_match *match, void *matchinfo, | 532 | hashlimit_destroy(const struct xt_match *match, void *matchinfo, |
562 | unsigned int matchsize) | 533 | unsigned int matchsize) |
563 | { | 534 | { |
564 | struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; | 535 | struct ipt_hashlimit_info *r = matchinfo; |
565 | 536 | ||
566 | htable_put(r->hinfo); | 537 | htable_put(r->hinfo); |
567 | } | 538 | } |
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index b847ee409efb..61a2139f9cfd 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c | |||
@@ -1,1007 +1,499 @@ | |||
1 | /* Kernel module to check if the source address has been seen recently. */ | 1 | /* |
2 | /* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ | 2 | * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> |
3 | /* Author: Stephen Frost <sfrost@snowman.net> */ | 3 | * |
4 | /* Project Page: http://snowman.net/projects/ipt_recent/ */ | 4 | * This program is free software; you can redistribute it and/or modify |
5 | /* This software is distributed under the terms of the GPL, Version 2 */ | 5 | * it under the terms of the GNU General Public License version 2 as |
6 | /* This copyright does not cover user programs that use kernel services | 6 | * published by the Free Software Foundation. |
7 | * by normal system calls. */ | 7 | * |
8 | 8 | * This is a replacement of the old ipt_recent module, which carried the | |
9 | #include <linux/module.h> | 9 | * following copyright notice: |
10 | #include <linux/skbuff.h> | 10 | * |
11 | * Author: Stephen Frost <sfrost@snowman.net> | ||
12 | * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org | ||
13 | */ | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/moduleparam.h> | ||
11 | #include <linux/proc_fs.h> | 16 | #include <linux/proc_fs.h> |
12 | #include <linux/spinlock.h> | 17 | #include <linux/seq_file.h> |
13 | #include <linux/interrupt.h> | 18 | #include <linux/string.h> |
14 | #include <asm/uaccess.h> | ||
15 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
16 | #include <linux/ip.h> | 20 | #include <linux/list.h> |
17 | #include <linux/vmalloc.h> | 21 | #include <linux/random.h> |
18 | #include <linux/moduleparam.h> | 22 | #include <linux/jhash.h> |
23 | #include <linux/bitops.h> | ||
24 | #include <linux/skbuff.h> | ||
25 | #include <linux/inet.h> | ||
19 | 26 | ||
20 | #include <linux/netfilter_ipv4/ip_tables.h> | 27 | #include <linux/netfilter_ipv4/ip_tables.h> |
21 | #include <linux/netfilter_ipv4/ipt_recent.h> | 28 | #include <linux/netfilter_ipv4/ipt_recent.h> |
22 | 29 | ||
23 | #undef DEBUG | 30 | MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); |
24 | #define HASH_LOG 9 | 31 | MODULE_DESCRIPTION("IP tables recently seen matching module"); |
32 | MODULE_LICENSE("GPL"); | ||
25 | 33 | ||
26 | /* Defaults, these can be overridden on the module command-line. */ | ||
27 | static unsigned int ip_list_tot = 100; | 34 | static unsigned int ip_list_tot = 100; |
28 | static unsigned int ip_pkt_list_tot = 20; | 35 | static unsigned int ip_pkt_list_tot = 20; |
29 | static unsigned int ip_list_hash_size = 0; | 36 | static unsigned int ip_list_hash_size = 0; |
30 | static unsigned int ip_list_perms = 0644; | 37 | static unsigned int ip_list_perms = 0644; |
31 | #ifdef DEBUG | ||
32 | static int debug = 1; | ||
33 | #endif | ||
34 | |||
35 | static char version[] = | ||
36 | KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n"; | ||
37 | |||
38 | MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>"); | ||
39 | MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); | ||
40 | MODULE_LICENSE("GPL"); | ||
41 | module_param(ip_list_tot, uint, 0400); | 38 | module_param(ip_list_tot, uint, 0400); |
42 | module_param(ip_pkt_list_tot, uint, 0400); | 39 | module_param(ip_pkt_list_tot, uint, 0400); |
43 | module_param(ip_list_hash_size, uint, 0400); | 40 | module_param(ip_list_hash_size, uint, 0400); |
44 | module_param(ip_list_perms, uint, 0400); | 41 | module_param(ip_list_perms, uint, 0400); |
45 | #ifdef DEBUG | 42 | MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); |
46 | module_param(debug, bool, 0600); | 43 | MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); |
47 | MODULE_PARM_DESC(debug,"enable debugging output"); | 44 | MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); |
48 | #endif | 45 | MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); |
49 | MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); | 46 | |
50 | MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); | 47 | |
51 | MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); | 48 | struct recent_entry { |
52 | MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); | 49 | struct list_head list; |
53 | 50 | struct list_head lru_list; | |
54 | /* Structure of our list of recently seen addresses. */ | 51 | u_int32_t addr; |
55 | struct recent_ip_list { | 52 | u_int8_t ttl; |
56 | u_int32_t addr; | 53 | u_int8_t index; |
57 | u_int8_t ttl; | 54 | u_int16_t nstamps; |
58 | unsigned long last_seen; | 55 | unsigned long stamps[0]; |
59 | unsigned long *last_pkts; | ||
60 | u_int32_t oldest_pkt; | ||
61 | u_int32_t hash_entry; | ||
62 | u_int32_t time_pos; | ||
63 | }; | ||
64 | |||
65 | struct time_info_list { | ||
66 | u_int32_t position; | ||
67 | u_int32_t time; | ||
68 | }; | 56 | }; |
69 | 57 | ||
70 | /* Structure of our linked list of tables of recent lists. */ | 58 | struct recent_table { |
71 | struct recent_ip_tables { | 59 | struct list_head list; |
72 | char name[IPT_RECENT_NAME_LEN]; | 60 | char name[IPT_RECENT_NAME_LEN]; |
73 | int count; | ||
74 | int time_pos; | ||
75 | struct recent_ip_list *table; | ||
76 | struct recent_ip_tables *next; | ||
77 | spinlock_t list_lock; | ||
78 | int *hash_table; | ||
79 | struct time_info_list *time_info; | ||
80 | #ifdef CONFIG_PROC_FS | 61 | #ifdef CONFIG_PROC_FS |
81 | struct proc_dir_entry *status_proc; | 62 | struct proc_dir_entry *proc; |
82 | #endif /* CONFIG_PROC_FS */ | 63 | #endif |
64 | unsigned int refcnt; | ||
65 | unsigned int entries; | ||
66 | struct list_head lru_list; | ||
67 | struct list_head iphash[0]; | ||
83 | }; | 68 | }; |
84 | 69 | ||
85 | /* Our current list of addresses we have recently seen. | 70 | static LIST_HEAD(tables); |
86 | * Only added to on a --set, and only updated on --set || --update | ||
87 | */ | ||
88 | static struct recent_ip_tables *r_tables = NULL; | ||
89 | |||
90 | /* We protect r_list with this spinlock so two processors are not modifying | ||
91 | * the list at the same time. | ||
92 | */ | ||
93 | static DEFINE_SPINLOCK(recent_lock); | 71 | static DEFINE_SPINLOCK(recent_lock); |
72 | static DEFINE_MUTEX(recent_mutex); | ||
94 | 73 | ||
95 | #ifdef CONFIG_PROC_FS | 74 | #ifdef CONFIG_PROC_FS |
96 | /* Our /proc/net/ipt_recent entry */ | 75 | static struct proc_dir_entry *proc_dir; |
97 | static struct proc_dir_entry *proc_net_ipt_recent = NULL; | 76 | static struct file_operations recent_fops; |
98 | #endif | ||
99 | |||
100 | /* Function declaration for later. */ | ||
101 | static int | ||
102 | match(const struct sk_buff *skb, | ||
103 | const struct net_device *in, | ||
104 | const struct net_device *out, | ||
105 | const struct xt_match *match, | ||
106 | const void *matchinfo, | ||
107 | int offset, | ||
108 | unsigned int protoff, | ||
109 | int *hotdrop); | ||
110 | |||
111 | /* Function to hash a given address into the hash table of table_size size */ | ||
112 | static int hash_func(unsigned int addr, int table_size) | ||
113 | { | ||
114 | int result = 0; | ||
115 | unsigned int value = addr; | ||
116 | do { result ^= value; } while((value >>= HASH_LOG)); | ||
117 | |||
118 | #ifdef DEBUG | ||
119 | if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", | ||
120 | result & (table_size - 1), | ||
121 | addr, | ||
122 | table_size); | ||
123 | #endif | 77 | #endif |
124 | 78 | ||
125 | return(result & (table_size - 1)); | 79 | static u_int32_t hash_rnd; |
126 | } | 80 | static int hash_rnd_initted; |
127 | 81 | ||
128 | #ifdef CONFIG_PROC_FS | 82 | static unsigned int recent_entry_hash(u_int32_t addr) |
129 | /* This is the function which produces the output for our /proc output | ||
130 | * interface which lists each IP address, the last seen time and the | ||
131 | * other recent times the address was seen. | ||
132 | */ | ||
133 | |||
134 | static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) | ||
135 | { | 83 | { |
136 | int len = 0, count, last_len = 0, pkt_count; | 84 | if (!hash_rnd_initted) { |
137 | off_t pos = 0; | 85 | get_random_bytes(&hash_rnd, 4); |
138 | off_t begin = 0; | 86 | hash_rnd_initted = 1; |
139 | struct recent_ip_tables *curr_table; | ||
140 | |||
141 | curr_table = (struct recent_ip_tables*) data; | ||
142 | |||
143 | spin_lock_bh(&curr_table->list_lock); | ||
144 | for(count = 0; count < ip_list_tot; count++) { | ||
145 | if(!curr_table->table[count].addr) continue; | ||
146 | last_len = len; | ||
147 | len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); | ||
148 | len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); | ||
149 | len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); | ||
150 | len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); | ||
151 | len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); | ||
152 | for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { | ||
153 | if(!curr_table->table[count].last_pkts[pkt_count]) break; | ||
154 | len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); | ||
155 | } | ||
156 | len += sprintf(buffer+len,"\n"); | ||
157 | pos = begin + len; | ||
158 | if(pos < offset) { len = 0; begin = pos; } | ||
159 | if(pos > offset + length) { len = last_len; break; } | ||
160 | } | 87 | } |
161 | 88 | return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1); | |
162 | *start = buffer + (offset - begin); | ||
163 | len -= (offset - begin); | ||
164 | if(len > length) len = length; | ||
165 | |||
166 | spin_unlock_bh(&curr_table->list_lock); | ||
167 | return len; | ||
168 | } | 89 | } |
169 | 90 | ||
170 | /* ip_recent_ctrl provides an interface for users to modify the table | 91 | static struct recent_entry * |
171 | * directly. This allows adding entries, removing entries, and | 92 | recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl) |
172 | * flushing the entire table. | ||
173 | * This is done by opening up the appropriate table for writing and | ||
174 | * sending one of: | ||
175 | * xx.xx.xx.xx -- Add entry to table with current time | ||
176 | * +xx.xx.xx.xx -- Add entry to table with current time | ||
177 | * -xx.xx.xx.xx -- Remove entry from table | ||
178 | * clear -- Flush table, remove all entries | ||
179 | */ | ||
180 | |||
181 | static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) | ||
182 | { | 93 | { |
183 | static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; | 94 | struct recent_entry *e; |
184 | u_int32_t val; | 95 | unsigned int h; |
185 | int base, used = 0; | 96 | |
186 | char c, *cp; | 97 | h = recent_entry_hash(addr); |
187 | union iaddr { | 98 | list_for_each_entry(e, &table->iphash[h], list) |
188 | uint8_t bytes[4]; | 99 | if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) |
189 | uint32_t word; | 100 | return e; |
190 | } res; | 101 | return NULL; |
191 | uint8_t *pp = res.bytes; | 102 | } |
192 | int digit; | ||
193 | |||
194 | char buffer[20]; | ||
195 | int len, check_set = 0, count; | ||
196 | u_int32_t addr = 0; | ||
197 | struct sk_buff *skb; | ||
198 | struct ipt_recent_info *info; | ||
199 | struct recent_ip_tables *curr_table; | ||
200 | |||
201 | curr_table = (struct recent_ip_tables*) data; | ||
202 | |||
203 | if(size > 20) len = 20; else len = size; | ||
204 | |||
205 | if(copy_from_user(buffer,input,len)) return -EFAULT; | ||
206 | |||
207 | if(len < 20) buffer[len] = '\0'; | ||
208 | |||
209 | #ifdef DEBUG | ||
210 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); | ||
211 | #endif | ||
212 | 103 | ||
213 | cp = buffer; | 104 | static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) |
214 | while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } | 105 | { |
106 | list_del(&e->list); | ||
107 | list_del(&e->lru_list); | ||
108 | kfree(e); | ||
109 | t->entries--; | ||
110 | } | ||
215 | 111 | ||
216 | /* Check if we are asked to flush the entire table */ | 112 | static struct recent_entry * |
217 | if(!memcmp(cp,"clear",5)) { | 113 | recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl) |
218 | used += 5; | 114 | { |
219 | spin_lock_bh(&curr_table->list_lock); | 115 | struct recent_entry *e; |
220 | curr_table->time_pos = 0; | ||
221 | for(count = 0; count < ip_list_hash_size; count++) { | ||
222 | curr_table->hash_table[count] = -1; | ||
223 | } | ||
224 | for(count = 0; count < ip_list_tot; count++) { | ||
225 | curr_table->table[count].last_seen = 0; | ||
226 | curr_table->table[count].addr = 0; | ||
227 | curr_table->table[count].ttl = 0; | ||
228 | memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); | ||
229 | curr_table->table[count].oldest_pkt = 0; | ||
230 | curr_table->table[count].time_pos = 0; | ||
231 | curr_table->time_info[count].position = count; | ||
232 | curr_table->time_info[count].time = 0; | ||
233 | } | ||
234 | spin_unlock_bh(&curr_table->list_lock); | ||
235 | return used; | ||
236 | } | ||
237 | 116 | ||
238 | check_set = IPT_RECENT_SET; | 117 | if (t->entries >= ip_list_tot) { |
239 | switch(*cp) { | 118 | e = list_entry(t->lru_list.next, struct recent_entry, lru_list); |
240 | case '+': check_set = IPT_RECENT_SET; cp++; used++; break; | 119 | recent_entry_remove(t, e); |
241 | case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; | ||
242 | default: if(!isdigit(*cp)) return (used+1); break; | ||
243 | } | 120 | } |
121 | e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, | ||
122 | GFP_ATOMIC); | ||
123 | if (e == NULL) | ||
124 | return NULL; | ||
125 | e->addr = addr; | ||
126 | e->ttl = ttl; | ||
127 | e->stamps[0] = jiffies; | ||
128 | e->nstamps = 1; | ||
129 | e->index = 1; | ||
130 | list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); | ||
131 | list_add_tail(&e->lru_list, &t->lru_list); | ||
132 | t->entries++; | ||
133 | return e; | ||
134 | } | ||
244 | 135 | ||
245 | #ifdef DEBUG | 136 | static void recent_entry_update(struct recent_table *t, struct recent_entry *e) |
246 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); | 137 | { |
247 | #endif | 138 | e->stamps[e->index++] = jiffies; |
248 | /* Get addr (effectively inet_aton()) */ | 139 | if (e->index > e->nstamps) |
249 | /* Shamelessly stolen from libc, a function in the kernel for doing | 140 | e->nstamps = e->index; |
250 | * this would, of course, be greatly preferred, but our options appear | 141 | e->index %= ip_pkt_list_tot; |
251 | * to be rather limited, so we will just do it ourselves here. | 142 | list_move_tail(&e->lru_list, &t->lru_list); |
252 | */ | 143 | } |
253 | res.word = 0; | ||
254 | |||
255 | c = *cp; | ||
256 | for(;;) { | ||
257 | if(!isdigit(c)) return used; | ||
258 | val = 0; base = 10; digit = 0; | ||
259 | if(c == '0') { | ||
260 | c = *++cp; | ||
261 | if(c == 'x' || c == 'X') base = 16, c = *++cp; | ||
262 | else { base = 8; digit = 1; } | ||
263 | } | ||
264 | for(;;) { | ||
265 | if(isascii(c) && isdigit(c)) { | ||
266 | if(base == 8 && (c == '8' || c == '0')) return used; | ||
267 | val = (val * base) + (c - '0'); | ||
268 | c = *++cp; | ||
269 | digit = 1; | ||
270 | } else if(base == 16 && isascii(c) && isxdigit(c)) { | ||
271 | val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A')); | ||
272 | c = *++cp; | ||
273 | digit = 1; | ||
274 | } else break; | ||
275 | } | ||
276 | if(c == '.') { | ||
277 | if(pp > res.bytes + 2 || val > 0xff) return used; | ||
278 | *pp++ = val; | ||
279 | c = *++cp; | ||
280 | } else break; | ||
281 | } | ||
282 | used = cp - buffer; | ||
283 | if(c != '\0' && (!isascii(c) || !isspace(c))) return used; | ||
284 | if(c == '\n') used++; | ||
285 | if(!digit) return used; | ||
286 | 144 | ||
287 | if(val > max[pp - res.bytes]) return used; | 145 | static struct recent_table *recent_table_lookup(const char *name) |
288 | addr = res.word | htonl(val); | 146 | { |
147 | struct recent_table *t; | ||
289 | 148 | ||
290 | if(!addr && check_set == IPT_RECENT_SET) return used; | 149 | list_for_each_entry(t, &tables, list) |
150 | if (!strcmp(t->name, name)) | ||
151 | return t; | ||
152 | return NULL; | ||
153 | } | ||
291 | 154 | ||
292 | #ifdef DEBUG | 155 | static void recent_table_flush(struct recent_table *t) |
293 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); | 156 | { |
294 | #endif | 157 | struct recent_entry *e, *next; |
158 | unsigned int i; | ||
295 | 159 | ||
296 | /* Set up and just call match */ | 160 | for (i = 0; i < ip_list_hash_size; i++) { |
297 | info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); | 161 | list_for_each_entry_safe(e, next, &t->iphash[i], list) |
298 | if(!info) { return -ENOMEM; } | 162 | recent_entry_remove(t, e); |
299 | info->seconds = 0; | ||
300 | info->hit_count = 0; | ||
301 | info->check_set = check_set; | ||
302 | info->invert = 0; | ||
303 | info->side = IPT_RECENT_SOURCE; | ||
304 | strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); | ||
305 | info->name[IPT_RECENT_NAME_LEN-1] = '\0'; | ||
306 | |||
307 | skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); | ||
308 | if (!skb) { | ||
309 | used = -ENOMEM; | ||
310 | goto out_free_info; | ||
311 | } | ||
312 | skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); | ||
313 | if (!skb->nh.iph) { | ||
314 | used = -ENOMEM; | ||
315 | goto out_free_skb; | ||
316 | } | 163 | } |
317 | |||
318 | skb->nh.iph->saddr = addr; | ||
319 | skb->nh.iph->daddr = 0; | ||
320 | /* Clear ttl since we have no way of knowing it */ | ||
321 | skb->nh.iph->ttl = 0; | ||
322 | match(skb,NULL,NULL,NULL,info,0,0,NULL); | ||
323 | |||
324 | kfree(skb->nh.iph); | ||
325 | out_free_skb: | ||
326 | kfree(skb); | ||
327 | out_free_info: | ||
328 | kfree(info); | ||
329 | |||
330 | #ifdef DEBUG | ||
331 | if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); | ||
332 | #endif | ||
333 | return used; | ||
334 | } | 164 | } |
335 | 165 | ||
336 | #endif /* CONFIG_PROC_FS */ | ||
337 | |||
338 | /* 'match' is our primary function, called by the kernel whenever a rule is | ||
339 | * hit with our module as an option to it. | ||
340 | * What this function does depends on what was specifically asked of it by | ||
341 | * the user: | ||
342 | * --set -- Add or update last seen time of the source address of the packet | ||
343 | * -- matchinfo->check_set == IPT_RECENT_SET | ||
344 | * --rcheck -- Just check if the source address is in the list | ||
345 | * -- matchinfo->check_set == IPT_RECENT_CHECK | ||
346 | * --update -- If the source address is in the list, update last_seen | ||
347 | * -- matchinfo->check_set == IPT_RECENT_UPDATE | ||
348 | * --remove -- If the source address is in the list, remove it | ||
349 | * -- matchinfo->check_set == IPT_RECENT_REMOVE | ||
350 | * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds | ||
351 | * -- matchinfo->seconds | ||
352 | * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times | ||
353 | * -- matchinfo->hit_count | ||
354 | * --seconds and --hitcount can be combined | ||
355 | */ | ||
356 | static int | 166 | static int |
357 | match(const struct sk_buff *skb, | 167 | ipt_recent_match(const struct sk_buff *skb, |
358 | const struct net_device *in, | 168 | const struct net_device *in, const struct net_device *out, |
359 | const struct net_device *out, | 169 | const struct xt_match *match, const void *matchinfo, |
360 | const struct xt_match *match, | 170 | int offset, unsigned int protoff, int *hotdrop) |
361 | const void *matchinfo, | ||
362 | int offset, | ||
363 | unsigned int protoff, | ||
364 | int *hotdrop) | ||
365 | { | 171 | { |
366 | int pkt_count, hits_found, ans; | ||
367 | unsigned long now; | ||
368 | const struct ipt_recent_info *info = matchinfo; | 172 | const struct ipt_recent_info *info = matchinfo; |
369 | u_int32_t addr = 0, time_temp; | 173 | struct recent_table *t; |
370 | u_int8_t ttl = skb->nh.iph->ttl; | 174 | struct recent_entry *e; |
371 | int *hash_table; | 175 | u_int32_t addr; |
372 | int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; | 176 | u_int8_t ttl; |
373 | struct time_info_list *time_info; | 177 | int ret = info->invert; |
374 | struct recent_ip_tables *curr_table; | ||
375 | struct recent_ip_tables *last_table; | ||
376 | struct recent_ip_list *r_list; | ||
377 | |||
378 | #ifdef DEBUG | ||
379 | if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); | ||
380 | #endif | ||
381 | |||
382 | /* Default is false ^ info->invert */ | ||
383 | ans = info->invert; | ||
384 | 178 | ||
385 | #ifdef DEBUG | 179 | if (info->side == IPT_RECENT_DEST) |
386 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); | 180 | addr = skb->nh.iph->daddr; |
387 | #endif | 181 | else |
182 | addr = skb->nh.iph->saddr; | ||
388 | 183 | ||
389 | /* if out != NULL then routing has been done and TTL changed. | 184 | ttl = skb->nh.iph->ttl; |
390 | * We change it back here internally for match what came in before routing. */ | 185 | /* use TTL as seen before forwarding */ |
391 | if(out) ttl++; | 186 | if (out && !skb->sk) |
187 | ttl++; | ||
392 | 188 | ||
393 | /* Find the right table */ | ||
394 | spin_lock_bh(&recent_lock); | 189 | spin_lock_bh(&recent_lock); |
395 | curr_table = r_tables; | 190 | t = recent_table_lookup(info->name); |
396 | while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); | 191 | e = recent_entry_lookup(t, addr, |
397 | 192 | info->check_set & IPT_RECENT_TTL ? ttl : 0); | |
398 | #ifdef DEBUG | 193 | if (e == NULL) { |
399 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); | 194 | if (!(info->check_set & IPT_RECENT_SET)) |
400 | #endif | 195 | goto out; |
401 | 196 | e = recent_entry_init(t, addr, ttl); | |
402 | spin_unlock_bh(&recent_lock); | 197 | if (e == NULL) |
403 | 198 | *hotdrop = 1; | |
404 | /* Table with this name not found, match impossible */ | 199 | ret ^= 1; |
405 | if(!curr_table) { return ans; } | 200 | goto out; |
406 | |||
407 | /* Make sure no one is changing the list while we work with it */ | ||
408 | spin_lock_bh(&curr_table->list_lock); | ||
409 | |||
410 | r_list = curr_table->table; | ||
411 | if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; | ||
412 | |||
413 | if(!addr) { | ||
414 | #ifdef DEBUG | ||
415 | if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); | ||
416 | #endif | ||
417 | spin_unlock_bh(&curr_table->list_lock); | ||
418 | return ans; | ||
419 | } | ||
420 | |||
421 | #ifdef DEBUG | ||
422 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); | ||
423 | #endif | ||
424 | |||
425 | /* Get jiffies now in case they changed while we were waiting for a lock */ | ||
426 | now = jiffies; | ||
427 | hash_table = curr_table->hash_table; | ||
428 | time_info = curr_table->time_info; | ||
429 | |||
430 | orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); | ||
431 | /* Hash entry at this result used */ | ||
432 | /* Check for TTL match if requested. If TTL is zero then a match would never | ||
433 | * happen, so match regardless of existing TTL in that case. Zero means the | ||
434 | * entry was added via the /proc interface anyway, so we will just use the | ||
435 | * first TTL we get for that IP address. */ | ||
436 | if(info->check_set & IPT_RECENT_TTL) { | ||
437 | while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && | ||
438 | (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { | ||
439 | /* Collision in hash table */ | ||
440 | hash_result = (hash_result + 1) % ip_list_hash_size; | ||
441 | } | ||
442 | } else { | ||
443 | while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { | ||
444 | /* Collision in hash table */ | ||
445 | hash_result = (hash_result + 1) % ip_list_hash_size; | ||
446 | } | ||
447 | } | ||
448 | |||
449 | if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { | ||
450 | /* IP not in list and not asked to SET */ | ||
451 | spin_unlock_bh(&curr_table->list_lock); | ||
452 | return ans; | ||
453 | } | ||
454 | |||
455 | /* Check if we need to handle the collision, do not need to on REMOVE */ | ||
456 | if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { | ||
457 | #ifdef DEBUG | ||
458 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", | ||
459 | orig_hash_result, | ||
460 | hash_result, | ||
461 | r_list[hash_table[orig_hash_result]].addr, | ||
462 | addr); | ||
463 | #endif | ||
464 | |||
465 | /* We had a collision. | ||
466 | * orig_hash_result is where we started, hash_result is where we ended up. | ||
467 | * So, swap them because we are likely to see the same guy again sooner */ | ||
468 | #ifdef DEBUG | ||
469 | if(debug) { | ||
470 | printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); | ||
471 | printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", | ||
472 | r_list[hash_table[orig_hash_result]].hash_entry); | ||
473 | } | ||
474 | #endif | ||
475 | |||
476 | r_list[hash_table[orig_hash_result]].hash_entry = hash_result; | ||
477 | |||
478 | |||
479 | temp = hash_table[orig_hash_result]; | ||
480 | #ifdef DEBUG | ||
481 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); | ||
482 | #endif | ||
483 | hash_table[orig_hash_result] = hash_table[hash_result]; | ||
484 | hash_table[hash_result] = temp; | ||
485 | temp = hash_result; | ||
486 | hash_result = orig_hash_result; | ||
487 | orig_hash_result = temp; | ||
488 | time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; | ||
489 | if(hash_table[hash_result] != -1) { | ||
490 | r_list[hash_table[hash_result]].hash_entry = hash_result; | ||
491 | time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; | ||
492 | } | ||
493 | |||
494 | #ifdef DEBUG | ||
495 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); | ||
496 | #endif | ||
497 | } | 201 | } |
498 | 202 | ||
499 | if(hash_table[hash_result] == -1) { | 203 | if (info->check_set & IPT_RECENT_SET) |
500 | #ifdef DEBUG | 204 | ret ^= 1; |
501 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", | 205 | else if (info->check_set & IPT_RECENT_REMOVE) { |
502 | hash_result, addr); | 206 | recent_entry_remove(t, e); |
503 | #endif | 207 | ret ^= 1; |
504 | 208 | } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { | |
505 | /* New item found and IPT_RECENT_SET, so we need to add it */ | 209 | unsigned long t = jiffies - info->seconds * HZ; |
506 | location = time_info[curr_table->time_pos].position; | 210 | unsigned int i, hits = 0; |
507 | hash_table[r_list[location].hash_entry] = -1; | 211 | |
508 | hash_table[hash_result] = location; | 212 | for (i = 0; i < e->nstamps; i++) { |
509 | memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); | 213 | if (info->seconds && time_after(t, e->stamps[i])) |
510 | r_list[location].time_pos = curr_table->time_pos; | 214 | continue; |
511 | r_list[location].addr = addr; | 215 | if (++hits >= info->hit_count) { |
512 | r_list[location].ttl = ttl; | 216 | ret ^= 1; |
513 | r_list[location].last_seen = now; | 217 | break; |
514 | r_list[location].oldest_pkt = 1; | ||
515 | r_list[location].last_pkts[0] = now; | ||
516 | r_list[location].hash_entry = hash_result; | ||
517 | time_info[curr_table->time_pos].time = r_list[location].last_seen; | ||
518 | curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; | ||
519 | |||
520 | ans = !info->invert; | ||
521 | } else { | ||
522 | #ifdef DEBUG | ||
523 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", | ||
524 | hash_result, | ||
525 | addr); | ||
526 | #endif | ||
527 | |||
528 | /* Existing item found */ | ||
529 | location = hash_table[hash_result]; | ||
530 | /* We have a match on address, now to make sure it meets all requirements for a | ||
531 | * full match. */ | ||
532 | if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { | ||
533 | if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; | ||
534 | if(info->seconds && !info->hit_count) { | ||
535 | if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; | ||
536 | } | ||
537 | if(info->seconds && info->hit_count) { | ||
538 | for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { | ||
539 | if(r_list[location].last_pkts[pkt_count] == 0) break; | ||
540 | if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; | ||
541 | } | ||
542 | if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; | ||
543 | } | ||
544 | if(info->hit_count && !info->seconds) { | ||
545 | for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { | ||
546 | if(r_list[location].last_pkts[pkt_count] == 0) break; | ||
547 | hits_found++; | ||
548 | } | ||
549 | if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; | ||
550 | } | 218 | } |
551 | } | 219 | } |
552 | #ifdef DEBUG | ||
553 | if(debug) { | ||
554 | if(ans) | ||
555 | printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); | ||
556 | else | ||
557 | printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); | ||
558 | } | ||
559 | #endif | ||
560 | |||
561 | /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the | ||
562 | * current timestamp to the last_seen. */ | ||
563 | if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { | ||
564 | #ifdef DEBUG | ||
565 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); | ||
566 | #endif | ||
567 | /* Have to update our time info */ | ||
568 | time_loc = r_list[location].time_pos; | ||
569 | time_info[time_loc].time = now; | ||
570 | time_info[time_loc].position = location; | ||
571 | while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { | ||
572 | time_temp = time_info[time_loc].time; | ||
573 | time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; | ||
574 | time_info[(time_loc+1)%ip_list_tot].time = time_temp; | ||
575 | time_temp = time_info[time_loc].position; | ||
576 | time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; | ||
577 | time_info[(time_loc+1)%ip_list_tot].position = time_temp; | ||
578 | r_list[time_info[time_loc].position].time_pos = time_loc; | ||
579 | r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; | ||
580 | time_loc = (time_loc+1) % ip_list_tot; | ||
581 | } | ||
582 | r_list[location].time_pos = time_loc; | ||
583 | r_list[location].ttl = ttl; | ||
584 | r_list[location].last_pkts[r_list[location].oldest_pkt] = now; | ||
585 | r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; | ||
586 | r_list[location].last_seen = now; | ||
587 | } | ||
588 | /* If we have been asked to remove the entry from the list, just set it to 0 */ | ||
589 | if(info->check_set & IPT_RECENT_REMOVE) { | ||
590 | #ifdef DEBUG | ||
591 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); | ||
592 | #endif | ||
593 | /* Check if this is part of a collision chain */ | ||
594 | while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { | ||
595 | orig_hash_result++; | ||
596 | if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { | ||
597 | /* Found collision chain, how deep does this rabbit hole go? */ | ||
598 | #ifdef DEBUG | ||
599 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); | ||
600 | #endif | ||
601 | end_collision_chain = orig_hash_result; | ||
602 | } | ||
603 | } | ||
604 | if(end_collision_chain != -1) { | ||
605 | #ifdef DEBUG | ||
606 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); | ||
607 | #endif | ||
608 | /* Part of a collision chain, swap it with the end of the chain | ||
609 | * before removing. */ | ||
610 | r_list[hash_table[end_collision_chain]].hash_entry = hash_result; | ||
611 | temp = hash_table[end_collision_chain]; | ||
612 | hash_table[end_collision_chain] = hash_table[hash_result]; | ||
613 | hash_table[hash_result] = temp; | ||
614 | time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; | ||
615 | hash_result = end_collision_chain; | ||
616 | r_list[hash_table[hash_result]].hash_entry = hash_result; | ||
617 | time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; | ||
618 | } | ||
619 | location = hash_table[hash_result]; | ||
620 | hash_table[r_list[location].hash_entry] = -1; | ||
621 | time_loc = r_list[location].time_pos; | ||
622 | time_info[time_loc].time = 0; | ||
623 | time_info[time_loc].position = location; | ||
624 | while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { | ||
625 | time_temp = time_info[time_loc].time; | ||
626 | time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; | ||
627 | time_info[(time_loc+1)%ip_list_tot].time = time_temp; | ||
628 | time_temp = time_info[time_loc].position; | ||
629 | time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; | ||
630 | time_info[(time_loc+1)%ip_list_tot].position = time_temp; | ||
631 | r_list[time_info[time_loc].position].time_pos = time_loc; | ||
632 | r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; | ||
633 | time_loc = (time_loc+1) % ip_list_tot; | ||
634 | } | ||
635 | r_list[location].time_pos = time_loc; | ||
636 | r_list[location].last_seen = 0; | ||
637 | r_list[location].addr = 0; | ||
638 | r_list[location].ttl = 0; | ||
639 | memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); | ||
640 | r_list[location].oldest_pkt = 0; | ||
641 | ans = !info->invert; | ||
642 | } | ||
643 | spin_unlock_bh(&curr_table->list_lock); | ||
644 | return ans; | ||
645 | } | 220 | } |
646 | 221 | ||
647 | spin_unlock_bh(&curr_table->list_lock); | 222 | if (info->check_set & IPT_RECENT_SET || |
648 | #ifdef DEBUG | 223 | (info->check_set & IPT_RECENT_UPDATE && ret)) { |
649 | if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); | 224 | recent_entry_update(t, e); |
650 | #endif | 225 | e->ttl = ttl; |
651 | return ans; | 226 | } |
227 | out: | ||
228 | spin_unlock_bh(&recent_lock); | ||
229 | return ret; | ||
652 | } | 230 | } |
653 | 231 | ||
654 | /* This function is to verify that the rule given during the userspace iptables | ||
655 | * command is correct. | ||
656 | * If the command is valid then we check if the table name referred to by the | ||
657 | * rule exists, if not it is created. | ||
658 | */ | ||
659 | static int | 232 | static int |
660 | checkentry(const char *tablename, | 233 | ipt_recent_checkentry(const char *tablename, const void *ip, |
661 | const void *ip, | 234 | const struct xt_match *match, void *matchinfo, |
662 | const struct xt_match *match, | 235 | unsigned int matchsize, unsigned int hook_mask) |
663 | void *matchinfo, | ||
664 | unsigned int matchsize, | ||
665 | unsigned int hook_mask) | ||
666 | { | 236 | { |
667 | int flag = 0, c; | ||
668 | unsigned long *hold; | ||
669 | const struct ipt_recent_info *info = matchinfo; | 237 | const struct ipt_recent_info *info = matchinfo; |
670 | struct recent_ip_tables *curr_table, *find_table, *last_table; | 238 | struct recent_table *t; |
671 | 239 | unsigned i; | |
672 | #ifdef DEBUG | 240 | int ret = 0; |
673 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); | ||
674 | #endif | ||
675 | |||
676 | /* seconds and hit_count only valid for CHECK/UPDATE */ | ||
677 | if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } | ||
678 | if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } | ||
679 | if(info->check_set & IPT_RECENT_CHECK) flag++; | ||
680 | if(info->check_set & IPT_RECENT_UPDATE) flag++; | ||
681 | |||
682 | /* One and only one of these should ever be set */ | ||
683 | if(flag != 1) return 0; | ||
684 | |||
685 | /* Name must be set to something */ | ||
686 | if(!info->name || !info->name[0]) return 0; | ||
687 | 241 | ||
688 | /* Things look good, create a list for this if it does not exist */ | 242 | if (hweight8(info->check_set & |
689 | /* Lock the linked list while we play with it */ | 243 | (IPT_RECENT_SET | IPT_RECENT_REMOVE | |
690 | spin_lock_bh(&recent_lock); | 244 | IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) |
691 | 245 | return 0; | |
692 | /* Look for an entry with this name already created */ | 246 | if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && |
693 | /* Finds the end of the list and the entry before the end if current name does not exist */ | 247 | (info->seconds || info->hit_count)) |
694 | find_table = r_tables; | 248 | return 0; |
695 | while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); | 249 | if (info->name[0] == '\0' || |
250 | strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) | ||
251 | return 0; | ||
696 | 252 | ||
697 | /* If a table already exists just increment the count on that table and return */ | 253 | mutex_lock(&recent_mutex); |
698 | if(find_table) { | 254 | t = recent_table_lookup(info->name); |
699 | #ifdef DEBUG | 255 | if (t != NULL) { |
700 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); | 256 | t->refcnt++; |
701 | #endif | 257 | ret = 1; |
702 | find_table->count++; | 258 | goto out; |
703 | spin_unlock_bh(&recent_lock); | ||
704 | return 1; | ||
705 | } | 259 | } |
706 | 260 | ||
707 | spin_unlock_bh(&recent_lock); | 261 | t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, |
708 | 262 | GFP_KERNEL); | |
709 | /* Table with this name not found */ | 263 | if (t == NULL) |
710 | /* Allocate memory for new linked list item */ | 264 | goto out; |
711 | 265 | t->refcnt = 1; | |
712 | #ifdef DEBUG | 266 | strcpy(t->name, info->name); |
713 | if(debug) { | 267 | INIT_LIST_HEAD(&t->lru_list); |
714 | printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); | 268 | for (i = 0; i < ip_list_hash_size; i++) |
715 | printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); | 269 | INIT_LIST_HEAD(&t->iphash[i]); |
270 | #ifdef CONFIG_PROC_FS | ||
271 | t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir); | ||
272 | if (t->proc == NULL) { | ||
273 | kfree(t); | ||
274 | goto out; | ||
716 | } | 275 | } |
276 | t->proc->proc_fops = &recent_fops; | ||
277 | t->proc->data = t; | ||
717 | #endif | 278 | #endif |
279 | spin_lock_bh(&recent_lock); | ||
280 | list_add_tail(&t->list, &tables); | ||
281 | spin_unlock_bh(&recent_lock); | ||
282 | ret = 1; | ||
283 | out: | ||
284 | mutex_unlock(&recent_mutex); | ||
285 | return ret; | ||
286 | } | ||
718 | 287 | ||
719 | curr_table = vmalloc(sizeof(struct recent_ip_tables)); | 288 | static void |
720 | if(curr_table == NULL) return 0; | 289 | ipt_recent_destroy(const struct xt_match *match, void *matchinfo, |
721 | 290 | unsigned int matchsize) | |
722 | spin_lock_init(&curr_table->list_lock); | 291 | { |
723 | curr_table->next = NULL; | 292 | const struct ipt_recent_info *info = matchinfo; |
724 | curr_table->count = 1; | 293 | struct recent_table *t; |
725 | curr_table->time_pos = 0; | ||
726 | strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); | ||
727 | curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; | ||
728 | |||
729 | /* Allocate memory for this table and the list of packets in each entry. */ | ||
730 | #ifdef DEBUG | ||
731 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", | ||
732 | sizeof(struct recent_ip_list)*ip_list_tot, | ||
733 | info->name); | ||
734 | #endif | ||
735 | |||
736 | curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); | ||
737 | if(curr_table->table == NULL) { vfree(curr_table); return 0; } | ||
738 | memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); | ||
739 | #ifdef DEBUG | ||
740 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", | ||
741 | sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); | ||
742 | #endif | ||
743 | |||
744 | hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); | ||
745 | #ifdef DEBUG | ||
746 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); | ||
747 | #endif | ||
748 | if(hold == NULL) { | ||
749 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); | ||
750 | vfree(curr_table->table); | ||
751 | vfree(curr_table); | ||
752 | return 0; | ||
753 | } | ||
754 | for(c = 0; c < ip_list_tot; c++) { | ||
755 | curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; | ||
756 | } | ||
757 | 294 | ||
758 | /* Allocate memory for the hash table */ | 295 | mutex_lock(&recent_mutex); |
759 | #ifdef DEBUG | 296 | t = recent_table_lookup(info->name); |
760 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", | 297 | if (--t->refcnt == 0) { |
761 | sizeof(int)*ip_list_hash_size); | 298 | spin_lock_bh(&recent_lock); |
299 | list_del(&t->list); | ||
300 | spin_unlock_bh(&recent_lock); | ||
301 | recent_table_flush(t); | ||
302 | #ifdef CONFIG_PROC_FS | ||
303 | remove_proc_entry(t->name, proc_dir); | ||
762 | #endif | 304 | #endif |
763 | 305 | kfree(t); | |
764 | curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); | ||
765 | if(!curr_table->hash_table) { | ||
766 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); | ||
767 | vfree(hold); | ||
768 | vfree(curr_table->table); | ||
769 | vfree(curr_table); | ||
770 | return 0; | ||
771 | } | ||
772 | |||
773 | for(c = 0; c < ip_list_hash_size; c++) { | ||
774 | curr_table->hash_table[c] = -1; | ||
775 | } | 306 | } |
307 | mutex_unlock(&recent_mutex); | ||
308 | } | ||
776 | 309 | ||
777 | /* Allocate memory for the time info */ | 310 | #ifdef CONFIG_PROC_FS |
778 | #ifdef DEBUG | 311 | struct recent_iter_state { |
779 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", | 312 | struct recent_table *table; |
780 | sizeof(struct time_info_list)*ip_list_tot); | 313 | unsigned int bucket; |
781 | #endif | 314 | }; |
782 | 315 | ||
783 | curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); | 316 | static void *recent_seq_start(struct seq_file *seq, loff_t *pos) |
784 | if(!curr_table->time_info) { | 317 | { |
785 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); | 318 | struct recent_iter_state *st = seq->private; |
786 | vfree(curr_table->hash_table); | 319 | struct recent_table *t = st->table; |
787 | vfree(hold); | 320 | struct recent_entry *e; |
788 | vfree(curr_table->table); | 321 | loff_t p = *pos; |
789 | vfree(curr_table); | ||
790 | return 0; | ||
791 | } | ||
792 | for(c = 0; c < ip_list_tot; c++) { | ||
793 | curr_table->time_info[c].position = c; | ||
794 | curr_table->time_info[c].time = 0; | ||
795 | } | ||
796 | 322 | ||
797 | /* Put the new table in place */ | ||
798 | spin_lock_bh(&recent_lock); | 323 | spin_lock_bh(&recent_lock); |
799 | find_table = r_tables; | ||
800 | while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); | ||
801 | |||
802 | /* If a table already exists just increment the count on that table and return */ | ||
803 | if(find_table) { | ||
804 | find_table->count++; | ||
805 | spin_unlock_bh(&recent_lock); | ||
806 | #ifdef DEBUG | ||
807 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); | ||
808 | #endif | ||
809 | vfree(curr_table->time_info); | ||
810 | vfree(curr_table->hash_table); | ||
811 | vfree(hold); | ||
812 | vfree(curr_table->table); | ||
813 | vfree(curr_table); | ||
814 | return 1; | ||
815 | } | ||
816 | if(!last_table) r_tables = curr_table; else last_table->next = curr_table; | ||
817 | |||
818 | spin_unlock_bh(&recent_lock); | ||
819 | 324 | ||
820 | #ifdef CONFIG_PROC_FS | 325 | for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) { |
821 | /* Create our proc 'status' entry. */ | 326 | list_for_each_entry(e, &t->iphash[st->bucket], list) { |
822 | curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); | 327 | if (p-- == 0) |
823 | if (!curr_table->status_proc) { | 328 | return e; |
824 | vfree(hold); | ||
825 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); | ||
826 | /* Destroy the created table */ | ||
827 | spin_lock_bh(&recent_lock); | ||
828 | last_table = NULL; | ||
829 | curr_table = r_tables; | ||
830 | if(!curr_table) { | ||
831 | #ifdef DEBUG | ||
832 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); | ||
833 | #endif | ||
834 | spin_unlock_bh(&recent_lock); | ||
835 | return 0; | ||
836 | } | ||
837 | while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); | ||
838 | if(!curr_table) { | ||
839 | #ifdef DEBUG | ||
840 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); | ||
841 | #endif | ||
842 | spin_unlock_bh(&recent_lock); | ||
843 | return 0; | ||
844 | } | 329 | } |
845 | if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; | ||
846 | spin_unlock_bh(&recent_lock); | ||
847 | vfree(curr_table->time_info); | ||
848 | vfree(curr_table->hash_table); | ||
849 | vfree(curr_table->table); | ||
850 | vfree(curr_table); | ||
851 | return 0; | ||
852 | } | 330 | } |
853 | 331 | return NULL; | |
854 | curr_table->status_proc->owner = THIS_MODULE; | 332 | } |
855 | curr_table->status_proc->data = curr_table; | ||
856 | wmb(); | ||
857 | curr_table->status_proc->read_proc = ip_recent_get_info; | ||
858 | curr_table->status_proc->write_proc = ip_recent_ctrl; | ||
859 | #endif /* CONFIG_PROC_FS */ | ||
860 | |||
861 | #ifdef DEBUG | ||
862 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); | ||
863 | #endif | ||
864 | 333 | ||
865 | return 1; | 334 | static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
335 | { | ||
336 | struct recent_iter_state *st = seq->private; | ||
337 | struct recent_table *t = st->table; | ||
338 | struct recent_entry *e = v; | ||
339 | struct list_head *head = e->list.next; | ||
340 | |||
341 | while (head == &t->iphash[st->bucket]) { | ||
342 | if (++st->bucket >= ip_list_hash_size) | ||
343 | return NULL; | ||
344 | head = t->iphash[st->bucket].next; | ||
345 | } | ||
346 | (*pos)++; | ||
347 | return list_entry(head, struct recent_entry, list); | ||
866 | } | 348 | } |
867 | 349 | ||
868 | /* This function is called in the event that a rule matching this module is | 350 | static void recent_seq_stop(struct seq_file *s, void *v) |
869 | * removed. | ||
870 | * When this happens we need to check if there are no other rules matching | ||
871 | * the table given. If that is the case then we remove the table and clean | ||
872 | * up its memory. | ||
873 | */ | ||
874 | static void | ||
875 | destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) | ||
876 | { | 351 | { |
877 | const struct ipt_recent_info *info = matchinfo; | 352 | spin_unlock_bh(&recent_lock); |
878 | struct recent_ip_tables *curr_table, *last_table; | 353 | } |
879 | 354 | ||
880 | #ifdef DEBUG | 355 | static int recent_seq_show(struct seq_file *seq, void *v) |
881 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); | 356 | { |
882 | #endif | 357 | struct recent_entry *e = v; |
358 | unsigned int i; | ||
359 | |||
360 | i = (e->index - 1) % ip_pkt_list_tot; | ||
361 | seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", | ||
362 | NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); | ||
363 | for (i = 0; i < e->nstamps; i++) | ||
364 | seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); | ||
365 | seq_printf(seq, "\n"); | ||
366 | return 0; | ||
367 | } | ||
883 | 368 | ||
884 | if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; | 369 | static struct seq_operations recent_seq_ops = { |
370 | .start = recent_seq_start, | ||
371 | .next = recent_seq_next, | ||
372 | .stop = recent_seq_stop, | ||
373 | .show = recent_seq_show, | ||
374 | }; | ||
885 | 375 | ||
886 | /* Lock the linked list while we play with it */ | 376 | static int recent_seq_open(struct inode *inode, struct file *file) |
887 | spin_lock_bh(&recent_lock); | 377 | { |
378 | struct proc_dir_entry *pde = PDE(inode); | ||
379 | struct seq_file *seq; | ||
380 | struct recent_iter_state *st; | ||
381 | int ret; | ||
382 | |||
383 | st = kzalloc(sizeof(*st), GFP_KERNEL); | ||
384 | if (st == NULL) | ||
385 | return -ENOMEM; | ||
386 | ret = seq_open(file, &recent_seq_ops); | ||
387 | if (ret) | ||
388 | kfree(st); | ||
389 | st->table = pde->data; | ||
390 | seq = file->private_data; | ||
391 | seq->private = st; | ||
392 | return ret; | ||
393 | } | ||
888 | 394 | ||
889 | /* Look for an entry with this name already created */ | 395 | static ssize_t recent_proc_write(struct file *file, const char __user *input, |
890 | /* Finds the end of the list and the entry before the end if current name does not exist */ | 396 | size_t size, loff_t *loff) |
891 | last_table = NULL; | 397 | { |
892 | curr_table = r_tables; | 398 | struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); |
893 | if(!curr_table) { | 399 | struct recent_table *t = pde->data; |
894 | #ifdef DEBUG | 400 | struct recent_entry *e; |
895 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); | 401 | char buf[sizeof("+255.255.255.255")], *c = buf; |
896 | #endif | 402 | u_int32_t addr; |
403 | int add; | ||
404 | |||
405 | if (size > sizeof(buf)) | ||
406 | size = sizeof(buf); | ||
407 | if (copy_from_user(buf, input, size)) | ||
408 | return -EFAULT; | ||
409 | while (isspace(*c)) | ||
410 | c++; | ||
411 | |||
412 | if (size - (c - buf) < 5) | ||
413 | return c - buf; | ||
414 | if (!strncmp(c, "clear", 5)) { | ||
415 | c += 5; | ||
416 | spin_lock_bh(&recent_lock); | ||
417 | recent_table_flush(t); | ||
897 | spin_unlock_bh(&recent_lock); | 418 | spin_unlock_bh(&recent_lock); |
898 | return; | 419 | return c - buf; |
899 | } | 420 | } |
900 | while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); | ||
901 | 421 | ||
902 | /* If a table does not exist then do nothing and return */ | 422 | switch (*c) { |
903 | if(!curr_table) { | 423 | case '-': |
904 | #ifdef DEBUG | 424 | add = 0; |
905 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); | 425 | c++; |
906 | #endif | 426 | break; |
907 | spin_unlock_bh(&recent_lock); | 427 | case '+': |
908 | return; | 428 | c++; |
429 | default: | ||
430 | add = 1; | ||
431 | break; | ||
909 | } | 432 | } |
433 | addr = in_aton(c); | ||
910 | 434 | ||
911 | curr_table->count--; | 435 | spin_lock_bh(&recent_lock); |
912 | 436 | e = recent_entry_lookup(t, addr, 0); | |
913 | /* If count is still non-zero then there are still rules referenceing it so we do nothing */ | 437 | if (e == NULL) { |
914 | if(curr_table->count) { | 438 | if (add) |
915 | #ifdef DEBUG | 439 | recent_entry_init(t, addr, 0); |
916 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); | 440 | } else { |
917 | #endif | 441 | if (add) |
918 | spin_unlock_bh(&recent_lock); | 442 | recent_entry_update(t, e); |
919 | return; | 443 | else |
444 | recent_entry_remove(t, e); | ||
920 | } | 445 | } |
921 | |||
922 | #ifdef DEBUG | ||
923 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); | ||
924 | #endif | ||
925 | |||
926 | /* Count must be zero so we remove this table from the list */ | ||
927 | if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; | ||
928 | |||
929 | spin_unlock_bh(&recent_lock); | 446 | spin_unlock_bh(&recent_lock); |
447 | return size; | ||
448 | } | ||
930 | 449 | ||
931 | /* lock to make sure any late-runners still using this after we removed it from | 450 | static struct file_operations recent_fops = { |
932 | * the list finish up then remove everything */ | 451 | .open = recent_seq_open, |
933 | spin_lock_bh(&curr_table->list_lock); | 452 | .read = seq_read, |
934 | spin_unlock_bh(&curr_table->list_lock); | 453 | .write = recent_proc_write, |
935 | 454 | .release = seq_release_private, | |
936 | #ifdef CONFIG_PROC_FS | 455 | .owner = THIS_MODULE, |
937 | if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); | 456 | }; |
938 | #endif /* CONFIG_PROC_FS */ | 457 | #endif /* CONFIG_PROC_FS */ |
939 | vfree(curr_table->table[0].last_pkts); | ||
940 | vfree(curr_table->table); | ||
941 | vfree(curr_table->hash_table); | ||
942 | vfree(curr_table->time_info); | ||
943 | vfree(curr_table); | ||
944 | |||
945 | #ifdef DEBUG | ||
946 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); | ||
947 | #endif | ||
948 | 458 | ||
949 | return; | ||
950 | } | ||
951 | |||
952 | /* This is the structure we pass to ipt_register to register our | ||
953 | * module with iptables. | ||
954 | */ | ||
955 | static struct ipt_match recent_match = { | 459 | static struct ipt_match recent_match = { |
956 | .name = "recent", | 460 | .name = "recent", |
957 | .match = match, | 461 | .match = ipt_recent_match, |
958 | .matchsize = sizeof(struct ipt_recent_info), | 462 | .matchsize = sizeof(struct ipt_recent_info), |
959 | .checkentry = checkentry, | 463 | .checkentry = ipt_recent_checkentry, |
960 | .destroy = destroy, | 464 | .destroy = ipt_recent_destroy, |
961 | .me = THIS_MODULE | 465 | .me = THIS_MODULE, |
962 | }; | 466 | }; |
963 | 467 | ||
964 | /* Kernel module initialization. */ | ||
965 | static int __init ipt_recent_init(void) | 468 | static int __init ipt_recent_init(void) |
966 | { | 469 | { |
967 | int err, count; | 470 | int err; |
968 | 471 | ||
969 | printk(version); | 472 | if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) |
970 | #ifdef CONFIG_PROC_FS | 473 | return -EINVAL; |
971 | proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); | 474 | ip_list_hash_size = 1 << fls(ip_list_tot); |
972 | if(!proc_net_ipt_recent) return -ENOMEM; | ||
973 | #endif | ||
974 | |||
975 | if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { | ||
976 | printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); | ||
977 | ip_list_hash_size = 0; | ||
978 | } | ||
979 | |||
980 | if(!ip_list_hash_size) { | ||
981 | ip_list_hash_size = ip_list_tot*3; | ||
982 | count = 2*2; | ||
983 | while(ip_list_hash_size > count) count = count*2; | ||
984 | ip_list_hash_size = count; | ||
985 | } | ||
986 | |||
987 | #ifdef DEBUG | ||
988 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); | ||
989 | #endif | ||
990 | 475 | ||
991 | err = ipt_register_match(&recent_match); | 476 | err = ipt_register_match(&recent_match); |
477 | #ifdef CONFIG_PROC_FS | ||
992 | if (err) | 478 | if (err) |
993 | remove_proc_entry("ipt_recent", proc_net); | 479 | return err; |
480 | proc_dir = proc_mkdir("ipt_recent", proc_net); | ||
481 | if (proc_dir == NULL) { | ||
482 | ipt_unregister_match(&recent_match); | ||
483 | err = -ENOMEM; | ||
484 | } | ||
485 | #endif | ||
994 | return err; | 486 | return err; |
995 | } | 487 | } |
996 | 488 | ||
997 | /* Kernel module destruction. */ | 489 | static void __exit ipt_recent_exit(void) |
998 | static void __exit ipt_recent_fini(void) | ||
999 | { | 490 | { |
491 | BUG_ON(!list_empty(&tables)); | ||
1000 | ipt_unregister_match(&recent_match); | 492 | ipt_unregister_match(&recent_match); |
1001 | 493 | #ifdef CONFIG_PROC_FS | |
1002 | remove_proc_entry("ipt_recent",proc_net); | 494 | remove_proc_entry("ipt_recent", proc_net); |
495 | #endif | ||
1003 | } | 496 | } |
1004 | 497 | ||
1005 | /* Register our module with the kernel. */ | ||
1006 | module_init(ipt_recent_init); | 498 | module_init(ipt_recent_init); |
1007 | module_exit(ipt_recent_fini); | 499 | module_exit(ipt_recent_exit); |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 77d974443c7b..8cc8e1b36778 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, | |||
145 | 145 | ||
146 | /* This is where we call the helper: as the packet goes out. */ | 146 | /* This is where we call the helper: as the packet goes out. */ |
147 | ct = nf_ct_get(*pskb, &ctinfo); | 147 | ct = nf_ct_get(*pskb, &ctinfo); |
148 | if (!ct) | 148 | if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) |
149 | return NF_ACCEPT; | 149 | return NF_ACCEPT; |
150 | 150 | ||
151 | help = nfct_help(ct); | 151 | help = nfct_help(ct); |
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 4b0d361cc6e6..663a73ee3f2f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, | |||
235 | } | 235 | } |
236 | 236 | ||
237 | /* See ip_conntrack_proto_tcp.c */ | 237 | /* See ip_conntrack_proto_tcp.c */ |
238 | if (hooknum == NF_IP_PRE_ROUTING && | 238 | if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
239 | nf_ip_checksum(skb, hooknum, dataoff, 0)) { | 239 | nf_ip_checksum(skb, hooknum, dataoff, 0)) { |
240 | if (LOG_INVALID(IPPROTO_ICMP)) | 240 | if (LOG_INVALID(IPPROTO_ICMP)) |
241 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 241 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc2562415555..bd221ec3f81e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk) | |||
103 | } | 103 | } |
104 | 104 | ||
105 | struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, | 105 | struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, |
106 | unsigned long raddr, unsigned long laddr, | 106 | __be32 raddr, __be32 laddr, |
107 | int dif) | 107 | int dif) |
108 | { | 108 | { |
109 | struct hlist_node *node; | 109 | struct hlist_node *node; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3adfcf00..ce4cd5f35511 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -182,14 +182,6 @@ ctl_table ipv4_table[] = { | |||
182 | .strategy = &ipv4_doint_and_flush_strategy, | 182 | .strategy = &ipv4_doint_and_flush_strategy, |
183 | }, | 183 | }, |
184 | { | 184 | { |
185 | .ctl_name = NET_IPV4_AUTOCONFIG, | ||
186 | .procname = "ip_autoconfig", | ||
187 | .data = &ipv4_config.autoconfig, | ||
188 | .maxlen = sizeof(int), | ||
189 | .mode = 0644, | ||
190 | .proc_handler = &proc_dointvec | ||
191 | }, | ||
192 | { | ||
193 | .ctl_name = NET_IPV4_NO_PMTU_DISC, | 185 | .ctl_name = NET_IPV4_NO_PMTU_DISC, |
194 | .procname = "ip_no_pmtu_disc", | 186 | .procname = "ip_no_pmtu_disc", |
195 | .data = &ipv4_config.no_pmtu_disc, | 187 | .data = &ipv4_config.no_pmtu_disc, |
@@ -688,6 +680,24 @@ ctl_table ipv4_table[] = { | |||
688 | .mode = 0644, | 680 | .mode = 0644, |
689 | .proc_handler = &proc_dointvec | 681 | .proc_handler = &proc_dointvec |
690 | }, | 682 | }, |
683 | #ifdef CONFIG_NET_DMA | ||
684 | { | ||
685 | .ctl_name = NET_TCP_DMA_COPYBREAK, | ||
686 | .procname = "tcp_dma_copybreak", | ||
687 | .data = &sysctl_tcp_dma_copybreak, | ||
688 | .maxlen = sizeof(int), | ||
689 | .mode = 0644, | ||
690 | .proc_handler = &proc_dointvec | ||
691 | }, | ||
692 | #endif | ||
693 | { | ||
694 | .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, | ||
695 | .procname = "tcp_slow_start_after_idle", | ||
696 | .data = &sysctl_tcp_slow_start_after_idle, | ||
697 | .maxlen = sizeof(int), | ||
698 | .mode = 0644, | ||
699 | .proc_handler = &proc_dointvec | ||
700 | }, | ||
691 | { .ctl_name = 0 } | 701 | { .ctl_name = 0 } |
692 | }; | 702 | }; |
693 | 703 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b8055037..74998f250071 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -263,7 +263,7 @@ | |||
263 | #include <net/tcp.h> | 263 | #include <net/tcp.h> |
264 | #include <net/xfrm.h> | 264 | #include <net/xfrm.h> |
265 | #include <net/ip.h> | 265 | #include <net/ip.h> |
266 | 266 | #include <net/netdma.h> | |
267 | 267 | ||
268 | #include <asm/uaccess.h> | 268 | #include <asm/uaccess.h> |
269 | #include <asm/ioctls.h> | 269 | #include <asm/ioctls.h> |
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, | |||
622 | ssize_t res; | 622 | ssize_t res; |
623 | struct sock *sk = sock->sk; | 623 | struct sock *sk = sock->sk; |
624 | 624 | ||
625 | #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) | ||
626 | |||
627 | if (!(sk->sk_route_caps & NETIF_F_SG) || | 625 | if (!(sk->sk_route_caps & NETIF_F_SG) || |
628 | !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) | 626 | !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) |
629 | return sock_no_sendpage(sock, page, offset, size, flags); | 627 | return sock_no_sendpage(sock, page, offset, size, flags); |
630 | 628 | ||
631 | #undef TCP_ZC_CSUM_FLAGS | ||
632 | |||
633 | lock_sock(sk); | 629 | lock_sock(sk); |
634 | TCP_CHECK_TIMER(sk); | 630 | TCP_CHECK_TIMER(sk); |
635 | res = do_tcp_sendpages(sk, &page, offset, size, flags); | 631 | res = do_tcp_sendpages(sk, &page, offset, size, flags); |
@@ -726,9 +722,7 @@ new_segment: | |||
726 | /* | 722 | /* |
727 | * Check whether we can use HW checksum. | 723 | * Check whether we can use HW checksum. |
728 | */ | 724 | */ |
729 | if (sk->sk_route_caps & | 725 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) |
730 | (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | | ||
731 | NETIF_F_HW_CSUM)) | ||
732 | skb->ip_summed = CHECKSUM_HW; | 726 | skb->ip_summed = CHECKSUM_HW; |
733 | 727 | ||
734 | skb_entail(sk, tp, skb); | 728 | skb_entail(sk, tp, skb); |
@@ -937,7 +931,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo, | |||
937 | * calculation of whether or not we must ACK for the sake of | 931 | * calculation of whether or not we must ACK for the sake of |
938 | * a window update. | 932 | * a window update. |
939 | */ | 933 | */ |
940 | static void cleanup_rbuf(struct sock *sk, int copied) | 934 | void tcp_cleanup_rbuf(struct sock *sk, int copied) |
941 | { | 935 | { |
942 | struct tcp_sock *tp = tcp_sk(sk); | 936 | struct tcp_sock *tp = tcp_sk(sk); |
943 | int time_to_ack = 0; | 937 | int time_to_ack = 0; |
@@ -1072,11 +1066,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
1072 | break; | 1066 | break; |
1073 | } | 1067 | } |
1074 | if (skb->h.th->fin) { | 1068 | if (skb->h.th->fin) { |
1075 | sk_eat_skb(sk, skb); | 1069 | sk_eat_skb(sk, skb, 0); |
1076 | ++seq; | 1070 | ++seq; |
1077 | break; | 1071 | break; |
1078 | } | 1072 | } |
1079 | sk_eat_skb(sk, skb); | 1073 | sk_eat_skb(sk, skb, 0); |
1080 | if (!desc->count) | 1074 | if (!desc->count) |
1081 | break; | 1075 | break; |
1082 | } | 1076 | } |
@@ -1086,7 +1080,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
1086 | 1080 | ||
1087 | /* Clean up data we have read: This will do ACK frames. */ | 1081 | /* Clean up data we have read: This will do ACK frames. */ |
1088 | if (copied) | 1082 | if (copied) |
1089 | cleanup_rbuf(sk, copied); | 1083 | tcp_cleanup_rbuf(sk, copied); |
1090 | return copied; | 1084 | return copied; |
1091 | } | 1085 | } |
1092 | 1086 | ||
@@ -1110,6 +1104,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1110 | int target; /* Read at least this many bytes */ | 1104 | int target; /* Read at least this many bytes */ |
1111 | long timeo; | 1105 | long timeo; |
1112 | struct task_struct *user_recv = NULL; | 1106 | struct task_struct *user_recv = NULL; |
1107 | int copied_early = 0; | ||
1113 | 1108 | ||
1114 | lock_sock(sk); | 1109 | lock_sock(sk); |
1115 | 1110 | ||
@@ -1133,6 +1128,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1133 | 1128 | ||
1134 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); | 1129 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); |
1135 | 1130 | ||
1131 | #ifdef CONFIG_NET_DMA | ||
1132 | tp->ucopy.dma_chan = NULL; | ||
1133 | preempt_disable(); | ||
1134 | if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && | ||
1135 | !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { | ||
1136 | preempt_enable_no_resched(); | ||
1137 | tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); | ||
1138 | } else | ||
1139 | preempt_enable_no_resched(); | ||
1140 | #endif | ||
1141 | |||
1136 | do { | 1142 | do { |
1137 | struct sk_buff *skb; | 1143 | struct sk_buff *skb; |
1138 | u32 offset; | 1144 | u32 offset; |
@@ -1220,7 +1226,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1220 | } | 1226 | } |
1221 | } | 1227 | } |
1222 | 1228 | ||
1223 | cleanup_rbuf(sk, copied); | 1229 | tcp_cleanup_rbuf(sk, copied); |
1224 | 1230 | ||
1225 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { | 1231 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { |
1226 | /* Install new reader */ | 1232 | /* Install new reader */ |
@@ -1274,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1274 | } else | 1280 | } else |
1275 | sk_wait_data(sk, &timeo); | 1281 | sk_wait_data(sk, &timeo); |
1276 | 1282 | ||
1283 | #ifdef CONFIG_NET_DMA | ||
1284 | tp->ucopy.wakeup = 0; | ||
1285 | #endif | ||
1286 | |||
1277 | if (user_recv) { | 1287 | if (user_recv) { |
1278 | int chunk; | 1288 | int chunk; |
1279 | 1289 | ||
@@ -1329,13 +1339,39 @@ do_prequeue: | |||
1329 | } | 1339 | } |
1330 | 1340 | ||
1331 | if (!(flags & MSG_TRUNC)) { | 1341 | if (!(flags & MSG_TRUNC)) { |
1332 | err = skb_copy_datagram_iovec(skb, offset, | 1342 | #ifdef CONFIG_NET_DMA |
1333 | msg->msg_iov, used); | 1343 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
1334 | if (err) { | 1344 | tp->ucopy.dma_chan = get_softnet_dma(); |
1335 | /* Exception. Bailout! */ | 1345 | |
1336 | if (!copied) | 1346 | if (tp->ucopy.dma_chan) { |
1337 | copied = -EFAULT; | 1347 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( |
1338 | break; | 1348 | tp->ucopy.dma_chan, skb, offset, |
1349 | msg->msg_iov, used, | ||
1350 | tp->ucopy.pinned_list); | ||
1351 | |||
1352 | if (tp->ucopy.dma_cookie < 0) { | ||
1353 | |||
1354 | printk(KERN_ALERT "dma_cookie < 0\n"); | ||
1355 | |||
1356 | /* Exception. Bailout! */ | ||
1357 | if (!copied) | ||
1358 | copied = -EFAULT; | ||
1359 | break; | ||
1360 | } | ||
1361 | if ((offset + used) == skb->len) | ||
1362 | copied_early = 1; | ||
1363 | |||
1364 | } else | ||
1365 | #endif | ||
1366 | { | ||
1367 | err = skb_copy_datagram_iovec(skb, offset, | ||
1368 | msg->msg_iov, used); | ||
1369 | if (err) { | ||
1370 | /* Exception. Bailout! */ | ||
1371 | if (!copied) | ||
1372 | copied = -EFAULT; | ||
1373 | break; | ||
1374 | } | ||
1339 | } | 1375 | } |
1340 | } | 1376 | } |
1341 | 1377 | ||
@@ -1355,15 +1391,19 @@ skip_copy: | |||
1355 | 1391 | ||
1356 | if (skb->h.th->fin) | 1392 | if (skb->h.th->fin) |
1357 | goto found_fin_ok; | 1393 | goto found_fin_ok; |
1358 | if (!(flags & MSG_PEEK)) | 1394 | if (!(flags & MSG_PEEK)) { |
1359 | sk_eat_skb(sk, skb); | 1395 | sk_eat_skb(sk, skb, copied_early); |
1396 | copied_early = 0; | ||
1397 | } | ||
1360 | continue; | 1398 | continue; |
1361 | 1399 | ||
1362 | found_fin_ok: | 1400 | found_fin_ok: |
1363 | /* Process the FIN. */ | 1401 | /* Process the FIN. */ |
1364 | ++*seq; | 1402 | ++*seq; |
1365 | if (!(flags & MSG_PEEK)) | 1403 | if (!(flags & MSG_PEEK)) { |
1366 | sk_eat_skb(sk, skb); | 1404 | sk_eat_skb(sk, skb, copied_early); |
1405 | copied_early = 0; | ||
1406 | } | ||
1367 | break; | 1407 | break; |
1368 | } while (len > 0); | 1408 | } while (len > 0); |
1369 | 1409 | ||
@@ -1386,12 +1426,42 @@ skip_copy: | |||
1386 | tp->ucopy.len = 0; | 1426 | tp->ucopy.len = 0; |
1387 | } | 1427 | } |
1388 | 1428 | ||
1429 | #ifdef CONFIG_NET_DMA | ||
1430 | if (tp->ucopy.dma_chan) { | ||
1431 | struct sk_buff *skb; | ||
1432 | dma_cookie_t done, used; | ||
1433 | |||
1434 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); | ||
1435 | |||
1436 | while (dma_async_memcpy_complete(tp->ucopy.dma_chan, | ||
1437 | tp->ucopy.dma_cookie, &done, | ||
1438 | &used) == DMA_IN_PROGRESS) { | ||
1439 | /* do partial cleanup of sk_async_wait_queue */ | ||
1440 | while ((skb = skb_peek(&sk->sk_async_wait_queue)) && | ||
1441 | (dma_async_is_complete(skb->dma_cookie, done, | ||
1442 | used) == DMA_SUCCESS)) { | ||
1443 | __skb_dequeue(&sk->sk_async_wait_queue); | ||
1444 | kfree_skb(skb); | ||
1445 | } | ||
1446 | } | ||
1447 | |||
1448 | /* Safe to free early-copied skbs now */ | ||
1449 | __skb_queue_purge(&sk->sk_async_wait_queue); | ||
1450 | dma_chan_put(tp->ucopy.dma_chan); | ||
1451 | tp->ucopy.dma_chan = NULL; | ||
1452 | } | ||
1453 | if (tp->ucopy.pinned_list) { | ||
1454 | dma_unpin_iovec_pages(tp->ucopy.pinned_list); | ||
1455 | tp->ucopy.pinned_list = NULL; | ||
1456 | } | ||
1457 | #endif | ||
1458 | |||
1389 | /* According to UNIX98, msg_name/msg_namelen are ignored | 1459 | /* According to UNIX98, msg_name/msg_namelen are ignored |
1390 | * on connected socket. I was just happy when found this 8) --ANK | 1460 | * on connected socket. I was just happy when found this 8) --ANK |
1391 | */ | 1461 | */ |
1392 | 1462 | ||
1393 | /* Clean up data we have read: This will do ACK frames. */ | 1463 | /* Clean up data we have read: This will do ACK frames. */ |
1394 | cleanup_rbuf(sk, copied); | 1464 | tcp_cleanup_rbuf(sk, copied); |
1395 | 1465 | ||
1396 | TCP_CHECK_TIMER(sk); | 1466 | TCP_CHECK_TIMER(sk); |
1397 | release_sock(sk); | 1467 | release_sock(sk); |
@@ -1658,6 +1728,9 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1658 | __skb_queue_purge(&sk->sk_receive_queue); | 1728 | __skb_queue_purge(&sk->sk_receive_queue); |
1659 | sk_stream_writequeue_purge(sk); | 1729 | sk_stream_writequeue_purge(sk); |
1660 | __skb_queue_purge(&tp->out_of_order_queue); | 1730 | __skb_queue_purge(&tp->out_of_order_queue); |
1731 | #ifdef CONFIG_NET_DMA | ||
1732 | __skb_queue_purge(&sk->sk_async_wait_queue); | ||
1733 | #endif | ||
1661 | 1734 | ||
1662 | inet->dport = 0; | 1735 | inet->dport = 0; |
1663 | 1736 | ||
@@ -1858,7 +1931,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
1858 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && | 1931 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
1859 | inet_csk_ack_scheduled(sk)) { | 1932 | inet_csk_ack_scheduled(sk)) { |
1860 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; | 1933 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
1861 | cleanup_rbuf(sk, 1); | 1934 | tcp_cleanup_rbuf(sk, 1); |
1862 | if (!(val & 1)) | 1935 | if (!(val & 1)) |
1863 | icsk->icsk_ack.pingpong = 1; | 1936 | icsk->icsk_ack.pingpong = 1; |
1864 | } | 1937 | } |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 035f2092d73a..b2d9021ad22b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) | |||
198 | return max(tp->snd_cwnd, ca->last_max_cwnd); | 198 | return max(tp->snd_cwnd, ca->last_max_cwnd); |
199 | } | 199 | } |
200 | 200 | ||
201 | static u32 bictcp_min_cwnd(struct sock *sk) | ||
202 | { | ||
203 | const struct tcp_sock *tp = tcp_sk(sk); | ||
204 | return tp->snd_ssthresh; | ||
205 | } | ||
206 | |||
207 | static void bictcp_state(struct sock *sk, u8 new_state) | 201 | static void bictcp_state(struct sock *sk, u8 new_state) |
208 | { | 202 | { |
209 | if (new_state == TCP_CA_Loss) | 203 | if (new_state == TCP_CA_Loss) |
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = { | |||
231 | .cong_avoid = bictcp_cong_avoid, | 225 | .cong_avoid = bictcp_cong_avoid, |
232 | .set_state = bictcp_state, | 226 | .set_state = bictcp_state, |
233 | .undo_cwnd = bictcp_undo_cwnd, | 227 | .undo_cwnd = bictcp_undo_cwnd, |
234 | .min_cwnd = bictcp_min_cwnd, | ||
235 | .pkts_acked = bictcp_acked, | 228 | .pkts_acked = bictcp_acked, |
236 | .owner = THIS_MODULE, | 229 | .owner = THIS_MODULE, |
237 | .name = "bic", | 230 | .name = "bic", |
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c new file mode 100644 index 000000000000..bc54f7e9aea9 --- /dev/null +++ b/net/ipv4/tcp_compound.c | |||
@@ -0,0 +1,448 @@ | |||
1 | /* | ||
2 | * TCP Vegas congestion control | ||
3 | * | ||
4 | * This is based on the congestion detection/avoidance scheme described in | ||
5 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
6 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
7 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
8 | * October 1995. Available from: | ||
9 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
10 | * | ||
11 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
12 | * The main aspects that distinguish this implementation from the | ||
13 | * Arizona Vegas implementation are: | ||
14 | * o We do not change the loss detection or recovery mechanisms of | ||
15 | * Linux in any way. Linux already recovers from losses quite well, | ||
16 | * using fine-grained timers, NewReno, and FACK. | ||
17 | * o To avoid the performance penalty imposed by increasing cwnd | ||
18 | * only every-other RTT during slow start, we increase during | ||
19 | * every RTT during slow start, just like Reno. | ||
20 | * o Largely to allow continuous cwnd growth during slow start, | ||
21 | * we use the rate at which ACKs come back as the "actual" | ||
22 | * rate, rather than the rate at which data is sent. | ||
23 | * o To speed convergence to the right rate, we set the cwnd | ||
24 | * to achieve the right ("actual") rate when we exit slow start. | ||
25 | * o To filter out the noise caused by delayed ACKs, we use the | ||
26 | * minimum RTT sample observed during the last RTT to calculate | ||
27 | * the actual rate. | ||
28 | * o When the sender re-starts from idle, it waits until it has | ||
29 | * received ACKs for an entire flight of new data before making | ||
30 | * a cwnd adjustment decision. The original Vegas implementation | ||
31 | * assumed senders never went idle. | ||
32 | * | ||
33 | * | ||
34 | * TCP Compound based on TCP Vegas | ||
35 | * | ||
36 | * further details can be found here: | ||
37 | * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf | ||
38 | */ | ||
39 | |||
40 | #include <linux/config.h> | ||
41 | #include <linux/mm.h> | ||
42 | #include <linux/module.h> | ||
43 | #include <linux/skbuff.h> | ||
44 | #include <linux/inet_diag.h> | ||
45 | |||
46 | #include <net/tcp.h> | ||
47 | |||
48 | /* Default values of the Vegas variables, in fixed-point representation | ||
49 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
50 | */ | ||
51 | #define V_PARAM_SHIFT 1 | ||
52 | |||
53 | #define TCP_COMPOUND_ALPHA 3U | ||
54 | #define TCP_COMPOUND_BETA 1U | ||
55 | #define TCP_COMPOUND_GAMMA 30 | ||
56 | #define TCP_COMPOUND_ZETA 1 | ||
57 | |||
58 | /* TCP compound variables */ | ||
59 | struct compound { | ||
60 | u32 beg_snd_nxt; /* right edge during last RTT */ | ||
61 | u32 beg_snd_una; /* left edge during last RTT */ | ||
62 | u32 beg_snd_cwnd; /* saves the size of the cwnd */ | ||
63 | u8 doing_vegas_now; /* if true, do vegas for this RTT */ | ||
64 | u16 cntRTT; /* # of RTTs measured within last RTT */ | ||
65 | u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ | ||
66 | u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ | ||
67 | |||
68 | u32 cwnd; | ||
69 | u32 dwnd; | ||
70 | }; | ||
71 | |||
72 | /* There are several situations when we must "re-start" Vegas: | ||
73 | * | ||
74 | * o when a connection is established | ||
75 | * o after an RTO | ||
76 | * o after fast recovery | ||
77 | * o when we send a packet and there is no outstanding | ||
78 | * unacknowledged data (restarting an idle connection) | ||
79 | * | ||
80 | * In these circumstances we cannot do a Vegas calculation at the | ||
81 | * end of the first RTT, because any calculation we do is using | ||
82 | * stale info -- both the saved cwnd and congestion feedback are | ||
83 | * stale. | ||
84 | * | ||
85 | * Instead we must wait until the completion of an RTT during | ||
86 | * which we actually receive ACKs. | ||
87 | */ | ||
88 | static inline void vegas_enable(struct sock *sk) | ||
89 | { | ||
90 | const struct tcp_sock *tp = tcp_sk(sk); | ||
91 | struct compound *vegas = inet_csk_ca(sk); | ||
92 | |||
93 | /* Begin taking Vegas samples next time we send something. */ | ||
94 | vegas->doing_vegas_now = 1; | ||
95 | |||
96 | /* Set the beginning of the next send window. */ | ||
97 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
98 | |||
99 | vegas->cntRTT = 0; | ||
100 | vegas->minRTT = 0x7fffffff; | ||
101 | } | ||
102 | |||
103 | /* Stop taking Vegas samples for now. */ | ||
104 | static inline void vegas_disable(struct sock *sk) | ||
105 | { | ||
106 | struct compound *vegas = inet_csk_ca(sk); | ||
107 | |||
108 | vegas->doing_vegas_now = 0; | ||
109 | } | ||
110 | |||
111 | static void tcp_compound_init(struct sock *sk) | ||
112 | { | ||
113 | struct compound *vegas = inet_csk_ca(sk); | ||
114 | const struct tcp_sock *tp = tcp_sk(sk); | ||
115 | |||
116 | vegas->baseRTT = 0x7fffffff; | ||
117 | vegas_enable(sk); | ||
118 | |||
119 | vegas->dwnd = 0; | ||
120 | vegas->cwnd = tp->snd_cwnd; | ||
121 | } | ||
122 | |||
123 | /* Do RTT sampling needed for Vegas. | ||
124 | * Basically we: | ||
125 | * o min-filter RTT samples from within an RTT to get the current | ||
126 | * propagation delay + queuing delay (we are min-filtering to try to | ||
127 | * avoid the effects of delayed ACKs) | ||
128 | * o min-filter RTT samples from a much longer window (forever for now) | ||
129 | * to find the propagation delay (baseRTT) | ||
130 | */ | ||
131 | static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt) | ||
132 | { | ||
133 | struct compound *vegas = inet_csk_ca(sk); | ||
134 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ | ||
135 | |||
136 | /* Filter to find propagation delay: */ | ||
137 | if (vrtt < vegas->baseRTT) | ||
138 | vegas->baseRTT = vrtt; | ||
139 | |||
140 | /* Find the min RTT during the last RTT to find | ||
141 | * the current prop. delay + queuing delay: | ||
142 | */ | ||
143 | |||
144 | vegas->minRTT = min(vegas->minRTT, vrtt); | ||
145 | vegas->cntRTT++; | ||
146 | } | ||
147 | |||
148 | static void tcp_compound_state(struct sock *sk, u8 ca_state) | ||
149 | { | ||
150 | |||
151 | if (ca_state == TCP_CA_Open) | ||
152 | vegas_enable(sk); | ||
153 | else | ||
154 | vegas_disable(sk); | ||
155 | } | ||
156 | |||
157 | |||
158 | /* 64bit divisor, dividend and result. dynamic precision */ | ||
159 | static inline u64 div64_64(u64 dividend, u64 divisor) | ||
160 | { | ||
161 | u32 d = divisor; | ||
162 | |||
163 | if (divisor > 0xffffffffULL) { | ||
164 | unsigned int shift = fls(divisor >> 32); | ||
165 | |||
166 | d = divisor >> shift; | ||
167 | dividend >>= shift; | ||
168 | } | ||
169 | |||
170 | /* avoid 64 bit division if possible */ | ||
171 | if (dividend >> 32) | ||
172 | do_div(dividend, d); | ||
173 | else | ||
174 | dividend = (u32) dividend / d; | ||
175 | |||
176 | return dividend; | ||
177 | } | ||
178 | |||
179 | /* calculate the quartic root of "a" using Newton-Raphson */ | ||
180 | static u32 qroot(u64 a) | ||
181 | { | ||
182 | u32 x, x1; | ||
183 | |||
184 | /* Initial estimate is based on: | ||
185 | * qrt(x) = exp(log(x) / 4) | ||
186 | */ | ||
187 | x = 1u << (fls64(a) >> 2); | ||
188 | |||
189 | /* | ||
190 | * Iteration based on: | ||
191 | * 3 | ||
192 | * x = ( 3 * x + a / x ) / 4 | ||
193 | * k+1 k k | ||
194 | */ | ||
195 | do { | ||
196 | u64 x3 = x; | ||
197 | |||
198 | x1 = x; | ||
199 | x3 *= x; | ||
200 | x3 *= x; | ||
201 | |||
202 | x = (3 * x + (u32) div64_64(a, x3)) / 4; | ||
203 | } while (abs(x1 - x) > 1); | ||
204 | |||
205 | return x; | ||
206 | } | ||
207 | |||
208 | |||
209 | /* | ||
210 | * If the connection is idle and we are restarting, | ||
211 | * then we don't want to do any Vegas calculations | ||
212 | * until we get fresh RTT samples. So when we | ||
213 | * restart, we reset our Vegas state to a clean | ||
214 | * slate. After we get acks for this flight of | ||
215 | * packets, _then_ we can make Vegas calculations | ||
216 | * again. | ||
217 | */ | ||
218 | static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event) | ||
219 | { | ||
220 | if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) | ||
221 | tcp_compound_init(sk); | ||
222 | } | ||
223 | |||
224 | static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, | ||
225 | u32 seq_rtt, u32 in_flight, int flag) | ||
226 | { | ||
227 | struct tcp_sock *tp = tcp_sk(sk); | ||
228 | struct compound *vegas = inet_csk_ca(sk); | ||
229 | u8 inc = 0; | ||
230 | |||
231 | if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) { | ||
232 | if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) { | ||
233 | vegas->cwnd = tp->snd_cwnd; | ||
234 | vegas->dwnd = 0; | ||
235 | } else | ||
236 | vegas->cwnd = tp->snd_cwnd - vegas->dwnd; | ||
237 | |||
238 | } | ||
239 | |||
240 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
241 | return; | ||
242 | |||
243 | if (vegas->cwnd <= tp->snd_ssthresh) | ||
244 | inc = 1; | ||
245 | else if (tp->snd_cwnd_cnt < tp->snd_cwnd) | ||
246 | tp->snd_cwnd_cnt++; | ||
247 | |||
248 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
249 | inc = 1; | ||
250 | tp->snd_cwnd_cnt = 0; | ||
251 | } | ||
252 | |||
253 | if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
254 | vegas->cwnd++; | ||
255 | |||
256 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
257 | * | ||
258 | * These are so named because they represent the approximate values | ||
259 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
260 | * precisely, they represent the amount of data sent during the RTT. | ||
261 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
262 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
263 | * bytes of data have been ACKed during the course of the RTT, giving | ||
264 | * an "actual" rate of: | ||
265 | * | ||
266 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
267 | * | ||
268 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
269 | * because delayed ACKs can cover more than one segment, so they | ||
270 | * don't line up nicely with the boundaries of RTTs. | ||
271 | * | ||
272 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
273 | * advance of the left edge of our send window, so that the number | ||
274 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
275 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
276 | */ | ||
277 | |||
278 | if (after(ack, vegas->beg_snd_nxt)) { | ||
279 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
280 | u32 old_wnd, old_snd_cwnd; | ||
281 | |||
282 | /* Here old_wnd is essentially the window of data that was | ||
283 | * sent during the previous RTT, and has all | ||
284 | * been acknowledged in the course of the RTT that ended | ||
285 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
286 | * is the cwnd during the previous RTT. | ||
287 | */ | ||
288 | if (!tp->mss_cache) | ||
289 | return; | ||
290 | |||
291 | old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / | ||
292 | tp->mss_cache; | ||
293 | old_snd_cwnd = vegas->beg_snd_cwnd; | ||
294 | |||
295 | /* Save the extent of the current window so we can use this | ||
296 | * at the end of the next RTT. | ||
297 | */ | ||
298 | vegas->beg_snd_una = vegas->beg_snd_nxt; | ||
299 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
300 | vegas->beg_snd_cwnd = tp->snd_cwnd; | ||
301 | |||
302 | /* We do the Vegas calculations only if we got enough RTT | ||
303 | * samples that we can be reasonably sure that we got | ||
304 | * at least one RTT sample that wasn't from a delayed ACK. | ||
305 | * If we only had 2 samples total, | ||
306 | * then that means we're getting only 1 ACK per RTT, which | ||
307 | * means they're almost certainly delayed ACKs. | ||
308 | * If we have 3 samples, we should be OK. | ||
309 | */ | ||
310 | |||
311 | if (vegas->cntRTT > 2) { | ||
312 | u32 rtt, target_cwnd, diff; | ||
313 | u32 brtt, dwnd; | ||
314 | |||
315 | /* We have enough RTT samples, so, using the Vegas | ||
316 | * algorithm, we determine if we should increase or | ||
317 | * decrease cwnd, and by how much. | ||
318 | */ | ||
319 | |||
320 | /* Pluck out the RTT we are using for the Vegas | ||
321 | * calculations. This is the min RTT seen during the | ||
322 | * last RTT. Taking the min filters out the effects | ||
323 | * of delayed ACKs, at the cost of noticing congestion | ||
324 | * a bit later. | ||
325 | */ | ||
326 | rtt = vegas->minRTT; | ||
327 | |||
328 | /* Calculate the cwnd we should have, if we weren't | ||
329 | * going too fast. | ||
330 | * | ||
331 | * This is: | ||
332 | * (actual rate in segments) * baseRTT | ||
333 | * We keep it as a fixed point number with | ||
334 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
335 | */ | ||
336 | if (!rtt) | ||
337 | return; | ||
338 | |||
339 | brtt = vegas->baseRTT; | ||
340 | target_cwnd = ((old_wnd * brtt) | ||
341 | << V_PARAM_SHIFT) / rtt; | ||
342 | |||
343 | /* Calculate the difference between the window we had, | ||
344 | * and the window we would like to have. This quantity | ||
345 | * is the "Diff" from the Arizona Vegas papers. | ||
346 | * | ||
347 | * Again, this is a fixed point number with | ||
348 | * V_PARAM_SHIFT bits to the right of the binary | ||
349 | * point. | ||
350 | */ | ||
351 | |||
352 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
353 | |||
354 | dwnd = vegas->dwnd; | ||
355 | |||
356 | if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { | ||
357 | u64 v; | ||
358 | u32 x; | ||
359 | |||
360 | /* | ||
361 | * The TCP Compound paper describes how the choice | ||
362 | * of "k" determines the aggressiveness, | ||
363 | * i.e. the slope of the response function. | ||
364 | * | ||
365 | * The value matching HSTCP would be 0.8, | ||
366 | * but for computational reasons, both the | ||
367 | * original authors and this implementation | ||
368 | * use 0.75. | ||
369 | */ | ||
370 | v = old_wnd; | ||
371 | x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA; | ||
372 | if (x > 1) | ||
373 | dwnd = x - 1; | ||
374 | else | ||
375 | dwnd = 0; | ||
376 | |||
377 | dwnd += vegas->dwnd; | ||
378 | |||
379 | } else if ((dwnd << V_PARAM_SHIFT) < | ||
380 | (diff * TCP_COMPOUND_BETA)) | ||
381 | dwnd = 0; | ||
382 | else | ||
383 | dwnd = | ||
384 | ((dwnd << V_PARAM_SHIFT) - | ||
385 | (diff * | ||
386 | TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT; | ||
387 | |||
388 | vegas->dwnd = dwnd; | ||
389 | |||
390 | } | ||
391 | |||
392 | /* Wipe the slate clean for the next RTT. */ | ||
393 | vegas->cntRTT = 0; | ||
394 | vegas->minRTT = 0x7fffffff; | ||
395 | } | ||
396 | |||
397 | tp->snd_cwnd = vegas->cwnd + vegas->dwnd; | ||
398 | } | ||
399 | |||
400 | /* Extract info for TCP socket info provided via netlink. */ | ||
401 | static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) | ||
402 | { | ||
403 | const struct compound *ca = inet_csk_ca(sk); | ||
404 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { | ||
405 | struct tcpvegas_info *info; | ||
406 | |||
407 | info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, | ||
408 | sizeof(*info))); | ||
409 | |||
410 | info->tcpv_enabled = ca->doing_vegas_now; | ||
411 | info->tcpv_rttcnt = ca->cntRTT; | ||
412 | info->tcpv_rtt = ca->baseRTT; | ||
413 | info->tcpv_minrtt = ca->minRTT; | ||
414 | rtattr_failure:; | ||
415 | } | ||
416 | } | ||
417 | |||
418 | static struct tcp_congestion_ops tcp_compound = { | ||
419 | .init = tcp_compound_init, | ||
420 | .ssthresh = tcp_reno_ssthresh, | ||
421 | .cong_avoid = tcp_compound_cong_avoid, | ||
422 | .rtt_sample = tcp_compound_rtt_calc, | ||
423 | .set_state = tcp_compound_state, | ||
424 | .cwnd_event = tcp_compound_cwnd_event, | ||
425 | .get_info = tcp_compound_get_info, | ||
426 | |||
427 | .owner = THIS_MODULE, | ||
428 | .name = "compound", | ||
429 | }; | ||
430 | |||
431 | static int __init tcp_compound_register(void) | ||
432 | { | ||
433 | BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE); | ||
434 | tcp_register_congestion_control(&tcp_compound); | ||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | static void __exit tcp_compound_unregister(void) | ||
439 | { | ||
440 | tcp_unregister_congestion_control(&tcp_compound); | ||
441 | } | ||
442 | |||
443 | module_init(tcp_compound_register); | ||
444 | module_exit(tcp_compound_unregister); | ||
445 | |||
446 | MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger"); | ||
447 | MODULE_LICENSE("GPL"); | ||
448 | MODULE_DESCRIPTION("TCP Compound"); | ||
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 91c2f41c7f58..857eefc52aab 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | |||
38 | int ret = 0; | 38 | int ret = 0; |
39 | 39 | ||
40 | /* all algorithms must implement ssthresh and cong_avoid ops */ | 40 | /* all algorithms must implement ssthresh and cong_avoid ops */ |
41 | if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { | 41 | if (!ca->ssthresh || !ca->cong_avoid) { |
42 | printk(KERN_ERR "TCP %s does not implement required ops\n", | 42 | printk(KERN_ERR "TCP %s does not implement required ops\n", |
43 | ca->name); | 43 | ca->name); |
44 | return -EINVAL; | 44 | return -EINVAL; |
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk) | |||
251 | } | 251 | } |
252 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | 252 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); |
253 | 253 | ||
254 | /* Lower bound on congestion window. */ | 254 | /* Lower bound on congestion window with halving. */ |
255 | u32 tcp_reno_min_cwnd(struct sock *sk) | 255 | u32 tcp_reno_min_cwnd(const struct sock *sk) |
256 | { | 256 | { |
257 | const struct tcp_sock *tp = tcp_sk(sk); | 257 | const struct tcp_sock *tp = tcp_sk(sk); |
258 | return tp->snd_ssthresh/2; | 258 | return tp->snd_ssthresh/2; |
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 31a4986dfbf7..78b7a6b9e4de 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) | |||
325 | return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); | 325 | return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); |
326 | } | 326 | } |
327 | 327 | ||
328 | static u32 bictcp_min_cwnd(struct sock *sk) | ||
329 | { | ||
330 | return tcp_sk(sk)->snd_ssthresh; | ||
331 | } | ||
332 | |||
333 | static void bictcp_state(struct sock *sk, u8 new_state) | 328 | static void bictcp_state(struct sock *sk, u8 new_state) |
334 | { | 329 | { |
335 | if (new_state == TCP_CA_Loss) | 330 | if (new_state == TCP_CA_Loss) |
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = { | |||
357 | .cong_avoid = bictcp_cong_avoid, | 352 | .cong_avoid = bictcp_cong_avoid, |
358 | .set_state = bictcp_state, | 353 | .set_state = bictcp_state, |
359 | .undo_cwnd = bictcp_undo_cwnd, | 354 | .undo_cwnd = bictcp_undo_cwnd, |
360 | .min_cwnd = bictcp_min_cwnd, | ||
361 | .pkts_acked = bictcp_acked, | 355 | .pkts_acked = bictcp_acked, |
362 | .owner = THIS_MODULE, | 356 | .owner = THIS_MODULE, |
363 | .name = "cubic", | 357 | .name = "cubic", |
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index ba7c63ca5bb1..1120245b2373 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -98,6 +98,10 @@ struct hstcp { | |||
98 | u32 ai; | 98 | u32 ai; |
99 | }; | 99 | }; |
100 | 100 | ||
101 | static int max_ssthresh = 100; | ||
102 | module_param(max_ssthresh, int, 0644); | ||
103 | MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)"); | ||
104 | |||
101 | static void hstcp_init(struct sock *sk) | 105 | static void hstcp_init(struct sock *sk) |
102 | { | 106 | { |
103 | struct tcp_sock *tp = tcp_sk(sk); | 107 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, | |||
119 | if (!tcp_is_cwnd_limited(sk, in_flight)) | 123 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
120 | return; | 124 | return; |
121 | 125 | ||
122 | if (tp->snd_cwnd <= tp->snd_ssthresh) | 126 | if (tp->snd_cwnd <= tp->snd_ssthresh) { |
123 | tcp_slow_start(tp); | 127 | /* RFC3742: limited slow start |
124 | else { | 128 | * the window is increased by 1/K MSS for each arriving ACK, |
129 | * for K = int(cwnd/(0.5 max_ssthresh)) | ||
130 | */ | ||
131 | if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) { | ||
132 | u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U); | ||
133 | if (++tp->snd_cwnd_cnt >= k) { | ||
134 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
135 | tp->snd_cwnd++; | ||
136 | tp->snd_cwnd_cnt = 0; | ||
137 | } | ||
138 | } else { | ||
139 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
140 | tp->snd_cwnd++; | ||
141 | } | ||
142 | } else { | ||
125 | /* Update AIMD parameters */ | 143 | /* Update AIMD parameters */ |
126 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { | 144 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { |
127 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && | 145 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && |
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1b2ff53f98ed..3d92c1859267 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
246 | } | 246 | } |
247 | } | 247 | } |
248 | 248 | ||
249 | /* Lower bound on congestion window. */ | ||
250 | static u32 htcp_min_cwnd(struct sock *sk) | ||
251 | { | ||
252 | const struct tcp_sock *tp = tcp_sk(sk); | ||
253 | return tp->snd_ssthresh; | ||
254 | } | ||
255 | |||
256 | |||
257 | static void htcp_init(struct sock *sk) | 249 | static void htcp_init(struct sock *sk) |
258 | { | 250 | { |
259 | struct htcp *ca = inet_csk_ca(sk); | 251 | struct htcp *ca = inet_csk_ca(sk); |
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state) | |||
285 | static struct tcp_congestion_ops htcp = { | 277 | static struct tcp_congestion_ops htcp = { |
286 | .init = htcp_init, | 278 | .init = htcp_init, |
287 | .ssthresh = htcp_recalc_ssthresh, | 279 | .ssthresh = htcp_recalc_ssthresh, |
288 | .min_cwnd = htcp_min_cwnd, | ||
289 | .cong_avoid = htcp_cong_avoid, | 280 | .cong_avoid = htcp_cong_avoid, |
290 | .set_state = htcp_state, | 281 | .set_state = htcp_state, |
291 | .undo_cwnd = htcp_cwnd_undo, | 282 | .undo_cwnd = htcp_cwnd_undo, |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4a538bc1683d..e08245bdda3a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -71,6 +71,7 @@ | |||
71 | #include <net/inet_common.h> | 71 | #include <net/inet_common.h> |
72 | #include <linux/ipsec.h> | 72 | #include <linux/ipsec.h> |
73 | #include <asm/unaligned.h> | 73 | #include <asm/unaligned.h> |
74 | #include <net/netdma.h> | ||
74 | 75 | ||
75 | int sysctl_tcp_timestamps = 1; | 76 | int sysctl_tcp_timestamps = 1; |
76 | int sysctl_tcp_window_scaling = 1; | 77 | int sysctl_tcp_window_scaling = 1; |
@@ -1649,7 +1650,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) | |||
1649 | * Hence, we can detect timed out packets during fast | 1650 | * Hence, we can detect timed out packets during fast |
1650 | * retransmit without falling to slow start. | 1651 | * retransmit without falling to slow start. |
1651 | */ | 1652 | */ |
1652 | if (tcp_head_timedout(sk, tp)) { | 1653 | if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { |
1653 | struct sk_buff *skb; | 1654 | struct sk_buff *skb; |
1654 | 1655 | ||
1655 | skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint | 1656 | skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint |
@@ -1662,8 +1663,6 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) | |||
1662 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | 1663 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { |
1663 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1664 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
1664 | tp->lost_out += tcp_skb_pcount(skb); | 1665 | tp->lost_out += tcp_skb_pcount(skb); |
1665 | if (IsReno(tp)) | ||
1666 | tcp_remove_reno_sacks(sk, tp, tcp_skb_pcount(skb) + 1); | ||
1667 | 1666 | ||
1668 | /* clear xmit_retrans hint */ | 1667 | /* clear xmit_retrans hint */ |
1669 | if (tp->retransmit_skb_hint && | 1668 | if (tp->retransmit_skb_hint && |
@@ -1690,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
1690 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1689 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1691 | } | 1690 | } |
1692 | 1691 | ||
1692 | /* Lower bound on congestion window is slow start threshold | ||
1693 | * unless congestion avoidance choice decides to overide it. | ||
1694 | */ | ||
1695 | static inline u32 tcp_cwnd_min(const struct sock *sk) | ||
1696 | { | ||
1697 | const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; | ||
1698 | |||
1699 | return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; | ||
1700 | } | ||
1701 | |||
1693 | /* Decrease cwnd each second ack. */ | 1702 | /* Decrease cwnd each second ack. */ |
1694 | static void tcp_cwnd_down(struct sock *sk) | 1703 | static void tcp_cwnd_down(struct sock *sk) |
1695 | { | 1704 | { |
1696 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1697 | struct tcp_sock *tp = tcp_sk(sk); | 1705 | struct tcp_sock *tp = tcp_sk(sk); |
1698 | int decr = tp->snd_cwnd_cnt + 1; | 1706 | int decr = tp->snd_cwnd_cnt + 1; |
1699 | 1707 | ||
1700 | tp->snd_cwnd_cnt = decr&1; | 1708 | tp->snd_cwnd_cnt = decr&1; |
1701 | decr >>= 1; | 1709 | decr >>= 1; |
1702 | 1710 | ||
1703 | if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) | 1711 | if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) |
1704 | tp->snd_cwnd -= decr; | 1712 | tp->snd_cwnd -= decr; |
1705 | 1713 | ||
1706 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1714 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
@@ -3787,6 +3795,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk | |||
3787 | __tcp_checksum_complete_user(sk, skb); | 3795 | __tcp_checksum_complete_user(sk, skb); |
3788 | } | 3796 | } |
3789 | 3797 | ||
3798 | #ifdef CONFIG_NET_DMA | ||
3799 | static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) | ||
3800 | { | ||
3801 | struct tcp_sock *tp = tcp_sk(sk); | ||
3802 | int chunk = skb->len - hlen; | ||
3803 | int dma_cookie; | ||
3804 | int copied_early = 0; | ||
3805 | |||
3806 | if (tp->ucopy.wakeup) | ||
3807 | return 0; | ||
3808 | |||
3809 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | ||
3810 | tp->ucopy.dma_chan = get_softnet_dma(); | ||
3811 | |||
3812 | if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { | ||
3813 | |||
3814 | dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, | ||
3815 | skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); | ||
3816 | |||
3817 | if (dma_cookie < 0) | ||
3818 | goto out; | ||
3819 | |||
3820 | tp->ucopy.dma_cookie = dma_cookie; | ||
3821 | copied_early = 1; | ||
3822 | |||
3823 | tp->ucopy.len -= chunk; | ||
3824 | tp->copied_seq += chunk; | ||
3825 | tcp_rcv_space_adjust(sk); | ||
3826 | |||
3827 | if ((tp->ucopy.len == 0) || | ||
3828 | (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || | ||
3829 | (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { | ||
3830 | tp->ucopy.wakeup = 1; | ||
3831 | sk->sk_data_ready(sk, 0); | ||
3832 | } | ||
3833 | } else if (chunk > 0) { | ||
3834 | tp->ucopy.wakeup = 1; | ||
3835 | sk->sk_data_ready(sk, 0); | ||
3836 | } | ||
3837 | out: | ||
3838 | return copied_early; | ||
3839 | } | ||
3840 | #endif /* CONFIG_NET_DMA */ | ||
3841 | |||
3790 | /* | 3842 | /* |
3791 | * TCP receive function for the ESTABLISHED state. | 3843 | * TCP receive function for the ESTABLISHED state. |
3792 | * | 3844 | * |
@@ -3888,8 +3940,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3888 | tp->rcv_nxt == tp->rcv_wup) | 3940 | tp->rcv_nxt == tp->rcv_wup) |
3889 | tcp_store_ts_recent(tp); | 3941 | tcp_store_ts_recent(tp); |
3890 | 3942 | ||
3891 | tcp_rcv_rtt_measure_ts(sk, skb); | ||
3892 | |||
3893 | /* We know that such packets are checksummed | 3943 | /* We know that such packets are checksummed |
3894 | * on entry. | 3944 | * on entry. |
3895 | */ | 3945 | */ |
@@ -3903,14 +3953,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3903 | } | 3953 | } |
3904 | } else { | 3954 | } else { |
3905 | int eaten = 0; | 3955 | int eaten = 0; |
3956 | int copied_early = 0; | ||
3906 | 3957 | ||
3907 | if (tp->ucopy.task == current && | 3958 | if (tp->copied_seq == tp->rcv_nxt && |
3908 | tp->copied_seq == tp->rcv_nxt && | 3959 | len - tcp_header_len <= tp->ucopy.len) { |
3909 | len - tcp_header_len <= tp->ucopy.len && | 3960 | #ifdef CONFIG_NET_DMA |
3910 | sock_owned_by_user(sk)) { | 3961 | if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { |
3911 | __set_current_state(TASK_RUNNING); | 3962 | copied_early = 1; |
3963 | eaten = 1; | ||
3964 | } | ||
3965 | #endif | ||
3966 | if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { | ||
3967 | __set_current_state(TASK_RUNNING); | ||
3912 | 3968 | ||
3913 | if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { | 3969 | if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) |
3970 | eaten = 1; | ||
3971 | } | ||
3972 | if (eaten) { | ||
3914 | /* Predicted packet is in window by definition. | 3973 | /* Predicted packet is in window by definition. |
3915 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. | 3974 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. |
3916 | * Hence, check seq<=rcv_wup reduces to: | 3975 | * Hence, check seq<=rcv_wup reduces to: |
@@ -3926,8 +3985,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3926 | __skb_pull(skb, tcp_header_len); | 3985 | __skb_pull(skb, tcp_header_len); |
3927 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 3986 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
3928 | NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); | 3987 | NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); |
3929 | eaten = 1; | ||
3930 | } | 3988 | } |
3989 | if (copied_early) | ||
3990 | tcp_cleanup_rbuf(sk, skb->len); | ||
3931 | } | 3991 | } |
3932 | if (!eaten) { | 3992 | if (!eaten) { |
3933 | if (tcp_checksum_complete_user(sk, skb)) | 3993 | if (tcp_checksum_complete_user(sk, skb)) |
@@ -3968,6 +4028,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3968 | 4028 | ||
3969 | __tcp_ack_snd_check(sk, 0); | 4029 | __tcp_ack_snd_check(sk, 0); |
3970 | no_ack: | 4030 | no_ack: |
4031 | #ifdef CONFIG_NET_DMA | ||
4032 | if (copied_early) | ||
4033 | __skb_queue_tail(&sk->sk_async_wait_queue, skb); | ||
4034 | else | ||
4035 | #endif | ||
3971 | if (eaten) | 4036 | if (eaten) |
3972 | __kfree_skb(skb); | 4037 | __kfree_skb(skb); |
3973 | else | 4038 | else |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 672950e54c49..25ecc6e2478b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -71,6 +71,7 @@ | |||
71 | #include <net/inet_common.h> | 71 | #include <net/inet_common.h> |
72 | #include <net/timewait_sock.h> | 72 | #include <net/timewait_sock.h> |
73 | #include <net/xfrm.h> | 73 | #include <net/xfrm.h> |
74 | #include <net/netdma.h> | ||
74 | 75 | ||
75 | #include <linux/inet.h> | 76 | #include <linux/inet.h> |
76 | #include <linux/ipv6.h> | 77 | #include <linux/ipv6.h> |
@@ -1091,8 +1092,18 @@ process: | |||
1091 | bh_lock_sock(sk); | 1092 | bh_lock_sock(sk); |
1092 | ret = 0; | 1093 | ret = 0; |
1093 | if (!sock_owned_by_user(sk)) { | 1094 | if (!sock_owned_by_user(sk)) { |
1094 | if (!tcp_prequeue(sk, skb)) | 1095 | #ifdef CONFIG_NET_DMA |
1096 | struct tcp_sock *tp = tcp_sk(sk); | ||
1097 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | ||
1098 | tp->ucopy.dma_chan = get_softnet_dma(); | ||
1099 | if (tp->ucopy.dma_chan) | ||
1095 | ret = tcp_v4_do_rcv(sk, skb); | 1100 | ret = tcp_v4_do_rcv(sk, skb); |
1101 | else | ||
1102 | #endif | ||
1103 | { | ||
1104 | if (!tcp_prequeue(sk, skb)) | ||
1105 | ret = tcp_v4_do_rcv(sk, skb); | ||
1106 | } | ||
1096 | } else | 1107 | } else |
1097 | sk_add_backlog(sk, skb); | 1108 | sk_add_backlog(sk, skb); |
1098 | bh_unlock_sock(sk); | 1109 | bh_unlock_sock(sk); |
@@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
1296 | /* Cleans up our, hopefully empty, out_of_order_queue. */ | 1307 | /* Cleans up our, hopefully empty, out_of_order_queue. */ |
1297 | __skb_queue_purge(&tp->out_of_order_queue); | 1308 | __skb_queue_purge(&tp->out_of_order_queue); |
1298 | 1309 | ||
1310 | #ifdef CONFIG_NET_DMA | ||
1311 | /* Cleans up our sk_async_wait_queue */ | ||
1312 | __skb_queue_purge(&sk->sk_async_wait_queue); | ||
1313 | #endif | ||
1314 | |||
1299 | /* Clean prequeue, it must be empty really */ | 1315 | /* Clean prequeue, it must be empty really */ |
1300 | __skb_queue_purge(&tp->ucopy.prequeue); | 1316 | __skb_queue_purge(&tp->ucopy.prequeue); |
1301 | 1317 | ||
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c new file mode 100644 index 000000000000..1f977b6ee9a1 --- /dev/null +++ b/net/ipv4/tcp_lp.c | |||
@@ -0,0 +1,338 @@ | |||
1 | /* | ||
2 | * TCP Low Priority (TCP-LP) | ||
3 | * | ||
4 | * TCP Low Priority is a distributed algorithm whose goal is to utilize only | ||
5 | * the excess network bandwidth as compared to the ``fair share`` of | ||
6 | * bandwidth as targeted by TCP. Available from: | ||
7 | * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf | ||
8 | * | ||
9 | * Original Author: | ||
10 | * Aleksandar Kuzmanovic <akuzma@northwestern.edu> | ||
11 | * | ||
12 | * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. | ||
13 | * As of 2.6.13, Linux supports pluggable congestion control algorithms. | ||
14 | * Due to the limitation of the API, we take the following changes from | ||
15 | * the original TCP-LP implementation: | ||
16 | * o We use newReno in most core CA handling. Only add some checking | ||
17 | * within cong_avoid. | ||
18 | * o Error correction for remote HZ: the remote HZ estimate is kept | ||
19 | * under continual checking and updating. | ||
20 | * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since | ||
21 | * OWD has a similar meaning to RTT. Also correct the buggy formula. | ||
22 | * o Handle reaction for Early Congestion Indication (ECI) within | ||
23 | * pkts_acked, as mentioned within pseudo code. | ||
24 | * o OWD is handled in relative format, where the local time stamp will be | ||
25 | * in tcp_time_stamp format. | ||
26 | * | ||
27 | * Port from 2.4.19 to 2.6.16 as module by: | ||
28 | * Wong Hoi Sing Edison <hswong3i@gmail.com> | ||
29 | * Hung Hing Lun <hlhung3i@gmail.com> | ||
30 | * | ||
31 | * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $ | ||
32 | */ | ||
33 | |||
34 | #include <linux/config.h> | ||
35 | #include <linux/module.h> | ||
36 | #include <net/tcp.h> | ||
37 | |||
/* Resolution of OWD values: number of OWD units per second. */
#define LP_RESOL	1000
40 | |||
/**
 * enum tcp_lp_state - TCP-LP state flags
 * @LP_VALID_RHZ: the remote HZ estimate is valid
 * @LP_VALID_OWD: the last OWD measurement is valid
 * @LP_WITHIN_THR: the smoothed OWD is below the delay threshold
 * @LP_WITHIN_INF: we are still inside the inference period
 *
 * Bit flags kept in struct lp's flag field.
 * They exist mainly to ease debugging.
 */
enum tcp_lp_state {
	LP_VALID_RHZ	= 0x01,
	LP_VALID_OWD	= 0x02,
	LP_WITHIN_THR	= 0x08,
	LP_WITHIN_INF	= 0x10,
};
57 | |||
58 | /** | ||
59 | * struct lp | ||
60 | * @flag: TCP-LP state flag | ||
61 | * @sowd: smoothed OWD << 3 | ||
62 | * @owd_min: min OWD | ||
63 | * @owd_max: max OWD | ||
64 | * @owd_max_rsv: resrved max owd | ||
65 | * @remote_hz: estimated remote HZ | ||
66 | * @remote_ref_time: remote reference time | ||
67 | * @local_ref_time: local reference time | ||
68 | * @last_drop: time for last active drop | ||
69 | * @inference: current inference | ||
70 | * | ||
71 | * TCP-LP's private struct. | ||
72 | * We get the idea from original TCP-LP implementation where only left those we | ||
73 | * found are really useful. | ||
74 | */ | ||
75 | struct lp { | ||
76 | u32 flag; | ||
77 | u32 sowd; | ||
78 | u32 owd_min; | ||
79 | u32 owd_max; | ||
80 | u32 owd_max_rsv; | ||
81 | u32 remote_hz; | ||
82 | u32 remote_ref_time; | ||
83 | u32 local_ref_time; | ||
84 | u32 last_drop; | ||
85 | u32 inference; | ||
86 | }; | ||
87 | |||
88 | /** | ||
89 | * tcp_lp_init | ||
90 | * | ||
91 | * Init all required variables. | ||
92 | * Clone the handling from Vegas module implementation. | ||
93 | */ | ||
94 | static void tcp_lp_init(struct sock *sk) | ||
95 | { | ||
96 | struct lp *lp = inet_csk_ca(sk); | ||
97 | |||
98 | lp->flag = 0; | ||
99 | lp->sowd = 0; | ||
100 | lp->owd_min = 0xffffffff; | ||
101 | lp->owd_max = 0; | ||
102 | lp->owd_max_rsv = 0; | ||
103 | lp->remote_hz = 0; | ||
104 | lp->remote_ref_time = 0; | ||
105 | lp->local_ref_time = 0; | ||
106 | lp->last_drop = 0; | ||
107 | lp->inference = 0; | ||
108 | } | ||
109 | |||
110 | /** | ||
111 | * tcp_lp_cong_avoid | ||
112 | * | ||
113 | * Implementation of cong_avoid. | ||
114 | * Will only call newReno CA when away from inference. | ||
115 | * From TCP-LP's paper, this will be handled in additive increasement. | ||
116 | */ | ||
117 | static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, | ||
118 | int flag) | ||
119 | { | ||
120 | struct lp *lp = inet_csk_ca(sk); | ||
121 | |||
122 | if (!(lp->flag & LP_WITHIN_INF)) | ||
123 | tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); | ||
124 | } | ||
125 | |||
126 | /** | ||
127 | * tcp_lp_remote_hz_estimator | ||
128 | * | ||
129 | * Estimate remote HZ. | ||
130 | * We keep on updating the estimated value, where original TCP-LP | ||
131 | * implementation only guest it for once and use forever. | ||
132 | */ | ||
133 | static u32 tcp_lp_remote_hz_estimator(struct sock *sk) | ||
134 | { | ||
135 | struct tcp_sock *tp = tcp_sk(sk); | ||
136 | struct lp *lp = inet_csk_ca(sk); | ||
137 | s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ | ||
138 | s64 m = 0; | ||
139 | |||
140 | /* not yet record reference time | ||
141 | * go away!! record it before come back!! */ | ||
142 | if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) | ||
143 | goto out; | ||
144 | |||
145 | /* we can't calc remote HZ with no different!! */ | ||
146 | if (tp->rx_opt.rcv_tsval == lp->remote_ref_time | ||
147 | || tp->rx_opt.rcv_tsecr == lp->local_ref_time) | ||
148 | goto out; | ||
149 | |||
150 | m = HZ * (tp->rx_opt.rcv_tsval - | ||
151 | lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - | ||
152 | lp->local_ref_time); | ||
153 | if (m < 0) | ||
154 | m = -m; | ||
155 | |||
156 | if (rhz != 0) { | ||
157 | m -= rhz >> 6; /* m is now error in remote HZ est */ | ||
158 | rhz += m; /* 63/64 old + 1/64 new */ | ||
159 | } else | ||
160 | rhz = m << 6; | ||
161 | |||
162 | /* record time for successful remote HZ calc */ | ||
163 | lp->flag |= LP_VALID_RHZ; | ||
164 | |||
165 | out: | ||
166 | /* record reference time stamp */ | ||
167 | lp->remote_ref_time = tp->rx_opt.rcv_tsval; | ||
168 | lp->local_ref_time = tp->rx_opt.rcv_tsecr; | ||
169 | |||
170 | return rhz >> 6; | ||
171 | } | ||
172 | |||
173 | /** | ||
174 | * tcp_lp_owd_calculator | ||
175 | * | ||
176 | * Calculate one way delay (in relative format). | ||
177 | * Original implement OWD as minus of remote time difference to local time | ||
178 | * difference directly. As this time difference just simply equal to RTT, when | ||
179 | * the network status is stable, remote RTT will equal to local RTT, and result | ||
180 | * OWD into zero. | ||
181 | * It seems to be a bug and so we fixed it. | ||
182 | */ | ||
183 | static u32 tcp_lp_owd_calculator(struct sock *sk) | ||
184 | { | ||
185 | struct tcp_sock *tp = tcp_sk(sk); | ||
186 | struct lp *lp = inet_csk_ca(sk); | ||
187 | s64 owd = 0; | ||
188 | |||
189 | lp->remote_hz = tcp_lp_remote_hz_estimator(sk); | ||
190 | |||
191 | if (lp->flag & LP_VALID_RHZ) { | ||
192 | owd = | ||
193 | tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - | ||
194 | tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); | ||
195 | if (owd < 0) | ||
196 | owd = -owd; | ||
197 | } | ||
198 | |||
199 | if (owd > 0) | ||
200 | lp->flag |= LP_VALID_OWD; | ||
201 | else | ||
202 | lp->flag &= ~LP_VALID_OWD; | ||
203 | |||
204 | return owd; | ||
205 | } | ||
206 | |||
207 | /** | ||
208 | * tcp_lp_rtt_sample | ||
209 | * | ||
210 | * Implementation or rtt_sample. | ||
211 | * Will take the following action, | ||
212 | * 1. calc OWD, | ||
213 | * 2. record the min/max OWD, | ||
214 | * 3. calc smoothed OWD (SOWD). | ||
215 | * Most ideas come from the original TCP-LP implementation. | ||
216 | */ | ||
217 | static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) | ||
218 | { | ||
219 | struct lp *lp = inet_csk_ca(sk); | ||
220 | s64 mowd = tcp_lp_owd_calculator(sk); | ||
221 | |||
222 | /* sorry that we don't have valid data */ | ||
223 | if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) | ||
224 | return; | ||
225 | |||
226 | /* record the next min owd */ | ||
227 | if (mowd < lp->owd_min) | ||
228 | lp->owd_min = mowd; | ||
229 | |||
230 | /* always forget the max of the max | ||
231 | * we just set owd_max as one below it */ | ||
232 | if (mowd > lp->owd_max) { | ||
233 | if (mowd > lp->owd_max_rsv) { | ||
234 | if (lp->owd_max_rsv == 0) | ||
235 | lp->owd_max = mowd; | ||
236 | else | ||
237 | lp->owd_max = lp->owd_max_rsv; | ||
238 | lp->owd_max_rsv = mowd; | ||
239 | } else | ||
240 | lp->owd_max = mowd; | ||
241 | } | ||
242 | |||
243 | /* calc for smoothed owd */ | ||
244 | if (lp->sowd != 0) { | ||
245 | mowd -= lp->sowd >> 3; /* m is now error in owd est */ | ||
246 | lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ | ||
247 | } else | ||
248 | lp->sowd = mowd << 3; /* take the measured time be owd */ | ||
249 | } | ||
250 | |||
251 | /** | ||
252 | * tcp_lp_pkts_acked | ||
253 | * | ||
254 | * Implementation of pkts_acked. | ||
255 | * Deal with active drop under Early Congestion Indication. | ||
256 | * Only drop to half and 1 will be handle, because we hope to use back | ||
257 | * newReno in increase case. | ||
258 | * We work it out by following the idea from TCP-LP's paper directly | ||
259 | */ | ||
260 | static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) | ||
261 | { | ||
262 | struct tcp_sock *tp = tcp_sk(sk); | ||
263 | struct lp *lp = inet_csk_ca(sk); | ||
264 | |||
265 | /* calc inference */ | ||
266 | if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) | ||
267 | lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); | ||
268 | |||
269 | /* test if within inference */ | ||
270 | if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) | ||
271 | lp->flag |= LP_WITHIN_INF; | ||
272 | else | ||
273 | lp->flag &= ~LP_WITHIN_INF; | ||
274 | |||
275 | /* test if within threshold */ | ||
276 | if (lp->sowd >> 3 < | ||
277 | lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) | ||
278 | lp->flag |= LP_WITHIN_THR; | ||
279 | else | ||
280 | lp->flag &= ~LP_WITHIN_THR; | ||
281 | |||
282 | pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, | ||
283 | tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, | ||
284 | lp->sowd >> 3); | ||
285 | |||
286 | if (lp->flag & LP_WITHIN_THR) | ||
287 | return; | ||
288 | |||
289 | /* FIXME: try to reset owd_min and owd_max here | ||
290 | * so decrease the chance the min/max is no longer suitable | ||
291 | * and will usually within threshold when whithin inference */ | ||
292 | lp->owd_min = lp->sowd >> 3; | ||
293 | lp->owd_max = lp->sowd >> 2; | ||
294 | lp->owd_max_rsv = lp->sowd >> 2; | ||
295 | |||
296 | /* happened within inference | ||
297 | * drop snd_cwnd into 1 */ | ||
298 | if (lp->flag & LP_WITHIN_INF) | ||
299 | tp->snd_cwnd = 1U; | ||
300 | |||
301 | /* happened after inference | ||
302 | * cut snd_cwnd into half */ | ||
303 | else | ||
304 | tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); | ||
305 | |||
306 | /* record this drop time */ | ||
307 | lp->last_drop = tcp_time_stamp; | ||
308 | } | ||
309 | |||
310 | static struct tcp_congestion_ops tcp_lp = { | ||
311 | .init = tcp_lp_init, | ||
312 | .ssthresh = tcp_reno_ssthresh, | ||
313 | .cong_avoid = tcp_lp_cong_avoid, | ||
314 | .min_cwnd = tcp_reno_min_cwnd, | ||
315 | .rtt_sample = tcp_lp_rtt_sample, | ||
316 | .pkts_acked = tcp_lp_pkts_acked, | ||
317 | |||
318 | .owner = THIS_MODULE, | ||
319 | .name = "lp" | ||
320 | }; | ||
321 | |||
322 | static int __init tcp_lp_register(void) | ||
323 | { | ||
324 | BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); | ||
325 | return tcp_register_congestion_control(&tcp_lp); | ||
326 | } | ||
327 | |||
328 | static void __exit tcp_lp_unregister(void) | ||
329 | { | ||
330 | tcp_unregister_congestion_control(&tcp_lp); | ||
331 | } | ||
332 | |||
333 | module_init(tcp_lp_register); | ||
334 | module_exit(tcp_lp_unregister); | ||
335 | |||
336 | MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun"); | ||
337 | MODULE_LICENSE("GPL"); | ||
338 | MODULE_DESCRIPTION("TCP Low Priority"); | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f33c9dddaa12..07bb5a2b375e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3; | |||
59 | int sysctl_tcp_mtu_probing = 0; | 59 | int sysctl_tcp_mtu_probing = 0; |
60 | int sysctl_tcp_base_mss = 512; | 60 | int sysctl_tcp_base_mss = 512; |
61 | 61 | ||
62 | /* By default, RFC2861 behavior. */ | ||
63 | int sysctl_tcp_slow_start_after_idle = 1; | ||
64 | |||
62 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, | 65 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, |
63 | struct sk_buff *skb) | 66 | struct sk_buff *skb) |
64 | { | 67 | { |
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, | |||
138 | struct inet_connection_sock *icsk = inet_csk(sk); | 141 | struct inet_connection_sock *icsk = inet_csk(sk); |
139 | const u32 now = tcp_time_stamp; | 142 | const u32 now = tcp_time_stamp; |
140 | 143 | ||
141 | if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) | 144 | if (sysctl_tcp_slow_start_after_idle && |
145 | (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) | ||
142 | tcp_cwnd_restart(sk, __sk_dst_get(sk)); | 146 | tcp_cwnd_restart(sk, __sk_dst_get(sk)); |
143 | 147 | ||
144 | tp->lsndtime = now; | 148 | tp->lsndtime = now; |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c new file mode 100644 index 000000000000..d7d517a3a238 --- /dev/null +++ b/net/ipv4/tcp_probe.c | |||
@@ -0,0 +1,181 @@ | |||
1 | /* | ||
2 | * tcpprobe - Observe the TCP flow with kprobes. | ||
3 | * | ||
4 | * The idea for this came from Werner Almesberger's umlsim | ||
5 | * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | */ | ||
21 | |||
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/socket.h>
#include <linux/tcp.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/kfifo.h>
#include <linux/vmalloc.h>

#include <net/tcp.h>
32 | |||
33 | MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>"); | ||
34 | MODULE_DESCRIPTION("TCP cwnd snooper"); | ||
35 | MODULE_LICENSE("GPL"); | ||
36 | |||
37 | static int port = 0; | ||
38 | MODULE_PARM_DESC(port, "Port to match (0=all)"); | ||
39 | module_param(port, int, 0); | ||
40 | |||
41 | static int bufsize = 64*1024; | ||
42 | MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); | ||
43 | module_param(bufsize, int, 0); | ||
44 | |||
45 | static const char procname[] = "tcpprobe"; | ||
46 | |||
47 | struct { | ||
48 | struct kfifo *fifo; | ||
49 | spinlock_t lock; | ||
50 | wait_queue_head_t wait; | ||
51 | struct timeval tstart; | ||
52 | } tcpw; | ||
53 | |||
54 | static void printl(const char *fmt, ...) | ||
55 | { | ||
56 | va_list args; | ||
57 | int len; | ||
58 | struct timeval now; | ||
59 | char tbuf[256]; | ||
60 | |||
61 | va_start(args, fmt); | ||
62 | do_gettimeofday(&now); | ||
63 | |||
64 | now.tv_sec -= tcpw.tstart.tv_sec; | ||
65 | now.tv_usec -= tcpw.tstart.tv_usec; | ||
66 | if (now.tv_usec < 0) { | ||
67 | --now.tv_sec; | ||
68 | now.tv_usec += 1000000; | ||
69 | } | ||
70 | |||
71 | len = sprintf(tbuf, "%lu.%06lu ", | ||
72 | (unsigned long) now.tv_sec, | ||
73 | (unsigned long) now.tv_usec); | ||
74 | len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); | ||
75 | va_end(args); | ||
76 | |||
77 | kfifo_put(tcpw.fifo, tbuf, len); | ||
78 | wake_up(&tcpw.wait); | ||
79 | } | ||
80 | |||
81 | static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, | ||
82 | struct msghdr *msg, size_t size) | ||
83 | { | ||
84 | const struct tcp_sock *tp = tcp_sk(sk); | ||
85 | const struct inet_sock *inet = inet_sk(sk); | ||
86 | |||
87 | if (port == 0 || ntohs(inet->dport) == port || | ||
88 | ntohs(inet->sport) == port) { | ||
89 | printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", | ||
90 | NIPQUAD(inet->saddr), ntohs(inet->sport), | ||
91 | NIPQUAD(inet->daddr), ntohs(inet->dport), | ||
92 | size, tp->snd_nxt, tp->snd_una, | ||
93 | tp->snd_cwnd, tcp_current_ssthresh(sk), | ||
94 | tp->snd_wnd); | ||
95 | } | ||
96 | |||
97 | jprobe_return(); | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | static struct jprobe tcp_send_probe = { | ||
102 | .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, | ||
103 | .entry = (kprobe_opcode_t *) &jtcp_sendmsg, | ||
104 | }; | ||
105 | |||
106 | |||
107 | static int tcpprobe_open(struct inode * inode, struct file * file) | ||
108 | { | ||
109 | kfifo_reset(tcpw.fifo); | ||
110 | do_gettimeofday(&tcpw.tstart); | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | static ssize_t tcpprobe_read(struct file *file, char __user *buf, | ||
115 | size_t len, loff_t *ppos) | ||
116 | { | ||
117 | int error = 0, cnt; | ||
118 | unsigned char *tbuf; | ||
119 | |||
120 | if (!buf || len < 0) | ||
121 | return -EINVAL; | ||
122 | |||
123 | if (len == 0) | ||
124 | return 0; | ||
125 | |||
126 | tbuf = vmalloc(len); | ||
127 | if (!tbuf) | ||
128 | return -ENOMEM; | ||
129 | |||
130 | error = wait_event_interruptible(tcpw.wait, | ||
131 | __kfifo_len(tcpw.fifo) != 0); | ||
132 | if (error) | ||
133 | return error; | ||
134 | |||
135 | cnt = kfifo_get(tcpw.fifo, tbuf, len); | ||
136 | error = copy_to_user(buf, tbuf, cnt); | ||
137 | |||
138 | vfree(tbuf); | ||
139 | |||
140 | return error ? error : cnt; | ||
141 | } | ||
142 | |||
143 | static struct file_operations tcpprobe_fops = { | ||
144 | .owner = THIS_MODULE, | ||
145 | .open = tcpprobe_open, | ||
146 | .read = tcpprobe_read, | ||
147 | }; | ||
148 | |||
149 | static __init int tcpprobe_init(void) | ||
150 | { | ||
151 | int ret = -ENOMEM; | ||
152 | |||
153 | init_waitqueue_head(&tcpw.wait); | ||
154 | spin_lock_init(&tcpw.lock); | ||
155 | tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock); | ||
156 | |||
157 | if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) | ||
158 | goto err0; | ||
159 | |||
160 | ret = register_jprobe(&tcp_send_probe); | ||
161 | if (ret) | ||
162 | goto err1; | ||
163 | |||
164 | pr_info("TCP watch registered (port=%d)\n", port); | ||
165 | return 0; | ||
166 | err1: | ||
167 | proc_net_remove(procname); | ||
168 | err0: | ||
169 | kfifo_free(tcpw.fifo); | ||
170 | return ret; | ||
171 | } | ||
172 | module_init(tcpprobe_init); | ||
173 | |||
174 | static __exit void tcpprobe_exit(void) | ||
175 | { | ||
176 | kfifo_free(tcpw.fifo); | ||
177 | proc_net_remove(procname); | ||
178 | unregister_jprobe(&tcp_send_probe); | ||
179 | |||
180 | } | ||
181 | module_exit(tcpprobe_exit); | ||
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c new file mode 100644 index 000000000000..11b42a7135c1 --- /dev/null +++ b/net/ipv4/tcp_veno.c | |||
@@ -0,0 +1,231 @@ | |||
1 | /* | ||
2 | * TCP Veno congestion control | ||
3 | * | ||
4 | * This is based on the congestion detection/avoidance scheme described in | ||
5 | * C. P. Fu, S. C. Liew. | ||
6 | * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." | ||
7 | * IEEE Journal on Selected Areas in Communication, | ||
8 | * Feb. 2003. | ||
9 | * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf | ||
10 | */ | ||
11 | |||
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/inet_diag.h>

#include <asm/div64.h>

#include <net/tcp.h>
19 | |||
20 | /* Default values of the Veno variables, in fixed-point representation | ||
21 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
22 | */ | ||
23 | #define V_PARAM_SHIFT 1 | ||
24 | static const int beta = 3 << V_PARAM_SHIFT; | ||
25 | |||
26 | /* Veno variables */ | ||
27 | struct veno { | ||
28 | u8 doing_veno_now; /* if true, do veno for this rtt */ | ||
29 | u16 cntrtt; /* # of rtts measured within last rtt */ | ||
30 | u32 minrtt; /* min of rtts measured within last rtt (in usec) */ | ||
31 | u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ | ||
32 | u32 inc; /* decide whether to increase cwnd */ | ||
33 | u32 diff; /* calculate the diff rate */ | ||
34 | }; | ||
35 | |||
36 | /* There are several situations when we must "re-start" Veno: | ||
37 | * | ||
38 | * o when a connection is established | ||
39 | * o after an RTO | ||
40 | * o after fast recovery | ||
41 | * o when we send a packet and there is no outstanding | ||
42 | * unacknowledged data (restarting an idle connection) | ||
43 | * | ||
44 | */ | ||
45 | static inline void veno_enable(struct sock *sk) | ||
46 | { | ||
47 | struct veno *veno = inet_csk_ca(sk); | ||
48 | |||
49 | /* turn on Veno */ | ||
50 | veno->doing_veno_now = 1; | ||
51 | |||
52 | veno->minrtt = 0x7fffffff; | ||
53 | } | ||
54 | |||
55 | static inline void veno_disable(struct sock *sk) | ||
56 | { | ||
57 | struct veno *veno = inet_csk_ca(sk); | ||
58 | |||
59 | /* turn off Veno */ | ||
60 | veno->doing_veno_now = 0; | ||
61 | } | ||
62 | |||
63 | static void tcp_veno_init(struct sock *sk) | ||
64 | { | ||
65 | struct veno *veno = inet_csk_ca(sk); | ||
66 | |||
67 | veno->basertt = 0x7fffffff; | ||
68 | veno->inc = 1; | ||
69 | veno_enable(sk); | ||
70 | } | ||
71 | |||
72 | /* Do rtt sampling needed for Veno. */ | ||
73 | static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) | ||
74 | { | ||
75 | struct veno *veno = inet_csk_ca(sk); | ||
76 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ | ||
77 | |||
78 | /* Filter to find propagation delay: */ | ||
79 | if (vrtt < veno->basertt) | ||
80 | veno->basertt = vrtt; | ||
81 | |||
82 | /* Find the min rtt during the last rtt to find | ||
83 | * the current prop. delay + queuing delay: | ||
84 | */ | ||
85 | veno->minrtt = min(veno->minrtt, vrtt); | ||
86 | veno->cntrtt++; | ||
87 | } | ||
88 | |||
89 | static void tcp_veno_state(struct sock *sk, u8 ca_state) | ||
90 | { | ||
91 | if (ca_state == TCP_CA_Open) | ||
92 | veno_enable(sk); | ||
93 | else | ||
94 | veno_disable(sk); | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * If the connection is idle and we are restarting, | ||
99 | * then we don't want to do any Veno calculations | ||
100 | * until we get fresh rtt samples. So when we | ||
101 | * restart, we reset our Veno state to a clean | ||
102 | * state. After we get acks for this flight of | ||
103 | * packets, _then_ we can make Veno calculations | ||
104 | * again. | ||
105 | */ | ||
106 | static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) | ||
107 | { | ||
108 | if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) | ||
109 | tcp_veno_init(sk); | ||
110 | } | ||
111 | |||
112 | static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, | ||
113 | u32 seq_rtt, u32 in_flight, int flag) | ||
114 | { | ||
115 | struct tcp_sock *tp = tcp_sk(sk); | ||
116 | struct veno *veno = inet_csk_ca(sk); | ||
117 | |||
118 | if (!veno->doing_veno_now) | ||
119 | return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); | ||
120 | |||
121 | /* limited by applications */ | ||
122 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
123 | return; | ||
124 | |||
125 | /* We do the Veno calculations only if we got enough rtt samples */ | ||
126 | if (veno->cntrtt <= 2) { | ||
127 | /* We don't have enough rtt samples to do the Veno | ||
128 | * calculation, so we'll behave like Reno. | ||
129 | */ | ||
130 | tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); | ||
131 | } else { | ||
132 | u32 rtt, target_cwnd; | ||
133 | |||
134 | /* We have enough rtt samples, so, using the Veno | ||
135 | * algorithm, we determine the state of the network. | ||
136 | */ | ||
137 | |||
138 | rtt = veno->minrtt; | ||
139 | |||
140 | target_cwnd = ((tp->snd_cwnd * veno->basertt) | ||
141 | << V_PARAM_SHIFT) / rtt; | ||
142 | |||
143 | veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; | ||
144 | |||
145 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
146 | /* Slow start. */ | ||
147 | tcp_slow_start(tp); | ||
148 | } else { | ||
149 | /* Congestion avoidance. */ | ||
150 | if (veno->diff < beta) { | ||
151 | /* In the "non-congestive state", increase cwnd | ||
152 | * every rtt. | ||
153 | */ | ||
154 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
155 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
156 | tp->snd_cwnd++; | ||
157 | tp->snd_cwnd_cnt = 0; | ||
158 | } else | ||
159 | tp->snd_cwnd_cnt++; | ||
160 | } else { | ||
161 | /* In the "congestive state", increase cwnd | ||
162 | * every other rtt. | ||
163 | */ | ||
164 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
165 | if (veno->inc | ||
166 | && tp->snd_cwnd < | ||
167 | tp->snd_cwnd_clamp) { | ||
168 | tp->snd_cwnd++; | ||
169 | veno->inc = 0; | ||
170 | } else | ||
171 | veno->inc = 1; | ||
172 | tp->snd_cwnd_cnt = 0; | ||
173 | } else | ||
174 | tp->snd_cwnd_cnt++; | ||
175 | } | ||
176 | |||
177 | } | ||
178 | if (tp->snd_cwnd < 2) | ||
179 | tp->snd_cwnd = 2; | ||
180 | else if (tp->snd_cwnd > tp->snd_cwnd_clamp) | ||
181 | tp->snd_cwnd = tp->snd_cwnd_clamp; | ||
182 | } | ||
183 | /* Wipe the slate clean for the next rtt. */ | ||
184 | /* veno->cntrtt = 0; */ | ||
185 | veno->minrtt = 0x7fffffff; | ||
186 | } | ||
187 | |||
188 | /* Veno MD phase */ | ||
189 | static u32 tcp_veno_ssthresh(struct sock *sk) | ||
190 | { | ||
191 | const struct tcp_sock *tp = tcp_sk(sk); | ||
192 | struct veno *veno = inet_csk_ca(sk); | ||
193 | |||
194 | if (veno->diff < beta) | ||
195 | /* in "non-congestive state", cut cwnd by 1/5 */ | ||
196 | return max(tp->snd_cwnd * 4 / 5, 2U); | ||
197 | else | ||
198 | /* in "congestive state", cut cwnd by 1/2 */ | ||
199 | return max(tp->snd_cwnd >> 1U, 2U); | ||
200 | } | ||
201 | |||
202 | static struct tcp_congestion_ops tcp_veno = { | ||
203 | .init = tcp_veno_init, | ||
204 | .ssthresh = tcp_veno_ssthresh, | ||
205 | .cong_avoid = tcp_veno_cong_avoid, | ||
206 | .rtt_sample = tcp_veno_rtt_calc, | ||
207 | .set_state = tcp_veno_state, | ||
208 | .cwnd_event = tcp_veno_cwnd_event, | ||
209 | |||
210 | .owner = THIS_MODULE, | ||
211 | .name = "veno", | ||
212 | }; | ||
213 | |||
214 | static int __init tcp_veno_register(void) | ||
215 | { | ||
216 | BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); | ||
217 | tcp_register_congestion_control(&tcp_veno); | ||
218 | return 0; | ||
219 | } | ||
220 | |||
221 | static void __exit tcp_veno_unregister(void) | ||
222 | { | ||
223 | tcp_unregister_congestion_control(&tcp_veno); | ||
224 | } | ||
225 | |||
226 | module_init(tcp_veno_register); | ||
227 | module_exit(tcp_veno_unregister); | ||
228 | |||
229 | MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); | ||
230 | MODULE_LICENSE("GPL"); | ||
231 | MODULE_DESCRIPTION("TCP Veno"); | ||
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 0c340c3756c2..4247da1384bf 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
@@ -1,7 +1,24 @@ | |||
1 | /* | 1 | /* |
2 | * TCP Westwood+ | 2 | * TCP Westwood+: end-to-end bandwidth estimation for TCP |
3 | * | 3 | * |
4 | * Angelo Dell'Aera: TCP Westwood+ support | 4 | * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 |
5 | * | ||
6 | * Support at http://c3lab.poliba.it/index.php/Westwood | ||
7 | * Main references in literature: | ||
8 | * | ||
9 | * - Mascolo S, Casetti, M. Gerla et al. | ||
10 | * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 | ||
11 | * | ||
12 | * - A. Grieco, S. Mascolo | ||
13 | * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer | ||
14 | * Comm. Review, 2004 | ||
15 | * | ||
16 | * - A. Dell'Aera, L. Grieco, S. Mascolo. | ||
17 | * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : | ||
18 | * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 | ||
19 | * | ||
20 | * Westwood+ employs end-to-end bandwidth measurement to set cwnd and | ||
21 | * ssthresh after packet loss. The probing phase is as the original Reno. | ||
5 | */ | 22 | */ |
6 | 23 | ||
7 | #include <linux/config.h> | 24 | #include <linux/config.h> |
@@ -22,6 +39,8 @@ struct westwood { | |||
22 | u32 accounted; | 39 | u32 accounted; |
23 | u32 rtt; | 40 | u32 rtt; |
24 | u32 rtt_min; /* minimum observed RTT */ | 41 | u32 rtt_min; /* minimum observed RTT */ |
42 | u8 first_ack; /* flag which infers that this is the first ack */ | ||
43 | u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ | ||
25 | }; | 44 | }; |
26 | 45 | ||
27 | 46 | ||
@@ -49,9 +68,11 @@ static void tcp_westwood_init(struct sock *sk) | |||
49 | w->bw_est = 0; | 68 | w->bw_est = 0; |
50 | w->accounted = 0; | 69 | w->accounted = 0; |
51 | w->cumul_ack = 0; | 70 | w->cumul_ack = 0; |
71 | w->reset_rtt_min = 1; | ||
52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; | 72 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; |
53 | w->rtt_win_sx = tcp_time_stamp; | 73 | w->rtt_win_sx = tcp_time_stamp; |
54 | w->snd_una = tcp_sk(sk)->snd_una; | 74 | w->snd_una = tcp_sk(sk)->snd_una; |
75 | w->first_ack = 1; | ||
55 | } | 76 | } |
56 | 77 | ||
57 | /* | 78 | /* |
@@ -63,10 +84,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b) | |||
63 | return (((7 * a) + b) >> 3); | 84 | return (((7 * a) + b) >> 3); |
64 | } | 85 | } |
65 | 86 | ||
66 | static inline void westwood_filter(struct westwood *w, u32 delta) | 87 | static void westwood_filter(struct westwood *w, u32 delta) |
67 | { | 88 | { |
68 | w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); | 89 | /* If the filter is empty fill it with the first sample of bandwidth */ |
69 | w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); | 90 | if (w->bw_ns_est == 0 && w->bw_est == 0) { |
91 | w->bw_ns_est = w->bk / delta; | ||
92 | w->bw_est = w->bw_ns_est; | ||
93 | } else { | ||
94 | w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); | ||
95 | w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); | ||
96 | } | ||
70 | } | 97 | } |
71 | 98 | ||
72 | /* | 99 | /* |
@@ -91,6 +118,15 @@ static void westwood_update_window(struct sock *sk) | |||
91 | struct westwood *w = inet_csk_ca(sk); | 118 | struct westwood *w = inet_csk_ca(sk); |
92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; | 119 | s32 delta = tcp_time_stamp - w->rtt_win_sx; |
93 | 120 | ||
121 | /* Initialize w->snd_una with the first acked sequence number in order | ||
122 | * to fix mismatch between tp->snd_una and w->snd_una for the first | ||
123 | * bandwidth sample | ||
124 | */ | ||
125 | if (w->first_ack) { | ||
126 | w->snd_una = tcp_sk(sk)->snd_una; | ||
127 | w->first_ack = 0; | ||
128 | } | ||
129 | |||
94 | /* | 130 | /* |
95 | * See if a RTT-window has passed. | 131 | * See if a RTT-window has passed. |
96 | * Be careful since if RTT is less than | 132 | * Be careful since if RTT is less than |
@@ -108,6 +144,16 @@ static void westwood_update_window(struct sock *sk) | |||
108 | } | 144 | } |
109 | } | 145 | } |
110 | 146 | ||
147 | static inline void update_rtt_min(struct westwood *w) | ||
148 | { | ||
149 | if (w->reset_rtt_min) { | ||
150 | w->rtt_min = w->rtt; | ||
151 | w->reset_rtt_min = 0; | ||
152 | } else | ||
153 | w->rtt_min = min(w->rtt, w->rtt_min); | ||
154 | } | ||
155 | |||
156 | |||
111 | /* | 157 | /* |
112 | * @westwood_fast_bw | 158 | * @westwood_fast_bw |
113 | * It is called when we are in fast path. In particular it is called when | 159 | * It is called when we are in fast path. In particular it is called when |
@@ -123,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk) | |||
123 | 169 | ||
124 | w->bk += tp->snd_una - w->snd_una; | 170 | w->bk += tp->snd_una - w->snd_una; |
125 | w->snd_una = tp->snd_una; | 171 | w->snd_una = tp->snd_una; |
126 | w->rtt_min = min(w->rtt, w->rtt_min); | 172 | update_rtt_min(w); |
127 | } | 173 | } |
128 | 174 | ||
129 | /* | 175 | /* |
@@ -162,12 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk) | |||
162 | return w->cumul_ack; | 208 | return w->cumul_ack; |
163 | } | 209 | } |
164 | 210 | ||
165 | static inline u32 westwood_bw_rttmin(const struct sock *sk) | ||
166 | { | ||
167 | const struct tcp_sock *tp = tcp_sk(sk); | ||
168 | const struct westwood *w = inet_csk_ca(sk); | ||
169 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | ||
170 | } | ||
171 | 211 | ||
172 | /* | 212 | /* |
173 | * TCP Westwood | 213 | * TCP Westwood |
@@ -175,9 +215,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk) | |||
175 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 | 215 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 |
176 | * so avoids ever returning 0. | 216 | * so avoids ever returning 0. |
177 | */ | 217 | */ |
178 | static u32 tcp_westwood_cwnd_min(struct sock *sk) | 218 | static u32 tcp_westwood_bw_rttmin(const struct sock *sk) |
179 | { | 219 | { |
180 | return westwood_bw_rttmin(sk); | 220 | const struct tcp_sock *tp = tcp_sk(sk); |
221 | const struct westwood *w = inet_csk_ca(sk); | ||
222 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | ||
181 | } | 223 | } |
182 | 224 | ||
183 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | 225 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) |
@@ -191,17 +233,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | |||
191 | break; | 233 | break; |
192 | 234 | ||
193 | case CA_EVENT_COMPLETE_CWR: | 235 | case CA_EVENT_COMPLETE_CWR: |
194 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); | 236 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
195 | break; | 237 | break; |
196 | 238 | ||
197 | case CA_EVENT_FRTO: | 239 | case CA_EVENT_FRTO: |
198 | tp->snd_ssthresh = westwood_bw_rttmin(sk); | 240 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
241 | /* Update RTT_min when next ack arrives */ | ||
242 | w->reset_rtt_min = 1; | ||
199 | break; | 243 | break; |
200 | 244 | ||
201 | case CA_EVENT_SLOW_ACK: | 245 | case CA_EVENT_SLOW_ACK: |
202 | westwood_update_window(sk); | 246 | westwood_update_window(sk); |
203 | w->bk += westwood_acked_count(sk); | 247 | w->bk += westwood_acked_count(sk); |
204 | w->rtt_min = min(w->rtt, w->rtt_min); | 248 | update_rtt_min(w); |
205 | break; | 249 | break; |
206 | 250 | ||
207 | default: | 251 | default: |
@@ -235,7 +279,7 @@ static struct tcp_congestion_ops tcp_westwood = { | |||
235 | .init = tcp_westwood_init, | 279 | .init = tcp_westwood_init, |
236 | .ssthresh = tcp_reno_ssthresh, | 280 | .ssthresh = tcp_reno_ssthresh, |
237 | .cong_avoid = tcp_reno_cong_avoid, | 281 | .cong_avoid = tcp_reno_cong_avoid, |
238 | .min_cwnd = tcp_westwood_cwnd_min, | 282 | .min_cwnd = tcp_westwood_bw_rttmin, |
239 | .cwnd_event = tcp_westwood_event, | 283 | .cwnd_event = tcp_westwood_event, |
240 | .get_info = tcp_westwood_info, | 284 | .get_info = tcp_westwood_info, |
241 | .pkts_acked = tcp_westwood_pkts_acked, | 285 | .pkts_acked = tcp_westwood_pkts_acked, |
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 3e174c83bfe7..817ed84511a6 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/string.h> | 13 | #include <linux/string.h> |
14 | #include <linux/netfilter.h> | 14 | #include <linux/netfilter.h> |
15 | #include <linux/netfilter_ipv4.h> | 15 | #include <linux/netfilter_ipv4.h> |
16 | #include <net/inet_ecn.h> | ||
17 | #include <net/ip.h> | 16 | #include <net/ip.h> |
18 | #include <net/xfrm.h> | 17 | #include <net/xfrm.h> |
19 | 18 | ||
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb) | |||
24 | 23 | ||
25 | EXPORT_SYMBOL(xfrm4_rcv); | 24 | EXPORT_SYMBOL(xfrm4_rcv); |
26 | 25 | ||
27 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) | ||
28 | { | ||
29 | struct iphdr *outer_iph = skb->nh.iph; | ||
30 | struct iphdr *inner_iph = skb->h.ipiph; | ||
31 | |||
32 | if (INET_ECN_is_ce(outer_iph->tos)) | ||
33 | IP_ECN_set_ce(inner_iph); | ||
34 | } | ||
35 | |||
36 | static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) | 26 | static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) |
37 | { | 27 | { |
38 | switch (nexthdr) { | 28 | switch (nexthdr) { |
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) | |||
113 | 103 | ||
114 | xfrm_vec[xfrm_nr++] = x; | 104 | xfrm_vec[xfrm_nr++] = x; |
115 | 105 | ||
116 | iph = skb->nh.iph; | 106 | if (x->mode->input(x, skb)) |
107 | goto drop; | ||
117 | 108 | ||
118 | if (x->props.mode) { | 109 | if (x->props.mode) { |
119 | if (iph->protocol != IPPROTO_IPIP) | ||
120 | goto drop; | ||
121 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | ||
122 | goto drop; | ||
123 | if (skb_cloned(skb) && | ||
124 | pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) | ||
125 | goto drop; | ||
126 | if (x->props.flags & XFRM_STATE_DECAP_DSCP) | ||
127 | ipv4_copy_dscp(iph, skb->h.ipiph); | ||
128 | if (!(x->props.flags & XFRM_STATE_NOECN)) | ||
129 | ipip_ecn_decapsulate(skb); | ||
130 | skb->mac.raw = memmove(skb->data - skb->mac_len, | ||
131 | skb->mac.raw, skb->mac_len); | ||
132 | skb->nh.raw = skb->data; | ||
133 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
134 | decaps = 1; | 110 | decaps = 1; |
135 | break; | 111 | break; |
136 | } | 112 | } |
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c new file mode 100644 index 000000000000..a9e6b3dd19c9 --- /dev/null +++ b/net/ipv4/xfrm4_mode_transport.c | |||
@@ -0,0 +1,83 @@ | |||
1 | /* | ||
2 | * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. | ||
3 | * | ||
4 | * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/skbuff.h> | ||
11 | #include <linux/stringify.h> | ||
12 | #include <net/dst.h> | ||
13 | #include <net/ip.h> | ||
14 | #include <net/xfrm.h> | ||
15 | |||
16 | /* Add encapsulation header. | ||
17 | * | ||
18 | * The IP header will be moved forward to make space for the encapsulation | ||
19 | * header. | ||
20 | * | ||
21 | * On exit, skb->h will be set to the start of the payload to be processed | ||
22 | * by x->type->output and skb->nh will be set to the top IP header. | ||
23 | */ | ||
24 | static int xfrm4_transport_output(struct sk_buff *skb) | ||
25 | { | ||
26 | struct xfrm_state *x; | ||
27 | struct iphdr *iph; | ||
28 | int ihl; | ||
29 | |||
30 | iph = skb->nh.iph; | ||
31 | skb->h.ipiph = iph; | ||
32 | |||
33 | ihl = iph->ihl * 4; | ||
34 | skb->h.raw += ihl; | ||
35 | |||
36 | x = skb->dst->xfrm; | ||
37 | skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | /* Remove encapsulation header. | ||
42 | * | ||
43 | * The IP header will be moved over the top of the encapsulation header. | ||
44 | * | ||
45 | * On entry, skb->h shall point to where the IP header should be and skb->nh | ||
46 | * shall be set to where the IP header currently is. skb->data shall point | ||
47 | * to the start of the payload. | ||
48 | */ | ||
49 | static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) | ||
50 | { | ||
51 | int ihl = skb->data - skb->h.raw; | ||
52 | |||
53 | if (skb->h.raw != skb->nh.raw) | ||
54 | skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); | ||
55 | skb->nh.iph->tot_len = htons(skb->len + ihl); | ||
56 | skb->h.raw = skb->data; | ||
57 | return 0; | ||
58 | } | ||
59 | |||
60 | static struct xfrm_mode xfrm4_transport_mode = { | ||
61 | .input = xfrm4_transport_input, | ||
62 | .output = xfrm4_transport_output, | ||
63 | .owner = THIS_MODULE, | ||
64 | .encap = XFRM_MODE_TRANSPORT, | ||
65 | }; | ||
66 | |||
67 | static int __init xfrm4_transport_init(void) | ||
68 | { | ||
69 | return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); | ||
70 | } | ||
71 | |||
72 | static void __exit xfrm4_transport_exit(void) | ||
73 | { | ||
74 | int err; | ||
75 | |||
76 | err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); | ||
77 | BUG_ON(err); | ||
78 | } | ||
79 | |||
80 | module_init(xfrm4_transport_init); | ||
81 | module_exit(xfrm4_transport_exit); | ||
82 | MODULE_LICENSE("GPL"); | ||
83 | MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); | ||
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c new file mode 100644 index 000000000000..f8d880beb12f --- /dev/null +++ b/net/ipv4/xfrm4_mode_tunnel.c | |||
@@ -0,0 +1,125 @@ | |||
1 | /* | ||
2 | * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. | ||
3 | * | ||
4 | * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/skbuff.h> | ||
11 | #include <linux/stringify.h> | ||
12 | #include <net/dst.h> | ||
13 | #include <net/inet_ecn.h> | ||
14 | #include <net/ip.h> | ||
15 | #include <net/xfrm.h> | ||
16 | |||
17 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) | ||
18 | { | ||
19 | struct iphdr *outer_iph = skb->nh.iph; | ||
20 | struct iphdr *inner_iph = skb->h.ipiph; | ||
21 | |||
22 | if (INET_ECN_is_ce(outer_iph->tos)) | ||
23 | IP_ECN_set_ce(inner_iph); | ||
24 | } | ||
25 | |||
26 | /* Add encapsulation header. | ||
27 | * | ||
28 | * The top IP header will be constructed per RFC 2401. The following fields | ||
29 | * in it shall be filled in by x->type->output: | ||
30 | * tot_len | ||
31 | * check | ||
32 | * | ||
33 | * On exit, skb->h will be set to the start of the payload to be processed | ||
34 | * by x->type->output and skb->nh will be set to the top IP header. | ||
35 | */ | ||
36 | static int xfrm4_tunnel_output(struct sk_buff *skb) | ||
37 | { | ||
38 | struct dst_entry *dst = skb->dst; | ||
39 | struct xfrm_state *x = dst->xfrm; | ||
40 | struct iphdr *iph, *top_iph; | ||
41 | int flags; | ||
42 | |||
43 | iph = skb->nh.iph; | ||
44 | skb->h.ipiph = iph; | ||
45 | |||
46 | skb->nh.raw = skb_push(skb, x->props.header_len); | ||
47 | top_iph = skb->nh.iph; | ||
48 | |||
49 | top_iph->ihl = 5; | ||
50 | top_iph->version = 4; | ||
51 | |||
52 | /* DS disclosed */ | ||
53 | top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); | ||
54 | |||
55 | flags = x->props.flags; | ||
56 | if (flags & XFRM_STATE_NOECN) | ||
57 | IP_ECN_clear(top_iph); | ||
58 | |||
59 | top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? | ||
60 | 0 : (iph->frag_off & htons(IP_DF)); | ||
61 | if (!top_iph->frag_off) | ||
62 | __ip_select_ident(top_iph, dst->child, 0); | ||
63 | |||
64 | top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); | ||
65 | |||
66 | top_iph->saddr = x->props.saddr.a4; | ||
67 | top_iph->daddr = x->id.daddr.a4; | ||
68 | top_iph->protocol = IPPROTO_IPIP; | ||
69 | |||
70 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | ||
75 | { | ||
76 | struct iphdr *iph = skb->nh.iph; | ||
77 | int err = -EINVAL; | ||
78 | |||
79 | if (iph->protocol != IPPROTO_IPIP) | ||
80 | goto out; | ||
81 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | ||
82 | goto out; | ||
83 | |||
84 | if (skb_cloned(skb) && | ||
85 | (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) | ||
86 | goto out; | ||
87 | |||
88 | if (x->props.flags & XFRM_STATE_DECAP_DSCP) | ||
89 | ipv4_copy_dscp(iph, skb->h.ipiph); | ||
90 | if (!(x->props.flags & XFRM_STATE_NOECN)) | ||
91 | ipip_ecn_decapsulate(skb); | ||
92 | skb->mac.raw = memmove(skb->data - skb->mac_len, | ||
93 | skb->mac.raw, skb->mac_len); | ||
94 | skb->nh.raw = skb->data; | ||
95 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
96 | err = 0; | ||
97 | |||
98 | out: | ||
99 | return err; | ||
100 | } | ||
101 | |||
102 | static struct xfrm_mode xfrm4_tunnel_mode = { | ||
103 | .input = xfrm4_tunnel_input, | ||
104 | .output = xfrm4_tunnel_output, | ||
105 | .owner = THIS_MODULE, | ||
106 | .encap = XFRM_MODE_TUNNEL, | ||
107 | }; | ||
108 | |||
109 | static int __init xfrm4_tunnel_init(void) | ||
110 | { | ||
111 | return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); | ||
112 | } | ||
113 | |||
114 | static void __exit xfrm4_tunnel_exit(void) | ||
115 | { | ||
116 | int err; | ||
117 | |||
118 | err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); | ||
119 | BUG_ON(err); | ||
120 | } | ||
121 | |||
122 | module_init(xfrm4_tunnel_init); | ||
123 | module_exit(xfrm4_tunnel_exit); | ||
124 | MODULE_LICENSE("GPL"); | ||
125 | MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); | ||
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 4ef8efaf6a67..ac9d91d4bb05 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c | |||
@@ -12,67 +12,10 @@ | |||
12 | #include <linux/skbuff.h> | 12 | #include <linux/skbuff.h> |
13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
14 | #include <linux/netfilter_ipv4.h> | 14 | #include <linux/netfilter_ipv4.h> |
15 | #include <net/inet_ecn.h> | ||
16 | #include <net/ip.h> | 15 | #include <net/ip.h> |
17 | #include <net/xfrm.h> | 16 | #include <net/xfrm.h> |
18 | #include <net/icmp.h> | 17 | #include <net/icmp.h> |
19 | 18 | ||
20 | /* Add encapsulation header. | ||
21 | * | ||
22 | * In transport mode, the IP header will be moved forward to make space | ||
23 | * for the encapsulation header. | ||
24 | * | ||
25 | * In tunnel mode, the top IP header will be constructed per RFC 2401. | ||
26 | * The following fields in it shall be filled in by x->type->output: | ||
27 | * tot_len | ||
28 | * check | ||
29 | * | ||
30 | * On exit, skb->h will be set to the start of the payload to be processed | ||
31 | * by x->type->output and skb->nh will be set to the top IP header. | ||
32 | */ | ||
33 | static void xfrm4_encap(struct sk_buff *skb) | ||
34 | { | ||
35 | struct dst_entry *dst = skb->dst; | ||
36 | struct xfrm_state *x = dst->xfrm; | ||
37 | struct iphdr *iph, *top_iph; | ||
38 | int flags; | ||
39 | |||
40 | iph = skb->nh.iph; | ||
41 | skb->h.ipiph = iph; | ||
42 | |||
43 | skb->nh.raw = skb_push(skb, x->props.header_len); | ||
44 | top_iph = skb->nh.iph; | ||
45 | |||
46 | if (!x->props.mode) { | ||
47 | skb->h.raw += iph->ihl*4; | ||
48 | memmove(top_iph, iph, iph->ihl*4); | ||
49 | return; | ||
50 | } | ||
51 | |||
52 | top_iph->ihl = 5; | ||
53 | top_iph->version = 4; | ||
54 | |||
55 | /* DS disclosed */ | ||
56 | top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); | ||
57 | |||
58 | flags = x->props.flags; | ||
59 | if (flags & XFRM_STATE_NOECN) | ||
60 | IP_ECN_clear(top_iph); | ||
61 | |||
62 | top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? | ||
63 | 0 : (iph->frag_off & htons(IP_DF)); | ||
64 | if (!top_iph->frag_off) | ||
65 | __ip_select_ident(top_iph, dst->child, 0); | ||
66 | |||
67 | top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); | ||
68 | |||
69 | top_iph->saddr = x->props.saddr.a4; | ||
70 | top_iph->daddr = x->id.daddr.a4; | ||
71 | top_iph->protocol = IPPROTO_IPIP; | ||
72 | |||
73 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
74 | } | ||
75 | |||
76 | static int xfrm4_tunnel_check_size(struct sk_buff *skb) | 19 | static int xfrm4_tunnel_check_size(struct sk_buff *skb) |
77 | { | 20 | { |
78 | int mtu, ret = 0; | 21 | int mtu, ret = 0; |
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb) | |||
121 | if (err) | 64 | if (err) |
122 | goto error; | 65 | goto error; |
123 | 66 | ||
124 | xfrm4_encap(skb); | 67 | err = x->mode->output(skb); |
68 | if (err) | ||
69 | goto error; | ||
125 | 70 | ||
126 | err = x->type->output(x, skb); | 71 | err = x->type->output(x, skb); |
127 | if (err) | 72 | if (err) |
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8604c747bca5..c0465284dfac 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
@@ -17,8 +17,6 @@ | |||
17 | static struct dst_ops xfrm4_dst_ops; | 17 | static struct dst_ops xfrm4_dst_ops; |
18 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; | 18 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; |
19 | 19 | ||
20 | static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; | ||
21 | |||
22 | static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) | 20 | static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) |
23 | { | 21 | { |
24 | return __ip_route_output_key((struct rtable**)dst, fl); | 22 | return __ip_route_output_key((struct rtable**)dst, fl); |
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) | |||
237 | 235 | ||
238 | static inline int xfrm4_garbage_collect(void) | 236 | static inline int xfrm4_garbage_collect(void) |
239 | { | 237 | { |
240 | read_lock(&xfrm4_policy_afinfo.lock); | ||
241 | xfrm4_policy_afinfo.garbage_collect(); | 238 | xfrm4_policy_afinfo.garbage_collect(); |
242 | read_unlock(&xfrm4_policy_afinfo.lock); | ||
243 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); | 239 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); |
244 | } | 240 | } |
245 | 241 | ||
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = { | |||
299 | 295 | ||
300 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { | 296 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |
301 | .family = AF_INET, | 297 | .family = AF_INET, |
302 | .lock = RW_LOCK_UNLOCKED, | ||
303 | .type_map = &xfrm4_type_map, | ||
304 | .dst_ops = &xfrm4_dst_ops, | 298 | .dst_ops = &xfrm4_dst_ops, |
305 | .dst_lookup = xfrm4_dst_lookup, | 299 | .dst_lookup = xfrm4_dst_lookup, |
306 | .find_bundle = __xfrm4_find_bundle, | 300 | .find_bundle = __xfrm4_find_bundle, |
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index dbabf81a9b7b..81e1751c966e 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, | |||
131 | 131 | ||
132 | static struct xfrm_state_afinfo xfrm4_state_afinfo = { | 132 | static struct xfrm_state_afinfo xfrm4_state_afinfo = { |
133 | .family = AF_INET, | 133 | .family = AF_INET, |
134 | .lock = RW_LOCK_UNLOCKED, | ||
135 | .init_flags = xfrm4_init_flags, | 134 | .init_flags = xfrm4_init_flags, |
136 | .init_tempsel = __xfrm4_init_tempsel, | 135 | .init_tempsel = __xfrm4_init_tempsel, |
137 | .state_lookup = __xfrm4_state_lookup, | 136 | .state_lookup = __xfrm4_state_lookup, |